diff --git a/PKGBUILD b/PKGBUILD index a9a7a6f..9d171e7 100644 --- a/PKGBUILD +++ b/PKGBUILD @@ -654,7 +654,7 @@ case $_basever in #0007-v5.16-fsync.patch 0007-v5.16-fsync1_via_futex_waitv.patch 0007-v5.16-winesync.patch - #0008-5.14-bcachefs.patch + 0008-5.16-bcachefs.patch 0009-glitched-ondemand-bmq.patch 0009-glitched-bmq.patch 0009-prjc_v5.16-r1.patch @@ -676,7 +676,7 @@ case $_basever in 'e5ea0bb25ee294c655ac3cc30e1eea497799826108fbfb4ef3258c676c1e8a12' 'fca63d15ca4502aebd73e76d7499b243d2c03db71ff5ab0bf5cf268b2e576320' '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' - #'4503034f211de3013f8500106da753e5d1bcac14bc5576671cbe6f574805b3cd' + '44a46815d26170e43dd5f21e352081c8e5a4816512abb86353a1d90311fffcde' '9df628fd530950e37d31da854cb314d536f33c83935adf5c47e71266a55f7004' 'f91223f98f132602a4fa525917a1f27afe30bdb55a1ac863e739c536188417b3' '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' diff --git a/linux-tkg-patches/5.16/0008-5.16-bcachefs.patch b/linux-tkg-patches/5.16/0008-5.16-bcachefs.patch new file mode 100644 index 0000000..5942233 --- /dev/null +++ b/linux-tkg-patches/5.16/0008-5.16-bcachefs.patch @@ -0,0 +1,310629 @@ +From 7ed0512418296bc8c8bff25c38542b0c83d0db26 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 27 Dec 2018 11:08:31 -0500 +Subject: Compiler Attributes: add __flatten + +Prep work for bcachefs + +Signed-off-by: Kent Overstreet +--- + include/linux/compiler_attributes.h | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h +index b9121afd8733..9f8c1706f88a 100644 +--- a/include/linux/compiler_attributes.h ++++ b/include/linux/compiler_attributes.h +@@ -314,4 +314,9 @@ + */ + #define __weak __attribute__((__weak__)) + ++/* ++ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute ++ */ ++#define __flatten __attribute__((flatten)) ++ + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ +-- +cgit v1.2.3 + + +From 15d763bc24753277de0d9ecf7a02f3374d02b09b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 10 Jul 2021 22:46:17 -0400 +Subject: locking/lockdep: lock_class_is_held() + +This patch adds lock_class_is_held(), which can be used to verify that a +particular type of lock is _not_ held. 
+ +Signed-off-by: Kent Overstreet +--- + include/linux/lockdep.h | 4 ++++ + kernel/locking/lockdep.c | 20 ++++++++++++++++++++ + 2 files changed, 24 insertions(+) + +diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h +index 467b94257105..c46b0c76c064 100644 +--- a/include/linux/lockdep.h ++++ b/include/linux/lockdep.h +@@ -336,6 +336,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); + #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) + #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) + ++int lock_class_is_held(struct lock_class_key *key); ++ + #else /* !CONFIG_LOCKDEP */ + + static inline void lockdep_init_task(struct task_struct *task) +@@ -423,6 +425,8 @@ extern int lockdep_is_held(const void *); + #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) + #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) + ++static inline int lock_class_is_held(struct lock_class_key *key) { return 0; } ++ + #endif /* !LOCKDEP */ + + enum xhlock_context_t { +diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c +index 2270ec68f10a..ccb42ce37429 100644 +--- a/kernel/locking/lockdep.c ++++ b/kernel/locking/lockdep.c +@@ -6450,6 +6450,26 @@ void debug_check_no_locks_held(void) + } + EXPORT_SYMBOL_GPL(debug_check_no_locks_held); + ++#ifdef CONFIG_LOCKDEP ++int lock_class_is_held(struct lock_class_key *key) ++{ ++ struct task_struct *curr = current; ++ struct held_lock *hlock; ++ ++ if (unlikely(!debug_locks)) ++ return 0; ++ ++ for (hlock = curr->held_locks; ++ hlock < curr->held_locks + curr->lockdep_depth; ++ hlock++) ++ if (hlock->instance->key == key) ++ return 1; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(lock_class_is_held); ++#endif ++ + #ifdef __KERNEL__ + void debug_show_all_locks(void) + { +-- +cgit v1.2.3 + + +From d425ac70cbf93472c961b6181f278c11731a99a4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 18 May 2018 06:14:56 -0400 +Subject: locking: SIX locks (shared/intent/exclusive) + +New lock for bcachefs, like read/write locks but with a third state, +intent. + +Intent locks conflict with each other, but not with read locks; taking a +write lock requires first holding an intent lock. + +Signed-off-by: Kent Overstreet +--- + include/linux/six.h | 203 +++++++++++++ + kernel/Kconfig.locks | 3 + + kernel/locking/Makefile | 1 + + kernel/locking/six.c | 759 ++++++++++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 966 insertions(+) + create mode 100644 include/linux/six.h + create mode 100644 kernel/locking/six.c + +diff --git a/include/linux/six.h b/include/linux/six.h +new file mode 100644 +index 000000000000..477c33eb00d7 +--- /dev/null ++++ b/include/linux/six.h +@@ -0,0 +1,203 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _LINUX_SIX_H ++#define _LINUX_SIX_H ++ ++/* ++ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw ++ * semaphores, except with a third intermediate state, intent. 
Basic operations ++ * are: ++ * ++ * six_lock_read(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * ++ * six_lock_intent(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * ++ * Intent locks block other intent locks, but do not block read locks, and you ++ * must have an intent lock held before taking a write lock, like so: ++ * ++ * six_lock_intent(&foo->lock); ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * Other operations: ++ * ++ * six_trylock_read() ++ * six_trylock_intent() ++ * six_trylock_write() ++ * ++ * six_lock_downgrade(): convert from intent to read ++ * six_lock_tryupgrade(): attempt to convert from read to intent ++ * ++ * Locks also embed a sequence number, which is incremented when the lock is ++ * locked or unlocked for write. The current sequence number can be grabbed ++ * while a lock is held from lock->state.seq; then, if you drop the lock you can ++ * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock ++ * iff it hasn't been locked for write in the meantime. ++ * ++ * There are also operations that take the lock type as a parameter, where the ++ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: ++ * ++ * six_lock_type(lock, type) ++ * six_unlock_type(lock, type) ++ * six_relock(lock, type, seq) ++ * six_trylock_type(lock, type) ++ * six_trylock_convert(lock, from, to) ++ * ++ * A lock may be held multiple types by the same thread (for read or intent, ++ * not write). However, the six locks code does _not_ implement the actual ++ * recursive checks itself though - rather, if your code (e.g. btree iterator ++ * code) knows that the current thread already has a lock held, and for the ++ * correct type, six_lock_increment() may be used to bump up the counter for ++ * that type - the only effect is that one more call to unlock will be required ++ * before the lock is unlocked. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define SIX_LOCK_SEPARATE_LOCKFNS ++ ++union six_lock_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ /* for waitlist_bitnr() */ ++ unsigned long l; ++ }; ++ ++ struct { ++ unsigned read_lock:27; ++ unsigned write_locking:1; ++ unsigned intent_lock:1; ++ unsigned waiters:3; ++ /* ++ * seq works much like in seqlocks: it's incremented every time ++ * we lock and unlock for write. ++ * ++ * If it's odd write lock is held, even unlocked. ++ * ++ * Thus readers can unlock, and then lock again later iff it ++ * hasn't been modified in the meantime. 
++ */ ++ u32 seq; ++ }; ++}; ++ ++enum six_lock_type { ++ SIX_LOCK_read, ++ SIX_LOCK_intent, ++ SIX_LOCK_write, ++}; ++ ++struct six_lock { ++ union six_lock_state state; ++ unsigned intent_lock_recurse; ++ struct task_struct *owner; ++ struct optimistic_spin_queue osq; ++ unsigned __percpu *readers; ++ ++ raw_spinlock_t wait_lock; ++ struct list_head wait_list[2]; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); ++ ++static __always_inline void __six_lock_init(struct six_lock *lock, ++ const char *name, ++ struct lock_class_key *key) ++{ ++ atomic64_set(&lock->state.counter, 0); ++ raw_spin_lock_init(&lock->wait_lock); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++} ++ ++#define six_lock_init(lock) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __six_lock_init((lock), #lock, &__key); \ ++} while (0) ++ ++#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *); \ ++bool six_relock_##type(struct six_lock *, u32); \ ++int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ ++void six_unlock_##type(struct six_lock *); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++#undef __SIX_LOCK ++ ++#define SIX_LOCK_DISPATCH(type, fn, ...) \ ++ switch (type) { \ ++ case SIX_LOCK_read: \ ++ return fn##_read(__VA_ARGS__); \ ++ case SIX_LOCK_intent: \ ++ return fn##_intent(__VA_ARGS__); \ ++ case SIX_LOCK_write: \ ++ return fn##_write(__VA_ARGS__); \ ++ default: \ ++ BUG(); \ ++ } ++ ++static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_trylock, lock); ++} ++ ++static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ SIX_LOCK_DISPATCH(type, six_relock, lock, seq); ++} ++ ++static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); ++} ++ ++static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_unlock, lock); ++} ++ ++void six_lock_downgrade(struct six_lock *); ++bool six_lock_tryupgrade(struct six_lock *); ++bool six_trylock_convert(struct six_lock *, enum six_lock_type, ++ enum six_lock_type); ++ ++void six_lock_increment(struct six_lock *, enum six_lock_type); ++ ++void six_lock_wakeup_all(struct six_lock *); ++ ++void six_lock_pcpu_free_rcu(struct six_lock *); ++void six_lock_pcpu_free(struct six_lock *); ++void six_lock_pcpu_alloc(struct six_lock *); ++ ++#endif /* _LINUX_SIX_H */ +diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks +index 4198f0273ecd..b2abd9a5d9ab 100644 +--- a/kernel/Kconfig.locks ++++ b/kernel/Kconfig.locks +@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB + config MMIOWB + def_bool y if ARCH_HAS_MMIOWB + depends on SMP ++ ++config SIXLOCKS ++ bool +diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile +index d51cabf28f38..cadbf6520c4b 100644 +--- a/kernel/locking/Makefile ++++ b/kernel/locking/Makefile +@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o + obj-$(CONFIG_LOCK_TORTURE_TEST) += 
locktorture.o + obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o + obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o ++obj-$(CONFIG_SIXLOCKS) += six.o +diff --git a/kernel/locking/six.c b/kernel/locking/six.c +new file mode 100644 +index 000000000000..fca1208720b6 +--- /dev/null ++++ b/kernel/locking/six.c +@@ -0,0 +1,759 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef DEBUG ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) do {} while (0) ++#endif ++ ++#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) ++#define six_release(l) lock_release(l, _RET_IP_) ++ ++struct six_lock_vals { ++ /* Value we add to the lock in order to take the lock: */ ++ u64 lock_val; ++ ++ /* If the lock has this value (used as a mask), taking the lock fails: */ ++ u64 lock_fail; ++ ++ /* Value we add to the lock in order to release the lock: */ ++ u64 unlock_val; ++ ++ /* Mask that indicates lock is held for this type: */ ++ u64 held_mask; ++ ++ /* Waitlist we wakeup when releasing the lock: */ ++ enum six_lock_type unlock_wakeup; ++}; ++ ++#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) ++#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) ++#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) ++ ++#define LOCK_VALS { \ ++ [SIX_LOCK_read] = { \ ++ .lock_val = __SIX_VAL(read_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\ ++ .unlock_val = -__SIX_VAL(read_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_read, \ ++ .unlock_wakeup = SIX_LOCK_write, \ ++ }, \ ++ [SIX_LOCK_intent] = { \ ++ .lock_val = __SIX_VAL(intent_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_intent, \ ++ .unlock_val = -__SIX_VAL(intent_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_intent, \ ++ .unlock_wakeup = SIX_LOCK_intent, \ ++ }, \ ++ [SIX_LOCK_write] = { \ ++ .lock_val = __SIX_VAL(seq, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_read, \ ++ .unlock_val = __SIX_VAL(seq, 1), \ ++ .held_mask = __SIX_LOCK_HELD_write, \ ++ .unlock_wakeup = SIX_LOCK_read, \ ++ }, \ ++} ++ ++static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, ++ union six_lock_state old) ++{ ++ if (type != SIX_LOCK_intent) ++ return; ++ ++ if (!old.intent_lock) { ++ EBUG_ON(lock->owner); ++ lock->owner = current; ++ } else { ++ EBUG_ON(lock->owner != current); ++ } ++} ++ ++static inline unsigned pcpu_read_count(struct six_lock *lock) ++{ ++ unsigned read_count = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ read_count += *per_cpu_ptr(lock->readers, cpu); ++ return read_count; ++} ++ ++struct six_lock_waiter { ++ struct list_head list; ++ struct task_struct *task; ++}; ++ ++/* This is probably up there with the more evil things I've done */ ++#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) ++ ++static inline void six_lock_wakeup(struct six_lock *lock, ++ union six_lock_state state, ++ unsigned waitlist_id) ++{ ++ if (waitlist_id == SIX_LOCK_write) { ++ if (state.write_locking && !state.read_lock) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ if (p) ++ wake_up_process(p); ++ } ++ } else { ++ struct list_head *wait_list = &lock->wait_list[waitlist_id]; ++ struct six_lock_waiter *w, *next; ++ ++ if (!(state.waiters & (1 << waitlist_id))) ++ return; ++ ++ clear_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry_safe(w, next, wait_list, list) { ++ 
list_del_init(&w->list); ++ ++ if (wake_up_process(w->task) && ++ waitlist_id != SIX_LOCK_read) { ++ if (!list_empty(wait_list)) ++ set_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ break; ++ } ++ } ++ ++ raw_spin_unlock(&lock->wait_lock); ++ } ++} ++ ++static __always_inline bool do_six_trylock_type(struct six_lock *lock, ++ enum six_lock_type type, ++ bool try) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old, new; ++ bool ret; ++ u64 v; ++ ++ EBUG_ON(type == SIX_LOCK_write && lock->owner != current); ++ EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1)); ++ ++ EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); ++ ++ /* ++ * Percpu reader mode: ++ * ++ * The basic idea behind this algorithm is that you can implement a lock ++ * between two threads without any atomics, just memory barriers: ++ * ++ * For two threads you'll need two variables, one variable for "thread a ++ * has the lock" and another for "thread b has the lock". ++ * ++ * To take the lock, a thread sets its variable indicating that it holds ++ * the lock, then issues a full memory barrier, then reads from the ++ * other thread's variable to check if the other thread thinks it has ++ * the lock. If we raced, we backoff and retry/sleep. ++ */ ++ ++ if (type == SIX_LOCK_read && lock->readers) { ++retry: ++ preempt_disable(); ++ this_cpu_inc(*lock->readers); /* signal that we own lock */ ++ ++ smp_mb(); ++ ++ old.v = READ_ONCE(lock->state.v); ++ ret = !(old.v & l[type].lock_fail); ++ ++ this_cpu_sub(*lock->readers, !ret); ++ preempt_enable(); ++ ++ /* ++ * If we failed because a writer was trying to take the ++ * lock, issue a wakeup because we might have caused a ++ * spurious trylock failure: ++ */ ++ if (old.write_locking) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ ++ if (p) ++ wake_up_process(p); ++ } ++ ++ /* ++ * If we failed from the lock path and the waiting bit wasn't ++ * set, set it: ++ */ ++ if (!try && !ret) { ++ v = old.v; ++ ++ do { ++ new.v = old.v = v; ++ ++ if (!(old.v & l[type].lock_fail)) ++ goto retry; ++ ++ if (new.waiters & (1 << type)) ++ break; ++ ++ new.waiters |= 1 << type; ++ } while ((v = atomic64_cmpxchg(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ } ++ } else if (type == SIX_LOCK_write && lock->readers) { ++ if (try) { ++ atomic64_add(__SIX_VAL(write_locking, 1), ++ &lock->state.counter); ++ smp_mb__after_atomic(); ++ } ++ ++ ret = !pcpu_read_count(lock); ++ ++ /* ++ * On success, we increment lock->seq; also we clear ++ * write_locking unless we failed from the lock path: ++ */ ++ v = 0; ++ if (ret) ++ v += __SIX_VAL(seq, 1); ++ if (ret || try) ++ v -= __SIX_VAL(write_locking, 1); ++ ++ if (try && !ret) { ++ old.v = atomic64_add_return(v, &lock->state.counter); ++ six_lock_wakeup(lock, old, SIX_LOCK_read); ++ } else { ++ atomic64_add(v, &lock->state.counter); ++ } ++ } else { ++ v = READ_ONCE(lock->state.v); ++ do { ++ new.v = old.v = v; ++ ++ if (!(old.v & l[type].lock_fail)) { ++ new.v += l[type].lock_val; ++ ++ if (type == SIX_LOCK_write) ++ new.write_locking = 0; ++ } else if (!try && type != SIX_LOCK_write && ++ !(new.waiters & (1 << type))) ++ new.waiters |= 1 << type; ++ else ++ break; /* waiting bit already set */ ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ ret = !(old.v & l[type].lock_fail); ++ ++ EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); ++ } ++ ++ if (ret) ++ six_set_owner(lock, type, old); ++ ++ EBUG_ON(type == 
SIX_LOCK_write && (try || ret) && (lock->state.write_locking)); ++ ++ return ret; ++} ++ ++__always_inline __flatten ++static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ if (!do_six_trylock_type(lock, type, true)) ++ return false; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++__always_inline __flatten ++static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old; ++ u64 v; ++ ++ EBUG_ON(type == SIX_LOCK_write); ++ ++ if (type == SIX_LOCK_read && ++ lock->readers) { ++ bool ret; ++ ++ preempt_disable(); ++ this_cpu_inc(*lock->readers); ++ ++ smp_mb(); ++ ++ old.v = READ_ONCE(lock->state.v); ++ ret = !(old.v & l[type].lock_fail) && old.seq == seq; ++ ++ this_cpu_sub(*lock->readers, !ret); ++ preempt_enable(); ++ ++ /* ++ * Similar to the lock path, we may have caused a spurious write ++ * lock fail and need to issue a wakeup: ++ */ ++ if (old.write_locking) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ ++ if (p) ++ wake_up_process(p); ++ } ++ ++ if (ret) ++ six_acquire(&lock->dep_map, 1); ++ ++ return ret; ++ } ++ ++ v = READ_ONCE(lock->state.v); ++ do { ++ old.v = v; ++ ++ if (old.seq != seq || old.v & l[type].lock_fail) ++ return false; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, ++ old.v + l[type].lock_val)) != old.v); ++ ++ six_set_owner(lock, type, old); ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++#ifdef CONFIG_LOCK_SPIN_ON_OWNER ++ ++static inline int six_can_spin_on_owner(struct six_lock *lock) ++{ ++ struct task_struct *owner; ++ int retval = 1; ++ ++ if (need_resched()) ++ return 0; ++ ++ rcu_read_lock(); ++ owner = READ_ONCE(lock->owner); ++ if (owner) ++ retval = owner->on_cpu; ++ rcu_read_unlock(); ++ /* ++ * if lock->owner is not set, the mutex owner may have just acquired ++ * it and not set the owner yet or the mutex has been released. ++ */ ++ return retval; ++} ++ ++static inline bool six_spin_on_owner(struct six_lock *lock, ++ struct task_struct *owner) ++{ ++ bool ret = true; ++ ++ rcu_read_lock(); ++ while (lock->owner == owner) { ++ /* ++ * Ensure we emit the owner->on_cpu, dereference _after_ ++ * checking lock->owner still matches owner. If that fails, ++ * owner might point to freed memory. If it still matches, ++ * the rcu_read_lock() ensures the memory stays valid. ++ */ ++ barrier(); ++ ++ if (!owner->on_cpu || need_resched()) { ++ ret = false; ++ break; ++ } ++ ++ cpu_relax(); ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ struct task_struct *task = current; ++ ++ if (type == SIX_LOCK_write) ++ return false; ++ ++ preempt_disable(); ++ if (!six_can_spin_on_owner(lock)) ++ goto fail; ++ ++ if (!osq_lock(&lock->osq)) ++ goto fail; ++ ++ while (1) { ++ struct task_struct *owner; ++ ++ /* ++ * If there's an owner, wait for it to either ++ * release the lock or go to sleep. ++ */ ++ owner = READ_ONCE(lock->owner); ++ if (owner && !six_spin_on_owner(lock, owner)) ++ break; ++ ++ if (do_six_trylock_type(lock, type, false)) { ++ osq_unlock(&lock->osq); ++ preempt_enable(); ++ return true; ++ } ++ ++ /* ++ * When there's no owner, we might have preempted between the ++ * owner acquiring the lock and setting the owner field. 
If ++ * we're an RT task that will live-lock because we won't let ++ * the owner complete. ++ */ ++ if (!owner && (need_resched() || rt_task(task))) ++ break; ++ ++ /* ++ * The cpu_relax() call is a compiler barrier which forces ++ * everything in this loop to be re-loaded. We don't need ++ * memory barriers as we'll eventually observe the right ++ * values at the cost of a few extra spins. ++ */ ++ cpu_relax(); ++ } ++ ++ osq_unlock(&lock->osq); ++fail: ++ preempt_enable(); ++ ++ /* ++ * If we fell out of the spin path because of need_resched(), ++ * reschedule now, before we try-lock again. This avoids getting ++ * scheduled out right after we obtained the lock. ++ */ ++ if (need_resched()) ++ schedule(); ++ ++ return false; ++} ++ ++#else /* CONFIG_LOCK_SPIN_ON_OWNER */ ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ return false; ++} ++ ++#endif ++ ++noinline ++static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ union six_lock_state old; ++ struct six_lock_waiter wait; ++ int ret = 0; ++ ++ if (type == SIX_LOCK_write) { ++ EBUG_ON(lock->state.write_locking); ++ atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter); ++ smp_mb__after_atomic(); ++ } ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ goto out_before_sleep; ++ ++ if (six_optimistic_spin(lock, type)) ++ goto out_before_sleep; ++ ++ lock_contended(&lock->dep_map, _RET_IP_); ++ ++ INIT_LIST_HEAD(&wait.list); ++ wait.task = current; ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (type == SIX_LOCK_write) ++ EBUG_ON(lock->owner != current); ++ else if (list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_add_tail(&wait.list, &lock->wait_list[type]); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++ ++ if (do_six_trylock_type(lock, type, false)) ++ break; ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ break; ++ ++ schedule(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (!list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_del_init(&wait.list); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++out_before_sleep: ++ if (ret && type == SIX_LOCK_write) { ++ old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), ++ &lock->state.counter); ++ six_lock_wakeup(lock, old, SIX_LOCK_read); ++ } ++ ++ return ret; ++} ++ ++__always_inline ++static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ int ret; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 0); ++ ++ ret = do_six_trylock_type(lock, type, true) ? 
0 ++ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); ++ ++ if (ret && type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ if (!ret) ++ lock_acquired(&lock->dep_map, _RET_IP_); ++ ++ return ret; ++} ++ ++__always_inline __flatten ++static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state state; ++ ++ EBUG_ON(type == SIX_LOCK_write && ++ !(lock->state.v & __SIX_LOCK_HELD_intent)); ++ ++ if (type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ ++ if (type == SIX_LOCK_intent) { ++ EBUG_ON(lock->owner != current); ++ ++ if (lock->intent_lock_recurse) { ++ --lock->intent_lock_recurse; ++ return; ++ } ++ ++ lock->owner = NULL; ++ } ++ ++ if (type == SIX_LOCK_read && ++ lock->readers) { ++ smp_mb(); /* unlock barrier */ ++ this_cpu_dec(*lock->readers); ++ smp_mb(); /* between unlocking and checking for waiters */ ++ state.v = READ_ONCE(lock->state.v); ++ } else { ++ EBUG_ON(!(lock->state.v & l[type].held_mask)); ++ state.v = atomic64_add_return_release(l[type].unlock_val, ++ &lock->state.counter); ++ } ++ ++ six_lock_wakeup(lock, state, l[type].unlock_wakeup); ++} ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *lock) \ ++{ \ ++ return __six_trylock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_trylock_##type); \ ++ \ ++bool six_relock_##type(struct six_lock *lock, u32 seq) \ ++{ \ ++ return __six_relock_type(lock, SIX_LOCK_##type, seq); \ ++} \ ++EXPORT_SYMBOL_GPL(six_relock_##type); \ ++ \ ++int six_lock_##type(struct six_lock *lock, \ ++ six_lock_should_sleep_fn should_sleep_fn, void *p) \ ++{ \ ++ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ ++} \ ++EXPORT_SYMBOL_GPL(six_lock_##type); \ ++ \ ++void six_unlock_##type(struct six_lock *lock) \ ++{ \ ++ __six_unlock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_unlock_##type); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++ ++#undef __SIX_LOCK ++ ++/* Convert from intent to read: */ ++void six_lock_downgrade(struct six_lock *lock) ++{ ++ six_lock_increment(lock, SIX_LOCK_read); ++ six_unlock_intent(lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_downgrade); ++ ++bool six_lock_tryupgrade(struct six_lock *lock) ++{ ++ union six_lock_state old, new; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ do { ++ new.v = old.v = v; ++ ++ if (new.intent_lock) ++ return false; ++ ++ if (!lock->readers) { ++ EBUG_ON(!new.read_lock); ++ new.read_lock--; ++ } ++ ++ new.intent_lock = 1; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ if (lock->readers) ++ this_cpu_dec(*lock->readers); ++ ++ six_set_owner(lock, SIX_LOCK_intent, old); ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_lock_tryupgrade); ++ ++bool six_trylock_convert(struct six_lock *lock, ++ enum six_lock_type from, ++ enum six_lock_type to) ++{ ++ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); ++ ++ if (to == from) ++ return true; ++ ++ if (to == SIX_LOCK_read) { ++ six_lock_downgrade(lock); ++ return true; ++ } else { ++ return six_lock_tryupgrade(lock); ++ } ++} ++EXPORT_SYMBOL_GPL(six_trylock_convert); ++ ++/* ++ * Increment read/intent lock count, assuming we already have it read or intent ++ * locked: ++ */ ++void six_lock_increment(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ ++ six_acquire(&lock->dep_map, 0); ++ ++ /* XXX: assert already locked, and that we don't overflow: */ ++ 
++ switch (type) { ++ case SIX_LOCK_read: ++ if (lock->readers) { ++ this_cpu_inc(*lock->readers); ++ } else { ++ EBUG_ON(!lock->state.read_lock && ++ !lock->state.intent_lock); ++ atomic64_add(l[type].lock_val, &lock->state.counter); ++ } ++ break; ++ case SIX_LOCK_intent: ++ EBUG_ON(!lock->state.intent_lock); ++ lock->intent_lock_recurse++; ++ break; ++ case SIX_LOCK_write: ++ BUG(); ++ break; ++ } ++} ++EXPORT_SYMBOL_GPL(six_lock_increment); ++ ++void six_lock_wakeup_all(struct six_lock *lock) ++{ ++ struct six_lock_waiter *w; ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry(w, &lock->wait_list[0], list) ++ wake_up_process(w->task); ++ list_for_each_entry(w, &lock->wait_list[1], list) ++ wake_up_process(w->task); ++ ++ raw_spin_unlock(&lock->wait_lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_wakeup_all); ++ ++struct free_pcpu_rcu { ++ struct rcu_head rcu; ++ void __percpu *p; ++}; ++ ++static void free_pcpu_rcu_fn(struct rcu_head *_rcu) ++{ ++ struct free_pcpu_rcu *rcu = ++ container_of(_rcu, struct free_pcpu_rcu, rcu); ++ ++ free_percpu(rcu->p); ++ kfree(rcu); ++} ++ ++void six_lock_pcpu_free_rcu(struct six_lock *lock) ++{ ++ struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL); ++ ++ if (!rcu) ++ return; ++ ++ rcu->p = lock->readers; ++ lock->readers = NULL; ++ ++ call_rcu(&rcu->rcu, free_pcpu_rcu_fn); ++} ++EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu); ++ ++void six_lock_pcpu_free(struct six_lock *lock) ++{ ++ BUG_ON(lock->readers && pcpu_read_count(lock)); ++ BUG_ON(lock->state.read_lock); ++ ++ free_percpu(lock->readers); ++ lock->readers = NULL; ++} ++EXPORT_SYMBOL_GPL(six_lock_pcpu_free); ++ ++void six_lock_pcpu_alloc(struct six_lock *lock) ++{ ++#ifdef __KERNEL__ ++ if (!lock->readers) ++ lock->readers = alloc_percpu(unsigned); ++#endif ++} ++EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); +-- +cgit v1.2.3 + + +From ea5782966aac3549bbde9dba427c6ec53d29072f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Jul 2016 18:20:03 -0800 +Subject: mm: export find_get_pages_range() + +Needed for bcachefs + +Signed-off-by: Kent Overstreet +--- + mm/filemap.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/mm/filemap.c b/mm/filemap.c +index 39c4c46c6133..4487d1710def 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -2196,6 +2196,7 @@ out: + + return ret; + } ++EXPORT_SYMBOL(find_get_pages_range); + + /** + * find_get_pages_contig - gang contiguous pagecache lookup +-- +cgit v1.2.3 + + +From bbb8373f42540fddfb60da461bed64b968358ba6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 16 Oct 2019 15:03:50 -0400 +Subject: sched: Add task_struct->faults_disabled_mapping + +This is needed to fix a page cache coherency issue with O_DIRECT writes. + +O_DIRECT writes (and other filesystem operations that modify file data +while bypassing the page cache) need to shoot down ranges of the page +cache - and additionally, need locking to prevent those pages from +pulled back in. + +But O_DIRECT writes invoke the page fault handler (via get_user_pages), +and the page fault handler will need to take that same lock - this is a +classic recursive deadlock if userspace has mmaped the file they're DIO +writing to and uses those pages for the buffer to write from, and it's a +lock ordering deadlock in general. + +Thus we need a way to signal from the dio code to the page fault handler +when we already are holding the pagecache add lock on an address space - +this patch just adds a member to task_struct for this purpose. 
For now +only bcachefs is implementing this locking, though it may be moved out +of bcachefs and made available to other filesystems in the future. + +Signed-off-by: Kent Overstreet +--- + include/linux/sched.h | 1 + + init/init_task.c | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 78c351e35fec..2d4dd8066883 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -854,6 +854,7 @@ struct task_struct { + + struct mm_struct *mm; + struct mm_struct *active_mm; ++ struct address_space *faults_disabled_mapping; + + /* Per-thread vma caching: */ + struct vmacache vmacache; +diff --git a/init/init_task.c b/init/init_task.c +index 2d024066e27b..04c8c3d80cf7 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -85,6 +85,7 @@ struct task_struct init_task + .nr_cpus_allowed= NR_CPUS, + .mm = NULL, + .active_mm = &init_mm, ++ .faults_disabled_mapping = NULL, + .restart_block = { + .fn = do_no_restart_syscall, + }, +-- +cgit v1.2.3 + + +From de0e8899aff15e26049e85ef02e1ced5cb2ac247 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 24 Aug 2020 16:11:22 -0400 +Subject: mm: Bring back vmalloc_exec + +This is needed for bcachefs, which dynamically generates per-btree node +unpack functions. + +This reverts commit 7a0e27b2a0ce2735e27e21ebc8b777550fe0ed81. + +Signed-off-by: Kent Overstreet +--- + include/linux/vmalloc.h | 1 + + kernel/module.c | 4 +--- + mm/nommu.c | 18 ++++++++++++++++++ + mm/vmalloc.c | 21 +++++++++++++++++++++ + 4 files changed, 41 insertions(+), 3 deletions(-) + +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 6e022cc712e6..7095928a5152 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -141,6 +141,7 @@ extern void *vzalloc(unsigned long size) __alloc_size(1); + extern void *vmalloc_user(unsigned long size) __alloc_size(1); + extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); + extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); ++extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) __alloc_size(1); + extern void *vmalloc_32(unsigned long size) __alloc_size(1); + extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); + extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +diff --git a/kernel/module.c b/kernel/module.c +index 84a9141a5e15..ffa9c2318fd8 100644 +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -2835,9 +2835,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) + + void * __weak module_alloc(unsigned long size) + { +- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, +- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, +- NUMA_NO_NODE, __builtin_return_address(0)); ++ return vmalloc_exec(size, GFP_KERNEL); + } + + bool __weak module_init_section(const char *name) +diff --git a/mm/nommu.c b/mm/nommu.c +index 55a9e48a7a02..7037d53711b1 100644 +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -279,6 +279,24 @@ void *vzalloc_node(unsigned long size, int node) + } + EXPORT_SYMBOL(vzalloc_node); + ++/** ++ * vmalloc_exec - allocate virtually contiguous, executable memory ++ * @size: allocation size ++ * ++ * Kernel-internal function to allocate enough pages to cover @size ++ * the page level allocator and map them into contiguous and ++ * executable kernel virtual space. ++ * ++ * For tight control over page level allocator and protection flags ++ * use __vmalloc() instead. 
++ */ ++ ++void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) ++{ ++ return __vmalloc(size, gfp_mask); ++} ++EXPORT_SYMBOL_GPL(vmalloc_exec); ++ + /** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index d2a00ad4e1dd..f152e3c0630a 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -3242,6 +3242,27 @@ void *vzalloc_node(unsigned long size, int node) + } + EXPORT_SYMBOL(vzalloc_node); + ++/** ++ * vmalloc_exec - allocate virtually contiguous, executable memory ++ * @size: allocation size ++ * ++ * Kernel-internal function to allocate enough pages to cover @size ++ * the page level allocator and map them into contiguous and ++ * executable kernel virtual space. ++ * ++ * For tight control over page level allocator and protection flags ++ * use __vmalloc() instead. ++ * ++ * Return: pointer to the allocated memory or %NULL on error ++ */ ++void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) ++{ ++ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, ++ gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, ++ NUMA_NO_NODE, __builtin_return_address(0)); ++} ++EXPORT_SYMBOL_GPL(vmalloc_exec); ++ + #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) + #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) + #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) +-- +cgit v1.2.3 + + +From 2a70da71ca4297dd1450f94a83216bd444ceac60 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 9 Jul 2018 23:27:33 -0400 +Subject: fs: factor out d_mark_tmpfile() + +New helper for bcachefs - bcachefs doesn't want the +inode_dec_link_count() call that d_tmpfile does, it handles i_nlink on +its own atomically with other btree updates + +Signed-off-by: Kent Overstreet +--- + fs/dcache.c | 10 ++++++++-- + include/linux/dcache.h | 1 + + 2 files changed, 9 insertions(+), 2 deletions(-) + +diff --git a/fs/dcache.c b/fs/dcache.c +index cf871a81f4fd..30910dae37ad 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -3167,9 +3167,8 @@ void d_genocide(struct dentry *parent) + + EXPORT_SYMBOL(d_genocide); + +-void d_tmpfile(struct dentry *dentry, struct inode *inode) ++void d_mark_tmpfile(struct dentry *dentry, struct inode *inode) + { +- inode_dec_link_count(inode); + BUG_ON(dentry->d_name.name != dentry->d_iname || + !hlist_unhashed(&dentry->d_u.d_alias) || + !d_unlinked(dentry)); +@@ -3179,6 +3178,13 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) + (unsigned long long)inode->i_ino); + spin_unlock(&dentry->d_lock); + spin_unlock(&dentry->d_parent->d_lock); ++} ++EXPORT_SYMBOL(d_mark_tmpfile); ++ ++void d_tmpfile(struct dentry *dentry, struct inode *inode) ++{ ++ inode_dec_link_count(inode); ++ d_mark_tmpfile(dentry, inode); + d_instantiate(dentry, inode); + } + EXPORT_SYMBOL(d_tmpfile); +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index 9e23d33bb6f1..b3e3ff7930b5 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -258,6 +258,7 @@ extern struct dentry * d_make_root(struct inode *); + /* - the ramfs-type tree */ + extern void d_genocide(struct dentry *); + ++extern void d_mark_tmpfile(struct dentry *, struct inode *); + extern void d_tmpfile(struct dentry *, struct inode *); + + extern struct dentry *d_find_alias(struct inode *); +-- +cgit v1.2.3 + + +From 132e59c1c24cc442245dc5005726c181b85dc7f0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 4 Apr 2019 12:02:01 -0400 +Subject: block: Add some exports for bcachefs + +bcachefs has its own direct IO 
code. + +Signed-off-by: Kent Overstreet +--- + block/bio.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/block/bio.c b/block/bio.c +index 15ab0d6d1c06..74c6be7dd6dd 100644 +--- a/block/bio.c ++++ b/block/bio.c +@@ -1361,6 +1361,7 @@ void bio_set_pages_dirty(struct bio *bio) + set_page_dirty_lock(bvec->bv_page); + } + } ++EXPORT_SYMBOL_GPL(bio_set_pages_dirty); + + /* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. +@@ -1420,6 +1421,7 @@ defer: + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); + } ++EXPORT_SYMBOL_GPL(bio_check_pages_dirty); + + static inline bool bio_remaining_done(struct bio *bio) + { +-- +cgit v1.2.3 + + +From 37ea65b42bacf63b09ee03c38ea71a73d6906782 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 2 Jul 2020 13:37:43 -0400 +Subject: block: Add blk_status_to_str() + +If we're going to the trouble of having these nice error strings, let's +make them available. + +Signed-off-by: Kent Overstreet +--- + block/blk-core.c | 11 ++++++++--- + include/linux/blkdev.h | 1 + + 2 files changed, 9 insertions(+), 3 deletions(-) + +diff --git a/block/blk-core.c b/block/blk-core.c +index 1378d084c770..3f8103b0c1d2 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -216,17 +216,22 @@ int blk_status_to_errno(blk_status_t status) + } + EXPORT_SYMBOL_GPL(blk_status_to_errno); + +-void blk_print_req_error(struct request *req, blk_status_t status) ++const char *blk_status_to_str(blk_status_t status) + { + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) +- return; ++ return "(invalid error)"; ++ return blk_errors[idx].name; ++} ++EXPORT_SYMBOL_GPL(blk_status_to_str); + ++void blk_print_req_error(struct request *req, blk_status_t status) ++{ + printk_ratelimited(KERN_ERR + "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "phys_seg %u prio class %u\n", +- blk_errors[idx].name, ++ blk_status_to_str(status), + req->rq_disk ? 
req->rq_disk->disk_name : "?", + blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), + req->cmd_flags & ~REQ_OP_MASK, +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index bd4370baccca..cecba77f289e 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -604,6 +604,7 @@ extern const char *blk_op_str(unsigned int op); + + int blk_status_to_errno(blk_status_t status); + blk_status_t errno_to_blk_status(int errno); ++const char *blk_status_to_str(blk_status_t status); + + /* only poll the hardware once, don't continue until a completion was found */ + #define BLK_POLL_ONESHOT (1 << 0) +-- +cgit v1.2.3 + + +From d7d1818e62458e76c25fcaca00864e22ab3234c9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 17 Mar 2017 16:35:23 -0800 +Subject: bcache: move closures to lib/ + +Prep work for bcachefs - being a fork of bcache it also uses closures + +Signed-off-by: Kent Overstreet +Acked-by: Coly Li +--- + drivers/md/bcache/Kconfig | 10 +- + drivers/md/bcache/Makefile | 4 +- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/closure.c | 207 ------------------------ + drivers/md/bcache/closure.h | 378 -------------------------------------------- + drivers/md/bcache/super.c | 1 - + drivers/md/bcache/util.h | 3 +- + include/linux/closure.h | 377 +++++++++++++++++++++++++++++++++++++++++++ + lib/Kconfig | 3 + + lib/Kconfig.debug | 9 ++ + lib/Makefile | 2 + + lib/closure.c | 204 ++++++++++++++++++++++++ + 12 files changed, 600 insertions(+), 600 deletions(-) + delete mode 100644 drivers/md/bcache/closure.c + delete mode 100644 drivers/md/bcache/closure.h + create mode 100644 include/linux/closure.h + create mode 100644 lib/closure.c + +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index cf3e8096942a..f1a1f0c4a0ea 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -4,6 +4,7 @@ config BCACHE + tristate "Block device as cache" + select BLOCK_HOLDER_DEPRECATED if SYSFS + select CRC64 ++ select CLOSURES + help + Allows a block device to be used as cache for other devices; uses + a btree for indexing and the layout is optimized for SSDs. +@@ -19,15 +20,6 @@ config BCACHE_DEBUG + Enables extra debugging tools, allows expensive runtime checks to be + turned on. + +-config BCACHE_CLOSURES_DEBUG +- bool "Debug closures" +- depends on BCACHE +- select DEBUG_FS +- help +- Keeps all active closures in a linked list and provides a debugfs +- interface to list them, which makes it possible to see asynchronous +- operations that get stuck. 
+- + config BCACHE_ASYNC_REGISTRATION + bool "Asynchronous device registration (EXPERIMENTAL)" + depends on BCACHE +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..054e8a33a7ab 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -2,6 +2,6 @@ + + obj-$(CONFIG_BCACHE) += bcache.o + +-bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ +- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ ++bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ ++ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 9ed9c955add7..dbb72beb036c 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -179,6 +179,7 @@ + #define pr_fmt(fmt) "bcache: %s() " fmt, __func__ + + #include ++#include + #include + #include + #include +@@ -192,7 +193,6 @@ + #include "bcache_ondisk.h" + #include "bset.h" + #include "util.h" +-#include "closure.h" + + struct bucket { + atomic_t pin; +diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c +deleted file mode 100644 +index d8d9394a6beb..000000000000 +--- a/drivers/md/bcache/closure.c ++++ /dev/null +@@ -1,207 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * Asynchronous refcounty things +- * +- * Copyright 2010, 2011 Kent Overstreet +- * Copyright 2012 Google, Inc. +- */ +- +-#include +-#include +-#include +-#include +- +-#include "closure.h" +- +-static inline void closure_put_after_sub(struct closure *cl, int flags) +-{ +- int r = flags & CLOSURE_REMAINING_MASK; +- +- BUG_ON(flags & CLOSURE_GUARD_MASK); +- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); +- +- if (!r) { +- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { +- atomic_set(&cl->remaining, +- CLOSURE_REMAINING_INITIALIZER); +- closure_queue(cl); +- } else { +- struct closure *parent = cl->parent; +- closure_fn *destructor = cl->fn; +- +- closure_debug_destroy(cl); +- +- if (destructor) +- destructor(cl); +- +- if (parent) +- closure_put(parent); +- } +- } +-} +- +-/* For clearing flags with the same atomic op as a put */ +-void closure_sub(struct closure *cl, int v) +-{ +- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); +-} +- +-/* +- * closure_put - decrement a closure's refcount +- */ +-void closure_put(struct closure *cl) +-{ +- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); +-} +- +-/* +- * closure_wake_up - wake up all closures on a wait list, without memory barrier +- */ +-void __closure_wake_up(struct closure_waitlist *wait_list) +-{ +- struct llist_node *list; +- struct closure *cl, *t; +- struct llist_node *reverse = NULL; +- +- list = llist_del_all(&wait_list->list); +- +- /* We first reverse the list to preserve FIFO ordering and fairness */ +- reverse = llist_reverse_order(list); +- +- /* Then do the wakeups */ +- llist_for_each_entry_safe(cl, t, reverse, list) { +- closure_set_waiting(cl, 0); +- closure_sub(cl, CLOSURE_WAITING + 1); +- } +-} +- +-/** +- * closure_wait - add a closure to a waitlist +- * @waitlist: will own a ref on @cl, which will be released when +- * closure_wake_up() is called on @waitlist. +- * @cl: closure pointer. 
+- * +- */ +-bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +-{ +- if (atomic_read(&cl->remaining) & CLOSURE_WAITING) +- return false; +- +- closure_set_waiting(cl, _RET_IP_); +- atomic_add(CLOSURE_WAITING + 1, &cl->remaining); +- llist_add(&cl->list, &waitlist->list); +- +- return true; +-} +- +-struct closure_syncer { +- struct task_struct *task; +- int done; +-}; +- +-static void closure_sync_fn(struct closure *cl) +-{ +- struct closure_syncer *s = cl->s; +- struct task_struct *p; +- +- rcu_read_lock(); +- p = READ_ONCE(s->task); +- s->done = 1; +- wake_up_process(p); +- rcu_read_unlock(); +-} +- +-void __sched __closure_sync(struct closure *cl) +-{ +- struct closure_syncer s = { .task = current }; +- +- cl->s = &s; +- continue_at(cl, closure_sync_fn, NULL); +- +- while (1) { +- set_current_state(TASK_UNINTERRUPTIBLE); +- if (s.done) +- break; +- schedule(); +- } +- +- __set_current_state(TASK_RUNNING); +-} +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- +-static LIST_HEAD(closure_list); +-static DEFINE_SPINLOCK(closure_list_lock); +- +-void closure_debug_create(struct closure *cl) +-{ +- unsigned long flags; +- +- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); +- cl->magic = CLOSURE_MAGIC_ALIVE; +- +- spin_lock_irqsave(&closure_list_lock, flags); +- list_add(&cl->all, &closure_list); +- spin_unlock_irqrestore(&closure_list_lock, flags); +-} +- +-void closure_debug_destroy(struct closure *cl) +-{ +- unsigned long flags; +- +- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); +- cl->magic = CLOSURE_MAGIC_DEAD; +- +- spin_lock_irqsave(&closure_list_lock, flags); +- list_del(&cl->all); +- spin_unlock_irqrestore(&closure_list_lock, flags); +-} +- +-static struct dentry *closure_debug; +- +-static int debug_show(struct seq_file *f, void *data) +-{ +- struct closure *cl; +- +- spin_lock_irq(&closure_list_lock); +- +- list_for_each_entry(cl, &closure_list, all) { +- int r = atomic_read(&cl->remaining); +- +- seq_printf(f, "%p: %pS -> %pS p %p r %i ", +- cl, (void *) cl->ip, cl->fn, cl->parent, +- r & CLOSURE_REMAINING_MASK); +- +- seq_printf(f, "%s%s\n", +- test_bit(WORK_STRUCT_PENDING_BIT, +- work_data_bits(&cl->work)) ? "Q" : "", +- r & CLOSURE_RUNNING ? "R" : ""); +- +- if (r & CLOSURE_WAITING) +- seq_printf(f, " W %pS\n", +- (void *) cl->waiting_on); +- +- seq_printf(f, "\n"); +- } +- +- spin_unlock_irq(&closure_list_lock); +- return 0; +-} +- +-DEFINE_SHOW_ATTRIBUTE(debug); +- +-void __init closure_debug_init(void) +-{ +- if (!IS_ERR_OR_NULL(bcache_debug)) +- /* +- * it is unnecessary to check return value of +- * debugfs_create_file(), we should not care +- * about this. +- */ +- closure_debug = debugfs_create_file( +- "closures", 0400, bcache_debug, NULL, &debug_fops); +-} +-#endif +- +-MODULE_AUTHOR("Kent Overstreet "); +-MODULE_LICENSE("GPL"); +diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h +deleted file mode 100644 +index c88cdc4ae4ec..000000000000 +--- a/drivers/md/bcache/closure.h ++++ /dev/null +@@ -1,378 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _LINUX_CLOSURE_H +-#define _LINUX_CLOSURE_H +- +-#include +-#include +-#include +-#include +- +-/* +- * Closure is perhaps the most overused and abused term in computer science, but +- * since I've been unable to come up with anything better you're stuck with it +- * again. +- * +- * What are closures? +- * +- * They embed a refcount. 
The basic idea is they count "things that are in +- * progress" - in flight bios, some other thread that's doing something else - +- * anything you might want to wait on. +- * +- * The refcount may be manipulated with closure_get() and closure_put(). +- * closure_put() is where many of the interesting things happen, when it causes +- * the refcount to go to 0. +- * +- * Closures can be used to wait on things both synchronously and asynchronously, +- * and synchronous and asynchronous use can be mixed without restriction. To +- * wait synchronously, use closure_sync() - you will sleep until your closure's +- * refcount hits 1. +- * +- * To wait asynchronously, use +- * continue_at(cl, next_function, workqueue); +- * +- * passing it, as you might expect, the function to run when nothing is pending +- * and the workqueue to run that function out of. +- * +- * continue_at() also, critically, requires a 'return' immediately following the +- * location where this macro is referenced, to return to the calling function. +- * There's good reason for this. +- * +- * To use safely closures asynchronously, they must always have a refcount while +- * they are running owned by the thread that is running them. Otherwise, suppose +- * you submit some bios and wish to have a function run when they all complete: +- * +- * foo_endio(struct bio *bio) +- * { +- * closure_put(cl); +- * } +- * +- * closure_init(cl); +- * +- * do_stuff(); +- * closure_get(cl); +- * bio1->bi_endio = foo_endio; +- * bio_submit(bio1); +- * +- * do_more_stuff(); +- * closure_get(cl); +- * bio2->bi_endio = foo_endio; +- * bio_submit(bio2); +- * +- * continue_at(cl, complete_some_read, system_wq); +- * +- * If closure's refcount started at 0, complete_some_read() could run before the +- * second bio was submitted - which is almost always not what you want! More +- * importantly, it wouldn't be possible to say whether the original thread or +- * complete_some_read()'s thread owned the closure - and whatever state it was +- * associated with! +- * +- * So, closure_init() initializes a closure's refcount to 1 - and when a +- * closure_fn is run, the refcount will be reset to 1 first. +- * +- * Then, the rule is - if you got the refcount with closure_get(), release it +- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount +- * on a closure because you called closure_init() or you were run out of a +- * closure - _always_ use continue_at(). Doing so consistently will help +- * eliminate an entire class of particularly pernicious races. +- * +- * Lastly, you might have a wait list dedicated to a specific event, and have no +- * need for specifying the condition - you just want to wait until someone runs +- * closure_wake_up() on the appropriate wait list. In that case, just use +- * closure_wait(). It will return either true or false, depending on whether the +- * closure was already on a wait list or not - a closure can only be on one wait +- * list at a time. +- * +- * Parents: +- * +- * closure_init() takes two arguments - it takes the closure to initialize, and +- * a (possibly null) parent. +- * +- * If parent is non null, the new closure will have a refcount for its lifetime; +- * a closure is considered to be "finished" when its refcount hits 0 and the +- * function to run is null. Hence +- * +- * continue_at(cl, NULL, NULL); +- * +- * returns up the (spaghetti) stack of closures, precisely like normal return +- * returns up the C stack. 
continue_at() with non null fn is better thought of +- * as doing a tail call. +- * +- * All this implies that a closure should typically be embedded in a particular +- * struct (which its refcount will normally control the lifetime of), and that +- * struct can very much be thought of as a stack frame. +- */ +- +-struct closure; +-struct closure_syncer; +-typedef void (closure_fn) (struct closure *); +-extern struct dentry *bcache_debug; +- +-struct closure_waitlist { +- struct llist_head list; +-}; +- +-enum closure_state { +- /* +- * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by +- * the thread that owns the closure, and cleared by the thread that's +- * waking up the closure. +- * +- * The rest are for debugging and don't affect behaviour: +- * +- * CLOSURE_RUNNING: Set when a closure is running (i.e. by +- * closure_init() and when closure_put() runs then next function), and +- * must be cleared before remaining hits 0. Primarily to help guard +- * against incorrect usage and accidentally transferring references. +- * continue_at() and closure_return() clear it for you, if you're doing +- * something unusual you can use closure_set_dead() which also helps +- * annotate where references are being transferred. +- */ +- +- CLOSURE_BITS_START = (1U << 26), +- CLOSURE_DESTRUCTOR = (1U << 26), +- CLOSURE_WAITING = (1U << 28), +- CLOSURE_RUNNING = (1U << 30), +-}; +- +-#define CLOSURE_GUARD_MASK \ +- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) +- +-#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +-#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) +- +-struct closure { +- union { +- struct { +- struct workqueue_struct *wq; +- struct closure_syncer *s; +- struct llist_node list; +- closure_fn *fn; +- }; +- struct work_struct work; +- }; +- +- struct closure *parent; +- +- atomic_t remaining; +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +-#define CLOSURE_MAGIC_DEAD 0xc054dead +-#define CLOSURE_MAGIC_ALIVE 0xc054a11e +- +- unsigned int magic; +- struct list_head all; +- unsigned long ip; +- unsigned long waiting_on; +-#endif +-}; +- +-void closure_sub(struct closure *cl, int v); +-void closure_put(struct closure *cl); +-void __closure_wake_up(struct closure_waitlist *list); +-bool closure_wait(struct closure_waitlist *list, struct closure *cl); +-void __closure_sync(struct closure *cl); +- +-/** +- * closure_sync - sleep until a closure a closure has nothing left to wait on +- * +- * Sleeps until the refcount hits 1 - the thread that's running the closure owns +- * the last refcount. 
+- */ +-static inline void closure_sync(struct closure *cl) +-{ +- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) +- __closure_sync(cl); +-} +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- +-void closure_debug_init(void); +-void closure_debug_create(struct closure *cl); +-void closure_debug_destroy(struct closure *cl); +- +-#else +- +-static inline void closure_debug_init(void) {} +-static inline void closure_debug_create(struct closure *cl) {} +-static inline void closure_debug_destroy(struct closure *cl) {} +- +-#endif +- +-static inline void closure_set_ip(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->ip = _THIS_IP_; +-#endif +-} +- +-static inline void closure_set_ret_ip(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->ip = _RET_IP_; +-#endif +-} +- +-static inline void closure_set_waiting(struct closure *cl, unsigned long f) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->waiting_on = f; +-#endif +-} +- +-static inline void closure_set_stopped(struct closure *cl) +-{ +- atomic_sub(CLOSURE_RUNNING, &cl->remaining); +-} +- +-static inline void set_closure_fn(struct closure *cl, closure_fn *fn, +- struct workqueue_struct *wq) +-{ +- closure_set_ip(cl); +- cl->fn = fn; +- cl->wq = wq; +- /* between atomic_dec() in closure_put() */ +- smp_mb__before_atomic(); +-} +- +-static inline void closure_queue(struct closure *cl) +-{ +- struct workqueue_struct *wq = cl->wq; +- /** +- * Changes made to closure, work_struct, or a couple of other structs +- * may cause work.func not pointing to the right location. +- */ +- BUILD_BUG_ON(offsetof(struct closure, fn) +- != offsetof(struct work_struct, func)); +- if (wq) { +- INIT_WORK(&cl->work, cl->work.func); +- BUG_ON(!queue_work(wq, &cl->work)); +- } else +- cl->fn(cl); +-} +- +-/** +- * closure_get - increment a closure's refcount +- */ +-static inline void closure_get(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- BUG_ON((atomic_inc_return(&cl->remaining) & +- CLOSURE_REMAINING_MASK) <= 1); +-#else +- atomic_inc(&cl->remaining); +-#endif +-} +- +-/** +- * closure_init - Initialize a closure, setting the refcount to 1 +- * @cl: closure to initialize +- * @parent: parent of the new closure. cl will take a refcount on it for its +- * lifetime; may be NULL. +- */ +-static inline void closure_init(struct closure *cl, struct closure *parent) +-{ +- memset(cl, 0, sizeof(struct closure)); +- cl->parent = parent; +- if (parent) +- closure_get(parent); +- +- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +- +- closure_debug_create(cl); +- closure_set_ip(cl); +-} +- +-static inline void closure_init_stack(struct closure *cl) +-{ +- memset(cl, 0, sizeof(struct closure)); +- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +-} +- +-/** +- * closure_wake_up - wake up all closures on a wait list, +- * with memory barrier +- */ +-static inline void closure_wake_up(struct closure_waitlist *list) +-{ +- /* Memory barrier for the wait list */ +- smp_mb(); +- __closure_wake_up(list); +-} +- +-/** +- * continue_at - jump to another function with barrier +- * +- * After @cl is no longer waiting on anything (i.e. all outstanding refs have +- * been dropped with closure_put()), it will resume execution at @fn running out +- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). 
+- * +- * This is because after calling continue_at() you no longer have a ref on @cl, +- * and whatever @cl owns may be freed out from under you - a running closure fn +- * has a ref on its own closure which continue_at() drops. +- * +- * Note you are expected to immediately return after using this macro. +- */ +-#define continue_at(_cl, _fn, _wq) \ +-do { \ +- set_closure_fn(_cl, _fn, _wq); \ +- closure_sub(_cl, CLOSURE_RUNNING + 1); \ +-} while (0) +- +-/** +- * closure_return - finish execution of a closure +- * +- * This is used to indicate that @cl is finished: when all outstanding refs on +- * @cl have been dropped @cl's ref on its parent closure (as passed to +- * closure_init()) will be dropped, if one was specified - thus this can be +- * thought of as returning to the parent closure. +- */ +-#define closure_return(_cl) continue_at((_cl), NULL, NULL) +- +-/** +- * continue_at_nobarrier - jump to another function without barrier +- * +- * Causes @fn to be executed out of @cl, in @wq context (or called directly if +- * @wq is NULL). +- * +- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, +- * thus it's not safe to touch anything protected by @cl after a +- * continue_at_nobarrier(). +- */ +-#define continue_at_nobarrier(_cl, _fn, _wq) \ +-do { \ +- set_closure_fn(_cl, _fn, _wq); \ +- closure_queue(_cl); \ +-} while (0) +- +-/** +- * closure_return_with_destructor - finish execution of a closure, +- * with destructor +- * +- * Works like closure_return(), except @destructor will be called when all +- * outstanding refs on @cl have been dropped; @destructor may be used to safely +- * free the memory occupied by @cl, and it is called with the ref on the parent +- * closure still held - so @destructor could safely return an item to a +- * freelist protected by @cl's parent. +- */ +-#define closure_return_with_destructor(_cl, _destructor) \ +-do { \ +- set_closure_fn(_cl, _destructor, NULL); \ +- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ +-} while (0) +- +-/** +- * closure_call - execute @fn out of a new, uninitialized closure +- * +- * Typically used when running out of one closure, and we want to run @fn +- * asynchronously out of a new closure - @parent will then wait for @cl to +- * finish. 
+- */ +-static inline void closure_call(struct closure *cl, closure_fn fn, +- struct workqueue_struct *wq, +- struct closure *parent) +-{ +- closure_init(cl, parent); +- continue_at_nobarrier(cl, fn, wq); +-} +- +-#endif /* _LINUX_CLOSURE_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 140f35dc0c45..3e2b4920803a 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2916,7 +2916,6 @@ static int __init bcache_init(void) + goto err; + + bch_debug_init(); +- closure_debug_init(); + + bcache_is_reboot = false; + +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index 6f3cb7c92130..f61ab1bada6c 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -4,6 +4,7 @@ + #define _BCACHE_UTIL_H + + #include ++#include + #include + #include + #include +@@ -13,8 +14,6 @@ + #include + #include + +-#include "closure.h" +- + struct closure; + + #ifdef CONFIG_BCACHE_DEBUG +diff --git a/include/linux/closure.h b/include/linux/closure.h +new file mode 100644 +index 000000000000..0ec9e7bc8d97 +--- /dev/null ++++ b/include/linux/closure.h +@@ -0,0 +1,377 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _LINUX_CLOSURE_H ++#define _LINUX_CLOSURE_H ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Closure is perhaps the most overused and abused term in computer science, but ++ * since I've been unable to come up with anything better you're stuck with it ++ * again. ++ * ++ * What are closures? ++ * ++ * They embed a refcount. The basic idea is they count "things that are in ++ * progress" - in flight bios, some other thread that's doing something else - ++ * anything you might want to wait on. ++ * ++ * The refcount may be manipulated with closure_get() and closure_put(). ++ * closure_put() is where many of the interesting things happen, when it causes ++ * the refcount to go to 0. ++ * ++ * Closures can be used to wait on things both synchronously and asynchronously, ++ * and synchronous and asynchronous use can be mixed without restriction. To ++ * wait synchronously, use closure_sync() - you will sleep until your closure's ++ * refcount hits 1. ++ * ++ * To wait asynchronously, use ++ * continue_at(cl, next_function, workqueue); ++ * ++ * passing it, as you might expect, the function to run when nothing is pending ++ * and the workqueue to run that function out of. ++ * ++ * continue_at() also, critically, requires a 'return' immediately following the ++ * location where this macro is referenced, to return to the calling function. ++ * There's good reason for this. ++ * ++ * To use safely closures asynchronously, they must always have a refcount while ++ * they are running owned by the thread that is running them. Otherwise, suppose ++ * you submit some bios and wish to have a function run when they all complete: ++ * ++ * foo_endio(struct bio *bio) ++ * { ++ * closure_put(cl); ++ * } ++ * ++ * closure_init(cl); ++ * ++ * do_stuff(); ++ * closure_get(cl); ++ * bio1->bi_endio = foo_endio; ++ * bio_submit(bio1); ++ * ++ * do_more_stuff(); ++ * closure_get(cl); ++ * bio2->bi_endio = foo_endio; ++ * bio_submit(bio2); ++ * ++ * continue_at(cl, complete_some_read, system_wq); ++ * ++ * If closure's refcount started at 0, complete_some_read() could run before the ++ * second bio was submitted - which is almost always not what you want! 
More ++ * importantly, it wouldn't be possible to say whether the original thread or ++ * complete_some_read()'s thread owned the closure - and whatever state it was ++ * associated with! ++ * ++ * So, closure_init() initializes a closure's refcount to 1 - and when a ++ * closure_fn is run, the refcount will be reset to 1 first. ++ * ++ * Then, the rule is - if you got the refcount with closure_get(), release it ++ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount ++ * on a closure because you called closure_init() or you were run out of a ++ * closure - _always_ use continue_at(). Doing so consistently will help ++ * eliminate an entire class of particularly pernicious races. ++ * ++ * Lastly, you might have a wait list dedicated to a specific event, and have no ++ * need for specifying the condition - you just want to wait until someone runs ++ * closure_wake_up() on the appropriate wait list. In that case, just use ++ * closure_wait(). It will return either true or false, depending on whether the ++ * closure was already on a wait list or not - a closure can only be on one wait ++ * list at a time. ++ * ++ * Parents: ++ * ++ * closure_init() takes two arguments - it takes the closure to initialize, and ++ * a (possibly null) parent. ++ * ++ * If parent is non null, the new closure will have a refcount for its lifetime; ++ * a closure is considered to be "finished" when its refcount hits 0 and the ++ * function to run is null. Hence ++ * ++ * continue_at(cl, NULL, NULL); ++ * ++ * returns up the (spaghetti) stack of closures, precisely like normal return ++ * returns up the C stack. continue_at() with non null fn is better thought of ++ * as doing a tail call. ++ * ++ * All this implies that a closure should typically be embedded in a particular ++ * struct (which its refcount will normally control the lifetime of), and that ++ * struct can very much be thought of as a stack frame. ++ */ ++ ++struct closure; ++struct closure_syncer; ++typedef void (closure_fn) (struct closure *); ++extern struct dentry *bcache_debug; ++ ++struct closure_waitlist { ++ struct llist_head list; ++}; ++ ++enum closure_state { ++ /* ++ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by ++ * the thread that owns the closure, and cleared by the thread that's ++ * waking up the closure. ++ * ++ * The rest are for debugging and don't affect behaviour: ++ * ++ * CLOSURE_RUNNING: Set when a closure is running (i.e. by ++ * closure_init() and when closure_put() runs then next function), and ++ * must be cleared before remaining hits 0. Primarily to help guard ++ * against incorrect usage and accidentally transferring references. ++ * continue_at() and closure_return() clear it for you, if you're doing ++ * something unusual you can use closure_set_dead() which also helps ++ * annotate where references are being transferred. 
++ */ ++ ++ CLOSURE_BITS_START = (1U << 26), ++ CLOSURE_DESTRUCTOR = (1U << 26), ++ CLOSURE_WAITING = (1U << 28), ++ CLOSURE_RUNNING = (1U << 30), ++}; ++ ++#define CLOSURE_GUARD_MASK \ ++ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) ++ ++#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) ++#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) ++ ++struct closure { ++ union { ++ struct { ++ struct workqueue_struct *wq; ++ struct closure_syncer *s; ++ struct llist_node list; ++ closure_fn *fn; ++ }; ++ struct work_struct work; ++ }; ++ ++ struct closure *parent; ++ ++ atomic_t remaining; ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++#define CLOSURE_MAGIC_DEAD 0xc054dead ++#define CLOSURE_MAGIC_ALIVE 0xc054a11e ++ ++ unsigned int magic; ++ struct list_head all; ++ unsigned long ip; ++ unsigned long waiting_on; ++#endif ++}; ++ ++void closure_sub(struct closure *cl, int v); ++void closure_put(struct closure *cl); ++void __closure_wake_up(struct closure_waitlist *list); ++bool closure_wait(struct closure_waitlist *list, struct closure *cl); ++void __closure_sync(struct closure *cl); ++ ++/** ++ * closure_sync - sleep until a closure a closure has nothing left to wait on ++ * ++ * Sleeps until the refcount hits 1 - the thread that's running the closure owns ++ * the last refcount. ++ */ ++static inline void closure_sync(struct closure *cl) ++{ ++ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) ++ __closure_sync(cl); ++} ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++ ++void closure_debug_create(struct closure *cl); ++void closure_debug_destroy(struct closure *cl); ++ ++#else ++ ++static inline void closure_debug_create(struct closure *cl) {} ++static inline void closure_debug_destroy(struct closure *cl) {} ++ ++#endif ++ ++static inline void closure_set_ip(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->ip = _THIS_IP_; ++#endif ++} ++ ++static inline void closure_set_ret_ip(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->ip = _RET_IP_; ++#endif ++} ++ ++static inline void closure_set_waiting(struct closure *cl, unsigned long f) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->waiting_on = f; ++#endif ++} ++ ++static inline void closure_set_stopped(struct closure *cl) ++{ ++ atomic_sub(CLOSURE_RUNNING, &cl->remaining); ++} ++ ++static inline void set_closure_fn(struct closure *cl, closure_fn *fn, ++ struct workqueue_struct *wq) ++{ ++ closure_set_ip(cl); ++ cl->fn = fn; ++ cl->wq = wq; ++ /* between atomic_dec() in closure_put() */ ++ smp_mb__before_atomic(); ++} ++ ++static inline void closure_queue(struct closure *cl) ++{ ++ struct workqueue_struct *wq = cl->wq; ++ /** ++ * Changes made to closure, work_struct, or a couple of other structs ++ * may cause work.func not pointing to the right location. ++ */ ++ BUILD_BUG_ON(offsetof(struct closure, fn) ++ != offsetof(struct work_struct, func)); ++ ++ if (wq) { ++ INIT_WORK(&cl->work, cl->work.func); ++ BUG_ON(!queue_work(wq, &cl->work)); ++ } else ++ cl->fn(cl); ++} ++ ++/** ++ * closure_get - increment a closure's refcount ++ */ ++static inline void closure_get(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ BUG_ON((atomic_inc_return(&cl->remaining) & ++ CLOSURE_REMAINING_MASK) <= 1); ++#else ++ atomic_inc(&cl->remaining); ++#endif ++} ++ ++/** ++ * closure_init - Initialize a closure, setting the refcount to 1 ++ * @cl: closure to initialize ++ * @parent: parent of the new closure. cl will take a refcount on it for its ++ * lifetime; may be NULL. 
++ */ ++static inline void closure_init(struct closure *cl, struct closure *parent) ++{ ++ cl->fn = NULL; ++ cl->parent = parent; ++ if (parent) ++ closure_get(parent); ++ ++ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); ++ ++ closure_debug_create(cl); ++ closure_set_ip(cl); ++} ++ ++static inline void closure_init_stack(struct closure *cl) ++{ ++ memset(cl, 0, sizeof(struct closure)); ++ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); ++} ++ ++/** ++ * closure_wake_up - wake up all closures on a wait list, ++ * with memory barrier ++ */ ++static inline void closure_wake_up(struct closure_waitlist *list) ++{ ++ /* Memory barrier for the wait list */ ++ smp_mb(); ++ __closure_wake_up(list); ++} ++ ++/** ++ * continue_at - jump to another function with barrier ++ * ++ * After @cl is no longer waiting on anything (i.e. all outstanding refs have ++ * been dropped with closure_put()), it will resume execution at @fn running out ++ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). ++ * ++ * This is because after calling continue_at() you no longer have a ref on @cl, ++ * and whatever @cl owns may be freed out from under you - a running closure fn ++ * has a ref on its own closure which continue_at() drops. ++ * ++ * Note you are expected to immediately return after using this macro. ++ */ ++#define continue_at(_cl, _fn, _wq) \ ++do { \ ++ set_closure_fn(_cl, _fn, _wq); \ ++ closure_sub(_cl, CLOSURE_RUNNING + 1); \ ++} while (0) ++ ++/** ++ * closure_return - finish execution of a closure ++ * ++ * This is used to indicate that @cl is finished: when all outstanding refs on ++ * @cl have been dropped @cl's ref on its parent closure (as passed to ++ * closure_init()) will be dropped, if one was specified - thus this can be ++ * thought of as returning to the parent closure. ++ */ ++#define closure_return(_cl) continue_at((_cl), NULL, NULL) ++ ++/** ++ * continue_at_nobarrier - jump to another function without barrier ++ * ++ * Causes @fn to be executed out of @cl, in @wq context (or called directly if ++ * @wq is NULL). ++ * ++ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, ++ * thus it's not safe to touch anything protected by @cl after a ++ * continue_at_nobarrier(). ++ */ ++#define continue_at_nobarrier(_cl, _fn, _wq) \ ++do { \ ++ set_closure_fn(_cl, _fn, _wq); \ ++ closure_queue(_cl); \ ++} while (0) ++ ++/** ++ * closure_return_with_destructor - finish execution of a closure, ++ * with destructor ++ * ++ * Works like closure_return(), except @destructor will be called when all ++ * outstanding refs on @cl have been dropped; @destructor may be used to safely ++ * free the memory occupied by @cl, and it is called with the ref on the parent ++ * closure still held - so @destructor could safely return an item to a ++ * freelist protected by @cl's parent. ++ */ ++#define closure_return_with_destructor(_cl, _destructor) \ ++do { \ ++ set_closure_fn(_cl, _destructor, NULL); \ ++ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ ++} while (0) ++ ++/** ++ * closure_call - execute @fn out of a new, uninitialized closure ++ * ++ * Typically used when running out of one closure, and we want to run @fn ++ * asynchronously out of a new closure - @parent will then wait for @cl to ++ * finish. 
++ */ ++static inline void closure_call(struct closure *cl, closure_fn fn, ++ struct workqueue_struct *wq, ++ struct closure *parent) ++{ ++ closure_init(cl, parent); ++ continue_at_nobarrier(cl, fn, wq); ++} ++ ++#endif /* _LINUX_CLOSURE_H */ +diff --git a/lib/Kconfig b/lib/Kconfig +index 5e7165e6a346..32786f287f46 100644 +--- a/lib/Kconfig ++++ b/lib/Kconfig +@@ -481,6 +481,9 @@ config ASSOCIATIVE_ARRAY + + for more information. + ++config CLOSURES ++ bool ++ + config HAS_IOMEM + bool + depends on !NO_IOMEM +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 5e14e32056ad..36e9dcb14387 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -1639,6 +1639,15 @@ config DEBUG_NOTIFIERS + This is a relatively cheap check but if you care about maximum + performance, say N. + ++config DEBUG_CLOSURES ++ bool "Debug closures (bcache async widgits)" ++ depends on CLOSURES ++ select DEBUG_FS ++ help ++ Keeps all active closures in a linked list and provides a debugfs ++ interface to list them, which makes it possible to see asynchronous ++ operations that get stuck. ++ + config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST +diff --git a/lib/Makefile b/lib/Makefile +index 364c23f15578..9b7785139d19 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -240,6 +240,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o + + obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o + ++obj-$(CONFIG_CLOSURES) += closure.o ++ + obj-$(CONFIG_DQL) += dynamic_queue_limits.o + + obj-$(CONFIG_GLOB) += glob.o +diff --git a/lib/closure.c b/lib/closure.c +new file mode 100644 +index 000000000000..b38ded00b9b0 +--- /dev/null ++++ b/lib/closure.c +@@ -0,0 +1,204 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Asynchronous refcounty things ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++static inline void closure_put_after_sub(struct closure *cl, int flags) ++{ ++ int r = flags & CLOSURE_REMAINING_MASK; ++ ++ BUG_ON(flags & CLOSURE_GUARD_MASK); ++ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); ++ ++ if (!r) { ++ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { ++ atomic_set(&cl->remaining, ++ CLOSURE_REMAINING_INITIALIZER); ++ closure_queue(cl); ++ } else { ++ struct closure *parent = cl->parent; ++ closure_fn *destructor = cl->fn; ++ ++ closure_debug_destroy(cl); ++ ++ if (destructor) ++ destructor(cl); ++ ++ if (parent) ++ closure_put(parent); ++ } ++ } ++} ++ ++/* For clearing flags with the same atomic op as a put */ ++void closure_sub(struct closure *cl, int v) ++{ ++ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); ++} ++EXPORT_SYMBOL(closure_sub); ++ ++/* ++ * closure_put - decrement a closure's refcount ++ */ ++void closure_put(struct closure *cl) ++{ ++ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); ++} ++EXPORT_SYMBOL(closure_put); ++ ++/* ++ * closure_wake_up - wake up all closures on a wait list, without memory barrier ++ */ ++void __closure_wake_up(struct closure_waitlist *wait_list) ++{ ++ struct llist_node *list; ++ struct closure *cl, *t; ++ struct llist_node *reverse = NULL; ++ ++ list = llist_del_all(&wait_list->list); ++ ++ /* We first reverse the list to preserve FIFO ordering and fairness */ ++ reverse = llist_reverse_order(list); ++ ++ /* Then do the wakeups */ ++ llist_for_each_entry_safe(cl, t, reverse, list) { ++ closure_set_waiting(cl, 0); ++ closure_sub(cl, CLOSURE_WAITING + 1); ++ } ++} ++EXPORT_SYMBOL(__closure_wake_up); ++ ++/** ++ * closure_wait - add a closure to a waitlist ++ * @waitlist: will own a ref on @cl, which will be released when ++ * closure_wake_up() is called on @waitlist. ++ * @cl: closure pointer. 
++ * ++ */ ++bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) ++{ ++ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) ++ return false; ++ ++ closure_set_waiting(cl, _RET_IP_); ++ atomic_add(CLOSURE_WAITING + 1, &cl->remaining); ++ llist_add(&cl->list, &waitlist->list); ++ ++ return true; ++} ++EXPORT_SYMBOL(closure_wait); ++ ++struct closure_syncer { ++ struct task_struct *task; ++ int done; ++}; ++ ++static void closure_sync_fn(struct closure *cl) ++{ ++ struct closure_syncer *s = cl->s; ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = READ_ONCE(s->task); ++ s->done = 1; ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++void __sched __closure_sync(struct closure *cl) ++{ ++ struct closure_syncer s = { .task = current }; ++ ++ cl->s = &s; ++ continue_at(cl, closure_sync_fn, NULL); ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (s.done) ++ break; ++ schedule(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++} ++EXPORT_SYMBOL(__closure_sync); ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++ ++static LIST_HEAD(closure_list); ++static DEFINE_SPINLOCK(closure_list_lock); ++ ++void closure_debug_create(struct closure *cl) ++{ ++ unsigned long flags; ++ ++ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); ++ cl->magic = CLOSURE_MAGIC_ALIVE; ++ ++ spin_lock_irqsave(&closure_list_lock, flags); ++ list_add(&cl->all, &closure_list); ++ spin_unlock_irqrestore(&closure_list_lock, flags); ++} ++EXPORT_SYMBOL(closure_debug_create); ++ ++void closure_debug_destroy(struct closure *cl) ++{ ++ unsigned long flags; ++ ++ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); ++ cl->magic = CLOSURE_MAGIC_DEAD; ++ ++ spin_lock_irqsave(&closure_list_lock, flags); ++ list_del(&cl->all); ++ spin_unlock_irqrestore(&closure_list_lock, flags); ++} ++EXPORT_SYMBOL(closure_debug_destroy); ++ ++static int debug_show(struct seq_file *f, void *data) ++{ ++ struct closure *cl; ++ ++ spin_lock_irq(&closure_list_lock); ++ ++ list_for_each_entry(cl, &closure_list, all) { ++ int r = atomic_read(&cl->remaining); ++ ++ seq_printf(f, "%p: %pS -> %pS p %p r %i ", ++ cl, (void *) cl->ip, cl->fn, cl->parent, ++ r & CLOSURE_REMAINING_MASK); ++ ++ seq_printf(f, "%s%s\n", ++ test_bit(WORK_STRUCT_PENDING_BIT, ++ work_data_bits(&cl->work)) ? "Q" : "", ++ r & CLOSURE_RUNNING ? "R" : ""); ++ ++ if (r & CLOSURE_WAITING) ++ seq_printf(f, " W %pS\n", ++ (void *) cl->waiting_on); ++ ++ seq_puts(f, "\n"); ++ } ++ ++ spin_unlock_irq(&closure_list_lock); ++ return 0; ++} ++ ++DEFINE_SHOW_ATTRIBUTE(debug); ++ ++static int __init closure_debug_init(void) ++{ ++ debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops); ++ return 0; ++} ++late_initcall(closure_debug_init) ++ ++#endif +-- +cgit v1.2.3 + + +From 13ecf6c20a0a3a1436df558927bc7124e8533070 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 9 Dec 2017 12:42:44 -0500 +Subject: closures: closure_wait_event() + +Like wait_event() - except, because it uses closures and closure +waitlists it doesn't have the restriction on modifying task state inside +the condition check, like wait_event() does. 
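+
+A rough usage sketch (illustrative only - the waitlist and counter names
+below are made up, not part of this patch): a caller can sleep on a
+closure_waitlist until a condition becomes true, and the waker side only
+needs closure_wake_up():
+
+    static struct closure_waitlist reqs_wait;
+    static atomic_t nr_reqs;
+
+    static void wait_for_idle(void)
+    {
+        /* sleeps until nr_reqs hits zero; the condition is rechecked
+         * after each wakeup */
+        closure_wait_event(&reqs_wait, !atomic_read(&nr_reqs));
+    }
+
+    static void req_done(void)
+    {
+        if (atomic_dec_and_test(&nr_reqs))
+            closure_wake_up(&reqs_wait);
+    }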
+ +Signed-off-by: Kent Overstreet +Acked-by: Coly Li +--- + include/linux/closure.h | 22 ++++++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/include/linux/closure.h b/include/linux/closure.h +index 0ec9e7bc8d97..36b4a83f9b77 100644 +--- a/include/linux/closure.h ++++ b/include/linux/closure.h +@@ -374,4 +374,26 @@ static inline void closure_call(struct closure *cl, closure_fn fn, + continue_at_nobarrier(cl, fn, wq); + } + ++#define __closure_wait_event(waitlist, _cond) \ ++do { \ ++ struct closure cl; \ ++ \ ++ closure_init_stack(&cl); \ ++ \ ++ while (1) { \ ++ closure_wait(waitlist, &cl); \ ++ if (_cond) \ ++ break; \ ++ closure_sync(&cl); \ ++ } \ ++ closure_wake_up(waitlist); \ ++ closure_sync(&cl); \ ++} while (0) ++ ++#define closure_wait_event(waitlist, _cond) \ ++do { \ ++ if (!(_cond)) \ ++ __closure_wait_event(waitlist, _cond); \ ++} while (0) ++ + #endif /* _LINUX_CLOSURE_H */ +-- +cgit v1.2.3 + + +From 76f2d7967ae4baa735dfd43a8fdeceb36c6498bb Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Mon, 29 Mar 2021 13:10:44 +1100 +Subject: vfs: factor out inode hash head calculation + +In preparation for changing the inode hash table implementation. + +Signed-off-by: Dave Chinner +--- + fs/inode.c | 44 +++++++++++++++++++++++++------------------- + 1 file changed, 25 insertions(+), 19 deletions(-) + +diff --git a/fs/inode.c b/fs/inode.c +index 6b80a51129d5..708f6bc161d5 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -59,6 +59,22 @@ static unsigned int i_hash_shift __read_mostly; + static struct hlist_head *inode_hashtable __read_mostly; + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); + ++static unsigned long hash(struct super_block *sb, unsigned long hashval) ++{ ++ unsigned long tmp; ++ ++ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / ++ L1_CACHE_BYTES; ++ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); ++ return tmp & i_hash_mask; ++} ++ ++static inline struct hlist_head *i_hash_head(struct super_block *sb, ++ unsigned int hashval) ++{ ++ return inode_hashtable + hash(sb, hashval); ++} ++ + /* + * Empty aops. Can be used for the cases where the user does not + * define any of the address_space operations. 
+@@ -480,16 +496,6 @@ static inline void inode_sb_list_del(struct inode *inode) + } + } + +-static unsigned long hash(struct super_block *sb, unsigned long hashval) +-{ +- unsigned long tmp; +- +- tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / +- L1_CACHE_BYTES; +- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); +- return tmp & i_hash_mask; +-} +- + /** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode +@@ -1089,7 +1095,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) + { +- struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); ++ struct hlist_head *head = i_hash_head(inode->i_sb, hashval); + struct inode *old; + bool creating = inode->i_state & I_CREATING; + +@@ -1189,7 +1195,7 @@ EXPORT_SYMBOL(iget5_locked); + */ + struct inode *iget_locked(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_head *head = i_hash_head(sb, ino); + struct inode *inode; + again: + spin_lock(&inode_hash_lock); +@@ -1257,7 +1263,7 @@ EXPORT_SYMBOL(iget_locked); + */ + static int test_inode_iunique(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *b = inode_hashtable + hash(sb, ino); ++ struct hlist_head *b = i_hash_head(sb, ino); + struct inode *inode; + + hlist_for_each_entry_rcu(inode, b, i_hash) { +@@ -1344,7 +1350,7 @@ EXPORT_SYMBOL(igrab); + struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); ++ struct hlist_head *head = i_hash_head(sb, hashval); + struct inode *inode; + + spin_lock(&inode_hash_lock); +@@ -1399,7 +1405,7 @@ EXPORT_SYMBOL(ilookup5); + */ + struct inode *ilookup(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_head *head = i_hash_head(sb, ino); + struct inode *inode; + again: + spin_lock(&inode_hash_lock); +@@ -1448,7 +1454,7 @@ struct inode *find_inode_nowait(struct super_block *sb, + void *), + void *data) + { +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); ++ struct hlist_head *head = i_hash_head(sb, hashval); + struct inode *inode, *ret_inode = NULL; + int mval; + +@@ -1493,7 +1499,7 @@ EXPORT_SYMBOL(find_inode_nowait); + struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); ++ struct hlist_head *head = i_hash_head(sb, hashval); + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), +@@ -1531,7 +1537,7 @@ EXPORT_SYMBOL(find_inode_rcu); + struct inode *find_inode_by_ino_rcu(struct super_block *sb, + unsigned long ino) + { +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_head *head = i_hash_head(sb, ino); + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), +@@ -1551,7 +1557,7 @@ int insert_inode_locked(struct inode *inode) + { + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_head *head = i_hash_head(sb, ino); + + while (1) { + struct inode *old = NULL; +-- +cgit v1.2.3 + + +From cfc56273e78b1cdfccd815514b2152de5e5c3a2f Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Mon, 29 Mar 2021 
13:10:45 +1100 +Subject: hlist-bl: add hlist_bl_fake() + +in preparation for switching the VFS inode cache over the hlist_bl +lists, we nee dto be able to fake a list node that looks like it is +hased for correct operation of filesystems that don't directly use +the VFS indoe cache. + +Signed-off-by: Dave Chinner +--- + include/linux/list_bl.h | 22 ++++++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h +index ae1b541446c9..8ee2bf5af131 100644 +--- a/include/linux/list_bl.h ++++ b/include/linux/list_bl.h +@@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n) + } + } + ++/** ++ * hlist_bl_add_fake - create a fake list consisting of a single headless node ++ * @n: Node to make a fake list out of ++ * ++ * This makes @n appear to be its own predecessor on a headless hlist. ++ * The point of this is to allow things like hlist_bl_del() to work correctly ++ * in cases where there is no list. ++ */ ++static inline void hlist_bl_add_fake(struct hlist_bl_node *n) ++{ ++ n->pprev = &n->next; ++} ++ ++/** ++ * hlist_fake: Is this node a fake hlist_bl? ++ * @h: Node to check for being a self-referential fake hlist. ++ */ ++static inline bool hlist_bl_fake(struct hlist_bl_node *n) ++{ ++ return n->pprev == &n->next; ++} ++ + static inline void hlist_bl_lock(struct hlist_bl_head *b) + { + bit_spin_lock(0, (unsigned long *)b); +-- +cgit v1.2.3 + + +From 8c30c7b318e2d8375216dde2f3f42e094006afbe Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Mon, 29 Mar 2021 13:10:46 +1100 +Subject: vfs: inode cache conversion to hash-bl + +Because scalability of the global inode_hash_lock really, really +sucks. + +32-way concurrent create on a couple of different filesystems +before: + +- 52.13% 0.04% [kernel] [k] ext4_create + - 52.09% ext4_create + - 41.03% __ext4_new_inode + - 29.92% insert_inode_locked + - 25.35% _raw_spin_lock + - do_raw_spin_lock + - 24.97% __pv_queued_spin_lock_slowpath + +- 72.33% 0.02% [kernel] [k] do_filp_open + - 72.31% do_filp_open + - 72.28% path_openat + - 57.03% bch2_create + - 56.46% __bch2_create + - 40.43% inode_insert5 + - 36.07% _raw_spin_lock + - do_raw_spin_lock + 35.86% __pv_queued_spin_lock_slowpath + 4.02% find_inode + +Convert the inode hash table to a RCU-aware hash-bl table just like +the dentry cache. Note that we need to store a pointer to the +hlist_bl_head the inode has been added to in the inode so that when +it comes to unhash the inode we know what list to lock. We need to +do this because the hash value that is used to hash the inode is +generated from the inode itself - filesystems can provide this +themselves so we have to either store the hash or the head pointer +in the inode to be able to find the right list head for removal... 
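+
+In other words (a simplified sketch of the pattern the diff below
+introduces - see __insert_inode_hash for the real code), each insertion
+now takes the chain's own bit lock and records the head so unhash can
+find it again:
+
+    struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
+
+    hlist_bl_lock(b);                 /* per-chain bit lock, not inode_hash_lock */
+    spin_lock(&inode->i_lock);
+    hlist_bl_add_head_rcu(&inode->i_hash, b);
+    inode->i_hash_head = b;           /* remembered so unhash knows which chain to lock */
+    spin_unlock(&inode->i_lock);
+    hlist_bl_unlock(b);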
+ +Same workload after: + +Signed-off-by: Dave Chinner +--- + fs/inode.c | 200 ++++++++++++++++++++++++++++++++++------------------- + include/linux/fs.h | 9 +-- + 2 files changed, 132 insertions(+), 77 deletions(-) + +diff --git a/fs/inode.c b/fs/inode.c +index 708f6bc161d5..92ddf7da41c5 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -56,8 +56,7 @@ + + static unsigned int i_hash_mask __read_mostly; + static unsigned int i_hash_shift __read_mostly; +-static struct hlist_head *inode_hashtable __read_mostly; +-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); ++static struct hlist_bl_head *inode_hashtable __read_mostly; + + static unsigned long hash(struct super_block *sb, unsigned long hashval) + { +@@ -69,7 +68,7 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) + return tmp & i_hash_mask; + } + +-static inline struct hlist_head *i_hash_head(struct super_block *sb, ++static inline struct hlist_bl_head *i_hash_head(struct super_block *sb, + unsigned int hashval) + { + return inode_hashtable + hash(sb, hashval); +@@ -408,7 +407,7 @@ EXPORT_SYMBOL(address_space_init_once); + void inode_init_once(struct inode *inode) + { + memset(inode, 0, sizeof(*inode)); +- INIT_HLIST_NODE(&inode->i_hash); ++ INIT_HLIST_BL_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_io_list); + INIT_LIST_HEAD(&inode->i_wb_list); +@@ -496,6 +495,17 @@ static inline void inode_sb_list_del(struct inode *inode) + } + } + ++/* ++ * Ensure that we store the hash head in the inode when we insert the inode into ++ * the hlist_bl_head... ++ */ ++static inline void ++__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b) ++{ ++ hlist_bl_add_head_rcu(&inode->i_hash, b); ++ inode->i_hash_head = b; ++} ++ + /** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode +@@ -506,13 +516,13 @@ static inline void inode_sb_list_del(struct inode *inode) + */ + void __insert_inode_hash(struct inode *inode, unsigned long hashval) + { +- struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); + +- spin_lock(&inode_hash_lock); ++ hlist_bl_lock(b); + spin_lock(&inode->i_lock); +- hlist_add_head_rcu(&inode->i_hash, b); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + } + EXPORT_SYMBOL(__insert_inode_hash); + +@@ -524,11 +534,44 @@ EXPORT_SYMBOL(__insert_inode_hash); + */ + void __remove_inode_hash(struct inode *inode) + { +- spin_lock(&inode_hash_lock); +- spin_lock(&inode->i_lock); +- hlist_del_init_rcu(&inode->i_hash); +- spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ struct hlist_bl_head *b = inode->i_hash_head; ++ ++ /* ++ * There are some callers that come through here without synchronisation ++ * and potentially with multiple references to the inode. Hence we have ++ * to handle the case that we might race with a remove and insert to a ++ * different list. Coda, in particular, seems to have a userspace API ++ * that can directly trigger "unhash/rehash to different list" behaviour ++ * without any serialisation at all. ++ * ++ * Hence we have to handle the situation where the inode->i_hash_head ++ * might point to a different list than what we expect, indicating that ++ * we raced with another unhash and potentially a new insertion. This ++ * means we have to retest the head once we have everything locked up ++ * and loop again if it doesn't match. 
++ */ ++ while (b) { ++ hlist_bl_lock(b); ++ spin_lock(&inode->i_lock); ++ if (b != inode->i_hash_head) { ++ hlist_bl_unlock(b); ++ b = inode->i_hash_head; ++ spin_unlock(&inode->i_lock); ++ continue; ++ } ++ /* ++ * Need to set the pprev pointer to NULL after list removal so ++ * that both RCU traversals and hlist_bl_unhashed() work ++ * correctly at this point. ++ */ ++ hlist_bl_del_rcu(&inode->i_hash); ++ inode->i_hash.pprev = NULL; ++ inode->i_hash_head = NULL; ++ spin_unlock(&inode->i_lock); ++ hlist_bl_unlock(b); ++ break; ++ } ++ + } + EXPORT_SYMBOL(__remove_inode_hash); + +@@ -829,26 +872,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) + return freed; + } + +-static void __wait_on_freeing_inode(struct inode *inode); ++static void __wait_on_freeing_inode(struct hlist_bl_head *b, ++ struct inode *inode); + /* + * Called with the inode lock held. + */ + static struct inode *find_inode(struct super_block *sb, +- struct hlist_head *head, ++ struct hlist_bl_head *b, + int (*test)(struct inode *, void *), + void *data) + { ++ struct hlist_bl_node *node; + struct inode *inode = NULL; + + repeat: +- hlist_for_each_entry(inode, head, i_hash) { ++ hlist_bl_for_each_entry(inode, node, b, i_hash) { + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { +- __wait_on_freeing_inode(inode); ++ __wait_on_freeing_inode(b, inode); + goto repeat; + } + if (unlikely(inode->i_state & I_CREATING)) { +@@ -867,19 +912,20 @@ repeat: + * iget_locked for details. + */ + static struct inode *find_inode_fast(struct super_block *sb, +- struct hlist_head *head, unsigned long ino) ++ struct hlist_bl_head *b, unsigned long ino) + { ++ struct hlist_bl_node *node; + struct inode *inode = NULL; + + repeat: +- hlist_for_each_entry(inode, head, i_hash) { ++ hlist_bl_for_each_entry(inode, node, b, i_hash) { + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { +- __wait_on_freeing_inode(inode); ++ __wait_on_freeing_inode(b, inode); + goto repeat; + } + if (unlikely(inode->i_state & I_CREATING)) { +@@ -1088,26 +1134,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories); + * return it locked, hashed, and with the I_NEW flag set. The file system gets + * to fill it in before unlocking it via unlock_new_inode(). + * +- * Note both @test and @set are called with the inode_hash_lock held, so can't +- * sleep. ++ * Note both @test and @set are called with the inode hash chain lock held, ++ * so can't sleep. + */ + struct inode *inode_insert5(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) + { +- struct hlist_head *head = i_hash_head(inode->i_sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); + struct inode *old; + bool creating = inode->i_state & I_CREATING; + + again: +- spin_lock(&inode_hash_lock); +- old = find_inode(inode->i_sb, head, test, data); ++ hlist_bl_lock(b); ++ old = find_inode(inode->i_sb, b, test, data); + if (unlikely(old)) { + /* + * Uhhuh, somebody else created the same inode under us. + * Use the old inode instead of the preallocated one. 
+ */ +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + if (IS_ERR(old)) + return NULL; + wait_on_inode(old); +@@ -1129,12 +1175,12 @@ again: + */ + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; +- hlist_add_head_rcu(&inode->i_hash, head); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); + if (!creating) + inode_sb_list_add(inode); + unlock: +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + + return inode; + } +@@ -1195,12 +1241,12 @@ EXPORT_SYMBOL(iget5_locked); + */ + struct inode *iget_locked(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *head = i_hash_head(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); + struct inode *inode; + again: +- spin_lock(&inode_hash_lock); +- inode = find_inode_fast(sb, head, ino); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_lock(b); ++ inode = find_inode_fast(sb, b, ino); ++ hlist_bl_unlock(b); + if (inode) { + if (IS_ERR(inode)) + return NULL; +@@ -1216,17 +1262,17 @@ again: + if (inode) { + struct inode *old; + +- spin_lock(&inode_hash_lock); ++ hlist_bl_lock(b); + /* We released the lock, so.. */ +- old = find_inode_fast(sb, head, ino); ++ old = find_inode_fast(sb, b, ino); + if (!old) { + inode->i_ino = ino; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; +- hlist_add_head_rcu(&inode->i_hash, head); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents +@@ -1239,7 +1285,7 @@ again: + * us. Use the old inode instead of the one we just + * allocated. + */ +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + destroy_inode(inode); + if (IS_ERR(old)) + return NULL; +@@ -1263,10 +1309,11 @@ EXPORT_SYMBOL(iget_locked); + */ + static int test_inode_iunique(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *b = i_hash_head(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); ++ struct hlist_bl_node *node; + struct inode *inode; + +- hlist_for_each_entry_rcu(inode, b, i_hash) { ++ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) + return 0; + } +@@ -1350,12 +1397,12 @@ EXPORT_SYMBOL(igrab); + struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +- struct hlist_head *head = i_hash_head(sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(sb, hashval); + struct inode *inode; + +- spin_lock(&inode_hash_lock); +- inode = find_inode(sb, head, test, data); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_lock(b); ++ inode = find_inode(sb, b, test, data); ++ hlist_bl_unlock(b); + + return IS_ERR(inode) ? 
NULL : inode; + } +@@ -1405,12 +1452,12 @@ EXPORT_SYMBOL(ilookup5); + */ + struct inode *ilookup(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *head = i_hash_head(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); + struct inode *inode; + again: +- spin_lock(&inode_hash_lock); +- inode = find_inode_fast(sb, head, ino); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_lock(b); ++ inode = find_inode_fast(sb, b, ino); ++ hlist_bl_unlock(b); + + if (inode) { + if (IS_ERR(inode)) +@@ -1454,12 +1501,13 @@ struct inode *find_inode_nowait(struct super_block *sb, + void *), + void *data) + { +- struct hlist_head *head = i_hash_head(sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(sb, hashval); ++ struct hlist_bl_node *node; + struct inode *inode, *ret_inode = NULL; + int mval; + +- spin_lock(&inode_hash_lock); +- hlist_for_each_entry(inode, head, i_hash) { ++ hlist_bl_lock(b); ++ hlist_bl_for_each_entry(inode, node, b, i_hash) { + if (inode->i_sb != sb) + continue; + mval = match(inode, hashval, data); +@@ -1470,7 +1518,7 @@ struct inode *find_inode_nowait(struct super_block *sb, + goto out; + } + out: +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + return ret_inode; + } + EXPORT_SYMBOL(find_inode_nowait); +@@ -1499,13 +1547,14 @@ EXPORT_SYMBOL(find_inode_nowait); + struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +- struct hlist_head *head = i_hash_head(sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(sb, hashval); ++ struct hlist_bl_node *node; + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_rcu() usage"); + +- hlist_for_each_entry_rcu(inode, head, i_hash) { ++ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { + if (inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && + test(inode, data)) +@@ -1537,13 +1586,14 @@ EXPORT_SYMBOL(find_inode_rcu); + struct inode *find_inode_by_ino_rcu(struct super_block *sb, + unsigned long ino) + { +- struct hlist_head *head = i_hash_head(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); ++ struct hlist_bl_node *node; + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_by_ino_rcu() usage"); + +- hlist_for_each_entry_rcu(inode, head, i_hash) { ++ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { + if (inode->i_ino == ino && + inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) +@@ -1557,39 +1607,42 @@ int insert_inode_locked(struct inode *inode) + { + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; +- struct hlist_head *head = i_hash_head(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); + + while (1) { +- struct inode *old = NULL; +- spin_lock(&inode_hash_lock); +- hlist_for_each_entry(old, head, i_hash) { +- if (old->i_ino != ino) ++ struct hlist_bl_node *node; ++ struct inode *old = NULL, *t; ++ ++ hlist_bl_lock(b); ++ hlist_bl_for_each_entry(t, node, b, i_hash) { ++ if (t->i_ino != ino) + continue; +- if (old->i_sb != sb) ++ if (t->i_sb != sb) + continue; +- spin_lock(&old->i_lock); +- if (old->i_state & (I_FREEING|I_WILL_FREE)) { +- spin_unlock(&old->i_lock); ++ spin_lock(&t->i_lock); ++ if (t->i_state & (I_FREEING|I_WILL_FREE)) { ++ spin_unlock(&t->i_lock); + continue; + } ++ old = t; + break; + } + if (likely(!old)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW | I_CREATING; +- hlist_add_head_rcu(&inode->i_hash, head); ++ 
__insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + return 0; + } + if (unlikely(old->i_state & I_CREATING)) { + spin_unlock(&old->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + return -EBUSY; + } + __iget(old); + spin_unlock(&old->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + wait_on_inode(old); + if (unlikely(!inode_unhashed(old))) { + iput(old); +@@ -2063,17 +2116,18 @@ EXPORT_SYMBOL(inode_needs_sync); + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. + */ +-static void __wait_on_freeing_inode(struct inode *inode) ++static void __wait_on_freeing_inode(struct hlist_bl_head *b, ++ struct inode *inode) + { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + schedule(); + finish_wait(wq, &wait.wq_entry); +- spin_lock(&inode_hash_lock); ++ hlist_bl_lock(b); + } + + static __initdata unsigned long ihash_entries; +@@ -2099,7 +2153,7 @@ void __init inode_init_early(void) + + inode_hashtable = + alloc_large_system_hash("Inode-cache", +- sizeof(struct hlist_head), ++ sizeof(struct hlist_bl_head), + ihash_entries, + 14, + HASH_EARLY | HASH_ZERO, +@@ -2125,7 +2179,7 @@ void __init inode_init(void) + + inode_hashtable = + alloc_large_system_hash("Inode-cache", +- sizeof(struct hlist_head), ++ sizeof(struct hlist_bl_head), + ihash_entries, + 14, + HASH_ZERO, +diff --git a/include/linux/fs.h b/include/linux/fs.h +index bbf812ce89a8..d8e6563f95c8 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -672,7 +672,8 @@ struct inode { + unsigned long dirtied_when; /* jiffies of first dirtying */ + unsigned long dirtied_time_when; + +- struct hlist_node i_hash; ++ struct hlist_bl_node i_hash; ++ struct hlist_bl_head *i_hash_head; + struct list_head i_io_list; /* backing dev IO list */ + #ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *i_wb; /* the associated cgroup wb */ +@@ -738,7 +739,7 @@ static inline unsigned int i_blocksize(const struct inode *node) + + static inline int inode_unhashed(struct inode *inode) + { +- return hlist_unhashed(&inode->i_hash); ++ return hlist_bl_unhashed(&inode->i_hash); + } + + /* +@@ -749,7 +750,7 @@ static inline int inode_unhashed(struct inode *inode) + */ + static inline void inode_fake_hash(struct inode *inode) + { +- hlist_add_fake(&inode->i_hash); ++ hlist_bl_add_fake(&inode->i_hash); + } + + /* +@@ -3187,7 +3188,7 @@ static inline void insert_inode_hash(struct inode *inode) + extern void __remove_inode_hash(struct inode *); + static inline void remove_inode_hash(struct inode *inode) + { +- if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) ++ if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash)) + __remove_inode_hash(inode); + } + +-- +cgit v1.2.3 + + +From 17cdcb51315cbc3cfeba678de3a05e4a4b1e1924 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 12 Feb 2021 20:11:25 -0500 +Subject: lib/generic-radix-tree.c: Don't overflow in peek() + +When we started spreading new inode numbers throughout most of the 64 +bit inode space, that triggered some corner case bugs, in particular +some integer overflows related to the radix tree code. Oops. 
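+
+Concretely (a minimal sketch of the failure mode the fix below guards
+against): once iter->offset gets close to SIZE_MAX, a plain
+"iter->offset += obj_size" wraps around to a small value and iteration
+silently restarts from the front of the radix tree. The fix detects the
+wrap first and parks the iterator at SIZE_MAX instead:
+
+    if (iter->offset + obj_size < iter->offset) {  /* addition would wrap */
+        iter->offset = SIZE_MAX;
+        iter->pos    = SIZE_MAX;
+        return;
+    }
+    iter->offset += obj_size;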
+ +Signed-off-by: Kent Overstreet +--- + include/linux/generic-radix-tree.h | 6 ++++++ + lib/generic-radix-tree.c | 17 ++++++++++++++--- + 2 files changed, 20 insertions(+), 3 deletions(-) + +diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h +index 107613f7d792..63080822dc84 100644 +--- a/include/linux/generic-radix-tree.h ++++ b/include/linux/generic-radix-tree.h +@@ -184,6 +184,12 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); + static inline void __genradix_iter_advance(struct genradix_iter *iter, + size_t obj_size) + { ++ if (iter->offset + obj_size < iter->offset) { ++ iter->offset = SIZE_MAX; ++ iter->pos = SIZE_MAX; ++ return; ++ } ++ + iter->offset += obj_size; + + if (!is_power_of_2(obj_size) && +diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c +index f25eb111c051..7dfa88282b00 100644 +--- a/lib/generic-radix-tree.c ++++ b/lib/generic-radix-tree.c +@@ -166,6 +166,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter, + struct genradix_root *r; + struct genradix_node *n; + unsigned level, i; ++ ++ if (iter->offset == SIZE_MAX) ++ return NULL; ++ + restart: + r = READ_ONCE(radix->root); + if (!r) +@@ -184,10 +188,17 @@ restart: + (GENRADIX_ARY - 1); + + while (!n->children[i]) { ++ size_t objs_per_ptr = genradix_depth_size(level); ++ ++ if (iter->offset + objs_per_ptr < iter->offset) { ++ iter->offset = SIZE_MAX; ++ iter->pos = SIZE_MAX; ++ return NULL; ++ } ++ + i++; +- iter->offset = round_down(iter->offset + +- genradix_depth_size(level), +- genradix_depth_size(level)); ++ iter->offset = round_down(iter->offset + objs_per_ptr, ++ objs_per_ptr); + iter->pos = (iter->offset >> PAGE_SHIFT) * + objs_per_page; + if (i == GENRADIX_ARY) +-- +cgit v1.2.3 + + +From 1db6d76880a3f799a7e8811378ded15ef24bbdba Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Nov 2021 10:46:35 -0400 +Subject: loop: Don't call disk_force_media_change from lo_release + +This fixes a deadlock that affects bcachefs, where we call blkdev_put() +directly from put_super(). + +put_super + -> blkdev_put + -> lo_release + -> disk_force_media_change + -> __invalidate_device + -> get_super + + where we self deadlock on s_umount. + +It doesn't affect other filesystems because they use the bizzare sget() +mechanism for exclusion with other mounts, and thus don't need to close +block devices in .put_super(). + +Signed-off-by: Kent Overstreet +--- + drivers/block/loop.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/drivers/block/loop.c b/drivers/block/loop.c +index c3a36cfaa855..c373d9abc0ed 100644 +--- a/drivers/block/loop.c ++++ b/drivers/block/loop.c +@@ -1155,7 +1155,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) + + partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; + lo_number = lo->lo_number; +- disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); + out_unlock: + mutex_unlock(&lo->lo_mutex); + if (partscan) { +-- +cgit v1.2.3 + + +From 9a0f7a44cffe6e987bd0d0d3a687ddfa8fc67cb8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 6 Jul 2021 12:55:12 -0400 +Subject: Revert "block: remove zero_fill_bio_iter" + +Bring this helper back for bcachefs. + +This reverts commit 6f822e1b5d9dda3d20e87365de138046e3baa03a. 
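+
+For context, a hedged sketch of the kind of caller this helper serves
+(the variable names here are illustrative, not from this patch): zeroing
+only the tail of a bio from a saved iterator position, e.g. after a
+short read:
+
+    struct bvec_iter iter = bio->bi_iter;
+
+    bio_advance_iter(bio, &iter, bytes_read);
+    zero_fill_bio_iter(bio, iter);    /* zero everything past bytes_read */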
+ +Signed-off-by: Kent Overstreet +--- + block/bio.c | 6 +++--- + include/linux/bio.h | 7 ++++++- + 2 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/block/bio.c b/block/bio.c +index 74c6be7dd6dd..f608f01cc60d 100644 +--- a/block/bio.c ++++ b/block/bio.c +@@ -528,15 +528,15 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) + } + EXPORT_SYMBOL(bio_kmalloc); + +-void zero_fill_bio(struct bio *bio) ++void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) + { + struct bio_vec bv; + struct bvec_iter iter; + +- bio_for_each_segment(bv, bio, iter) ++ __bio_for_each_segment(bv, bio, iter, start) + memzero_bvec(&bv); + } +-EXPORT_SYMBOL(zero_fill_bio); ++EXPORT_SYMBOL(zero_fill_bio_iter); + + /** + * bio_truncate - truncate the bio to small size of @new_size +diff --git a/include/linux/bio.h b/include/linux/bio.h +index fe6bdfbbef66..6106d5e51edb 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -427,7 +427,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + extern void bio_copy_data(struct bio *dst, struct bio *src); + extern void bio_free_pages(struct bio *bio); + void guard_bio_eod(struct bio *bio); +-void zero_fill_bio(struct bio *bio); ++void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); ++ ++static inline void zero_fill_bio(struct bio *bio) ++{ ++ zero_fill_bio_iter(bio, bio->bi_iter); ++} + + static inline void bio_release_pages(struct bio *bio, bool mark_dirty) + { +-- +cgit v1.2.3 + + +From 77be4c9e0c66d4fc74a3d165b4bf3347e8c8d6f7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 16 Mar 2017 22:18:50 -0800 +Subject: bcachefs: Initial commit + +Forked from drivers/md/bcache, now a full blown COW multi device +filesystem with a long list of features - https://bcachefs.org + +Signed-off-by: Kent Overstreet +--- + fs/Kconfig | 1 + + fs/Makefile | 1 + + fs/bcachefs/Kconfig | 50 + + fs/bcachefs/Makefile | 57 + + fs/bcachefs/acl.c | 393 +++++ + fs/bcachefs/acl.h | 59 + + fs/bcachefs/alloc_background.c | 1656 ++++++++++++++++++ + fs/bcachefs/alloc_background.h | 100 ++ + fs/bcachefs/alloc_foreground.c | 1044 ++++++++++++ + fs/bcachefs/alloc_foreground.h | 133 ++ + fs/bcachefs/alloc_types.h | 106 ++ + fs/bcachefs/bcachefs.h | 855 ++++++++++ + fs/bcachefs/bcachefs_format.h | 1604 ++++++++++++++++++ + fs/bcachefs/bcachefs_ioctl.h | 314 ++++ + fs/bcachefs/bkey.c | 1160 +++++++++++++ + fs/bcachefs/bkey.h | 594 +++++++ + fs/bcachefs/bkey_methods.c | 262 +++ + fs/bcachefs/bkey_methods.h | 63 + + fs/bcachefs/bkey_sort.c | 630 +++++++ + fs/bcachefs/bkey_sort.h | 69 + + fs/bcachefs/bset.c | 1876 +++++++++++++++++++++ + fs/bcachefs/bset.h | 624 +++++++ + fs/bcachefs/btree_cache.c | 934 +++++++++++ + fs/bcachefs/btree_cache.h | 90 + + fs/bcachefs/btree_gc.c | 1230 ++++++++++++++ + fs/bcachefs/btree_gc.h | 120 ++ + fs/bcachefs/btree_io.c | 1703 +++++++++++++++++++ + fs/bcachefs/btree_io.h | 141 ++ + fs/bcachefs/btree_iter.c | 2158 ++++++++++++++++++++++++ + fs/bcachefs/btree_iter.h | 314 ++++ + fs/bcachefs/btree_locking.h | 239 +++ + fs/bcachefs/btree_types.h | 523 ++++++ + fs/bcachefs/btree_update.h | 157 ++ + fs/bcachefs/btree_update_interior.c | 2234 ++++++++++++++++++++++++ + fs/bcachefs/btree_update_interior.h | 341 ++++ + fs/bcachefs/btree_update_leaf.c | 952 +++++++++++ + fs/bcachefs/buckets.c | 2095 +++++++++++++++++++++++ + fs/bcachefs/buckets.h | 337 ++++ + fs/bcachefs/buckets_types.h | 130 ++ + fs/bcachefs/chardev.c | 671 ++++++++ + fs/bcachefs/chardev.h | 31 + + fs/bcachefs/checksum.c 
| 617 +++++++ + fs/bcachefs/checksum.h | 199 +++ + fs/bcachefs/clock.c | 180 ++ + fs/bcachefs/clock.h | 25 + + fs/bcachefs/clock_types.h | 36 + + fs/bcachefs/compress.c | 623 +++++++ + fs/bcachefs/compress.h | 18 + + fs/bcachefs/debug.c | 432 +++++ + fs/bcachefs/debug.h | 63 + + fs/bcachefs/dirent.c | 386 +++++ + fs/bcachefs/dirent.h | 65 + + fs/bcachefs/disk_groups.c | 481 ++++++ + fs/bcachefs/disk_groups.h | 88 + + fs/bcachefs/ec.c | 1401 ++++++++++++++++ + fs/bcachefs/ec.h | 164 ++ + fs/bcachefs/ec_types.h | 38 + + fs/bcachefs/error.c | 167 ++ + fs/bcachefs/error.h | 229 +++ + fs/bcachefs/extents.c | 1752 +++++++++++++++++++ + fs/bcachefs/extents.h | 582 +++++++ + fs/bcachefs/extents_types.h | 40 + + fs/bcachefs/eytzinger.h | 285 ++++ + fs/bcachefs/fifo.h | 127 ++ + fs/bcachefs/fs-common.c | 281 ++++ + fs/bcachefs/fs-common.h | 36 + + fs/bcachefs/fs-io.c | 3165 +++++++++++++++++++++++++++++++++++ + fs/bcachefs/fs-io.h | 57 + + fs/bcachefs/fs-ioctl.c | 308 ++++ + fs/bcachefs/fs-ioctl.h | 81 + + fs/bcachefs/fs.c | 1614 ++++++++++++++++++ + fs/bcachefs/fs.h | 174 ++ + fs/bcachefs/fsck.c | 1436 ++++++++++++++++ + fs/bcachefs/fsck.h | 9 + + fs/bcachefs/inode.c | 567 +++++++ + fs/bcachefs/inode.h | 177 ++ + fs/bcachefs/io.c | 2210 ++++++++++++++++++++++++ + fs/bcachefs/io.h | 163 ++ + fs/bcachefs/io_types.h | 148 ++ + fs/bcachefs/journal.c | 1253 ++++++++++++++ + fs/bcachefs/journal.h | 495 ++++++ + fs/bcachefs/journal_io.c | 1123 +++++++++++++ + fs/bcachefs/journal_io.h | 42 + + fs/bcachefs/journal_reclaim.c | 626 +++++++ + fs/bcachefs/journal_reclaim.h | 57 + + fs/bcachefs/journal_seq_blacklist.c | 318 ++++ + fs/bcachefs/journal_seq_blacklist.h | 13 + + fs/bcachefs/journal_types.h | 276 +++ + fs/bcachefs/keylist.c | 67 + + fs/bcachefs/keylist.h | 76 + + fs/bcachefs/keylist_types.h | 16 + + fs/bcachefs/migrate.c | 187 +++ + fs/bcachefs/migrate.h | 7 + + fs/bcachefs/move.c | 804 +++++++++ + fs/bcachefs/move.h | 64 + + fs/bcachefs/move_types.h | 16 + + fs/bcachefs/movinggc.c | 305 ++++ + fs/bcachefs/movinggc.h | 9 + + fs/bcachefs/opts.c | 441 +++++ + fs/bcachefs/opts.h | 403 +++++ + fs/bcachefs/quota.c | 782 +++++++++ + fs/bcachefs/quota.h | 71 + + fs/bcachefs/quota_types.h | 43 + + fs/bcachefs/rebalance.c | 332 ++++ + fs/bcachefs/rebalance.h | 28 + + fs/bcachefs/rebalance_types.h | 27 + + fs/bcachefs/recovery.c | 1047 ++++++++++++ + fs/bcachefs/recovery.h | 35 + + fs/bcachefs/reflink.c | 304 ++++ + fs/bcachefs/reflink.h | 30 + + fs/bcachefs/replicas.c | 1076 ++++++++++++ + fs/bcachefs/replicas.h | 98 ++ + fs/bcachefs/replicas_types.h | 10 + + fs/bcachefs/siphash.c | 173 ++ + fs/bcachefs/siphash.h | 87 + + fs/bcachefs/str_hash.h | 331 ++++ + fs/bcachefs/super-io.c | 1154 +++++++++++++ + fs/bcachefs/super-io.h | 150 ++ + fs/bcachefs/super.c | 1953 +++++++++++++++++++++ + fs/bcachefs/super.h | 231 +++ + fs/bcachefs/super_types.h | 51 + + fs/bcachefs/sysfs.c | 1068 ++++++++++++ + fs/bcachefs/sysfs.h | 44 + + fs/bcachefs/tests.c | 678 ++++++++ + fs/bcachefs/tests.h | 15 + + fs/bcachefs/trace.c | 12 + + fs/bcachefs/util.c | 910 ++++++++++ + fs/bcachefs/util.h | 760 +++++++++ + fs/bcachefs/vstructs.h | 63 + + fs/bcachefs/xattr.c | 584 +++++++ + fs/bcachefs/xattr.h | 49 + + include/trace/events/bcachefs.h | 647 +++++++ + lib/Kconfig.debug | 18 +- + 133 files changed, 63925 insertions(+), 9 deletions(-) + create mode 100644 fs/bcachefs/Kconfig + create mode 100644 fs/bcachefs/Makefile + create mode 100644 fs/bcachefs/acl.c + create mode 100644 fs/bcachefs/acl.h + create mode 100644 
fs/bcachefs/alloc_background.c + create mode 100644 fs/bcachefs/alloc_background.h + create mode 100644 fs/bcachefs/alloc_foreground.c + create mode 100644 fs/bcachefs/alloc_foreground.h + create mode 100644 fs/bcachefs/alloc_types.h + create mode 100644 fs/bcachefs/bcachefs.h + create mode 100644 fs/bcachefs/bcachefs_format.h + create mode 100644 fs/bcachefs/bcachefs_ioctl.h + create mode 100644 fs/bcachefs/bkey.c + create mode 100644 fs/bcachefs/bkey.h + create mode 100644 fs/bcachefs/bkey_methods.c + create mode 100644 fs/bcachefs/bkey_methods.h + create mode 100644 fs/bcachefs/bkey_sort.c + create mode 100644 fs/bcachefs/bkey_sort.h + create mode 100644 fs/bcachefs/bset.c + create mode 100644 fs/bcachefs/bset.h + create mode 100644 fs/bcachefs/btree_cache.c + create mode 100644 fs/bcachefs/btree_cache.h + create mode 100644 fs/bcachefs/btree_gc.c + create mode 100644 fs/bcachefs/btree_gc.h + create mode 100644 fs/bcachefs/btree_io.c + create mode 100644 fs/bcachefs/btree_io.h + create mode 100644 fs/bcachefs/btree_iter.c + create mode 100644 fs/bcachefs/btree_iter.h + create mode 100644 fs/bcachefs/btree_locking.h + create mode 100644 fs/bcachefs/btree_types.h + create mode 100644 fs/bcachefs/btree_update.h + create mode 100644 fs/bcachefs/btree_update_interior.c + create mode 100644 fs/bcachefs/btree_update_interior.h + create mode 100644 fs/bcachefs/btree_update_leaf.c + create mode 100644 fs/bcachefs/buckets.c + create mode 100644 fs/bcachefs/buckets.h + create mode 100644 fs/bcachefs/buckets_types.h + create mode 100644 fs/bcachefs/chardev.c + create mode 100644 fs/bcachefs/chardev.h + create mode 100644 fs/bcachefs/checksum.c + create mode 100644 fs/bcachefs/checksum.h + create mode 100644 fs/bcachefs/clock.c + create mode 100644 fs/bcachefs/clock.h + create mode 100644 fs/bcachefs/clock_types.h + create mode 100644 fs/bcachefs/compress.c + create mode 100644 fs/bcachefs/compress.h + create mode 100644 fs/bcachefs/debug.c + create mode 100644 fs/bcachefs/debug.h + create mode 100644 fs/bcachefs/dirent.c + create mode 100644 fs/bcachefs/dirent.h + create mode 100644 fs/bcachefs/disk_groups.c + create mode 100644 fs/bcachefs/disk_groups.h + create mode 100644 fs/bcachefs/ec.c + create mode 100644 fs/bcachefs/ec.h + create mode 100644 fs/bcachefs/ec_types.h + create mode 100644 fs/bcachefs/error.c + create mode 100644 fs/bcachefs/error.h + create mode 100644 fs/bcachefs/extents.c + create mode 100644 fs/bcachefs/extents.h + create mode 100644 fs/bcachefs/extents_types.h + create mode 100644 fs/bcachefs/eytzinger.h + create mode 100644 fs/bcachefs/fifo.h + create mode 100644 fs/bcachefs/fs-common.c + create mode 100644 fs/bcachefs/fs-common.h + create mode 100644 fs/bcachefs/fs-io.c + create mode 100644 fs/bcachefs/fs-io.h + create mode 100644 fs/bcachefs/fs-ioctl.c + create mode 100644 fs/bcachefs/fs-ioctl.h + create mode 100644 fs/bcachefs/fs.c + create mode 100644 fs/bcachefs/fs.h + create mode 100644 fs/bcachefs/fsck.c + create mode 100644 fs/bcachefs/fsck.h + create mode 100644 fs/bcachefs/inode.c + create mode 100644 fs/bcachefs/inode.h + create mode 100644 fs/bcachefs/io.c + create mode 100644 fs/bcachefs/io.h + create mode 100644 fs/bcachefs/io_types.h + create mode 100644 fs/bcachefs/journal.c + create mode 100644 fs/bcachefs/journal.h + create mode 100644 fs/bcachefs/journal_io.c + create mode 100644 fs/bcachefs/journal_io.h + create mode 100644 fs/bcachefs/journal_reclaim.c + create mode 100644 fs/bcachefs/journal_reclaim.h + create mode 100644 
fs/bcachefs/journal_seq_blacklist.c + create mode 100644 fs/bcachefs/journal_seq_blacklist.h + create mode 100644 fs/bcachefs/journal_types.h + create mode 100644 fs/bcachefs/keylist.c + create mode 100644 fs/bcachefs/keylist.h + create mode 100644 fs/bcachefs/keylist_types.h + create mode 100644 fs/bcachefs/migrate.c + create mode 100644 fs/bcachefs/migrate.h + create mode 100644 fs/bcachefs/move.c + create mode 100644 fs/bcachefs/move.h + create mode 100644 fs/bcachefs/move_types.h + create mode 100644 fs/bcachefs/movinggc.c + create mode 100644 fs/bcachefs/movinggc.h + create mode 100644 fs/bcachefs/opts.c + create mode 100644 fs/bcachefs/opts.h + create mode 100644 fs/bcachefs/quota.c + create mode 100644 fs/bcachefs/quota.h + create mode 100644 fs/bcachefs/quota_types.h + create mode 100644 fs/bcachefs/rebalance.c + create mode 100644 fs/bcachefs/rebalance.h + create mode 100644 fs/bcachefs/rebalance_types.h + create mode 100644 fs/bcachefs/recovery.c + create mode 100644 fs/bcachefs/recovery.h + create mode 100644 fs/bcachefs/reflink.c + create mode 100644 fs/bcachefs/reflink.h + create mode 100644 fs/bcachefs/replicas.c + create mode 100644 fs/bcachefs/replicas.h + create mode 100644 fs/bcachefs/replicas_types.h + create mode 100644 fs/bcachefs/siphash.c + create mode 100644 fs/bcachefs/siphash.h + create mode 100644 fs/bcachefs/str_hash.h + create mode 100644 fs/bcachefs/super-io.c + create mode 100644 fs/bcachefs/super-io.h + create mode 100644 fs/bcachefs/super.c + create mode 100644 fs/bcachefs/super.h + create mode 100644 fs/bcachefs/super_types.h + create mode 100644 fs/bcachefs/sysfs.c + create mode 100644 fs/bcachefs/sysfs.h + create mode 100644 fs/bcachefs/tests.c + create mode 100644 fs/bcachefs/tests.h + create mode 100644 fs/bcachefs/trace.c + create mode 100644 fs/bcachefs/util.c + create mode 100644 fs/bcachefs/util.h + create mode 100644 fs/bcachefs/vstructs.h + create mode 100644 fs/bcachefs/xattr.c + create mode 100644 fs/bcachefs/xattr.h + create mode 100644 include/trace/events/bcachefs.h + +diff --git a/fs/Kconfig b/fs/Kconfig +index a6313a969bc5..b3dae7de1627 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" + source "fs/btrfs/Kconfig" + source "fs/nilfs2/Kconfig" + source "fs/f2fs/Kconfig" ++source "fs/bcachefs/Kconfig" + source "fs/zonefs/Kconfig" + + config FS_DAX +diff --git a/fs/Makefile b/fs/Makefile +index 84c5e4cdfee5..b0ec155b5c97 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -132,6 +132,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ + obj-$(CONFIG_BTRFS_FS) += btrfs/ + obj-$(CONFIG_GFS2_FS) += gfs2/ + obj-$(CONFIG_F2FS_FS) += f2fs/ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ + obj-$(CONFIG_EFIVAR_FS) += efivarfs/ +diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +new file mode 100644 +index 000000000000..5594af719b2a +--- /dev/null ++++ b/fs/bcachefs/Kconfig +@@ -0,0 +1,50 @@ ++ ++config BCACHEFS_FS ++ tristate "bcachefs filesystem support" ++ depends on BLOCK ++ select EXPORTFS ++ select CLOSURES ++ select LIBCRC32C ++ select CRC64 ++ select FS_POSIX_ACL ++ select LZ4_COMPRESS ++ select LZ4_DECOMPRESS ++ select ZLIB_DEFLATE ++ select ZLIB_INFLATE ++ select ZSTD_COMPRESS ++ select ZSTD_DECOMPRESS ++ select CRYPTO_SHA256 ++ select CRYPTO_CHACHA20 ++ select CRYPTO_POLY1305 ++ select KEYS ++ select SIXLOCKS ++ select RAID6_PQ ++ select XOR_BLOCKS ++ help ++ The bcachefs filesystem - a modern, copy on write filesystem, with ++ support for multiple devices, 
compression, checksumming, etc. ++ ++config BCACHEFS_QUOTA ++ bool "bcachefs quota support" ++ depends on BCACHEFS_FS ++ select QUOTACTL ++ ++config BCACHEFS_POSIX_ACL ++ bool "bcachefs POSIX ACL support" ++ depends on BCACHEFS_FS ++ select FS_POSIX_ACL ++ ++config BCACHEFS_DEBUG ++ bool "bcachefs debugging" ++ depends on BCACHEFS_FS ++ help ++ Enables many extra debugging checks and assertions. ++ ++ The resulting code will be significantly slower than normal; you ++ probably shouldn't select this option unless you're a developer. ++ ++config BCACHEFS_TESTS ++ bool "bcachefs unit and performance tests" ++ depends on BCACHEFS_FS ++ help ++ Include some unit and performance tests for the core btree code +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +new file mode 100644 +index 000000000000..b199da94f311 +--- /dev/null ++++ b/fs/bcachefs/Makefile +@@ -0,0 +1,57 @@ ++ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o ++ ++bcachefs-y := \ ++ acl.o \ ++ alloc_background.o \ ++ alloc_foreground.o \ ++ bkey.o \ ++ bkey_methods.o \ ++ bkey_sort.o \ ++ bset.o \ ++ btree_cache.o \ ++ btree_gc.o \ ++ btree_io.o \ ++ btree_iter.o \ ++ btree_update_interior.o \ ++ btree_update_leaf.o \ ++ buckets.o \ ++ chardev.o \ ++ checksum.o \ ++ clock.o \ ++ compress.o \ ++ debug.o \ ++ dirent.o \ ++ disk_groups.o \ ++ ec.o \ ++ error.o \ ++ extents.o \ ++ fs.o \ ++ fs-common.o \ ++ fs-ioctl.o \ ++ fs-io.o \ ++ fsck.o \ ++ inode.o \ ++ io.o \ ++ journal.o \ ++ journal_io.o \ ++ journal_reclaim.o \ ++ journal_seq_blacklist.o \ ++ keylist.o \ ++ migrate.o \ ++ move.o \ ++ movinggc.o \ ++ opts.o \ ++ quota.o \ ++ rebalance.o \ ++ recovery.o \ ++ reflink.o \ ++ replicas.o \ ++ siphash.o \ ++ super.o \ ++ super-io.o \ ++ sysfs.o \ ++ tests.o \ ++ trace.o \ ++ util.o \ ++ xattr.o +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +new file mode 100644 +index 000000000000..2c59b05da484 +--- /dev/null ++++ b/fs/bcachefs/acl.c +@@ -0,0 +1,393 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#include "bcachefs.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "acl.h" ++#include "fs.h" ++#include "xattr.h" ++ ++static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) ++{ ++ return sizeof(bch_acl_header) + ++ sizeof(bch_acl_entry_short) * nr_short + ++ sizeof(bch_acl_entry) * nr_long; ++} ++ ++static inline int acl_to_xattr_type(int type) ++{ ++ switch (type) { ++ case ACL_TYPE_ACCESS: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; ++ case ACL_TYPE_DEFAULT: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Convert from filesystem to in-memory representation. 
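++ *
++ * A rough sketch of the on-disk xattr value this function walks (the structs
++ * are the ones declared in acl.h):
++ *
++ *   bch_acl_header        a_version == BCH_ACL_VERSION
++ *   bch_acl_entry_short   one per ACL_USER_OBJ/ACL_GROUP_OBJ/ACL_MASK/ACL_OTHER
++ *                         entry (e_tag, e_perm)
++ *   bch_acl_entry         one per ACL_USER/ACL_GROUP entry (e_tag, e_perm, e_id)
++ *
++ * The first pass below only validates and counts entries; the second pass
++ * fills in the posix_acl.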
++ */ ++static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) ++{ ++ const void *p, *end = value + size; ++ struct posix_acl *acl; ++ struct posix_acl_entry *out; ++ unsigned count = 0; ++ ++ if (!value) ++ return NULL; ++ if (size < sizeof(bch_acl_header)) ++ goto invalid; ++ if (((bch_acl_header *)value)->a_version != ++ cpu_to_le32(BCH_ACL_VERSION)) ++ goto invalid; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *entry = p; ++ ++ if (p + sizeof(bch_acl_entry_short) > end) ++ goto invalid; ++ ++ switch (le16_to_cpu(entry->e_tag)) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ case ACL_GROUP: ++ p += sizeof(bch_acl_entry); ++ break; ++ default: ++ goto invalid; ++ } ++ ++ count++; ++ } ++ ++ if (p > end) ++ goto invalid; ++ ++ if (!count) ++ return NULL; ++ ++ acl = posix_acl_alloc(count, GFP_KERNEL); ++ if (!acl) ++ return ERR_PTR(-ENOMEM); ++ ++ out = acl->a_entries; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *in = p; ++ ++ out->e_tag = le16_to_cpu(in->e_tag); ++ out->e_perm = le16_to_cpu(in->e_perm); ++ ++ switch (out->e_tag) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ out->e_uid = make_kuid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ out->e_gid = make_kgid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ } ++ ++ out++; ++ } ++ ++ BUG_ON(out != acl->a_entries + acl->a_count); ++ ++ return acl; ++invalid: ++ pr_err("invalid acl entry"); ++ return ERR_PTR(-EINVAL); ++} ++ ++#define acl_for_each_entry(acl, acl_e) \ ++ for (acl_e = acl->a_entries; \ ++ acl_e < acl->a_entries + acl->a_count; \ ++ acl_e++) ++ ++/* ++ * Convert from in-memory to filesystem representation. 
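++ *
++ * Illustrative sizing, derived from the code below rather than any spec: an
++ * ACL with ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_OTHER and one named ACL_USER
++ * entry has nr_short = 3, nr_long = 1, so
++ *
++ *   acl_len = sizeof(bch_acl_header)
++ *           + 3 * sizeof(bch_acl_entry_short)
++ *           + 1 * sizeof(bch_acl_entry)
++ *
++ * and the xattr key is sized as BKEY_U64s + xattr_val_u64s(0, acl_len) u64s.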
++ */ ++static struct bkey_i_xattr * ++bch2_acl_to_xattr(struct btree_trans *trans, ++ const struct posix_acl *acl, ++ int type) ++{ ++ struct bkey_i_xattr *xattr; ++ bch_acl_header *acl_header; ++ const struct posix_acl_entry *acl_e; ++ void *outptr; ++ unsigned nr_short = 0, nr_long = 0, acl_len, u64s; ++ ++ acl_for_each_entry(acl, acl_e) { ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ case ACL_GROUP: ++ nr_long++; ++ break; ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ nr_short++; ++ break; ++ default: ++ return ERR_PTR(-EINVAL); ++ } ++ } ++ ++ acl_len = bch2_acl_size(nr_short, nr_long); ++ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); ++ ++ if (u64s > U8_MAX) ++ return ERR_PTR(-E2BIG); ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return xattr; ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = acl_to_xattr_type(type); ++ xattr->v.x_name_len = 0, ++ xattr->v.x_val_len = cpu_to_le16(acl_len); ++ ++ acl_header = xattr_val(&xattr->v); ++ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); ++ ++ outptr = (void *) acl_header + sizeof(*acl_header); ++ ++ acl_for_each_entry(acl, acl_e) { ++ bch_acl_entry *entry = outptr; ++ ++ entry->e_tag = cpu_to_le16(acl_e->e_tag); ++ entry->e_perm = cpu_to_le16(acl_e->e_perm); ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ entry->e_id = cpu_to_le32( ++ from_kuid(&init_user_ns, acl_e->e_uid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ entry->e_id = cpu_to_le32( ++ from_kgid(&init_user_ns, acl_e->e_gid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ outptr += sizeof(bch_acl_entry_short); ++ break; ++ } ++ } ++ ++ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); ++ ++ return xattr; ++} ++ ++struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ struct posix_acl *acl = NULL; ++ ++ if (rcu) ++ return ERR_PTR(-ECHILD); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(acl_to_xattr_type(type), "", 0), ++ 0); ++ if (IS_ERR(iter)) { ++ if (PTR_ERR(iter) == -EINTR) ++ goto retry; ++ ++ if (PTR_ERR(iter) != -ENOENT) ++ acl = ERR_CAST(iter); ++ goto out; ++ } ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ ++ if (!IS_ERR(acl)) ++ set_cached_acl(&inode->v, type, acl); ++out: ++ bch2_trans_exit(&trans); ++ return acl; ++} ++ ++int bch2_set_acl_trans(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ const struct bch_hash_info *hash_info, ++ struct posix_acl *acl, int type) ++{ ++ int ret; ++ ++ if (type == ACL_TYPE_DEFAULT && ++ !S_ISDIR(inode_u->bi_mode)) ++ return acl ? 
-EACCES : 0; ++ ++ if (acl) { ++ struct bkey_i_xattr *xattr = ++ bch2_acl_to_xattr(trans, acl, type); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, ++ inode_u->bi_inum, &xattr->k_i, 0); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(acl_to_xattr_type(type), "", 0); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, ++ inode_u->bi_inum, &search); ++ } ++ ++ return ret == -ENOENT ? 0 : ret; ++} ++ ++int bch2_set_acl(struct user_namespace *mnt_userns, ++ struct inode *vinode, struct posix_acl *_acl, int type) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl; ++ umode_t mode; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ acl = _acl; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto btree_err; ++ ++ mode = inode_u.bi_mode; ++ ++ if (type == ACL_TYPE_ACCESS) { ++ ret = posix_acl_update_mode(mnt_userns, &inode->v, &mode, &acl); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_set_acl_trans(&trans, &inode_u, ++ &inode->ei_str_hash, ++ acl, type); ++ if (ret) ++ goto btree_err; ++ ++ inode_u.bi_ctime = bch2_current_time(c); ++ inode_u.bi_mode = mode; ++ ++ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOUNLOCK); ++btree_err: ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err; ++ ++ bch2_inode_update_after_write(c, inode, &inode_u, ++ ATTR_CTIME|ATTR_MODE); ++ ++ set_cached_acl(&inode->v, type, acl); ++err: ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++int bch2_acl_chmod(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ struct bkey_i_xattr *new; ++ struct posix_acl *acl; ++ int ret = 0; ++ ++ iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter) != -ENOENT ? 
PTR_ERR(iter) : 0; ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ if (IS_ERR_OR_NULL(acl)) ++ return PTR_ERR(acl); ++ ++ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); ++ if (ret) ++ goto err; ++ ++ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); ++ if (IS_ERR(new)) { ++ ret = PTR_ERR(new); ++ goto err; ++ } ++ ++ new->k.p = iter->pos; ++ bch2_trans_update(trans, iter, &new->k_i); ++ *new_acl = acl; ++ acl = NULL; ++err: ++ kfree(acl); ++ return ret; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ +diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h +new file mode 100644 +index 000000000000..c008d58f2126 +--- /dev/null ++++ b/fs/bcachefs/acl.h +@@ -0,0 +1,59 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ACL_H ++#define _BCACHEFS_ACL_H ++ ++struct bch_inode_unpacked; ++struct bch_hash_info; ++struct bch_inode_info; ++struct posix_acl; ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#define BCH_ACL_VERSION 0x0001 ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++ __le32 e_id; ++} bch_acl_entry; ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++} bch_acl_entry_short; ++ ++typedef struct { ++ __le32 a_version; ++} bch_acl_header; ++ ++struct posix_acl *bch2_get_acl(struct inode *, int, bool); ++ ++int bch2_set_acl_trans(struct btree_trans *, ++ struct bch_inode_unpacked *, ++ const struct bch_hash_info *, ++ struct posix_acl *, int); ++int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); ++int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, ++ umode_t, struct posix_acl **); ++ ++#else ++ ++static inline int bch2_set_acl_trans(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ const struct bch_hash_info *hash_info, ++ struct posix_acl *acl, int type) ++{ ++ return 0; ++} ++ ++static inline int bch2_acl_chmod(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ ++ ++#endif /* _BCACHEFS_ACL_H */ +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +new file mode 100644 +index 000000000000..13e1a60fd7c6 +--- /dev/null ++++ b/fs/bcachefs/alloc_background.c +@@ -0,0 +1,1656 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "recovery.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static const char * const bch2_alloc_field_names[] = { ++#define x(name, bytes) #name, ++ BCH_ALLOC_FIELDS() ++#undef x ++ NULL ++}; ++ ++static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); ++ ++/* Ratelimiting/PD controllers */ ++ ++static void pd_controllers_update(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(to_delayed_work(work), ++ struct bch_fs, ++ pd_controllers_update); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_member_device(ca, c, i) { ++ struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); ++ ++ u64 free = bucket_to_sector(ca, ++ __dev_buckets_free(ca, stats)) << 9; ++ /* ++ * Bytes of internal fragmentation, which can be ++ * 
reclaimed by copy GC ++ */ ++ s64 fragmented = (bucket_to_sector(ca, ++ stats.buckets[BCH_DATA_USER] + ++ stats.buckets[BCH_DATA_CACHED]) - ++ (stats.sectors[BCH_DATA_USER] + ++ stats.sectors[BCH_DATA_CACHED])) << 9; ++ ++ fragmented = max(0LL, fragmented); ++ ++ bch2_pd_controller_update(&ca->copygc_pd, ++ free, fragmented, -1); ++ } ++ ++ schedule_delayed_work(&c->pd_controllers_update, ++ c->pd_controllers_update_seconds * HZ); ++} ++ ++/* Persistent alloc info: */ ++ ++static inline u64 get_alloc_field(const struct bch_alloc *a, ++ const void **p, unsigned field) ++{ ++ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; ++ u64 v; ++ ++ if (!(a->fields & (1 << field))) ++ return 0; ++ ++ switch (bytes) { ++ case 1: ++ v = *((const u8 *) *p); ++ break; ++ case 2: ++ v = le16_to_cpup(*p); ++ break; ++ case 4: ++ v = le32_to_cpup(*p); ++ break; ++ case 8: ++ v = le64_to_cpup(*p); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++ return v; ++} ++ ++static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, ++ unsigned field, u64 v) ++{ ++ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; ++ ++ if (!v) ++ return; ++ ++ a->v.fields |= 1 << field; ++ ++ switch (bytes) { ++ case 1: ++ *((u8 *) *p) = v; ++ break; ++ case 2: ++ *((__le16 *) *p) = cpu_to_le16(v); ++ break; ++ case 4: ++ *((__le32 *) *p) = cpu_to_le32(v); ++ break; ++ case 8: ++ *((__le64 *) *p) = cpu_to_le64(v); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++} ++ ++struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++{ ++ struct bkey_alloc_unpacked ret = { .gen = 0 }; ++ ++ if (k.k->type == KEY_TYPE_alloc) { ++ const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; ++ const void *d = a->data; ++ unsigned idx = 0; ++ ++ ret.gen = a->gen; ++ ++#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); ++ BCH_ALLOC_FIELDS() ++#undef x ++ } ++ return ret; ++} ++ ++void bch2_alloc_pack(struct bkey_i_alloc *dst, ++ const struct bkey_alloc_unpacked src) ++{ ++ unsigned idx = 0; ++ void *d = dst->v.data; ++ unsigned bytes; ++ ++ dst->v.fields = 0; ++ dst->v.gen = src.gen; ++ ++#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); ++ BCH_ALLOC_FIELDS() ++#undef x ++ ++ bytes = (void *) d - (void *) &dst->v; ++ set_bkey_val_bytes(&dst->k, bytes); ++ memset_u64s_tail(&dst->v, 0, bytes); ++} ++ ++static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) ++{ ++ unsigned i, bytes = offsetof(struct bch_alloc, data); ++ ++ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) ++ if (a->fields & (1 << i)) ++ bytes += BCH_ALLOC_FIELD_BYTES[i]; ++ ++ return DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ ++ if (k.k->p.inode >= c->sb.nr_devices || ++ !c->devs[k.k->p.inode]) ++ return "invalid device"; ++ ++ /* allow for unknown fields */ ++ if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ const void *d = a.v->data; ++ unsigned i; ++ ++ pr_buf(out, "gen %u", a.v->gen); ++ ++ for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) ++ if (a.v->fields & (1 << i)) ++ pr_buf(out, " %s %llu", ++ bch2_alloc_field_names[i], ++ get_alloc_field(a.v, &d, i)); ++} ++ ++int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) ++{ ++ struct btree_trans trans; ++ struct 
btree_iter *iter; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ struct journal_key *j; ++ unsigned i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) ++ bch2_mark_key(c, k, 0, 0, NULL, 0, ++ BCH_BUCKET_MARK_ALLOC_READ| ++ BCH_BUCKET_MARK_NOATOMIC); ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) { ++ bch_err(c, "error reading alloc info: %i", ret); ++ return ret; ++ } ++ ++ for_each_journal_key(*journal_keys, j) ++ if (j->btree_id == BTREE_ID_ALLOC) ++ bch2_mark_key(c, bkey_i_to_s_c(j->k), ++ 0, 0, NULL, 0, ++ BCH_BUCKET_MARK_ALLOC_READ| ++ BCH_BUCKET_MARK_NOATOMIC); ++ ++ percpu_down_write(&c->mark_lock); ++ bch2_dev_usage_from_buckets(c); ++ percpu_up_write(&c->mark_lock); ++ ++ mutex_lock(&c->bucket_clock[READ].lock); ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ bch2_recalc_oldest_io(c, ca, READ); ++ up_read(&ca->bucket_lock); ++ } ++ mutex_unlock(&c->bucket_clock[READ].lock); ++ ++ mutex_lock(&c->bucket_clock[WRITE].lock); ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ bch2_recalc_oldest_io(c, ca, WRITE); ++ up_read(&ca->bucket_lock); ++ } ++ mutex_unlock(&c->bucket_clock[WRITE].lock); ++ ++ return 0; ++} ++ ++enum alloc_write_ret { ++ ALLOC_WROTE, ++ ALLOC_NOWROTE, ++ ALLOC_END, ++}; ++ ++static int bch2_alloc_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ struct bucket_array *ba; ++ struct bucket *g; ++ struct bucket_mark m; ++ struct bkey_alloc_unpacked old_u, new_u; ++ __BKEY_PADDED(k, 8) alloc_key; /* hack: */ ++ struct bkey_i_alloc *a; ++ int ret; ++retry: ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ old_u = bch2_alloc_unpack(k); ++ ++ if (iter->pos.inode >= c->sb.nr_devices || ++ !c->devs[iter->pos.inode]) ++ return ALLOC_END; ++ ++ percpu_down_read(&c->mark_lock); ++ ca = bch_dev_bkey_exists(c, iter->pos.inode); ++ ba = bucket_array(ca); ++ ++ if (iter->pos.offset >= ba->nbuckets) { ++ percpu_up_read(&c->mark_lock); ++ return ALLOC_END; ++ } ++ ++ g = &ba->b[iter->pos.offset]; ++ m = READ_ONCE(g->mark); ++ new_u = alloc_mem_to_key(g, m); ++ percpu_up_read(&c->mark_lock); ++ ++ if (!bkey_alloc_unpacked_cmp(old_u, new_u)) ++ return ALLOC_NOWROTE; ++ ++ a = bkey_alloc_init(&alloc_key.k); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, new_u); ++ ++ bch2_trans_update(trans, iter, &a->k_i); ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_NOMARK| ++ flags); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ return ret; ++} ++ ++int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ for_each_rw_member(ca, c, i) { ++ unsigned first_bucket; ++ ++ percpu_down_read(&c->mark_lock); ++ first_bucket = bucket_array(ca)->first_bucket; ++ percpu_up_read(&c->mark_lock); ++ ++ bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); ++ ++ while (1) { ++ ret = bch2_alloc_write_key(&trans, iter, flags); ++ if (ret < 0 || ret == ALLOC_END) ++ break; ++ if (ret == ALLOC_WROTE) ++ *wrote = true; ++ 
bch2_btree_iter_next_slot(iter); ++ } ++ ++ if (ret < 0) { ++ percpu_ref_put(&ca->io_ref); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret < 0 ? ret : 0; ++} ++ ++int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ ret = bch2_alloc_write_key(&trans, iter, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY| ++ BTREE_INSERT_NOMARK); ++ bch2_trans_exit(&trans); ++ return ret < 0 ? ret : 0; ++} ++ ++/* Bucket IO clocks: */ ++ ++static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket *g; ++ u16 max_last_io = 0; ++ unsigned i; ++ ++ lockdep_assert_held(&c->bucket_clock[rw].lock); ++ ++ /* Recalculate max_last_io for this device: */ ++ for_each_bucket(g, buckets) ++ max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); ++ ++ ca->max_last_bucket_io[rw] = max_last_io; ++ ++ /* Recalculate global max_last_io: */ ++ max_last_io = 0; ++ ++ for_each_member_device(ca, c, i) ++ max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); ++ ++ clock->max_last_io = max_last_io; ++} ++ ++static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ struct bucket_array *buckets; ++ struct bch_dev *ca; ++ struct bucket *g; ++ unsigned i; ++ ++ trace_rescale_prios(c); ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->io_time[rw] = clock->hand - ++ bucket_last_io(c, g, rw) / 2; ++ ++ bch2_recalc_oldest_io(c, ca, rw); ++ ++ up_read(&ca->bucket_lock); ++ } ++} ++ ++static inline u64 bucket_clock_freq(u64 capacity) ++{ ++ return max(capacity >> 10, 2028ULL); ++} ++ ++static void bch2_inc_clock_hand(struct io_timer *timer) ++{ ++ struct bucket_clock *clock = container_of(timer, ++ struct bucket_clock, rescale); ++ struct bch_fs *c = container_of(clock, ++ struct bch_fs, bucket_clock[clock->rw]); ++ struct bch_dev *ca; ++ u64 capacity; ++ unsigned i; ++ ++ mutex_lock(&clock->lock); ++ ++ /* if clock cannot be advanced more, rescale prio */ ++ if (clock->max_last_io >= U16_MAX - 2) ++ bch2_rescale_bucket_io_times(c, clock->rw); ++ ++ BUG_ON(clock->max_last_io >= U16_MAX - 2); ++ ++ for_each_member_device(ca, c, i) ++ ca->max_last_bucket_io[clock->rw]++; ++ clock->max_last_io++; ++ clock->hand++; ++ ++ mutex_unlock(&clock->lock); ++ ++ capacity = READ_ONCE(c->capacity); ++ ++ if (!capacity) ++ return; ++ ++ /* ++ * we only increment when 0.1% of the filesystem capacity has been read ++ * or written too, this determines if it's time ++ * ++ * XXX: we shouldn't really be going off of the capacity of devices in ++ * RW mode (that will be 0 when we're RO, yet we can still service ++ * reads) ++ */ ++ timer->expire += bucket_clock_freq(capacity); ++ ++ bch2_io_timer_add(&c->io_clock[clock->rw], timer); ++} ++ ++static void bch2_bucket_clock_init(struct bch_fs *c, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ ++ clock->hand = 1; ++ clock->rw = rw; ++ clock->rescale.fn = bch2_inc_clock_hand; ++ clock->rescale.expire = bucket_clock_freq(c->capacity); ++ mutex_init(&clock->lock); ++} ++ ++/* Background allocator 
thread: */ ++ ++/* ++ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens ++ * (marking them as invalidated on disk), then optionally issues discard ++ * commands to the newly free buckets, then puts them on the various freelists. ++ */ ++ ++#define BUCKET_GC_GEN_MAX 96U ++ ++/** ++ * wait_buckets_available - wait on reclaimable buckets ++ * ++ * If there aren't enough available buckets to fill up free_inc, wait until ++ * there are. ++ */ ++static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned long gc_count = c->gc_count; ++ int ret = 0; ++ ++ ca->allocator_state = ALLOCATOR_BLOCKED; ++ closure_wake_up(&c->freelist_wait); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread_should_stop()) { ++ ret = 1; ++ break; ++ } ++ ++ if (gc_count != c->gc_count) ++ ca->inc_gen_really_needs_gc = 0; ++ ++ if ((ssize_t) (dev_buckets_available(c, ca) - ++ ca->inc_gen_really_needs_gc) >= ++ (ssize_t) fifo_free(&ca->free_inc)) ++ break; ++ ++ up_read(&c->gc_lock); ++ schedule(); ++ try_to_freeze(); ++ down_read(&c->gc_lock); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ closure_wake_up(&c->freelist_wait); ++ ++ return ret; ++} ++ ++static bool bch2_can_invalidate_bucket(struct bch_dev *ca, ++ size_t bucket, ++ struct bucket_mark mark) ++{ ++ u8 gc_gen; ++ ++ if (!is_available_bucket(mark)) ++ return false; ++ ++ if (ca->buckets_nouse && ++ test_bit(bucket, ca->buckets_nouse)) ++ return false; ++ ++ gc_gen = bucket_gc_gen(ca, bucket); ++ ++ if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ++ ca->inc_gen_needs_gc++; ++ ++ if (gc_gen >= BUCKET_GC_GEN_MAX) ++ ca->inc_gen_really_needs_gc++; ++ ++ return gc_gen < BUCKET_GC_GEN_MAX; ++} ++ ++/* ++ * Determines what order we're going to reuse buckets, smallest bucket_key() ++ * first. ++ * ++ * ++ * - We take into account the read prio of the bucket, which gives us an ++ * indication of how hot the data is -- we scale the prio so that the prio ++ * farthest from the clock is worth 1/8th of the closest. ++ * ++ * - The number of sectors of cached data in the bucket, which gives us an ++ * indication of the cost in cache misses this eviction will cause. ++ * ++ * - If hotness * sectors used compares equal, we pick the bucket with the ++ * smallest bucket_gc_gen() - since incrementing the same bucket's generation ++ * number repeatedly forces us to run mark and sweep gc to avoid generation ++ * number wraparound. 
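++ *
++ * As a rough worked example, matching bucket_sort_key() below: a bucket with
++ * no cached sectors and no pending journal commit gets a key of just
++ * bucket_gc_gen()/16, so it sorts ahead of, and is reused before, a bucket
++ * still holding recently read cached data, whose (hotness + 1) * sectors_used
++ * term lands in the high bits of the key.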
++ */ ++ ++static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark m) ++{ ++ unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); ++ unsigned max_last_io = ca->max_last_bucket_io[READ]; ++ ++ /* ++ * Time since last read, scaled to [0, 8) where larger value indicates ++ * more recently read data: ++ */ ++ unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; ++ ++ /* How much we want to keep the data in this bucket: */ ++ unsigned long data_wantness = ++ (hotness + 1) * bucket_sectors_used(m); ++ ++ unsigned long needs_journal_commit = ++ bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); ++ ++ return (data_wantness << 9) | ++ (needs_journal_commit << 8) | ++ (bucket_gc_gen(ca, b) / 16); ++} ++ ++static inline int bucket_alloc_cmp(alloc_heap *h, ++ struct alloc_heap_entry l, ++ struct alloc_heap_entry r) ++{ ++ return cmp_int(l.key, r.key) ?: ++ cmp_int(r.nr, l.nr) ?: ++ cmp_int(l.bucket, r.bucket); ++} ++ ++static inline int bucket_idx_cmp(const void *_l, const void *_r) ++{ ++ const struct alloc_heap_entry *l = _l, *r = _r; ++ ++ return cmp_int(l->bucket, r->bucket); ++} ++ ++static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ struct alloc_heap_entry e = { 0 }; ++ size_t b, i, nr = 0; ++ ++ ca->alloc_heap.used = 0; ++ ++ mutex_lock(&c->bucket_clock[READ].lock); ++ down_read(&ca->bucket_lock); ++ ++ buckets = bucket_array(ca); ++ ++ bch2_recalc_oldest_io(c, ca, READ); ++ ++ /* ++ * Find buckets with lowest read priority, by building a maxheap sorted ++ * by read priority and repeatedly replacing the maximum element until ++ * all buckets have been visited. ++ */ ++ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ unsigned long key = bucket_sort_key(c, ca, b, m); ++ ++ if (!bch2_can_invalidate_bucket(ca, b, m)) ++ continue; ++ ++ if (e.nr && e.bucket + e.nr == b && e.key == key) { ++ e.nr++; ++ } else { ++ if (e.nr) ++ heap_add_or_replace(&ca->alloc_heap, e, ++ -bucket_alloc_cmp, NULL); ++ ++ e = (struct alloc_heap_entry) { ++ .bucket = b, ++ .nr = 1, ++ .key = key, ++ }; ++ } ++ ++ cond_resched(); ++ } ++ ++ if (e.nr) ++ heap_add_or_replace(&ca->alloc_heap, e, ++ -bucket_alloc_cmp, NULL); ++ ++ for (i = 0; i < ca->alloc_heap.used; i++) ++ nr += ca->alloc_heap.data[i].nr; ++ ++ while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { ++ nr -= ca->alloc_heap.data[0].nr; ++ heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); ++ } ++ ++ up_read(&ca->bucket_lock); ++ mutex_unlock(&c->bucket_clock[READ].lock); ++} ++ ++static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket_mark m; ++ size_t b, start; ++ ++ if (ca->fifo_last_bucket < ca->mi.first_bucket || ++ ca->fifo_last_bucket >= ca->mi.nbuckets) ++ ca->fifo_last_bucket = ca->mi.first_bucket; ++ ++ start = ca->fifo_last_bucket; ++ ++ do { ++ ca->fifo_last_bucket++; ++ if (ca->fifo_last_bucket == ca->mi.nbuckets) ++ ca->fifo_last_bucket = ca->mi.first_bucket; ++ ++ b = ca->fifo_last_bucket; ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (bch2_can_invalidate_bucket(ca, b, m)) { ++ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; ++ ++ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ if (heap_full(&ca->alloc_heap)) ++ break; ++ } ++ ++ cond_resched(); ++ } while (ca->fifo_last_bucket != start); ++} ++ ++static 
void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket_mark m; ++ size_t checked, i; ++ ++ for (checked = 0; ++ checked < ca->mi.nbuckets / 2; ++ checked++) { ++ size_t b = bch2_rand_range(ca->mi.nbuckets - ++ ca->mi.first_bucket) + ++ ca->mi.first_bucket; ++ ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (bch2_can_invalidate_bucket(ca, b, m)) { ++ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; ++ ++ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ if (heap_full(&ca->alloc_heap)) ++ break; ++ } ++ ++ cond_resched(); ++ } ++ ++ sort(ca->alloc_heap.data, ++ ca->alloc_heap.used, ++ sizeof(ca->alloc_heap.data[0]), ++ bucket_idx_cmp, NULL); ++ ++ /* remove duplicates: */ ++ for (i = 0; i + 1 < ca->alloc_heap.used; i++) ++ if (ca->alloc_heap.data[i].bucket == ++ ca->alloc_heap.data[i + 1].bucket) ++ ca->alloc_heap.data[i].nr = 0; ++} ++ ++static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ size_t i, nr = 0; ++ ++ ca->inc_gen_needs_gc = 0; ++ ++ switch (ca->mi.replacement) { ++ case CACHE_REPLACEMENT_LRU: ++ find_reclaimable_buckets_lru(c, ca); ++ break; ++ case CACHE_REPLACEMENT_FIFO: ++ find_reclaimable_buckets_fifo(c, ca); ++ break; ++ case CACHE_REPLACEMENT_RANDOM: ++ find_reclaimable_buckets_random(c, ca); ++ break; ++ } ++ ++ heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); ++ ++ for (i = 0; i < ca->alloc_heap.used; i++) ++ nr += ca->alloc_heap.data[i].nr; ++ ++ return nr; ++} ++ ++static inline long next_alloc_bucket(struct bch_dev *ca) ++{ ++ struct alloc_heap_entry e, *top = ca->alloc_heap.data; ++ ++ while (ca->alloc_heap.used) { ++ if (top->nr) { ++ size_t b = top->bucket; ++ ++ top->bucket++; ++ top->nr--; ++ return b; ++ } ++ ++ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ } ++ ++ return -1; ++} ++ ++/* ++ * returns sequence number of most recent journal entry that updated this ++ * bucket: ++ */ ++static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) ++{ ++ if (m.journal_seq_valid) { ++ u64 journal_seq = atomic64_read(&c->journal.seq); ++ u64 bucket_seq = journal_seq; ++ ++ bucket_seq &= ~((u64) U16_MAX); ++ bucket_seq |= m.journal_seq; ++ ++ if (bucket_seq > journal_seq) ++ bucket_seq -= 1 << 16; ++ ++ return bucket_seq; ++ } else { ++ return 0; ++ } ++} ++ ++static int bch2_invalidate_one_bucket2(struct btree_trans *trans, ++ struct bch_dev *ca, ++ struct btree_iter *iter, ++ u64 *journal_seq, unsigned flags) ++{ ++#if 0 ++ __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; ++#else ++ /* hack: */ ++ __BKEY_PADDED(k, 8) alloc_key; ++#endif ++ struct bch_fs *c = trans->c; ++ struct bkey_i_alloc *a; ++ struct bkey_alloc_unpacked u; ++ struct bucket *g; ++ struct bucket_mark m; ++ struct bkey_s_c k; ++ bool invalidating_cached_data; ++ size_t b; ++ int ret; ++ ++ BUG_ON(!ca->alloc_heap.used || ++ !ca->alloc_heap.data[0].nr); ++ b = ca->alloc_heap.data[0].bucket; ++ ++ /* first, put on free_inc and mark as owned by allocator: */ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->freelist_lock); ++ ++ verify_not_on_freelist(c, ca, b); ++ ++ BUG_ON(!fifo_push(&ca->free_inc, b)); ++ ++ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); ++ ++ spin_unlock(&c->freelist_lock); ++ percpu_up_read(&c->mark_lock); ++ ++ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); ++ ++ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); ++retry: ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return 
ret; ++ ++ /* ++ * The allocator has to start before journal replay is finished - thus, ++ * we have to trust the in memory bucket @m, not the version in the ++ * btree: ++ */ ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, b); ++ m = READ_ONCE(g->mark); ++ u = alloc_mem_to_key(g, m); ++ percpu_up_read(&c->mark_lock); ++ ++ invalidating_cached_data = m.cached_sectors != 0; ++ ++ u.gen++; ++ u.data_type = 0; ++ u.dirty_sectors = 0; ++ u.cached_sectors = 0; ++ u.read_time = c->bucket_clock[READ].hand; ++ u.write_time = c->bucket_clock[WRITE].hand; ++ ++ a = bkey_alloc_init(&alloc_key.k); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, u); ++ ++ bch2_trans_update(trans, iter, &a->k_i); ++ ++ /* ++ * XXX: ++ * when using deferred btree updates, we have journal reclaim doing ++ * btree updates and thus requiring the allocator to make forward ++ * progress, and here the allocator is requiring space in the journal - ++ * so we need a journal pre-reservation: ++ */ ++ ret = bch2_trans_commit(trans, NULL, ++ invalidating_cached_data ? journal_seq : NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_BUCKET_INVALIDATE| ++ flags); ++ if (ret == -EINTR) ++ goto retry; ++ ++ if (!ret) { ++ /* remove from alloc_heap: */ ++ struct alloc_heap_entry e, *top = ca->alloc_heap.data; ++ ++ top->bucket++; ++ top->nr--; ++ ++ if (!top->nr) ++ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ ++ /* ++ * Make sure we flush the last journal entry that updated this ++ * bucket (i.e. deleting the last reference) before writing to ++ * this bucket again: ++ */ ++ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); ++ } else { ++ size_t b2; ++ ++ /* remove from free_inc: */ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->freelist_lock); ++ ++ bch2_mark_alloc_bucket(c, ca, b, false, ++ gc_pos_alloc(c, NULL), 0); ++ ++ BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); ++ BUG_ON(b != b2); ++ ++ spin_unlock(&c->freelist_lock); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ return ret; ++} ++ ++static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t bucket, u64 *flush_seq) ++{ ++ struct bucket_mark m; ++ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->freelist_lock); ++ ++ bch2_invalidate_bucket(c, ca, bucket, &m); ++ ++ verify_not_on_freelist(c, ca, bucket); ++ BUG_ON(!fifo_push(&ca->free_inc, bucket)); ++ ++ spin_unlock(&c->freelist_lock); ++ ++ bucket_io_clock_reset(c, ca, bucket, READ); ++ bucket_io_clock_reset(c, ca, bucket, WRITE); ++ ++ percpu_up_read(&c->mark_lock); ++ ++ *flush_seq = max(*flush_seq, bucket_journal_seq(c, m)); ++ ++ return m.cached_sectors != 0; ++} ++ ++/* ++ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: ++ */ ++static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ u64 journal_seq = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, 0), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ /* Only use nowait if we've already invalidated at least one bucket: */ ++ while (!ret && ++ !fifo_full(&ca->free_inc) && ++ ca->alloc_heap.used) ++ ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, ++ BTREE_INSERT_GC_LOCK_HELD| ++ (!fifo_empty(&ca->free_inc) ++ ? 
BTREE_INSERT_NOWAIT : 0)); ++ ++ bch2_trans_exit(&trans); ++ ++ /* If we used NOWAIT, don't return the error: */ ++ if (!fifo_empty(&ca->free_inc)) ++ ret = 0; ++ if (ret) { ++ bch_err(ca, "error invalidating buckets: %i", ret); ++ return ret; ++ } ++ ++ if (journal_seq) ++ ret = bch2_journal_flush_seq(&c->journal, journal_seq); ++ if (ret) { ++ bch_err(ca, "journal error: %i", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) ++{ ++ unsigned i; ++ int ret = 0; ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) ++ if (fifo_push(&ca->free[i], bucket)) { ++ fifo_pop(&ca->free_inc, bucket); ++ ++ closure_wake_up(&c->freelist_wait); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ ++ spin_unlock(&c->freelist_lock); ++ goto out; ++ } ++ ++ if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { ++ ca->allocator_state = ALLOCATOR_BLOCKED_FULL; ++ closure_wake_up(&c->freelist_wait); ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ if ((current->flags & PF_KTHREAD) && ++ kthread_should_stop()) { ++ ret = 1; ++ break; ++ } ++ ++ schedule(); ++ try_to_freeze(); ++ } ++out: ++ __set_current_state(TASK_RUNNING); ++ return ret; ++} ++ ++/* ++ * Pulls buckets off free_inc, discards them (if enabled), then adds them to ++ * freelists, waiting until there's room if necessary: ++ */ ++static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ while (!fifo_empty(&ca->free_inc)) { ++ size_t bucket = fifo_peek(&ca->free_inc); ++ ++ if (ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, bucket), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++ ++ if (push_invalidated_bucket(c, ca, bucket)) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/** ++ * bch_allocator_thread - move buckets from free_inc to reserves ++ * ++ * The free_inc FIFO is populated by find_reclaimable_buckets(), and ++ * the reserves are depleted by bucket allocation. When we run out ++ * of free_inc, try to invalidate some buckets and write out ++ * prios and gens. 
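++ *
++ * Rough data flow through this thread:
++ *
++ *   find_reclaimable_buckets()    fills ca->alloc_heap
++ *   bch2_invalidate_buckets()     bumps gens, moves buckets onto ca->free_inc
++ *   discard_invalidated_buckets() optionally discards, then pushes each
++ *                                 bucket onto one of the ca->free[] reserves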
++ */ ++static int bch2_allocator_thread(void *arg) ++{ ++ struct bch_dev *ca = arg; ++ struct bch_fs *c = ca->fs; ++ size_t nr; ++ int ret; ++ ++ set_freezable(); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ ++ while (1) { ++ cond_resched(); ++ ++ pr_debug("discarding %zu invalidated buckets", ++ fifo_used(&ca->free_inc)); ++ ++ ret = discard_invalidated_buckets(c, ca); ++ if (ret) ++ goto stop; ++ ++ down_read(&c->gc_lock); ++ ++ ret = bch2_invalidate_buckets(c, ca); ++ if (ret) { ++ up_read(&c->gc_lock); ++ goto stop; ++ } ++ ++ if (!fifo_empty(&ca->free_inc)) { ++ up_read(&c->gc_lock); ++ continue; ++ } ++ ++ pr_debug("free_inc now empty"); ++ ++ do { ++ /* ++ * Find some buckets that we can invalidate, either ++ * they're completely unused, or only contain clean data ++ * that's been written back to the backing device or ++ * another cache tier ++ */ ++ ++ pr_debug("scanning for reclaimable buckets"); ++ ++ nr = find_reclaimable_buckets(c, ca); ++ ++ pr_debug("found %zu buckets", nr); ++ ++ trace_alloc_batch(ca, nr, ca->alloc_heap.size); ++ ++ if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || ++ ca->inc_gen_really_needs_gc) && ++ c->gc_thread) { ++ atomic_inc(&c->kick_gc); ++ wake_up_process(c->gc_thread); ++ } ++ ++ /* ++ * If we found any buckets, we have to invalidate them ++ * before we scan for more - but if we didn't find very ++ * many we may want to wait on more buckets being ++ * available so we don't spin: ++ */ ++ if (!nr || ++ (nr < ALLOC_SCAN_BATCH(ca) && ++ !fifo_empty(&ca->free[RESERVE_NONE]))) { ++ ret = wait_buckets_available(c, ca); ++ if (ret) { ++ up_read(&c->gc_lock); ++ goto stop; ++ } ++ } ++ } while (!nr); ++ ++ up_read(&c->gc_lock); ++ ++ pr_debug("%zu buckets to invalidate", nr); ++ ++ /* ++ * alloc_heap is now full of newly-invalidated buckets: next, ++ * write out the new bucket gens: ++ */ ++ } ++ ++stop: ++ pr_debug("alloc thread stopping (ret %i)", ret); ++ ca->allocator_state = ALLOCATOR_STOPPED; ++ closure_wake_up(&c->freelist_wait); ++ return 0; ++} ++ ++/* Startup/shutdown (ro/rw): */ ++ ++void bch2_recalc_capacity(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ u64 capacity = 0, reserved_sectors = 0, gc_reserve; ++ unsigned bucket_size_max = 0; ++ unsigned long ra_pages = 0; ++ unsigned i, j; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ for_each_online_member(ca, c, i) { ++ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; ++ ++ ra_pages += bdi->ra_pages; ++ } ++ ++ bch2_set_ra_pages(c, ra_pages); ++ ++ for_each_rw_member(ca, c, i) { ++ u64 dev_reserve = 0; ++ ++ /* ++ * We need to reserve buckets (from the number ++ * of currently available buckets) against ++ * foreground writes so that mainly copygc can ++ * make forward progress. ++ * ++ * We need enough to refill the various reserves ++ * from scratch - copygc will use its entire ++ * reserve all at once, then run against when ++ * its reserve is refilled (from the formerly ++ * available buckets). ++ * ++ * This reserve is just used when considering if ++ * allocations for foreground writes must wait - ++ * not -ENOSPC calculations. 
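++ *
++ * Illustrative arithmetic, mirroring the code below: per device,
++ * dev_reserve = (sum of the free[] reserve sizes + 3 write points)
++ * * bucket_size; reserved_sectors accumulates twice that across
++ * devices, and the final reserve is the larger of that sum and
++ * gc_reserve, capped at the raw capacity.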
++ */ ++ for (j = 0; j < RESERVE_NONE; j++) ++ dev_reserve += ca->free[j].size; ++ ++ dev_reserve += 1; /* btree write point */ ++ dev_reserve += 1; /* copygc write point */ ++ dev_reserve += 1; /* rebalance write point */ ++ ++ dev_reserve *= ca->mi.bucket_size; ++ ++ ca->copygc_threshold = dev_reserve; ++ ++ capacity += bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket); ++ ++ reserved_sectors += dev_reserve * 2; ++ ++ bucket_size_max = max_t(unsigned, bucket_size_max, ++ ca->mi.bucket_size); ++ } ++ ++ gc_reserve = c->opts.gc_reserve_bytes ++ ? c->opts.gc_reserve_bytes >> 9 ++ : div64_u64(capacity * c->opts.gc_reserve_percent, 100); ++ ++ reserved_sectors = max(gc_reserve, reserved_sectors); ++ ++ reserved_sectors = min(reserved_sectors, capacity); ++ ++ c->capacity = capacity - reserved_sectors; ++ ++ c->bucket_size_max = bucket_size_max; ++ ++ if (c->capacity) { ++ bch2_io_timer_add(&c->io_clock[READ], ++ &c->bucket_clock[READ].rescale); ++ bch2_io_timer_add(&c->io_clock[WRITE], ++ &c->bucket_clock[WRITE].rescale); ++ } else { ++ bch2_io_timer_del(&c->io_clock[READ], ++ &c->bucket_clock[READ].rescale); ++ bch2_io_timer_del(&c->io_clock[WRITE], ++ &c->bucket_clock[WRITE].rescale); ++ } ++ ++ /* Wake up case someone was waiting for buckets */ ++ closure_wake_up(&c->freelist_wait); ++} ++ ++static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct open_bucket *ob; ++ bool ret = false; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list && ++ ob->ptr.dev == ca->dev_idx) ++ ret = true; ++ spin_unlock(&ob->lock); ++ } ++ ++ return ret; ++} ++ ++/* device goes ro: */ ++void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ BUG_ON(ca->alloc_thread); ++ ++ /* First, remove device from allocation groups: */ ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ clear_bit(ca->dev_idx, c->rw_devs[i].d); ++ ++ /* ++ * Capacity is calculated based off of devices in allocation groups: ++ */ ++ bch2_recalc_capacity(c); ++ ++ /* Next, close write points that point to this device... 
*/ ++ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) ++ bch2_writepoint_stop(c, ca, &c->write_points[i]); ++ ++ bch2_writepoint_stop(c, ca, &ca->copygc_write_point); ++ bch2_writepoint_stop(c, ca, &c->rebalance_write_point); ++ bch2_writepoint_stop(c, ca, &c->btree_write_point); ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ while (c->btree_reserve_cache_nr) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ bch2_open_buckets_put(c, &a->ob); ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++ while (1) { ++ struct open_bucket *ob; ++ ++ spin_lock(&c->freelist_lock); ++ if (!ca->open_buckets_partial_nr) { ++ spin_unlock(&c->freelist_lock); ++ break; ++ } ++ ob = c->open_buckets + ++ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; ++ ob->on_partial_list = false; ++ spin_unlock(&c->freelist_lock); ++ ++ bch2_open_bucket_put(c, ob); ++ } ++ ++ bch2_ec_stop_dev(c, ca); ++ ++ /* ++ * Wake up threads that were blocked on allocation, so they can notice ++ * the device can no longer be removed and the capacity has changed: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ ++ /* ++ * journal_res_get() can block waiting for free space in the journal - ++ * it needs to notice there may not be devices to allocate from anymore: ++ */ ++ wake_up(&c->journal.wait); ++ ++ /* Now wait for any in flight writes: */ ++ ++ closure_wait_event(&c->open_buckets_wait, ++ !bch2_dev_has_open_write_point(c, ca)); ++} ++ ++/* device goes rw: */ ++void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ if (ca->mi.data_allowed & (1 << i)) ++ set_bit(ca->dev_idx, c->rw_devs[i].d); ++} ++ ++void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) ++{ ++ if (ca->alloc_thread) ++ closure_wait_event(&c->freelist_wait, ++ ca->allocator_state != ALLOCATOR_RUNNING); ++} ++ ++/* stop allocator thread: */ ++void bch2_dev_allocator_stop(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ p = rcu_dereference_protected(ca->alloc_thread, 1); ++ ca->alloc_thread = NULL; ++ ++ /* ++ * We need an rcu barrier between setting ca->alloc_thread = NULL and ++ * the thread shutting down to avoid bch2_wake_allocator() racing: ++ * ++ * XXX: it would be better to have the rcu barrier be asynchronous ++ * instead of blocking us here ++ */ ++ synchronize_rcu(); ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++/* start allocator thread: */ ++int bch2_dev_allocator_start(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ /* ++ * allocator thread already started? 
++ */ ++ if (ca->alloc_thread) ++ return 0; ++ ++ p = kthread_create(bch2_allocator_thread, ca, ++ "bch_alloc[%s]", ca->name); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ rcu_assign_pointer(ca->alloc_thread, p); ++ wake_up_process(p); ++ return 0; ++} ++ ++static bool flush_held_btree_writes(struct bch_fs *c) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ bool nodes_unwritten; ++ size_t i; ++again: ++ cond_resched(); ++ nodes_unwritten = false; ++ ++ if (bch2_journal_error(&c->journal)) ++ return true; ++ ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) ++ if (btree_node_need_write(b)) { ++ if (btree_node_may_write(b)) { ++ rcu_read_unlock(); ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ bch2_btree_node_write(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->lock); ++ goto again; ++ } else { ++ nodes_unwritten = true; ++ } ++ } ++ rcu_read_unlock(); ++ ++ if (c->btree_roots_dirty) { ++ bch2_journal_meta(&c->journal); ++ goto again; ++ } ++ ++ return !nodes_unwritten && ++ !bch2_btree_interior_updates_nr_pending(c); ++} ++ ++static void allocator_start_issue_discards(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned dev_iter; ++ size_t bu; ++ ++ for_each_rw_member(ca, c, dev_iter) ++ while (fifo_pop(&ca->free_inc, bu)) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, bu), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++} ++ ++static int resize_free_inc(struct bch_dev *ca) ++{ ++ alloc_fifo free_inc; ++ ++ if (!fifo_full(&ca->free_inc)) ++ return 0; ++ ++ if (!init_fifo(&free_inc, ++ ca->free_inc.size * 2, ++ GFP_KERNEL)) ++ return -ENOMEM; ++ ++ fifo_move(&free_inc, &ca->free_inc); ++ swap(free_inc, ca->free_inc); ++ free_fifo(&free_inc); ++ return 0; ++} ++ ++static bool bch2_fs_allocator_start_fast(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned dev_iter; ++ bool ret = true; ++ ++ if (test_alloc_startup(c)) ++ return false; ++ ++ down_read(&c->gc_lock); ++ ++ /* Scan for buckets that are already invalidated: */ ++ for_each_rw_member(ca, c, dev_iter) { ++ struct bucket_array *buckets; ++ struct bucket_mark m; ++ long bu; ++ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for (bu = buckets->first_bucket; ++ bu < buckets->nbuckets; bu++) { ++ m = READ_ONCE(buckets->b[bu].mark); ++ ++ if (!buckets->b[bu].gen_valid || ++ !is_available_bucket(m) || ++ m.cached_sectors || ++ (ca->buckets_nouse && ++ test_bit(bu, ca->buckets_nouse))) ++ continue; ++ ++ percpu_down_read(&c->mark_lock); ++ bch2_mark_alloc_bucket(c, ca, bu, true, ++ gc_pos_alloc(c, NULL), 0); ++ percpu_up_read(&c->mark_lock); ++ ++ fifo_push(&ca->free_inc, bu); ++ ++ discard_invalidated_buckets(c, ca); ++ ++ if (fifo_full(&ca->free[RESERVE_BTREE])) ++ break; ++ } ++ up_read(&ca->bucket_lock); ++ } ++ ++ up_read(&c->gc_lock); ++ ++ /* did we find enough buckets? */ ++ for_each_rw_member(ca, c, dev_iter) ++ if (!fifo_full(&ca->free[RESERVE_BTREE])) ++ ret = false; ++ ++ return ret; ++} ++ ++int bch2_fs_allocator_start(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned dev_iter; ++ u64 journal_seq = 0; ++ bool wrote; ++ long bu; ++ int ret = 0; ++ ++ if (!test_alloc_startup(c) && ++ bch2_fs_allocator_start_fast(c)) ++ return 0; ++ ++ pr_debug("not enough empty buckets; scanning for reclaimable buckets"); ++ ++ /* ++ * We're moving buckets to freelists _before_ they've been marked as ++ * invalidated on disk - we have to so that we can allocate new btree ++ * nodes to mark them as invalidated on disk. 
++ * ++ * However, we can't _write_ to any of these buckets yet - they might ++ * have cached data in them, which is live until they're marked as ++ * invalidated on disk: ++ */ ++ set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); ++ ++ down_read(&c->gc_lock); ++ do { ++ wrote = false; ++ ++ for_each_rw_member(ca, c, dev_iter) { ++ find_reclaimable_buckets(c, ca); ++ ++ while (!fifo_full(&ca->free[RESERVE_BTREE]) && ++ (bu = next_alloc_bucket(ca)) >= 0) { ++ ret = resize_free_inc(ca); ++ if (ret) { ++ percpu_ref_put(&ca->io_ref); ++ up_read(&c->gc_lock); ++ goto err; ++ } ++ ++ bch2_invalidate_one_bucket(c, ca, bu, ++ &journal_seq); ++ ++ fifo_push(&ca->free[RESERVE_BTREE], bu); ++ } ++ } ++ ++ pr_debug("done scanning for reclaimable buckets"); ++ ++ /* ++ * XXX: it's possible for this to deadlock waiting on journal reclaim, ++ * since we're holding btree writes. What then? ++ */ ++ ret = bch2_alloc_write(c, ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_NOWAIT, &wrote); ++ ++ /* ++ * If bch2_alloc_write() did anything, it may have used some ++ * buckets, and we need the RESERVE_BTREE freelist full - so we ++ * need to loop and scan again. ++ * And if it errored, it may have been because there weren't ++ * enough buckets, so just scan and loop again as long as it ++ * made some progress: ++ */ ++ } while (wrote); ++ up_read(&c->gc_lock); ++ ++ if (ret) ++ goto err; ++ ++ pr_debug("flushing journal"); ++ ++ ret = bch2_journal_flush(&c->journal); ++ if (ret) ++ goto err; ++ ++ pr_debug("issuing discards"); ++ allocator_start_issue_discards(c); ++err: ++ clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); ++ closure_wait_event(&c->btree_interior_update_wait, ++ flush_held_btree_writes(c)); ++ ++ return ret; ++} ++ ++void bch2_fs_allocator_background_init(struct bch_fs *c) ++{ ++ spin_lock_init(&c->freelist_lock); ++ bch2_bucket_clock_init(c, READ); ++ bch2_bucket_clock_init(c, WRITE); ++ ++ c->pd_controllers_update_seconds = 5; ++ INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); ++} +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +new file mode 100644 +index 000000000000..501c444353fb +--- /dev/null ++++ b/fs/bcachefs/alloc_background.h +@@ -0,0 +1,100 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_BACKGROUND_H ++#define _BCACHEFS_ALLOC_BACKGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++#include "debug.h" ++ ++struct bkey_alloc_unpacked { ++ u8 gen; ++#define x(_name, _bits) u##_bits _name; ++ BCH_ALLOC_FIELDS() ++#undef x ++}; ++ ++/* returns true if not equal */ ++static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, ++ struct bkey_alloc_unpacked r) ++{ ++ return l.gen != r.gen ++#define x(_name, _bits) || l._name != r._name ++ BCH_ALLOC_FIELDS() ++#undef x ++ ; ++} ++ ++struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); ++void bch2_alloc_pack(struct bkey_i_alloc *, ++ const struct bkey_alloc_unpacked); ++ ++static inline struct bkey_alloc_unpacked ++alloc_mem_to_key(struct bucket *g, struct bucket_mark m) ++{ ++ return (struct bkey_alloc_unpacked) { ++ .gen = m.gen, ++ .oldest_gen = g->oldest_gen, ++ .data_type = m.data_type, ++ .dirty_sectors = m.dirty_sectors, ++ .cached_sectors = m.cached_sectors, ++ .read_time = g->io_time[READ], ++ .write_time = g->io_time[WRITE], ++ }; ++} ++ ++#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) ++ ++const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); ++void 
bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_alloc (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++} ++ ++struct journal_keys; ++int bch2_alloc_read(struct bch_fs *, struct journal_keys *); ++int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *); ++ ++static inline void bch2_wake_allocator(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = rcu_dereference(ca->alloc_thread); ++ if (p) ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, ++ size_t bucket) ++{ ++ if (expensive_debug_checks(c) && ++ test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { ++ size_t iter; ++ long i; ++ unsigned j; ++ ++ for (j = 0; j < RESERVE_NR; j++) ++ fifo_for_each_entry(i, &ca->free[j], iter) ++ BUG_ON(i == bucket); ++ fifo_for_each_entry(i, &ca->free_inc, iter) ++ BUG_ON(i == bucket); ++ } ++} ++ ++void bch2_recalc_capacity(struct bch_fs *); ++ ++void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); ++void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); ++ ++void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); ++void bch2_dev_allocator_stop(struct bch_dev *); ++int bch2_dev_allocator_start(struct bch_dev *); ++ ++int bch2_alloc_write(struct bch_fs *, unsigned, bool *); ++int bch2_fs_allocator_start(struct bch_fs *); ++void bch2_fs_allocator_background_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +new file mode 100644 +index 000000000000..697d576802b6 +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.c +@@ -0,0 +1,1044 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Primary bucket allocation code ++ * ++ * Copyright 2012 Google, Inc. ++ * ++ * Allocation in bcache is done in terms of buckets: ++ * ++ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in ++ * btree pointers - they must match for the pointer to be considered valid. ++ * ++ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a ++ * bucket simply by incrementing its gen. ++ * ++ * The gens (along with the priorities; it's really the gens are important but ++ * the code is named as if it's the priorities) are written in an arbitrary list ++ * of buckets on disk, with a pointer to them in the journal header. ++ * ++ * When we invalidate a bucket, we have to write its new gen to disk and wait ++ * for that write to complete before we use it - otherwise after a crash we ++ * could have pointers that appeared to be good but pointed to data that had ++ * been overwritten. ++ * ++ * Since the gens and priorities are all stored contiguously on disk, we can ++ * batch this up: We fill up the free_inc list with freshly invalidated buckets, ++ * call prio_write(), and when prio_write() finishes we pull buckets off the ++ * free_inc list and optionally discard them. ++ * ++ * free_inc isn't the only freelist - if it was, we'd often have to sleep while ++ * priorities and gens were being written before we could allocate. c->free is a ++ * smaller freelist, and buckets on that list are always ready to be used. ++ * ++ * If we've got discards enabled, that happens when a bucket moves from the ++ * free_inc list to the free list. 
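A toy standalone illustration of the bucket-generation scheme the comment above describes: a pointer stays valid only while its gen matches the bucket's current gen, so bumping the bucket's gen invalidates every old pointer at once. The demo_* names are invented for this sketch and are not the bcachefs bkey machinery.

#include <stdint.h>
#include <stdio.h>

struct demo_bucket { uint8_t gen; };
struct demo_ptr    { uint8_t gen; };	/* gen copied from the bucket at allocation */

/* Valid only while the gens still match - the rule the comment above
 * states for btree pointers into a bucket. */
static int demo_ptr_stale(const struct demo_bucket *b, const struct demo_ptr *p)
{
	return b->gen != p->gen;
}

int main(void)
{
	struct demo_bucket b = { .gen = 7 };
	struct demo_ptr p = { .gen = b.gen };

	printf("stale before reuse: %d\n", demo_ptr_stale(&b, &p));

	b.gen++;	/* reuse the bucket: every old pointer is now stale */

	printf("stale after reuse:  %d\n", demo_ptr_stale(&b, &p));
	return 0;
}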
++ * ++ * It's important to ensure that gens don't wrap around - with respect to ++ * either the oldest gen in the btree or the gen on disk. This is quite ++ * difficult to do in practice, but we explicitly guard against it anyways - if ++ * a bucket is in danger of wrapping around we simply skip invalidating it that ++ * time around, and we garbage collect or rewrite the priorities sooner than we ++ * would have otherwise. ++ * ++ * bch2_bucket_alloc() allocates a single bucket from a specific device. ++ * ++ * bch2_bucket_alloc_set() allocates one or more buckets from different devices ++ * in a given filesystem. ++ * ++ * invalidate_buckets() drives all the processes described above. It's called ++ * from bch2_bucket_alloc() and a few other places that need to make sure free ++ * buckets are ready. ++ * ++ * invalidate_buckets_(lru|fifo)() find buckets that are available to be ++ * invalidated, and then invalidate them and stick them on the free_inc list - ++ * in either lru or fifo order. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "io.h" ++ ++#include ++#include ++#include ++#include ++ ++enum bucket_alloc_ret { ++ ALLOC_SUCCESS, ++ OPEN_BUCKETS_EMPTY, ++ FREELIST_EMPTY, /* Allocator thread not keeping up */ ++}; ++ ++/* ++ * Open buckets represent a bucket that's currently being allocated from. They ++ * serve two purposes: ++ * ++ * - They track buckets that have been partially allocated, allowing for ++ * sub-bucket sized allocations - they're used by the sector allocator below ++ * ++ * - They provide a reference to the buckets they own that mark and sweep GC ++ * can find, until the new allocation has a pointer to it inserted into the ++ * btree ++ * ++ * When allocating some space with the sector allocator, the allocation comes ++ * with a reference to an open bucket - the caller is required to put that ++ * reference _after_ doing the index update that makes its allocation reachable. 
++ */ ++ ++void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ if (ob->ec) { ++ bch2_ec_bucket_written(c, ob); ++ return; ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&ob->lock); ++ ++ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), ++ false, gc_pos_alloc(c, ob), 0); ++ ob->valid = false; ++ ob->type = 0; ++ ++ spin_unlock(&ob->lock); ++ percpu_up_read(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ c->open_buckets_nr_free++; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *c, ++ struct open_buckets *obs, ++ unsigned dev) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->ptr.dev == dev && ++ ob->ec) ++ bch2_ec_bucket_cancel(c, ob); ++} ++ ++static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); ++ ++ ob = c->open_buckets + c->open_buckets_freelist; ++ c->open_buckets_freelist = ob->freelist; ++ atomic_set(&ob->pin, 1); ++ ob->type = 0; ++ ++ c->open_buckets_nr_free--; ++ return ob; ++} ++ ++static void open_bucket_free_unused(struct bch_fs *c, ++ struct open_bucket *ob, ++ bool may_realloc) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ BUG_ON(ca->open_buckets_partial_nr >= ++ ARRAY_SIZE(ca->open_buckets_partial)); ++ ++ if (ca->open_buckets_partial_nr < ++ ARRAY_SIZE(ca->open_buckets_partial) && ++ may_realloc) { ++ spin_lock(&c->freelist_lock); ++ ob->on_partial_list = true; ++ ca->open_buckets_partial[ca->open_buckets_partial_nr++] = ++ ob - c->open_buckets; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++ closure_wake_up(&c->freelist_wait); ++ } else { ++ bch2_open_bucket_put(c, ob); ++ } ++} ++ ++static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ BUG_ON(ptr_stale(ca, &ob->ptr)); ++ } ++#endif ++} ++ ++/* _only_ for allocating the journal on a new device: */ ++long bch2_bucket_alloc_new_fs(struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ ssize_t b; ++ ++ rcu_read_lock(); ++ buckets = bucket_array(ca); ++ ++ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) ++ if (is_available_bucket(buckets->b[b].mark)) ++ goto success; ++ b = -1; ++success: ++ rcu_read_unlock(); ++ return b; ++} ++ ++static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) ++{ ++ switch (reserve) { ++ case RESERVE_ALLOC: ++ return 0; ++ case RESERVE_BTREE: ++ return BTREE_NODE_OPEN_BUCKET_RESERVE; ++ default: ++ return BTREE_NODE_OPEN_BUCKET_RESERVE * 2; ++ } ++} ++ ++/** ++ * bch_bucket_alloc - allocate a single bucket from a specific device ++ * ++ * Returns index of bucket on success, 0 on failure ++ * */ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct bucket_array *buckets; ++ struct open_bucket *ob; ++ long bucket = 0; ++ ++ spin_lock(&c->freelist_lock); ++ ++ if (may_alloc_partial && ++ ca->open_buckets_partial_nr) { ++ ob = 
c->open_buckets + ++ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; ++ ob->on_partial_list = false; ++ spin_unlock(&c->freelist_lock); ++ return ob; ++ } ++ ++ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { ++ if (cl) ++ closure_wait(&c->open_buckets_wait, cl); ++ ++ if (!c->blocked_allocate_open_bucket) ++ c->blocked_allocate_open_bucket = local_clock(); ++ ++ spin_unlock(&c->freelist_lock); ++ trace_open_bucket_alloc_fail(ca, reserve); ++ return ERR_PTR(-OPEN_BUCKETS_EMPTY); ++ } ++ ++ if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) ++ goto out; ++ ++ switch (reserve) { ++ case RESERVE_ALLOC: ++ if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) ++ goto out; ++ break; ++ case RESERVE_BTREE: ++ if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= ++ ca->free[RESERVE_BTREE].size && ++ fifo_pop(&ca->free[RESERVE_BTREE], bucket)) ++ goto out; ++ break; ++ case RESERVE_MOVINGGC: ++ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) ++ goto out; ++ break; ++ default: ++ break; ++ } ++ ++ if (cl) ++ closure_wait(&c->freelist_wait, cl); ++ ++ if (!c->blocked_allocate) ++ c->blocked_allocate = local_clock(); ++ ++ spin_unlock(&c->freelist_lock); ++ ++ trace_bucket_alloc_fail(ca, reserve); ++ return ERR_PTR(-FREELIST_EMPTY); ++out: ++ verify_not_on_freelist(c, ca, bucket); ++ ++ ob = bch2_open_bucket_alloc(c); ++ ++ spin_lock(&ob->lock); ++ buckets = bucket_array(ca); ++ ++ ob->valid = true; ++ ob->sectors_free = ca->mi.bucket_size; ++ ob->ptr = (struct bch_extent_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_ptr, ++ .gen = buckets->b[bucket].mark.gen, ++ .offset = bucket_to_sector(ca, bucket), ++ .dev = ca->dev_idx, ++ }; ++ ++ bucket_io_clock_reset(c, ca, bucket, READ); ++ bucket_io_clock_reset(c, ca, bucket, WRITE); ++ spin_unlock(&ob->lock); ++ ++ if (c->blocked_allocate_open_bucket) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate_open_bucket], ++ c->blocked_allocate_open_bucket); ++ c->blocked_allocate_open_bucket = 0; ++ } ++ ++ if (c->blocked_allocate) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate], ++ c->blocked_allocate); ++ c->blocked_allocate = 0; ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ bch2_wake_allocator(ca); ++ ++ trace_bucket_alloc(ca, reserve); ++ return ob; ++} ++ ++static int __dev_stripe_cmp(struct dev_stripe_state *stripe, ++ unsigned l, unsigned r) ++{ ++ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - ++ (stripe->next_alloc[l] < stripe->next_alloc[r])); ++} ++ ++#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs) ++{ ++ struct dev_alloc_list ret = { .nr = 0 }; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_member_device_rcu(ca, c, i, devs) ++ ret.devs[ret.nr++] = i; ++ ++ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); ++ return ret; ++} ++ ++void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, ++ struct dev_stripe_state *stripe) ++{ ++ u64 *v = stripe->next_alloc + ca->dev_idx; ++ u64 free_space = dev_buckets_free(c, ca); ++ u64 free_space_inv = free_space ++ ? div64_u64(1ULL << 48, free_space) ++ : 1ULL << 48; ++ u64 scale = *v / 4; ++ ++ if (*v + free_space_inv >= *v) ++ *v += free_space_inv; ++ else ++ *v = U64_MAX; ++ ++ for (v = stripe->next_alloc; ++ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) ++ *v = *v < scale ? 
0 : *v - scale; ++} ++ ++#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) ++#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) ++ ++static void add_new_bucket(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ struct open_bucket *ob) ++{ ++ unsigned durability = ++ bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; ++ ++ __clear_bit(ob->ptr.dev, devs_may_alloc->d); ++ *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) ++ ? durability : 1; ++ *have_cache |= !durability; ++ ++ ob_push(c, ptrs, ob); ++} ++ ++static int bch2_bucket_alloc_set(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct dev_alloc_list devs_sorted = ++ bch2_dev_alloc_list(c, stripe, devs_may_alloc); ++ struct bch_dev *ca; ++ bool alloc_failure = false; ++ unsigned i; ++ ++ BUG_ON(*nr_effective >= nr_replicas); ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ struct open_bucket *ob; ++ ++ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ if (!ca) ++ continue; ++ ++ if (!ca->mi.durability && *have_cache) ++ continue; ++ ++ ob = bch2_bucket_alloc(c, ca, reserve, ++ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ if (IS_ERR(ob)) { ++ enum bucket_alloc_ret ret = -PTR_ERR(ob); ++ ++ WARN_ON(reserve == RESERVE_MOVINGGC && ++ ret != OPEN_BUCKETS_EMPTY); ++ ++ if (cl) ++ return -EAGAIN; ++ if (ret == OPEN_BUCKETS_EMPTY) ++ return -ENOSPC; ++ alloc_failure = true; ++ continue; ++ } ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, flags, ob); ++ ++ bch2_dev_stripe_increment(c, ca, stripe); ++ ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ ++ return alloc_failure ? 
-ENOSPC : -EROFS; ++} ++ ++/* Allocate from stripes: */ ++ ++/* ++ * XXX: use a higher watermark for allocating open buckets here: ++ */ ++static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ unsigned i, nr_have = 0, nr_data = ++ min_t(unsigned, h->nr_active_devs, ++ EC_STRIPE_MAX) - h->redundancy; ++ bool have_cache = true; ++ int ret = 0; ++ ++ BUG_ON(h->blocks.nr > nr_data); ++ BUG_ON(h->parity.nr > h->redundancy); ++ ++ devs = h->devs; ++ ++ open_bucket_for_each(c, &h->parity, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ open_bucket_for_each(c, &h->blocks, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ ++ percpu_down_read(&c->mark_lock); ++ rcu_read_lock(); ++ ++ if (h->parity.nr < h->redundancy) { ++ nr_have = h->parity.nr; ++ ++ ret = bch2_bucket_alloc_set(c, &h->parity, ++ &h->parity_stripe, ++ &devs, ++ h->redundancy, ++ &nr_have, ++ &have_cache, ++ RESERVE_NONE, ++ 0, ++ NULL); ++ if (ret) ++ goto err; ++ } ++ ++ if (h->blocks.nr < nr_data) { ++ nr_have = h->blocks.nr; ++ ++ ret = bch2_bucket_alloc_set(c, &h->blocks, ++ &h->block_stripe, ++ &devs, ++ nr_data, ++ &nr_have, ++ &have_cache, ++ RESERVE_NONE, ++ 0, ++ NULL); ++ if (ret) ++ goto err; ++ } ++ ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ ++ return bch2_ec_stripe_new_alloc(c, h); ++err: ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ return -1; ++} ++ ++/* ++ * if we can't allocate a new stripe because there are already too many ++ * partially filled stripes, force allocating from an existing stripe even when ++ * it's to a device we don't want: ++ */ ++ ++static void bucket_alloc_from_stripe(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags) ++{ ++ struct dev_alloc_list devs_sorted; ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ struct bch_dev *ca; ++ unsigned i, ec_idx; ++ ++ if (!erasure_code) ++ return; ++ ++ if (nr_replicas < 2) ++ return; ++ ++ if (ec_open_bucket(c, ptrs)) ++ return; ++ ++ h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1); ++ if (!h) ++ return; ++ ++ if (!h->s && ec_stripe_alloc(c, h)) ++ goto out_put_head; ++ ++ rcu_read_lock(); ++ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); ++ rcu_read_unlock(); ++ ++ for (i = 0; i < devs_sorted.nr; i++) ++ open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) ++ if (ob->ptr.dev == devs_sorted.devs[i] && ++ !test_and_set_bit(ec_idx, h->s->blocks_allocated)) ++ goto got_bucket; ++ goto out_put_head; ++got_bucket: ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ ob->ec_idx = ec_idx; ++ ob->ec = h->s; ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, flags, ob); ++ atomic_inc(&h->s->pin); ++out_put_head: ++ bch2_ec_stripe_head_put(h); ++} ++ ++/* Sector allocator */ ++ ++static void get_buckets_from_writepoint(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ bool need_ec) ++{ ++ struct open_buckets ptrs_skip = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ if (*nr_effective < nr_replicas && ++ 
test_bit(ob->ptr.dev, devs_may_alloc->d) && ++ (ca->mi.durability || ++ (wp->type == BCH_DATA_USER && !*have_cache)) && ++ (ob->ec || !need_ec)) { ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, ++ flags, ob); ++ } else { ++ ob_push(c, &ptrs_skip, ob); ++ } ++ } ++ wp->ptrs = ptrs_skip; ++} ++ ++static int open_bucket_add_buckets(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_list *devs_have, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *_cl) ++{ ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ struct closure *cl = NULL; ++ unsigned i; ++ int ret; ++ ++ rcu_read_lock(); ++ devs = target_rw_devs(c, wp->type, target); ++ rcu_read_unlock(); ++ ++ /* Don't allocate from devices we already have pointers to: */ ++ for (i = 0; i < devs_have->nr; i++) ++ __clear_bit(devs_have->devs[i], devs.d); ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ ++ if (erasure_code) { ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, true); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ ++ bucket_alloc_from_stripe(c, ptrs, wp, &devs, ++ target, erasure_code, ++ nr_replicas, nr_effective, ++ have_cache, flags); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, false); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ ++ percpu_down_read(&c->mark_lock); ++ rcu_read_lock(); ++ ++retry_blocking: ++ /* ++ * Try nonblocking first, so that if one device is full we'll try from ++ * other devices: ++ */ ++ ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, ++ nr_replicas, nr_effective, have_cache, ++ reserve, flags, cl); ++ if (ret && ret != -EROFS && !cl && _cl) { ++ cl = _cl; ++ goto retry_blocking; ++ } ++ ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, ++ struct open_buckets *obs) ++{ ++ struct open_buckets ptrs = { .nr = 0 }; ++ struct open_bucket *ob, *ob2; ++ unsigned i, j; ++ ++ open_bucket_for_each(c, obs, ob, i) { ++ bool drop = !ca || ob->ptr.dev == ca->dev_idx; ++ ++ if (!drop && ob->ec) { ++ mutex_lock(&ob->ec->lock); ++ open_bucket_for_each(c, &ob->ec->blocks, ob2, j) ++ drop |= ob2->ptr.dev == ca->dev_idx; ++ open_bucket_for_each(c, &ob->ec->parity, ob2, j) ++ drop |= ob2->ptr.dev == ca->dev_idx; ++ mutex_unlock(&ob->ec->lock); ++ } ++ ++ if (drop) ++ bch2_open_bucket_put(c, ob); ++ else ++ ob_push(c, &ptrs, ob); ++ } ++ ++ *obs = ptrs; ++} ++ ++void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, ++ struct write_point *wp) ++{ ++ mutex_lock(&wp->lock); ++ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); ++ mutex_unlock(&wp->lock); ++} ++ ++static inline struct hlist_head *writepoint_hash(struct bch_fs *c, ++ unsigned long write_point) ++{ ++ unsigned hash = ++ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); ++ ++ return &c->write_points_hash[hash]; ++} ++ ++static struct write_point *__writepoint_find(struct hlist_head *head, ++ unsigned long write_point) ++{ ++ struct write_point *wp; ++ ++ hlist_for_each_entry_rcu(wp, head, node) ++ if (wp->write_point == write_point) ++ return wp; ++ ++ return NULL; ++} ++ ++static inline bool too_many_writepoints(struct 
bch_fs *c, unsigned factor) ++{ ++ u64 stranded = c->write_points_nr * c->bucket_size_max; ++ u64 free = bch2_fs_usage_read_short(c).free; ++ ++ return stranded * factor > free; ++} ++ ++static bool try_increase_writepoints(struct bch_fs *c) ++{ ++ struct write_point *wp; ++ ++ if (c->write_points_nr == ARRAY_SIZE(c->write_points) || ++ too_many_writepoints(c, 32)) ++ return false; ++ ++ wp = c->write_points + c->write_points_nr++; ++ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); ++ return true; ++} ++ ++static bool try_decrease_writepoints(struct bch_fs *c, ++ unsigned old_nr) ++{ ++ struct write_point *wp; ++ ++ mutex_lock(&c->write_points_hash_lock); ++ if (c->write_points_nr < old_nr) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return true; ++ } ++ ++ if (c->write_points_nr == 1 || ++ !too_many_writepoints(c, 8)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return false; ++ } ++ ++ wp = c->write_points + --c->write_points_nr; ++ ++ hlist_del_rcu(&wp->node); ++ mutex_unlock(&c->write_points_hash_lock); ++ ++ bch2_writepoint_stop(c, NULL, wp); ++ return true; ++} ++ ++static struct write_point *writepoint_find(struct bch_fs *c, ++ unsigned long write_point) ++{ ++ struct write_point *wp, *oldest; ++ struct hlist_head *head; ++ ++ if (!(write_point & 1UL)) { ++ wp = (struct write_point *) write_point; ++ mutex_lock(&wp->lock); ++ return wp; ++ } ++ ++ head = writepoint_hash(c, write_point); ++restart_find: ++ wp = __writepoint_find(head, write_point); ++ if (wp) { ++lock_wp: ++ mutex_lock(&wp->lock); ++ if (wp->write_point == write_point) ++ goto out; ++ mutex_unlock(&wp->lock); ++ goto restart_find; ++ } ++restart_find_oldest: ++ oldest = NULL; ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) ++ if (!oldest || time_before64(wp->last_used, oldest->last_used)) ++ oldest = wp; ++ ++ mutex_lock(&oldest->lock); ++ mutex_lock(&c->write_points_hash_lock); ++ if (oldest >= c->write_points + c->write_points_nr || ++ try_increase_writepoints(c)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto restart_find_oldest; ++ } ++ ++ wp = __writepoint_find(head, write_point); ++ if (wp && wp != oldest) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto lock_wp; ++ } ++ ++ wp = oldest; ++ hlist_del_rcu(&wp->node); ++ wp->write_point = write_point; ++ hlist_add_head_rcu(&wp->node, head); ++ mutex_unlock(&c->write_points_hash_lock); ++out: ++ wp->last_used = sched_clock(); ++ return wp; ++} ++ ++/* ++ * Get us an open_bucket we can allocate from, return with it locked: ++ */ ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct write_point *wp; ++ struct open_bucket *ob; ++ struct open_buckets ptrs; ++ unsigned nr_effective, write_points_nr; ++ unsigned ob_flags = 0; ++ bool have_cache; ++ int ret, i; ++ ++ if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) ++ ob_flags |= BUCKET_ALLOC_USE_DURABILITY; ++ ++ BUG_ON(!nr_replicas || !nr_replicas_required); ++retry: ++ ptrs.nr = 0; ++ nr_effective = 0; ++ write_points_nr = c->write_points_nr; ++ have_cache = false; ++ ++ wp = writepoint_find(c, write_point.v); ++ ++ if (wp->type == BCH_DATA_USER) ++ ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; ++ ++ /* 
metadata may not allocate on cache devices: */ ++ if (wp->type != BCH_DATA_USER) ++ have_cache = true; ++ ++ if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } else { ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, NULL); ++ if (!ret) ++ goto alloc_done; ++ ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ 0, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } ++alloc_done: ++ BUG_ON(!ret && nr_effective < nr_replicas); ++ ++ if (erasure_code && !ec_open_bucket(c, &ptrs)) ++ pr_debug("failed to get ec bucket: ret %u", ret); ++ ++ if (ret == -EROFS && ++ nr_effective >= nr_replicas_required) ++ ret = 0; ++ ++ if (ret) ++ goto err; ++ ++ /* Free buckets we didn't use: */ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER); ++ ++ wp->ptrs = ptrs; ++ ++ wp->sectors_free = UINT_MAX; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ wp->sectors_free = min(wp->sectors_free, ob->sectors_free); ++ ++ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); ++ ++ verify_not_stale(c, &wp->ptrs); ++ ++ return wp; ++err: ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ++ ob_push(c, &ptrs, ob); ++ else ++ open_bucket_free_unused(c, ob, ++ wp->type == BCH_DATA_USER); ++ wp->ptrs = ptrs; ++ ++ mutex_unlock(&wp->lock); ++ ++ if (ret == -ENOSPC && ++ try_decrease_writepoints(c, write_points_nr)) ++ goto retry; ++ ++ return ERR_PTR(ret); ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, ++ struct bkey_i *k, unsigned sectors) ++ ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ BUG_ON(sectors > wp->sectors_free); ++ wp->sectors_free -= sectors; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_extent_ptr tmp = ob->ptr; ++ ++ tmp.cached = !ca->mi.durability && ++ wp->type == BCH_DATA_USER; ++ ++ tmp.offset += ca->mi.bucket_size - ob->sectors_free; ++ bch2_bkey_append_ptr(k, tmp); ++ ++ BUG_ON(sectors > ob->sectors_free); ++ ob->sectors_free -= sectors; ++ } ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ ob_push(c, !ob->sectors_free ? 
&ptrs : &keep, ob); ++ wp->ptrs = keep; ++ ++ mutex_unlock(&wp->lock); ++ ++ bch2_open_buckets_put(c, &ptrs); ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ struct write_point *wp; ++ ++ mutex_init(&c->write_points_hash_lock); ++ c->write_points_nr = ARRAY_SIZE(c->write_points); ++ ++ /* open bucket 0 is a sentinal NULL: */ ++ spin_lock_init(&c->open_buckets[0].lock); ++ ++ for (ob = c->open_buckets + 1; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { ++ spin_lock_init(&ob->lock); ++ c->open_buckets_nr_free++; ++ ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ } ++ ++ writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); ++ writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); ++ ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) { ++ writepoint_init(wp, BCH_DATA_USER); ++ ++ wp->last_used = sched_clock(); ++ wp->write_point = (unsigned long) wp; ++ hlist_add_head_rcu(&wp->node, ++ writepoint_hash(c, wp->write_point)); ++ } ++} +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +new file mode 100644 +index 000000000000..687f973e4b3a +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.h +@@ -0,0 +1,133 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_FOREGROUND_H ++#define _BCACHEFS_ALLOC_FOREGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++ ++#include ++ ++struct bkey; ++struct bch_dev; ++struct bch_fs; ++struct bch_devs_List; ++ ++struct dev_alloc_list { ++ unsigned nr; ++ u8 devs[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, ++ struct dev_stripe_state *, ++ struct bch_devs_mask *); ++void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *, ++ struct dev_stripe_state *); ++ ++long bch2_bucket_alloc_new_fs(struct bch_dev *); ++ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, ++ enum alloc_reserve, bool, ++ struct closure *); ++ ++static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, ++ struct open_bucket *ob) ++{ ++ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); ++ ++ obs->v[obs->nr++] = ob - c->open_buckets; ++} ++ ++#define open_bucket_for_each(_c, _obs, _ob, _i) \ ++ for ((_i) = 0; \ ++ (_i) < (_obs)->nr && \ ++ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ ++ (_i)++) ++ ++static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, ++ struct open_buckets *obs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->ec) ++ return ob; ++ ++ return NULL; ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *, ++ struct open_buckets *, unsigned); ++ ++void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); ++ ++static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ if (atomic_dec_and_test(&ob->pin)) ++ __bch2_open_bucket_put(c, ob); ++} ++ ++static inline void bch2_open_buckets_put(struct bch_fs *c, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ bch2_open_bucket_put(c, ob); ++ ptrs->nr = 0; ++} ++ ++static inline void bch2_open_bucket_get(struct bch_fs *c, ++ struct write_point *wp, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ ob->type = wp->type; ++ atomic_inc(&ob->pin); ++ ob_push(c, ptrs, ob); ++ } ++} ++ ++struct 
write_point *bch2_alloc_sectors_start(struct bch_fs *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *); ++ ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, ++ struct bkey_i *, unsigned); ++void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, ++ struct open_buckets *); ++ ++void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, ++ struct write_point *); ++ ++static inline struct write_point_specifier writepoint_hashed(unsigned long v) ++{ ++ return (struct write_point_specifier) { .v = v | 1 }; ++} ++ ++static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) ++{ ++ return (struct write_point_specifier) { .v = (unsigned long) wp }; ++} ++ ++static inline void writepoint_init(struct write_point *wp, ++ enum bch_data_type type) ++{ ++ mutex_init(&wp->lock); ++ wp->type = type; ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +new file mode 100644 +index 000000000000..832568dc9551 +--- /dev/null ++++ b/fs/bcachefs/alloc_types.h +@@ -0,0 +1,106 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_TYPES_H ++#define _BCACHEFS_ALLOC_TYPES_H ++ ++#include ++#include ++ ++#include "clock_types.h" ++#include "fifo.h" ++ ++struct ec_bucket_buf; ++ ++/* There's two of these clocks, one for reads and one for writes: */ ++struct bucket_clock { ++ /* ++ * "now" in (read/write) IO time - incremented whenever we do X amount ++ * of reads or writes. ++ * ++ * Goes with the bucket read/write prios: when we read or write to a ++ * bucket we reset the bucket's prio to the current hand; thus hand - ++ * prio = time since bucket was last read/written. ++ * ++ * The units are some amount (bytes/sectors) of data read/written, and ++ * the units can change on the fly if we need to rescale to fit ++ * everything in a u16 - your only guarantee is that the units are ++ * consistent. 
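A tiny numeric sketch of the bucket clock just described: the hand advances with IO, each bucket remembers the hand value at its last IO, and hand minus that value gives a relative age in u16 arithmetic; halving everything is one simple way to rescale without materially disturbing the ordering. The real rescale logic differs in detail - this only shows why the u16 representation works - and the demo_* types are made up.

#include <stdint.h>
#include <stdio.h>

struct demo_clock  { uint16_t hand; };
struct demo_bucket { uint16_t last_io; };

static unsigned demo_bucket_age(const struct demo_clock *clk,
				const struct demo_bucket *b)
{
	return (uint16_t)(clk->hand - b->last_io);
}

static void demo_clock_rescale(struct demo_clock *clk,
			       struct demo_bucket *buckets, unsigned nr)
{
	unsigned i;

	/* Halve the hand and every last_io value: relative ages shrink by
	 * the same factor, so the LRU ordering is (approximately) kept. */
	clk->hand /= 2;
	for (i = 0; i < nr; i++)
		buckets[i].last_io /= 2;
}

int main(void)
{
	struct demo_clock clk = { .hand = 60000 };
	struct demo_bucket b[2] = { { .last_io = 100 }, { .last_io = 59000 } };

	printf("ages: %u %u\n", demo_bucket_age(&clk, &b[0]),
				demo_bucket_age(&clk, &b[1]));

	demo_clock_rescale(&clk, b, 2);
	printf("after rescale: %u %u\n", demo_bucket_age(&clk, &b[0]),
					 demo_bucket_age(&clk, &b[1]));
	return 0;
}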
++ */ ++ u16 hand; ++ u16 max_last_io; ++ ++ int rw; ++ ++ struct io_timer rescale; ++ struct mutex lock; ++}; ++ ++/* There is one reserve for each type of btree, one for prios and gens ++ * and one for moving GC */ ++enum alloc_reserve { ++ RESERVE_ALLOC = -1, ++ RESERVE_BTREE = 0, ++ RESERVE_MOVINGGC = 1, ++ RESERVE_NONE = 2, ++ RESERVE_NR = 3, ++}; ++ ++typedef FIFO(long) alloc_fifo; ++ ++/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ ++#define OPEN_BUCKETS_COUNT 256 ++ ++#define WRITE_POINT_HASH_NR 32 ++#define WRITE_POINT_MAX 32 ++ ++struct open_bucket { ++ spinlock_t lock; ++ atomic_t pin; ++ u8 freelist; ++ u8 ec_idx; ++ u8 type; ++ unsigned valid:1; ++ unsigned on_partial_list:1; ++ unsigned sectors_free; ++ struct bch_extent_ptr ptr; ++ struct ec_stripe_new *ec; ++}; ++ ++#define OPEN_BUCKET_LIST_MAX 15 ++ ++struct open_buckets { ++ u8 nr; ++ u8 v[OPEN_BUCKET_LIST_MAX]; ++}; ++ ++struct dev_stripe_state { ++ u64 next_alloc[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct write_point { ++ struct hlist_node node; ++ struct mutex lock; ++ u64 last_used; ++ unsigned long write_point; ++ enum bch_data_type type; ++ bool is_ec; ++ ++ /* calculated based on how many pointers we're actually going to use: */ ++ unsigned sectors_free; ++ ++ struct open_buckets ptrs; ++ struct dev_stripe_state stripe; ++}; ++ ++struct write_point_specifier { ++ unsigned long v; ++}; ++ ++struct alloc_heap_entry { ++ size_t bucket; ++ size_t nr; ++ unsigned long key; ++}; ++ ++typedef HEAP(struct alloc_heap_entry) alloc_heap; ++ ++#endif /* _BCACHEFS_ALLOC_TYPES_H */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +new file mode 100644 +index 000000000000..4c209c6b4aad +--- /dev/null ++++ b/fs/bcachefs/bcachefs.h +@@ -0,0 +1,855 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_H ++#define _BCACHEFS_H ++ ++/* ++ * SOME HIGH LEVEL CODE DOCUMENTATION: ++ * ++ * Bcache mostly works with cache sets, cache devices, and backing devices. ++ * ++ * Support for multiple cache devices hasn't quite been finished off yet, but ++ * it's about 95% plumbed through. A cache set and its cache devices is sort of ++ * like a md raid array and its component devices. Most of the code doesn't care ++ * about individual cache devices, the main abstraction is the cache set. ++ * ++ * Multiple cache devices is intended to give us the ability to mirror dirty ++ * cached data and metadata, without mirroring clean cached data. ++ * ++ * Backing devices are different, in that they have a lifetime independent of a ++ * cache set. When you register a newly formatted backing device it'll come up ++ * in passthrough mode, and then you can attach and detach a backing device from ++ * a cache set at runtime - while it's mounted and in use. Detaching implicitly ++ * invalidates any cached data for that backing device. ++ * ++ * A cache set can have multiple (many) backing devices attached to it. ++ * ++ * There's also flash only volumes - this is the reason for the distinction ++ * between struct cached_dev and struct bcache_device. A flash only volume ++ * works much like a bcache device that has a backing device, except the ++ * "cached" data is always dirty. The end result is that we get thin ++ * provisioning with very little additional code. ++ * ++ * Flash only volumes work but they're not production ready because the moving ++ * garbage collector needs more work. More on that later. 
++ * ++ * BUCKETS/ALLOCATION: ++ * ++ * Bcache is primarily designed for caching, which means that in normal ++ * operation all of our available space will be allocated. Thus, we need an ++ * efficient way of deleting things from the cache so we can write new things to ++ * it. ++ * ++ * To do this, we first divide the cache device up into buckets. A bucket is the ++ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ ++ * works efficiently. ++ * ++ * Each bucket has a 16 bit priority, and an 8 bit generation associated with ++ * it. The gens and priorities for all the buckets are stored contiguously and ++ * packed on disk (in a linked list of buckets - aside from the superblock, all ++ * of bcache's metadata is stored in buckets). ++ * ++ * The priority is used to implement an LRU. We reset a bucket's priority when ++ * we allocate it or on cache it, and every so often we decrement the priority ++ * of each bucket. It could be used to implement something more sophisticated, ++ * if anyone ever gets around to it. ++ * ++ * The generation is used for invalidating buckets. Each pointer also has an 8 ++ * bit generation embedded in it; for a pointer to be considered valid, its gen ++ * must match the gen of the bucket it points into. Thus, to reuse a bucket all ++ * we have to do is increment its gen (and write its new gen to disk; we batch ++ * this up). ++ * ++ * Bcache is entirely COW - we never write twice to a bucket, even buckets that ++ * contain metadata (including btree nodes). ++ * ++ * THE BTREE: ++ * ++ * Bcache is in large part design around the btree. ++ * ++ * At a high level, the btree is just an index of key -> ptr tuples. ++ * ++ * Keys represent extents, and thus have a size field. Keys also have a variable ++ * number of pointers attached to them (potentially zero, which is handy for ++ * invalidating the cache). ++ * ++ * The key itself is an inode:offset pair. The inode number corresponds to a ++ * backing device or a flash only volume. The offset is the ending offset of the ++ * extent within the inode - not the starting offset; this makes lookups ++ * slightly more convenient. ++ * ++ * Pointers contain the cache device id, the offset on that device, and an 8 bit ++ * generation number. More on the gen later. ++ * ++ * Index lookups are not fully abstracted - cache lookups in particular are ++ * still somewhat mixed in with the btree code, but things are headed in that ++ * direction. ++ * ++ * Updates are fairly well abstracted, though. There are two different ways of ++ * updating the btree; insert and replace. ++ * ++ * BTREE_INSERT will just take a list of keys and insert them into the btree - ++ * overwriting (possibly only partially) any extents they overlap with. This is ++ * used to update the index after a write. ++ * ++ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is ++ * overwriting a key that matches another given key. This is used for inserting ++ * data into the cache after a cache miss, and for background writeback, and for ++ * the moving garbage collector. ++ * ++ * There is no "delete" operation; deleting things from the index is ++ * accomplished by either by invalidating pointers (by incrementing a bucket's ++ * gen) or by inserting a key with 0 pointers - which will overwrite anything ++ * previously present at that location in the index. ++ * ++ * This means that there are always stale/invalid keys in the btree. 
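As a quick aside on why extent keys use the *ending* offset, here is a made-up, self-contained example (not the bcachefs bkey code): a lookup for offset O scans for the first key whose end exceeds O, and that single candidate either covers O or O sits in a hole.

#include <stdint.h>
#include <stdio.h>

struct demo_extent {
	uint64_t end;	/* key: ending offset within the inode */
	uint64_t size;	/* so the extent covers [end - size, end) */
};

/* Extents sorted by end offset, as they would be within one inode: */
static const struct demo_extent extents[] = {
	{ .end = 8,  .size = 8 },	/* covers [0, 8)                      */
	{ .end = 24, .size = 8 },	/* covers [16, 24) - hole at [8, 16)  */
	{ .end = 32, .size = 8 },	/* covers [24, 32)                    */
};

static const struct demo_extent *demo_lookup(uint64_t offset)
{
	size_t i;

	for (i = 0; i < sizeof(extents) / sizeof(extents[0]); i++)
		if (extents[i].end > offset)
			return extents[i].end - extents[i].size <= offset
				? &extents[i] : NULL;	/* hole */
	return NULL;	/* past the last extent */
}

int main(void)
{
	uint64_t o;

	for (o = 0; o < 36; o += 4) {
		const struct demo_extent *e = demo_lookup(o);

		if (e)
			printf("offset %2llu -> extent ending at %llu\n",
			       (unsigned long long) o,
			       (unsigned long long) e->end);
		else
			printf("offset %2llu -> hole\n",
			       (unsigned long long) o);
	}
	return 0;
}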
They're ++ * filtered out by the code that iterates through a btree node, and removed when ++ * a btree node is rewritten. ++ * ++ * BTREE NODES: ++ * ++ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and ++ * free smaller than a bucket - so, that's how big our btree nodes are. ++ * ++ * (If buckets are really big we'll only use part of the bucket for a btree node ++ * - no less than 1/4th - but a bucket still contains no more than a single ++ * btree node. I'd actually like to change this, but for now we rely on the ++ * bucket's gen for deleting btree nodes when we rewrite/split a node.) ++ * ++ * Anyways, btree nodes are big - big enough to be inefficient with a textbook ++ * btree implementation. ++ * ++ * The way this is solved is that btree nodes are internally log structured; we ++ * can append new keys to an existing btree node without rewriting it. This ++ * means each set of keys we write is sorted, but the node is not. ++ * ++ * We maintain this log structure in memory - keeping 1Mb of keys sorted would ++ * be expensive, and we have to distinguish between the keys we have written and ++ * the keys we haven't. So to do a lookup in a btree node, we have to search ++ * each sorted set. But we do merge written sets together lazily, so the cost of ++ * these extra searches is quite low (normally most of the keys in a btree node ++ * will be in one big set, and then there'll be one or two sets that are much ++ * smaller). ++ * ++ * This log structure makes bcache's btree more of a hybrid between a ++ * conventional btree and a compacting data structure, with some of the ++ * advantages of both. ++ * ++ * GARBAGE COLLECTION: ++ * ++ * We can't just invalidate any bucket - it might contain dirty data or ++ * metadata. If it once contained dirty data, other writes might overwrite it ++ * later, leaving no valid pointers into that bucket in the index. ++ * ++ * Thus, the primary purpose of garbage collection is to find buckets to reuse. ++ * It also counts how much valid data it each bucket currently contains, so that ++ * allocation can reuse buckets sooner when they've been mostly overwritten. ++ * ++ * It also does some things that are really internal to the btree ++ * implementation. If a btree node contains pointers that are stale by more than ++ * some threshold, it rewrites the btree node to avoid the bucket's generation ++ * wrapping around. It also merges adjacent btree nodes if they're empty enough. ++ * ++ * THE JOURNAL: ++ * ++ * Bcache's journal is not necessary for consistency; we always strictly ++ * order metadata writes so that the btree and everything else is consistent on ++ * disk in the event of an unclean shutdown, and in fact bcache had writeback ++ * caching (with recovery from unclean shutdown) before journalling was ++ * implemented. ++ * ++ * Rather, the journal is purely a performance optimization; we can't complete a ++ * write until we've updated the index on disk, otherwise the cache would be ++ * inconsistent in the event of an unclean shutdown. This means that without the ++ * journal, on random write workloads we constantly have to update all the leaf ++ * nodes in the btree, and those writes will be mostly empty (appending at most ++ * a few keys each) - highly inefficient in terms of amount of metadata writes, ++ * and it puts more strain on the various btree resorting/compacting code. ++ * ++ * The journal is just a log of keys we've inserted; on startup we just reinsert ++ * all the keys in the open journal entries. 
That means that when we're updating ++ * a node in the btree, we can wait until a 4k block of keys fills up before ++ * writing them out. ++ * ++ * For simplicity, we only journal updates to leaf nodes; updates to parent ++ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth ++ * the complexity to deal with journalling them (in particular, journal replay) ++ * - updates to non leaf nodes just happen synchronously (see btree_split()). ++ */ ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "bcachefs_format.h" ++#include "fifo.h" ++#include "opts.h" ++#include "util.h" ++ ++#define dynamic_fault(...) 0 ++#define race_fault(...) 0 ++ ++#define bch2_fs_init_fault(name) \ ++ dynamic_fault("bcachefs:bch_fs_init:" name) ++#define bch2_meta_read_fault(name) \ ++ dynamic_fault("bcachefs:meta:read:" name) ++#define bch2_meta_write_fault(name) \ ++ dynamic_fault("bcachefs:meta:write:" name) ++ ++#ifdef __KERNEL__ ++#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) ++#else ++#define bch2_fmt(_c, fmt) fmt "\n" ++#endif ++ ++#define bch_info(c, fmt, ...) \ ++ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_notice(c, fmt, ...) \ ++ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn(c, fmt, ...) \ ++ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err(c, fmt, ...) \ ++ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++ ++#define bch_verbose(c, fmt, ...) \ ++do { \ ++ if ((c)->opts.verbose) \ ++ bch_info(c, fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define pr_verbose_init(opts, fmt, ...) 
\ ++do { \ ++ if (opt_get(opts, verbose)) \ ++ pr_info(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++/* Parameters that are useful for debugging, but should always be compiled in: */ ++#define BCH_DEBUG_PARAMS_ALWAYS() \ ++ BCH_DEBUG_PARAM(key_merging_disabled, \ ++ "Disables merging of extents") \ ++ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ ++ "Causes mark and sweep to compact and rewrite every " \ ++ "btree node it traverses") \ ++ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ ++ "Disables rewriting of btree nodes during mark and sweep")\ ++ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ ++ "Disables the shrinker callback for the btree node cache") ++ ++/* Parameters that should only be compiled in in debug mode: */ ++#define BCH_DEBUG_PARAMS_DEBUG() \ ++ BCH_DEBUG_PARAM(expensive_debug_checks, \ ++ "Enables various runtime debugging checks that " \ ++ "significantly affect performance") \ ++ BCH_DEBUG_PARAM(debug_check_iterators, \ ++ "Enables extra verification for btree iterators") \ ++ BCH_DEBUG_PARAM(debug_check_bkeys, \ ++ "Run bkey_debugcheck (primarily checking GC/allocation "\ ++ "information) when iterating over keys") \ ++ BCH_DEBUG_PARAM(verify_btree_ondisk, \ ++ "Reread btree nodes at various points to verify the " \ ++ "mergesort in the read path against modifications " \ ++ "done in memory") \ ++ BCH_DEBUG_PARAM(journal_seq_verify, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(inject_invalid_keys, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(test_alloc_startup, \ ++ "Force allocator startup to use the slowpath where it" \ ++ "can't find enough free buckets without invalidating" \ ++ "cached data") \ ++ BCH_DEBUG_PARAM(force_reconstruct_read, \ ++ "Force reads to use the reconstruct path, when reading" \ ++ "from erasure coded extents") \ ++ BCH_DEBUG_PARAM(test_restart_gc, \ ++ "Test restarting mark and sweep gc when bucket gens change") ++ ++#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() ++#else ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() ++#endif ++ ++#define BCH_TIME_STATS() \ ++ x(btree_node_mem_alloc) \ ++ x(btree_node_split) \ ++ x(btree_node_sort) \ ++ x(btree_node_read) \ ++ x(btree_gc) \ ++ x(btree_lock_contended_read) \ ++ x(btree_lock_contended_intent) \ ++ x(btree_lock_contended_write) \ ++ x(data_write) \ ++ x(data_read) \ ++ x(data_promote) \ ++ x(journal_write) \ ++ x(journal_delay) \ ++ x(journal_flush_seq) \ ++ x(blocked_journal) \ ++ x(blocked_allocate) \ ++ x(blocked_allocate_open_bucket) ++ ++enum bch_time_stats { ++#define x(name) BCH_TIME_##name, ++ BCH_TIME_STATS() ++#undef x ++ BCH_TIME_STAT_NR ++}; ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "clock_types.h" ++#include "ec_types.h" ++#include "journal_types.h" ++#include "keylist_types.h" ++#include "quota_types.h" ++#include "rebalance_types.h" ++#include "replicas_types.h" ++#include "super_types.h" ++ ++/* Number of nodes btree coalesce will try to coalesce at once */ ++#define GC_MERGE_NODES 4U ++ ++/* Maximum number of nodes we might need to allocate atomically: */ ++#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) ++ 
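The BCH_TIME_STATS()/x() construct above is a classic x-macro: one list expanded several times under different definitions of x(). A minimal standalone example of the same pattern, generating both an enum and a matching name table (the three stat names below are just a subset picked for the demo):

#include <stdio.h>

#define DEMO_TIME_STATS()		\
	x(btree_node_split)		\
	x(journal_write)		\
	x(data_read)

enum demo_time_stats {
#define x(name)	DEMO_TIME_##name,
	DEMO_TIME_STATS()
#undef x
	DEMO_TIME_STAT_NR
};

static const char * const demo_time_stat_names[] = {
#define x(name)	#name,
	DEMO_TIME_STATS()
#undef x
};

int main(void)
{
	int i;

	/* The enum and the name table stay in sync by construction: */
	for (i = 0; i < DEMO_TIME_STAT_NR; i++)
		printf("%d: %s\n", i, demo_time_stat_names[i]);
	return 0;
}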
++/* Size of the freelist we allocate btree nodes from: */ ++#define BTREE_NODE_RESERVE BTREE_RESERVE_MAX ++ ++#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) ++ ++struct btree; ++ ++enum gc_phase { ++ GC_PHASE_NOT_RUNNING, ++ GC_PHASE_START, ++ GC_PHASE_SB, ++ ++ GC_PHASE_BTREE_EC, ++ GC_PHASE_BTREE_EXTENTS, ++ GC_PHASE_BTREE_INODES, ++ GC_PHASE_BTREE_DIRENTS, ++ GC_PHASE_BTREE_XATTRS, ++ GC_PHASE_BTREE_ALLOC, ++ GC_PHASE_BTREE_QUOTAS, ++ GC_PHASE_BTREE_REFLINK, ++ ++ GC_PHASE_PENDING_DELETE, ++ GC_PHASE_ALLOC, ++}; ++ ++struct gc_pos { ++ enum gc_phase phase; ++ struct bpos pos; ++ unsigned level; ++}; ++ ++struct io_count { ++ u64 sectors[2][BCH_DATA_NR]; ++}; ++ ++struct bch_dev { ++ struct kobject kobj; ++ struct percpu_ref ref; ++ struct completion ref_completion; ++ struct percpu_ref io_ref; ++ struct completion io_ref_completion; ++ ++ struct bch_fs *fs; ++ ++ u8 dev_idx; ++ /* ++ * Cached version of this device's member info from superblock ++ * Committed by bch2_write_super() -> bch_fs_mi_update() ++ */ ++ struct bch_member_cpu mi; ++ uuid_le uuid; ++ char name[BDEVNAME_SIZE]; ++ ++ struct bch_sb_handle disk_sb; ++ struct bch_sb *sb_read_scratch; ++ int sb_write_error; ++ ++ struct bch_devs_mask self; ++ ++ /* biosets used in cloned bios for writing multiple replicas */ ++ struct bio_set replica_set; ++ ++ /* ++ * Buckets: ++ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and ++ * gc_lock, for device resize - holding any is sufficient for access: ++ * Or rcu_read_lock(), but only for ptr_stale(): ++ */ ++ struct bucket_array __rcu *buckets[2]; ++ unsigned long *buckets_nouse; ++ struct rw_semaphore bucket_lock; ++ ++ struct bch_dev_usage __percpu *usage[2]; ++ ++ /* Allocator: */ ++ struct task_struct __rcu *alloc_thread; ++ ++ /* ++ * free: Buckets that are ready to be used ++ * ++ * free_inc: Incoming buckets - these are buckets that currently have ++ * cached data in them, and we can't reuse them until after we write ++ * their new gen to disk. 
After prio_write() finishes writing the new ++ * gens/prios, they'll be moved to the free list (and possibly discarded ++ * in the process) ++ */ ++ alloc_fifo free[RESERVE_NR]; ++ alloc_fifo free_inc; ++ spinlock_t freelist_lock; ++ ++ u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; ++ unsigned open_buckets_partial_nr; ++ ++ size_t fifo_last_bucket; ++ ++ /* last calculated minimum prio */ ++ u16 max_last_bucket_io[2]; ++ ++ size_t inc_gen_needs_gc; ++ size_t inc_gen_really_needs_gc; ++ ++ /* ++ * XXX: this should be an enum for allocator state, so as to include ++ * error state ++ */ ++ enum { ++ ALLOCATOR_STOPPED, ++ ALLOCATOR_RUNNING, ++ ALLOCATOR_BLOCKED, ++ ALLOCATOR_BLOCKED_FULL, ++ } allocator_state; ++ ++ alloc_heap alloc_heap; ++ ++ /* Copying GC: */ ++ struct task_struct *copygc_thread; ++ copygc_heap copygc_heap; ++ struct bch_pd_controller copygc_pd; ++ struct write_point copygc_write_point; ++ u64 copygc_threshold; ++ ++ atomic64_t rebalance_work; ++ ++ struct journal_device journal; ++ ++ struct work_struct io_error_work; ++ ++ /* The rest of this all shows up in sysfs */ ++ atomic64_t cur_latency[2]; ++ struct time_stats io_latency[2]; ++ ++#define CONGESTED_MAX 1024 ++ atomic_t congested; ++ u64 congested_last; ++ ++ struct io_count __percpu *io_done; ++}; ++ ++enum { ++ /* startup: */ ++ BCH_FS_ALLOC_READ_DONE, ++ BCH_FS_ALLOCATOR_STARTED, ++ BCH_FS_ALLOCATOR_RUNNING, ++ BCH_FS_INITIAL_GC_DONE, ++ BCH_FS_FSCK_DONE, ++ BCH_FS_STARTED, ++ BCH_FS_RW, ++ ++ /* shutdown: */ ++ BCH_FS_STOPPING, ++ BCH_FS_EMERGENCY_RO, ++ BCH_FS_WRITE_DISABLE_COMPLETE, ++ ++ /* errors: */ ++ BCH_FS_ERROR, ++ BCH_FS_ERRORS_FIXED, ++ ++ /* misc: */ ++ BCH_FS_BDEV_MOUNTED, ++ BCH_FS_FIXED_GENS, ++ BCH_FS_ALLOC_WRITTEN, ++ BCH_FS_REBUILD_REPLICAS, ++ BCH_FS_HOLD_BTREE_WRITES, ++}; ++ ++struct btree_debug { ++ unsigned id; ++ struct dentry *btree; ++ struct dentry *btree_format; ++ struct dentry *failed; ++}; ++ ++struct bch_fs_pcpu { ++ u64 sectors_available; ++}; ++ ++struct journal_seq_blacklist_table { ++ size_t nr; ++ struct journal_seq_blacklist_table_entry { ++ u64 start; ++ u64 end; ++ bool dirty; ++ } entries[0]; ++}; ++ ++struct bch_fs { ++ struct closure cl; ++ ++ struct list_head list; ++ struct kobject kobj; ++ struct kobject internal; ++ struct kobject opts_dir; ++ struct kobject time_stats; ++ unsigned long flags; ++ ++ int minor; ++ struct device *chardev; ++ struct super_block *vfs_sb; ++ char name[40]; ++ ++ /* ro/rw, add/remove devices: */ ++ struct mutex state_lock; ++ ++ /* Counts outstanding writes, for clean transition to read-only */ ++ struct percpu_ref writes; ++ struct work_struct read_only_work; ++ ++ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; ++ ++ struct bch_replicas_cpu replicas; ++ struct bch_replicas_cpu replicas_gc; ++ struct mutex replicas_gc_lock; ++ ++ struct journal_entry_res replicas_journal_res; ++ ++ struct bch_disk_groups_cpu __rcu *disk_groups; ++ ++ struct bch_opts opts; ++ ++ /* Updated by bch2_sb_update():*/ ++ struct { ++ uuid_le uuid; ++ uuid_le user_uuid; ++ ++ u16 version; ++ u16 encoded_extent_max; ++ ++ u8 nr_devices; ++ u8 clean; ++ ++ u8 encryption_type; ++ ++ u64 time_base_lo; ++ u32 time_base_hi; ++ u32 time_precision; ++ u64 features; ++ u64 compat; ++ } sb; ++ ++ struct bch_sb_handle disk_sb; ++ ++ unsigned short block_bits; /* ilog2(block_size) */ ++ ++ u16 btree_foreground_merge_threshold; ++ ++ struct closure sb_write; ++ struct mutex sb_lock; ++ ++ /* BTREE CACHE */ ++ struct bio_set btree_bio; ++ ++ struct btree_root 
btree_roots[BTREE_ID_NR]; ++ bool btree_roots_dirty; ++ struct mutex btree_root_lock; ++ ++ struct btree_cache btree_cache; ++ ++ mempool_t btree_reserve_pool; ++ ++ /* ++ * Cache of allocated btree nodes - if we allocate a btree node and ++ * don't use it, if we free it that space can't be reused until going ++ * _all_ the way through the allocator (which exposes us to a livelock ++ * when allocating btree reserves fail halfway through) - instead, we ++ * can stick them here: ++ */ ++ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; ++ unsigned btree_reserve_cache_nr; ++ struct mutex btree_reserve_cache_lock; ++ ++ mempool_t btree_interior_update_pool; ++ struct list_head btree_interior_update_list; ++ struct mutex btree_interior_update_lock; ++ struct closure_waitlist btree_interior_update_wait; ++ ++ mempool_t btree_iters_pool; ++ ++ struct workqueue_struct *wq; ++ /* copygc needs its own workqueue for index updates.. */ ++ struct workqueue_struct *copygc_wq; ++ struct workqueue_struct *journal_reclaim_wq; ++ ++ /* ALLOCATION */ ++ struct delayed_work pd_controllers_update; ++ unsigned pd_controllers_update_seconds; ++ ++ struct bch_devs_mask rw_devs[BCH_DATA_NR]; ++ ++ u64 capacity; /* sectors */ ++ ++ /* ++ * When capacity _decreases_ (due to a disk being removed), we ++ * increment capacity_gen - this invalidates outstanding reservations ++ * and forces them to be revalidated ++ */ ++ u32 capacity_gen; ++ unsigned bucket_size_max; ++ ++ atomic64_t sectors_available; ++ ++ struct bch_fs_pcpu __percpu *pcpu; ++ ++ struct percpu_rw_semaphore mark_lock; ++ ++ seqcount_t usage_lock; ++ struct bch_fs_usage *usage_base; ++ struct bch_fs_usage __percpu *usage[2]; ++ struct bch_fs_usage __percpu *usage_gc; ++ ++ /* single element mempool: */ ++ struct mutex usage_scratch_lock; ++ struct bch_fs_usage *usage_scratch; ++ ++ /* ++ * When we invalidate buckets, we use both the priority and the amount ++ * of good data to determine which buckets to reuse first - to weight ++ * those together consistently we keep track of the smallest nonzero ++ * priority of any bucket. ++ */ ++ struct bucket_clock bucket_clock[2]; ++ ++ struct io_clock io_clock[2]; ++ ++ /* JOURNAL SEQ BLACKLIST */ ++ struct journal_seq_blacklist_table * ++ journal_seq_blacklist_table; ++ struct work_struct journal_seq_blacklist_gc_work; ++ ++ /* ALLOCATOR */ ++ spinlock_t freelist_lock; ++ struct closure_waitlist freelist_wait; ++ u64 blocked_allocate; ++ u64 blocked_allocate_open_bucket; ++ u8 open_buckets_freelist; ++ u8 open_buckets_nr_free; ++ struct closure_waitlist open_buckets_wait; ++ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; ++ ++ struct write_point btree_write_point; ++ struct write_point rebalance_write_point; ++ ++ struct write_point write_points[WRITE_POINT_MAX]; ++ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; ++ struct mutex write_points_hash_lock; ++ unsigned write_points_nr; ++ ++ /* GARBAGE COLLECTION */ ++ struct task_struct *gc_thread; ++ atomic_t kick_gc; ++ unsigned long gc_count; ++ ++ /* ++ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] ++ * has been marked by GC. ++ * ++ * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) ++ * ++ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread ++ * can read without a lock. ++ */ ++ seqcount_t gc_pos_lock; ++ struct gc_pos gc_pos; ++ ++ /* ++ * The allocation code needs gc_mark in struct bucket to be correct, but ++ * it's not while a gc is in progress. 
++ */ ++ struct rw_semaphore gc_lock; ++ ++ /* IO PATH */ ++ struct bio_set bio_read; ++ struct bio_set bio_read_split; ++ struct bio_set bio_write; ++ struct mutex bio_bounce_pages_lock; ++ mempool_t bio_bounce_pages; ++ struct rhashtable promote_table; ++ ++ mempool_t compression_bounce[2]; ++ mempool_t compress_workspace[BCH_COMPRESSION_NR]; ++ mempool_t decompress_workspace; ++ ZSTD_parameters zstd_params; ++ ++ struct crypto_shash *sha256; ++ struct crypto_sync_skcipher *chacha20; ++ struct crypto_shash *poly1305; ++ ++ atomic64_t key_version; ++ ++ /* REBALANCE */ ++ struct bch_fs_rebalance rebalance; ++ ++ /* STRIPES: */ ++ GENRADIX(struct stripe) stripes[2]; ++ struct mutex ec_stripe_create_lock; ++ ++ ec_stripes_heap ec_stripes_heap; ++ spinlock_t ec_stripes_heap_lock; ++ ++ /* ERASURE CODING */ ++ struct list_head ec_new_stripe_list; ++ struct mutex ec_new_stripe_lock; ++ u64 ec_stripe_hint; ++ ++ struct bio_set ec_bioset; ++ ++ struct work_struct ec_stripe_delete_work; ++ struct llist_head ec_stripe_delete_list; ++ ++ /* REFLINK */ ++ u64 reflink_hint; ++ ++ /* VFS IO PATH - fs-io.c */ ++ struct bio_set writepage_bioset; ++ struct bio_set dio_write_bioset; ++ struct bio_set dio_read_bioset; ++ ++ struct bio_list btree_write_error_list; ++ struct work_struct btree_write_error_work; ++ spinlock_t btree_write_error_lock; ++ ++ /* ERRORS */ ++ struct list_head fsck_errors; ++ struct mutex fsck_error_lock; ++ bool fsck_alloc_err; ++ ++ /* QUOTAS */ ++ struct bch_memquota_type quotas[QTYP_NR]; ++ ++ /* DEBUG JUNK */ ++ struct dentry *debug; ++ struct btree_debug btree_debug[BTREE_ID_NR]; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree *verify_data; ++ struct btree_node *verify_ondisk; ++ struct mutex verify_lock; ++#endif ++ ++ u64 unused_inode_hint; ++ ++ /* ++ * A btree node on disk could have too many bsets for an iterator to fit ++ * on the stack - have to dynamically allocate them ++ */ ++ mempool_t fill_iter; ++ ++ mempool_t btree_bounce_pool; ++ ++ struct journal journal; ++ ++ u64 last_bucket_seq_cleanup; ++ ++ /* The rest of this all shows up in sysfs */ ++ atomic_long_t read_realloc_races; ++ atomic_long_t extent_migrate_done; ++ atomic_long_t extent_migrate_raced; ++ ++ unsigned btree_gc_periodic:1; ++ unsigned copy_gc_enabled:1; ++ bool promote_whole_extents; ++ ++#define BCH_DEBUG_PARAM(name, description) bool name; ++ BCH_DEBUG_PARAMS_ALL() ++#undef BCH_DEBUG_PARAM ++ ++ struct time_stats times[BCH_TIME_STAT_NR]; ++}; ++ ++static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) ++{ ++#ifndef NO_BCACHEFS_FS ++ if (c->vfs_sb) ++ c->vfs_sb->s_bdi->ra_pages = ra_pages; ++#endif ++} ++ ++static inline unsigned bucket_bytes(const struct bch_dev *ca) ++{ ++ return ca->mi.bucket_size << 9; ++} ++ ++static inline unsigned block_bytes(const struct bch_fs *c) ++{ ++ return c->opts.block_size << 9; ++} ++ ++static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) ++{ ++ return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); ++} ++ ++static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) ++{ ++ s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; ++ ++ if (c->sb.time_precision == 1) ++ return ns; ++ ++ return div_s64(ns, c->sb.time_precision); ++} ++ ++static inline s64 bch2_current_time(struct bch_fs *c) ++{ ++ struct timespec64 now; ++ ++ ktime_get_coarse_real_ts64(&now); ++ return timespec_to_bch2_time(c, now); ++} ++ ++static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) 
++{ ++ return dev < c->sb.nr_devices && c->devs[dev]; ++} ++ ++#endif /* _BCACHEFS_H */ +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +new file mode 100644 +index 000000000000..d619e5caf09b +--- /dev/null ++++ b/fs/bcachefs/bcachefs_format.h +@@ -0,0 +1,1604 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FORMAT_H ++#define _BCACHEFS_FORMAT_H ++ ++/* ++ * bcachefs on disk data structures ++ * ++ * OVERVIEW: ++ * ++ * There are three main types of on disk data structures in bcachefs (this is ++ * reduced from 5 in bcache) ++ * ++ * - superblock ++ * - journal ++ * - btree ++ * ++ * The btree is the primary structure; most metadata exists as keys in the ++ * various btrees. There are only a small number of btrees, they're not ++ * sharded - we have one btree for extents, another for inodes, et cetera. ++ * ++ * SUPERBLOCK: ++ * ++ * The superblock contains the location of the journal, the list of devices in ++ * the filesystem, and in general any metadata we need in order to decide ++ * whether we can start a filesystem or prior to reading the journal/btree ++ * roots. ++ * ++ * The superblock is extensible, and most of the contents of the superblock are ++ * in variable length, type tagged fields; see struct bch_sb_field. ++ * ++ * Backup superblocks do not reside in a fixed location; also, superblocks do ++ * not have a fixed size. To locate backup superblocks we have struct ++ * bch_sb_layout; we store a copy of this inside every superblock, and also ++ * before the first superblock. ++ * ++ * JOURNAL: ++ * ++ * The journal primarily records btree updates in the order they occurred; ++ * journal replay consists of just iterating over all the keys in the open ++ * journal entries and re-inserting them into the btrees. ++ * ++ * The journal also contains entry types for the btree roots, and blacklisted ++ * journal sequence numbers (see journal_seq_blacklist.c). ++ * ++ * BTREE: ++ * ++ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically ++ * 128k-256k) and log structured. We use struct btree_node for writing the first ++ * entry in a given node (offset 0), and struct btree_node_entry for all ++ * subsequent writes. ++ * ++ * After the header, btree node entries contain a list of keys in sorted order. ++ * Values are stored inline with the keys; since values are variable length (and ++ * keys effectively are variable length too, due to packing) we can't do random ++ * access without building up additional in memory tables in the btree node read ++ * path. ++ * ++ * BTREE KEYS (struct bkey): ++ * ++ * The various btrees share a common format for the key - so as to avoid ++ * switching in fastpath lookup/comparison code - but define their own ++ * structures for the key values. ++ * ++ * The size of a key/value pair is stored as a u8 in units of u64s, so the max ++ * size is just under 2k. The common part also contains a type tag for the ++ * value, and a format field indicating whether the key is packed or not (and ++ * also meant to allow adding new key fields in the future, if desired). ++ * ++ * bkeys, when stored within a btree node, may also be packed. In that case, the ++ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can ++ * be generous with field sizes in the common part of the key format (64 bit ++ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. 
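++ *
++ * A rough illustration of the savings (hypothetical numbers): if every key in
++ * a given btree node has an inode number in [4096, 4351] and a zero snapshot
++ * and version, that node's bkey_format can record field_offset = 4096 and
++ * bits_per_field = 8 for the inode field, and 0 bits for the snapshot and
++ * version fields - so those fields cost 8 bits per packed key instead of the
++ * 64 + 32 + 96 bits they occupy in an unpacked struct bkey.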
++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define LE_BITMASK(_bits, name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (__le##_bits##_to_cpu(k->field) >> offset) & \ ++ ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ __u##_bits new = __le##_bits##_to_cpu(k->field); \ ++ \ ++ new &= ~(~(~0ULL << (end - offset)) << offset); \ ++ new |= (v & ~(~0ULL << (end - offset))) << offset; \ ++ k->field = __cpu_to_le##_bits(new); \ ++} ++ ++#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) ++#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) ++#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) ++ ++struct bkey_format { ++ __u8 key_u64s; ++ __u8 nr_fields; ++ /* One unused slot for now: */ ++ __u8 bits_per_field[6]; ++ __le64 field_offset[6]; ++}; ++ ++/* Btree keys - all units are in sectors */ ++ ++struct bpos { ++ /* ++ * Word order matches machine byte order - btree code treats a bpos as a ++ * single large integer, for search/comparison purposes ++ * ++ * Note that wherever a bpos is embedded in another on disk data ++ * structure, it has to be byte swabbed when reading in metadata that ++ * wasn't written in native endian order: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u32 snapshot; ++ __u64 offset; ++ __u64 inode; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u64 inode; ++ __u64 offset; /* Points to end of extent - sectors */ ++ __u32 snapshot; ++#else ++#error edit for your odd byteorder. ++#endif ++} __attribute__((packed, aligned(4))); ++ ++#define KEY_INODE_MAX ((__u64)~0ULL) ++#define KEY_OFFSET_MAX ((__u64)~0ULL) ++#define KEY_SNAPSHOT_MAX ((__u32)~0U) ++#define KEY_SIZE_MAX ((__u32)~0U) ++ ++static inline struct bpos POS(__u64 inode, __u64 offset) ++{ ++ struct bpos ret; ++ ++ ret.inode = inode; ++ ret.offset = offset; ++ ret.snapshot = 0; ++ ++ return ret; ++} ++ ++#define POS_MIN POS(0, 0) ++#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) ++ ++/* Empty placeholder struct, for container_of() */ ++struct bch_val { ++ __u64 __nothing[0]; ++}; ++ ++struct bversion { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u64 lo; ++ __u32 hi; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u32 hi; ++ __u64 lo; ++#endif ++} __attribute__((packed, aligned(4))); ++ ++struct bkey { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#else ++#error edit for your odd byteorder. 
++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u8 pad[1]; ++ ++ struct bversion version; ++ __u32 size; /* extent size, in sectors */ ++ struct bpos p; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ struct bpos p; ++ __u32 size; /* extent size, in sectors */ ++ struct bversion version; ++ ++ __u8 pad[1]; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bkey_packed { ++ __u64 _data[0]; ++ ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++ ++ /* ++ * XXX: next incompat on disk format change, switch format and ++ * needs_whiteout - bkey_packed() will be cheaper if format is the high ++ * bits of the bitfield ++ */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ __u8 key_start[0]; ++ ++ /* ++ * We copy bkeys with struct assignment in various places, and while ++ * that shouldn't be done with packed bkeys we can't disallow it in C, ++ * and it's legal to cast a bkey to a bkey_packed - so padding it out ++ * to the same size as struct bkey should hopefully be safest. ++ */ ++ __u8 pad[sizeof(struct bkey) - 3]; ++} __attribute__((packed, aligned(8))); ++ ++#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) ++#define BKEY_U64s_MAX U8_MAX ++#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) ++ ++#define KEY_PACKED_BITS_START 24 ++ ++#define KEY_FORMAT_LOCAL_BTREE 0 ++#define KEY_FORMAT_CURRENT 1 ++ ++enum bch_bkey_fields { ++ BKEY_FIELD_INODE, ++ BKEY_FIELD_OFFSET, ++ BKEY_FIELD_SNAPSHOT, ++ BKEY_FIELD_SIZE, ++ BKEY_FIELD_VERSION_HI, ++ BKEY_FIELD_VERSION_LO, ++ BKEY_NR_FIELDS, ++}; ++ ++#define bkey_format_field(name, field) \ ++ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) ++ ++#define BKEY_FORMAT_CURRENT \ ++((struct bkey_format) { \ ++ .key_u64s = BKEY_U64s, \ ++ .nr_fields = BKEY_NR_FIELDS, \ ++ .bits_per_field = { \ ++ bkey_format_field(INODE, p.inode), \ ++ bkey_format_field(OFFSET, p.offset), \ ++ bkey_format_field(SNAPSHOT, p.snapshot), \ ++ bkey_format_field(SIZE, size), \ ++ bkey_format_field(VERSION_HI, version.hi), \ ++ bkey_format_field(VERSION_LO, version.lo), \ ++ }, \ ++}) ++ ++/* bkey with inline value */ ++struct bkey_i { ++ __u64 _data[0]; ++ ++ union { ++ struct { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ }; ++ struct { ++ struct bkey k; ++ struct bch_val v; ++ }; ++ }; ++}; ++ ++#define KEY(_inode, _offset, _size) \ ++((struct bkey) { \ ++ .u64s = BKEY_U64s, \ ++ .format = KEY_FORMAT_CURRENT, \ ++ .p = POS(_inode, _offset), \ ++ .size = _size, \ ++}) ++ ++static inline void bkey_init(struct bkey *k) ++{ ++ *k = KEY(0, 0, 0); ++} ++ ++#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) ++ ++#define __BKEY_PADDED(key, pad) \ ++ struct { struct bkey_i key; __u64 key ## _pad[pad]; } ++ ++/* ++ * - DELETED keys are used internally to mark keys that should be ignored but ++ * override keys in composition order. Their version number is ignored. ++ * ++ * - DISCARDED keys indicate that the data is all 0s because it has been ++ * discarded. DISCARDs may have a version; if the version is nonzero the key ++ * will be persistent, otherwise the key will be dropped whenever the btree ++ * node is rewritten (like DELETED keys). 
++ * ++ * - ERROR: any read of the data returns a read error, as the data was lost due ++ * to a failing device. Like DISCARDED keys, they can be removed (overridden) ++ * by new writes or cluster-wide GC. Node repair can also overwrite them with ++ * the same or a more recent version number, but not with an older version ++ * number. ++ * ++ * - WHITEOUT: for hash table btrees ++*/ ++#define BCH_BKEY_TYPES() \ ++ x(deleted, 0) \ ++ x(discard, 1) \ ++ x(error, 2) \ ++ x(cookie, 3) \ ++ x(whiteout, 4) \ ++ x(btree_ptr, 5) \ ++ x(extent, 6) \ ++ x(reservation, 7) \ ++ x(inode, 8) \ ++ x(inode_generation, 9) \ ++ x(dirent, 10) \ ++ x(xattr, 11) \ ++ x(alloc, 12) \ ++ x(quota, 13) \ ++ x(stripe, 14) \ ++ x(reflink_p, 15) \ ++ x(reflink_v, 16) ++ ++enum bch_bkey_type { ++#define x(name, nr) KEY_TYPE_##name = nr, ++ BCH_BKEY_TYPES() ++#undef x ++ KEY_TYPE_MAX, ++}; ++ ++struct bch_cookie { ++ struct bch_val v; ++ __le64 cookie; ++}; ++ ++/* Extents */ ++ ++/* ++ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally ++ * preceded by checksum/compression information (bch_extent_crc32 or ++ * bch_extent_crc64). ++ * ++ * One major determining factor in the format of extents is how we handle and ++ * represent extents that have been partially overwritten and thus trimmed: ++ * ++ * If an extent is not checksummed or compressed, when the extent is trimmed we ++ * don't have to remember the extent we originally allocated and wrote: we can ++ * merely adjust ptr->offset to point to the start of the data that is currently ++ * live. The size field in struct bkey records the current (live) size of the ++ * extent, and is also used to mean "size of region on disk that we point to" in ++ * this case. ++ * ++ * Thus an extent that is not checksummed or compressed will consist only of a ++ * list of bch_extent_ptrs, with none of the fields in ++ * bch_extent_crc32/bch_extent_crc64. ++ * ++ * When an extent is checksummed or compressed, it's not possible to read only ++ * the data that is currently live: we have to read the entire extent that was ++ * originally written, and then return only the part of the extent that is ++ * currently live. ++ * ++ * Thus, in addition to the current size of the extent in struct bkey, we need ++ * to store the size of the originally allocated space - this is the ++ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, ++ * when the extent is trimmed, instead of modifying the offset field of the ++ * pointer, we keep a second smaller offset field - "offset into the original ++ * extent of the currently live region". ++ * ++ * The other major determining factor is replication and data migration: ++ * ++ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated ++ * write, we will initially write all the replicas in the same format, with the ++ * same checksum type and compression format - however, when copygc runs later (or ++ * tiering/cache promotion, anything that moves data), it is not in general ++ * going to rewrite all the pointers at once - one of the replicas may be in a ++ * bucket on one device that has very little fragmentation while another lives ++ * in a bucket that has become heavily fragmented, and thus is being rewritten ++ * sooner than the rest. 
++ * ++ * Thus it will only move a subset of the pointers (or in the case of ++ * tiering/cache promotion perhaps add a single pointer without dropping any ++ * current pointers), and if the extent has been partially overwritten it must ++ * write only the currently live portion (or copygc would not be able to reduce ++ * fragmentation!) - which necessitates a different bch_extent_crc format for ++ * the new pointer. ++ * ++ * But in the interests of space efficiency, we don't want to store one ++ * bch_extent_crc for each pointer if we don't have to. ++ * ++ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and ++ * bch_extent_ptrs appended arbitrarily one after the other. We determine the ++ * type of a given entry with a scheme similar to utf8 (except we're encoding a ++ * type, not a size), encoding the type in the position of the first set bit: ++ * ++ * bch_extent_crc32 - 0b1 ++ * bch_extent_ptr - 0b10 ++ * bch_extent_crc64 - 0b100 ++ * ++ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and ++ * bch_extent_crc64 is the least constrained). ++ * ++ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, ++ * until the next bch_extent_crc32/64. ++ * ++ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer ++ * is neither checksummed nor compressed. ++ */ ++ ++/* 128 bits, sufficient for cryptographic MACs: */ ++struct bch_csum { ++ __le64 lo; ++ __le64 hi; ++} __attribute__((packed, aligned(8))); ++ ++enum bch_csum_type { ++ BCH_CSUM_NONE = 0, ++ BCH_CSUM_CRC32C_NONZERO = 1, ++ BCH_CSUM_CRC64_NONZERO = 2, ++ BCH_CSUM_CHACHA20_POLY1305_80 = 3, ++ BCH_CSUM_CHACHA20_POLY1305_128 = 4, ++ BCH_CSUM_CRC32C = 5, ++ BCH_CSUM_CRC64 = 6, ++ BCH_CSUM_NR = 7, ++}; ++ ++static const unsigned bch_crc_bytes[] = { ++ [BCH_CSUM_NONE] = 0, ++ [BCH_CSUM_CRC32C_NONZERO] = 4, ++ [BCH_CSUM_CRC32C] = 4, ++ [BCH_CSUM_CRC64_NONZERO] = 8, ++ [BCH_CSUM_CRC64] = 8, ++ [BCH_CSUM_CHACHA20_POLY1305_80] = 10, ++ [BCH_CSUM_CHACHA20_POLY1305_128] = 16, ++}; ++ ++static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) ++{ ++ switch (type) { ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++enum bch_compression_type { ++ BCH_COMPRESSION_NONE = 0, ++ BCH_COMPRESSION_LZ4_OLD = 1, ++ BCH_COMPRESSION_GZIP = 2, ++ BCH_COMPRESSION_LZ4 = 3, ++ BCH_COMPRESSION_ZSTD = 4, ++ BCH_COMPRESSION_NR = 5, ++}; ++ ++#define BCH_EXTENT_ENTRY_TYPES() \ ++ x(ptr, 0) \ ++ x(crc32, 1) \ ++ x(crc64, 2) \ ++ x(crc128, 3) \ ++ x(stripe_ptr, 4) ++#define BCH_EXTENT_ENTRY_MAX 5 ++ ++enum bch_extent_entry_type { ++#define x(f, n) BCH_EXTENT_ENTRY_##f = n, ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++/* Compressed/uncompressed size are stored biased by 1: */ ++struct bch_extent_crc32 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u32 type:2, ++ _compressed_size:7, ++ _uncompressed_size:7, ++ offset:7, ++ _unused:1, ++ csum_type:4, ++ compression_type:4; ++ __u32 csum; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u32 csum; ++ __u32 compression_type:4, ++ csum_type:4, ++ _unused:1, ++ offset:7, ++ _uncompressed_size:7, ++ _compressed_size:7, ++ type:2; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++#define CRC32_SIZE_MAX (1U << 7) ++#define CRC32_NONCE_MAX 0 ++ ++struct bch_extent_crc64 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:3, ++ _compressed_size:9, ++ _uncompressed_size:9, ++ offset:9, ++ nonce:10, ++ csum_type:4, ++ 
compression_type:4, ++ csum_hi:16; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 csum_hi:16, ++ compression_type:4, ++ csum_type:4, ++ nonce:10, ++ offset:9, ++ _uncompressed_size:9, ++ _compressed_size:9, ++ type:3; ++#endif ++ __u64 csum_lo; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC64_SIZE_MAX (1U << 9) ++#define CRC64_NONCE_MAX ((1U << 10) - 1) ++ ++struct bch_extent_crc128 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:4, ++ _compressed_size:13, ++ _uncompressed_size:13, ++ offset:13, ++ nonce:13, ++ csum_type:4, ++ compression_type:4; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 compression_type:4, ++ csum_type:4, ++ nonce:13, ++ offset:13, ++ _uncompressed_size:13, ++ _compressed_size:13, ++ type:4; ++#endif ++ struct bch_csum csum; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC128_SIZE_MAX (1U << 13) ++#define CRC128_NONCE_MAX ((1U << 13) - 1) ++ ++/* ++ * @reservation - pointer hasn't been written to, just reserved ++ */ ++struct bch_extent_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:1, ++ cached:1, ++ unused:1, ++ reservation:1, ++ offset:44, /* 8 petabytes */ ++ dev:8, ++ gen:8; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 gen:8, ++ dev:8, ++ offset:44, ++ reservation:1, ++ unused:1, ++ cached:1, ++ type:1; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bch_extent_stripe_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:5, ++ block:8, ++ idx:51; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 idx:51, ++ block:8, ++ type:5; ++#endif ++}; ++ ++struct bch_extent_reservation { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:6, ++ unused:22, ++ replicas:4, ++ generation:32; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 generation:32, ++ replicas:4, ++ unused:22, ++ type:6; ++#endif ++}; ++ ++union bch_extent_entry { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 ++ unsigned long type; ++#elif __BITS_PER_LONG == 32 ++ struct { ++ unsigned long pad; ++ unsigned long type; ++ }; ++#else ++#error edit for your odd byteorder. 
++#endif ++ ++#define x(f, n) struct bch_extent_##f f; ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++struct bch_btree_ptr { ++ struct bch_val v; ++ ++ struct bch_extent_ptr start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_extent { ++ struct bch_val v; ++ ++ union bch_extent_entry start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_reservation { ++ struct bch_val v; ++ ++ __le32 generation; ++ __u8 nr_replicas; ++ __u8 pad[3]; ++} __attribute__((packed, aligned(8))); ++ ++/* Maximum size (in u64s) a single pointer could be: */ ++#define BKEY_EXTENT_PTR_U64s_MAX\ ++ ((sizeof(struct bch_extent_crc128) + \ ++ sizeof(struct bch_extent_ptr)) / sizeof(u64)) ++ ++/* Maximum possible size of an entire extent value: */ ++#define BKEY_EXTENT_VAL_U64s_MAX \ ++ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) ++ ++#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) ++ ++/* * Maximum possible size of an entire extent, key + value: */ ++#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) ++ ++/* Btree pointers don't carry around checksums: */ ++#define BKEY_BTREE_PTR_VAL_U64s_MAX \ ++ ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX) ++#define BKEY_BTREE_PTR_U64s_MAX \ ++ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) ++ ++/* Inodes */ ++ ++#define BLOCKDEV_INODE_MAX 4096 ++ ++#define BCACHEFS_ROOT_INO 4096 ++ ++struct bch_inode { ++ struct bch_val v; ++ ++ __le64 bi_hash_seed; ++ __le32 bi_flags; ++ __le16 bi_mode; ++ __u8 fields[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_inode_generation { ++ struct bch_val v; ++ ++ __le32 bi_generation; ++ __le32 pad; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_INODE_FIELDS() \ ++ x(bi_atime, 64) \ ++ x(bi_ctime, 64) \ ++ x(bi_mtime, 64) \ ++ x(bi_otime, 64) \ ++ x(bi_size, 64) \ ++ x(bi_sectors, 64) \ ++ x(bi_uid, 32) \ ++ x(bi_gid, 32) \ ++ x(bi_nlink, 32) \ ++ x(bi_generation, 32) \ ++ x(bi_dev, 32) \ ++ x(bi_data_checksum, 8) \ ++ x(bi_compression, 8) \ ++ x(bi_project, 32) \ ++ x(bi_background_compression, 8) \ ++ x(bi_data_replicas, 8) \ ++ x(bi_promote_target, 16) \ ++ x(bi_foreground_target, 16) \ ++ x(bi_background_target, 16) \ ++ x(bi_erasure_code, 16) \ ++ x(bi_fields_set, 16) ++ ++/* subset of BCH_INODE_FIELDS */ ++#define BCH_INODE_OPTS() \ ++ x(data_checksum, 8) \ ++ x(compression, 8) \ ++ x(project, 32) \ ++ x(background_compression, 8) \ ++ x(data_replicas, 8) \ ++ x(promote_target, 16) \ ++ x(foreground_target, 16) \ ++ x(background_target, 16) \ ++ x(erasure_code, 16) ++ ++enum inode_opt_id { ++#define x(name, ...) 
\ ++ Inode_opt_##name, ++ BCH_INODE_OPTS() ++#undef x ++ Inode_opt_nr, ++}; ++ ++enum { ++ /* ++ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL ++ * flags) ++ */ ++ __BCH_INODE_SYNC = 0, ++ __BCH_INODE_IMMUTABLE = 1, ++ __BCH_INODE_APPEND = 2, ++ __BCH_INODE_NODUMP = 3, ++ __BCH_INODE_NOATIME = 4, ++ ++ __BCH_INODE_I_SIZE_DIRTY= 5, ++ __BCH_INODE_I_SECTORS_DIRTY= 6, ++ __BCH_INODE_UNLINKED = 7, ++ ++ /* bits 20+ reserved for packed fields below: */ ++}; ++ ++#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) ++#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) ++#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) ++#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) ++#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) ++#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) ++#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) ++#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) ++ ++LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); ++LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); ++ ++/* Dirents */ ++ ++/* ++ * Dirents (and xattrs) have to implement string lookups; since our b-tree ++ * doesn't support arbitrary length strings for the key, we instead index by a ++ * 64 bit hash (currently truncated sha1) of the string, stored in the offset ++ * field of the key - using linear probing to resolve hash collisions. This also ++ * provides us with the readdir cookie posix requires. ++ * ++ * Linear probing requires us to use whiteouts for deletions, in the event of a ++ * collision: ++ */ ++ ++struct bch_dirent { ++ struct bch_val v; ++ ++ /* Target inode number: */ ++ __le64 d_inum; ++ ++ /* ++ * Copy of mode bits 12-15 from the target inode - so userspace can get ++ * the filetype without having to do a stat() ++ */ ++ __u8 d_type; ++ ++ __u8 d_name[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ ++ sizeof(struct bkey) - \ ++ offsetof(struct bch_dirent, d_name)) ++ ++ ++/* Xattrs */ ++ ++#define KEY_TYPE_XATTR_INDEX_USER 0 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 ++#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 ++#define KEY_TYPE_XATTR_INDEX_SECURITY 4 ++ ++struct bch_xattr { ++ struct bch_val v; ++ __u8 x_type; ++ __u8 x_name_len; ++ __le16 x_val_len; ++ __u8 x_name[]; ++} __attribute__((packed, aligned(8))); ++ ++/* Bucket/allocation information: */ ++ ++struct bch_alloc { ++ struct bch_val v; ++ __u8 fields; ++ __u8 gen; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_FIELDS() \ ++ x(read_time, 16) \ ++ x(write_time, 16) \ ++ x(data_type, 8) \ ++ x(dirty_sectors, 16) \ ++ x(cached_sectors, 16) \ ++ x(oldest_gen, 8) ++ ++enum { ++#define x(name, bytes) BCH_ALLOC_FIELD_##name, ++ BCH_ALLOC_FIELDS() ++#undef x ++ BCH_ALLOC_FIELD_NR ++}; ++ ++static const unsigned BCH_ALLOC_FIELD_BYTES[] = { ++#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, ++ BCH_ALLOC_FIELDS() ++#undef x ++}; ++ ++#define x(name, bits) + (bits / 8) ++static const unsigned BKEY_ALLOC_VAL_U64s_MAX = ++ DIV_ROUND_UP(offsetof(struct bch_alloc, data) ++ BCH_ALLOC_FIELDS(), sizeof(u64)); ++#undef x ++ ++#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) ++ ++/* Quotas: */ ++ ++enum quota_types { ++ QTYP_USR = 0, ++ QTYP_GRP = 1, ++ QTYP_PRJ = 2, ++ QTYP_NR = 3, ++}; ++ ++enum quota_counters { ++ Q_SPC = 0, ++ Q_INO = 1, ++ Q_COUNTERS = 2, ++}; ++ ++struct bch_quota_counter { ++ __le64 
hardlimit; ++ __le64 softlimit; ++}; ++ ++struct bch_quota { ++ struct bch_val v; ++ struct bch_quota_counter c[Q_COUNTERS]; ++} __attribute__((packed, aligned(8))); ++ ++/* Erasure coding */ ++ ++struct bch_stripe { ++ struct bch_val v; ++ __le16 sectors; ++ __u8 algorithm; ++ __u8 nr_blocks; ++ __u8 nr_redundant; ++ ++ __u8 csum_granularity_bits; ++ __u8 csum_type; ++ __u8 pad; ++ ++ struct bch_extent_ptr ptrs[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* Reflink: */ ++ ++struct bch_reflink_p { ++ struct bch_val v; ++ __le64 idx; ++ ++ __le32 reservation_generation; ++ __u8 nr_replicas; ++ __u8 pad[3]; ++}; ++ ++struct bch_reflink_v { ++ struct bch_val v; ++ __le64 refcount; ++ union bch_extent_entry start[0]; ++ __u64 _data[0]; ++}; ++ ++/* Optional/variable size superblock sections: */ ++ ++struct bch_sb_field { ++ __u64 _data[0]; ++ __le32 u64s; ++ __le32 type; ++}; ++ ++#define BCH_SB_FIELDS() \ ++ x(journal, 0) \ ++ x(members, 1) \ ++ x(crypt, 2) \ ++ x(replicas_v0, 3) \ ++ x(quota, 4) \ ++ x(disk_groups, 5) \ ++ x(clean, 6) \ ++ x(replicas, 7) \ ++ x(journal_seq_blacklist, 8) ++ ++enum bch_sb_field_type { ++#define x(f, nr) BCH_SB_FIELD_##f = nr, ++ BCH_SB_FIELDS() ++#undef x ++ BCH_SB_FIELD_NR ++}; ++ ++/* BCH_SB_FIELD_journal: */ ++ ++struct bch_sb_field_journal { ++ struct bch_sb_field field; ++ __le64 buckets[0]; ++}; ++ ++/* BCH_SB_FIELD_members: */ ++ ++#define BCH_MIN_NR_NBUCKETS (1 << 6) ++ ++struct bch_member { ++ uuid_le uuid; ++ __le64 nbuckets; /* device size */ ++ __le16 first_bucket; /* index of first bucket used */ ++ __le16 bucket_size; /* sectors */ ++ __le32 pad; ++ __le64 last_mount; /* time_t */ ++ ++ __le64 flags[2]; ++}; ++ ++LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) ++/* 4-10 unused, was TIER, HAS_(META)DATA */ ++LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) ++LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) ++LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) ++LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) ++LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) ++ ++#define BCH_TIER_MAX 4U ++ ++#if 0 ++LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); ++LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); ++#endif ++ ++enum bch_member_state { ++ BCH_MEMBER_STATE_RW = 0, ++ BCH_MEMBER_STATE_RO = 1, ++ BCH_MEMBER_STATE_FAILED = 2, ++ BCH_MEMBER_STATE_SPARE = 3, ++ BCH_MEMBER_STATE_NR = 4, ++}; ++ ++enum cache_replacement { ++ CACHE_REPLACEMENT_LRU = 0, ++ CACHE_REPLACEMENT_FIFO = 1, ++ CACHE_REPLACEMENT_RANDOM = 2, ++ CACHE_REPLACEMENT_NR = 3, ++}; ++ ++struct bch_sb_field_members { ++ struct bch_sb_field field; ++ struct bch_member members[0]; ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++struct nonce { ++ __le32 d[4]; ++}; ++ ++struct bch_key { ++ __le64 key[4]; ++}; ++ ++#define BCH_KEY_MAGIC \ ++ (((u64) 'b' << 0)|((u64) 'c' << 8)| \ ++ ((u64) 'h' << 16)|((u64) '*' << 24)| \ ++ ((u64) '*' << 32)|((u64) 'k' << 40)| \ ++ ((u64) 'e' << 48)|((u64) 'y' << 56)) ++ ++struct bch_encrypted_key { ++ __le64 magic; ++ struct bch_key key; ++}; ++ ++/* ++ * If this field is present in the superblock, it stores an encryption key which ++ * is used encrypt all other data/metadata. 
The key will normally be encrypted ++ * with the key userspace provides, but if encryption has been turned off we'll ++ * just store the master key unencrypted in the superblock so we can access the ++ * previously encrypted data. ++ */ ++struct bch_sb_field_crypt { ++ struct bch_sb_field field; ++ ++ __le64 flags; ++ __le64 kdf_flags; ++ struct bch_encrypted_key key; ++}; ++ ++LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); ++ ++enum bch_kdf_types { ++ BCH_KDF_SCRYPT = 0, ++ BCH_KDF_NR = 1, ++}; ++ ++/* stored as base 2 log of scrypt params: */ ++LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); ++LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); ++LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); ++ ++/* BCH_SB_FIELD_replicas: */ ++ ++enum bch_data_type { ++ BCH_DATA_NONE = 0, ++ BCH_DATA_SB = 1, ++ BCH_DATA_JOURNAL = 2, ++ BCH_DATA_BTREE = 3, ++ BCH_DATA_USER = 4, ++ BCH_DATA_CACHED = 5, ++ BCH_DATA_NR = 6, ++}; ++ ++struct bch_replicas_entry_v0 { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 devs[0]; ++} __attribute__((packed)); ++ ++struct bch_sb_field_replicas_v0 { ++ struct bch_sb_field field; ++ struct bch_replicas_entry_v0 entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_entry { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 nr_required; ++ __u8 devs[0]; ++} __attribute__((packed)); ++ ++struct bch_sb_field_replicas { ++ struct bch_sb_field field; ++ struct bch_replicas_entry entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_quota: */ ++ ++struct bch_sb_quota_counter { ++ __le32 timelimit; ++ __le32 warnlimit; ++}; ++ ++struct bch_sb_quota_type { ++ __le64 flags; ++ struct bch_sb_quota_counter c[Q_COUNTERS]; ++}; ++ ++struct bch_sb_field_quota { ++ struct bch_sb_field field; ++ struct bch_sb_quota_type q[QTYP_NR]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_disk_groups: */ ++ ++#define BCH_SB_LABEL_SIZE 32 ++ ++struct bch_disk_group { ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 flags[2]; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) ++LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) ++LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) ++ ++struct bch_sb_field_disk_groups { ++ struct bch_sb_field field; ++ struct bch_disk_group entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* ++ * On clean shutdown, store btree roots and current journal sequence number in ++ * the superblock: ++ */ ++struct jset_entry { ++ __le16 u64s; ++ __u8 btree_id; ++ __u8 level; ++ __u8 type; /* designates what this jset holds */ ++ __u8 pad[3]; ++ ++ union { ++ struct bkey_i start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct bch_sb_field_clean { ++ struct bch_sb_field field; ++ ++ __le32 flags; ++ __le16 read_clock; ++ __le16 write_clock; ++ __le64 journal_seq; ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct journal_seq_blacklist_entry { ++ __le64 start; ++ __le64 end; ++}; ++ ++struct bch_sb_field_journal_seq_blacklist { ++ struct bch_sb_field field; ++ ++ union { ++ struct journal_seq_blacklist_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++/* Superblock: */ ++ ++/* ++ * New versioning scheme: ++ * One common version number for all on disk data structures - superblock, btree ++ * nodes, journal entries ++ */ ++#define BCH_JSET_VERSION_OLD 2 ++#define 
BCH_BSET_VERSION_OLD 3 ++ ++enum bcachefs_metadata_version { ++ bcachefs_metadata_version_min = 9, ++ bcachefs_metadata_version_new_versioning = 10, ++ bcachefs_metadata_version_bkey_renumber = 10, ++ bcachefs_metadata_version_max = 11, ++}; ++ ++#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) ++ ++#define BCH_SB_SECTOR 8 ++#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ ++ ++struct bch_sb_layout { ++ uuid_le magic; /* bcachefs superblock UUID */ ++ __u8 layout_type; ++ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ ++ __u8 nr_superblocks; ++ __u8 pad[5]; ++ __le64 sb_offset[61]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_SB_LAYOUT_SECTOR 7 ++ ++/* ++ * @offset - sector where this sb was written ++ * @version - on disk format version ++ * @version_min - Oldest metadata version this filesystem contains; so we can ++ * safely drop compatibility code and refuse to mount filesystems ++ * we'd need it for ++ * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) ++ * @seq - incremented each time superblock is written ++ * @uuid - used for generating various magic numbers and identifying ++ * member devices, never changes ++ * @user_uuid - user visible UUID, may be changed ++ * @label - filesystem label ++ * @seq - identifies most recent superblock, incremented each time ++ * superblock is written ++ * @features - enabled incompatible features ++ */ ++struct bch_sb { ++ struct bch_csum csum; ++ __le16 version; ++ __le16 version_min; ++ __le16 pad[2]; ++ uuid_le magic; ++ uuid_le uuid; ++ uuid_le user_uuid; ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 offset; ++ __le64 seq; ++ ++ __le16 block_size; ++ __u8 dev_idx; ++ __u8 nr_devices; ++ __le32 u64s; ++ ++ __le64 time_base_lo; ++ __le32 time_base_hi; ++ __le32 time_precision; ++ ++ __le64 flags[8]; ++ __le64 features[2]; ++ __le64 compat[2]; ++ ++ struct bch_sb_layout layout; ++ ++ union { ++ struct bch_sb_field start[0]; ++ __le64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++/* ++ * Flags: ++ * BCH_SB_INITALIZED - set on first mount ++ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect ++ * behaviour of mount/recovery path: ++ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits ++ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 ++ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides ++ * DATA/META_CSUM_TYPE. 
Also indicates encryption ++ * algorithm in use, if/when we get more than one ++ */ ++ ++LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); ++ ++LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); ++LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); ++LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); ++LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); ++ ++LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); ++ ++LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); ++LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); ++ ++LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); ++LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); ++ ++LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); ++LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); ++LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); ++LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); ++ ++LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); ++ ++/* 61-64 unused */ ++ ++LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); ++LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); ++LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); ++ ++LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); ++LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); ++ ++/* ++ * Max size of an extent that may require bouncing to read or write ++ * (checksummed, compressed): 64k ++ */ ++LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, ++ struct bch_sb, flags[1], 14, 20); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); ++ ++LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); ++LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); ++LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); ++ ++LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, ++ struct bch_sb, flags[2], 0, 4); ++LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); ++ ++LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); ++ ++/* Features: */ ++enum bch_sb_features { ++ BCH_FEATURE_LZ4 = 0, ++ BCH_FEATURE_GZIP = 1, ++ BCH_FEATURE_ZSTD = 2, ++ BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ ++ BCH_FEATURE_EC = 4, ++ BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, ++ BCH_FEATURE_REFLINK = 6, ++ BCH_FEATURE_NEW_SIPHASH = 7, ++ BCH_FEATURE_NR, ++}; ++ ++enum bch_sb_compat { ++ BCH_COMPAT_FEAT_ALLOC_INFO = 0, ++ BCH_COMPAT_FEAT_ALLOC_METADATA = 1, ++}; ++ ++/* options: */ ++ ++#define BCH_REPLICAS_MAX 4U ++ ++enum bch_error_actions { ++ BCH_ON_ERROR_CONTINUE = 0, ++ BCH_ON_ERROR_RO = 1, ++ BCH_ON_ERROR_PANIC = 2, ++ BCH_NR_ERROR_ACTIONS = 3, ++}; ++ ++enum bch_csum_opts { ++ BCH_CSUM_OPT_NONE = 0, ++ BCH_CSUM_OPT_CRC32C = 1, ++ BCH_CSUM_OPT_CRC64 = 2, ++ BCH_CSUM_OPT_NR = 3, ++}; ++ ++enum bch_str_hash_type { ++ BCH_STR_HASH_CRC32C = 0, ++ BCH_STR_HASH_CRC64 = 1, ++ BCH_STR_HASH_SIPHASH_OLD = 2, ++ BCH_STR_HASH_SIPHASH = 3, ++ BCH_STR_HASH_NR = 4, ++}; ++ ++enum bch_str_hash_opts { ++ BCH_STR_HASH_OPT_CRC32C = 0, ++ BCH_STR_HASH_OPT_CRC64 = 1, ++ BCH_STR_HASH_OPT_SIPHASH = 
2, ++ BCH_STR_HASH_OPT_NR = 3, ++}; ++ ++#define BCH_COMPRESSION_TYPES() \ ++ x(NONE) \ ++ x(LZ4) \ ++ x(GZIP) \ ++ x(ZSTD) ++ ++enum bch_compression_opts { ++#define x(t) BCH_COMPRESSION_OPT_##t, ++ BCH_COMPRESSION_TYPES() ++#undef x ++ BCH_COMPRESSION_OPT_NR ++}; ++ ++/* ++ * Magic numbers ++ * ++ * The various other data structures have their own magic numbers, which are ++ * xored with the first part of the cache set's UUID ++ */ ++ ++#define BCACHE_MAGIC \ ++ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ ++ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) ++ ++#define BCACHEFS_STATFS_MAGIC 0xca451a4e ++ ++#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) ++#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) ++ ++static inline __le64 __bch2_sb_magic(struct bch_sb *sb) ++{ ++ __le64 ret; ++ memcpy(&ret, &sb->uuid, sizeof(ret)); ++ return ret; ++} ++ ++static inline __u64 __jset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); ++} ++ ++static inline __u64 __bset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); ++} ++ ++/* Journal */ ++ ++#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) ++ ++#define BCH_JSET_ENTRY_TYPES() \ ++ x(btree_keys, 0) \ ++ x(btree_root, 1) \ ++ x(prio_ptrs, 2) \ ++ x(blacklist, 3) \ ++ x(blacklist_v2, 4) \ ++ x(usage, 5) \ ++ x(data_usage, 6) ++ ++enum { ++#define x(f, nr) BCH_JSET_ENTRY_##f = nr, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++ BCH_JSET_ENTRY_NR ++}; ++ ++/* ++ * Journal sequence numbers can be blacklisted: bsets record the max sequence ++ * number of all the journal entries they contain updates for, so that on ++ * recovery we can ignore those bsets that contain index updates newer that what ++ * made it into the journal. ++ * ++ * This means that we can't reuse that journal_seq - we have to skip it, and ++ * then record that we skipped it so that the next time we crash and recover we ++ * don't think there was a missing journal entry. ++ */ ++struct jset_entry_blacklist { ++ struct jset_entry entry; ++ __le64 seq; ++}; ++ ++struct jset_entry_blacklist_v2 { ++ struct jset_entry entry; ++ __le64 start; ++ __le64 end; ++}; ++ ++enum { ++ FS_USAGE_RESERVED = 0, ++ FS_USAGE_INODES = 1, ++ FS_USAGE_KEY_VERSION = 2, ++ FS_USAGE_NR = 3 ++}; ++ ++struct jset_entry_usage { ++ struct jset_entry entry; ++ __le64 v; ++} __attribute__((packed)); ++ ++struct jset_entry_data_usage { ++ struct jset_entry entry; ++ __le64 v; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++/* ++ * On disk format for a journal entry: ++ * seq is monotonically increasing; every journal entry has its own unique ++ * sequence number. ++ * ++ * last_seq is the oldest journal entry that still has keys the btree hasn't ++ * flushed to disk yet. ++ * ++ * version is for on disk format changes. 
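++ *
++ * Illustrative example (made-up numbers): if the newest entry has seq 150 and
++ * records last_seq 120, every key journalled in entries with seq < 120 has
++ * already been flushed to the btree - recovery only needs to replay entries
++ * 120..150, and the journal space holding the older entries can be reclaimed.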
++ */ ++struct jset { ++ struct bch_csum csum; ++ ++ __le64 magic; ++ __le64 seq; ++ __le32 version; ++ __le32 flags; ++ ++ __le32 u64s; /* size of d[] in u64s */ ++ ++ __u8 encrypted_start[0]; ++ ++ __le16 read_clock; ++ __le16 write_clock; ++ ++ /* Sequence number of oldest dirty journal entry */ ++ __le64 last_seq; ++ ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); ++LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); ++ ++#define BCH_JOURNAL_BUCKETS_MIN 8 ++ ++/* Btree: */ ++ ++#define BCH_BTREE_IDS() \ ++ x(EXTENTS, 0, "extents") \ ++ x(INODES, 1, "inodes") \ ++ x(DIRENTS, 2, "dirents") \ ++ x(XATTRS, 3, "xattrs") \ ++ x(ALLOC, 4, "alloc") \ ++ x(QUOTAS, 5, "quotas") \ ++ x(EC, 6, "stripes") \ ++ x(REFLINK, 7, "reflink") ++ ++enum btree_id { ++#define x(kwd, val, name) BTREE_ID_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BTREE_ID_NR ++}; ++ ++#define BTREE_MAX_DEPTH 4U ++ ++/* Btree nodes */ ++ ++/* ++ * Btree nodes ++ * ++ * On disk a btree node is a list/log of these; within each set the keys are ++ * sorted ++ */ ++struct bset { ++ __le64 seq; ++ ++ /* ++ * Highest journal entry this bset contains keys for. ++ * If on recovery we don't see that journal entry, this bset is ignored: ++ * this allows us to preserve the order of all index updates after a ++ * crash, since the journal records a total order of all index updates ++ * and anything that didn't make it to the journal doesn't get used. ++ */ ++ __le64 journal_seq; ++ ++ __le32 flags; ++ __le16 version; ++ __le16 u64s; /* count of d[] in u64s */ ++ ++ union { ++ struct bkey_packed start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); ++ ++LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); ++LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, ++ struct bset, flags, 5, 6); ++ ++struct btree_node { ++ struct bch_csum csum; ++ __le64 magic; ++ ++ /* this flags field is encrypted, unlike bset->flags: */ ++ __le64 flags; ++ ++ /* Closed interval: */ ++ struct bpos min_key; ++ struct bpos max_key; ++ struct bch_extent_ptr ptr; ++ struct bkey_format format; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); ++LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); ++/* 8-32 unused */ ++LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); ++ ++struct btree_node_entry { ++ struct bch_csum csum; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++#endif /* _BCACHEFS_FORMAT_H */ +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +new file mode 100644 +index 000000000000..d668ede5491a +--- /dev/null ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -0,0 +1,314 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IOCTL_H ++#define _BCACHEFS_IOCTL_H ++ ++#include ++#include ++#include "bcachefs_format.h" ++ ++/* ++ * Flags common to multiple ioctls: ++ */ ++#define BCH_FORCE_IF_DATA_LOST (1 << 0) ++#define BCH_FORCE_IF_METADATA_LOST (1 << 1) ++#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) ++#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) ++ ++#define BCH_FORCE_IF_DEGRADED \ ++ (BCH_FORCE_IF_DATA_DEGRADED| \ ++ 
BCH_FORCE_IF_METADATA_DEGRADED) ++ ++/* ++ * If cleared, ioctl that refer to a device pass it as a pointer to a pathname ++ * (e.g. /dev/sda1); if set, the dev field is the device's index within the ++ * filesystem: ++ */ ++#define BCH_BY_INDEX (1 << 4) ++ ++/* ++ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem ++ * wide superblock: ++ */ ++#define BCH_READ_DEV (1 << 5) ++ ++/* global control dev: */ ++ ++/* These are currently broken, and probably unnecessary: */ ++#if 0 ++#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) ++#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) ++ ++struct bch_ioctl_assemble { ++ __u32 flags; ++ __u32 nr_devs; ++ __u64 pad; ++ __u64 devs[]; ++}; ++ ++struct bch_ioctl_incremental { ++ __u32 flags; ++ __u64 pad; ++ __u64 dev; ++}; ++#endif ++ ++/* filesystem ioctls: */ ++ ++#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) ++ ++/* These only make sense when we also have incremental assembly */ ++#if 0 ++#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) ++#define BCH_IOCTL_STOP _IO(0xbc, 3) ++#endif ++ ++#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) ++#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) ++#define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage) ++#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) ++#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) ++#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) ++ ++/* ioctl below act on a particular file, not the filesystem as a whole: */ ++ ++#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) ++ ++/* ++ * BCH_IOCTL_QUERY_UUID: get filesystem UUID ++ * ++ * Returns user visible UUID, not internal UUID (which may not ever be changed); ++ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with ++ * this UUID. ++ */ ++struct bch_ioctl_query_uuid { ++ uuid_le uuid; ++}; ++ ++#if 0 ++struct bch_ioctl_start { ++ __u32 flags; ++ __u32 pad; ++}; ++#endif ++ ++/* ++ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem ++ * ++ * The specified device must not be open or in use. On success, the new device ++ * will be an online member of the filesystem just like any other member. ++ * ++ * The device must first be prepared by userspace by formatting with a bcachefs ++ * superblock, which is only used for passing in superblock options/parameters ++ * for that device (in struct bch_member). The new device's superblock should ++ * not claim to be a member of any existing filesystem - UUIDs on it will be ++ * ignored. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem ++ * ++ * Any data present on @dev will be permanently deleted, and @dev will be ++ * removed from its slot in the filesystem's list of member devices. The device ++ * may be either offline or offline. ++ * ++ * Will fail removing @dev would leave us with insufficient read write devices ++ * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are ++ * set. 
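++ *
++ * Minimal userspace sketch (illustrative only; assumes fs_fd is an open file
++ * descriptor on the filesystem, and error handling is omitted). With
++ * BCH_BY_INDEX set, .dev is the device's index rather than a path pointer:
++ *
++ *	struct bch_ioctl_disk i = {
++ *		.flags	= BCH_BY_INDEX|BCH_FORCE_IF_DEGRADED,
++ *		.dev	= 2,
++ *	};
++ *	ioctl(fs_fd, BCH_IOCTL_DISK_REMOVE, &i);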
++ */ ++ ++/* ++ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem ++ * but is not open (e.g. because we started in degraded mode), bring it online ++ * ++ * all existing data on @dev will be available once the device is online, ++ * exactly as if @dev was present when the filesystem was first mounted ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that ++ * block device, without removing it from the filesystem (so it can be brought ++ * back online later) ++ * ++ * Data present on @dev will be unavailable while @dev is offline (unless ++ * replicated), but will still be intact and untouched if @dev is brought back ++ * online ++ * ++ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would ++ * leave us with insufficient read write devices or degraded/unavailable data, ++ * unless the approprate BCH_FORCE_IF_* flags are set. ++ */ ++ ++struct bch_ioctl_disk { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem ++ * ++ * @new_state - one of the bch_member_state states (rw, ro, failed, ++ * spare) ++ * ++ * Will refuse to change member state if we would then have insufficient devices ++ * to write to, or if it would result in degraded data (when @new_state is ++ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. ++ */ ++struct bch_ioctl_disk_set_state { ++ __u32 flags; ++ __u8 new_state; ++ __u8 pad[3]; ++ __u64 dev; ++}; ++ ++enum bch_data_ops { ++ BCH_DATA_OP_SCRUB = 0, ++ BCH_DATA_OP_REREPLICATE = 1, ++ BCH_DATA_OP_MIGRATE = 2, ++ BCH_DATA_OP_NR = 3, ++}; ++ ++/* ++ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. ++ * scrub, rereplicate, migrate). ++ * ++ * This ioctl kicks off a job in the background, and returns a file descriptor. ++ * Reading from the file descriptor returns a struct bch_ioctl_data_event, ++ * indicating current progress, and closing the file descriptor will stop the ++ * job. The file descriptor is O_CLOEXEC. 
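++ *
++ * Rough userspace sketch of that workflow (illustrative only; fs_fd is assumed
++ * to be an open file descriptor on the filesystem, error handling omitted):
++ *
++ *	struct bch_ioctl_data d = {
++ *		.op	= BCH_DATA_OP_REREPLICATE,
++ *		.start	= POS_MIN,
++ *		.end	= POS_MAX,
++ *	};
++ *	int job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &d);
++ *	struct bch_ioctl_data_event e;
++ *
++ *	while (read(job_fd, &e, sizeof(e)) == sizeof(e))
++ *		;	// e.p.sectors_done / e.p.sectors_total = progress
++ *	close(job_fd);	// stops the background job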
++ */ ++struct bch_ioctl_data { ++ __u32 op; ++ __u32 flags; ++ ++ struct bpos start; ++ struct bpos end; ++ ++ union { ++ struct { ++ __u32 dev; ++ __u32 pad; ++ } migrate; ++ struct { ++ __u64 pad[8]; ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++enum bch_data_event { ++ BCH_DATA_EVENT_PROGRESS = 0, ++ /* XXX: add an event for reporting errors */ ++ BCH_DATA_EVENT_NR = 1, ++}; ++ ++struct bch_ioctl_data_progress { ++ __u8 data_type; ++ __u8 btree_id; ++ __u8 pad[2]; ++ struct bpos pos; ++ ++ __u64 sectors_done; ++ __u64 sectors_total; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_ioctl_data_event { ++ __u8 type; ++ __u8 pad[7]; ++ union { ++ struct bch_ioctl_data_progress p; ++ __u64 pad2[15]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_ioctl_dev_usage { ++ __u8 state; ++ __u8 alive; ++ __u8 pad[6]; ++ __u32 dev; ++ ++ __u32 bucket_size; ++ __u64 nr_buckets; ++ ++ __u64 buckets[BCH_DATA_NR]; ++ __u64 sectors[BCH_DATA_NR]; ++}; ++ ++struct bch_ioctl_fs_usage { ++ __u64 capacity; ++ __u64 used; ++ __u64 online_reserved; ++ __u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ __u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX]; ++}; ++ ++/* ++ * BCH_IOCTL_USAGE: query filesystem disk space usage ++ * ++ * Returns disk space usage broken out by data type, number of replicas, and ++ * by component device ++ * ++ * @nr_devices - number of devices userspace allocated space for in @devs ++ * ++ * On success, @fs and @devs will be filled out appropriately and devs[i].alive ++ * will indicate if a device was present in that slot ++ * ++ * Returns -ERANGE if @nr_devices was too small ++ */ ++struct bch_ioctl_usage { ++ __u16 nr_devices; ++ __u16 pad[3]; ++ ++ struct bch_ioctl_fs_usage fs; ++ struct bch_ioctl_dev_usage devs[0]; ++}; ++ ++/* ++ * BCH_IOCTL_READ_SUPER: read filesystem superblock ++ * ++ * Equivalent to reading the superblock directly from the block device, except ++ * avoids racing with the kernel writing the superblock or having to figure out ++ * which block device to read ++ * ++ * @sb - buffer to read into ++ * @size - size of userspace allocated buffer ++ * @dev - device to read superblock for, if BCH_READ_DEV flag is ++ * specified ++ * ++ * Returns -ERANGE if buffer provided is too small ++ */ ++struct bch_ioctl_read_super { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 size; ++ __u64 sb; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to ++ * determine if disk is a (online) member - if so, returns device's index ++ * ++ * Returns -ENOENT if not found ++ */ ++struct bch_ioctl_disk_get_idx { ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device ++ * ++ * @dev - member to resize ++ * @nbuckets - new number of buckets ++ */ ++struct bch_ioctl_disk_resize { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 nbuckets; ++}; ++ ++#endif /* _BCACHEFS_IOCTL_H */ +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +new file mode 100644 +index 000000000000..ed7ca5b0636d +--- /dev/null ++++ b/fs/bcachefs/bkey.c +@@ -0,0 +1,1160 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "bset.h" ++#include "util.h" ++ ++#undef EBUG_ON ++ ++#ifdef DEBUG_BKEYS ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) ++#endif ++ ++const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ 
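++/*
++ * Illustrative sketch, not used elsewhere: the pack/unpack helpers below walk
++ * a packed key one word at a time, subtracting/adding each field's
++ * field_offset and storing it in bits_per_field bits. Assuming bkey.h
++ * declares bch2_bkey_pack_key() (defined further down in this file), a round
++ * trip looks like:
++ */
++static inline bool bkey_pack_roundtrip_example(const struct bkey_format *f,
++					       const struct bkey *in)
++{
++	struct bkey_packed packed;
++	struct bkey unpacked;
++
++	if (!bch2_bkey_pack_key(&packed, in, f))
++		return false;	/* in's fields don't fit in this format */
++
++	unpacked = __bch2_bkey_unpack_key(f, &packed);
++	return !memcmp(&unpacked, in, sizeof(*in));
++}
++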
++void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) ++{ ++ unsigned bit = high_bit_offset, done = 0; ++ ++ while (1) { ++ while (bit < 64) { ++ if (done && !(done % 8)) ++ *out++ = ' '; ++ *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0'; ++ bit++; ++ done++; ++ if (done == nr_bits) { ++ *out++ = '\0'; ++ return; ++ } ++ } ++ ++ p = next_word(p); ++ bit = 0; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) ++{ ++ struct bkey tmp; ++ ++ BUG_ON(bkeyp_val_u64s(format, packed) != ++ bkey_val_u64s(unpacked)); ++ ++ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); ++ ++ tmp = __bch2_bkey_unpack_key(format, packed); ++ ++ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { ++ char buf1[160], buf2[160]; ++ char buf3[160], buf4[160]; ++ ++ bch2_bkey_to_text(&PBUF(buf1), unpacked); ++ bch2_bkey_to_text(&PBUF(buf2), &tmp); ++ bch2_to_binary(buf3, (void *) unpacked, 80); ++ bch2_to_binary(buf4, high_word(format, packed), 80); ++ ++ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", ++ format->key_u64s, ++ format->bits_per_field[0], ++ format->bits_per_field[1], ++ format->bits_per_field[2], ++ format->bits_per_field[3], ++ format->bits_per_field[4], ++ buf1, buf2, buf3, buf4); ++ } ++} ++ ++#else ++static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) {} ++#endif ++ ++struct pack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct pack_state pack_state_init(const struct bkey_format *format, ++ struct bkey_packed *k) ++{ ++ u64 *p = high_word(format, k); ++ ++ return (struct pack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = 0, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static void pack_state_finish(struct pack_state *state, ++ struct bkey_packed *k) ++{ ++ EBUG_ON(state->p < k->_data); ++ EBUG_ON(state->p >= k->_data + state->format->key_u64s); ++ ++ *state->p = state->w; ++} ++ ++struct unpack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ const u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct unpack_state unpack_state_init(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(format, k); ++ ++ return (struct unpack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = *p << high_bit_offset, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static u64 get_inc_field(struct unpack_state *state, unsigned field) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (bits >= state->bits) { ++ v = state->w >> (64 - bits); ++ bits -= state->bits; ++ ++ state->p = next_word(state->p); ++ state->w = *state->p; ++ state->bits = 64; ++ } ++ ++ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ ++ v |= (state->w >> 1) >> (63 - bits); ++ state->w <<= bits; ++ state->bits -= bits; ++ ++ return v + offset; ++} ++ ++__always_inline ++static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = 
le64_to_cpu(state->format->field_offset[field]); ++ ++ if (v < offset) ++ return false; ++ ++ v -= offset; ++ ++ if (fls64(v) > bits) ++ return false; ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return true; ++} ++ ++/* ++ * Note: does NOT set out->format (we don't know what it should be here!) ++ * ++ * Also: doesn't work on extents - it doesn't preserve the invariant that ++ * if k is packed bkey_start_pos(k) will successfully pack ++ */ ++static bool bch2_bkey_transform_key(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ struct pack_state out_s = pack_state_init(out_f, out); ++ struct unpack_state in_s = unpack_state_init(in_f, in); ++ unsigned i; ++ ++ out->_data[0] = 0; ++ ++ for (i = 0; i < BKEY_NR_FIELDS; i++) ++ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) ++ return false; ++ ++ /* Can't happen because the val would be too big to unpack: */ ++ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); ++ ++ pack_state_finish(&out_s, out); ++ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ return true; ++} ++ ++bool bch2_bkey_transform(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ if (!bch2_bkey_transform_key(out_f, out, in_f, in)) ++ return false; ++ ++ memcpy_u64s((u64 *) out + out_f->key_u64s, ++ (u64 *) in + in_f->key_u64s, ++ (in->u64s - in_f->key_u64s)); ++ return true; ++} ++ ++#define bkey_fields() \ ++ x(BKEY_FIELD_INODE, p.inode) \ ++ x(BKEY_FIELD_OFFSET, p.offset) \ ++ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ ++ x(BKEY_FIELD_SIZE, size) \ ++ x(BKEY_FIELD_VERSION_HI, version.hi) \ ++ x(BKEY_FIELD_VERSION_LO, version.lo) ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bkey out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); ++ ++ out.u64s = BKEY_U64s + in->u64s - format->key_u64s; ++ out.format = KEY_FORMAT_CURRENT; ++ out.needs_whiteout = in->needs_whiteout; ++ out.type = in->type; ++ out.pad[0] = 0; ++ ++#define x(id, field) out.field = get_inc_field(&state, id); ++ bkey_fields() ++#undef x ++ ++ return out; ++} ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bpos out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ ++ out.inode = get_inc_field(&state, BKEY_FIELD_INODE); ++ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); ++ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); ++ ++ return out; ++} ++#endif ++ ++/** ++ * bch2_bkey_pack_key -- pack just the key, not the value ++ */ ++bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, ++ 
const struct bkey_format *format) ++{ ++ struct pack_state state = pack_state_init(format, out); ++ ++ EBUG_ON((void *) in == (void *) out); ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->format != KEY_FORMAT_CURRENT); ++ ++ out->_data[0] = 0; ++ ++#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; ++ bkey_fields() ++#undef x ++ ++ /* ++ * Extents - we have to guarantee that if an extent is packed, a trimmed ++ * version will also pack: ++ */ ++ if (bkey_start_offset(in) < ++ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) ++ return false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = format->key_u64s + in->u64s - BKEY_U64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ bch2_bkey_pack_verify(out, in, format); ++ return true; ++} ++ ++/** ++ * bch2_bkey_unpack -- unpack the key and the value ++ */ ++void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, ++ const struct bkey_packed *src) ++{ ++ __bkey_unpack_key(b, &dst->k, src); ++ ++ memcpy_u64s(&dst->v, ++ bkeyp_val(&b->format, src), ++ bkeyp_val_u64s(&b->format, src)); ++} ++ ++/** ++ * bch2_bkey_pack -- pack the key and the value ++ */ ++bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, ++ const struct bkey_format *format) ++{ ++ struct bkey_packed tmp; ++ ++ if (!bch2_bkey_pack_key(&tmp, &in->k, format)) ++ return false; ++ ++ memmove_u64s((u64 *) out + format->key_u64s, ++ &in->v, ++ bkey_val_u64s(&in->k)); ++ memcpy_u64s(out, &tmp, format->key_u64s); ++ ++ return true; ++} ++ ++__always_inline ++static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ bool ret = true; ++ ++ EBUG_ON(v < offset); ++ v -= offset; ++ ++ if (fls64(v) > bits) { ++ v = ~(~0ULL << bits); ++ ret = false; ++ } ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static bool bkey_packed_successor(struct bkey_packed *out, ++ const struct btree *b, ++ struct bkey_packed k) ++{ ++ const struct bkey_format *f = &b->format; ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned first_bit, offset; ++ u64 *p; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ if (!nr_key_bits) ++ return false; ++ ++ *out = k; ++ ++ first_bit = high_bit_offset + nr_key_bits - 1; ++ p = nth_word(high_word(f, out), first_bit >> 6); ++ offset = 63 - (first_bit & 63); ++ ++ while (nr_key_bits) { ++ unsigned bits = min(64 - offset, nr_key_bits); ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if ((*p & mask) != mask) { ++ *p += 1ULL << offset; ++ EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); ++ return true; ++ } ++ ++ *p &= ~mask; ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ offset = 0; ++ } ++ ++ return false; ++} ++#endif ++ ++/* ++ * Returns a packed key that compares <= in ++ * ++ * This is used in bset_search_tree(), where we need a packed pos in order to be ++ * able to compare against the keys in the auxiliary search tree - and it's ++ * legal to use a packed pos that isn't equivalent to the original pos, ++ * _provided_ it compares <= to the original pos. 
++ */ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, ++ struct bpos in, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct pack_state state = pack_state_init(f, out); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos orig = in; ++#endif ++ bool exact = true; ++ ++ out->_data[0] = 0; ++ ++ if (unlikely(in.snapshot < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { ++ if (!in.offset-- && ++ !in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.offset < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { ++ if (!in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.inode < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) ++ return BKEY_PACK_POS_FAIL; ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) ++ exact = false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = f->key_u64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->type = KEY_TYPE_deleted; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ if (exact) { ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); ++ } else { ++ struct bkey_packed successor; ++ ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); ++ BUG_ON(bkey_packed_successor(&successor, b, *out) && ++ bkey_cmp_left_packed(b, &successor, &orig) < 0); ++ } ++#endif ++ ++ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; ++} ++ ++void bch2_bkey_format_init(struct bkey_format_state *s) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) ++ s->field_min[i] = U64_MAX; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_max); i++) ++ s->field_max[i] = 0; ++ ++ /* Make sure we can store a size of 0: */ ++ s->field_min[BKEY_FIELD_SIZE] = 0; ++} ++ ++static void __bkey_format_add(struct bkey_format_state *s, ++ unsigned field, u64 v) ++{ ++ s->field_min[field] = min(s->field_min[field], v); ++ s->field_max[field] = max(s->field_max[field], v); ++} ++ ++/* ++ * Changes @format so that @k can be successfully packed with @format ++ */ ++void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) ++{ ++#define x(id, field) __bkey_format_add(s, id, k->field); ++ bkey_fields() ++#undef x ++ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); ++} ++ ++void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) ++{ ++ unsigned field = 0; ++ ++ __bkey_format_add(s, field++, p.inode); ++ __bkey_format_add(s, field++, p.offset); ++ __bkey_format_add(s, field++, p.snapshot); ++} ++ ++/* ++ * We don't want it to be possible for the packed format to represent fields ++ * bigger than a u64... that will cause confusion and issues (like with ++ * bkey_packed_successor()) ++ */ ++static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, ++ unsigned bits, u64 offset) ++{ ++ offset = bits == 64 ? 
0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); ++ ++ f->bits_per_field[i] = bits; ++ f->field_offset[i] = cpu_to_le64(offset); ++} ++ ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ struct bkey_format ret = { ++ .nr_fields = BKEY_NR_FIELDS, ++ }; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { ++ s->field_min[i] = min(s->field_min[i], s->field_max[i]); ++ ++ set_format_field(&ret, i, ++ fls64(s->field_max[i] - s->field_min[i]), ++ s->field_min[i]); ++ ++ bits += ret.bits_per_field[i]; ++ } ++ ++ /* allow for extent merging: */ ++ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { ++ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; ++ bits += 4; ++ } ++ ++ ret.key_u64s = DIV_ROUND_UP(bits, 64); ++ ++ /* if we have enough spare bits, round fields up to nearest byte */ ++ bits = ret.key_u64s * 64 - bits; ++ ++ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { ++ unsigned r = round_up(ret.bits_per_field[i], 8) - ++ ret.bits_per_field[i]; ++ ++ if (r <= bits) { ++ set_format_field(&ret, i, ++ ret.bits_per_field[i] + r, ++ le64_to_cpu(ret.field_offset[i])); ++ bits -= r; ++ } ++ } ++ ++ EBUG_ON(bch2_bkey_format_validate(&ret)); ++ return ret; ++} ++ ++const char *bch2_bkey_format_validate(struct bkey_format *f) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ ++ if (f->nr_fields != BKEY_NR_FIELDS) ++ return "incorrect number of fields"; ++ ++ for (i = 0; i < f->nr_fields; i++) { ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (f->bits_per_field[i] > 64) ++ return "field too large"; ++ ++ if (field_offset && ++ (f->bits_per_field[i] == 64 || ++ (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < ++ field_offset))) ++ return "offset + bits overflow"; ++ ++ bits += f->bits_per_field[i]; ++ } ++ ++ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) ++ return "incorrect key_u64s"; ++ ++ return NULL; ++} ++ ++/* ++ * Most significant differing bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, ++ const struct bkey_packed *l_k, ++ const struct bkey_packed *r_k) ++{ ++ const u64 *l = high_word(&b->format, l_k); ++ const u64 *r = high_word(&b->format, r_k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned word_bits = 64 - high_bit_offset; ++ u64 l_v, r_v; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ /* for big endian, skip past header */ ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (nr_key_bits) { ++ if (nr_key_bits < word_bits) { ++ l_v >>= word_bits - nr_key_bits; ++ r_v >>= word_bits - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= word_bits; ++ } ++ ++ if (l_v != r_v) ++ return fls64(l_v ^ r_v) - 1 + nr_key_bits; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ word_bits = 64; ++ } ++ ++ return 0; ++} ++ ++/* ++ * First set bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(&b->format, k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned ret = 0, offset; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ offset = nr_key_bits; ++ while (offset > 64) { ++ p = next_word(p); ++ offset -= 64; ++ } ++ ++ offset = 64 - offset; ++ ++ while (nr_key_bits) { ++ unsigned bits = nr_key_bits + offset < 64 ++ ? 
nr_key_bits ++ : 64 - offset; ++ ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if (*p & mask) ++ return ret + __ffs64(*p & mask) - offset; ++ ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ ret += bits; ++ offset = 0; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_X86_64 ++ ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ long d0, d1, d2, d3; ++ int cmp; ++ ++ /* we shouldn't need asm for this, but gcc is being retarded: */ ++ ++ asm(".intel_syntax noprefix;" ++ "xor eax, eax;" ++ "xor edx, edx;" ++ "1:;" ++ "mov r8, [rdi];" ++ "mov r9, [rsi];" ++ "sub ecx, 64;" ++ "jl 2f;" ++ ++ "cmp r8, r9;" ++ "jnz 3f;" ++ ++ "lea rdi, [rdi - 8];" ++ "lea rsi, [rsi - 8];" ++ "jmp 1b;" ++ ++ "2:;" ++ "not ecx;" ++ "shr r8, 1;" ++ "shr r9, 1;" ++ "shr r8, cl;" ++ "shr r9, cl;" ++ "cmp r8, r9;" ++ ++ "3:\n" ++ "seta al;" ++ "setb dl;" ++ "sub eax, edx;" ++ ".att_syntax prefix;" ++ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) ++ : "0" (l), "1" (r), "3" (nr_key_bits) ++ : "r8", "r9", "cc", "memory"); ++ ++ return cmp; ++} ++ ++#define I(_x) (*(out)++ = (_x)) ++#define I1(i0) I(i0) ++#define I2(i0, i1) (I1(i0), I(i1)) ++#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) ++#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) ++#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) ++ ++static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, ++ enum bch_bkey_fields field, ++ unsigned dst_offset, unsigned dst_size, ++ bool *eax_zeroed) ++{ ++ unsigned bits = format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(format->field_offset[field]); ++ unsigned i, byte, bit_offset, align, shl, shr; ++ ++ if (!bits && !offset) { ++ if (!*eax_zeroed) { ++ /* xor eax, eax */ ++ I2(0x31, 0xc0); ++ } ++ ++ *eax_zeroed = true; ++ goto set_field; ++ } ++ ++ if (!bits) { ++ /* just return offset: */ ++ ++ switch (dst_size) { ++ case 8: ++ if (offset > S32_MAX) { ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ ++ I3(0xc7, 0x47, dst_offset + 4); ++ memcpy(out, (void *) &offset + 4, 4); ++ out += 4; ++ } else { ++ /* mov [rdi + dst_offset], offset */ ++ /* sign extended */ ++ I4(0x48, 0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++ } ++ ++ bit_offset = format->key_u64s * 64; ++ for (i = 0; i <= field; i++) ++ bit_offset -= format->bits_per_field[i]; ++ ++ byte = bit_offset / 8; ++ bit_offset -= byte * 8; ++ ++ *eax_zeroed = false; ++ ++ if (bit_offset == 0 && bits == 8) { ++ /* movzx eax, BYTE PTR [rsi + imm8] */ ++ I4(0x0f, 0xb6, 0x46, byte); ++ } else if (bit_offset == 0 && bits == 16) { ++ /* movzx eax, WORD PTR [rsi + imm8] */ ++ I4(0x0f, 0xb7, 0x46, byte); ++ } else if (bit_offset + bits <= 32) { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 32); ++ ++ /* mov eax, [rsi + imm8] */ ++ I3(0x8b, 0x46, byte); ++ ++ if (bit_offset) { ++ /* shr eax, imm8 */ ++ I3(0xc1, 0xe8, bit_offset); ++ } ++ ++ if (bit_offset + bits < 32) { ++ unsigned mask = ~0U >> (32 - bits); ++ ++ /* and eax, imm32 */ ++ I1(0x25); ++ memcpy(out, &mask, 4); ++ out += 4; ++ } ++ } else if (bit_offset + bits <= 64) { ++ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); ++ byte -= align; ++ 
bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 64); ++ ++ /* mov rax, [rsi + imm8] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ shl = 64 - bit_offset - bits; ++ shr = bit_offset + shl; ++ ++ if (shl) { ++ /* shl rax, imm8 */ ++ I4(0x48, 0xc1, 0xe0, shl); ++ } ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } else { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 96); ++ ++ /* mov rax, [rsi + byte] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ /* mov edx, [rsi + byte + 8] */ ++ I3(0x8b, 0x56, byte + 8); ++ ++ /* bits from next word: */ ++ shr = bit_offset + bits - 64; ++ BUG_ON(shr > bit_offset); ++ ++ /* shr rax, bit_offset */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ ++ /* shl rdx, imm8 */ ++ I4(0x48, 0xc1, 0xe2, 64 - shr); ++ ++ /* or rax, rdx */ ++ I3(0x48, 0x09, 0xd0); ++ ++ shr = bit_offset - shr; ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } ++ ++ /* rax += offset: */ ++ if (offset > S32_MAX) { ++ /* mov rdx, imm64 */ ++ I2(0x48, 0xba); ++ memcpy(out, &offset, 8); ++ out += 8; ++ /* add %rdx, %rax */ ++ I3(0x48, 0x01, 0xd0); ++ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { ++ /* add rax, imm32 */ ++ I2(0x48, 0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } else if (offset) { ++ /* add eax, imm32 */ ++ I1(0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++set_field: ++ switch (dst_size) { ++ case 8: ++ /* mov [rdi + dst_offset], rax */ ++ I4(0x48, 0x89, 0x47, dst_offset); ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], eax */ ++ I3(0x89, 0x47, dst_offset); ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++} ++ ++int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) ++{ ++ bool eax_zeroed = false; ++ u8 *out = _out; ++ ++ /* ++ * rdi: dst - unpacked key ++ * rsi: src - packed key ++ */ ++ ++ /* k->u64s, k->format, k->type */ ++ ++ /* mov eax, [rsi] */ ++ I2(0x8b, 0x06); ++ ++ /* add eax, BKEY_U64s - format->key_u64s */ ++ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); ++ ++ /* and eax, imm32: mask out k->pad: */ ++ I5(0x25, 0xff, 0xff, 0xff, 0); ++ ++ /* mov [rdi], eax */ ++ I2(0x89, 0x07); ++ ++#define x(id, field) \ ++ out = compile_bkey_field(format, out, id, \ ++ offsetof(struct bkey, field), \ ++ sizeof(((struct bkey *) NULL)->field), \ ++ &eax_zeroed); ++ bkey_fields() ++#undef x ++ ++ /* retq */ ++ I1(0xc3); ++ ++ return (void *) out - _out; ++} ++ ++#else ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ u64 l_v, r_v; ++ ++ if (!nr_key_bits) ++ return 0; ++ ++ /* for big endian, skip past header */ ++ nr_key_bits += high_bit_offset; ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (1) { ++ if (nr_key_bits < 64) { ++ l_v >>= 64 - nr_key_bits; ++ r_v >>= 64 - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= 64; ++ } ++ ++ if (!nr_key_bits || l_v != r_v) ++ break; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ } ++ ++ return cmp_int(l_v, r_v); ++} ++#endif ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ int ret; ++ ++ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ ret = __bkey_cmp_bits(high_word(f, l), ++ high_word(f, r), ++ 
b->nr_key_bits); ++ ++ EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), ++ bkey_unpack_pos(b, r))); ++ return ret; ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_packed(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ int packed = bkey_lr_packed(l, r); ++ ++ if (likely(packed == BKEY_PACKED_BOTH)) ++ return __bch2_bkey_cmp_packed_format_checked(l, r, b); ++ ++ switch (packed) { ++ case BKEY_PACKED_NONE: ++ return bkey_cmp(((struct bkey *) l)->p, ++ ((struct bkey *) r)->p); ++ case BKEY_PACKED_LEFT: ++ return __bch2_bkey_cmp_left_packed_format_checked(b, ++ (struct bkey_packed *) l, ++ &((struct bkey *) r)->p); ++ case BKEY_PACKED_RIGHT: ++ return -__bch2_bkey_cmp_left_packed_format_checked(b, ++ (struct bkey_packed *) r, ++ &((struct bkey *) l)->p); ++ default: ++ unreachable(); ++ } ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ const struct bkey *l_unpacked; ++ ++ return unlikely(l_unpacked = packed_to_bkey_c(l)) ++ ? bkey_cmp(l_unpacked->p, *r) ++ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++void bch2_bpos_swab(struct bpos *p) ++{ ++ u8 *l = (u8 *) p; ++ u8 *h = ((u8 *) &p[1]) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) ++{ ++ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; ++ u8 *l = k->key_start; ++ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void) ++{ ++ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); ++ struct bkey_packed p; ++ ++ struct bkey_format test_format = { ++ .key_u64s = 2, ++ .nr_fields = BKEY_NR_FIELDS, ++ .bits_per_field = { ++ 13, ++ 64, ++ }, ++ }; ++ ++ struct unpack_state in_s = ++ unpack_state_init(&bch2_bkey_format_current, (void *) &t); ++ struct pack_state out_s = pack_state_init(&test_format, &p); ++ unsigned i; ++ ++ for (i = 0; i < out_s.format->nr_fields; i++) { ++ u64 a, v = get_inc_field(&in_s, i); ++ ++ switch (i) { ++#define x(id, field) case id: a = t.field; break; ++ bkey_fields() ++#undef x ++ default: ++ BUG(); ++ } ++ ++ if (a != v) ++ panic("got %llu actual %llu i %u\n", v, a, i); ++ ++ if (!set_inc_field(&out_s, i, v)) ++ panic("failed at %u\n", i); ++ } ++ ++ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); ++} ++#endif +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +new file mode 100644 +index 000000000000..b26f4934b264 +--- /dev/null ++++ b/fs/bcachefs/bkey.h +@@ -0,0 +1,594 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_H ++#define _BCACHEFS_BKEY_H ++ ++#include ++#include "bcachefs_format.h" ++ ++#include "util.h" ++#include "vstructs.h" ++ ++#ifdef CONFIG_X86_64 ++#define HAVE_BCACHEFS_COMPILED_UNPACK 1 ++#endif ++ ++void bch2_to_binary(char *, const u64 *, unsigned); ++ ++/* bkey with split value, const */ ++struct bkey_s_c { ++ const struct bkey *k; ++ const struct bch_val *v; ++}; ++ ++/* bkey with split value */ ++struct bkey_s { ++ union { ++ struct { ++ struct bkey *k; ++ struct bch_val *v; ++ }; ++ struct bkey_s_c s_c; ++ }; ++}; ++ ++#define bkey_next(_k) 
vstruct_next(_k) ++ ++#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) ++ ++static inline size_t bkey_val_bytes(const struct bkey *k) ++{ ++ return bkey_val_u64s(k) * sizeof(u64); ++} ++ ++static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) ++{ ++ k->u64s = BKEY_U64s + val_u64s; ++} ++ ++static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) ++{ ++ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) ++ ++#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) ++ ++#define bkey_whiteout(_k) \ ++ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) ++ ++#define bkey_packed_typecheck(_k) \ ++({ \ ++ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ ++ !type_is(_k, struct bkey_packed *)); \ ++ type_is(_k, struct bkey_packed *); \ ++}) ++ ++enum bkey_lr_packed { ++ BKEY_PACKED_BOTH, ++ BKEY_PACKED_RIGHT, ++ BKEY_PACKED_LEFT, ++ BKEY_PACKED_NONE, ++}; ++ ++#define bkey_lr_packed_typecheck(_l, _r) \ ++ (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) ++ ++#define bkey_lr_packed(_l, _r) \ ++ ((_l)->format + ((_r)->format << 1)) ++ ++#define bkey_copy(_dst, _src) \ ++do { \ ++ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ ++ !type_is(_dst, struct bkey_packed *)); \ ++ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ ++ !type_is(_src, struct bkey_packed *)); \ ++ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ ++ (u64 *) (_dst) < (u64 *) (_src) + \ ++ ((struct bkey *) (_src))->u64s); \ ++ \ ++ memcpy_u64s_small((_dst), (_src), \ ++ ((struct bkey *) (_src))->u64s); \ ++} while (0) ++ ++struct btree; ++ ++struct bkey_format_state { ++ u64 field_min[BKEY_NR_FIELDS]; ++ u64 field_max[BKEY_NR_FIELDS]; ++}; ++ ++void bch2_bkey_format_init(struct bkey_format_state *); ++void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); ++void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); ++const char *bch2_bkey_format_validate(struct bkey_format *); ++ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++__pure ++unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, ++ const struct bkey_packed *, ++ const struct btree *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++__pure ++int __bch2_bkey_cmp_packed(const struct bkey_packed *, ++ const struct bkey_packed *, ++ const struct btree *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++static inline __pure ++int bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, const struct bpos *r) ++{ ++ return __bch2_bkey_cmp_left_packed(b, l, r); ++} ++ ++/* ++ * we prefer to pass bpos by ref, but it's often enough terribly convenient to ++ * pass it by by val... 
as much as I hate c++, const ref would be nice here: ++ */ ++__pure __flatten ++static inline int bkey_cmp_left_packed_byval(const struct btree *b, ++ const struct bkey_packed *l, ++ struct bpos r) ++{ ++ return bkey_cmp_left_packed(b, l, &r); ++} ++ ++/* ++ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to ++ * skip dispatching on k->format: ++ */ ++#define bkey_cmp_packed(_b, _l, _r) \ ++({ \ ++ int _cmp; \ ++ \ ++ switch (bkey_lr_packed_typecheck(_l, _r)) { \ ++ case BKEY_PACKED_NONE: \ ++ _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ ++ ((struct bkey *) (_r))->p); \ ++ break; \ ++ case BKEY_PACKED_LEFT: \ ++ _cmp = bkey_cmp_left_packed((_b), \ ++ (struct bkey_packed *) (_l), \ ++ &((struct bkey *) (_r))->p); \ ++ break; \ ++ case BKEY_PACKED_RIGHT: \ ++ _cmp = -bkey_cmp_left_packed((_b), \ ++ (struct bkey_packed *) (_r), \ ++ &((struct bkey *) (_l))->p); \ ++ break; \ ++ case BKEY_PACKED_BOTH: \ ++ _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ ++ (void *) (_r), (_b)); \ ++ break; \ ++ } \ ++ _cmp; \ ++}) ++ ++#if 1 ++static __always_inline int bkey_cmp(struct bpos l, struct bpos r) ++{ ++ if (l.inode != r.inode) ++ return l.inode < r.inode ? -1 : 1; ++ if (l.offset != r.offset) ++ return l.offset < r.offset ? -1 : 1; ++ if (l.snapshot != r.snapshot) ++ return l.snapshot < r.snapshot ? -1 : 1; ++ return 0; ++} ++#else ++int bkey_cmp(struct bpos l, struct bpos r); ++#endif ++ ++static inline struct bpos bpos_min(struct bpos l, struct bpos r) ++{ ++ return bkey_cmp(l, r) < 0 ? l : r; ++} ++ ++void bch2_bpos_swab(struct bpos *); ++void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); ++ ++static __always_inline int bversion_cmp(struct bversion l, struct bversion r) ++{ ++ return cmp_int(l.hi, r.hi) ?: ++ cmp_int(l.lo, r.lo); ++} ++ ++#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) ++#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) ++ ++static __always_inline int bversion_zero(struct bversion v) ++{ ++ return !bversion_cmp(v, ZERO_VERSION); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++/* statement expressions confusing unlikely()? */ ++#define bkey_packed(_k) \ ++ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ ++ (_k)->format != KEY_FORMAT_CURRENT; }) ++#else ++#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) ++#endif ++ ++/* ++ * It's safe to treat an unpacked bkey as a packed one, but not the reverse ++ */ ++static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) ++{ ++ return (struct bkey_packed *) k; ++} ++ ++static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) ++{ ++ return (const struct bkey_packed *) k; ++} ++ ++static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? NULL : (struct bkey_i *) k; ++} ++ ++static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? 
NULL : (const struct bkey *) k; ++} ++ ++static inline unsigned bkey_format_key_bits(const struct bkey_format *format) ++{ ++ return format->bits_per_field[BKEY_FIELD_INODE] + ++ format->bits_per_field[BKEY_FIELD_OFFSET] + ++ format->bits_per_field[BKEY_FIELD_SNAPSHOT]; ++} ++ ++static inline struct bpos bkey_successor(struct bpos p) ++{ ++ struct bpos ret = p; ++ ++ if (!++ret.offset) ++ BUG_ON(!++ret.inode); ++ ++ return ret; ++} ++ ++static inline struct bpos bkey_predecessor(struct bpos p) ++{ ++ struct bpos ret = p; ++ ++ if (!ret.offset--) ++ BUG_ON(!ret.inode--); ++ ++ return ret; ++} ++ ++static inline u64 bkey_start_offset(const struct bkey *k) ++{ ++ return k->p.offset - k->size; ++} ++ ++static inline struct bpos bkey_start_pos(const struct bkey *k) ++{ ++ return (struct bpos) { ++ .inode = k->p.inode, ++ .offset = bkey_start_offset(k), ++ .snapshot = k->p.snapshot, ++ }; ++} ++ ++/* Packed helpers */ ++ ++static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; ++ ++ EBUG_ON(k->u64s < ret); ++ return ret; ++} ++ ++static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_key_u64s(format, k) * sizeof(u64); ++} ++ ++static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return k->u64s - bkeyp_key_u64s(format, k); ++} ++ ++static inline size_t bkeyp_val_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_val_u64s(format, k) * sizeof(u64); ++} ++ ++static inline void set_bkeyp_val_u64s(const struct bkey_format *format, ++ struct bkey_packed *k, unsigned val_u64s) ++{ ++ k->u64s = bkeyp_key_u64s(format, k) + val_u64s; ++} ++ ++#define bkeyp_val(_format, _k) \ ++ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) ++ ++extern const struct bkey_format bch2_bkey_format_current; ++ ++bool bch2_bkey_transform(const struct bkey_format *, ++ struct bkey_packed *, ++ const struct bkey_format *, ++ const struct bkey_packed *); ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *, ++ const struct bkey_packed *); ++#endif ++ ++bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, ++ const struct bkey_format *); ++ ++enum bkey_pack_pos_ret { ++ BKEY_PACK_POS_EXACT, ++ BKEY_PACK_POS_SMALLER, ++ BKEY_PACK_POS_FAIL, ++}; ++ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, ++ const struct btree *); ++ ++static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, ++ const struct btree *b) ++{ ++ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; ++} ++ ++void bch2_bkey_unpack(const struct btree *, struct bkey_i *, ++ const struct bkey_packed *); ++bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, ++ const struct bkey_format *); ++ ++static inline u64 bkey_field_max(const struct bkey_format *f, ++ enum bch_bkey_fields nr) ++{ ++ return f->bits_per_field[nr] < 64 ++ ? 
(le64_to_cpu(f->field_offset[nr]) + ++ ~(~0ULL << f->bits_per_field[nr])) ++ : U64_MAX; ++} ++ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ ++int bch2_compile_bkey_format(const struct bkey_format *, void *); ++ ++#else ++ ++static inline int bch2_compile_bkey_format(const struct bkey_format *format, ++ void *out) { return 0; } ++ ++#endif ++ ++static inline void bkey_reassemble(struct bkey_i *dst, ++ struct bkey_s_c src) ++{ ++ BUG_ON(bkey_packed(src.k)); ++ dst->k = *src.k; ++ memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k)); ++} ++ ++#define bkey_s_null ((struct bkey_s) { .k = NULL }) ++#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) ++ ++#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) ++#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) ++ ++static inline struct bkey_s bkey_to_s(struct bkey *k) ++{ ++ return (struct bkey_s) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) ++{ ++ return (struct bkey_s_c) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) ++{ ++ return (struct bkey_s) { .k = &k->k, .v = &k->v }; ++} ++ ++static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) ++{ ++ return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; ++} ++ ++/* ++ * For a given type of value (e.g. struct bch_extent), generates the types for ++ * bkey + bch_extent - inline, split, split const - and also all the conversion ++ * functions, which also check that the value is of the correct type. ++ * ++ * We use anonymous unions for upcasting - e.g. converting from e.g. a ++ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion ++ * functions. ++ */ ++#define BKEY_VAL_ACCESSORS(name) \ ++struct bkey_i_##name { \ ++ union { \ ++ struct bkey k; \ ++ struct bkey_i k_i; \ ++ }; \ ++ struct bch_##name v; \ ++}; \ ++ \ ++struct bkey_s_c_##name { \ ++ union { \ ++ struct { \ ++ const struct bkey *k; \ ++ const struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++struct bkey_s_##name { \ ++ union { \ ++ struct { \ ++ struct bkey *k; \ ++ struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c_##name c; \ ++ struct bkey_s s; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline const struct bkey_i_##name * \ ++bkey_i_to_##name##_c(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ ++{ \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++name##_i_to_s_c(const struct bkey_i_##name *k) \ ++{ \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline 
struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++bkey_i_to_s_c_##name(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ ++{ \ ++ struct bkey_i_##name *k = \ ++ container_of(&_k->k, struct bkey_i_##name, k); \ ++ \ ++ bkey_init(&k->k); \ ++ memset(&k->v, 0, sizeof(k->v)); \ ++ k->k.type = KEY_TYPE_##name; \ ++ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ ++ \ ++ return k; \ ++} ++ ++BKEY_VAL_ACCESSORS(cookie); ++BKEY_VAL_ACCESSORS(btree_ptr); ++BKEY_VAL_ACCESSORS(extent); ++BKEY_VAL_ACCESSORS(reservation); ++BKEY_VAL_ACCESSORS(inode); ++BKEY_VAL_ACCESSORS(inode_generation); ++BKEY_VAL_ACCESSORS(dirent); ++BKEY_VAL_ACCESSORS(xattr); ++BKEY_VAL_ACCESSORS(alloc); ++BKEY_VAL_ACCESSORS(quota); ++BKEY_VAL_ACCESSORS(stripe); ++BKEY_VAL_ACCESSORS(reflink_p); ++BKEY_VAL_ACCESSORS(reflink_v); ++ ++/* byte order helpers */ ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return f->key_u64s - 1; ++} ++ ++#define high_bit_offset 0 ++#define nth_word(p, n) ((p) - (n)) ++ ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return 0; ++} ++ ++#define high_bit_offset KEY_PACKED_BITS_START ++#define nth_word(p, n) ((p) + (n)) ++ ++#else ++#error edit for your odd byteorder. 
++#endif ++ ++#define high_word(f, k) ((k)->_data + high_word_offset(f)) ++#define next_word(p) nth_word(p, 1) ++#define prev_word(p) nth_word(p, -1) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void); ++#else ++static inline void bch2_bkey_pack_test(void) {} ++#endif ++ ++#endif /* _BCACHEFS_BKEY_H */ +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +new file mode 100644 +index 000000000000..f01405dd502b +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.c +@@ -0,0 +1,262 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "alloc_background.h" ++#include "dirent.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "quota.h" ++#include "reflink.h" ++#include "xattr.h" ++ ++const char * const bch2_bkey_types[] = { ++#define x(name, nr) #name, ++ BCH_BKEY_TYPES() ++#undef x ++ NULL ++}; ++ ++static const char *deleted_key_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_deleted (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++#define bch2_bkey_ops_discard (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k)) ++ return "value size should be zero"; ++ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_error (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static const char *key_type_cookie_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_cookie (struct bkey_ops) { \ ++ .key_invalid = key_type_cookie_invalid, \ ++} ++ ++#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static const struct bkey_ops bch2_bkey_ops[] = { ++#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, ++ BCH_BKEY_TYPES() ++#undef x ++}; ++ ++const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (k.k->type >= KEY_TYPE_MAX) ++ return "invalid type"; ++ ++ return bch2_bkey_ops[k.k->type].key_invalid(c, k); ++} ++ ++const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type) ++{ ++ if (k.k->u64s < BKEY_U64s) ++ return "u64s too small"; ++ ++ if ((btree_node_type_is_extents(type) || ++ type == BKEY_TYPE_BTREE) && ++ bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) ++ return "value too big"; ++ ++ if (btree_node_type_is_extents(type)) { ++ if ((k.k->size == 0) != bkey_deleted(k.k)) ++ return "bad size field"; ++ ++ if (k.k->size > k.k->p.offset) ++ return "size greater than offset"; ++ } else { ++ if (k.k->size) ++ return "nonzero size field"; ++ } ++ ++ if (k.k->p.snapshot) ++ return "nonzero snapshot"; ++ ++ if (type != BKEY_TYPE_BTREE && ++ !bkey_cmp(k.k->p, POS_MAX)) ++ return "POS_MAX key"; ++ ++ return NULL; ++} ++ ++const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type) ++{ ++ return __bch2_bkey_invalid(c, k, type) ?: ++ bch2_bkey_val_invalid(c, k); ++} ++ ++const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) ++{ ++ if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0) ++ return "key before start of btree node"; ++ ++ if (bkey_cmp(k.k->p, b->data->max_key) > 0) ++ return "key past end of btree node"; 
++ ++ return NULL; ++} ++ ++void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ const char *invalid; ++ ++ BUG_ON(!k.k->u64s); ++ ++ invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, k); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid); ++ return; ++ } ++ ++ if (ops->key_debugcheck) ++ ops->key_debugcheck(c, k); ++} ++ ++void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) ++{ ++ if (!bkey_cmp(pos, POS_MIN)) ++ pr_buf(out, "POS_MIN"); ++ else if (!bkey_cmp(pos, POS_MAX)) ++ pr_buf(out, "POS_MAX"); ++ else ++ pr_buf(out, "%llu:%llu", pos.inode, pos.offset); ++} ++ ++void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) ++{ ++ pr_buf(out, "u64s %u type %s ", k->u64s, ++ bch2_bkey_types[k->type]); ++ ++ bch2_bpos_to_text(out, k->p); ++ ++ pr_buf(out, " snap %u len %u ver %llu", ++ k->p.snapshot, k->size, k->version.lo); ++} ++ ++void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (likely(ops->val_to_text)) ++ ops->val_to_text(out, c, k); ++} ++ ++void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_to_text(out, k.k); ++ pr_buf(out, ": "); ++ bch2_val_to_text(out, c, k); ++} ++ ++void bch2_bkey_swab(const struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k->type]; ++ ++ bch2_bkey_swab_key(f, k); ++ ++ if (ops->swab) ++ ops->swab(f, k); ++} ++ ++bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ return ops->key_normalize ++ ? ops->key_normalize(c, k) ++ : false; ++} ++ ++enum merge_result bch2_bkey_merge(struct bch_fs *c, ++ struct bkey_s l, struct bkey_s r) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; ++ enum merge_result ret; ++ ++ if (key_merging_disabled(c) || ++ !ops->key_merge || ++ l.k->type != r.k->type || ++ bversion_cmp(l.k->version, r.k->version) || ++ bkey_cmp(l.k->p, bkey_start_pos(r.k))) ++ return BCH_MERGE_NOMERGE; ++ ++ ret = ops->key_merge(c, l, r); ++ ++ if (ret != BCH_MERGE_NOMERGE) ++ l.k->needs_whiteout |= r.k->needs_whiteout; ++ return ret; ++} ++ ++static const struct old_bkey_type { ++ u8 btree_node_type; ++ u8 old; ++ u8 new; ++} bkey_renumber_table[] = { ++ {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, ++ {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, ++ {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, ++ {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, ++ {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, ++ {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, ++ {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, ++ {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, ++ {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, ++ {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, ++ {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, ++ {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, ++}; ++ ++void bch2_bkey_renumber(enum btree_node_type btree_node_type, ++ struct bkey_packed *k, ++ int write) ++{ ++ const struct old_bkey_type *i; ++ ++ for (i = bkey_renumber_table; ++ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); ++ i++) ++ if (btree_node_type == i->btree_node_type && ++ k->type == (write ? i->new : i->old)) { ++ k->type = write ? 
i->old : i->new; ++ break; ++ } ++} +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +new file mode 100644 +index 000000000000..8568b65c1ed2 +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_METHODS_H ++#define _BCACHEFS_BKEY_METHODS_H ++ ++#include "bkey.h" ++ ++struct bch_fs; ++struct btree; ++struct bkey; ++enum btree_node_type; ++ ++extern const char * const bch2_bkey_types[]; ++ ++enum merge_result { ++ BCH_MERGE_NOMERGE, ++ ++ /* ++ * The keys were mergeable, but would have overflowed size - so instead ++ * l was changed to the maximum size, and both keys were modified: ++ */ ++ BCH_MERGE_PARTIAL, ++ BCH_MERGE_MERGE, ++}; ++ ++struct bkey_ops { ++ /* Returns reason for being invalid if invalid, else NULL: */ ++ const char * (*key_invalid)(const struct bch_fs *, ++ struct bkey_s_c); ++ void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); ++ void (*val_to_text)(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ void (*swab)(const struct bkey_format *, struct bkey_packed *); ++ bool (*key_normalize)(struct bch_fs *, struct bkey_s); ++ enum merge_result (*key_merge)(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++}; ++ ++const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); ++const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type); ++const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type); ++const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); ++ ++void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); ++ ++void bch2_bpos_to_text(struct printbuf *, struct bpos); ++void bch2_bkey_to_text(struct printbuf *, const struct bkey *); ++void bch2_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *); ++ ++bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); ++ ++enum merge_result bch2_bkey_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); ++ ++#endif /* _BCACHEFS_BKEY_METHODS_H */ +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +new file mode 100644 +index 000000000000..e32fad5a91ac +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.c +@@ -0,0 +1,630 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_sort.h" ++#include "bset.h" ++#include "extents.h" ++ ++/* too many iterators, need to clean this up */ ++ ++/* btree_node_iter_large: */ ++ ++#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r) ++ ++static inline bool ++bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter) ++{ ++ return !iter->used; ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter, ++ struct btree *b) ++{ ++ return bch2_btree_node_iter_large_end(iter) ++ ? 
NULL ++ : __btree_node_offset_to_key(b, iter->data->k); ++} ++ ++static void ++bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, ++ struct btree *b) ++{ ++ iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s; ++ ++ EBUG_ON(!iter->used); ++ EBUG_ON(iter->data->k > iter->data->end); ++ ++ if (iter->data->k == iter->data->end) ++ heap_del(iter, 0, btree_node_iter_cmp_heap, NULL); ++ else ++ heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter, ++ struct btree *b) ++{ ++ struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b); ++ ++ if (ret) ++ bch2_btree_node_iter_large_advance(iter, b); ++ ++ return ret; ++} ++ ++void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ if (k != end) { ++ struct btree_node_iter_set n = ++ ((struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k), ++ __btree_node_key_to_offset(b, end) ++ }); ++ ++ __heap_add(iter, n, btree_node_iter_cmp_heap, NULL); ++ } ++} ++ ++static void sort_key_next(struct btree_node_iter_large *iter, ++ struct btree *b, ++ struct btree_node_iter_set *i) ++{ ++ i->k += __btree_node_offset_to_key(b, i->k)->u64s; ++ ++ if (i->k == i->end) ++ *i = iter->data[--iter->used]; ++} ++ ++/* regular sort_iters */ ++ ++typedef int (*sort_cmp_fn)(struct btree *, ++ struct bkey_packed *, ++ struct bkey_packed *); ++ ++static inline void __sort_iter_sift(struct sort_iter *iter, ++ unsigned from, ++ sort_cmp_fn cmp) ++{ ++ unsigned i; ++ ++ for (i = from; ++ i + 1 < iter->used && ++ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; ++ i++) ++ swap(iter->data[i], iter->data[i + 1]); ++} ++ ++static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ ++ __sort_iter_sift(iter, 0, cmp); ++} ++ ++static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ unsigned i = iter->used; ++ ++ while (i--) ++ __sort_iter_sift(iter, i, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) ++{ ++ return iter->used ? iter->data->k : NULL; ++} ++ ++static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ iter->data->k = bkey_next(iter->data->k); ++ ++ BUG_ON(iter->data->k > iter->data->end); ++ ++ if (iter->data->k == iter->data->end) ++ array_remove_item(iter->data, iter->used, 0); ++ else ++ sort_iter_sift(iter, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, ++ sort_cmp_fn cmp) ++{ ++ struct bkey_packed *ret = sort_iter_peek(iter); ++ ++ if (ret) ++ sort_iter_advance(iter, cmp); ++ ++ return ret; ++} ++ ++/* ++ * Returns true if l > r - unless l == r, in which case returns true if l is ++ * older than r. ++ * ++ * Necessary for btree_sort_fixup() - if there are multiple keys that compare ++ * equal in different sets, we have to process them newest to oldest. 
++ */ ++#define key_sort_cmp(h, l, r) \ ++({ \ ++ bkey_cmp_packed(b, \ ++ __btree_node_offset_to_key(b, (l).k), \ ++ __btree_node_offset_to_key(b, (r).k)) \ ++ \ ++ ?: (l).k - (r).k; \ ++}) ++ ++static inline bool should_drop_next_key(struct btree_node_iter_large *iter, ++ struct btree *b) ++{ ++ struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; ++ struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); ++ ++ if (bkey_whiteout(k)) ++ return true; ++ ++ if (iter->used < 2) ++ return false; ++ ++ if (iter->used > 2 && ++ key_sort_cmp(iter, r[0], r[1]) >= 0) ++ r++; ++ ++ /* ++ * key_sort_cmp() ensures that when keys compare equal the older key ++ * comes first; so if l->k compares equal to r->k then l->k is older and ++ * should be dropped. ++ */ ++ return !bkey_cmp_packed(b, ++ __btree_node_offset_to_key(b, l->k), ++ __btree_node_offset_to_key(b, r->k)); ++} ++ ++struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, ++ struct btree *b, ++ struct btree_node_iter_large *iter) ++{ ++ struct bkey_packed *out = dst->start; ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ heap_resort(iter, key_sort_cmp, NULL); ++ ++ while (!bch2_btree_node_iter_large_end(iter)) { ++ if (!should_drop_next_key(iter, b)) { ++ struct bkey_packed *k = ++ __btree_node_offset_to_key(b, iter->data->k); ++ ++ bkey_copy(out, k); ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ sort_key_next(iter, b, iter->data); ++ heap_sift_down(iter, 0, key_sort_cmp, NULL); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++/* ++ * If keys compare equal, compare by pointer order: ++ * ++ * Necessary for sort_fix_overlapping() - if there are multiple keys that ++ * compare equal in different sets, we have to process them newest to oldest. 
++ */ ++#define extent_sort_cmp(h, l, r) \ ++({ \ ++ struct bkey _ul = bkey_unpack_key(b, \ ++ __btree_node_offset_to_key(b, (l).k)); \ ++ struct bkey _ur = bkey_unpack_key(b, \ ++ __btree_node_offset_to_key(b, (r).k)); \ ++ \ ++ bkey_cmp(bkey_start_pos(&_ul), \ ++ bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ ++}) ++ ++static inline void extent_sort_sift(struct btree_node_iter_large *iter, ++ struct btree *b, size_t i) ++{ ++ heap_sift_down(iter, i, extent_sort_cmp, NULL); ++} ++ ++static inline void extent_sort_next(struct btree_node_iter_large *iter, ++ struct btree *b, ++ struct btree_node_iter_set *i) ++{ ++ sort_key_next(iter, b, i); ++ heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL); ++} ++ ++static void extent_sort_advance_prev(struct bkey_format *f, ++ struct btree_nr_keys *nr, ++ struct bkey_packed *start, ++ struct bkey_packed **prev) ++{ ++ if (*prev) { ++ bch2_bkey_pack(*prev, (void *) *prev, f); ++ ++ btree_keys_account_key_add(nr, 0, *prev); ++ *prev = bkey_next(*prev); ++ } else { ++ *prev = start; ++ } ++} ++ ++static void extent_sort_append(struct bch_fs *c, ++ struct bkey_format *f, ++ struct btree_nr_keys *nr, ++ struct bkey_packed *start, ++ struct bkey_packed **prev, ++ struct bkey_s k) ++{ ++ if (bkey_whiteout(k.k)) ++ return; ++ ++ /* ++ * prev is always unpacked, for key merging - until right before we ++ * advance it: ++ */ ++ ++ if (*prev && ++ bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) == ++ BCH_MERGE_MERGE) ++ return; ++ ++ extent_sort_advance_prev(f, nr, start, prev); ++ ++ bkey_reassemble((void *) *prev, k.s_c); ++} ++ ++struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, ++ struct bset *dst, ++ struct btree *b, ++ struct btree_node_iter_large *iter) ++{ ++ struct bkey_format *f = &b->format; ++ struct btree_node_iter_set *_l = iter->data, *_r; ++ struct bkey_packed *prev = NULL, *lk, *rk; ++ struct bkey l_unpacked, r_unpacked; ++ struct bkey_s l, r; ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ heap_resort(iter, extent_sort_cmp, NULL); ++ ++ while (!bch2_btree_node_iter_large_end(iter)) { ++ lk = __btree_node_offset_to_key(b, _l->k); ++ l = __bkey_disassemble(b, lk, &l_unpacked); ++ ++ if (iter->used == 1) { ++ extent_sort_append(c, f, &nr, dst->start, &prev, l); ++ extent_sort_next(iter, b, _l); ++ continue; ++ } ++ ++ _r = iter->data + 1; ++ if (iter->used > 2 && ++ extent_sort_cmp(iter, _r[0], _r[1]) >= 0) ++ _r++; ++ ++ rk = __btree_node_offset_to_key(b, _r->k); ++ r = __bkey_disassemble(b, rk, &r_unpacked); ++ ++ /* If current key and next key don't overlap, just append */ ++ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { ++ extent_sort_append(c, f, &nr, dst->start, &prev, l); ++ extent_sort_next(iter, b, _l); ++ continue; ++ } ++ ++ /* Skip 0 size keys */ ++ if (!r.k->size) { ++ extent_sort_next(iter, b, _r); ++ continue; ++ } ++ ++ /* ++ * overlap: keep the newer key and trim the older key so they ++ * don't overlap. comparing pointers tells us which one is ++ * newer, since the bsets are appended one after the other. 
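++		 *
++		 * Three cases are handled below: the newer key completely covers
++		 * the older one (the older key is simply skipped), the newer key
++		 * overlaps one end of the older one (the older key is trimmed
++		 * with cut_front/cut_back), or the newer key lands in the middle
++		 * of the older one (the older key is split around it).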
++ */ ++ ++ /* can't happen because of comparison func */ ++ BUG_ON(_l->k < _r->k && ++ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); ++ ++ if (_l->k > _r->k) { ++ /* l wins, trim r */ ++ if (bkey_cmp(l.k->p, r.k->p) >= 0) { ++ sort_key_next(iter, b, _r); ++ } else { ++ __bch2_cut_front(l.k->p, r); ++ extent_save(b, rk, r.k); ++ } ++ ++ extent_sort_sift(iter, b, _r - iter->data); ++ } else if (bkey_cmp(l.k->p, r.k->p) > 0) { ++ BKEY_PADDED(k) tmp; ++ ++ /* ++ * r wins, but it overlaps in the middle of l - split l: ++ */ ++ bkey_reassemble(&tmp.k, l.s_c); ++ bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); ++ ++ __bch2_cut_front(r.k->p, l); ++ extent_save(b, lk, l.k); ++ ++ extent_sort_sift(iter, b, 0); ++ ++ extent_sort_append(c, f, &nr, dst->start, ++ &prev, bkey_i_to_s(&tmp.k)); ++ } else { ++ bch2_cut_back(bkey_start_pos(r.k), l.k); ++ extent_save(b, lk, l.k); ++ } ++ } ++ ++ extent_sort_advance_prev(f, &nr, dst->start, &prev); ++ ++ dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); ++ return nr; ++} ++ ++/* Sort + repack in a new format: */ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *dst, struct btree *src, ++ struct btree_node_iter *src_iter, ++ struct bkey_format *out_f, ++ bool filter_whiteouts) ++{ ++ struct bkey_format *in_f = &src->format; ++ struct bkey_packed *in, *out = vstruct_last(dst); ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { ++ if (filter_whiteouts && bkey_whiteout(in)) ++ continue; ++ ++ if (bch2_bkey_transform(out_f, out, bkey_packed(in) ++ ? in_f : &bch2_bkey_format_current, in)) ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ else ++ bch2_bkey_unpack(src, (void *) out, in); ++ ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++/* Sort, repack, and merge: */ ++struct btree_nr_keys ++bch2_sort_repack_merge(struct bch_fs *c, ++ struct bset *dst, struct btree *src, ++ struct btree_node_iter *iter, ++ struct bkey_format *out_f, ++ bool filter_whiteouts) ++{ ++ struct bkey_packed *prev = NULL, *k_packed; ++ struct bkey_s k; ++ struct btree_nr_keys nr; ++ BKEY_PADDED(k) tmp; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { ++ if (filter_whiteouts && bkey_whiteout(k_packed)) ++ continue; ++ ++ EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) > ++ BKEY_EXTENT_VAL_U64s_MAX); ++ ++ bch2_bkey_unpack(src, &tmp.k, k_packed); ++ k = bkey_i_to_s(&tmp.k); ++ ++ if (filter_whiteouts && ++ bch2_bkey_normalize(c, k)) ++ continue; ++ ++ extent_sort_append(c, out_f, &nr, vstruct_last(dst), &prev, k); ++ } ++ ++ extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev); ++ ++ dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); ++ return nr; ++} ++ ++static inline int sort_keys_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: ++ (int) l->needs_whiteout - (int) r->needs_whiteout; ++} ++ ++unsigned bch2_sort_keys(struct bkey_packed *dst, ++ struct sort_iter *iter, ++ bool filter_whiteouts) ++{ ++ const struct bkey_format *f = &iter->b->format; ++ struct bkey_packed *in, *next, *out = dst; ++ ++ sort_iter_sort(iter, sort_keys_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_keys_cmp))) { ++ if (bkey_whiteout(in) && ++ (filter_whiteouts || !in->needs_whiteout)) ++ continue; ++ ++ if (bkey_whiteout(in) && ++ (next = 
sort_iter_peek(iter)) && ++ !bkey_cmp_packed(iter->b, in, next)) { ++ BUG_ON(in->needs_whiteout && ++ next->needs_whiteout); ++ /* ++ * XXX racy, called with read lock from write path ++ * ++ * leads to spurious BUG_ON() in bkey_unpack_key() in ++ * debug mode ++ */ ++ next->needs_whiteout |= in->needs_whiteout; ++ continue; ++ } ++ ++ if (bkey_whiteout(in)) { ++ memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); ++ set_bkeyp_val_u64s(f, out, 0); ++ } else { ++ bkey_copy(out, in); ++ } ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} ++ ++static inline int sort_extents_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ (int) bkey_deleted(l) - (int) bkey_deleted(r); ++} ++ ++unsigned bch2_sort_extents(struct bkey_packed *dst, ++ struct sort_iter *iter, ++ bool filter_whiteouts) ++{ ++ struct bkey_packed *in, *out = dst; ++ ++ sort_iter_sort(iter, sort_extents_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_extents_cmp))) { ++ if (bkey_deleted(in)) ++ continue; ++ ++ if (bkey_whiteout(in) && ++ (filter_whiteouts || !in->needs_whiteout)) ++ continue; ++ ++ bkey_copy(out, in); ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} ++ ++static inline int sort_key_whiteouts_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r); ++} ++ ++unsigned bch2_sort_key_whiteouts(struct bkey_packed *dst, ++ struct sort_iter *iter) ++{ ++ struct bkey_packed *in, *out = dst; ++ ++ sort_iter_sort(iter, sort_key_whiteouts_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { ++ bkey_copy(out, in); ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} ++ ++static inline int sort_extent_whiteouts_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ struct bkey ul = bkey_unpack_key(b, l); ++ struct bkey ur = bkey_unpack_key(b, r); ++ ++ return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); ++} ++ ++unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, ++ struct sort_iter *iter) ++{ ++ const struct bkey_format *f = &iter->b->format; ++ struct bkey_packed *in, *out = dst; ++ struct bkey_i l, r; ++ bool prev = false, l_packed = false; ++ u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); ++ u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); ++ u64 new_size; ++ ++ max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); ++ ++ sort_iter_sort(iter, sort_extent_whiteouts_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { ++ if (bkey_deleted(in)) ++ continue; ++ ++ EBUG_ON(bkeyp_val_u64s(f, in)); ++ EBUG_ON(in->type != KEY_TYPE_discard); ++ ++ r.k = bkey_unpack_key(iter->b, in); ++ ++ if (prev && ++ bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { ++ if (bkey_cmp(l.k.p, r.k.p) >= 0) ++ continue; ++ ++ new_size = l_packed ++ ? 
min(max_packed_size, max_packed_offset - ++ bkey_start_offset(&l.k)) ++ : KEY_SIZE_MAX; ++ ++ new_size = min(new_size, r.k.p.offset - ++ bkey_start_offset(&l.k)); ++ ++ BUG_ON(new_size < l.k.size); ++ ++ bch2_key_resize(&l.k, new_size); ++ ++ if (bkey_cmp(l.k.p, r.k.p) >= 0) ++ continue; ++ ++ bch2_cut_front(l.k.p, &r); ++ } ++ ++ if (prev) { ++ if (!bch2_bkey_pack(out, &l, f)) { ++ BUG_ON(l_packed); ++ bkey_copy(out, &l); ++ } ++ out = bkey_next(out); ++ } ++ ++ l = r; ++ prev = true; ++ l_packed = bkey_packed(in); ++ } ++ ++ if (prev) { ++ if (!bch2_bkey_pack(out, &l, f)) { ++ BUG_ON(l_packed); ++ bkey_copy(out, &l); ++ } ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} +diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h +new file mode 100644 +index 000000000000..397009181eae +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.h +@@ -0,0 +1,69 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_SORT_H ++#define _BCACHEFS_BKEY_SORT_H ++ ++struct btree_node_iter_large { ++ u16 used; ++ ++ struct btree_node_iter_set data[MAX_BSETS]; ++}; ++ ++void bch2_btree_node_iter_large_push(struct btree_node_iter_large *, ++ struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++ ++struct sort_iter { ++ struct btree *b; ++ unsigned used; ++ ++ struct sort_iter_set { ++ struct bkey_packed *k, *end; ++ } data[MAX_BSETS + 1]; ++}; ++ ++static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ iter->b = b; ++} ++ ++static inline void sort_iter_add(struct sort_iter *iter, ++ struct bkey_packed *k, ++ struct bkey_packed *end) ++{ ++ BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); ++ ++ if (k != end) ++ iter->data[iter->used++] = (struct sort_iter_set) { k, end }; ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bset *, struct btree *, ++ struct btree_node_iter_large *); ++struct btree_nr_keys ++bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, ++ struct btree *, ++ struct btree_node_iter_large *); ++ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *, struct btree *, ++ struct btree_node_iter *, ++ struct bkey_format *, bool); ++struct btree_nr_keys ++bch2_sort_repack_merge(struct bch_fs *, ++ struct bset *, struct btree *, ++ struct btree_node_iter *, ++ struct bkey_format *, bool); ++ ++unsigned bch2_sort_keys(struct bkey_packed *, ++ struct sort_iter *, bool); ++unsigned bch2_sort_extents(struct bkey_packed *, ++ struct sort_iter *, bool); ++ ++unsigned bch2_sort_key_whiteouts(struct bkey_packed *, ++ struct sort_iter *); ++unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, ++ struct sort_iter *); ++ ++#endif /* _BCACHEFS_BKEY_SORT_H */ +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +new file mode 100644 +index 000000000000..ff9465750528 +--- /dev/null ++++ b/fs/bcachefs/bset.c +@@ -0,0 +1,1876 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for working with individual keys, and sorted sets of keys with in a ++ * btree node ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "bset.h" ++#include "eytzinger.h" ++#include "util.h" ++ ++#include ++#include ++#include ++#include ++ ++/* hack.. 
*/ ++#include "alloc_types.h" ++#include ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, ++ struct btree *); ++ ++static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) ++{ ++ unsigned n = ARRAY_SIZE(iter->data); ++ ++ while (n && __btree_node_iter_set_end(iter, n - 1)) ++ --n; ++ ++ return n; ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) ++{ ++ unsigned offset = __btree_node_key_to_offset(b, k); ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (offset <= t->end_offset) { ++ EBUG_ON(offset < btree_bkey_first_offset(t)); ++ return t; ++ } ++ ++ BUG(); ++} ++ ++/* ++ * There are never duplicate live keys in the btree - but including keys that ++ * have been flagged as deleted (and will be cleaned up later) we _will_ see ++ * duplicates. ++ * ++ * Thus the sort order is: usual key comparison first, but for keys that compare ++ * equal the deleted key(s) come first, and the (at most one) live version comes ++ * last. ++ * ++ * The main reason for this is insertion: to handle overwrites, we first iterate ++ * over keys that compare equal to our insert key, and then insert immediately ++ * prior to the first key greater than the key we're inserting - our insert ++ * position will be after all keys that compare equal to our insert key, which ++ * by the time we actually do the insert will all be deleted. ++ */ ++ ++void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) ++{ ++ struct bkey_packed *_k, *_n; ++ struct bkey k, n; ++ char buf[120]; ++ ++ if (!i->u64s) ++ return; ++ ++ for (_k = i->start, k = bkey_unpack_key(b, _k); ++ _k < vstruct_last(i); ++ _k = _n, k = n) { ++ _n = bkey_next(_k); ++ ++ bch2_bkey_to_text(&PBUF(buf), &k); ++ printk(KERN_ERR "block %u key %5u: %s\n", set, ++ __btree_node_key_to_offset(b, _k), buf); ++ ++ if (_n == vstruct_last(i)) ++ continue; ++ ++ n = bkey_unpack_key(b, _n); ++ ++ if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) { ++ printk(KERN_ERR "Key skipped backwards\n"); ++ continue; ++ } ++ ++ /* ++ * Weird check for duplicate non extent keys: extents are ++ * deleted iff they have 0 size, so if it has zero size and it's ++ * not deleted these aren't extents: ++ */ ++ if (((!k.size && !bkey_deleted(&k)) || ++ (!n.size && !bkey_deleted(&n))) && ++ !bkey_deleted(&k) && ++ !bkey_cmp(n.p, k.p)) ++ printk(KERN_ERR "Duplicate keys\n"); ++ } ++} ++ ++void bch2_dump_btree_node(struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ console_lock(); ++ for_each_bset(b, t) ++ bch2_dump_bset(b, bset(b, t), t - b->set); ++ console_unlock(); ++} ++ ++void bch2_dump_btree_node_iter(struct btree *b, ++ struct btree_node_iter *iter) ++{ ++ struct btree_node_iter_set *set; ++ ++ printk(KERN_ERR "btree node iter with %u/%u sets:\n", ++ __btree_node_iter_used(iter), b->nsets); ++ ++ btree_node_iter_for_each(iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk = bkey_unpack_key(b, k); ++ char buf[100]; ++ ++ bch2_bkey_to_text(&PBUF(buf), &uk); ++ printk(KERN_ERR "set %zu key %u: %s\n", ++ t - b->set, set->k, buf); ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ struct bset_tree *t; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr = { 0 }; ++ ++ for_each_bset(b, t) ++ for (k = btree_bkey_first(b, t); ++ k != btree_bkey_last(b, t); ++ k = bkey_next(k)) ++ if (!bkey_whiteout(k)) ++ btree_keys_account_key_add(&nr, t - b->set, k); ++ 
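++	/*
++	 * The counts we just recomputed from scratch must match the counts
++	 * cached in b->nr:
++	 */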
++ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); ++} ++ ++static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, ++ struct btree *b) ++{ ++ struct btree_node_iter iter = *_iter; ++ const struct bkey_packed *k, *n; ++ ++ k = bch2_btree_node_iter_peek_all(&iter, b); ++ __bch2_btree_node_iter_advance(&iter, b); ++ n = bch2_btree_node_iter_peek_all(&iter, b); ++ ++ bkey_unpack_key(b, k); ++ ++ if (n && ++ bkey_iter_cmp(b, k, n) > 0) { ++ struct btree_node_iter_set *set; ++ struct bkey ku = bkey_unpack_key(b, k); ++ struct bkey nu = bkey_unpack_key(b, n); ++ char buf1[80], buf2[80]; ++ ++ bch2_dump_btree_node(b); ++ bch2_bkey_to_text(&PBUF(buf1), &ku); ++ bch2_bkey_to_text(&PBUF(buf2), &nu); ++ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", ++ buf1, buf2); ++ printk(KERN_ERR "iter was:"); ++ ++ btree_node_iter_for_each(_iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ printk(" [%zi %zi]", t - b->set, ++ k->_data - bset(b, t)->_data); ++ } ++ panic("\n"); ++ } ++} ++ ++void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct btree_node_iter_set *set, *s2; ++ struct bkey_packed *k, *p; ++ struct bset_tree *t; ++ ++ if (bch2_btree_node_iter_end(iter)) ++ return; ++ ++ /* Verify no duplicates: */ ++ btree_node_iter_for_each(iter, set) ++ btree_node_iter_for_each(iter, s2) ++ BUG_ON(set != s2 && set->end == s2->end); ++ ++ /* Verify that set->end is correct: */ ++ btree_node_iter_for_each(iter, set) { ++ for_each_bset(b, t) ++ if (set->end == t->end_offset) ++ goto found; ++ BUG(); ++found: ++ BUG_ON(set->k < btree_bkey_first_offset(t) || ++ set->k >= t->end_offset); ++ } ++ ++ /* Verify iterator is sorted: */ ++ btree_node_iter_for_each(iter, set) ++ BUG_ON(set != iter->data && ++ btree_node_iter_cmp(b, set[-1], set[0]) > 0); ++ ++ k = bch2_btree_node_iter_peek_all(iter, b); ++ ++ for_each_bset(b, t) { ++ if (iter->data[0].end == t->end_offset) ++ continue; ++ ++ p = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ ++ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); ++ } ++} ++ ++void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, ++ struct bkey_packed *insert, unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); ++ struct bkey_packed *next = (void *) (where->_data + clobber_u64s); ++#if 0 ++ BUG_ON(prev && ++ bkey_iter_cmp(b, prev, insert) > 0); ++#else ++ if (prev && ++ bkey_iter_cmp(b, prev, insert) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, prev); ++ struct bkey k2 = bkey_unpack_key(b, insert); ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_dump_btree_node(b); ++ bch2_bkey_to_text(&PBUF(buf1), &k1); ++ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ ++ panic("prev > insert:\n" ++ "prev key %5u %s\n" ++ "insert key %5u %s\n", ++ __btree_node_key_to_offset(b, prev), buf1, ++ __btree_node_key_to_offset(b, insert), buf2); ++ } ++#endif ++#if 0 ++ BUG_ON(next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0); ++#else ++ if (next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, insert); ++ struct bkey k2 = bkey_unpack_key(b, next); ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_dump_btree_node(b); ++ bch2_bkey_to_text(&PBUF(buf1), &k1); ++ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ ++ panic("insert > next:\n" ++ "insert key %5u %s\n" ++ "next key %5u %s\n", ++ 
__btree_node_key_to_offset(b, insert), buf1, ++ __btree_node_key_to_offset(b, next), buf2); ++ } ++#endif ++} ++ ++#else ++ ++static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, ++ struct btree *b) {} ++ ++#endif ++ ++/* Auxiliary search trees */ ++ ++#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) ++#define BFLOAT_FAILED_PREV (U8_MAX - 1) ++#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2) ++#define BFLOAT_FAILED (U8_MAX - 2) ++ ++#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) ++ ++struct bkey_float { ++ u8 exponent; ++ u8 key_offset; ++ union { ++ u32 mantissa32; ++ struct { ++ u16 mantissa16; ++ u16 _pad; ++ }; ++ }; ++} __packed; ++ ++#define BFLOAT_32BIT_NR 32U ++ ++static unsigned bkey_float_byte_offset(unsigned idx) ++{ ++ int d = (idx - BFLOAT_32BIT_NR) << 1; ++ ++ d &= ~(d >> 31); ++ ++ return idx * 6 - d; ++} ++ ++struct ro_aux_tree { ++ struct bkey_float _d[0]; ++}; ++ ++struct rw_aux_tree { ++ u16 offset; ++ struct bpos k; ++}; ++ ++/* ++ * BSET_CACHELINE was originally intended to match the hardware cacheline size - ++ * it used to be 64, but I realized the lookup code would touch slightly less ++ * memory if it was 128. ++ * ++ * It definites the number of bytes (in struct bset) per struct bkey_float in ++ * the auxiliar search tree - when we're done searching the bset_float tree we ++ * have this many bytes left that we do a linear search over. ++ * ++ * Since (after level 5) every level of the bset_tree is on a new cacheline, ++ * we're touching one fewer cacheline in the bset tree in exchange for one more ++ * cacheline in the linear search - but the linear search might stop before it ++ * gets to the second cacheline. ++ */ ++ ++#define BSET_CACHELINE 128 ++ ++/* Space required for the btree node keys */ ++static inline size_t btree_keys_bytes(struct btree *b) ++{ ++ return PAGE_SIZE << b->page_order; ++} ++ ++static inline size_t btree_keys_cachelines(struct btree *b) ++{ ++ return btree_keys_bytes(b) / BSET_CACHELINE; ++} ++ ++static inline size_t btree_aux_data_bytes(struct btree *b) ++{ ++ return btree_keys_cachelines(b) * 8; ++} ++ ++static inline size_t btree_aux_data_u64s(struct btree *b) ++{ ++ return btree_aux_data_bytes(b) / sizeof(u64); ++} ++ ++static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) ++{ ++ BUG_ON(t->aux_data_offset == U16_MAX); ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return t->aux_data_offset; ++ case BSET_RO_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(bkey_float_byte_offset(t->size) + ++ sizeof(u8) * t->size, 8); ++ case BSET_RW_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned bset_aux_tree_buf_start(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return t == b->set ++ ? 
DIV_ROUND_UP(b->unpack_fn_len, 8) ++ : bset_aux_tree_buf_end(t - 1); ++} ++ ++static void *__aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return b->aux_data + t->aux_data_offset * 8; ++} ++ ++static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++static u8 *ro_aux_tree_prev(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); ++} ++ ++static struct bkey_float *bkey_float_get(struct ro_aux_tree *b, ++ unsigned idx) ++{ ++ return (void *) b + bkey_float_byte_offset(idx); ++} ++ ++static struct bkey_float *bkey_float(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned idx) ++{ ++ return bkey_float_get(ro_aux_tree_base(b, t), idx); ++} ++ ++static void bset_aux_tree_verify(struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ if (t->aux_data_offset == U16_MAX) ++ continue; ++ ++ BUG_ON(t != b->set && ++ t[-1].aux_data_offset == U16_MAX); ++ ++ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); ++ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); ++ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); ++ } ++#endif ++} ++ ++/* Memory allocation */ ++ ++void bch2_btree_keys_free(struct btree *b) ++{ ++ vfree(b->aux_data); ++ b->aux_data = NULL; ++} ++ ++int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) ++{ ++ b->page_order = page_order; ++ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); ++ if (!b->aux_data) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) ++{ ++ unsigned i; ++ ++ b->nsets = 0; ++ memset(&b->nr, 0, sizeof(b->nr)); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ b->expensive_debug_checks = expensive_debug_checks; ++#endif ++ for (i = 0; i < MAX_BSETS; i++) ++ b->set[i].data_offset = U16_MAX; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++/* Binary tree stuff for auxiliary search trees */ ++ ++/* ++ * Cacheline/offset <-> bkey pointer arithmetic: ++ * ++ * t->tree is a binary search tree in an array; each node corresponds to a key ++ * in one cacheline in t->set (BSET_CACHELINE bytes). ++ * ++ * This means we don't have to store the full index of the key that a node in ++ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and ++ * then bkey_float->m gives us the offset within that cacheline, in units of 8 ++ * bytes. ++ * ++ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to ++ * make this work. ++ * ++ * To construct the bfloat for an arbitrary key we need to know what the key ++ * immediately preceding it is: we have to check if the two keys differ in the ++ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size ++ * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
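++ *
++ * For example, looking up the key for tree node j: if eytzinger1_to_inorder(j)
++ * says cacheline 5 and bkey_float(b, t, j)->key_offset is 3, then the key
++ * starts 3 * 8 = 24 bytes into that (BSET_CACHELINE sized) cacheline - which
++ * is exactly what tree_to_bkey()/cacheline_to_bkey() compute below.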
++ */ ++ ++static inline void *bset_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline) ++{ ++ return (void *) round_down((unsigned long) btree_bkey_first(b, t), ++ L1_CACHE_BYTES) + ++ cacheline * BSET_CACHELINE; ++} ++ ++static struct bkey_packed *cacheline_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ unsigned offset) ++{ ++ return bset_cacheline(b, t, cacheline) + offset * 8; ++} ++ ++static unsigned bkey_to_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ const struct bkey_packed *k) ++{ ++ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; ++} ++ ++static ssize_t __bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); ++} ++ ++static unsigned bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); ++ ++ EBUG_ON(m > U8_MAX); ++ return m; ++} ++ ++static inline struct bkey_packed *tree_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ return cacheline_to_bkey(b, t, ++ __eytzinger1_to_inorder(j, t->size, t->extra), ++ bkey_float(b, t, j)->key_offset); ++} ++ ++static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; ++ ++ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); ++} ++ ++static struct rw_aux_tree *rw_aux_tree(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++/* ++ * For the write set - the one we're currently inserting keys into - we don't ++ * maintain a full search tree, we just keep a simple lookup table in t->prev. ++ */ ++static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, ++ struct bset_tree *t, ++ unsigned j) ++{ ++ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); ++} ++ ++static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, ++ unsigned j, struct bkey_packed *k) ++{ ++ EBUG_ON(k >= btree_bkey_last(b, t)); ++ ++ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { ++ .offset = __btree_node_key_to_offset(b, k), ++ .k = bkey_unpack_pos(b, k), ++ }; ++} ++ ++static void bch2_bset_verify_rw_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ struct bkey_packed *k = btree_bkey_first(b, t); ++ unsigned j = 0; ++ ++ if (!btree_keys_expensive_checks(b)) ++ return; ++ ++ BUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ BUG_ON(t->size < 1); ++ BUG_ON(rw_aux_to_bkey(b, t, j) != k); ++ ++ goto start; ++ while (1) { ++ if (rw_aux_to_bkey(b, t, j) == k) { ++ BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, ++ bkey_unpack_pos(b, k))); ++start: ++ if (++j == t->size) ++ break; ++ ++ BUG_ON(rw_aux_tree(b, t)[j].offset <= ++ rw_aux_tree(b, t)[j - 1].offset); ++ } ++ ++ k = bkey_next(k); ++ BUG_ON(k >= btree_bkey_last(b, t)); ++ } ++} ++ ++/* returns idx of first entry >= offset: */ ++static unsigned rw_aux_tree_bsearch(struct btree *b, ++ struct bset_tree *t, ++ unsigned offset) ++{ ++ unsigned bset_offs = offset - btree_bkey_first_offset(t); ++ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); ++ unsigned idx = bset_u64s ? 
bset_offs * t->size / bset_u64s : 0; ++ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ EBUG_ON(!t->size); ++ EBUG_ON(idx > t->size); ++ ++ while (idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset) ++ idx++; ++ ++ while (idx && ++ rw_aux_tree(b, t)[idx - 1].offset >= offset) ++ idx--; ++ ++ EBUG_ON(idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset); ++ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); ++ EBUG_ON(idx + 1 < t->size && ++ rw_aux_tree(b, t)[idx].offset == ++ rw_aux_tree(b, t)[idx + 1].offset); ++ ++ return idx; ++} ++ ++static inline unsigned bfloat_mantissa(const struct bkey_float *f, ++ unsigned idx) ++{ ++ return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16; ++} ++ ++static inline void bfloat_mantissa_set(struct bkey_float *f, ++ unsigned idx, unsigned mantissa) ++{ ++ if (idx < BFLOAT_32BIT_NR) ++ f->mantissa32 = mantissa; ++ else ++ f->mantissa16 = mantissa; ++} ++ ++static inline unsigned bkey_mantissa(const struct bkey_packed *k, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++ u64 v; ++ ++ EBUG_ON(!bkey_packed(k)); ++ ++ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); ++ ++ /* ++ * In little endian, we're shifting off low bits (and then the bits we ++ * want are at the low end), in big endian we're shifting off high bits ++ * (and then the bits we want are at the high end, so we shift them ++ * back down): ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ v >>= f->exponent & 7; ++#else ++ v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16); ++#endif ++ return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v; ++} ++ ++static void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) ++{ ++ struct bkey_float *f = bkey_float(b, t, j); ++ struct bkey_packed *m = tree_to_bkey(b, t, j); ++ struct bkey_packed *p = tree_to_prev_bkey(b, t, j); ++ struct bkey_packed *l, *r; ++ unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16; ++ unsigned mantissa; ++ int shift, exponent, high_bit; ++ ++ EBUG_ON(bkey_next(p) != m); ++ ++ if (is_power_of_2(j)) { ++ l = min_key; ++ ++ if (!l->u64s) { ++ if (!bkey_pack_pos(l, b->data->min_key, b)) { ++ struct bkey_i tmp; ++ ++ bkey_init(&tmp.k); ++ tmp.k.p = b->data->min_key; ++ bkey_copy(l, &tmp); ++ } ++ } ++ } else { ++ l = tree_to_prev_bkey(b, t, j >> ffs(j)); ++ ++ EBUG_ON(m < l); ++ } ++ ++ if (is_power_of_2(j + 1)) { ++ r = max_key; ++ ++ if (!r->u64s) { ++ if (!bkey_pack_pos(r, t->max_key, b)) { ++ struct bkey_i tmp; ++ ++ bkey_init(&tmp.k); ++ tmp.k.p = t->max_key; ++ bkey_copy(r, &tmp); ++ } ++ } ++ } else { ++ r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); ++ ++ EBUG_ON(m > r); ++ } ++ ++ /* ++ * for failed bfloats, the lookup code falls back to comparing against ++ * the original key. 
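++	 *
++	 * All the BFLOAT_FAILED_* values assigned below are >= BFLOAT_FAILED,
++	 * which is what bset_search_tree() checks: such exponents make it take
++	 * bset_search_tree_slowpath() and compare the full keys instead.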
++ */ ++ ++ if (!bkey_packed(l) || !bkey_packed(r) || ++ !bkey_packed(p) || !bkey_packed(m) || ++ !b->nr_key_bits) { ++ f->exponent = BFLOAT_FAILED_UNPACKED; ++ return; ++ } ++ ++ /* ++ * The greatest differing bit of l and r is the first bit we must ++ * include in the bfloat mantissa we're creating in order to do ++ * comparisons - that bit always becomes the high bit of ++ * bfloat->mantissa, and thus the exponent we're calculating here is ++ * the position of what will become the low bit in bfloat->mantissa: ++ * ++ * Note that this may be negative - we may be running off the low end ++ * of the key: we handle this later: ++ */ ++ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), ++ min_t(unsigned, bits, b->nr_key_bits) - 1); ++ exponent = high_bit - (bits - 1); ++ ++ /* ++ * Then we calculate the actual shift value, from the start of the key ++ * (k->_data), to get the key bits starting at exponent: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; ++ ++ EBUG_ON(shift + bits > b->format.key_u64s * 64); ++#else ++ shift = high_bit_offset + ++ b->nr_key_bits - ++ exponent - ++ bits; ++ ++ EBUG_ON(shift < KEY_PACKED_BITS_START); ++#endif ++ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); ++ ++ f->exponent = shift; ++ mantissa = bkey_mantissa(m, f, j); ++ ++ /* ++ * If we've got garbage bits, set them to all 1s - it's legal for the ++ * bfloat to compare larger than the original key, but not smaller: ++ */ ++ if (exponent < 0) ++ mantissa |= ~(~0U << -exponent); ++ ++ bfloat_mantissa_set(f, j, mantissa); ++ ++ /* ++ * The bfloat must be able to tell its key apart from the previous key - ++ * if its key and the previous key don't differ in the required bits, ++ * flag as failed - unless the keys are actually equal, in which case ++ * we aren't required to return a specific one: ++ */ ++ if (exponent > 0 && ++ bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) && ++ bkey_cmp_packed(b, p, m)) { ++ f->exponent = BFLOAT_FAILED_PREV; ++ return; ++ } ++ ++ /* ++ * f->mantissa must compare >= the original key - for transitivity with ++ * the comparison in bset_search_tree. If we're dropping set bits, ++ * increment it: ++ */ ++ if (exponent > (int) bch2_bkey_ffs(b, m)) { ++ if (j < BFLOAT_32BIT_NR ++ ? 
f->mantissa32 == U32_MAX ++ : f->mantissa16 == U16_MAX) ++ f->exponent = BFLOAT_FAILED_OVERFLOW; ++ ++ if (j < BFLOAT_32BIT_NR) ++ f->mantissa32++; ++ else ++ f->mantissa16++; ++ } ++} ++ ++/* bytes remaining - only valid for last bset: */ ++static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ bset_aux_tree_verify(b); ++ ++ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); ++} ++ ++static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ unsigned bytes = __bset_tree_capacity(b, t); ++ ++ if (bytes < 7 * BFLOAT_32BIT_NR) ++ return bytes / 7; ++ ++ bytes -= 7 * BFLOAT_32BIT_NR; ++ ++ return BFLOAT_32BIT_NR + bytes / 5; ++} ++ ++static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); ++} ++ ++static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *k; ++ ++ t->size = 1; ++ t->extra = BSET_RW_AUX_TREE_VAL; ++ rw_aux_tree(b, t)[0].offset = ++ __btree_node_key_to_offset(b, btree_bkey_first(b, t)); ++ ++ for (k = btree_bkey_first(b, t); ++ k != btree_bkey_last(b, t); ++ k = bkey_next(k)) { ++ if (t->size == bset_rw_tree_capacity(b, t)) ++ break; ++ ++ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > ++ L1_CACHE_BYTES) ++ rw_aux_tree_set(b, t, t->size++, k); ++ } ++} ++ ++static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); ++ struct bkey_packed min_key, max_key; ++ unsigned j, cacheline = 1; ++ ++ /* signal to make_bfloat() that they're uninitialized: */ ++ min_key.u64s = max_key.u64s = 0; ++ ++ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), ++ bset_ro_tree_capacity(b, t)); ++retry: ++ if (t->size < 2) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ return; ++ } ++ ++ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; ++ ++ /* First we figure out where the first key in each cacheline is */ ++ eytzinger1_for_each(j, t->size) { ++ while (bkey_to_cacheline(b, t, k) < cacheline) ++ prev = k, k = bkey_next(k); ++ ++ if (k >= btree_bkey_last(b, t)) { ++ /* XXX: this path sucks */ ++ t->size--; ++ goto retry; ++ } ++ ++ ro_aux_tree_prev(b, t)[j] = prev->u64s; ++ bkey_float(b, t, j)->key_offset = ++ bkey_to_cacheline_offset(b, t, cacheline++, k); ++ ++ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); ++ EBUG_ON(tree_to_bkey(b, t, j) != k); ++ } ++ ++ while (bkey_next(k) != btree_bkey_last(b, t)) ++ k = bkey_next(k); ++ ++ t->max_key = bkey_unpack_pos(b, k); ++ ++ /* Then we build the tree */ ++ eytzinger1_for_each(j, t->size) ++ make_bfloat(b, t, j, &min_key, &max_key); ++} ++ ++static void bset_alloc_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bset_tree *i; ++ ++ for (i = b->set; i != t; i++) ++ BUG_ON(bset_has_rw_aux_tree(i)); ++ ++ bch2_bset_set_no_aux_tree(b, t); ++ ++ /* round up to next cacheline: */ ++ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), ++ SMP_CACHE_BYTES / sizeof(u64)); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, ++ bool writeable) ++{ ++ if (writeable ++ ? 
bset_has_rw_aux_tree(t) ++ : bset_has_ro_aux_tree(t)) ++ return; ++ ++ bset_alloc_tree(b, t); ++ ++ if (!__bset_tree_capacity(b, t)) ++ return; ++ ++ if (writeable) ++ __build_rw_aux_tree(b, t); ++ else ++ __build_ro_aux_tree(b, t); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_init_first(struct btree *b, struct bset *i) ++{ ++ struct bset_tree *t; ++ ++ BUG_ON(b->nsets); ++ ++ memset(i, 0, sizeof(*i)); ++ get_random_bytes(&i->seq, sizeof(i->seq)); ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++void bch2_bset_init_next(struct bch_fs *c, struct btree *b, ++ struct btree_node_entry *bne) ++{ ++ struct bset *i = &bne->keys; ++ struct bset_tree *t; ++ ++ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); ++ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); ++ BUG_ON(b->nsets >= MAX_BSETS); ++ ++ memset(i, 0, sizeof(*i)); ++ i->seq = btree_bset_first(b)->seq; ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++/* ++ * find _some_ key in the same bset as @k that precedes @k - not necessarily the ++ * immediate predecessor: ++ */ ++static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct bkey_packed *p; ++ unsigned offset; ++ int j; ++ ++ EBUG_ON(k < btree_bkey_first(b, t) || ++ k > btree_bkey_last(b, t)); ++ ++ if (k == btree_bkey_first(b, t)) ++ return NULL; ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ p = btree_bkey_first(b, t); ++ break; ++ case BSET_RO_AUX_TREE: ++ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); ++ ++ do { ++ p = j ? tree_to_bkey(b, t, ++ __inorder_to_eytzinger1(j--, ++ t->size, t->extra)) ++ : btree_bkey_first(b, t); ++ } while (p >= k); ++ break; ++ case BSET_RW_AUX_TREE: ++ offset = __btree_node_key_to_offset(b, k); ++ j = rw_aux_tree_bsearch(b, t, offset); ++ p = j ? rw_aux_to_bkey(b, t, j - 1) ++ : btree_bkey_first(b, t); ++ break; ++ } ++ ++ return p; ++} ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k, ++ unsigned min_key_type) ++{ ++ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; ++ ++ while ((p = __bkey_prev(b, t, k)) && !ret) { ++ for (i = p; i != k; i = bkey_next(i)) ++ if (i->type >= min_key_type) ++ ret = i; ++ ++ k = p; ++ } ++ ++ if (btree_keys_expensive_checks(b)) { ++ BUG_ON(ret >= orig_k); ++ ++ for (i = ret ? 
bkey_next(ret) : btree_bkey_first(b, t); ++ i != orig_k; ++ i = bkey_next(i)) ++ BUG_ON(i->type >= min_key_type); ++ } ++ ++ return ret; ++} ++ ++/* Insert */ ++ ++static void rw_aux_tree_fix_invalidated_key(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ unsigned offset = __btree_node_key_to_offset(b, k); ++ unsigned j = rw_aux_tree_bsearch(b, t, offset); ++ ++ if (j < t->size && ++ rw_aux_tree(b, t)[j].offset == offset) ++ rw_aux_tree_set(b, t, j, k); ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++} ++ ++static void ro_aux_tree_fix_invalidated_key(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct bkey_packed min_key, max_key; ++ unsigned inorder, j; ++ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ /* signal to make_bfloat() that they're uninitialized: */ ++ min_key.u64s = max_key.u64s = 0; ++ ++ if (bkey_next(k) == btree_bkey_last(b, t)) { ++ t->max_key = bkey_unpack_pos(b, k); ++ ++ for (j = 1; j < t->size; j = j * 2 + 1) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ ++ inorder = bkey_to_cacheline(b, t, k); ++ ++ if (inorder && ++ inorder < t->size) { ++ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ ++ if (k == tree_to_bkey(b, t, j)) { ++ /* Fix the node this key corresponds to */ ++ make_bfloat(b, t, j, &min_key, &max_key); ++ ++ /* Children for which this key is the right boundary */ ++ for (j = eytzinger1_left_child(j); ++ j < t->size; ++ j = eytzinger1_right_child(j)) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ } ++ ++ if (inorder + 1 < t->size) { ++ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); ++ ++ if (k == tree_to_prev_bkey(b, t, j)) { ++ make_bfloat(b, t, j, &min_key, &max_key); ++ ++ /* Children for which this key is the left boundary */ ++ for (j = eytzinger1_right_child(j); ++ j < t->size; ++ j = eytzinger1_left_child(j)) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ } ++} ++ ++/** ++ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been ++ * modified, fix any auxiliary search tree by remaking all the nodes in the ++ * auxiliary search tree that @k corresponds to ++ */ ++void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ break; ++ case BSET_RO_AUX_TREE: ++ ro_aux_tree_fix_invalidated_key(b, t, k); ++ break; ++ case BSET_RW_AUX_TREE: ++ rw_aux_tree_fix_invalidated_key(b, t, k); ++ break; ++ } ++} ++ ++static void bch2_bset_fix_lookup_table(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *_where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ int shift = new_u64s - clobber_u64s; ++ unsigned l, j, where = __btree_node_key_to_offset(b, _where); ++ ++ EBUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ /* returns first entry >= where */ ++ l = rw_aux_tree_bsearch(b, t, where); ++ ++ if (!l) /* never delete first entry */ ++ l++; ++ else if (l < t->size && ++ where < t->end_offset && ++ rw_aux_tree(b, t)[l].offset == where) ++ rw_aux_tree_set(b, t, l++, _where); ++ ++ /* l now > where */ ++ ++ for (j = l; ++ j < t->size && ++ rw_aux_tree(b, t)[j].offset < where + clobber_u64s; ++ j++) ++ ; ++ ++ if (j < t->size && ++ rw_aux_tree(b, t)[j].offset + shift == ++ rw_aux_tree(b, t)[l - 1].offset) ++ j++; ++ ++ memmove(&rw_aux_tree(b, t)[l], ++ &rw_aux_tree(b, t)[j], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[j]); ++ 
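++	/*
++	 * entries [l, j) pointed into the range that was just overwritten; the
++	 * memmove above shifted the remaining entries down, now shrink the
++	 * table and fix up the surviving offsets:
++	 */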
t->size -= j - l; ++ ++ for (j = l; j < t->size; j++) ++ rw_aux_tree(b, t)[j].offset += shift; ++ ++ EBUG_ON(l < t->size && ++ rw_aux_tree(b, t)[l].offset == ++ rw_aux_tree(b, t)[l - 1].offset); ++ ++ if (t->size < bset_rw_tree_capacity(b, t) && ++ (l < t->size ++ ? rw_aux_tree(b, t)[l].offset ++ : t->end_offset) - ++ rw_aux_tree(b, t)[l - 1].offset > ++ L1_CACHE_BYTES / sizeof(u64)) { ++ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); ++ struct bkey_packed *end = l < t->size ++ ? rw_aux_to_bkey(b, t, l) ++ : btree_bkey_last(b, t); ++ struct bkey_packed *k = start; ++ ++ while (1) { ++ k = bkey_next(k); ++ if (k == end) ++ break; ++ ++ if ((void *) k - (void *) start >= L1_CACHE_BYTES) { ++ memmove(&rw_aux_tree(b, t)[l + 1], ++ &rw_aux_tree(b, t)[l], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[l]); ++ t->size++; ++ rw_aux_tree_set(b, t, l, k); ++ break; ++ } ++ } ++ } ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_insert(struct btree *b, ++ struct btree_node_iter *iter, ++ struct bkey_packed *where, ++ struct bkey_i *insert, ++ unsigned clobber_u64s) ++{ ++ struct bkey_format *f = &b->format; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bkey_packed packed, *src = bkey_to_packed(insert); ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); ++ ++ if (bch2_bkey_pack_key(&packed, &insert->k, f)) ++ src = &packed; ++ ++ if (!bkey_whiteout(&insert->k)) ++ btree_keys_account_key_add(&b->nr, t - b->set, src); ++ ++ if (src->u64s != clobber_u64s) { ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data + src->u64s; ++ ++ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < ++ (int) clobber_u64s - src->u64s); ++ ++ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); ++ set_btree_bset_end(b, t); ++ } ++ ++ memcpy_u64s(where, src, ++ bkeyp_key_u64s(f, src)); ++ memcpy_u64s(bkeyp_val(f, where), &insert->v, ++ bkeyp_val_u64s(f, src)); ++ ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_bset_delete(struct btree *b, ++ struct bkey_packed *where, ++ unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data; ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ ++ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); ++ ++ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); ++ set_btree_bset_end(b, t); ++ ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); ++} ++ ++/* Lookup */ ++ ++__flatten ++static struct bkey_packed *bset_search_write_set(const struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *packed_search) ++{ ++ unsigned l = 0, r = t->size; ++ ++ while (l + 1 != r) { ++ unsigned m = (l + r) >> 1; ++ ++ if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) ++ l = m; ++ else ++ r = m; ++ } ++ ++ return rw_aux_to_bkey(b, t, l); ++} ++ ++noinline ++static int bset_search_tree_slowpath(const struct btree *b, ++ struct bset_tree *t, struct bpos *search, ++ const struct bkey_packed *packed_search, ++ unsigned n) ++{ ++ return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n), ++ packed_search, search) < 0; ++} ++ ++__flatten ++static struct bkey_packed *bset_search_tree(const struct btree *b, ++ struct 
bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *packed_search) ++{ ++ struct ro_aux_tree *base = ro_aux_tree_base(b, t); ++ struct bkey_float *f = bkey_float_get(base, 1); ++ void *p; ++ unsigned inorder, n = 1; ++ ++ while (1) { ++ if (likely(n << 4 < t->size)) { ++ p = bkey_float_get(base, n << 4); ++ prefetch(p); ++ } else if (n << 3 < t->size) { ++ inorder = __eytzinger1_to_inorder(n, t->size, t->extra); ++ p = bset_cacheline(b, t, inorder); ++#ifdef CONFIG_X86_64 ++ asm(".intel_syntax noprefix;" ++ "prefetcht0 [%0 - 127 + 64 * 0];" ++ "prefetcht0 [%0 - 127 + 64 * 1];" ++ "prefetcht0 [%0 - 127 + 64 * 2];" ++ "prefetcht0 [%0 - 127 + 64 * 3];" ++ ".att_syntax prefix;" ++ : ++ : "r" (p + 127)); ++#else ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ prefetch(p + L1_CACHE_BYTES * 3); ++#endif ++ } else if (n >= t->size) ++ break; ++ ++ f = bkey_float_get(base, n); ++ ++ if (packed_search && ++ likely(f->exponent < BFLOAT_FAILED)) ++ n = n * 2 + (bfloat_mantissa(f, n) < ++ bkey_mantissa(packed_search, f, n)); ++ else ++ n = n * 2 + bset_search_tree_slowpath(b, t, ++ search, packed_search, n); ++ } while (n < t->size); ++ ++ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); ++ ++ /* ++ * n would have been the node we recursed to - the low bit tells us if ++ * we recursed left or recursed right. ++ */ ++ if (n & 1) { ++ return cacheline_to_bkey(b, t, inorder, f->key_offset); ++ } else { ++ if (--inorder) { ++ n = eytzinger1_prev(n >> 1, t->size); ++ f = bkey_float_get(base, n); ++ return cacheline_to_bkey(b, t, inorder, f->key_offset); ++ } else ++ return btree_bkey_first(b, t); ++ } ++} ++ ++/* ++ * Returns the first key greater than or equal to @search ++ */ ++__always_inline __flatten ++static struct bkey_packed *bch2_bset_search(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search) ++{ ++ struct bkey_packed *m; ++ ++ /* ++ * First, we search for a cacheline, then lastly we do a linear search ++ * within that cacheline. ++ * ++ * To search for the cacheline, there's three different possibilities: ++ * * The set is too small to have a search tree, so we just do a linear ++ * search over the whole set. ++ * * The set is the one we're currently inserting into; keeping a full ++ * auxiliary search tree up to date would be too expensive, so we ++ * use a much simpler lookup table to do a binary search - ++ * bset_search_write_set(). 
++ * * Or we use the auxiliary search tree we constructed earlier - ++ * bset_search_tree() ++ */ ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ m = btree_bkey_first(b, t); ++ break; ++ case BSET_RW_AUX_TREE: ++ m = bset_search_write_set(b, t, search, lossy_packed_search); ++ break; ++ case BSET_RO_AUX_TREE: ++ /* ++ * Each node in the auxiliary search tree covers a certain range ++ * of bits, and keys above and below the set it covers might ++ * differ outside those bits - so we have to special case the ++ * start and end - handle that here: ++ */ ++ ++ if (bkey_cmp(*search, t->max_key) > 0) ++ return btree_bkey_last(b, t); ++ ++ m = bset_search_tree(b, t, search, lossy_packed_search); ++ break; ++ } ++ ++ if (lossy_packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, ++ m) > 0) ++ m = bkey_next(m); ++ ++ if (!packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_pos_cmp(b, search, m) > 0) ++ m = bkey_next(m); ++ ++ if (btree_keys_expensive_checks(b)) { ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); ++ ++ BUG_ON(prev && ++ bkey_iter_cmp_p_or_unp(b, search, packed_search, ++ prev) <= 0); ++ } ++ ++ return m; ++} ++ ++/* Btree node iterator */ ++ ++static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ if (k != end) { ++ struct btree_node_iter_set *pos; ++ ++ btree_node_iter_for_each(iter, pos) ++ ; ++ ++ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); ++ *pos = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++} ++ ++void bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ __bch2_btree_node_iter_push(iter, b, k, end); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++noinline __flatten __attribute__((cold)) ++static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bset_tree *t; ++ ++ trace_bkey_pack_pos_fail(search); ++ ++ for_each_bset(b, t) ++ __bch2_btree_node_iter_push(iter, b, ++ bch2_bset_search(b, t, search, NULL, NULL), ++ btree_bkey_last(b, t)); ++ ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++/** ++ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a ++ * given position ++ * ++ * Main entry point to the lookup code for individual btree nodes: ++ * ++ * NOTE: ++ * ++ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate ++ * keys. This doesn't matter for most code, but it does matter for lookups. ++ * ++ * Some adjacent keys with a string of equal keys: ++ * i j k k k k l m ++ * ++ * If you search for k, the lookup code isn't guaranteed to return you any ++ * specific k. The lookup code is conceptually doing a binary search and ++ * iterating backwards is very expensive so if the pivot happens to land at the ++ * last k that's what you'll get. ++ * ++ * This works out ok, but it's something to be aware of: ++ * ++ * - For non extents, we guarantee that the live key comes last - see ++ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't ++ * see will only be deleted keys you don't care about. ++ * ++ * - For extents, deleted keys sort last (see the comment at the top of this ++ * file). 
But when you're searching for extents, you actually want the first ++ * key strictly greater than your search key - an extent that compares equal ++ * to the search key is going to have 0 sectors after the search key. ++ * ++ * But this does mean that we can't just search for ++ * bkey_successor(start_of_range) to get the first extent that overlaps with ++ * the range we want - if we're unlucky and there's an extent that ends ++ * exactly where we searched, then there could be a deleted key at the same ++ * position and we'd get that when we search instead of the preceding extent ++ * we needed. ++ * ++ * So we've got to search for start_of_range, then after the lookup iterate ++ * past any extents that compare equal to the position we searched for. ++ */ ++__flatten ++void bch2_btree_node_iter_init(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bset_tree *t; ++ struct bkey_packed p, *packed_search = NULL; ++ struct btree_node_iter_set *pos = iter->data; ++ ++ EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); ++ bset_aux_tree_verify(b); ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { ++ case BKEY_PACK_POS_EXACT: ++ packed_search = &p; ++ break; ++ case BKEY_PACK_POS_SMALLER: ++ packed_search = NULL; ++ break; ++ case BKEY_PACK_POS_FAIL: ++ btree_node_iter_init_pack_failed(iter, b, search); ++ return; ++ } ++ ++ for_each_bset(b, t) { ++ struct bkey_packed *k = bch2_bset_search(b, t, search, ++ packed_search, &p); ++ struct bkey_packed *end = btree_bkey_last(b, t); ++ ++ if (k != end) ++ *pos++ = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++ ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ for_each_bset(b, t) ++ __bch2_btree_node_iter_push(iter, b, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) ++ return __btree_node_offset_to_key(b, set->k); ++ ++ return btree_bkey_last(b, t); ++} ++ ++static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned first) ++{ ++ bool ret; ++ ++ if ((ret = (btree_node_iter_cmp(b, ++ iter->data[first], ++ iter->data[first + 1]) > 0))) ++ swap(iter->data[first], iter->data[first + 1]); ++ return ret; ++} ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ /* unrolled bubble sort: */ ++ ++ if (!__btree_node_iter_set_end(iter, 2)) { ++ btree_node_iter_sort_two(iter, b, 0); ++ btree_node_iter_sort_two(iter, b, 1); ++ } ++ ++ if (!__btree_node_iter_set_end(iter, 1)) ++ btree_node_iter_sort_two(iter, b, 0); ++} ++ ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, ++ struct btree_node_iter_set *set) ++{ ++ struct btree_node_iter_set *last = ++ iter->data + ARRAY_SIZE(iter->data) - 1; ++ ++ memmove(&set[0], &set[1], (void *) last - (void *) set); ++ *last = (struct btree_node_iter_set) { 0, 0 }; ++} ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ iter->data->k += __bch2_btree_node_iter_peek_all(iter, 
b)->u64s; ++ ++ EBUG_ON(iter->data->k > iter->data->end); ++ ++ if (unlikely(__btree_node_iter_set_end(iter, 0))) { ++ bch2_btree_node_iter_set_drop(iter, iter->data); ++ return; ++ } ++ ++ if (__btree_node_iter_set_end(iter, 1)) ++ return; ++ ++ if (!btree_node_iter_sort_two(iter, b, 0)) ++ return; ++ ++ if (__btree_node_iter_set_end(iter, 2)) ++ return; ++ ++ btree_node_iter_sort_two(iter, b, 1); ++} ++ ++void bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ if (btree_keys_expensive_checks(b)) { ++ bch2_btree_node_iter_verify(iter, b); ++ bch2_btree_node_iter_next_check(iter, b); ++ } ++ ++ __bch2_btree_node_iter_advance(iter, b); ++} ++ ++/* ++ * Expensive: ++ */ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bkey_packed *k, *prev = NULL; ++ struct btree_node_iter_set *set; ++ struct bset_tree *t; ++ unsigned end = 0; ++ ++ bch2_btree_node_iter_verify(iter, b); ++ ++ for_each_bset(b, t) { ++ k = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ if (k && ++ (!prev || bkey_iter_cmp(b, k, prev) > 0)) { ++ prev = k; ++ end = t->end_offset; ++ } ++ } ++ ++ if (!prev) ++ return NULL; ++ ++ /* ++ * We're manually memmoving instead of just calling sort() to ensure the ++ * prev we picked ends up in slot 0 - sort won't necessarily put it ++ * there because of duplicate deleted keys: ++ */ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == end) ++ goto found; ++ ++ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); ++found: ++ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); ++ ++ memmove(&iter->data[1], ++ &iter->data[0], ++ (void *) set - (void *) &iter->data[0]); ++ ++ iter->data[0].k = __btree_node_key_to_offset(b, prev); ++ iter->data[0].end = end; ++ ++ bch2_btree_node_iter_verify(iter, b); ++ return prev; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned min_key_type) ++{ ++ struct bkey_packed *prev; ++ ++ do { ++ prev = bch2_btree_node_iter_prev_all(iter, b); ++ } while (prev && prev->type < min_key_type); ++ ++ return prev; ++} ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bkey *u) ++{ ++ struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); ++ ++ return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; ++} ++ ++/* Mergesort */ ++ ++void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ enum bset_aux_tree_type type = bset_aux_tree_type(t); ++ size_t j; ++ ++ stats->sets[type].nr++; ++ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * ++ sizeof(u64); ++ ++ if (bset_has_ro_aux_tree(t)) { ++ stats->floats += t->size - 1; ++ ++ for (j = 1; j < t->size; j++) ++ switch (bkey_float(b, t, j)->exponent) { ++ case BFLOAT_FAILED_UNPACKED: ++ stats->failed_unpacked++; ++ break; ++ case BFLOAT_FAILED_PREV: ++ stats->failed_prev++; ++ break; ++ case BFLOAT_FAILED_OVERFLOW: ++ stats->failed_overflow++; ++ break; ++ } ++ } ++ } ++} ++ ++void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, ++ struct bkey_packed *k) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey_packed *l, *r, *p; ++ struct bkey uk, up; ++ char buf1[200], buf2[200]; ++ unsigned j, inorder; ++ ++ if (out->pos != out->end) ++ *out->pos = '\0'; ++ ++ if (!bset_has_ro_aux_tree(t)) ++ return; ++ ++ inorder = bkey_to_cacheline(b, t, k); ++ if (!inorder || inorder >= t->size) ++ return; ++ ++ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ if (k != tree_to_bkey(b, t, j)) ++ return; ++ ++ switch (bkey_float(b, t, j)->exponent) { ++ case BFLOAT_FAILED_UNPACKED: ++ uk = bkey_unpack_key(b, k); ++ pr_buf(out, ++ " failed unpacked at depth %u\n" ++ "\t%llu:%llu\n", ++ ilog2(j), ++ uk.p.inode, uk.p.offset); ++ break; ++ case BFLOAT_FAILED_PREV: ++ p = tree_to_prev_bkey(b, t, j); ++ l = is_power_of_2(j) ++ ? btree_bkey_first(b, t) ++ : tree_to_prev_bkey(b, t, j >> ffs(j)); ++ r = is_power_of_2(j + 1) ++ ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) ++ : tree_to_bkey(b, t, j >> (ffz(j) + 1)); ++ ++ up = bkey_unpack_key(b, p); ++ uk = bkey_unpack_key(b, k); ++ bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); ++ bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); ++ ++ pr_buf(out, ++ " failed prev at depth %u\n" ++ "\tkey starts at bit %u but first differing bit at %u\n" ++ "\t%llu:%llu\n" ++ "\t%llu:%llu\n" ++ "\t%s\n" ++ "\t%s\n", ++ ilog2(j), ++ bch2_bkey_greatest_differing_bit(b, l, r), ++ bch2_bkey_greatest_differing_bit(b, p, k), ++ uk.p.inode, uk.p.offset, ++ up.p.inode, up.p.offset, ++ buf1, buf2); ++ break; ++ case BFLOAT_FAILED_OVERFLOW: ++ uk = bkey_unpack_key(b, k); ++ pr_buf(out, ++ " failed overflow at depth %u\n" ++ "\t%llu:%llu\n", ++ ilog2(j), ++ uk.p.inode, uk.p.offset); ++ break; ++ } ++} +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +new file mode 100644 +index 000000000000..643bd9e8bc4d +--- /dev/null ++++ b/fs/bcachefs/bset.h +@@ -0,0 +1,624 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BSET_H ++#define _BCACHEFS_BSET_H ++ ++#include ++#include ++ ++#include "bcachefs_format.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "util.h" /* for time_stats */ ++#include "vstructs.h" ++ ++/* ++ * BKEYS: ++ * ++ * A bkey contains a key, a size field, a variable number of pointers, and some ++ * ancillary flag bits. ++ * ++ * We use two different functions for validating bkeys, bkey_invalid and ++ * bkey_deleted(). ++ * ++ * The one exception to the rule that ptr_invalid() filters out invalid keys is ++ * that it also filters out keys of size 0 - these are keys that have been ++ * completely overwritten. 
It'd be safe to delete these in memory while leaving ++ * them on disk, just unnecessary work - so we filter them out when resorting ++ * instead. ++ * ++ * We can't filter out stale keys when we're resorting, because garbage ++ * collection needs to find them to ensure bucket gens don't wrap around - ++ * unless we're rewriting the btree node those stale keys still exist on disk. ++ * ++ * We also implement functions here for removing some number of sectors from the ++ * front or the back of a bkey - this is mainly used for fixing overlapping ++ * extents, by removing the overlapping sectors from the older key. ++ * ++ * BSETS: ++ * ++ * A bset is an array of bkeys laid out contiguously in memory in sorted order, ++ * along with a header. A btree node is made up of a number of these, written at ++ * different times. ++ * ++ * There could be many of them on disk, but we never allow there to be more than ++ * 4 in memory - we lazily resort as needed. ++ * ++ * We implement code here for creating and maintaining auxiliary search trees ++ * (described below) for searching an individial bset, and on top of that we ++ * implement a btree iterator. ++ * ++ * BTREE ITERATOR: ++ * ++ * Most of the code in bcache doesn't care about an individual bset - it needs ++ * to search entire btree nodes and iterate over them in sorted order. ++ * ++ * The btree iterator code serves both functions; it iterates through the keys ++ * in a btree node in sorted order, starting from either keys after a specific ++ * point (if you pass it a search key) or the start of the btree node. ++ * ++ * AUXILIARY SEARCH TREES: ++ * ++ * Since keys are variable length, we can't use a binary search on a bset - we ++ * wouldn't be able to find the start of the next key. But binary searches are ++ * slow anyways, due to terrible cache behaviour; bcache originally used binary ++ * searches and that code topped out at under 50k lookups/second. ++ * ++ * So we need to construct some sort of lookup table. Since we only insert keys ++ * into the last (unwritten) set, most of the keys within a given btree node are ++ * usually in sets that are mostly constant. We use two different types of ++ * lookup tables to take advantage of this. ++ * ++ * Both lookup tables share in common that they don't index every key in the ++ * set; they index one key every BSET_CACHELINE bytes, and then a linear search ++ * is used for the rest. ++ * ++ * For sets that have been written to disk and are no longer being inserted ++ * into, we construct a binary search tree in an array - traversing a binary ++ * search tree in an array gives excellent locality of reference and is very ++ * fast, since both children of any node are adjacent to each other in memory ++ * (and their grandchildren, and great grandchildren...) - this means ++ * prefetching can be used to great effect. ++ * ++ * It's quite useful performance wise to keep these nodes small - not just ++ * because they're more likely to be in L2, but also because we can prefetch ++ * more nodes on a single cacheline and thus prefetch more iterations in advance ++ * when traversing this tree. ++ * ++ * Nodes in the auxiliary search tree must contain both a key to compare against ++ * (we don't want to fetch the key from the set, that would defeat the purpose), ++ * and a pointer to the key. We use a few tricks to compress both of these. ++ * ++ * To compress the pointer, we take advantage of the fact that one node in the ++ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. 
We have ++ * a function (to_inorder()) that takes the index of a node in a binary tree and ++ * returns what its index would be in an inorder traversal, so we only have to ++ * store the low bits of the offset. ++ * ++ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To ++ * compress that, we take advantage of the fact that when we're traversing the ++ * search tree at every iteration we know that both our search key and the key ++ * we're looking for lie within some range - bounded by our previous ++ * comparisons. (We special case the start of a search so that this is true even ++ * at the root of the tree). ++ * ++ * So we know the key we're looking for is between a and b, and a and b don't ++ * differ higher than bit 50, we don't need to check anything higher than bit ++ * 50. ++ * ++ * We don't usually need the rest of the bits, either; we only need enough bits ++ * to partition the key range we're currently checking. Consider key n - the ++ * key our auxiliary search tree node corresponds to, and key p, the key ++ * immediately preceding n. The lowest bit we need to store in the auxiliary ++ * search tree is the highest bit that differs between n and p. ++ * ++ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the ++ * comparison. But we'd really like our nodes in the auxiliary search tree to be ++ * of fixed size. ++ * ++ * The solution is to make them fixed size, and when we're constructing a node ++ * check if p and n differed in the bits we needed them to. If they don't we ++ * flag that node, and when doing lookups we fallback to comparing against the ++ * real key. As long as this doesn't happen to often (and it seems to reliably ++ * happen a bit less than 1% of the time), we win - even on failures, that key ++ * is then more likely to be in cache than if we were doing binary searches all ++ * the way, since we're touching so much less memory. ++ * ++ * The keys in the auxiliary search tree are stored in (software) floating ++ * point, with an exponent and a mantissa. The exponent needs to be big enough ++ * to address all the bits in the original key, but the number of bits in the ++ * mantissa is somewhat arbitrary; more bits just gets us fewer failures. ++ * ++ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys ++ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. ++ * We need one node per 128 bytes in the btree node, which means the auxiliary ++ * search trees take up 3% as much memory as the btree itself. ++ * ++ * Constructing these auxiliary search trees is moderately expensive, and we ++ * don't want to be constantly rebuilding the search tree for the last set ++ * whenever we insert another key into it. For the unwritten set, we use a much ++ * simpler lookup table - it's just a flat array, so index i in the lookup table ++ * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing ++ * within each byte range works the same as with the auxiliary search trees. ++ * ++ * These are much easier to keep up to date when we insert a key - we do it ++ * somewhat lazily; when we shift a key up we usually just increment the pointer ++ * to it, only when it would overflow do we go to the trouble of finding the ++ * first key in that range of bytes again. 
++ */ ++ ++extern bool bch2_expensive_debug_checks; ++ ++static inline bool btree_keys_expensive_checks(const struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ return bch2_expensive_debug_checks || *b->expensive_debug_checks; ++#else ++ return false; ++#endif ++} ++ ++enum bset_aux_tree_type { ++ BSET_NO_AUX_TREE, ++ BSET_RO_AUX_TREE, ++ BSET_RW_AUX_TREE, ++}; ++ ++#define BSET_TREE_NR_TYPES 3 ++ ++#define BSET_NO_AUX_TREE_VAL (U16_MAX) ++#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) ++ ++static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) ++{ ++ switch (t->extra) { ++ case BSET_NO_AUX_TREE_VAL: ++ EBUG_ON(t->size); ++ return BSET_NO_AUX_TREE; ++ case BSET_RW_AUX_TREE_VAL: ++ EBUG_ON(!t->size); ++ return BSET_RW_AUX_TREE; ++ default: ++ EBUG_ON(!t->size); ++ return BSET_RO_AUX_TREE; ++ } ++} ++ ++typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); ++ ++static inline void ++__bkey_unpack_key_format_checked(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ { ++ compiled_unpack_fn unpack_fn = b->aux_data; ++ unpack_fn(dst, src); ++ ++ if (btree_keys_expensive_checks(b)) { ++ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); ++ ++ /* ++ * hack around a harmless race when compacting whiteouts ++ * for a write: ++ */ ++ dst2.needs_whiteout = dst->needs_whiteout; ++ ++ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); ++ } ++ } ++#else ++ *dst = __bch2_bkey_unpack_key(&b->format, src); ++#endif ++} ++ ++static inline struct bkey ++bkey_unpack_key_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ struct bkey dst; ++ ++ __bkey_unpack_key_format_checked(b, &dst, src); ++ return dst; ++} ++ ++static inline void __bkey_unpack_key(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++ if (likely(bkey_packed(src))) ++ __bkey_unpack_key_format_checked(b, dst, src); ++ else ++ *dst = *packed_to_bkey_c(src); ++} ++ ++/** ++ * bkey_unpack_key -- unpack just the key, not the value ++ */ ++static inline struct bkey bkey_unpack_key(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? bkey_unpack_key_format_checked(b, src) ++ : *packed_to_bkey_c(src); ++} ++ ++static inline struct bpos ++bkey_unpack_pos_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ return bkey_unpack_key_format_checked(b, src).p; ++#else ++ return __bkey_unpack_pos(&b->format, src); ++#endif ++} ++ ++static inline struct bpos bkey_unpack_pos(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? 
bkey_unpack_pos_format_checked(b, src) ++ : packed_to_bkey_c(src)->p; ++} ++ ++/* Disassembled bkeys */ ++ ++static inline struct bkey_s_c bkey_disassemble(struct btree *b, ++ const struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; ++} ++ ++/* non const version: */ ++static inline struct bkey_s __bkey_disassemble(struct btree *b, ++ struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; ++} ++ ++#define for_each_bset(_b, _t) \ ++ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) ++ ++static inline bool bset_has_ro_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; ++} ++ ++static inline bool bset_has_rw_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; ++} ++ ++static inline void bch2_bset_set_no_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ BUG_ON(t < b->set); ++ ++ for (; t < b->set + ARRAY_SIZE(b->set); t++) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ t->aux_data_offset = U16_MAX; ++ } ++} ++ ++static inline void btree_node_set_format(struct btree *b, ++ struct bkey_format f) ++{ ++ int len; ++ ++ b->format = f; ++ b->nr_key_bits = bkey_format_key_bits(&f); ++ ++ len = bch2_compile_bkey_format(&b->format, b->aux_data); ++ BUG_ON(len < 0 || len > U8_MAX); ++ ++ b->unpack_fn_len = len; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++static inline struct bset *bset_next_set(struct btree *b, ++ unsigned block_bytes) ++{ ++ struct bset *i = btree_bset_last(b); ++ ++ EBUG_ON(!is_power_of_2(block_bytes)); ++ ++ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); ++} ++ ++void bch2_btree_keys_free(struct btree *); ++int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t); ++void bch2_btree_keys_init(struct btree *, bool *); ++ ++void bch2_bset_init_first(struct btree *, struct bset *); ++void bch2_bset_init_next(struct bch_fs *, struct btree *, ++ struct btree_node_entry *); ++void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); ++void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); ++ ++void bch2_bset_insert(struct btree *, struct btree_node_iter *, ++ struct bkey_packed *, struct bkey_i *, unsigned); ++void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); ++ ++/* Bkey utility code */ ++ ++/* packed or unpacked */ ++static inline int bkey_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ struct bpos *r) ++{ ++ EBUG_ON(r_packed && !bkey_packed(r_packed)); ++ ++ if (unlikely(!bkey_packed(l))) ++ return bkey_cmp(packed_to_bkey_c(l)->p, *r); ++ ++ if (likely(r_packed)) ++ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); ++ ++ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, ++ struct bkey_packed *, unsigned); ++ ++static inline struct bkey_packed * ++bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ return bch2_bkey_prev_filter(b, t, k, 0); ++} ++ ++static inline struct bkey_packed * ++bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); ++} ++ ++enum 
bch_extent_overlap { ++ BCH_EXTENT_OVERLAP_ALL = 0, ++ BCH_EXTENT_OVERLAP_BACK = 1, ++ BCH_EXTENT_OVERLAP_FRONT = 2, ++ BCH_EXTENT_OVERLAP_MIDDLE = 3, ++}; ++ ++/* Returns how k overlaps with m */ ++static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, ++ const struct bkey *m) ++{ ++ int cmp1 = bkey_cmp(k->p, m->p) < 0; ++ int cmp2 = bkey_cmp(bkey_start_pos(k), ++ bkey_start_pos(m)) > 0; ++ ++ return (cmp1 << 1) + cmp2; ++} ++ ++/* Btree key iteration */ ++ ++void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, ++ struct bpos *); ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, ++ struct btree *, ++ struct bset_tree *); ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *, ++ struct btree_node_iter_set *); ++void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); ++ ++#define btree_node_iter_for_each(_iter, _set) \ ++ for (_set = (_iter)->data; \ ++ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ ++ (_set)->k != (_set)->end; \ ++ _set++) ++ ++static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, ++ unsigned i) ++{ ++ return iter->data[i].k == iter->data[i].end; ++} ++ ++static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) ++{ ++ return __btree_node_iter_set_end(iter, 0); ++} ++ ++/* ++ * When keys compare equal, deleted keys compare first: ++ * ++ * XXX: only need to compare pointers for keys that are both within a ++ * btree_node_iterator - we need to break ties for prev() to work correctly ++ */ ++static inline int bkey_iter_cmp(struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ++ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ++ ?: cmp_int(l, r); ++} ++ ++static inline int btree_node_iter_cmp(struct btree *b, ++ struct btree_node_iter_set l, ++ struct btree_node_iter_set r) ++{ ++ return bkey_iter_cmp(b, ++ __btree_node_offset_to_key(b, l.k), ++ __btree_node_offset_to_key(b, r.k)); ++} ++ ++/* These assume l (the search key) is not a deleted key: */ ++static inline int bkey_iter_pos_cmp(struct btree *b, ++ struct bpos *l, ++ const struct bkey_packed *r) ++{ ++ return -bkey_cmp_left_packed(b, r, l) ++ ?: (int) bkey_deleted(r); ++} ++ ++static inline int bkey_iter_cmp_p_or_unp(struct btree *b, ++ struct bpos *l, ++ const struct bkey_packed *l_packed, ++ const struct bkey_packed *r) ++{ ++ return -bkey_cmp_p_or_unp(b, r, l_packed, l) ++ ?: (int) bkey_deleted(r); ++} ++ ++static inline struct bkey_packed * ++__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ return __btree_node_offset_to_key(b, iter->data->k); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned min_key_type) ++{ ++ while (!bch2_btree_node_iter_end(iter)) { ++ struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); ++ ++ if (k->type >= min_key_type) ++ return k; ++ ++ bch2_btree_node_iter_advance(iter, b); ++ } ++ ++ return NULL; ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ return 
bch2_btree_node_iter_peek_filter(iter, b, 0); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) ++{ ++ return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) ++{ ++ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); ++ ++ if (ret) ++ bch2_btree_node_iter_advance(iter, b); ++ ++ return ret; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, ++ struct btree *, unsigned); ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) ++{ ++ return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); ++} ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, ++ struct btree *, ++ struct bkey *); ++ ++#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ ++ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ ++ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ ++ bch2_btree_node_iter_advance(iter, b)) ++ ++/* Accounting: */ ++ ++static inline void btree_keys_account_key(struct btree_nr_keys *n, ++ unsigned bset, ++ struct bkey_packed *k, ++ int sign) ++{ ++ n->live_u64s += k->u64s * sign; ++ n->bset_u64s[bset] += k->u64s * sign; ++ ++ if (bkey_packed(k)) ++ n->packed_keys += sign; ++ else ++ n->unpacked_keys += sign; ++} ++ ++#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, 1) ++#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, -1) ++ ++#define btree_account_key_add(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) ++#define btree_account_key_drop(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) ++ ++struct bset_stats { ++ struct { ++ size_t nr, bytes; ++ } sets[BSET_TREE_NR_TYPES]; ++ ++ size_t floats; ++ size_t failed_unpacked; ++ size_t failed_prev; ++ size_t failed_overflow; ++}; ++ ++void bch2_btree_keys_stats(struct btree *, struct bset_stats *); ++void bch2_bfloat_to_text(struct printbuf *, struct btree *, ++ struct bkey_packed *); ++ ++/* Debug stuff */ ++ ++void bch2_dump_bset(struct btree *, struct bset *, unsigned); ++void bch2_dump_btree_node(struct btree *); ++void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *); ++void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); ++void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, ++ struct bkey_packed *, unsigned); ++ ++#else ++ ++static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} ++static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) {} ++static inline void bch2_verify_insert_pos(struct btree *b, ++ struct bkey_packed *where, ++ struct bkey_packed *insert, ++ unsigned clobber_u64s) {} ++#endif ++ ++static inline void bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ if (btree_keys_expensive_checks(b)) ++ __bch2_verify_btree_nr_keys(b); ++} ++ ++#endif /* _BCACHEFS_BSET_H */ +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +new file mode 100644 
+index 000000000000..416949512057 +--- /dev/null ++++ b/fs/bcachefs/btree_cache.c +@@ -0,0 +1,934 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "debug.h" ++ ++#include ++#include ++#include ++ ++const char * const bch2_btree_ids[] = { ++#define x(kwd, val, name) name, ++ BCH_BTREE_IDS() ++#undef x ++ NULL ++}; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *c) ++{ ++ unsigned i, reserve = 16; ++ ++ if (!c->btree_roots[0].b) ++ reserve += 8; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ reserve += min_t(unsigned, 1, ++ c->btree_roots[i].b->level) * 8; ++ ++ c->btree_cache.reserve = reserve; ++} ++ ++static inline unsigned btree_cache_can_free(struct btree_cache *bc) ++{ ++ return max_t(int, 0, bc->used - bc->reserve); ++} ++ ++static void __btree_node_data_free(struct bch_fs *c, struct btree *b) ++{ ++ EBUG_ON(btree_node_write_in_flight(b)); ++ ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ bch2_btree_keys_free(b); ++} ++ ++static void btree_node_data_free(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ __btree_node_data_free(c, b); ++ bc->used--; ++ list_move(&b->list, &bc->freed); ++} ++ ++static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct btree *b = obj; ++ const u64 *v = arg->key; ++ ++ return PTR_HASH(&b->key) == *v ? 0 : 1; ++} ++ ++static const struct rhashtable_params bch_btree_cache_params = { ++ .head_offset = offsetof(struct btree, hash), ++ .key_offset = offsetof(struct btree, key.v), ++ .key_len = sizeof(struct bch_extent_ptr), ++ .obj_cmpfn = bch2_btree_cache_cmp_fn, ++}; ++ ++static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ b->data = kvpmalloc(btree_bytes(c), gfp); ++ if (!b->data) ++ goto err; ++ ++ if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) ++ goto err; ++ ++ bc->used++; ++ list_move(&b->list, &bc->freeable); ++ return; ++err: ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ list_move(&b->list, &bc->freed); ++} ++ ++static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) ++{ ++ struct btree *b = kzalloc(sizeof(struct btree), gfp); ++ if (!b) ++ return NULL; ++ ++ bkey_btree_ptr_init(&b->key); ++ six_lock_init(&b->lock); ++ INIT_LIST_HEAD(&b->list); ++ INIT_LIST_HEAD(&b->write_blocked); ++ ++ btree_node_data_alloc(c, b, gfp); ++ return b->data ? 
b : NULL; ++} ++ ++/* Btree in memory cache - hash table */ ++ ++void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) ++{ ++ rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); ++ ++ /* Cause future lookups for this node to fail: */ ++ PTR_HASH(&b->key) = 0; ++} ++ ++int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) ++{ ++ return rhashtable_lookup_insert_fast(&bc->table, &b->hash, ++ bch_btree_cache_params); ++} ++ ++int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, ++ unsigned level, enum btree_id id) ++{ ++ int ret; ++ ++ b->level = level; ++ b->btree_id = id; ++ ++ mutex_lock(&bc->lock); ++ ret = __bch2_btree_node_hash_insert(bc, b); ++ if (!ret) ++ list_add(&b->list, &bc->live); ++ mutex_unlock(&bc->lock); ++ ++ return ret; ++} ++ ++__flatten ++static inline struct btree *btree_cache_find(struct btree_cache *bc, ++ const struct bkey_i *k) ++{ ++ return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k), ++ bch_btree_cache_params); ++} ++ ++/* ++ * this version is for btree nodes that have already been freed (we're not ++ * reaping a real btree node) ++ */ ++static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ int ret = 0; ++ ++ lockdep_assert_held(&bc->lock); ++ ++ if (!six_trylock_intent(&b->lock)) ++ return -ENOMEM; ++ ++ if (!six_trylock_write(&b->lock)) ++ goto out_unlock_intent; ++ ++ if (btree_node_noevict(b)) ++ goto out_unlock; ++ ++ if (!btree_node_may_write(b)) ++ goto out_unlock; ++ ++ if (btree_node_dirty(b) && ++ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ goto out_unlock; ++ ++ if (btree_node_dirty(b) || ++ btree_node_write_in_flight(b) || ++ btree_node_read_in_flight(b)) { ++ if (!flush) ++ goto out_unlock; ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ /* ++ * Using the underscore version because we don't want to compact ++ * bsets after the write, since this node is about to be evicted ++ * - unless btree verify mode is enabled, since it runs out of ++ * the post write cleanup: ++ */ ++ if (verify_btree_ondisk(c)) ++ bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ else ++ __bch2_btree_node_write(c, b, SIX_LOCK_read); ++ ++ /* wait for any in flight btree write */ ++ btree_node_wait_on_io(b); ++ } ++out: ++ if (PTR_HASH(&b->key) && !ret) ++ trace_btree_node_reap(c, b); ++ return ret; ++out_unlock: ++ six_unlock_write(&b->lock); ++out_unlock_intent: ++ six_unlock_intent(&b->lock); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int btree_node_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, false); ++} ++ ++static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, true); ++} ++ ++static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b, *t; ++ unsigned long nr = sc->nr_to_scan; ++ unsigned long can_free; ++ unsigned long touched = 0; ++ unsigned long freed = 0; ++ unsigned i; ++ ++ if (btree_shrinker_disabled(c)) ++ return SHRINK_STOP; ++ ++ /* Return -1 if we can't do anything right now */ ++ if (sc->gfp_mask & __GFP_IO) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ return -1; ++ ++ /* ++ * It's _really_ critical that we don't free too many btree nodes - we ++ * 
have to always leave ourselves a reserve. The reserve is how we ++ * guarantee that allocating memory for a new btree node can always ++ * succeed, so that inserting keys into the btree can always succeed and ++ * IO can always make forward progress: ++ */ ++ nr /= btree_pages(c); ++ can_free = btree_cache_can_free(bc); ++ nr = min_t(unsigned long, nr, can_free); ++ ++ i = 0; ++ list_for_each_entry_safe(b, t, &bc->freeable, list) { ++ touched++; ++ ++ if (freed >= nr) ++ break; ++ ++ if (++i > 3 && ++ !btree_node_reclaim(c, b)) { ++ btree_node_data_free(c, b); ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); ++ freed++; ++ } ++ } ++restart: ++ list_for_each_entry_safe(b, t, &bc->live, list) { ++ touched++; ++ ++ if (freed >= nr) { ++ /* Save position */ ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ break; ++ } ++ ++ if (!btree_node_accessed(b) && ++ !btree_node_reclaim(c, b)) { ++ /* can't call bch2_btree_node_hash_remove under lock */ ++ freed++; ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ ++ btree_node_data_free(c, b); ++ mutex_unlock(&bc->lock); ++ ++ bch2_btree_node_hash_remove(bc, b); ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); ++ ++ if (freed >= nr) ++ goto out; ++ ++ if (sc->gfp_mask & __GFP_IO) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ goto out; ++ goto restart; ++ } else ++ clear_btree_node_accessed(b); ++ } ++ ++ mutex_unlock(&bc->lock); ++out: ++ return (unsigned long) freed * btree_pages(c); ++} ++ ++static unsigned long bch2_btree_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (btree_shrinker_disabled(c)) ++ return 0; ++ ++ return btree_cache_can_free(bc) * btree_pages(c); ++} ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ unsigned i; ++ ++ if (bc->shrink.list.next) ++ unregister_shrinker(&bc->shrink); ++ ++ mutex_lock(&bc->lock); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ if (c->verify_data) ++ list_move(&c->verify_data->list, &bc->live); ++ ++ kvpfree(c->verify_ondisk, btree_bytes(c)); ++#endif ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ list_add(&c->btree_roots[i].b->list, &bc->live); ++ ++ list_splice(&bc->freeable, &bc->live); ++ ++ while (!list_empty(&bc->live)) { ++ b = list_first_entry(&bc->live, struct btree, list); ++ ++ BUG_ON(btree_node_read_in_flight(b) || ++ btree_node_write_in_flight(b)); ++ ++ if (btree_node_dirty(b)) ++ bch2_btree_complete_write(c, b, btree_current_write(b)); ++ clear_btree_node_dirty(b); ++ ++ btree_node_data_free(c, b); ++ } ++ ++ while (!list_empty(&bc->freed)) { ++ b = list_first_entry(&bc->freed, struct btree, list); ++ list_del(&b->list); ++ kfree(b); ++ } ++ ++ mutex_unlock(&bc->lock); ++ ++ if (bc->table_init_done) ++ rhashtable_destroy(&bc->table); ++} ++ ++int bch2_fs_btree_cache_init(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ unsigned i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ ret = rhashtable_init(&bc->table, &bch_btree_cache_params); ++ if (ret) ++ goto out; ++ ++ bc->table_init_done = true; ++ ++ bch2_recalc_btree_reserve(c); ++ ++ for (i = 0; i < bc->reserve; i++) ++ if (!btree_node_mem_alloc(c, GFP_KERNEL)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_splice_init(&bc->live, &bc->freeable); ++ ++#ifdef 
CONFIG_BCACHEFS_DEBUG ++ mutex_init(&c->verify_lock); ++ ++ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); ++ if (!c->verify_ondisk) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL); ++ if (!c->verify_data) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_del_init(&c->verify_data->list); ++#endif ++ ++ bc->shrink.count_objects = bch2_btree_cache_count; ++ bc->shrink.scan_objects = bch2_btree_cache_scan; ++ bc->shrink.seeks = 4; ++ bc->shrink.batch = btree_pages(c) * 2; ++ register_shrinker(&bc->shrink); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++void bch2_fs_btree_cache_init_early(struct btree_cache *bc) ++{ ++ mutex_init(&bc->lock); ++ INIT_LIST_HEAD(&bc->live); ++ INIT_LIST_HEAD(&bc->freeable); ++ INIT_LIST_HEAD(&bc->freed); ++} ++ ++/* ++ * We can only have one thread cannibalizing other cached btree nodes at a time, ++ * or we'll deadlock. We use an open coded mutex to ensure that, which a ++ * cannibalize_bucket() will take. This means every time we unlock the root of ++ * the btree, we need to release this lock if we have it held. ++ */ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (bc->alloc_lock == current) { ++ trace_btree_node_cannibalize_unlock(c); ++ bc->alloc_lock = NULL; ++ closure_wake_up(&bc->alloc_wait); ++ } ++} ++ ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct task_struct *old; ++ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) ++ goto success; ++ ++ if (!cl) { ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -ENOMEM; ++ } ++ ++ closure_wait(&bc->alloc_wait, cl); ++ ++ /* Try again, after adding ourselves to waitlist */ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) { ++ /* We raced */ ++ closure_wake_up(&bc->alloc_wait); ++ goto success; ++ } ++ ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -EAGAIN; ++ ++success: ++ trace_btree_node_cannibalize_lock(c); ++ return 0; ++} ++ ++static struct btree *btree_node_cannibalize(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_reclaim(c, b)) ++ return b; ++ ++ while (1) { ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_write_and_reclaim(c, b)) ++ return b; ++ ++ /* ++ * Rare case: all nodes were intent-locked. ++ * Just busy-wait. ++ */ ++ WARN_ONCE(1, "btree cache cannibalize failed\n"); ++ cond_resched(); ++ } ++} ++ ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ u64 start_time = local_clock(); ++ unsigned flags; ++ ++ flags = memalloc_nofs_save(); ++ mutex_lock(&bc->lock); ++ ++ /* ++ * btree_free() doesn't free memory; it sticks the node on the end of ++ * the list. Check if there's any freed nodes there: ++ */ ++ list_for_each_entry(b, &bc->freeable, list) ++ if (!btree_node_reclaim(c, b)) ++ goto out_unlock; ++ ++ /* ++ * We never free struct btree itself, just the memory that holds the on ++ * disk node. 
Check the freed list before allocating a new one: ++ */ ++ list_for_each_entry(b, &bc->freed, list) ++ if (!btree_node_reclaim(c, b)) { ++ btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); ++ if (b->data) ++ goto out_unlock; ++ ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); ++ goto err; ++ } ++ ++ b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO); ++ if (!b) ++ goto err; ++ ++ BUG_ON(!six_trylock_intent(&b->lock)); ++ BUG_ON(!six_trylock_write(&b->lock)); ++out_unlock: ++ BUG_ON(btree_node_hashed(b)); ++ BUG_ON(btree_node_write_in_flight(b)); ++ ++ list_del_init(&b->list); ++ mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); ++out: ++ b->flags = 0; ++ b->written = 0; ++ b->nsets = 0; ++ b->sib_u64s[0] = 0; ++ b->sib_u64s[1] = 0; ++ b->whiteout_u64s = 0; ++ b->uncompacted_whiteout_u64s = 0; ++ bch2_btree_keys_init(b, &c->expensive_debug_checks); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], ++ start_time); ++ ++ return b; ++err: ++ /* Try to cannibalize another cached btree node: */ ++ if (bc->alloc_lock == current) { ++ b = btree_node_cannibalize(c); ++ list_del_init(&b->list); ++ mutex_unlock(&bc->lock); ++ ++ bch2_btree_node_hash_remove(bc, b); ++ ++ trace_btree_node_cannibalize(c); ++ goto out; ++ } ++ ++ mutex_unlock(&bc->lock); ++ return ERR_PTR(-ENOMEM); ++} ++ ++/* Slowpath, don't want it inlined into btree_iter_traverse() */ ++static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, ++ struct btree_iter *iter, ++ const struct bkey_i *k, ++ unsigned level, ++ enum six_lock_type lock_type, ++ bool sync) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ /* ++ * Parent node must be locked, else we could read in a btree node that's ++ * been freed: ++ */ ++ BUG_ON(!btree_node_locked(iter, level + 1)); ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ if (IS_ERR(b)) ++ return b; ++ ++ bkey_copy(&b->key, k); ++ if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) { ++ /* raced with another fill: */ ++ ++ /* mark as unhashed... */ ++ PTR_HASH(&b->key) = 0; ++ ++ mutex_lock(&bc->lock); ++ list_add(&b->list, &bc->freeable); ++ mutex_unlock(&bc->lock); ++ ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); ++ return NULL; ++ } ++ ++ /* ++ * If the btree node wasn't cached, we can't drop our lock on ++ * the parent until after it's added to the cache - because ++ * otherwise we could race with a btree_split() freeing the node ++ * we're trying to lock. ++ * ++ * But the deadlock described below doesn't exist in this case, ++ * so it's safe to not drop the parent lock until here: ++ */ ++ if (btree_node_read_locked(iter, level + 1)) ++ btree_node_unlock(iter, level + 1); ++ ++ bch2_btree_node_read(c, b, sync); ++ ++ six_unlock_write(&b->lock); ++ ++ if (!sync) { ++ six_unlock_intent(&b->lock); ++ return NULL; ++ } ++ ++ if (lock_type == SIX_LOCK_read) ++ six_lock_downgrade(&b->lock); ++ ++ return b; ++} ++ ++/** ++ * bch_btree_node_get - find a btree node in the cache and lock it, reading it ++ * in from disk if necessary. ++ * ++ * If IO is necessary and running under generic_make_request, returns -EAGAIN. ++ * ++ * The btree node will have either a read or a write lock held, depending on ++ * the @write parameter. 
++ */ ++struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, ++ const struct bkey_i *k, unsigned level, ++ enum six_lock_type lock_type) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ ++ /* ++ * XXX: locking optimization ++ * ++ * we can make the locking looser here - caller can drop lock on parent ++ * node before locking child node (and potentially blocking): we just ++ * have to have bch2_btree_node_fill() call relock on the parent and ++ * return -EINTR if that fails ++ */ ++ EBUG_ON(!btree_node_locked(iter, level + 1)); ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ /* ++ * We must have the parent locked to call bch2_btree_node_fill(), ++ * else we could read in a btree node from disk that's been ++ * freed: ++ */ ++ b = bch2_btree_node_fill(c, iter, k, level, lock_type, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ return b; ++ } else { ++ /* ++ * There's a potential deadlock with splits and insertions into ++ * interior nodes we have to avoid: ++ * ++ * The other thread might be holding an intent lock on the node ++ * we want, and they want to update its parent node so they're ++ * going to upgrade their intent lock on the parent node to a ++ * write lock. ++ * ++ * But if we're holding a read lock on the parent, and we're ++ * trying to get the intent lock they're holding, we deadlock. ++ * ++ * So to avoid this we drop the read locks on parent nodes when ++ * we're starting to take intent locks - and handle the race. ++ * ++ * The race is that they might be about to free the node we ++ * want, and dropping our read lock on the parent node lets them ++ * update the parent marking the node we want as freed, and then ++ * free it: ++ * ++ * To guard against this, btree nodes are evicted from the cache ++ * when they're freed - and PTR_HASH() is zeroed out, which we ++ * check for after we lock the node. 
++ * ++ * Then, bch2_btree_node_relock() on the parent will fail - because ++ * the parent was modified, when the pointer to the node we want ++ * was removed - and we'll bail out: ++ */ ++ if (btree_node_read_locked(iter, level + 1)) ++ btree_node_unlock(iter, level + 1); ++ ++ if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) ++ return ERR_PTR(-EINTR); ++ ++ if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || ++ b->level != level || ++ race_fault())) { ++ six_unlock_type(&b->lock, lock_type); ++ if (bch2_btree_node_relock(iter, level + 1)) ++ goto retry; ++ ++ trace_trans_restart_btree_node_reused(iter->trans->ip); ++ return ERR_PTR(-EINTR); ++ } ++ } ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_type(&b->lock, lock_type); ++ return ERR_PTR(-EIO); ++ } ++ ++ EBUG_ON(b->btree_id != iter->btree_id || ++ BTREE_NODE_LEVEL(b->data) != level || ++ bkey_cmp(b->data->max_key, k->k.p)); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, ++ struct btree_iter *iter, ++ struct btree *b, ++ enum btree_node_sibling sib) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *parent; ++ struct btree_node_iter node_iter; ++ struct bkey_packed *k; ++ BKEY_PADDED(k) tmp; ++ struct btree *ret = NULL; ++ unsigned level = b->level; ++ ++ parent = btree_iter_node(iter, level + 1); ++ if (!parent) ++ return NULL; ++ ++ if (!bch2_btree_node_relock(iter, level + 1)) { ++ ret = ERR_PTR(-EINTR); ++ goto out; ++ } ++ ++ node_iter = iter->l[parent->level].iter; ++ ++ k = bch2_btree_node_iter_peek_all(&node_iter, parent); ++ BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); ++ ++ k = sib == btree_prev_sib ++ ? 
bch2_btree_node_iter_prev(&node_iter, parent) ++ : (bch2_btree_node_iter_advance(&node_iter, parent), ++ bch2_btree_node_iter_peek(&node_iter, parent)); ++ if (!k) ++ goto out; ++ ++ bch2_bkey_unpack(parent, &tmp.k, k); ++ ++ ret = bch2_btree_node_get(c, iter, &tmp.k, level, ++ SIX_LOCK_intent); ++ ++ if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { ++ struct btree_iter *linked; ++ ++ if (!bch2_btree_node_relock(iter, level + 1)) ++ goto out; ++ ++ /* ++ * We might have got -EINTR because trylock failed, and we're ++ * holding other locks that would cause us to deadlock: ++ */ ++ trans_for_each_iter(trans, linked) ++ if (btree_iter_cmp(iter, linked) < 0) ++ __bch2_btree_iter_unlock(linked); ++ ++ if (sib == btree_prev_sib) ++ btree_node_unlock(iter, level); ++ ++ ret = bch2_btree_node_get(c, iter, &tmp.k, level, ++ SIX_LOCK_intent); ++ ++ /* ++ * before btree_iter_relock() calls btree_iter_verify_locks(): ++ */ ++ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, level + 1); ++ ++ if (!bch2_btree_node_relock(iter, level)) { ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ if (!IS_ERR(ret)) { ++ six_unlock_intent(&ret->lock); ++ ret = ERR_PTR(-EINTR); ++ } ++ } ++ ++ bch2_trans_relock(trans); ++ } ++out: ++ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, level + 1); ++ ++ if (PTR_ERR_OR_ZERO(ret) == -EINTR) ++ bch2_btree_iter_upgrade(iter, level + 2); ++ ++ BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); ++ ++ if (!IS_ERR_OR_NULL(ret)) { ++ struct btree *n1 = ret, *n2 = b; ++ ++ if (sib != btree_prev_sib) ++ swap(n1, n2); ++ ++ BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id, ++ n1->key.k.p), ++ n2->data->min_key)); ++ } ++ ++ bch2_btree_trans_verify_locks(trans); ++ ++ return ret; ++} ++ ++void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, ++ const struct bkey_i *k, unsigned level) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ BUG_ON(!btree_node_locked(iter, level + 1)); ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_cache_find(bc, k); ++ if (b) ++ return; ++ ++ bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false); ++} ++ ++void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bset_stats stats; ++ ++ memset(&stats, 0, sizeof(stats)); ++ ++ bch2_btree_keys_stats(b, &stats); ++ ++ pr_buf(out, ++ "l %u %llu:%llu - %llu:%llu:\n" ++ " ptrs: ", ++ b->level, ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ b->data->max_key.inode, ++ b->data->max_key.offset); ++ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ pr_buf(out, "\n" ++ " format: u64s %u fields %u %u %u %u %u\n" ++ " unpack fn len: %u\n" ++ " bytes used %zu/%zu (%zu%% full)\n" ++ " sib u64s: %u, %u (merge threshold %zu)\n" ++ " nr packed keys %u\n" ++ " nr unpacked keys %u\n" ++ " floats %zu\n" ++ " failed unpacked %zu\n" ++ " failed prev %zu\n" ++ " failed overflow %zu\n", ++ f->key_u64s, ++ f->bits_per_field[0], ++ f->bits_per_field[1], ++ f->bits_per_field[2], ++ f->bits_per_field[3], ++ f->bits_per_field[4], ++ b->unpack_fn_len, ++ b->nr.live_u64s * sizeof(u64), ++ btree_bytes(c) - sizeof(struct btree_node), ++ b->nr.live_u64s * 100 / btree_max_u64s(c), ++ b->sib_u64s[0], ++ b->sib_u64s[1], ++ BTREE_FOREGROUND_MERGE_THRESHOLD(c), ++ b->nr.packed_keys, ++ b->nr.unpacked_keys, ++ stats.floats, ++ stats.failed_unpacked, ++ stats.failed_prev, ++ 
stats.failed_overflow); ++} +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +new file mode 100644 +index 000000000000..c5873c58439c +--- /dev/null ++++ b/fs/bcachefs/btree_cache.h +@@ -0,0 +1,90 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_CACHE_H ++#define _BCACHEFS_BTREE_CACHE_H ++ ++#include "bcachefs.h" ++#include "btree_types.h" ++ ++struct btree_iter; ++ ++extern const char * const bch2_btree_ids[]; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *); ++ ++void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); ++int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); ++int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, ++ unsigned, enum btree_id); ++ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); ++ ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); ++ ++struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, ++ const struct bkey_i *, unsigned, ++ enum six_lock_type); ++ ++struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, ++ struct btree *, enum btree_node_sibling); ++ ++void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, ++ const struct bkey_i *, unsigned); ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *); ++int bch2_fs_btree_cache_init(struct bch_fs *); ++void bch2_fs_btree_cache_init_early(struct btree_cache *); ++ ++#define PTR_HASH(_k) *((u64 *) &bkey_i_to_btree_ptr_c(_k)->v) ++ ++/* is btree node in hash table? */ ++static inline bool btree_node_hashed(struct btree *b) ++{ ++ return b->key.k.type == KEY_TYPE_btree_ptr && ++ PTR_HASH(&b->key); ++} ++ ++#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ ++ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ ++ &(_c)->btree_cache.table), \ ++ _iter = 0; _iter < (_tbl)->size; _iter++) \ ++ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) ++ ++static inline size_t btree_bytes(struct bch_fs *c) ++{ ++ return c->opts.btree_node_size << 9; ++} ++ ++static inline size_t btree_max_u64s(struct bch_fs *c) ++{ ++ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); ++} ++ ++static inline size_t btree_page_order(struct bch_fs *c) ++{ ++ return get_order(btree_bytes(c)); ++} ++ ++static inline size_t btree_pages(struct bch_fs *c) ++{ ++ return 1 << btree_page_order(c); ++} ++ ++static inline unsigned btree_blocks(struct bch_fs *c) ++{ ++ return c->opts.btree_node_size >> c->block_bits; ++} ++ ++#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4) ++ ++#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) ++#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) ++ ++#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b) ++ ++void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, ++ struct btree *); ++ ++#endif /* _BCACHEFS_BTREE_CACHE_H */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +new file mode 100644 +index 000000000000..c4c2e1a3ee0e +--- /dev/null ++++ b/fs/bcachefs/btree_gc.c +@@ -0,0 +1,1230 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * Copyright (C) 2014 Datera Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_locking.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ write_seqcount_begin(&c->gc_pos_lock); ++ c->gc_pos = new_pos; ++ write_seqcount_end(&c->gc_pos_lock); ++} ++ ++static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); ++ __gc_pos_set(c, new_pos); ++} ++ ++/* range_checks - for validating min/max pos of each btree node: */ ++ ++struct range_checks { ++ struct range_level { ++ struct bpos min; ++ struct bpos max; ++ } l[BTREE_MAX_DEPTH]; ++ unsigned depth; ++}; ++ ++static void btree_node_range_checks_init(struct range_checks *r, unsigned depth) ++{ ++ unsigned i; ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ r->l[i].min = r->l[i].max = POS_MIN; ++ r->depth = depth; ++} ++ ++static void btree_node_range_checks(struct bch_fs *c, struct btree *b, ++ struct range_checks *r) ++{ ++ struct range_level *l = &r->l[b->level]; ++ ++ struct bpos expected_min = bkey_cmp(l->min, l->max) ++ ? btree_type_successor(b->btree_id, l->max) ++ : l->max; ++ ++ bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, ++ "btree node has incorrect min key: %llu:%llu != %llu:%llu", ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ expected_min.inode, ++ expected_min.offset); ++ ++ l->max = b->data->max_key; ++ ++ if (b->level > r->depth) { ++ l = &r->l[b->level - 1]; ++ ++ bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c, ++ "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu", ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ l->min.inode, ++ l->min.offset); ++ ++ bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c, ++ "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu", ++ b->data->max_key.inode, ++ b->data->max_key.offset, ++ l->max.inode, ++ l->max.offset); ++ ++ if (bkey_cmp(b->data->max_key, POS_MAX)) ++ l->min = l->max = ++ btree_type_successor(b->btree_id, ++ b->data->max_key); ++ } ++} ++ ++/* marking of btree keys/nodes: */ ++ ++static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, ++ u8 *max_stale, bool initial) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ unsigned flags = ++ BCH_BUCKET_MARK_GC| ++ (initial ? 
BCH_BUCKET_MARK_NOATOMIC : 0); ++ int ret = 0; ++ ++ if (initial) { ++ BUG_ON(journal_seq_verify(c) && ++ k.k->version.lo > journal_cur_seq(&c->journal)); ++ ++ if (k.k->version.lo > atomic64_read(&c->key_version)) ++ atomic64_set(&c->key_version, k.k->version.lo); ++ ++ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c, ++ "superblock not marked as containing replicas (type %u)", ++ k.k->type)) { ++ ret = bch2_mark_bkey_replicas(c, k); ++ if (ret) ++ return ret; ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g2 = PTR_BUCKET(ca, ptr, false); ++ ++ if (mustfix_fsck_err_on(!g->gen_valid, c, ++ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k.k, ptr)], ++ ptr->gen)) { ++ g2->_mark.gen = g->_mark.gen = ptr->gen; ++ g2->gen_valid = g->gen_valid = true; ++ } ++ ++ if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, ++ "bucket %u:%zu data type %s ptr gen in the future: %u > %u", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k.k, ptr)], ++ ptr->gen, g->mark.gen)) { ++ g2->_mark.gen = g->_mark.gen = ptr->gen; ++ g2->gen_valid = g->gen_valid = true; ++ g2->_mark.data_type = 0; ++ g2->_mark.dirty_sectors = 0; ++ g2->_mark.cached_sectors = 0; ++ set_bit(BCH_FS_FIXED_GENS, &c->flags); ++ } ++ } ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ ++ if (gen_after(g->oldest_gen, ptr->gen)) ++ g->oldest_gen = ptr->gen; ++ ++ *max_stale = max(*max_stale, ptr_stale(ca, ptr)); ++ } ++ ++ bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); ++fsck_err: ++ return ret; ++} ++ ++static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, ++ u8 *max_stale, bool initial) ++{ ++ struct btree_node_iter iter; ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ *max_stale = 0; ++ ++ if (!btree_node_type_needs_gc(btree_node_type(b))) ++ return 0; ++ ++ for_each_btree_node_key_unpack(b, k, &iter, ++ &unpacked) { ++ bch2_bkey_debugcheck(c, b, k); ++ ++ ret = bch2_gc_mark_key(c, k, max_stale, initial); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, ++ bool initial, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ struct range_checks r; ++ unsigned depth = metadata_only ? 1 ++ : expensive_debug_checks(c) ? 0 ++ : !btree_node_type_needs_gc(btree_id) ? 
1 ++ : 0; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); ++ ++ btree_node_range_checks_init(&r, depth); ++ ++ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, ++ 0, depth, BTREE_ITER_PREFETCH, b) { ++ btree_node_range_checks(c, b, &r); ++ ++ bch2_verify_btree_nr_keys(b); ++ ++ gc_pos_set(c, gc_pos_btree_node(b)); ++ ++ ret = btree_gc_mark_node(c, b, &max_stale, initial); ++ if (ret) ++ break; ++ ++ if (!initial) { ++ if (max_stale > 64) ++ bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_GC_LOCK_HELD); ++ else if (!btree_gc_rewrite_disabled(c) && ++ (btree_gc_always_rewrite(c) || max_stale > 16)) ++ bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_GC_LOCK_HELD); ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->btree_root_lock); ++ b = c->btree_roots[btree_id].b; ++ if (!btree_node_fake(b)) ++ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), ++ &max_stale, initial); ++ gc_pos_set(c, gc_pos_btree_root(b->btree_id)); ++ mutex_unlock(&c->btree_root_lock); ++ ++ return ret; ++} ++ ++static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) ++{ ++ return (int) btree_id_to_gc_phase(l) - ++ (int) btree_id_to_gc_phase(r); ++} ++ ++static int mark_journal_key(struct bch_fs *c, enum btree_id id, ++ struct bkey_i *insert) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u8 max_stale; ++ int ret = 0; ++ ++ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), ++ BTREE_ITER_SLOTS, k, ret) { ++ percpu_down_read(&c->mark_lock); ++ ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, ++ BCH_BUCKET_MARK_GC| ++ BCH_BUCKET_MARK_NOATOMIC); ++ percpu_up_read(&c->mark_lock); ++ ++ if (!ret) ++ break; ++ } ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, ++ bool initial, bool metadata_only) ++{ ++ enum btree_id ids[BTREE_ID_NR]; ++ unsigned i; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ ids[i] = i; ++ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ enum btree_id id = ids[i]; ++ enum btree_node_type type = __btree_node_type(0, id); ++ ++ int ret = bch2_gc_btree(c, id, initial, metadata_only); ++ if (ret) ++ return ret; ++ ++ if (journal_keys && !metadata_only && ++ btree_node_type_needs_gc(type)) { ++ struct journal_key *j; ++ int ret; ++ ++ for_each_journal_key(*journal_keys, j) ++ if (j->btree_id == id) { ++ ret = mark_journal_key(c, id, j->k); ++ if (ret) ++ return ret; ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, ++ u64 start, u64 end, ++ enum bch_data_type type, ++ unsigned flags) ++{ ++ u64 b = sector_to_bucket(ca, start); ++ ++ do { ++ unsigned sectors = ++ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; ++ ++ bch2_mark_metadata_bucket(c, ca, b, type, sectors, ++ gc_phase(GC_PHASE_SB), flags); ++ b++; ++ start += sectors; ++ } while (start < end); ++} ++ ++void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ++ unsigned flags) ++{ ++ struct bch_sb_layout *layout = 
&ca->disk_sb.sb->layout; ++ unsigned i; ++ u64 b; ++ ++ /* ++ * This conditional is kind of gross, but we may be called from the ++ * device add path, before the new device has actually been added to the ++ * running filesystem: ++ */ ++ if (c) { ++ lockdep_assert_held(&c->sb_lock); ++ percpu_down_read(&c->mark_lock); ++ } ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset == BCH_SB_SECTOR) ++ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, ++ BCH_DATA_SB, flags); ++ ++ mark_metadata_sectors(c, ca, offset, ++ offset + (1 << layout->sb_max_size_bits), ++ BCH_DATA_SB, flags); ++ } ++ ++ for (i = 0; i < ca->journal.nr; i++) { ++ b = ca->journal.buckets[i]; ++ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), flags); ++ } ++ ++ if (c) ++ percpu_up_read(&c->mark_lock); ++} ++ ++static void bch2_mark_superblocks(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&c->sb_lock); ++ gc_pos_set(c, gc_phase(GC_PHASE_SB)); ++ ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC); ++ mutex_unlock(&c->sb_lock); ++} ++ ++/* Also see bch2_pending_btree_node_free_insert_done() */ ++static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) ++{ ++ struct btree_update *as; ++ struct pending_btree_node_free *d; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); ++ ++ for_each_pending_btree_node_free(c, as, d) ++ if (d->index_update_done) ++ bch2_mark_key(c, bkey_i_to_s_c(&d->key), ++ 0, 0, NULL, 0, ++ BCH_BUCKET_MARK_GC); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void bch2_mark_allocator_buckets(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct open_bucket *ob; ++ size_t i, j, iter; ++ unsigned ci; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ gc_pos_set(c, gc_pos_alloc(c, NULL)); ++ ++ for_each_member_device(ca, c, ci) { ++ fifo_for_each_entry(i, &ca->free_inc, iter) ++ bch2_mark_alloc_bucket(c, ca, i, true, ++ gc_pos_alloc(c, NULL), ++ BCH_BUCKET_MARK_GC); ++ ++ ++ ++ for (j = 0; j < RESERVE_NR; j++) ++ fifo_for_each_entry(i, &ca->free[j], iter) ++ bch2_mark_alloc_bucket(c, ca, i, true, ++ gc_pos_alloc(c, NULL), ++ BCH_BUCKET_MARK_GC); ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid) { ++ gc_pos_set(c, gc_pos_alloc(c, ob)); ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, ++ gc_pos_alloc(c, ob), ++ BCH_BUCKET_MARK_GC); ++ } ++ spin_unlock(&ob->lock); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++} ++ ++static void bch2_gc_free(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ genradix_free(&c->stripes[1]); ++ ++ for_each_member_device(ca, c, i) { ++ kvpfree(rcu_dereference_protected(ca->buckets[1], 1), ++ sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket)); ++ ca->buckets[1] = NULL; ++ ++ free_percpu(ca->usage[1]); ++ ca->usage[1] = NULL; ++ } ++ ++ free_percpu(c->usage_gc); ++ c->usage_gc = NULL; ++} ++ ++static int bch2_gc_done(struct bch_fs *c, ++ bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ bool verify = !metadata_only && ++ (!initial || ++ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); ++ unsigned i; ++ int ret = 0; ++ ++#define copy_field(_f, 
_msg, ...) \ ++ if (dst->_f != src->_f) { \ ++ if (verify) \ ++ fsck_err(c, _msg ": got %llu, should be %llu" \ ++ , ##__VA_ARGS__, dst->_f, src->_f); \ ++ dst->_f = src->_f; \ ++ } ++#define copy_stripe_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f) { \ ++ if (verify) \ ++ fsck_err(c, "stripe %zu has wrong "_msg \ ++ ": got %u, should be %u", \ ++ dst_iter.pos, ##__VA_ARGS__, \ ++ dst->_f, src->_f); \ ++ dst->_f = src->_f; \ ++ dst->dirty = true; \ ++ } ++#define copy_bucket_field(_f) \ ++ if (dst->b[b].mark._f != src->b[b].mark._f) { \ ++ if (verify) \ ++ fsck_err(c, "dev %u bucket %zu has wrong " #_f \ ++ ": got %u, should be %u", i, b, \ ++ dst->b[b].mark._f, src->b[b].mark._f); \ ++ dst->b[b]._mark._f = src->b[b].mark._f; \ ++ } ++#define copy_dev_field(_f, _msg, ...) \ ++ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) ++#define copy_fs_field(_f, _msg, ...) \ ++ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) ++ ++ if (!metadata_only) { ++ struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); ++ struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); ++ struct stripe *dst, *src; ++ unsigned i; ++ ++ c->ec_stripes_heap.used = 0; ++ ++ while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && ++ (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { ++ BUG_ON(src_iter.pos != dst_iter.pos); ++ ++ copy_stripe_field(alive, "alive"); ++ copy_stripe_field(sectors, "sectors"); ++ copy_stripe_field(algorithm, "algorithm"); ++ copy_stripe_field(nr_blocks, "nr_blocks"); ++ copy_stripe_field(nr_redundant, "nr_redundant"); ++ copy_stripe_field(blocks_nonempty, ++ "blocks_nonempty"); ++ ++ for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) ++ copy_stripe_field(block_sectors[i], ++ "block_sectors[%u]", i); ++ ++ if (dst->alive) ++ bch2_stripes_heap_insert(c, dst, dst_iter.pos); ++ ++ genradix_iter_advance(&dst_iter, &c->stripes[0]); ++ genradix_iter_advance(&src_iter, &c->stripes[1]); ++ } ++ } ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *dst = __bucket_array(ca, 0); ++ struct bucket_array *src = __bucket_array(ca, 1); ++ size_t b; ++ ++ for (b = 0; b < src->nbuckets; b++) { ++ copy_bucket_field(gen); ++ copy_bucket_field(data_type); ++ copy_bucket_field(owned_by_allocator); ++ copy_bucket_field(stripe); ++ copy_bucket_field(dirty_sectors); ++ copy_bucket_field(cached_sectors); ++ ++ dst->b[b].oldest_gen = src->b[b].oldest_gen; ++ } ++ }; ++ ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ ++ bch2_dev_usage_from_buckets(c); ++ ++ { ++ unsigned nr = fs_usage_u64s(c); ++ struct bch_fs_usage *dst = c->usage_base; ++ struct bch_fs_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) c->usage_gc, nr); ++ ++ copy_fs_field(hidden, "hidden"); ++ copy_fs_field(btree, "btree"); ++ ++ if (!metadata_only) { ++ copy_fs_field(data, "data"); ++ copy_fs_field(cached, "cached"); ++ copy_fs_field(reserved, "reserved"); ++ copy_fs_field(nr_inodes,"nr_inodes"); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ copy_fs_field(persistent_reserved[i], ++ "persistent_reserved[%i]", i); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ char buf[80]; ++ ++ if (metadata_only && ++ (e->data_type == BCH_DATA_USER || ++ e->data_type == BCH_DATA_CACHED)) ++ continue; ++ ++ bch2_replicas_entry_to_text(&PBUF(buf), e); ++ ++ copy_fs_field(replicas[i], "%s", buf); ++ } ++ } ++ ++#undef copy_fs_field ++#undef copy_dev_field ++#undef copy_bucket_field 
++#undef copy_stripe_field ++#undef copy_field ++fsck_err: ++ return ret; ++} ++ ++static int bch2_gc_start(struct bch_fs *c, ++ bool metadata_only) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ BUG_ON(c->usage_gc); ++ ++ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), ++ sizeof(u64), GFP_KERNEL); ++ if (!c->usage_gc) ++ return -ENOMEM; ++ ++ for_each_member_device(ca, c, i) { ++ BUG_ON(ca->buckets[1]); ++ BUG_ON(ca->usage[1]); ++ ++ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO); ++ if (!ca->buckets[1]) { ++ percpu_ref_put(&ca->ref); ++ return -ENOMEM; ++ } ++ ++ ca->usage[1] = alloc_percpu(struct bch_dev_usage); ++ if (!ca->usage[1]) { ++ percpu_ref_put(&ca->ref); ++ return -ENOMEM; ++ } ++ } ++ ++ ret = bch2_ec_mem_alloc(c, true); ++ if (ret) ++ return ret; ++ ++ percpu_down_write(&c->mark_lock); ++ ++ /* ++ * indicate to stripe code that we need to allocate for the gc stripes ++ * radix tree, too ++ */ ++ gc_pos_set(c, gc_phase(GC_PHASE_START)); ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *dst = __bucket_array(ca, 1); ++ struct bucket_array *src = __bucket_array(ca, 0); ++ size_t b; ++ ++ dst->first_bucket = src->first_bucket; ++ dst->nbuckets = src->nbuckets; ++ ++ for (b = 0; b < src->nbuckets; b++) { ++ struct bucket *d = &dst->b[b]; ++ struct bucket *s = &src->b[b]; ++ ++ d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; ++ d->gen_valid = s->gen_valid; ++ ++ if (metadata_only && ++ (s->mark.data_type == BCH_DATA_USER || ++ s->mark.data_type == BCH_DATA_CACHED)) { ++ d->_mark = s->mark; ++ d->_mark.owned_by_allocator = 0; ++ } ++ } ++ }; ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return 0; ++} ++ ++/** ++ * bch2_gc - walk _all_ references to buckets, and recompute them: ++ * ++ * Order matters here: ++ * - Concurrent GC relies on the fact that we have a total ordering for ++ * everything that GC walks - see gc_will_visit_node(), ++ * gc_will_visit_root() ++ * ++ * - also, references move around in the course of index updates and ++ * various other crap: everything needs to agree on the ordering ++ * references are allowed to move around in - e.g., we're allowed to ++ * start with a reference owned by an open_bucket (the allocator) and ++ * move it to the btree, but not the reverse. 
++ * ++ * This is necessary to ensure that gc doesn't miss references that ++ * move around - if references move backwards in the ordering GC ++ * uses, GC could skip past them ++ */ ++int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, ++ bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ u64 start_time = local_clock(); ++ unsigned i, iter = 0; ++ int ret; ++ ++ trace_gc_start(c); ++ ++ down_write(&c->gc_lock); ++again: ++ ret = bch2_gc_start(c, metadata_only); ++ if (ret) ++ goto out; ++ ++ bch2_mark_superblocks(c); ++ ++ ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); ++ if (ret) ++ goto out; ++ ++ bch2_mark_pending_btree_node_frees(c); ++ bch2_mark_allocator_buckets(c); ++ ++ c->gc_count++; ++out: ++ if (!ret && ++ (test_bit(BCH_FS_FIXED_GENS, &c->flags) || ++ (!iter && test_restart_gc(c)))) { ++ /* ++ * XXX: make sure gens we fixed got saved ++ */ ++ if (iter++ <= 2) { ++ bch_info(c, "Fixed gens, restarting mark and sweep:"); ++ clear_bit(BCH_FS_FIXED_GENS, &c->flags); ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ percpu_down_write(&c->mark_lock); ++ bch2_gc_free(c); ++ percpu_up_write(&c->mark_lock); ++ /* flush fsck errors, reset counters */ ++ bch2_flush_fsck_errs(c); ++ ++ goto again; ++ } ++ ++ bch_info(c, "Unable to fix bucket gens, looping"); ++ ret = -EINVAL; ++ } ++ ++ if (!ret) { ++ bch2_journal_block(&c->journal); ++ ++ percpu_down_write(&c->mark_lock); ++ ret = bch2_gc_done(c, initial, metadata_only); ++ ++ bch2_journal_unblock(&c->journal); ++ } else { ++ percpu_down_write(&c->mark_lock); ++ } ++ ++ /* Indicates that gc is no longer in progress: */ ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ bch2_gc_free(c); ++ percpu_up_write(&c->mark_lock); ++ ++ up_write(&c->gc_lock); ++ ++ trace_gc_end(c); ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); ++ ++ /* ++ * Wake up allocator in case it was waiting for buckets ++ * because of not being able to inc gens ++ */ ++ for_each_member_device(ca, c, i) ++ bch2_wake_allocator(ca); ++ ++ /* ++ * At startup, allocations can happen directly instead of via the ++ * allocator thread - issue wakeup in case they blocked on gc_lock: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ return ret; ++} ++ ++/* Btree coalescing */ ++ ++static void recalc_packed_keys(struct btree *b) ++{ ++ struct bset *i = btree_bset_first(b); ++ struct bkey_packed *k; ++ ++ memset(&b->nr, 0, sizeof(b->nr)); ++ ++ BUG_ON(b->nsets != 1); ++ ++ vstruct_for_each(i, k) ++ btree_keys_account_key_add(&b->nr, 0, k); ++} ++ ++static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *old_nodes[GC_MERGE_NODES]) ++{ ++ struct btree *parent = btree_node_parent(iter, old_nodes[0]); ++ unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; ++ unsigned blocks = btree_blocks(c) * 2 / 3; ++ struct btree *new_nodes[GC_MERGE_NODES]; ++ struct btree_update *as; ++ struct keylist keylist; ++ struct bkey_format_state format_state; ++ struct bkey_format new_format; ++ ++ memset(new_nodes, 0, sizeof(new_nodes)); ++ bch2_keylist_init(&keylist, NULL); ++ ++ /* Count keys that are not deleted */ ++ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) ++ u64s += old_nodes[i]->nr.live_u64s; ++ ++ nr_old_nodes = nr_new_nodes = i; ++ ++ /* Check if all keys in @old_nodes could fit in one fewer node */ ++ if (nr_old_nodes <= 1 || ++ __vstruct_blocks(struct btree_node, c->block_bits, ++ DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) ++ return; ++ ++ /* Find a format that all keys in 
@old_nodes can pack into */ ++ bch2_bkey_format_init(&format_state); ++ ++ for (i = 0; i < nr_old_nodes; i++) ++ __bch2_btree_calc_format(&format_state, old_nodes[i]); ++ ++ new_format = bch2_bkey_format_done(&format_state); ++ ++ /* Check if repacking would make any nodes too big to fit */ ++ for (i = 0; i < nr_old_nodes; i++) ++ if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_FORMAT_FITS); ++ return; ++ } ++ ++ if (bch2_keylist_realloc(&keylist, NULL, 0, ++ (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); ++ return; ++ } ++ ++ as = bch2_btree_update_start(c, iter->btree_id, ++ btree_update_reserve_required(c, parent) + nr_old_nodes, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE, ++ NULL); ++ if (IS_ERR(as)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_RESERVE_GET); ++ bch2_keylist_free(&keylist, NULL); ++ return; ++ } ++ ++ trace_btree_gc_coalesce(c, old_nodes[0]); ++ ++ for (i = 0; i < nr_old_nodes; i++) ++ bch2_btree_interior_update_will_free_node(as, old_nodes[i]); ++ ++ /* Repack everything with @new_format and sort down to one bset */ ++ for (i = 0; i < nr_old_nodes; i++) ++ new_nodes[i] = ++ __bch2_btree_node_alloc_replacement(as, old_nodes[i], ++ new_format); ++ ++ /* ++ * Conceptually we concatenate the nodes together and slice them ++ * up at different boundaries. ++ */ ++ for (i = nr_new_nodes - 1; i > 0; --i) { ++ struct btree *n1 = new_nodes[i]; ++ struct btree *n2 = new_nodes[i - 1]; ++ ++ struct bset *s1 = btree_bset_first(n1); ++ struct bset *s2 = btree_bset_first(n2); ++ struct bkey_packed *k, *last = NULL; ++ ++ /* Calculate how many keys from @n2 we could fit inside @n1 */ ++ u64s = 0; ++ ++ for (k = s2->start; ++ k < vstruct_last(s2) && ++ vstruct_blocks_plus(n1->data, c->block_bits, ++ u64s + k->u64s) <= blocks; ++ k = bkey_next(k)) { ++ last = k; ++ u64s += k->u64s; ++ } ++ ++ if (u64s == le16_to_cpu(s2->u64s)) { ++ /* n2 fits entirely in n1 */ ++ n1->key.k.p = n1->data->max_key = n2->data->max_key; ++ ++ memcpy_u64s(vstruct_last(s1), ++ s2->start, ++ le16_to_cpu(s2->u64s)); ++ le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); ++ ++ set_btree_bset_end(n1, n1->set); ++ ++ six_unlock_write(&n2->lock); ++ bch2_btree_node_free_never_inserted(c, n2); ++ six_unlock_intent(&n2->lock); ++ ++ memmove(new_nodes + i - 1, ++ new_nodes + i, ++ sizeof(new_nodes[0]) * (nr_new_nodes - i)); ++ new_nodes[--nr_new_nodes] = NULL; ++ } else if (u64s) { ++ /* move part of n2 into n1 */ ++ n1->key.k.p = n1->data->max_key = ++ bkey_unpack_pos(n1, last); ++ ++ n2->data->min_key = ++ btree_type_successor(iter->btree_id, ++ n1->data->max_key); ++ ++ memcpy_u64s(vstruct_last(s1), ++ s2->start, u64s); ++ le16_add_cpu(&s1->u64s, u64s); ++ ++ memmove(s2->start, ++ vstruct_idx(s2, u64s), ++ (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); ++ s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); ++ ++ set_btree_bset_end(n1, n1->set); ++ set_btree_bset_end(n2, n2->set); ++ } ++ } ++ ++ for (i = 0; i < nr_new_nodes; i++) { ++ struct btree *n = new_nodes[i]; ++ ++ recalc_packed_keys(n); ++ btree_node_reset_sib_u64s(n); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->lock); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ } ++ ++ /* ++ * The keys for the old nodes get deleted. 
We don't want to insert keys ++ * that compare equal to the keys for the new nodes we'll also be ++ * inserting - we can't because keys on a keylist must be strictly ++ * greater than the previous keys, and we also don't need to since the ++ * key for the new node will serve the same purpose (overwriting the key ++ * for the old node). ++ */ ++ for (i = 0; i < nr_old_nodes; i++) { ++ struct bkey_i delete; ++ unsigned j; ++ ++ for (j = 0; j < nr_new_nodes; j++) ++ if (!bkey_cmp(old_nodes[i]->key.k.p, ++ new_nodes[j]->key.k.p)) ++ goto next; ++ ++ bkey_init(&delete.k); ++ delete.k.p = old_nodes[i]->key.k.p; ++ bch2_keylist_add_in_order(&keylist, &delete); ++next: ++ i = i; ++ } ++ ++ /* ++ * Keys for the new nodes get inserted: bch2_btree_insert_keys() only ++ * does the lookup once and thus expects the keys to be in sorted order ++ * so we have to make sure the new keys are correctly ordered with ++ * respect to the deleted keys added in the previous loop ++ */ ++ for (i = 0; i < nr_new_nodes; i++) ++ bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); ++ ++ /* Insert the newly coalesced nodes */ ++ bch2_btree_insert_node(as, parent, iter, &keylist, 0); ++ ++ BUG_ON(!bch2_keylist_empty(&keylist)); ++ ++ BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]); ++ ++ bch2_btree_iter_node_replace(iter, new_nodes[0]); ++ ++ for (i = 0; i < nr_new_nodes; i++) ++ bch2_open_buckets_put(c, &new_nodes[i]->ob); ++ ++ /* Free the old nodes and update our sliding window */ ++ for (i = 0; i < nr_old_nodes; i++) { ++ bch2_btree_node_free_inmem(c, old_nodes[i], iter); ++ ++ /* ++ * the index update might have triggered a split, in which case ++ * the nodes we coalesced - the new nodes we just created - ++ * might not be sibling nodes anymore - don't add them to the ++ * sliding window (except the first): ++ */ ++ if (!i) { ++ old_nodes[i] = new_nodes[i]; ++ } else { ++ old_nodes[i] = NULL; ++ } ++ } ++ ++ for (i = 0; i < nr_new_nodes; i++) ++ six_unlock_intent(&new_nodes[i]->lock); ++ ++ bch2_btree_update_done(as); ++ bch2_keylist_free(&keylist, NULL); ++} ++ ++static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ unsigned i; ++ ++ /* Sliding window of adjacent btree nodes */ ++ struct btree *merge[GC_MERGE_NODES]; ++ u32 lock_seq[GC_MERGE_NODES]; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * XXX: We don't have a good way of positively matching on sibling nodes ++ * that have the same parent - this code works by handling the cases ++ * where they might not have the same parent, and is thus fragile. Ugh. ++ * ++ * Perhaps redo this to use multiple linked iterators? 
++ */ ++ memset(merge, 0, sizeof(merge)); ++ ++ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, ++ BTREE_MAX_DEPTH, 0, ++ BTREE_ITER_PREFETCH, b) { ++ memmove(merge + 1, merge, ++ sizeof(merge) - sizeof(merge[0])); ++ memmove(lock_seq + 1, lock_seq, ++ sizeof(lock_seq) - sizeof(lock_seq[0])); ++ ++ merge[0] = b; ++ ++ for (i = 1; i < GC_MERGE_NODES; i++) { ++ if (!merge[i] || ++ !six_relock_intent(&merge[i]->lock, lock_seq[i])) ++ break; ++ ++ if (merge[i]->level != merge[0]->level) { ++ six_unlock_intent(&merge[i]->lock); ++ break; ++ } ++ } ++ memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); ++ ++ bch2_coalesce_nodes(c, iter, merge); ++ ++ for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { ++ lock_seq[i] = merge[i]->lock.state.seq; ++ six_unlock_intent(&merge[i]->lock); ++ } ++ ++ lock_seq[0] = merge[0]->lock.state.seq; ++ ++ if (kthread && kthread_should_stop()) { ++ bch2_trans_exit(&trans); ++ return -ESHUTDOWN; ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ ++ /* ++ * If the parent node wasn't relocked, it might have been split ++ * and the nodes in our sliding window might not have the same ++ * parent anymore - blow away the sliding window: ++ */ ++ if (btree_iter_node(iter, iter->level + 1) && ++ !btree_node_intent_locked(iter, iter->level + 1)) ++ memset(merge + 1, 0, ++ (GC_MERGE_NODES - 1) * sizeof(merge[0])); ++ } ++ return bch2_trans_exit(&trans); ++} ++ ++/** ++ * bch_coalesce - coalesce adjacent nodes with low occupancy ++ */ ++void bch2_coalesce(struct bch_fs *c) ++{ ++ enum btree_id id; ++ ++ down_read(&c->gc_lock); ++ trace_gc_coalesce_start(c); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ int ret = c->btree_roots[id].b ++ ? bch2_coalesce_btree(c, id) ++ : 0; ++ ++ if (ret) { ++ if (ret != -ESHUTDOWN) ++ bch_err(c, "btree coalescing failed: %d", ret); ++ return; ++ } ++ } ++ ++ trace_gc_coalesce_end(c); ++ up_read(&c->gc_lock); ++} ++ ++static int bch2_gc_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ unsigned long last = atomic_long_read(&clock->now); ++ unsigned last_kick = atomic_read(&c->kick_gc); ++ int ret; ++ ++ set_freezable(); ++ ++ while (1) { ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ if (kthread_should_stop()) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (atomic_read(&c->kick_gc) != last_kick) ++ break; ++ ++ if (c->btree_gc_periodic) { ++ unsigned long next = last + c->capacity / 16; ++ ++ if (atomic_long_read(&clock->now) >= next) ++ break; ++ ++ bch2_io_clock_schedule_timeout(clock, next); ++ } else { ++ schedule(); ++ } ++ ++ try_to_freeze(); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ last = atomic_long_read(&clock->now); ++ last_kick = atomic_read(&c->kick_gc); ++ ++ ret = bch2_gc(c, NULL, false, false); ++ if (ret) ++ bch_err(c, "btree gc failed: %i", ret); ++ ++ debug_check_no_locks_held(); ++ } ++ ++ return 0; ++} ++ ++void bch2_gc_thread_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ p = c->gc_thread; ++ c->gc_thread = NULL; ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_gc_thread_start(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ BUG_ON(c->gc_thread); ++ ++ p = kthread_create(bch2_gc_thread, c, "bch_gc"); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ c->gc_thread = p; ++ wake_up_process(p); ++ return 0; ++} +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +new file mode 100644 +index 000000000000..bd5f2752954f +--- /dev/null ++++ 
b/fs/bcachefs/btree_gc.h +@@ -0,0 +1,120 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_GC_H ++#define _BCACHEFS_BTREE_GC_H ++ ++#include "btree_types.h" ++ ++void bch2_coalesce(struct bch_fs *); ++ ++struct journal_keys; ++int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); ++void bch2_gc_thread_stop(struct bch_fs *); ++int bch2_gc_thread_start(struct bch_fs *); ++void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); ++ ++/* ++ * For concurrent mark and sweep (with other index updates), we define a total ++ * ordering of _all_ references GC walks: ++ * ++ * Note that some references will have the same GC position as others - e.g. ++ * everything within the same btree node; in those cases we're relying on ++ * whatever locking exists for where those references live, i.e. the write lock ++ * on a btree node. ++ * ++ * That locking is also required to ensure GC doesn't pass the updater in ++ * between the updater adding/removing the reference and updating the GC marks; ++ * without that, we would at best double count sometimes. ++ * ++ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ ++ * be held that prevents GC from passing the position the updater is at. ++ * ++ * (What about the start of gc, when we're clearing all the marks? GC clears the ++ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc ++ * position inside its cmpxchg loop, so crap magically works). ++ */ ++ ++/* Position of (the start of) a gc phase: */ ++static inline struct gc_pos gc_phase(enum gc_phase phase) ++{ ++ return (struct gc_pos) { ++ .phase = phase, ++ .pos = POS_MIN, ++ .level = 0, ++ }; ++} ++ ++static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) ++{ ++ if (l.phase != r.phase) ++ return l.phase < r.phase ? -1 : 1; ++ if (bkey_cmp(l.pos, r.pos)) ++ return bkey_cmp(l.pos, r.pos); ++ if (l.level != r.level) ++ return l.level < r.level ? -1 : 1; ++ return 0; ++} ++ ++static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) ++{ ++ switch (id) { ++#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; ++ BCH_BTREE_IDS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct gc_pos gc_pos_btree(enum btree_id id, ++ struct bpos pos, unsigned level) ++{ ++ return (struct gc_pos) { ++ .phase = btree_id_to_gc_phase(id), ++ .pos = pos, ++ .level = level, ++ }; ++} ++ ++/* ++ * GC position of the pointers within a btree node: note, _not_ for &b->key ++ * itself, that lives in the parent node: ++ */ ++static inline struct gc_pos gc_pos_btree_node(struct btree *b) ++{ ++ return gc_pos_btree(b->btree_id, b->key.k.p, b->level); ++} ++ ++/* ++ * GC position of the pointer to a btree root: we don't use ++ * gc_pos_pointer_to_btree_node() here to avoid a potential race with ++ * btree_split() increasing the tree depth - the new root will have level > the ++ * old root and thus have a greater gc position than the old root, but that ++ * would be incorrect since once gc has marked the root it's not coming back. ++ */ ++static inline struct gc_pos gc_pos_btree_root(enum btree_id id) ++{ ++ return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); ++} ++ ++static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) ++{ ++ return (struct gc_pos) { ++ .phase = GC_PHASE_ALLOC, ++ .pos = POS(ob ? 
ob - c->open_buckets : 0, 0), ++ }; ++} ++ ++static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) ++{ ++ unsigned seq; ++ bool ret; ++ ++ do { ++ seq = read_seqcount_begin(&c->gc_pos_lock); ++ ret = gc_pos_cmp(pos, c->gc_pos) <= 0; ++ } while (read_seqcount_retry(&c->gc_pos_lock, seq)); ++ ++ return ret; ++} ++ ++#endif /* _BCACHEFS_BTREE_GC_H */ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +new file mode 100644 +index 000000000000..591980d2011f +--- /dev/null ++++ b/fs/bcachefs/btree_io.c +@@ -0,0 +1,1703 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++#include ++ ++static void verify_no_dups(struct btree *b, ++ struct bkey_packed *start, ++ struct bkey_packed *end) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bkey_packed *k; ++ ++ for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) { ++ struct bkey l = bkey_unpack_key(b, k); ++ struct bkey r = bkey_unpack_key(b, bkey_next(k)); ++ ++ BUG_ON(btree_node_is_extents(b) ++ ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 ++ : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); ++ //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0); ++ } ++#endif ++} ++ ++static void clear_needs_whiteout(struct bset *i) ++{ ++ struct bkey_packed *k; ++ ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) ++ k->needs_whiteout = false; ++} ++ ++static void set_needs_whiteout(struct bset *i) ++{ ++ struct bkey_packed *k; ++ ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) ++ k->needs_whiteout = true; ++} ++ ++static void btree_bounce_free(struct bch_fs *c, unsigned order, ++ bool used_mempool, void *p) ++{ ++ if (used_mempool) ++ mempool_free(p, &c->btree_bounce_pool); ++ else ++ vpfree(p, PAGE_SIZE << order); ++} ++ ++static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, ++ bool *used_mempool) ++{ ++ void *p; ++ ++ BUG_ON(order > btree_page_order(c)); ++ ++ *used_mempool = false; ++ p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); ++ if (p) ++ return p; ++ ++ *used_mempool = true; ++ return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); ++} ++ ++static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, ++ bool compacting, ++ enum compact_mode mode) ++{ ++ unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); ++ unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; ++ ++ if (mode == COMPACT_LAZY) { ++ if (should_compact_bset_lazy(b, t) || ++ (compacting && !bset_written(b, bset(b, t)))) ++ return dead_u64s; ++ } else { ++ if (bset_written(b, bset(b, t))) ++ return dead_u64s; ++ } ++ ++ return 0; ++} ++ ++bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, ++ enum compact_mode mode) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bset_tree *t; ++ struct bkey_packed *whiteouts = NULL; ++ struct bkey_packed *u_start, *u_pos; ++ struct sort_iter sort_iter; ++ unsigned order, whiteout_u64s = 0, u64s; ++ bool used_mempool, compacting = false; ++ ++ for_each_bset(b, t) ++ whiteout_u64s += should_compact_bset(b, t, ++ whiteout_u64s != 0, mode); ++ ++ if (!whiteout_u64s) ++ return 
false; ++ ++ sort_iter_init(&sort_iter, b); ++ ++ whiteout_u64s += b->whiteout_u64s; ++ order = get_order(whiteout_u64s * sizeof(u64)); ++ ++ whiteouts = btree_bounce_alloc(c, order, &used_mempool); ++ u_start = u_pos = whiteouts; ++ ++ memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), ++ b->whiteout_u64s); ++ u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); ++ ++ sort_iter_add(&sort_iter, u_start, u_pos); ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k, *n, *out, *start, *end; ++ struct btree_node_entry *src = NULL, *dst = NULL; ++ ++ if (t != b->set && !bset_written(b, i)) { ++ src = container_of(i, struct btree_node_entry, keys); ++ dst = max(write_block(b), ++ (void *) btree_bkey_last(b, t -1)); ++ } ++ ++ if (!should_compact_bset(b, t, compacting, mode)) { ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src) + ++ le16_to_cpu(src->keys.u64s) * ++ sizeof(u64)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ continue; ++ } ++ ++ compacting = true; ++ u_start = u_pos; ++ start = i->start; ++ end = vstruct_last(i); ++ ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ ++ out = i->start; ++ ++ for (k = start; k != end; k = n) { ++ n = bkey_next(k); ++ ++ if (bkey_deleted(k) && btree_node_is_extents(b)) ++ continue; ++ ++ if (bkey_whiteout(k) && !k->needs_whiteout) ++ continue; ++ ++ if (bkey_whiteout(k)) { ++ unreserve_whiteout(b, k); ++ memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); ++ set_bkeyp_val_u64s(f, u_pos, 0); ++ u_pos = bkey_next(u_pos); ++ } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { ++ bkey_copy(out, k); ++ out = bkey_next(out); ++ } ++ } ++ ++ sort_iter_add(&sort_iter, u_start, u_pos); ++ ++ if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ } ++ } ++ ++ b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; ++ ++ BUG_ON((void *) unwritten_whiteouts_start(c, b) < ++ (void *) btree_bkey_last(b, bset_tree_last(b))); ++ ++ u64s = (btree_node_is_extents(b) ++ ? 
bch2_sort_extent_whiteouts ++ : bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b), ++ &sort_iter); ++ ++ BUG_ON(u64s > b->whiteout_u64s); ++ BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b)); ++ BUG_ON(u_pos != whiteouts && !u64s); ++ ++ if (u64s != b->whiteout_u64s) { ++ void *src = unwritten_whiteouts_start(c, b); ++ ++ b->whiteout_u64s = u64s; ++ memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); ++ } ++ ++ verify_no_dups(b, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b)); ++ ++ btree_bounce_free(c, order, used_mempool, whiteouts); ++ ++ if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) ++ bch2_btree_build_aux_trees(b); ++ ++ bch_btree_keys_u64s_remaining(c, b); ++ bch2_verify_btree_nr_keys(b); ++ ++ return true; ++} ++ ++static bool bch2_drop_whiteouts(struct btree *b) ++{ ++ struct bset_tree *t; ++ bool ret = false; ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k, *n, *out, *start, *end; ++ ++ if (!should_compact_bset(b, t, true, COMPACT_WRITTEN)) ++ continue; ++ ++ start = btree_bkey_first(b, t); ++ end = btree_bkey_last(b, t); ++ ++ if (!bset_written(b, i) && ++ t != b->set) { ++ struct bset *dst = ++ max_t(struct bset *, write_block(b), ++ (void *) btree_bkey_last(b, t -1)); ++ ++ memmove(dst, i, sizeof(struct bset)); ++ i = dst; ++ set_btree_bset(b, t, i); ++ } ++ ++ out = i->start; ++ ++ for (k = start; k != end; k = n) { ++ n = bkey_next(k); ++ ++ if (!bkey_whiteout(k)) { ++ bkey_copy(out, k); ++ out = bkey_next(out); ++ } ++ } ++ ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ bch2_bset_set_no_aux_tree(b, t); ++ ret = true; ++ } ++ ++ bch2_verify_btree_nr_keys(b); ++ ++ return ret; ++} ++ ++static void btree_node_sort(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter, ++ unsigned start_idx, ++ unsigned end_idx, ++ bool filter_whiteouts) ++{ ++ struct btree_node *out; ++ struct sort_iter sort_iter; ++ struct bset_tree *t; ++ struct bset *start_bset = bset(b, &b->set[start_idx]); ++ bool used_mempool = false; ++ u64 start_time, seq = 0; ++ unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; ++ bool sorting_entire_node = start_idx == 0 && ++ end_idx == b->nsets; ++ ++ sort_iter_init(&sort_iter, b); ++ ++ for (t = b->set + start_idx; ++ t < b->set + end_idx; ++ t++) { ++ u64s += le16_to_cpu(bset(b, t)->u64s); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ } ++ ++ order = sorting_entire_node ++ ? btree_page_order(c) ++ : get_order(__vstruct_bytes(struct btree_node, u64s)); ++ ++ out = btree_bounce_alloc(c, order, &used_mempool); ++ ++ start_time = local_clock(); ++ ++ if (btree_node_is_extents(b)) ++ filter_whiteouts = bset_written(b, start_bset); ++ ++ u64s = (btree_node_is_extents(b) ++ ? 
bch2_sort_extents ++ : bch2_sort_keys)(out->keys.start, ++ &sort_iter, ++ filter_whiteouts); ++ ++ out->keys.u64s = cpu_to_le16(u64s); ++ ++ BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); ++ ++ if (sorting_entire_node) ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ /* Make sure we preserve bset journal_seq: */ ++ for (t = b->set + start_idx; t < b->set + end_idx; t++) ++ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); ++ start_bset->journal_seq = cpu_to_le64(seq); ++ ++ if (sorting_entire_node) { ++ unsigned u64s = le16_to_cpu(out->keys.u64s); ++ ++ BUG_ON(order != btree_page_order(c)); ++ ++ /* ++ * Our temporary buffer is the same size as the btree node's ++ * buffer, we can just swap buffers instead of doing a big ++ * memcpy() ++ */ ++ *out = *b->data; ++ out->keys.u64s = cpu_to_le16(u64s); ++ swap(out, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ } else { ++ start_bset->u64s = out->keys.u64s; ++ memcpy_u64s(start_bset->start, ++ out->keys.start, ++ le16_to_cpu(out->keys.u64s)); ++ } ++ ++ for (i = start_idx + 1; i < end_idx; i++) ++ b->nr.bset_u64s[start_idx] += ++ b->nr.bset_u64s[i]; ++ ++ b->nsets -= shift; ++ ++ for (i = start_idx + 1; i < b->nsets; i++) { ++ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; ++ b->set[i] = b->set[i + shift]; ++ } ++ ++ for (i = b->nsets; i < MAX_BSETS; i++) ++ b->nr.bset_u64s[i] = 0; ++ ++ set_btree_bset_end(b, &b->set[start_idx]); ++ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); ++ ++ btree_bounce_free(c, order, used_mempool, out); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_btree_sort_into(struct bch_fs *c, ++ struct btree *dst, ++ struct btree *src) ++{ ++ struct btree_nr_keys nr; ++ struct btree_node_iter src_iter; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(dst->nsets != 1); ++ ++ bch2_bset_set_no_aux_tree(dst, dst->set); ++ ++ bch2_btree_node_iter_init_from_start(&src_iter, src); ++ ++ if (btree_node_is_extents(src)) ++ nr = bch2_sort_repack_merge(c, btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); ++ else ++ nr = bch2_sort_repack(btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ set_btree_bset_end(dst, dst->set); ++ ++ dst->nr.live_u64s += nr.live_u64s; ++ dst->nr.bset_u64s[0] += nr.bset_u64s[0]; ++ dst->nr.packed_keys += nr.packed_keys; ++ dst->nr.unpacked_keys += nr.unpacked_keys; ++ ++ bch2_verify_btree_nr_keys(dst); ++} ++ ++#define SORT_CRIT (4096 / sizeof(u64)) ++ ++/* ++ * We're about to add another bset to the btree node, so if there's currently ++ * too many bsets - sort some of them together: ++ */ ++static bool btree_node_compact(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ unsigned unwritten_idx; ++ bool ret = false; ++ ++ for (unwritten_idx = 0; ++ unwritten_idx < b->nsets; ++ unwritten_idx++) ++ if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) ++ break; ++ ++ if (b->nsets - unwritten_idx > 1) { ++ btree_node_sort(c, b, iter, unwritten_idx, ++ b->nsets, false); ++ ret = true; ++ } ++ ++ if (unwritten_idx > 1) { ++ btree_node_sort(c, b, iter, 0, unwritten_idx, false); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++void bch2_btree_build_aux_trees(struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ bch2_bset_build_aux_tree(b, t, ++ !bset_written(b, bset(b, t)) && ++ t == bset_tree_last(b)); ++} ++ ++/* ++ * @bch_btree_init_next - initialize a new 
(unwritten) bset that can then be ++ * inserted into ++ * ++ * Safe to call if there already is an unwritten bset - will only add a new bset ++ * if @b doesn't already have one. ++ * ++ * Returns true if we sorted (i.e. invalidated iterators ++ */ ++void bch2_btree_init_next(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct btree_node_entry *bne; ++ bool did_sort; ++ ++ EBUG_ON(!(b->lock.state.seq & 1)); ++ EBUG_ON(iter && iter->l[b->level].b != b); ++ ++ did_sort = btree_node_compact(c, b, iter); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ if (iter && did_sort) ++ bch2_btree_iter_reinit_node(iter, b); ++} ++ ++static struct nonce btree_nonce(struct bset *i, unsigned offset) ++{ ++ return (struct nonce) {{ ++ [0] = cpu_to_le32(offset), ++ [1] = ((__le32 *) &i->seq)[0], ++ [2] = ((__le32 *) &i->seq)[1], ++ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, ++ }}; ++} ++ ++static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) ++{ ++ struct nonce nonce = btree_nonce(i, offset); ++ ++ if (!offset) { ++ struct btree_node *bn = container_of(i, struct btree_node, keys); ++ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; ++ ++ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, ++ bytes); ++ ++ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); ++ } ++ ++ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, ++ vstruct_end(i) - (void *) i->_data); ++} ++ ++static void btree_err_msg(struct printbuf *out, struct bch_fs *c, ++ struct btree *b, struct bset *i, ++ unsigned offset, int write) ++{ ++ pr_buf(out, "error validating btree node %s" ++ "at btree %u level %u/%u\n" ++ "pos %llu:%llu node offset %u", ++ write ? "before write " : "", ++ b->btree_id, b->level, ++ c->btree_roots[b->btree_id].level, ++ b->key.k.p.inode, b->key.k.p.offset, ++ b->written); ++ if (i) ++ pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); ++} ++ ++enum btree_err_type { ++ BTREE_ERR_FIXABLE, ++ BTREE_ERR_WANT_RETRY, ++ BTREE_ERR_MUST_RETRY, ++ BTREE_ERR_FATAL, ++}; ++ ++enum btree_validate_ret { ++ BTREE_RETRY_READ = 64, ++}; ++ ++#define btree_err(type, c, b, i, msg, ...) \ ++({ \ ++ __label__ out; \ ++ char _buf[300]; \ ++ struct printbuf out = PBUF(_buf); \ ++ \ ++ btree_err_msg(&out, c, b, i, b->written, write); \ ++ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ ++ \ ++ if (type == BTREE_ERR_FIXABLE && \ ++ write == READ && \ ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ ++ mustfix_fsck_err(c, "%s", _buf); \ ++ goto out; \ ++ } \ ++ \ ++ switch (write) { \ ++ case READ: \ ++ bch_err(c, "%s", _buf); \ ++ \ ++ switch (type) { \ ++ case BTREE_ERR_FIXABLE: \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ case BTREE_ERR_WANT_RETRY: \ ++ if (have_retry) { \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case BTREE_ERR_MUST_RETRY: \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ case BTREE_ERR_FATAL: \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write: %s", _buf); \ ++ \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++out: \ ++ true; \ ++}) ++ ++#define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) ++ ++static int validate_bset(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned sectors, ++ unsigned *whiteout_u64s, int write, ++ bool have_retry) ++{ ++ struct bkey_packed *k, *prev = NULL; ++ struct bpos prev_pos = POS_MIN; ++ bool seen_non_whiteout = false; ++ unsigned version; ++ const char *err; ++ int ret = 0; ++ ++ if (i == &b->data->keys) { ++ /* These indicate that we read the wrong btree node: */ ++ btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id, ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect btree id"); ++ ++ btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level, ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect level"); ++ ++ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { ++ u64 *p = (u64 *) &b->data->ptr; ++ ++ *p = swab64(*p); ++ bch2_bpos_swab(&b->data->min_key); ++ bch2_bpos_swab(&b->data->max_key); ++ } ++ ++ btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p), ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect max key"); ++ ++ /* XXX: ideally we would be validating min_key too */ ++#if 0 ++ /* ++ * not correct anymore, due to btree node write error ++ * handling ++ * ++ * need to add b->data->seq to btree keys and verify ++ * against that ++ */ ++ btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), ++ b->data->ptr), ++ BTREE_ERR_FATAL, c, b, i, ++ "incorrect backpointer"); ++#endif ++ err = bch2_bkey_format_validate(&b->data->format); ++ btree_err_on(err, ++ BTREE_ERR_FATAL, c, b, i, ++ "invalid bkey format: %s", err); ++ } ++ ++ version = le16_to_cpu(i->version); ++ btree_err_on((version != BCH_BSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, ++ BTREE_ERR_FATAL, c, b, i, ++ "unsupported bset version"); ++ ++ if (btree_err_on(b->written + sectors > c->opts.btree_node_size, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "bset past end of btree node")) { ++ i->u64s = 0; ++ return 0; ++ } ++ ++ btree_err_on(b->written && !i->u64s, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "empty bset"); ++ ++ if (!BSET_SEPARATE_WHITEOUTS(i)) { ++ seen_non_whiteout = true; ++ *whiteout_u64s = 0; ++ } ++ ++ for (k = i->start; ++ k != vstruct_last(i);) { ++ struct bkey_s_c u; ++ struct bkey tmp; ++ const char *invalid; ++ ++ if (btree_err_on(!k->u64s, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "KEY_U64s 0: %zu bytes of metadata lost", ++ vstruct_end(i) - (void *) k)) { ++ i->u64s = cpu_to_le16((u64 *) k - i->_data); ++ break; ++ } ++ ++ if (btree_err_on(bkey_next(k) > vstruct_last(i), ++ BTREE_ERR_FIXABLE, c, b, i, ++ "key extends past end of bset")) { ++ i->u64s = cpu_to_le16((u64 *) k - i->_data); ++ break; ++ } ++ ++ if (btree_err_on(k->format > KEY_FORMAT_CURRENT, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey format %u", k->format)) { ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) ++ bch2_bkey_swab(&b->format, k); ++ ++ if (!write && ++ version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(btree_node_type(b), k, write); ++ ++ u = bkey_disassemble(b, k, &tmp); ++ ++ invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, u) ?: ++ (write ? 
bch2_bkey_val_invalid(c, u) : NULL); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, u); ++ btree_err(BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey:\n%s\n%s", invalid, buf); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ if (write && ++ version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(btree_node_type(b), k, write); ++ ++ /* ++ * with the separate whiteouts thing (used for extents), the ++ * second set of keys actually can have whiteouts too, so we ++ * can't solely go off bkey_whiteout()... ++ */ ++ ++ if (!seen_non_whiteout && ++ (!bkey_whiteout(k) || ++ (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { ++ *whiteout_u64s = k->_data - i->_data; ++ seen_non_whiteout = true; ++ } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { ++ btree_err(BTREE_ERR_FATAL, c, b, i, ++ "keys out of order: %llu:%llu > %llu:%llu", ++ prev_pos.inode, ++ prev_pos.offset, ++ u.k->p.inode, ++ bkey_start_offset(u.k)); ++ /* XXX: repair this */ ++ } ++ ++ prev_pos = u.k->p; ++ prev = k; ++ k = bkey_next(k); ++ } ++ ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++fsck_err: ++ return ret; ++} ++ ++int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) ++{ ++ struct btree_node_entry *bne; ++ struct btree_node_iter_large *iter; ++ struct btree_node *sorted; ++ struct bkey_packed *k; ++ struct bset *i; ++ bool used_mempool, blacklisted; ++ unsigned u64s; ++ int ret, retry_read = 0, write = READ; ++ ++ iter = mempool_alloc(&c->fill_iter, GFP_NOIO); ++ iter->used = 0; ++ ++ if (bch2_meta_read_fault("btree")) ++ btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "dynamic fault"); ++ ++ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "bad magic"); ++ ++ btree_err_on(!b->data->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "bad btree header"); ++ ++ while (b->written < c->opts.btree_node_size) { ++ unsigned sectors, whiteout_u64s = 0; ++ struct nonce nonce; ++ struct bch_csum csum; ++ bool first = !b->written; ++ ++ if (!b->written) { ++ i = &b->data->keys; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "unknown checksum type"); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); ++ ++ btree_err_on(bch2_crc_cmp(csum, b->data->csum), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "invalid checksum"); ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ sectors = vstruct_sectors(b->data, c->block_bits); ++ ++ btree_node_set_format(b, b->data->format); ++ } else { ++ bne = write_block(b); ++ i = &bne->keys; ++ ++ if (i->seq != b->data->keys.seq) ++ break; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "unknown checksum type"); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ btree_err_on(bch2_crc_cmp(csum, bne->csum), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "invalid checksum"); ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ sectors = vstruct_sectors(bne, c->block_bits); ++ } ++ ++ ret = validate_bset(c, b, i, sectors, &whiteout_u64s, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ b->written += sectors; ++ ++ blacklisted = bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(i->journal_seq), ++ true); ++ ++ btree_err_on(blacklisted && first, ++ 
BTREE_ERR_FIXABLE, c, b, i, ++ "first btree node bset has blacklisted journal seq"); ++ if (blacklisted && !first) ++ continue; ++ ++ bch2_btree_node_iter_large_push(iter, b, ++ i->start, ++ vstruct_idx(i, whiteout_u64s)); ++ ++ bch2_btree_node_iter_large_push(iter, b, ++ vstruct_idx(i, whiteout_u64s), ++ vstruct_last(i)); ++ } ++ ++ for (bne = write_block(b); ++ bset_byte_offset(b, bne) < btree_bytes(c); ++ bne = (void *) bne + block_bytes(c)) ++ btree_err_on(bne->keys.seq == b->data->keys.seq, ++ BTREE_ERR_WANT_RETRY, c, b, NULL, ++ "found bset signature after last bset"); ++ ++ sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool); ++ sorted->keys.u64s = 0; ++ ++ set_btree_bset(b, b->set, &b->data->keys); ++ ++ b->nr = btree_node_is_extents(b) ++ ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter) ++ : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter); ++ ++ u64s = le16_to_cpu(sorted->keys.u64s); ++ *sorted = *b->data; ++ sorted->keys.u64s = cpu_to_le16(u64s); ++ swap(sorted, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ b->nsets = 1; ++ ++ BUG_ON(b->nr.live_u64s != u64s); ++ ++ btree_bounce_free(c, btree_page_order(c), used_mempool, sorted); ++ ++ i = &b->data->keys; ++ for (k = i->start; k != vstruct_last(i);) { ++ struct bkey tmp; ++ struct bkey_s_c u = bkey_disassemble(b, k, &tmp); ++ const char *invalid = bch2_bkey_val_invalid(c, u); ++ ++ if (invalid || ++ (inject_invalid_keys(c) && ++ !bversion_cmp(u.k->version, MAX_VERSION))) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, u); ++ btree_err(BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey %s: %s", buf, invalid); ++ ++ btree_keys_account_key_drop(&b->nr, 0, k); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ set_btree_bset_end(b, b->set); ++ continue; ++ } ++ ++ k = bkey_next(k); ++ } ++ ++ bch2_bset_build_aux_tree(b, b->set, false); ++ ++ set_needs_whiteout(btree_bset_first(b)); ++ ++ btree_node_reset_sib_u64s(b); ++out: ++ mempool_free(iter, &c->fill_iter); ++ return retry_read; ++fsck_err: ++ if (ret == BTREE_RETRY_READ) { ++ retry_read = 1; ++ } else { ++ bch2_inconsistent_error(c); ++ set_btree_node_read_error(b); ++ } ++ goto out; ++} ++ ++static void btree_node_read_work(struct work_struct *work) ++{ ++ struct btree_read_bio *rb = ++ container_of(work, struct btree_read_bio, work); ++ struct bch_fs *c = rb->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ struct btree *b = rb->bio.bi_private; ++ struct bio *bio = &rb->bio; ++ struct bch_io_failures failed = { .nr = 0 }; ++ bool can_retry; ++ ++ goto start; ++ while (1) { ++ bch_info(c, "retrying read"); ++ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ bio_reset(bio); ++ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; ++ bio->bi_iter.bi_sector = rb->pick.ptr.offset; ++ bio->bi_iter.bi_size = btree_bytes(c); ++ ++ if (rb->have_ioref) { ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ submit_bio_wait(bio); ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ } ++start: ++ bch2_dev_io_err_on(bio->bi_status, ca, "btree read"); ++ if (rb->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ rb->have_ioref = false; ++ ++ bch2_mark_io_failure(&failed, &rb->pick); ++ ++ can_retry = bch2_bkey_pick_read_device(c, ++ bkey_i_to_s_c(&b->key), ++ &failed, &rb->pick) > 0; ++ ++ if (!bio->bi_status && ++ !bch2_btree_node_read_done(c, b, can_retry)) ++ break; ++ ++ if (!can_retry) { ++ 
set_btree_node_read_error(b); ++ break; ++ } ++ } ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], ++ rb->start_time); ++ bio_put(&rb->bio); ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++} ++ ++static void btree_node_read_endio(struct bio *bio) ++{ ++ struct btree_read_bio *rb = ++ container_of(bio, struct btree_read_bio, bio); ++ struct bch_fs *c = rb->c; ++ ++ if (rb->have_ioref) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ bch2_latency_acct(ca, rb->start_time, READ); ++ } ++ ++ queue_work(system_unbound_wq, &rb->work); ++} ++ ++void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ++ bool sync) ++{ ++ struct extent_ptr_decoded pick; ++ struct btree_read_bio *rb; ++ struct bch_dev *ca; ++ struct bio *bio; ++ int ret; ++ ++ trace_btree_read(c, b); ++ ++ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ++ NULL, &pick); ++ if (bch2_fs_fatal_err_on(ret <= 0, c, ++ "btree node read error: no device to read from")) { ++ set_btree_node_read_error(b); ++ return; ++ } ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, ++ btree_bytes(c)), ++ &c->btree_bio); ++ rb = container_of(bio, struct btree_read_bio, bio); ++ rb->c = c; ++ rb->start_time = local_clock(); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ rb->pick = pick; ++ INIT_WORK(&rb->work, btree_node_read_work); ++ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bio->bi_end_io = btree_node_read_endio; ++ bio->bi_private = b; ++ bch2_bio_map(bio, b->data, btree_bytes(c)); ++ ++ set_btree_node_read_in_flight(b); ++ ++ if (rb->have_ioref) { ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE], ++ bio_sectors(bio)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ ++ if (sync) { ++ submit_bio_wait(bio); ++ ++ bio->bi_private = b; ++ btree_node_read_work(&rb->work); ++ } else { ++ submit_bio(bio); ++ } ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ ++ if (sync) ++ btree_node_read_work(&rb->work); ++ else ++ queue_work(system_unbound_wq, &rb->work); ++ ++ } ++} ++ ++int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, ++ const struct bkey_i *k, unsigned level) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ BUG_ON(IS_ERR(b)); ++ ++ bkey_copy(&b->key, k); ++ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); ++ ++ bch2_btree_node_read(c, b, true); ++ ++ if (btree_node_read_error(b)) { ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_btree_set_root_for_read(c, b); ++err: ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); ++ ++ return ret; ++} ++ ++void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, ++ struct btree_write *w) ++{ ++ unsigned long old, new, v = READ_ONCE(b->will_make_reachable); ++ ++ do { ++ old = new = v; ++ if (!(old & 1)) ++ break; ++ ++ new &= ~1UL; ++ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); ++ ++ if (old & 1) ++ closure_put(&((struct btree_update *) new)->cl); ++ ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ 
closure_wake_up(&w->wait); ++} ++ ++static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_write *w = btree_prev_write(b); ++ ++ bch2_btree_complete_write(c, b, w); ++ btree_node_io_unlock(b); ++} ++ ++static void bch2_btree_node_write_error(struct bch_fs *c, ++ struct btree_write_bio *wbio) ++{ ++ struct btree *b = wbio->wbio.bio.bi_private; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ struct bkey_i_btree_ptr *new_key; ++ struct bkey_s_btree_ptr bp; ++ struct bch_extent_ptr *ptr; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_node_iter(&trans, b->btree_id, b->key.k.p, ++ BTREE_MAX_DEPTH, b->level, 0); ++retry: ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto err; ++ ++ /* has node been freed? */ ++ if (iter->l[b->level].b != b) { ++ /* node has been freed: */ ++ BUG_ON(!btree_node_dying(b)); ++ goto out; ++ } ++ ++ BUG_ON(!btree_node_hashed(b)); ++ ++ bkey_copy(&tmp.k, &b->key); ++ ++ new_key = bkey_i_to_btree_ptr(&tmp.k); ++ bp = btree_ptr_i_to_s(new_key); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, ++ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); ++ ++ if (!bch2_bkey_nr_ptrs(bp.s_c)) ++ goto err; ++ ++ ret = bch2_btree_node_update_key(c, iter, b, new_key); ++ if (ret == -EINTR) ++ goto retry; ++ if (ret) ++ goto err; ++out: ++ bch2_trans_exit(&trans); ++ bio_put(&wbio->wbio.bio); ++ btree_node_write_done(c, b); ++ return; ++err: ++ set_btree_node_noevict(b); ++ bch2_fs_fatal_error(c, "fatal error writing btree node"); ++ goto out; ++} ++ ++void bch2_btree_write_error_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ btree_write_error_work); ++ struct bio *bio; ++ ++ while (1) { ++ spin_lock_irq(&c->btree_write_error_lock); ++ bio = bio_list_pop(&c->btree_write_error_list); ++ spin_unlock_irq(&c->btree_write_error_lock); ++ ++ if (!bio) ++ break; ++ ++ bch2_btree_node_write_error(c, ++ container_of(bio, struct btree_write_bio, wbio.bio)); ++ } ++} ++ ++static void btree_node_write_work(struct work_struct *work) ++{ ++ struct btree_write_bio *wbio = ++ container_of(work, struct btree_write_bio, work); ++ struct bch_fs *c = wbio->wbio.c; ++ struct btree *b = wbio->wbio.bio.bi_private; ++ ++ btree_bounce_free(c, ++ wbio->wbio.order, ++ wbio->wbio.used_mempool, ++ wbio->data); ++ ++ if (wbio->wbio.failed.nr) { ++ unsigned long flags; ++ ++ spin_lock_irqsave(&c->btree_write_error_lock, flags); ++ bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); ++ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); ++ ++ queue_work(c->wq, &c->btree_write_error_work); ++ return; ++ } ++ ++ bio_put(&wbio->wbio.bio); ++ btree_node_write_done(c, b); ++} ++ ++static void btree_node_write_endio(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; ++ struct bch_write_bio *orig = parent ?: wbio; ++ struct bch_fs *c = wbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ unsigned long flags; ++ ++ if (wbio->have_ioref) ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ ++ if (bio->bi_status == BLK_STS_REMOVED || ++ bch2_dev_io_err_on(bio->bi_status, ca, "btree write") || ++ bch2_meta_write_fault("btree")) { ++ spin_lock_irqsave(&c->btree_write_error_lock, flags); ++ bch2_dev_list_add_dev(&orig->failed, wbio->dev); ++ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); ++ } ++ ++ if (wbio->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ ++ if (parent) { ++ bio_put(bio); ++ bio_endio(&parent->bio); ++ } else { ++ struct btree_write_bio *wb = ++ container_of(orig, struct btree_write_bio, wbio); ++ ++ INIT_WORK(&wb->work, btree_node_write_work); ++ queue_work(system_unbound_wq, &wb->work); ++ } ++} ++ ++static int validate_bset_for_write(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned sectors) ++{ ++ unsigned whiteout_u64s = 0; ++ int ret; ++ ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) ++ return -1; ++ ++ ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false); ++ if (ret) ++ bch2_inconsistent_error(c); ++ ++ return ret; ++} ++ ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_type_held) ++{ ++ struct btree_write_bio *wbio; ++ struct bset_tree *t; ++ struct bset *i; ++ struct btree_node *bn = NULL; ++ struct btree_node_entry *bne = NULL; ++ BKEY_PADDED(key) k; ++ struct bch_extent_ptr *ptr; ++ struct sort_iter sort_iter; ++ struct nonce nonce; ++ unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; ++ u64 seq = 0; ++ bool used_mempool; ++ unsigned long old, new; ++ bool validate_before_checksum = false; ++ void *data; ++ ++ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ return; ++ ++ /* ++ * We may only have a read lock on the btree node - the dirty bit is our ++ * "lock" against racing with other threads that may be trying to start ++ * a write, we do a write iff we clear the dirty bit. 
Since setting the ++ * dirty bit requires a write lock, we can't race with other threads ++ * redirtying it: ++ */ ++ do { ++ old = new = READ_ONCE(b->flags); ++ ++ if (!(old & (1 << BTREE_NODE_dirty))) ++ return; ++ ++ if (!btree_node_may_write(b)) ++ return; ++ ++ if (old & (1 << BTREE_NODE_write_in_flight)) { ++ btree_node_wait_on_io(b); ++ continue; ++ } ++ ++ new &= ~(1 << BTREE_NODE_dirty); ++ new &= ~(1 << BTREE_NODE_need_write); ++ new |= (1 << BTREE_NODE_write_in_flight); ++ new |= (1 << BTREE_NODE_just_written); ++ new ^= (1 << BTREE_NODE_write_idx); ++ } while (cmpxchg_acquire(&b->flags, old, new) != old); ++ ++ BUG_ON(btree_node_fake(b)); ++ BUG_ON((b->will_make_reachable != 0) != !b->written); ++ ++ BUG_ON(b->written >= c->opts.btree_node_size); ++ BUG_ON(b->written & (c->opts.block_size - 1)); ++ BUG_ON(bset_written(b, btree_bset_last(b))); ++ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); ++ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); ++ ++ /* ++ * We can't block on six_lock_write() here; another thread might be ++ * trying to get a journal reservation with read locks held, and getting ++ * a journal reservation might be blocked on flushing the journal and ++ * doing btree writes: ++ */ ++ if (lock_type_held == SIX_LOCK_intent && ++ six_trylock_write(&b->lock)) { ++ __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN); ++ six_unlock_write(&b->lock); ++ } else { ++ __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK); ++ } ++ ++ BUG_ON(b->uncompacted_whiteout_u64s); ++ ++ sort_iter_init(&sort_iter, b); ++ ++ bytes = !b->written ++ ? sizeof(struct btree_node) ++ : sizeof(struct btree_node_entry); ++ ++ bytes += b->whiteout_u64s * sizeof(u64); ++ ++ for_each_bset(b, t) { ++ i = bset(b, t); ++ ++ if (bset_written(b, i)) ++ continue; ++ ++ bytes += le16_to_cpu(i->u64s) * sizeof(u64); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ seq = max(seq, le64_to_cpu(i->journal_seq)); ++ } ++ ++ order = get_order(bytes); ++ data = btree_bounce_alloc(c, order, &used_mempool); ++ ++ if (!b->written) { ++ bn = data; ++ *bn = *b->data; ++ i = &bn->keys; ++ } else { ++ bne = data; ++ bne->keys = b->data->keys; ++ i = &bne->keys; ++ } ++ ++ i->journal_seq = cpu_to_le64(seq); ++ i->u64s = 0; ++ ++ if (!btree_node_is_extents(b)) { ++ sort_iter_add(&sort_iter, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b)); ++ SET_BSET_SEPARATE_WHITEOUTS(i, false); ++ } else { ++ memcpy_u64s(i->start, ++ unwritten_whiteouts_start(c, b), ++ b->whiteout_u64s); ++ i->u64s = cpu_to_le16(b->whiteout_u64s); ++ SET_BSET_SEPARATE_WHITEOUTS(i, true); ++ } ++ ++ b->whiteout_u64s = 0; ++ ++ u64s = btree_node_is_extents(b) ++ ? bch2_sort_extents(vstruct_last(i), &sort_iter, false) ++ : bch2_sort_keys(i->start, &sort_iter, false); ++ le16_add_cpu(&i->u64s, u64s); ++ ++ clear_needs_whiteout(i); ++ ++ /* do we have data to write? */ ++ if (b->written && !i->u64s) ++ goto nowrite; ++ ++ bytes_to_write = vstruct_end(i) - data; ++ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; ++ ++ memset(data + bytes_to_write, 0, ++ (sectors_to_write << 9) - bytes_to_write); ++ ++ BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); ++ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); ++ BUG_ON(i->seq != b->data->keys.seq); ++ ++ i->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) ++ : cpu_to_le16(c->sb.version); ++ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); ++ ++ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) ++ validate_before_checksum = true; ++ ++ /* validate_bset will be modifying: */ ++ if (le16_to_cpu(i->version) < ++ bcachefs_metadata_version_bkey_renumber) ++ validate_before_checksum = true; ++ ++ /* if we're going to be encrypting, check metadata validity first: */ ++ if (validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ ++ if (bn) ++ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); ++ else ++ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ /* if we're not encrypting, check metadata after checksumming: */ ++ if (!validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ /* ++ * We handle btree write errors by immediately halting the journal - ++ * after we've done that, we can't issue any subsequent btree writes ++ * because they might have pointers to new nodes that failed to write. ++ * ++ * Furthermore, there's no point in doing any more btree writes because ++ * with the journal stopped, we're never going to update the journal to ++ * reflect that those writes were done and the data flushed from the ++ * journal: ++ * ++ * Make sure to update b->written so bch2_btree_init_next() doesn't ++ * break: ++ */ ++ if (bch2_journal_error(&c->journal) || ++ c->opts.nochanges) ++ goto err; ++ ++ trace_btree_write(b, bytes_to_write, sectors_to_write); ++ ++ wbio = container_of(bio_alloc_bioset(GFP_NOIO, ++ buf_pages(data, sectors_to_write << 9), ++ &c->btree_bio), ++ struct btree_write_bio, wbio.bio); ++ wbio_init(&wbio->wbio.bio); ++ wbio->data = data; ++ wbio->wbio.order = order; ++ wbio->wbio.used_mempool = used_mempool; ++ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; ++ wbio->wbio.bio.bi_end_io = btree_node_write_endio; ++ wbio->wbio.bio.bi_private = b; ++ ++ if (b->level || !b->written) ++ wbio->wbio.bio.bi_opf |= REQ_FUA; ++ ++ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); ++ ++ /* ++ * If we're appending to a leaf node, we don't technically need FUA - ++ * this write just needs to be persisted before the next journal write, ++ * which will be marked FLUSH|FUA. ++ * ++ * Similarly if we're writing a new btree root - the pointer is going to ++ * be in the next journal entry. ++ * ++ * But if we're writing a new btree node (that isn't a root) or ++ * appending to a non leaf btree node, we need either FUA or a flush ++ * when we write the parent with the new pointer. FUA is cheaper than a ++ * flush, and writes appending to leaf nodes aren't blocking anything so ++ * just make all btree node writes FUA to keep things sane. 
++ */ ++ ++ bkey_copy(&k.key, &b->key); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) ++ ptr->offset += b->written; ++ ++ b->written += sectors_to_write; ++ ++ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); ++ return; ++err: ++ set_btree_node_noevict(b); ++ b->written += sectors_to_write; ++nowrite: ++ btree_bounce_free(c, order, used_mempool, data); ++ btree_node_write_done(c, b); ++} ++ ++/* ++ * Work that must be done with write lock held: ++ */ ++bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) ++{ ++ bool invalidated_iter = false; ++ struct btree_node_entry *bne; ++ struct bset_tree *t; ++ ++ if (!btree_node_just_written(b)) ++ return false; ++ ++ BUG_ON(b->whiteout_u64s); ++ BUG_ON(b->uncompacted_whiteout_u64s); ++ ++ clear_btree_node_just_written(b); ++ ++ /* ++ * Note: immediately after write, bset_written() doesn't work - the ++ * amount of data we had to write after compaction might have been ++ * smaller than the offset of the last bset. ++ * ++ * However, we know that all bsets have been written here, as long as ++ * we're still holding the write lock: ++ */ ++ ++ /* ++ * XXX: decide if we really want to unconditionally sort down to a ++ * single bset: ++ */ ++ if (b->nsets > 1) { ++ btree_node_sort(c, b, NULL, 0, b->nsets, true); ++ invalidated_iter = true; ++ } else { ++ invalidated_iter = bch2_drop_whiteouts(b); ++ } ++ ++ for_each_bset(b, t) ++ set_needs_whiteout(bset(b, t)); ++ ++ bch2_btree_verify(c, b); ++ ++ /* ++ * If later we don't unconditionally sort down to a single bset, we have ++ * to ensure this is still true: ++ */ ++ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return invalidated_iter; ++} ++ ++/* ++ * Use this one if the node is intent locked: ++ */ ++void bch2_btree_node_write(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_type_held) ++{ ++ BUG_ON(lock_type_held == SIX_LOCK_write); ++ ++ if (lock_type_held == SIX_LOCK_intent || ++ six_lock_tryupgrade(&b->lock)) { ++ __bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ ++ /* don't cycle lock unnecessarily: */ ++ if (btree_node_just_written(b) && ++ six_trylock_write(&b->lock)) { ++ bch2_btree_post_write_cleanup(c, b); ++ six_unlock_write(&b->lock); ++ } ++ ++ if (lock_type_held == SIX_LOCK_read) ++ six_lock_downgrade(&b->lock); ++ } else { ++ __bch2_btree_node_write(c, b, SIX_LOCK_read); ++ } ++} ++ ++static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++restart: ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) ++ if (test_bit(flag, &b->flags)) { ++ rcu_read_unlock(); ++ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); ++ goto restart; ++ ++ } ++ rcu_read_unlock(); ++} ++ ++void bch2_btree_flush_all_reads(struct bch_fs *c) ++{ ++ __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); ++} ++ ++void bch2_btree_flush_all_writes(struct bch_fs *c) ++{ ++ __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); ++} ++ ++void bch2_btree_verify_flushed(struct bch_fs *c) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++ ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) { ++ unsigned long flags = READ_ONCE(b->flags); ++ ++ BUG_ON((flags & (1 << BTREE_NODE_dirty)) || ++ (flags & (1 << 
BTREE_NODE_write_in_flight))); ++ } ++ rcu_read_unlock(); ++} ++ ++ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++ ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) { ++ unsigned long flags = READ_ONCE(b->flags); ++ unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0; ++ ++ if (!(flags & (1 << BTREE_NODE_dirty))) ++ continue; ++ ++ pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n", ++ b, ++ (flags & (1 << BTREE_NODE_dirty)) != 0, ++ (flags & (1 << BTREE_NODE_need_write)) != 0, ++ b->level, ++ b->written, ++ !list_empty_careful(&b->write_blocked), ++ b->will_make_reachable != 0, ++ b->will_make_reachable & 1, ++ b->writes[ idx].wait.list.first != NULL, ++ b->writes[!idx].wait.list.first != NULL); ++ } ++ rcu_read_unlock(); ++ ++ return out.pos - buf; ++} +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +new file mode 100644 +index 000000000000..955a80cafae3 +--- /dev/null ++++ b/fs/bcachefs/btree_io.h +@@ -0,0 +1,141 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_IO_H ++#define _BCACHEFS_BTREE_IO_H ++ ++#include "bset.h" ++#include "btree_locking.h" ++#include "extents.h" ++#include "io_types.h" ++ ++struct bch_fs; ++struct btree_write; ++struct btree; ++struct btree_iter; ++ ++struct btree_read_bio { ++ struct bch_fs *c; ++ u64 start_time; ++ unsigned have_ioref:1; ++ struct extent_ptr_decoded pick; ++ struct work_struct work; ++ struct bio bio; ++}; ++ ++struct btree_write_bio { ++ void *data; ++ struct work_struct work; ++ struct bch_write_bio wbio; ++}; ++ ++static inline void btree_node_io_unlock(struct btree *b) ++{ ++ EBUG_ON(!btree_node_write_in_flight(b)); ++ clear_btree_node_write_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} ++ ++static inline void btree_node_io_lock(struct btree *b) ++{ ++ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++static inline void btree_node_wait_on_io(struct btree *b) ++{ ++ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++static inline bool btree_node_may_write(struct btree *b) ++{ ++ return list_empty_careful(&b->write_blocked) && ++ (!b->written || !b->will_make_reachable); ++} ++ ++enum compact_mode { ++ COMPACT_LAZY, ++ COMPACT_WRITTEN, ++ COMPACT_WRITTEN_NO_WRITE_LOCK, ++}; ++ ++bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode); ++ ++static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t) ++{ ++ unsigned total_u64s = bset_u64s(t); ++ unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set]; ++ ++ return dead_u64s > 64 && dead_u64s * 3 > total_u64s; ++} ++ ++static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (should_compact_bset_lazy(b, t)) ++ return __bch2_compact_whiteouts(c, b, COMPACT_LAZY); ++ ++ return false; ++} ++ ++void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); ++ ++void bch2_btree_build_aux_trees(struct btree *); ++void bch2_btree_init_next(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++ ++int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); ++void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); ++int bch2_btree_root_read(struct bch_fs *, enum btree_id, ++ const struct 
bkey_i *, unsigned); ++ ++void bch2_btree_complete_write(struct bch_fs *, struct btree *, ++ struct btree_write *); ++void bch2_btree_write_error_work(struct work_struct *); ++ ++void __bch2_btree_node_write(struct bch_fs *, struct btree *, ++ enum six_lock_type); ++bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); ++ ++void bch2_btree_node_write(struct bch_fs *, struct btree *, ++ enum six_lock_type); ++ ++static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b) ++{ ++ while (b->written && ++ btree_node_need_write(b) && ++ btree_node_may_write(b)) { ++ if (!btree_node_write_in_flight(b)) { ++ bch2_btree_node_write(c, b, SIX_LOCK_read); ++ break; ++ } ++ ++ six_unlock_read(&b->lock); ++ btree_node_wait_on_io(b); ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ } ++} ++ ++#define bch2_btree_node_write_cond(_c, _b, cond) \ ++do { \ ++ unsigned long old, new, v = READ_ONCE((_b)->flags); \ ++ \ ++ do { \ ++ old = new = v; \ ++ \ ++ if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ ++ break; \ ++ \ ++ new |= (1 << BTREE_NODE_need_write); \ ++ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ ++ \ ++ btree_node_write_if_need(_c, _b); \ ++} while (0) ++ ++void bch2_btree_flush_all_reads(struct bch_fs *); ++void bch2_btree_flush_all_writes(struct bch_fs *); ++void bch2_btree_verify_flushed(struct bch_fs *); ++ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); ++ ++#endif /* _BCACHEFS_BTREE_IO_H */ +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +new file mode 100644 +index 000000000000..5fab505dbea0 +--- /dev/null ++++ b/fs/bcachefs/btree_iter.c +@@ -0,0 +1,2158 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "debug.h" ++#include "extents.h" ++ ++#include ++#include ++ ++static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, ++ struct btree_iter_level *, ++ struct bkey *); ++ ++#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) ++#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) ++#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) ++#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) ++#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) ++#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) ++#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) ++ ++static inline bool is_btree_node(struct btree_iter *iter, unsigned l) ++{ ++ return l < BTREE_MAX_DEPTH && ++ (unsigned long) iter->l[l].b >= 128; ++} ++ ++/* Returns < 0 if @k is before iter pos, > 0 if @k is after */ ++static inline int __btree_iter_pos_cmp(struct btree_iter *iter, ++ const struct btree *b, ++ const struct bkey_packed *k, ++ bool interior_node) ++{ ++ int cmp = bkey_cmp_left_packed(b, k, &iter->pos); ++ ++ if (cmp) ++ return cmp; ++ if (bkey_deleted(k)) ++ return -1; ++ ++ /* ++ * Normally, for extents we want the first key strictly greater than ++ * the iterator position - with the exception that for interior nodes, ++ * we don't want to advance past the last key if the iterator position ++ * is POS_MAX: ++ */ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS && ++ (!interior_node || ++ bkey_cmp_left_packed_byval(b, k, POS_MAX))) ++ return -1; ++ return 1; ++} ++ ++static inline int btree_iter_pos_cmp(struct btree_iter *iter, ++ const struct btree *b, ++ const struct bkey_packed *k) ++{ ++ return __btree_iter_pos_cmp(iter, b, k, b->level != 0); ++} ++ ++/* Btree node locking: */ ++ 
++void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) ++{ ++ bch2_btree_node_unlock_write_inlined(b, iter); ++} ++ ++void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ unsigned readers = 0; ++ ++ EBUG_ON(!btree_node_intent_locked(iter, b->level)); ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked->l[b->level].b == b && ++ btree_node_read_locked(linked, b->level)) ++ readers++; ++ ++ /* ++ * Must drop our read locks before calling six_lock_write() - ++ * six_unlock() won't do wakeups until the reader count ++ * goes to 0, and it's safe because we have the node intent ++ * locked: ++ */ ++ atomic64_sub(__SIX_VAL(read_lock, readers), ++ &b->lock.state.counter); ++ btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); ++ atomic64_add(__SIX_VAL(read_lock, readers), ++ &b->lock.state.counter); ++} ++ ++bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) ++{ ++ struct btree *b = btree_iter_node(iter, level); ++ int want = __btree_lock_want(iter, level); ++ ++ if (!is_btree_node(iter, level)) ++ return false; ++ ++ if (race_fault()) ++ return false; ++ ++ if (six_relock_type(&b->lock, want, iter->l[level].lock_seq) || ++ (btree_node_lock_seq_matches(iter, b, level) && ++ btree_node_lock_increment(iter, b, level, want))) { ++ mark_btree_node_locked(iter, level, want); ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) ++{ ++ struct btree *b = iter->l[level].b; ++ ++ EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); ++ ++ if (!is_btree_node(iter, level)) ++ return false; ++ ++ if (btree_node_intent_locked(iter, level)) ++ return true; ++ ++ if (race_fault()) ++ return false; ++ ++ if (btree_node_locked(iter, level) ++ ? six_lock_tryupgrade(&b->lock) ++ : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq)) ++ goto success; ++ ++ if (btree_node_lock_seq_matches(iter, b, level) && ++ btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { ++ btree_node_unlock(iter, level); ++ goto success; ++ } ++ ++ return false; ++success: ++ mark_btree_node_intent_locked(iter, level); ++ return true; ++} ++ ++static inline bool btree_iter_get_locks(struct btree_iter *iter, ++ bool upgrade, bool trace) ++{ ++ unsigned l = iter->level; ++ int fail_idx = -1; ++ ++ do { ++ if (!btree_iter_node(iter, l)) ++ break; ++ ++ if (!(upgrade ++ ? bch2_btree_node_upgrade(iter, l) ++ : bch2_btree_node_relock(iter, l))) { ++ if (trace) ++ (upgrade ++ ? trace_node_upgrade_fail ++ : trace_node_relock_fail)(l, iter->l[l].lock_seq, ++ is_btree_node(iter, l) ++ ? 0 ++ : (unsigned long) iter->l[l].b, ++ is_btree_node(iter, l) ++ ? 
iter->l[l].b->lock.state.seq ++ : 0); ++ ++ fail_idx = l; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ } ++ ++ l++; ++ } while (l < iter->locks_want); ++ ++ /* ++ * When we fail to get a lock, we have to ensure that any child nodes ++ * can't be relocked so bch2_btree_iter_traverse has to walk back up to ++ * the node that we failed to relock: ++ */ ++ while (fail_idx >= 0) { ++ btree_node_unlock(iter, fail_idx); ++ iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ --fail_idx; ++ } ++ ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ return iter->uptodate < BTREE_ITER_NEED_RELOCK; ++} ++ ++/* Slowpath: */ ++bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, ++ unsigned level, ++ struct btree_iter *iter, ++ enum six_lock_type type) ++{ ++ struct btree_iter *linked; ++ bool ret = true; ++ ++ /* Check if it's safe to block: */ ++ trans_for_each_iter(iter->trans, linked) { ++ if (!linked->nodes_locked) ++ continue; ++ ++ /* * Must lock btree nodes in key order: */ ++ if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) ++ ret = false; ++ ++ /* ++ * Can't block taking an intent lock if we have _any_ nodes read ++ * locked: ++ * ++ * - Our read lock blocks another thread with an intent lock on ++ * the same node from getting a write lock, and thus from ++ * dropping its intent lock ++ * ++ * - And the other thread may have multiple nodes intent locked: ++ * both the node we want to intent lock, and the node we ++ * already have read locked - deadlock: ++ */ ++ if (type == SIX_LOCK_intent && ++ linked->nodes_locked != linked->nodes_intent_locked) { ++ if (!(iter->trans->nounlock)) { ++ linked->locks_want = max_t(unsigned, ++ linked->locks_want, ++ __fls(linked->nodes_locked) + 1); ++ btree_iter_get_locks(linked, true, false); ++ } ++ ret = false; ++ } ++ ++ /* ++ * Interior nodes must be locked before their descendants: if ++ * another iterator has possible descendants locked of the node ++ * we're about to lock, it must have the ancestors locked too: ++ */ ++ if (linked->btree_id == iter->btree_id && ++ level > __fls(linked->nodes_locked)) { ++ if (!(iter->trans->nounlock)) { ++ linked->locks_want = ++ max(level + 1, max_t(unsigned, ++ linked->locks_want, ++ iter->locks_want)); ++ btree_iter_get_locks(linked, true, false); ++ } ++ ret = false; ++ } ++ } ++ ++ if (unlikely(!ret)) { ++ trace_trans_restart_would_deadlock(iter->trans->ip); ++ return false; ++ } ++ ++ __btree_node_lock_type(iter->trans->c, b, type); ++ return true; ++} ++ ++/* Btree iterator locking: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_iter_verify_locks(struct btree_iter *iter) ++{ ++ unsigned l; ++ ++ for (l = 0; btree_iter_node(iter, l); l++) { ++ if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && ++ !btree_node_locked(iter, l)) ++ continue; ++ ++ BUG_ON(btree_lock_want(iter, l) != ++ btree_node_locked_type(iter, l)); ++ } ++} ++ ++void bch2_btree_trans_verify_locks(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ bch2_btree_iter_verify_locks(iter); ++} ++#endif ++ ++__flatten ++static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) ++{ ++ return iter->uptodate >= BTREE_ITER_NEED_RELOCK ++ ? 
btree_iter_get_locks(iter, false, trace) ++ : true; ++} ++ ++bool __bch2_btree_iter_upgrade(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ struct btree_iter *linked; ++ ++ EBUG_ON(iter->locks_want >= new_locks_want); ++ ++ iter->locks_want = new_locks_want; ++ ++ if (btree_iter_get_locks(iter, true, true)) ++ return true; ++ ++ /* ++ * Ancestor nodes must be locked before child nodes, so set locks_want ++ * on iterators that might lock ancestors before us to avoid getting ++ * -EINTR later: ++ */ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked != iter && ++ linked->btree_id == iter->btree_id && ++ linked->locks_want < new_locks_want) { ++ linked->locks_want = new_locks_want; ++ btree_iter_get_locks(linked, true, false); ++ } ++ ++ return false; ++} ++ ++bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ unsigned l = iter->level; ++ ++ EBUG_ON(iter->locks_want >= new_locks_want); ++ ++ iter->locks_want = new_locks_want; ++ ++ do { ++ if (!btree_iter_node(iter, l)) ++ break; ++ ++ if (!bch2_btree_node_upgrade(iter, l)) { ++ iter->locks_want = l; ++ return false; ++ } ++ ++ l++; ++ } while (l < iter->locks_want); ++ ++ return true; ++} ++ ++void __bch2_btree_iter_downgrade(struct btree_iter *iter, ++ unsigned downgrade_to) ++{ ++ struct btree_iter *linked; ++ unsigned l; ++ ++ /* ++ * We downgrade linked iterators as well because btree_iter_upgrade ++ * might have had to modify locks_want on linked iterators due to lock ++ * ordering: ++ */ ++ trans_for_each_iter(iter->trans, linked) { ++ unsigned new_locks_want = downgrade_to ?: ++ (linked->flags & BTREE_ITER_INTENT ? 1 : 0); ++ ++ if (linked->locks_want <= new_locks_want) ++ continue; ++ ++ linked->locks_want = new_locks_want; ++ ++ while (linked->nodes_locked && ++ (l = __fls(linked->nodes_locked)) >= linked->locks_want) { ++ if (l > linked->level) { ++ btree_node_unlock(linked, l); ++ } else { ++ if (btree_node_intent_locked(linked, l)) { ++ six_lock_downgrade(&linked->l[l].b->lock); ++ linked->nodes_intent_locked ^= 1 << l; ++ } ++ break; ++ } ++ } ++ } ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++} ++ ++/* Btree transaction locking: */ ++ ++bool bch2_trans_relock(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ bool ret = true; ++ ++ trans_for_each_iter(trans, iter) ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) ++ ret &= bch2_btree_iter_relock(iter, true); ++ ++ return ret; ++} ++ ++void bch2_trans_unlock(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ __bch2_btree_iter_unlock(iter); ++} ++ ++/* Btree iterator: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void __bch2_btree_iter_verify(struct btree_iter *iter, ++ struct btree *b) ++{ ++ struct btree_iter_level *l = &iter->l[b->level]; ++ struct btree_node_iter tmp = l->iter; ++ struct bkey_packed *k; ++ ++ if (!debug_check_iterators(iter->trans->c)) ++ return; ++ ++ if (iter->uptodate > BTREE_ITER_NEED_PEEK) ++ return; ++ ++ bch2_btree_node_iter_verify(&l->iter, b); ++ ++ /* ++ * For interior nodes, the iterator will have skipped past ++ * deleted keys: ++ * ++ * For extents, the iterator may have skipped past deleted keys (but not ++ * whiteouts) ++ */ ++ k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS ++ ? 
bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) ++ : bch2_btree_node_iter_prev_all(&tmp, b); ++ if (k && btree_iter_pos_cmp(iter, b, k) > 0) { ++ char buf[100]; ++ struct bkey uk = bkey_unpack_key(b, k); ++ ++ bch2_bkey_to_text(&PBUF(buf), &uk); ++ panic("prev key should be before iter pos:\n%s\n%llu:%llu\n", ++ buf, iter->pos.inode, iter->pos.offset); ++ } ++ ++ k = bch2_btree_node_iter_peek_all(&l->iter, b); ++ if (k && btree_iter_pos_cmp(iter, b, k) < 0) { ++ char buf[100]; ++ struct bkey uk = bkey_unpack_key(b, k); ++ ++ bch2_bkey_to_text(&PBUF(buf), &uk); ++ panic("iter should be after current key:\n" ++ "iter pos %llu:%llu\n" ++ "cur key %s\n", ++ iter->pos.inode, iter->pos.offset, buf); ++ } ++ ++ BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && ++ btree_iter_type(iter) == BTREE_ITER_KEYS && ++ !bkey_whiteout(&iter->k) && ++ bch2_btree_node_iter_end(&l->iter)); ++} ++ ++void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter *linked; ++ ++ if (!debug_check_iterators(iter->trans->c)) ++ return; ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ __bch2_btree_iter_verify(linked, b); ++} ++ ++#else ++ ++static inline void __bch2_btree_iter_verify(struct btree_iter *iter, ++ struct btree *b) {} ++ ++#endif ++ ++static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) { ++ set->k = __btree_node_key_to_offset(b, k); ++ bch2_btree_node_iter_sort(iter, b); ++ return; ++ } ++ ++ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); ++} ++ ++static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_node_iter *node_iter = &iter->l[0].iter; ++ ++ if (where == bch2_btree_node_iter_peek_all(node_iter, b)) { ++ bkey_disassemble(b, where, &iter->k); ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++ } ++} ++ ++void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) { ++ __bch2_btree_iter_fix_key_modified(linked, b, where); ++ __bch2_btree_iter_verify(linked, b); ++ } ++} ++ ++static void __bch2_btree_node_iter_fix(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bset_tree *t, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ const struct bkey_packed *end = btree_bkey_last(b, t); ++ struct btree_node_iter_set *set; ++ unsigned offset = __btree_node_key_to_offset(b, where); ++ int shift = new_u64s - clobber_u64s; ++ unsigned old_end = t->end_offset - shift; ++ unsigned orig_iter_pos = node_iter->data[0].k; ++ bool iter_current_key_modified = ++ orig_iter_pos >= offset && ++ orig_iter_pos <= offset + clobber_u64s; ++ ++ btree_node_iter_for_each(node_iter, set) ++ if (set->end == old_end) ++ goto found; ++ ++ /* didn't find the bset in the iterator - might have to readd it: */ ++ if (new_u64s && ++ btree_iter_pos_cmp(iter, b, where) > 0) { ++ bch2_btree_node_iter_push(node_iter, b, where, end); ++ goto fixup_done; ++ } else { ++ /* Iterator is after key that changed */ ++ return; ++ } ++found: ++ set->end = t->end_offset; ++ ++ /* Iterator hasn't gotten to the key that changed yet: */ ++ if (set->k < offset) ++ return; ++ ++ 
if (new_u64s && ++ btree_iter_pos_cmp(iter, b, where) > 0) { ++ set->k = offset; ++ } else if (set->k < offset + clobber_u64s) { ++ set->k = offset + new_u64s; ++ if (set->k == set->end) ++ bch2_btree_node_iter_set_drop(node_iter, set); ++ } else { ++ /* Iterator is after key that changed */ ++ set->k = (int) set->k + shift; ++ return; ++ } ++ ++ bch2_btree_node_iter_sort(node_iter, b); ++fixup_done: ++ if (node_iter->data[0].k != orig_iter_pos) ++ iter_current_key_modified = true; ++ ++ /* ++ * When a new key is added, and the node iterator now points to that ++ * key, the iterator might have skipped past deleted keys that should ++ * come after the key the iterator now points to. We have to rewind to ++ * before those deleted keys - otherwise ++ * bch2_btree_node_iter_prev_all() breaks: ++ */ ++ if (!bch2_btree_node_iter_end(node_iter) && ++ iter_current_key_modified && ++ (b->level || ++ (iter->flags & BTREE_ITER_IS_EXTENTS))) { ++ struct bset_tree *t; ++ struct bkey_packed *k, *k2, *p; ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ ++ for_each_bset(b, t) { ++ bool set_pos = false; ++ ++ if (node_iter->data[0].end == t->end_offset) ++ continue; ++ ++ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); ++ ++ while ((p = bch2_bkey_prev_all(b, t, k2)) && ++ bkey_iter_cmp(b, k, p) < 0) { ++ k2 = p; ++ set_pos = true; ++ } ++ ++ if (set_pos) ++ btree_node_iter_set_set_pos(node_iter, ++ b, t, k2); ++ } ++ } ++ ++ if (!b->level && ++ node_iter == &iter->l[0].iter && ++ iter_current_key_modified) { ++ struct bkey_packed *k = ++ bch2_btree_node_iter_peek_all(node_iter, b); ++ ++ if (likely(k)) { ++ bkey_disassemble(b, k, &iter->k); ++ } else { ++ /* XXX: for extents, calculate size of hole? */ ++ iter->k.type = KEY_TYPE_deleted; ++ } ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++ } ++} ++ ++void bch2_btree_node_iter_fix(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct btree_iter *linked; ++ ++ if (node_iter != &iter->l[b->level].iter) { ++ __bch2_btree_node_iter_fix(iter, b, node_iter, t, ++ where, clobber_u64s, new_u64s); ++ bch2_btree_node_iter_verify(node_iter, b); ++ } ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) { ++ __bch2_btree_node_iter_fix(linked, b, ++ &linked->l[b->level].iter, t, ++ where, clobber_u64s, new_u64s); ++ __bch2_btree_iter_verify(linked, b); ++ } ++} ++ ++static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ struct bkey *u, ++ struct bkey_packed *k) ++{ ++ struct bkey_s_c ret; ++ ++ if (unlikely(!k)) { ++ /* ++ * signal to bch2_btree_iter_peek_slot() that we're currently at ++ * a hole ++ */ ++ u->type = KEY_TYPE_deleted; ++ return bkey_s_c_null; ++ } ++ ++ ret = bkey_disassemble(l->b, k, u); ++ ++ if (debug_check_bkeys(iter->trans->c)) ++ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); ++ ++ return ret; ++} ++ ++/* peek_all() doesn't skip deleted keys */ ++static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ struct bkey *u) ++{ ++ return __btree_iter_unpack(iter, l, u, ++ bch2_btree_node_iter_peek_all(&l->iter, l->b)); ++} ++ ++static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, ++ struct btree_iter_level *l) ++{ ++ return __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++} ++ ++static 
inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, ++ struct btree_iter_level *l) ++{ ++ return __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_prev(&l->iter, l->b)); ++} ++ ++static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ int max_advance) ++{ ++ struct bkey_packed *k; ++ int nr_advanced = 0; ++ ++ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && ++ btree_iter_pos_cmp(iter, l->b, k) < 0) { ++ if (max_advance > 0 && nr_advanced >= max_advance) ++ return false; ++ ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ nr_advanced++; ++ } ++ ++ return true; ++} ++ ++/* ++ * Verify that iterator for parent node points to child node: ++ */ ++static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter_level *l; ++ unsigned plevel; ++ bool parent_locked; ++ struct bkey_packed *k; ++ ++ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ return; ++ ++ plevel = b->level + 1; ++ if (!btree_iter_node(iter, plevel)) ++ return; ++ ++ parent_locked = btree_node_locked(iter, plevel); ++ ++ if (!bch2_btree_node_relock(iter, plevel)) ++ return; ++ ++ l = &iter->l[plevel]; ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ if (!k || ++ bkey_deleted(k) || ++ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { ++ char buf[100]; ++ struct bkey uk = bkey_unpack_key(b, k); ++ ++ bch2_bkey_to_text(&PBUF(buf), &uk); ++ panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", ++ buf, b->key.k.p.inode, b->key.k.p.offset); ++ } ++ ++ if (!parent_locked) ++ btree_node_unlock(iter, b->level + 1); ++} ++ ++static inline bool btree_iter_pos_before_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return bkey_cmp(iter->pos, b->data->min_key) < 0; ++} ++ ++static inline bool btree_iter_pos_after_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ int cmp = bkey_cmp(b->key.k.p, iter->pos); ++ ++ if (!cmp && ++ (iter->flags & BTREE_ITER_IS_EXTENTS) && ++ bkey_cmp(b->key.k.p, POS_MAX)) ++ cmp = -1; ++ return cmp < 0; ++} ++ ++static inline bool btree_iter_pos_in_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return iter->btree_id == b->btree_id && ++ !btree_iter_pos_before_node(iter, b) && ++ !btree_iter_pos_after_node(iter, b); ++} ++ ++static inline void __btree_iter_init(struct btree_iter *iter, ++ unsigned level) ++{ ++ struct btree_iter_level *l = &iter->l[level]; ++ ++ bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ btree_iter_advance_to_pos(iter, l, -1); ++ ++ /* Skip to first non whiteout: */ ++ if (level) ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++static inline void btree_iter_node_set(struct btree_iter *iter, ++ struct btree *b) ++{ ++ btree_iter_verify_new_node(iter, b); ++ ++ EBUG_ON(!btree_iter_pos_in_node(iter, b)); ++ EBUG_ON(b->lock.state.seq & 1); ++ ++ iter->l[b->level].lock_seq = b->lock.state.seq; ++ iter->l[b->level].b = b; ++ __btree_iter_init(iter, b->level); ++} ++ ++/* ++ * A btree node is being replaced - update the iterator to point to the new ++ * node: ++ */ ++void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) ++{ ++ enum btree_node_locked_type t; ++ struct btree_iter *linked; ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (btree_iter_pos_in_node(linked, b)) { ++ /* ++ * bch2_btree_iter_node_drop() has already been called - ++ * the old node we're replacing has already been ++ * 
unlocked and the pointer invalidated ++ */ ++ BUG_ON(btree_node_locked(linked, b->level)); ++ ++ t = btree_lock_want(linked, b->level); ++ if (t != BTREE_NODE_UNLOCKED) { ++ six_lock_increment(&b->lock, t); ++ mark_btree_node_locked(linked, b->level, t); ++ } ++ ++ btree_iter_node_set(linked, b); ++ } ++} ++ ++void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter *linked; ++ unsigned level = b->level; ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked->l[level].b == b) { ++ __btree_node_unlock(linked, level); ++ linked->l[level].b = BTREE_ITER_NO_NODE_DROP; ++ } ++} ++ ++/* ++ * A btree node has been modified in such a way as to invalidate iterators - fix ++ * them: ++ */ ++void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ __btree_iter_init(linked, b->level); ++} ++ ++static inline int btree_iter_lock_root(struct btree_iter *iter, ++ unsigned depth_want) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree *b; ++ enum six_lock_type lock_type; ++ unsigned i; ++ ++ EBUG_ON(iter->nodes_locked); ++ ++ while (1) { ++ b = READ_ONCE(c->btree_roots[iter->btree_id].b); ++ iter->level = READ_ONCE(b->level); ++ ++ if (unlikely(iter->level < depth_want)) { ++ /* ++ * the root is at a lower depth than the depth we want: ++ * got to the end of the btree, or we're walking nodes ++ * greater than some depth and there are no nodes >= ++ * that depth ++ */ ++ iter->level = depth_want; ++ for (i = iter->level; i < BTREE_MAX_DEPTH; i++) ++ iter->l[i].b = NULL; ++ return 1; ++ } ++ ++ lock_type = __btree_lock_want(iter, iter->level); ++ if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, ++ iter, lock_type))) ++ return -EINTR; ++ ++ if (likely(b == c->btree_roots[iter->btree_id].b && ++ b->level == iter->level && ++ !race_fault())) { ++ for (i = 0; i < iter->level; i++) ++ iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; ++ iter->l[iter->level].b = b; ++ for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) ++ iter->l[i].b = NULL; ++ ++ mark_btree_node_locked(iter, iter->level, lock_type); ++ btree_iter_node_set(iter, b); ++ return 0; ++ } ++ ++ six_unlock_type(&b->lock, lock_type); ++ } ++} ++ ++noinline ++static void btree_iter_prefetch(struct btree_iter *iter) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree_iter_level *l = &iter->l[iter->level]; ++ struct btree_node_iter node_iter = l->iter; ++ struct bkey_packed *k; ++ BKEY_PADDED(k) tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? (iter->level > 1 ? 0 : 2) ++ : (iter->level > 1 ? 
1 : 16); ++ bool was_locked = btree_node_locked(iter, iter->level); ++ ++ while (nr) { ++ if (!bch2_btree_node_relock(iter, iter->level)) ++ return; ++ ++ bch2_btree_node_iter_advance(&node_iter, l->b); ++ k = bch2_btree_node_iter_peek(&node_iter, l->b); ++ if (!k) ++ break; ++ ++ bch2_bkey_unpack(l->b, &tmp.k, k); ++ bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(iter, iter->level); ++} ++ ++static __always_inline int btree_iter_down(struct btree_iter *iter) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree_iter_level *l = &iter->l[iter->level]; ++ struct btree *b; ++ unsigned level = iter->level - 1; ++ enum six_lock_type lock_type = __btree_lock_want(iter, level); ++ BKEY_PADDED(k) tmp; ++ ++ EBUG_ON(!btree_node_locked(iter, iter->level)); ++ ++ bch2_bkey_unpack(l->b, &tmp.k, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); ++ if (unlikely(IS_ERR(b))) ++ return PTR_ERR(b); ++ ++ mark_btree_node_locked(iter, level, lock_type); ++ btree_iter_node_set(iter, b); ++ ++ if (iter->flags & BTREE_ITER_PREFETCH) ++ btree_iter_prefetch(iter); ++ ++ iter->level = level; ++ ++ return 0; ++} ++ ++static void btree_iter_up(struct btree_iter *iter) ++{ ++ btree_node_unlock(iter, iter->level++); ++} ++ ++static int btree_iter_traverse_one(struct btree_iter *); ++ ++static int __btree_iter_traverse_all(struct btree_trans *trans, ++ struct btree_iter *orig_iter, int ret) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ u8 sorted[BTREE_ITER_MAX]; ++ unsigned i, nr_sorted = 0; ++ ++ trans_for_each_iter(trans, iter) ++ sorted[nr_sorted++] = iter - trans->iters; ++ ++#define btree_iter_cmp_by_idx(_l, _r) \ ++ btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) ++ ++ bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); ++#undef btree_iter_cmp_by_idx ++ ++retry_all: ++ bch2_trans_unlock(trans); ++ ++ if (unlikely(ret == -ENOMEM)) { ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ } ++ ++ if (unlikely(ret == -EIO)) { ++ trans->error = true; ++ if (orig_iter) { ++ orig_iter->flags |= BTREE_ITER_ERROR; ++ orig_iter->l[orig_iter->level].b = ++ BTREE_ITER_NO_NODE_ERROR; ++ } ++ goto out; ++ } ++ ++ BUG_ON(ret && ret != -EINTR); ++ ++ /* Now, redo traversals in correct order: */ ++ for (i = 0; i < nr_sorted; i++) { ++ iter = &trans->iters[sorted[i]]; ++ ++ do { ++ ret = btree_iter_traverse_one(iter); ++ } while (ret == -EINTR); ++ ++ if (ret) ++ goto retry_all; ++ } ++ ++ ret = hweight64(trans->iters_live) > 1 ? 
-EINTR : 0; ++out: ++ bch2_btree_cache_cannibalize_unlock(c); ++ return ret; ++} ++ ++int bch2_btree_iter_traverse_all(struct btree_trans *trans) ++{ ++ return __btree_iter_traverse_all(trans, NULL, 0); ++} ++ ++static inline bool btree_iter_good_node(struct btree_iter *iter, ++ unsigned l, int check_pos) ++{ ++ if (!is_btree_node(iter, l) || ++ !bch2_btree_node_relock(iter, l)) ++ return false; ++ ++ if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) ++ return false; ++ if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) ++ return false; ++ return true; ++} ++ ++static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, ++ int check_pos) ++{ ++ unsigned l = iter->level; ++ ++ while (btree_iter_node(iter, l) && ++ !btree_iter_good_node(iter, l, check_pos)) { ++ btree_node_unlock(iter, l); ++ iter->l[l].b = BTREE_ITER_NO_NODE_UP; ++ l++; ++ } ++ ++ return l; ++} ++ ++/* ++ * This is the main state machine for walking down the btree - walks down to a ++ * specified depth ++ * ++ * Returns 0 on success, -EIO on error (error reading in a btree node). ++ * ++ * On error, caller (peek_node()/peek_key()) must return NULL; the error is ++ * stashed in the iterator and returned from bch2_trans_exit(). ++ */ ++static int btree_iter_traverse_one(struct btree_iter *iter) ++{ ++ unsigned depth_want = iter->level; ++ ++ if (unlikely(iter->level >= BTREE_MAX_DEPTH)) ++ return 0; ++ ++ if (bch2_btree_iter_relock(iter, false)) ++ return 0; ++ ++ /* ++ * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos ++ * here unnecessary ++ */ ++ iter->level = btree_iter_up_until_good_node(iter, 0); ++ ++ /* ++ * If we've got a btree node locked (i.e. we aren't about to relock the ++ * root) - advance its node iterator if necessary: ++ * ++ * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary ++ */ ++ if (btree_iter_node(iter, iter->level)) { ++ BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); ++ ++ btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); ++ } ++ ++ /* ++ * Note: iter->nodes[iter->level] may be temporarily NULL here - that ++ * would indicate to other code that we got to the end of the btree, ++ * here it indicates that relocking the root failed - it's critical that ++ * btree_iter_lock_root() comes next and that it can't fail ++ */ ++ while (iter->level > depth_want) { ++ int ret = btree_iter_node(iter, iter->level) ++ ? 
btree_iter_down(iter) ++ : btree_iter_lock_root(iter, depth_want); ++ if (unlikely(ret)) { ++ if (ret == 1) ++ return 0; ++ ++ iter->level = depth_want; ++ iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN; ++ return ret; ++ } ++ } ++ ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ __bch2_btree_iter_verify(iter, iter->l[iter->level].b); ++ return 0; ++} ++ ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ int ret; ++ ++ ret = bch2_trans_cond_resched(iter->trans) ?: ++ btree_iter_traverse_one(iter); ++ if (unlikely(ret)) ++ ret = __btree_iter_traverse_all(iter->trans, iter, ret); ++ ++ return ret; ++} ++ ++static inline void bch2_btree_iter_checks(struct btree_iter *iter, ++ enum btree_iter_type type) ++{ ++ EBUG_ON(iter->btree_id >= BTREE_ID_NR); ++ EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != ++ (btree_node_type_is_extents(iter->btree_id) && ++ type != BTREE_ITER_NODES)); ++ EBUG_ON(btree_iter_type(iter) != type); ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++} ++ ++/* Iterate across nodes (leaf and interior nodes) */ ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) ++{ ++ struct btree *b; ++ int ret; ++ ++ bch2_btree_iter_checks(iter, BTREE_ITER_NODES); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return iter->l[iter->level].b; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ b = btree_iter_node(iter, iter->level); ++ if (!b) ++ return NULL; ++ ++ BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); ++ ++ iter->pos = b->key.k.p; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ return b; ++} ++ ++struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) ++{ ++ struct btree *b; ++ int ret; ++ ++ bch2_btree_iter_checks(iter, BTREE_ITER_NODES); ++ ++ /* already got to end? */ ++ if (!btree_iter_node(iter, iter->level)) ++ return NULL; ++ ++ bch2_trans_cond_resched(iter->trans); ++ ++ btree_iter_up(iter); ++ ++ if (!bch2_btree_node_relock(iter, iter->level)) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ /* got to end? */ ++ b = btree_iter_node(iter, iter->level); ++ if (!b) ++ return NULL; ++ ++ if (bkey_cmp(iter->pos, b->key.k.p) < 0) { ++ /* ++ * Haven't gotten to the end of the parent node: go back down to ++ * the next child node ++ */ ++ ++ /* ++ * We don't really want to be unlocking here except we can't ++ * directly tell btree_iter_traverse() "traverse to this level" ++ * except by setting iter->level, so we have to unlock so we ++ * don't screw up our lock invariants: ++ */ ++ if (btree_node_read_locked(iter, iter->level)) ++ btree_node_unlock(iter, iter->level); ++ ++ /* ick: */ ++ iter->pos = iter->btree_id == BTREE_ID_INODES ++ ? 
btree_type_successor(iter->btree_id, iter->pos) ++ : bkey_successor(iter->pos); ++ iter->level = depth; ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ b = iter->l[iter->level].b; ++ } ++ ++ iter->pos = b->key.k.p; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ return b; ++} ++ ++/* Iterate across keys (in leaf nodes only) */ ++ ++void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ ++ EBUG_ON(iter->level != 0); ++ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); ++ EBUG_ON(!btree_node_locked(iter, 0)); ++ EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); ++ ++ iter->pos = new_pos; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++ ++ btree_iter_advance_to_pos(iter, l, -1); ++ ++ if (bch2_btree_node_iter_end(&l->iter) && ++ btree_iter_pos_after_node(iter, l->b)) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++} ++ ++static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) ++{ ++ unsigned l = btree_iter_up_until_good_node(iter, cmp); ++ ++ if (btree_iter_node(iter, l)) { ++ /* ++ * We might have to skip over many keys, or just a few: try ++ * advancing the node iterator, and if we have to skip over too ++ * many keys just reinit it (or if we're rewinding, since that ++ * is expensive). ++ */ ++ if (cmp < 0 || ++ !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) ++ __btree_iter_init(iter, l); ++ ++ /* Don't leave it locked if we're not supposed to: */ ++ if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, l); ++ } ++ ++ return l; ++} ++ ++void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ int cmp = bkey_cmp(new_pos, iter->pos); ++ unsigned l; ++ ++ if (!cmp) ++ return; ++ ++ iter->pos = new_pos; ++ ++ l = btree_iter_pos_changed(iter, cmp); ++ ++ if (l != iter->level) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ else ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ ++ iter->pos = l->b->key.k.p; ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ ++ if (!bkey_cmp(iter->pos, POS_MAX)) { ++ bkey_init(&iter->k); ++ iter->k.p = POS_MAX; ++ return false; ++ } ++ ++ iter->pos = btree_type_successor(iter->btree_id, iter->pos); ++ btree_iter_pos_changed(iter, 1); ++ return true; ++} ++ ++static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ ++ iter->pos = l->b->data->min_key; ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ ++ if (!bkey_cmp(iter->pos, POS_MIN)) { ++ bkey_init(&iter->k); ++ iter->k.p = POS_MIN; ++ return false; ++ } ++ ++ iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); ++ btree_iter_pos_changed(iter, -1); ++ return true; ++} ++ ++static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c ret = { .k = &iter->k }; ++ ++ if (!bkey_deleted(&iter->k)) { ++ struct bkey_packed *_k = ++ __bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ ++ ret.v = bkeyp_val(&l->b->format, _k); ++ ++ if (debug_check_iterators(iter->trans->c)) { ++ struct bkey k = bkey_unpack_key(l->b, _k); ++ ++ /* ++ * this flag is internal to the btree code, ++ * we don't care if it doesn't match - if it's now set ++ * it just means the key has been 
written out to disk: ++ */ ++ k.needs_whiteout = iter->k.needs_whiteout; ++ BUG_ON(memcmp(&k, &iter->k, sizeof(k))); ++ } ++ ++ if (debug_check_bkeys(iter->trans->c)) ++ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); ++ } ++ ++ return ret; ++} ++ ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return btree_iter_peek_uptodate(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __btree_iter_peek(iter, l); ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_next_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ /* ++ * iter->pos should always be equal to the key we just ++ * returned - except extents can straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || ++ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter->pos = bkey_start_pos(k.k); ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_next: returns first key greater than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_packed *p; ++ struct bkey_s_c k; ++ ++ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ ++ if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ /* ++ * XXX: when we just need to relock we should be able to avoid ++ * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK ++ * for that to work ++ */ ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ ++ bch2_btree_iter_set_pos(iter, ++ btree_type_successor(iter->btree_id, iter->k.p)); ++ ++ return bch2_btree_iter_peek(iter); ++ } ++ ++ if (unlikely(bkey_deleted(&iter->k))) { ++ /* ++ * we're currently pointed at a hole, because previously we were ++ * iterating over slots: ++ */ ++ return bch2_btree_iter_peek(iter); ++ } ++ ++ do { ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ p = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ } while (likely(p) && bkey_whiteout(p)); ++ ++ if (unlikely(!p)) ++ return btree_iter_set_pos_to_next_leaf(iter) ++ ? 
bch2_btree_iter_peek(iter) ++ : bkey_s_c_null; ++ ++ k = __btree_iter_unpack(iter, l, &iter->k, p); ++ ++ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0); ++ iter->pos = bkey_start_pos(k.k); ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_peek_prev: returns first key less than or equal to ++ * iterator's current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return btree_iter_peek_uptodate(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __btree_iter_peek(iter, l); ++ if (!k.k || ++ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ k = __btree_iter_prev(iter, l); ++ ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_prev_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); ++ iter->pos = bkey_start_pos(k.k); ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_prev: returns first key less than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ ++ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ ++ if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { ++ /* ++ * XXX: when we just need to relock we should be able to avoid ++ * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK ++ * for that to work ++ */ ++ iter->pos = btree_type_predecessor(iter->btree_id, ++ iter->pos); ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ ++ return bch2_btree_iter_peek_prev(iter); ++ } ++ ++ k = __btree_iter_prev(iter, l); ++ if (unlikely(!k.k)) ++ return btree_iter_set_pos_to_prev_leaf(iter) ++ ? 
bch2_btree_iter_peek(iter) ++ : bkey_s_c_null; ++ ++ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0); ++ iter->pos = bkey_start_pos(k.k); ++ return k; ++} ++ ++static inline struct bkey_s_c ++__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct btree_node_iter node_iter; ++ struct bkey_s_c k; ++ struct bkey n; ++ int ret; ++ ++recheck: ++ while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && ++ bkey_cmp(k.k->p, iter->pos) <= 0) ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ ++ /* ++ * iterator is now at the correct position for inserting at iter->pos, ++ * but we need to keep iterating until we find the first non whiteout so ++ * we know how big a hole we have, if any: ++ */ ++ ++ node_iter = l->iter; ++ if (k.k && bkey_whiteout(k.k)) ++ k = __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_peek(&node_iter, l->b)); ++ ++ /* ++ * If we got to the end of the node, check if we need to traverse to the ++ * next node: ++ */ ++ if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ goto recheck; ++ } ++ ++ if (k.k && ++ !bkey_whiteout(k.k) && ++ bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { ++ /* ++ * if we skipped forward to find the first non whiteout and ++ * there _wasn't_ actually a hole, we want the iterator to be ++ * pointed at the key we found: ++ */ ++ l->iter = node_iter; ++ ++ EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); ++ EBUG_ON(bkey_deleted(k.k)); ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ __bch2_btree_iter_verify(iter, l->b); ++ return k; ++ } ++ ++ /* hole */ ++ ++ /* holes can't span inode numbers: */ ++ if (iter->pos.offset == KEY_OFFSET_MAX) { ++ if (iter->pos.inode == KEY_INODE_MAX) ++ return bkey_s_c_null; ++ ++ iter->pos = bkey_successor(iter->pos); ++ goto recheck; ++ } ++ ++ if (!k.k) ++ k.k = &l->b->key.k; ++ ++ bkey_init(&n); ++ n.p = iter->pos; ++ bch2_key_resize(&n, ++ min_t(u64, KEY_SIZE_MAX, ++ (k.k->p.inode == n.p.inode ++ ? 
bkey_start_offset(k.k) ++ : KEY_OFFSET_MAX) - ++ n.p.offset)); ++ ++ EBUG_ON(!n.size); ++ ++ iter->k = n; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ __bch2_btree_iter_verify(iter, l->b); ++ return (struct bkey_s_c) { &iter->k, NULL }; ++} ++ ++static inline struct bkey_s_c ++__bch2_btree_iter_peek_slot(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return __bch2_btree_iter_peek_slot_extents(iter); ++ ++recheck: ++ while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && ++ bkey_deleted(k.k) && ++ bkey_cmp(k.k->p, iter->pos) == 0) ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ ++ /* ++ * If we got to the end of the node, check if we need to traverse to the ++ * next node: ++ */ ++ if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ goto recheck; ++ } ++ ++ if (!k.k || ++ bkey_deleted(k.k) || ++ bkey_cmp(iter->pos, k.k->p)) { ++ /* hole */ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos; ++ k = (struct bkey_s_c) { &iter->k, NULL }; ++ } ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ __bch2_btree_iter_verify(iter, l->b); ++ return k; ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) ++{ ++ int ret; ++ ++ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return btree_iter_peek_uptodate(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ return __bch2_btree_iter_peek_slot(iter); ++} ++ ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) ++{ ++ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ ++ iter->pos = btree_type_successor(iter->btree_id, iter->k.p); ++ ++ if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { ++ /* ++ * XXX: when we just need to relock we should be able to avoid ++ * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK ++ * for that to work ++ */ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ ++ return bch2_btree_iter_peek_slot(iter); ++ } ++ ++ if (!bkey_deleted(&iter->k)) ++ bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b); ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++ ++ return __bch2_btree_iter_peek_slot(iter); ++} ++ ++static inline void bch2_btree_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned i; ++ ++ if (btree_node_type_is_extents(btree_id) && ++ !(flags & BTREE_ITER_NODES)) ++ flags |= BTREE_ITER_IS_EXTENTS; ++ ++ iter->trans = trans; ++ iter->pos = pos; ++ bkey_init(&iter->k); ++ iter->k.p = pos; ++ iter->flags = flags; ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ iter->btree_id = btree_id; ++ iter->level = 0; ++ iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; ++ iter->nodes_locked = 0; ++ iter->nodes_intent_locked = 0; ++ for (i = 0; i < ARRAY_SIZE(iter->l); i++) ++ iter->l[i].b = NULL; ++ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; ++ ++ prefetch(c->btree_roots[btree_id].b); ++} ++ ++/* new transactional stuff: */ ++ ++static inline void __bch2_trans_iter_free(struct btree_trans *trans, ++ unsigned idx) ++{ ++ __bch2_btree_iter_unlock(&trans->iters[idx]); ++ trans->iters_linked &= ~(1ULL << idx); ++ trans->iters_live &= ~(1ULL << idx); ++ trans->iters_touched &= ~(1ULL << idx); ++} ++ ++int bch2_trans_iter_put(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ int ret = btree_iter_err(iter); ++ ++ if (!(trans->iters_touched & (1ULL << iter->idx)) && ++ !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) ++ __bch2_trans_iter_free(trans, iter->idx); ++ ++ trans->iters_live &= ~(1ULL << iter->idx); ++ return ret; ++} ++ ++int bch2_trans_iter_free(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ trans->iters_touched &= ~(1ULL << iter->idx); ++ ++ return bch2_trans_iter_put(trans, iter); ++} ++ ++static int bch2_trans_realloc_iters(struct btree_trans *trans, ++ unsigned new_size) ++{ ++ void *new_iters, *new_updates, *new_sorted; ++ size_t iters_bytes; ++ size_t updates_bytes; ++ size_t sorted_bytes; ++ ++ new_size = roundup_pow_of_two(new_size); ++ ++ BUG_ON(new_size > BTREE_ITER_MAX); ++ ++ if (new_size <= trans->size) ++ return 0; ++ ++ BUG_ON(trans->used_mempool); ++ ++ bch2_trans_unlock(trans); ++ ++ iters_bytes = sizeof(struct btree_iter) * new_size; ++ updates_bytes = sizeof(struct btree_insert_entry) * (new_size + 4); ++ sorted_bytes = sizeof(u8) * (new_size + 4); ++ ++ new_iters = kmalloc(iters_bytes + ++ updates_bytes + ++ sorted_bytes, GFP_NOFS); ++ if (new_iters) ++ goto success; ++ ++ new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); ++ new_size = BTREE_ITER_MAX; ++ ++ trans->used_mempool = true; ++success: ++ new_updates = new_iters + iters_bytes; ++ new_sorted = new_updates + updates_bytes; ++ ++ memcpy(new_iters, trans->iters, ++ sizeof(struct btree_iter) * trans->nr_iters); ++ memcpy(new_updates, trans->updates, ++ sizeof(struct btree_insert_entry) * trans->nr_updates); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ memset(trans->iters, POISON_FREE, ++ sizeof(struct btree_iter) * trans->nr_iters + ++ sizeof(struct btree_insert_entry) * trans->nr_iters); ++ ++ if (trans->iters != trans->iters_onstack) ++ kfree(trans->iters); ++ ++ trans->iters = new_iters; ++ trans->updates = new_updates; ++ trans->updates_sorted = new_sorted; ++ trans->size = new_size; ++ ++ if (trans->iters_live) { ++ trace_trans_restart_iters_realloced(trans->ip, trans->size); ++ return -EINTR; ++ } ++ ++ return 0; ++} ++ ++static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) ++{ ++ unsigned idx = __ffs64(~trans->iters_linked); ++ ++ if (idx < trans->nr_iters) ++ goto got_slot; ++ ++ if (trans->nr_iters == trans->size) { ++ int ret; ++ ++ if (trans->nr_iters >= BTREE_ITER_MAX) { ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) { ++ pr_err("iter: btree %s pos %llu:%llu%s%s%s", ++ bch2_btree_ids[iter->btree_id], ++ iter->pos.inode, ++ iter->pos.offset, ++ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", ++ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", ++ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : ""); ++ } ++ ++ panic("trans iter oveflow\n"); ++ } ++ ++ ret = bch2_trans_realloc_iters(trans, trans->size * 2); ++ if (ret) ++ return ERR_PTR(ret); ++ } ++ ++ idx = trans->nr_iters++; ++ BUG_ON(trans->nr_iters > trans->size); ++ ++ trans->iters[idx].idx = idx; ++got_slot: ++ BUG_ON(trans->iters_linked & (1ULL << idx)); ++ trans->iters_linked |= 1ULL << idx; ++ return &trans->iters[idx]; ++} ++ ++static inline void btree_iter_copy(struct btree_iter *dst, ++ struct btree_iter *src) ++{ ++ unsigned i, idx = dst->idx; ++ ++ *dst = *src; ++ dst->idx = idx; ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ if (btree_node_locked(dst, i)) ++ six_lock_increment(&dst->l[i].b->lock, ++ __btree_lock_want(dst, i)); ++} ++ ++static inline struct bpos bpos_diff(struct bpos l, struct bpos r) ++{ ++ if (bkey_cmp(l, r) > 0) ++ swap(l, r); ++ ++ return POS(r.inode - l.inode, r.offset - l.offset); ++} ++ ++static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, ++ unsigned btree_id, struct bpos pos, ++ unsigned flags) ++{ ++ struct btree_iter *iter, *best = NULL; ++ ++ BUG_ON(trans->nr_iters > BTREE_ITER_MAX); ++ ++ trans_for_each_iter(trans, iter) { ++ if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) ++ continue; ++ ++ if (iter->btree_id != btree_id) ++ continue; ++ ++ if (best && ++ bkey_cmp(bpos_diff(best->pos, pos), ++ bpos_diff(iter->pos, pos)) < 0) ++ continue; ++ ++ best = iter; ++ } ++ ++ if (!best) { ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ bch2_btree_iter_init(trans, iter, btree_id, pos, flags); ++ } else if ((trans->iters_live & (1ULL << best->idx)) || ++ (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ btree_iter_copy(iter, best); ++ } else { ++ iter = best; ++ } ++ ++ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); ++ iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); ++ ++ if (iter->flags & BTREE_ITER_INTENT) ++ bch2_btree_iter_upgrade(iter, 1); ++ else ++ bch2_btree_iter_downgrade(iter); ++ ++ BUG_ON(iter->btree_id != btree_id); ++ BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); ++ BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); ++ BUG_ON(trans->iters_live & (1ULL << iter->idx)); ++ ++ trans->iters_live |= 1ULL << iter->idx; ++ trans->iters_touched |= 1ULL << iter->idx; ++ ++ return iter; ++} ++ ++struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct btree_iter *iter = ++ __btree_trans_get_iter(trans, btree_id, pos, flags); ++ ++ if (!IS_ERR(iter)) ++ bch2_btree_iter_set_pos(iter, pos); ++ return iter; ++} ++ ++struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, ++ unsigned flags) ++{ ++ struct btree_iter *iter = ++ __btree_trans_get_iter(trans, btree_id, pos, ++ flags|BTREE_ITER_NODES); ++ unsigned i; ++ ++ BUG_ON(IS_ERR(iter)); ++ BUG_ON(bkey_cmp(iter->pos, pos)); ++ ++ iter->locks_want = locks_want; ++ iter->level = depth; ++ ++ for (i = 0; i < ARRAY_SIZE(iter->l); i++) ++ iter->l[i].b = NULL; ++ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; ++ ++ return iter; ++} ++ ++struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, ++ struct btree_iter *src) ++{ ++ struct btree_iter *iter; ++ ++ iter = 
btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ btree_iter_copy(iter, src); ++ ++ trans->iters_live |= 1ULL << iter->idx; ++ /* ++ * Don't mark it as touched, we don't need to preserve this iter since ++ * it's cheap to copy it again: ++ */ ++ trans->iters_touched &= ~(1ULL << iter->idx); ++ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ return iter; ++} ++ ++static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) ++{ ++ if (size > trans->mem_bytes) { ++ size_t old_bytes = trans->mem_bytes; ++ size_t new_bytes = roundup_pow_of_two(size); ++ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); ++ ++ if (!new_mem) ++ return -ENOMEM; ++ ++ trans->mem = new_mem; ++ trans->mem_bytes = new_bytes; ++ ++ if (old_bytes) { ++ trace_trans_restart_mem_realloced(trans->ip, new_bytes); ++ return -EINTR; ++ } ++ } ++ ++ return 0; ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) ++{ ++ void *p; ++ int ret; ++ ++ ret = bch2_trans_preload_mem(trans, trans->mem_top + size); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ p = trans->mem + trans->mem_top; ++ trans->mem_top += size; ++ return p; ++} ++ ++inline void bch2_trans_unlink_iters(struct btree_trans *trans) ++{ ++ u64 iters = trans->iters_linked & ++ ~trans->iters_touched & ++ ~trans->iters_live; ++ ++ while (iters) { ++ unsigned idx = __ffs64(iters); ++ ++ iters &= ~(1ULL << idx); ++ __bch2_trans_iter_free(trans, idx); ++ } ++} ++ ++void bch2_trans_reset(struct btree_trans *trans, unsigned flags) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ bch2_trans_unlink_iters(trans); ++ ++ if (flags & TRANS_RESET_ITERS) ++ trans->iters_live = 0; ++ ++ trans->iters_touched &= trans->iters_live; ++ ++ trans->nr_updates = 0; ++ ++ if (flags & TRANS_RESET_MEM) ++ trans->mem_top = 0; ++ ++ bch2_btree_iter_traverse_all(trans); ++} ++ ++void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, ++ unsigned expected_nr_iters, ++ size_t expected_mem_bytes) ++{ ++ memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); ++ ++ trans->c = c; ++ trans->ip = _RET_IP_; ++ trans->size = ARRAY_SIZE(trans->iters_onstack); ++ trans->iters = trans->iters_onstack; ++ trans->updates = trans->updates_onstack; ++ trans->updates_sorted = trans->updates_sorted_onstack; ++ trans->fs_usage_deltas = NULL; ++ ++ if (expected_nr_iters > trans->size) ++ bch2_trans_realloc_iters(trans, expected_nr_iters); ++ ++ if (expected_mem_bytes) ++ bch2_trans_preload_mem(trans, expected_mem_bytes); ++} ++ ++int bch2_trans_exit(struct btree_trans *trans) ++{ ++ bch2_trans_unlock(trans); ++ ++ kfree(trans->fs_usage_deltas); ++ kfree(trans->mem); ++ if (trans->used_mempool) ++ mempool_free(trans->iters, &trans->c->btree_iters_pool); ++ else if (trans->iters != trans->iters_onstack) ++ kfree(trans->iters); ++ trans->mem = (void *) 0x1; ++ trans->iters = (void *) 0x1; ++ ++ return trans->error ? 
-EIO : 0; ++} ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *c) ++{ ++ mempool_exit(&c->btree_iters_pool); ++} ++ ++int bch2_fs_btree_iter_init(struct bch_fs *c) ++{ ++ unsigned nr = BTREE_ITER_MAX; ++ ++ return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, ++ sizeof(struct btree_iter) * nr + ++ sizeof(struct btree_insert_entry) * (nr + 4) + ++ sizeof(u8) * (nr + 4)); ++} +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +new file mode 100644 +index 000000000000..4c5032222319 +--- /dev/null ++++ b/fs/bcachefs/btree_iter.h +@@ -0,0 +1,314 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_ITER_H ++#define _BCACHEFS_BTREE_ITER_H ++ ++#include "bset.h" ++#include "btree_types.h" ++ ++static inline void btree_iter_set_dirty(struct btree_iter *iter, ++ enum btree_iter_uptodate u) ++{ ++ iter->uptodate = max_t(unsigned, iter->uptodate, u); ++} ++ ++static inline struct btree *btree_iter_node(struct btree_iter *iter, ++ unsigned level) ++{ ++ return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL; ++} ++ ++static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, ++ const struct btree *b, unsigned level) ++{ ++ /* ++ * We don't compare the low bits of the lock sequence numbers because ++ * @iter might have taken a write lock on @b, and we don't want to skip ++ * the linked iterator if the sequence numbers were equal before taking ++ * that write lock. The lock sequence number is incremented by taking ++ * and releasing write locks and is even when unlocked: ++ */ ++ return iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1; ++} ++ ++static inline struct btree *btree_node_parent(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return btree_iter_node(iter, b->level + 1); ++} ++ ++static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) ++{ ++ return hweight64(trans->iters_linked) > 1; ++} ++ ++static inline int btree_iter_err(const struct btree_iter *iter) ++{ ++ return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; ++} ++ ++/* Iterate over iters within a transaction: */ ++ ++#define trans_for_each_iter_all(_trans, _iter) \ ++ for (_iter = (_trans)->iters; \ ++ _iter < (_trans)->iters + (_trans)->nr_iters; \ ++ _iter++) ++ ++static inline struct btree_iter * ++__trans_next_iter(struct btree_trans *trans, unsigned idx) ++{ ++ EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); ++ ++ for (; idx < trans->nr_iters; idx++) ++ if (trans->iters_linked & (1ULL << idx)) ++ return &trans->iters[idx]; ++ ++ return NULL; ++} ++ ++#define trans_for_each_iter(_trans, _iter) \ ++ for (_iter = __trans_next_iter((_trans), 0); \ ++ (_iter); \ ++ _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) ++ ++static inline bool __iter_has_node(const struct btree_iter *iter, ++ const struct btree *b) ++{ ++ return iter->l[b->level].b == b && ++ btree_node_lock_seq_matches(iter, b, b->level); ++} ++ ++static inline struct btree_iter * ++__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, ++ unsigned idx) ++{ ++ struct btree_iter *iter = __trans_next_iter(trans, idx); ++ ++ while (iter && !__iter_has_node(iter, b)) ++ iter = __trans_next_iter(trans, iter->idx + 1); ++ ++ return iter; ++} ++ ++#define trans_for_each_iter_with_node(_trans, _b, _iter) \ ++ for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ ++ (_iter); \ ++ _iter = __trans_next_iter_with_node((_trans), (_b), \ ++ (_iter)->idx + 1)) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_iter_verify(struct btree_iter *, struct btree *); ++void bch2_btree_trans_verify_locks(struct btree_trans *); ++#else ++static inline void bch2_btree_iter_verify(struct btree_iter *iter, ++ struct btree *b) {} ++static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} ++#endif ++ ++void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, ++ struct bkey_packed *); ++void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, ++ struct btree_node_iter *, struct bkey_packed *, ++ unsigned, unsigned); ++ ++bool bch2_trans_relock(struct btree_trans *); ++void bch2_trans_unlock(struct btree_trans *); ++ ++bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); ++bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); ++ ++static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); ++ ++ return iter->locks_want < new_locks_want ++ ? (!iter->trans->nounlock ++ ? __bch2_btree_iter_upgrade(iter, new_locks_want) ++ : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) ++ : iter->uptodate <= BTREE_ITER_NEED_PEEK; ++} ++ ++void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); ++ ++static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) ++{ ++ if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) ++ __bch2_btree_iter_downgrade(iter, 0); ++} ++ ++void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); ++void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); ++ ++void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); ++ ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *); ++ ++static inline int __must_check ++bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ return iter->uptodate >= BTREE_ITER_NEED_RELOCK ++ ? 
__bch2_btree_iter_traverse(iter) ++ : 0; ++} ++ ++int bch2_btree_iter_traverse_all(struct btree_trans *); ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *); ++struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); ++ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); ++ ++void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); ++void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); ++ ++static inline struct bpos btree_type_successor(enum btree_id id, ++ struct bpos pos) ++{ ++ if (id == BTREE_ID_INODES) { ++ pos.inode++; ++ pos.offset = 0; ++ } else if (!btree_node_type_is_extents(id)) { ++ pos = bkey_successor(pos); ++ } ++ ++ return pos; ++} ++ ++static inline struct bpos btree_type_predecessor(enum btree_id id, ++ struct bpos pos) ++{ ++ if (id == BTREE_ID_INODES) { ++ --pos.inode; ++ pos.offset = 0; ++ } else { ++ pos = bkey_predecessor(pos); ++ } ++ ++ return pos; ++} ++ ++static inline int __btree_iter_cmp(enum btree_id id, ++ struct bpos pos, ++ const struct btree_iter *r) ++{ ++ if (id != r->btree_id) ++ return id < r->btree_id ? -1 : 1; ++ return bkey_cmp(pos, r->pos); ++} ++ ++static inline int btree_iter_cmp(const struct btree_iter *l, ++ const struct btree_iter *r) ++{ ++ return __btree_iter_cmp(l->btree_id, l->pos, r); ++} ++ ++/* ++ * Unlocks before scheduling ++ * Note: does not revalidate iterator ++ */ ++static inline int bch2_trans_cond_resched(struct btree_trans *trans) ++{ ++ if (need_resched() || race_fault()) { ++ bch2_trans_unlock(trans); ++ schedule(); ++ return bch2_trans_relock(trans) ? 0 : -EINTR; ++ } else { ++ return 0; ++ } ++} ++ ++#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _locks_want, _depth, _flags, _b) \ ++ for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ ++ _start, _locks_want, _depth, _flags), \ ++ _b = bch2_btree_iter_peek_node(_iter); \ ++ (_b); \ ++ (_b) = bch2_btree_iter_next_node(_iter, _depth)) ++ ++#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _flags, _b) \ ++ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ 0, 0, _flags, _b) ++ ++static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, ++ unsigned flags) ++{ ++ return flags & BTREE_ITER_SLOTS ++ ? bch2_btree_iter_peek_slot(iter) ++ : bch2_btree_iter_peek(iter); ++} ++ ++static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, ++ unsigned flags) ++{ ++ return flags & BTREE_ITER_SLOTS ++ ? 
bch2_btree_iter_next_slot(iter) ++ : bch2_btree_iter_next(iter); ++} ++ ++static inline int bkey_err(struct bkey_s_c k) ++{ ++ return PTR_ERR_OR_ZERO(k.k); ++} ++ ++#define for_each_btree_key(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _ret) \ ++ for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ ++ bch2_trans_get_iter((_trans), (_btree_id), \ ++ (_start), (_flags))) ?: \ ++ PTR_ERR_OR_ZERO(((_k) = \ ++ __bch2_btree_iter_peek(_iter, _flags)).k); \ ++ !_ret && (_k).k; \ ++ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ ++ __bch2_btree_iter_next(_iter, _flags)).k)) ++ ++#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ ++ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ (_k) = __bch2_btree_iter_next(_iter, _flags)) ++ ++/* new multiple iterator interface: */ ++ ++int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); ++int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); ++ ++void bch2_trans_unlink_iters(struct btree_trans *); ++ ++struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id, ++ struct bpos, unsigned); ++struct btree_iter *bch2_trans_copy_iter(struct btree_trans *, ++ struct btree_iter *); ++struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, ++ enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned); ++ ++#define TRANS_RESET_ITERS (1 << 0) ++#define TRANS_RESET_MEM (1 << 1) ++ ++void bch2_trans_reset(struct btree_trans *, unsigned); ++ ++static inline void bch2_trans_begin(struct btree_trans *trans) ++{ ++ return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM); ++} ++ ++static inline void bch2_trans_begin_updates(struct btree_trans *trans) ++{ ++ return bch2_trans_reset(trans, TRANS_RESET_MEM); ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *, size_t); ++void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); ++int bch2_trans_exit(struct btree_trans *); ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *); ++int bch2_fs_btree_iter_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_ITER_H */ +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +new file mode 100644 +index 000000000000..fe8b58384a9e +--- /dev/null ++++ b/fs/bcachefs/btree_locking.h +@@ -0,0 +1,239 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_LOCKING_H ++#define _BCACHEFS_BTREE_LOCKING_H ++ ++/* ++ * Only for internal btree use: ++ * ++ * The btree iterator tracks what locks it wants to take, and what locks it ++ * currently has - here we have wrappers for locking/unlocking btree nodes and ++ * updating the iterator state ++ */ ++ ++#include ++ ++#include "btree_iter.h" ++ ++/* matches six lock types */ ++enum btree_node_locked_type { ++ BTREE_NODE_UNLOCKED = -1, ++ BTREE_NODE_READ_LOCKED = SIX_LOCK_read, ++ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, ++}; ++ ++static inline int btree_node_locked_type(struct btree_iter *iter, ++ unsigned level) ++{ ++ /* ++ * We're relying on the fact that if nodes_intent_locked is set ++ * nodes_locked must be set as well, so that we can compute without ++ * branches: ++ */ ++ return BTREE_NODE_UNLOCKED + ++ ((iter->nodes_locked >> level) & 1) + ++ ((iter->nodes_intent_locked >> level) & 1); ++} ++ ++static inline bool btree_node_intent_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; ++} ++ ++static inline bool btree_node_read_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ return 
btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; ++} ++ ++static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) ++{ ++ return iter->nodes_locked & (1 << level); ++} ++ ++static inline void mark_btree_node_unlocked(struct btree_iter *iter, ++ unsigned level) ++{ ++ iter->nodes_locked &= ~(1 << level); ++ iter->nodes_intent_locked &= ~(1 << level); ++} ++ ++static inline void mark_btree_node_locked(struct btree_iter *iter, ++ unsigned level, ++ enum six_lock_type type) ++{ ++ /* relying on this to avoid a branch */ ++ BUILD_BUG_ON(SIX_LOCK_read != 0); ++ BUILD_BUG_ON(SIX_LOCK_intent != 1); ++ ++ iter->nodes_locked |= 1 << level; ++ iter->nodes_intent_locked |= type << level; ++} ++ ++static inline void mark_btree_node_intent_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ mark_btree_node_locked(iter, level, SIX_LOCK_intent); ++} ++ ++static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) ++{ ++ return level < iter->locks_want ++ ? SIX_LOCK_intent ++ : SIX_LOCK_read; ++} ++ ++static inline enum btree_node_locked_type ++btree_lock_want(struct btree_iter *iter, int level) ++{ ++ if (level < iter->level) ++ return BTREE_NODE_UNLOCKED; ++ if (level < iter->locks_want) ++ return BTREE_NODE_INTENT_LOCKED; ++ if (level == iter->level) ++ return BTREE_NODE_READ_LOCKED; ++ return BTREE_NODE_UNLOCKED; ++} ++ ++static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) ++{ ++ int lock_type = btree_node_locked_type(iter, level); ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ if (lock_type != BTREE_NODE_UNLOCKED) ++ six_unlock_type(&iter->l[level].b->lock, lock_type); ++ mark_btree_node_unlocked(iter, level); ++} ++ ++static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) ++{ ++ EBUG_ON(!level && iter->trans->nounlock); ++ ++ __btree_node_unlock(iter, level); ++} ++ ++static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) ++{ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ while (iter->nodes_locked) ++ btree_node_unlock(iter, __ffs(iter->nodes_locked)); ++} ++ ++static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) ++{ ++ switch (type) { ++ case SIX_LOCK_read: ++ return BCH_TIME_btree_lock_contended_read; ++ case SIX_LOCK_intent: ++ return BCH_TIME_btree_lock_contended_intent; ++ case SIX_LOCK_write: ++ return BCH_TIME_btree_lock_contended_write; ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * wrapper around six locks that just traces lock contended time ++ */ ++static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, ++ enum six_lock_type type) ++{ ++ u64 start_time = local_clock(); ++ ++ six_lock_type(&b->lock, type, NULL, NULL); ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++} ++ ++static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, ++ enum six_lock_type type) ++{ ++ if (!six_trylock_type(&b->lock, type)) ++ __btree_node_lock_type(c, b, type); ++} ++ ++/* ++ * Lock a btree node if we already have it locked on one of our linked ++ * iterators: ++ */ ++static inline bool btree_node_lock_increment(struct btree_iter *iter, ++ struct btree *b, unsigned level, ++ enum btree_node_locked_type want) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked != iter && ++ linked->l[level].b == b && ++ btree_node_locked_type(linked, level) >= want) { ++ six_lock_increment(&b->lock, want); ++ return true; ++ } ++ ++ return false; ++} ++ 
++bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, ++ struct btree_iter *, enum six_lock_type); ++ ++static inline bool btree_node_lock(struct btree *b, struct bpos pos, ++ unsigned level, ++ struct btree_iter *iter, ++ enum six_lock_type type) ++{ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ return likely(six_trylock_type(&b->lock, type)) || ++ btree_node_lock_increment(iter, b, level, type) || ++ __bch2_btree_node_lock(b, pos, level, iter, type); ++} ++ ++bool __bch2_btree_node_relock(struct btree_iter *, unsigned); ++ ++static inline bool bch2_btree_node_relock(struct btree_iter *iter, ++ unsigned level) ++{ ++ EBUG_ON(btree_node_locked(iter, level) && ++ btree_node_locked_type(iter, level) != ++ __btree_lock_want(iter, level)); ++ ++ return likely(btree_node_locked(iter, level)) || ++ __bch2_btree_node_relock(iter, level); ++} ++ ++/* ++ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will ++ * succeed: ++ */ ++static inline void ++bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ ++ EBUG_ON(iter->l[b->level].b != b); ++ EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ linked->l[b->level].lock_seq += 2; ++ ++ six_unlock_write(&b->lock); ++} ++ ++void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); ++ ++void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); ++ ++static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) ++{ ++ EBUG_ON(iter->l[b->level].b != b); ++ EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq); ++ ++ if (unlikely(!six_trylock_write(&b->lock))) ++ __bch2_btree_node_lock_write(b, iter); ++} ++ ++#endif /* _BCACHEFS_BTREE_LOCKING_H */ ++ ++ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +new file mode 100644 +index 000000000000..efa68bb578ab +--- /dev/null ++++ b/fs/bcachefs/btree_types.h +@@ -0,0 +1,523 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_TYPES_H ++#define _BCACHEFS_BTREE_TYPES_H ++ ++#include ++#include ++#include ++ ++#include "bkey_methods.h" ++#include "buckets_types.h" ++#include "journal_types.h" ++ ++struct open_bucket; ++struct btree_update; ++struct btree_trans; ++ ++#define MAX_BSETS 3U ++ ++struct btree_nr_keys { ++ ++ /* ++ * Amount of live metadata (i.e. 
size of node after a compaction) in ++ * units of u64s ++ */ ++ u16 live_u64s; ++ u16 bset_u64s[MAX_BSETS]; ++ ++ /* live keys only: */ ++ u16 packed_keys; ++ u16 unpacked_keys; ++}; ++ ++struct bset_tree { ++ /* ++ * We construct a binary tree in an array as if the array ++ * started at 1, so that things line up on the same cachelines ++ * better: see comments in bset.c at cacheline_to_bkey() for ++ * details ++ */ ++ ++ /* size of the binary tree and prev array */ ++ u16 size; ++ ++ /* function of size - precalculated for to_inorder() */ ++ u16 extra; ++ ++ u16 data_offset; ++ u16 aux_data_offset; ++ u16 end_offset; ++ ++ struct bpos max_key; ++}; ++ ++struct btree_write { ++ struct journal_entry_pin journal; ++ struct closure_waitlist wait; ++}; ++ ++struct btree_alloc { ++ struct open_buckets ob; ++ BKEY_PADDED(k); ++}; ++ ++struct btree { ++ /* Hottest entries first */ ++ struct rhash_head hash; ++ ++ /* Key/pointer for this btree node */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ ++ struct six_lock lock; ++ ++ unsigned long flags; ++ u16 written; ++ u8 level; ++ u8 btree_id; ++ u8 nsets; ++ u8 nr_key_bits; ++ ++ struct bkey_format format; ++ ++ struct btree_node *data; ++ void *aux_data; ++ ++ /* ++ * Sets of sorted keys - the real btree node - plus a binary search tree ++ * ++ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point ++ * to the memory we have allocated for this btree node. Additionally, ++ * set[0]->data points to the entire btree node as it exists on disk. ++ */ ++ struct bset_tree set[MAX_BSETS]; ++ ++ struct btree_nr_keys nr; ++ u16 sib_u64s[2]; ++ u16 whiteout_u64s; ++ u16 uncompacted_whiteout_u64s; ++ u8 page_order; ++ u8 unpack_fn_len; ++ ++ /* ++ * XXX: add a delete sequence number, so when bch2_btree_node_relock() ++ * fails because the lock sequence number has changed - i.e. the ++ * contents were modified - we can still relock the node if it's still ++ * the one we want, without redoing the traversal ++ */ ++ ++ /* ++ * For asynchronous splits/interior node updates: ++ * When we do a split, we allocate new child nodes and update the parent ++ * node to point to them: we update the parent in memory immediately, ++ * but then we must wait until the children have been written out before ++ * the update to the parent can be written - this is a list of the ++ * btree_updates that are blocking this node from being ++ * written: ++ */ ++ struct list_head write_blocked; ++ ++ /* ++ * Also for asynchronous splits/interior node updates: ++ * If a btree node isn't reachable yet, we don't want to kick off ++ * another write - because that write also won't yet be reachable and ++ * marking it as completed before it's reachable would be incorrect: ++ */ ++ unsigned long will_make_reachable; ++ ++ struct open_buckets ob; ++ ++ /* lru list */ ++ struct list_head list; ++ ++ struct btree_write writes[2]; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ bool *expensive_debug_checks; ++#endif ++}; ++ ++struct btree_cache { ++ struct rhashtable table; ++ bool table_init_done; ++ /* ++ * We never free a struct btree, except on shutdown - we just put it on ++ * the btree_cache_freed list and reuse it later. This simplifies the ++ * code, and it doesn't cost us much memory as the memory usage is ++ * dominated by buffers that hold the actual btree node data and those ++ * can be freed - and the number of struct btrees allocated is ++ * effectively bounded. 
++ * ++ * btree_cache_freeable effectively is a small cache - we use it because ++ * high order page allocations can be rather expensive, and it's quite ++ * common to delete and allocate btree nodes in quick succession. It ++ * should never grow past ~2-3 nodes in practice. ++ */ ++ struct mutex lock; ++ struct list_head live; ++ struct list_head freeable; ++ struct list_head freed; ++ ++ /* Number of elements in live + freeable lists */ ++ unsigned used; ++ unsigned reserve; ++ struct shrinker shrink; ++ ++ /* ++ * If we need to allocate memory for a new btree node and that ++ * allocation fails, we can cannibalize another node in the btree cache ++ * to satisfy the allocation - lock to guarantee only one thread does ++ * this at a time: ++ */ ++ struct task_struct *alloc_lock; ++ struct closure_waitlist alloc_wait; ++}; ++ ++struct btree_node_iter { ++ struct btree_node_iter_set { ++ u16 k, end; ++ } data[MAX_BSETS]; ++}; ++ ++enum btree_iter_type { ++ BTREE_ITER_KEYS, ++ BTREE_ITER_NODES, ++}; ++ ++#define BTREE_ITER_TYPE ((1 << 2) - 1) ++ ++#define BTREE_ITER_SLOTS (1 << 2) ++#define BTREE_ITER_INTENT (1 << 3) ++#define BTREE_ITER_PREFETCH (1 << 4) ++#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) ++/* ++ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for ++ * @pos or the first key strictly greater than @pos ++ */ ++#define BTREE_ITER_IS_EXTENTS (1 << 6) ++#define BTREE_ITER_ERROR (1 << 7) ++ ++enum btree_iter_uptodate { ++ BTREE_ITER_UPTODATE = 0, ++ BTREE_ITER_NEED_PEEK = 1, ++ BTREE_ITER_NEED_RELOCK = 2, ++ BTREE_ITER_NEED_TRAVERSE = 3, ++}; ++ ++/* ++ * @pos - iterator's current position ++ * @level - current btree depth ++ * @locks_want - btree level below which we start taking intent locks ++ * @nodes_locked - bitmask indicating which nodes in @nodes are locked ++ * @nodes_intent_locked - bitmask indicating which locks are intent locks ++ */ ++struct btree_iter { ++ u8 idx; ++ ++ struct btree_trans *trans; ++ struct bpos pos; ++ ++ u8 flags; ++ enum btree_iter_uptodate uptodate:4; ++ enum btree_id btree_id:4; ++ unsigned level:4, ++ locks_want:4, ++ nodes_locked:4, ++ nodes_intent_locked:4; ++ ++ struct btree_iter_level { ++ struct btree *b; ++ struct btree_node_iter iter; ++ u32 lock_seq; ++ } l[BTREE_MAX_DEPTH]; ++ ++ /* ++ * Current unpacked key - so that bch2_btree_iter_next()/ ++ * bch2_btree_iter_next_slot() can correctly advance pos. 
++ */ ++ struct bkey k; ++}; ++ ++static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) ++{ ++ return iter->flags & BTREE_ITER_TYPE; ++} ++ ++struct btree_insert_entry { ++ struct bkey_i *k; ++ struct btree_iter *iter; ++}; ++ ++#define BTREE_ITER_MAX 64 ++ ++struct btree_trans { ++ struct bch_fs *c; ++ unsigned long ip; ++ ++ u64 iters_linked; ++ u64 iters_live; ++ u64 iters_touched; ++ ++ u8 nr_iters; ++ u8 nr_updates; ++ u8 size; ++ unsigned used_mempool:1; ++ unsigned error:1; ++ unsigned nounlock:1; ++ ++ unsigned mem_top; ++ unsigned mem_bytes; ++ void *mem; ++ ++ struct btree_iter *iters; ++ struct btree_insert_entry *updates; ++ u8 *updates_sorted; ++ ++ /* update path: */ ++ struct journal_res journal_res; ++ struct journal_preres journal_preres; ++ u64 *journal_seq; ++ struct disk_reservation *disk_res; ++ unsigned flags; ++ unsigned journal_u64s; ++ struct replicas_delta_list *fs_usage_deltas; ++ ++ struct btree_iter iters_onstack[2]; ++ struct btree_insert_entry updates_onstack[6]; ++ u8 updates_sorted_onstack[6]; ++}; ++ ++#define BTREE_FLAG(flag) \ ++static inline bool btree_node_ ## flag(struct btree *b) \ ++{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void set_btree_node_ ## flag(struct btree *b) \ ++{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void clear_btree_node_ ## flag(struct btree *b) \ ++{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } ++ ++enum btree_flags { ++ BTREE_NODE_read_in_flight, ++ BTREE_NODE_read_error, ++ BTREE_NODE_dirty, ++ BTREE_NODE_need_write, ++ BTREE_NODE_noevict, ++ BTREE_NODE_write_idx, ++ BTREE_NODE_accessed, ++ BTREE_NODE_write_in_flight, ++ BTREE_NODE_just_written, ++ BTREE_NODE_dying, ++ BTREE_NODE_fake, ++}; ++ ++BTREE_FLAG(read_in_flight); ++BTREE_FLAG(read_error); ++BTREE_FLAG(dirty); ++BTREE_FLAG(need_write); ++BTREE_FLAG(noevict); ++BTREE_FLAG(write_idx); ++BTREE_FLAG(accessed); ++BTREE_FLAG(write_in_flight); ++BTREE_FLAG(just_written); ++BTREE_FLAG(dying); ++BTREE_FLAG(fake); ++ ++static inline struct btree_write *btree_current_write(struct btree *b) ++{ ++ return b->writes + btree_node_write_idx(b); ++} ++ ++static inline struct btree_write *btree_prev_write(struct btree *b) ++{ ++ return b->writes + (btree_node_write_idx(b) ^ 1); ++} ++ ++static inline struct bset_tree *bset_tree_last(struct btree *b) ++{ ++ EBUG_ON(!b->nsets); ++ return b->set + b->nsets - 1; ++} ++ ++static inline void * ++__btree_node_offset_to_ptr(const struct btree *b, u16 offset) ++{ ++ return (void *) ((u64 *) b->data + 1 + offset); ++} ++ ++static inline u16 ++__btree_node_ptr_to_offset(const struct btree *b, const void *p) ++{ ++ u16 ret = (u64 *) p - 1 - (u64 *) b->data; ++ ++ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); ++ return ret; ++} ++ ++static inline struct bset *bset(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return __btree_node_offset_to_ptr(b, t->data_offset); ++} ++ ++static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) ++{ ++ t->end_offset = ++ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); ++} ++ ++static inline void set_btree_bset(struct btree *b, struct bset_tree *t, ++ const struct bset *i) ++{ ++ t->data_offset = __btree_node_ptr_to_offset(b, i); ++ set_btree_bset_end(b, t); ++} ++ ++static inline struct bset *btree_bset_first(struct btree *b) ++{ ++ return bset(b, b->set); ++} ++ ++static inline struct bset *btree_bset_last(struct btree *b) ++{ ++ return bset(b, bset_tree_last(b)); ++} ++ ++static inline u16 
++__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) ++{ ++ return __btree_node_ptr_to_offset(b, k); ++} ++ ++static inline struct bkey_packed * ++__btree_node_offset_to_key(const struct btree *b, u16 k) ++{ ++ return __btree_node_offset_to_ptr(b, k); ++} ++ ++static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) ++{ ++ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); ++} ++ ++#define btree_bkey_first(_b, _t) \ ++({ \ ++ EBUG_ON(bset(_b, _t)->start != \ ++ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ ++ \ ++ bset(_b, _t)->start; \ ++}) ++ ++#define btree_bkey_last(_b, _t) \ ++({ \ ++ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ ++ vstruct_last(bset(_b, _t))); \ ++ \ ++ __btree_node_offset_to_key(_b, (_t)->end_offset); \ ++}) ++ ++static inline unsigned bset_u64s(struct bset_tree *t) ++{ ++ return t->end_offset - t->data_offset - ++ sizeof(struct bset) / sizeof(u64); ++} ++ ++static inline unsigned bset_byte_offset(struct btree *b, void *i) ++{ ++ return i - (void *) b->data; ++} ++ ++enum btree_node_type { ++#define x(kwd, val, name) BKEY_TYPE_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BKEY_TYPE_BTREE, ++}; ++ ++/* Type of a key in btree @id at level @level: */ ++static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) ++{ ++ return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; ++} ++ ++/* Type of keys @b contains: */ ++static inline enum btree_node_type btree_node_type(struct btree *b) ++{ ++ return __btree_node_type(b->level, b->btree_id); ++} ++ ++static inline bool btree_node_type_is_extents(enum btree_node_type type) ++{ ++ switch (type) { ++ case BKEY_TYPE_EXTENTS: ++ case BKEY_TYPE_REFLINK: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool btree_node_is_extents(struct btree *b) ++{ ++ return btree_node_type_is_extents(btree_node_type(b)); ++} ++ ++#define BTREE_NODE_TYPE_HAS_TRIGGERS \ ++ ((1U << BKEY_TYPE_EXTENTS)| \ ++ (1U << BKEY_TYPE_ALLOC)| \ ++ (1U << BKEY_TYPE_INODES)| \ ++ (1U << BKEY_TYPE_REFLINK)| \ ++ (1U << BKEY_TYPE_EC)| \ ++ (1U << BKEY_TYPE_BTREE)) ++ ++#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ++ ((1U << BKEY_TYPE_EXTENTS)| \ ++ (1U << BKEY_TYPE_INODES)| \ ++ (1U << BKEY_TYPE_REFLINK)) ++ ++static inline bool btree_node_type_needs_gc(enum btree_node_type type) ++{ ++ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++} ++ ++struct btree_root { ++ struct btree *b; ++ ++ struct btree_update *as; ++ ++ /* On disk root - see async splits: */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ u8 level; ++ u8 alive; ++ s8 error; ++}; ++ ++/* ++ * Optional hook that will be called just prior to a btree node update, when ++ * we're holding the write lock and we know what key is about to be overwritten: ++ */ ++ ++enum btree_insert_ret { ++ BTREE_INSERT_OK, ++ /* leaf node needs to be split */ ++ BTREE_INSERT_BTREE_NODE_FULL, ++ BTREE_INSERT_ENOSPC, ++ BTREE_INSERT_NEED_MARK_REPLICAS, ++ BTREE_INSERT_NEED_JOURNAL_RES, ++}; ++ ++enum btree_gc_coalesce_fail_reason { ++ BTREE_GC_COALESCE_FAIL_RESERVE_GET, ++ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, ++ BTREE_GC_COALESCE_FAIL_FORMAT_FITS, ++}; ++ ++enum btree_node_sibling { ++ btree_prev_sib, ++ btree_next_sib, ++}; ++ ++typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, ++ struct btree *, ++ struct btree_node_iter *); ++ ++#endif /* _BCACHEFS_BTREE_TYPES_H */ +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +new 
file mode 100644 +index 000000000000..ad8cbf3fb778 +--- /dev/null ++++ b/fs/bcachefs/btree_update.h +@@ -0,0 +1,157 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_H ++#define _BCACHEFS_BTREE_UPDATE_H ++ ++#include "btree_iter.h" ++#include "journal.h" ++ ++struct bch_fs; ++struct btree; ++ ++void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, ++ struct btree_node_iter *, struct bkey_i *); ++void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *); ++ ++enum { ++ __BTREE_INSERT_ATOMIC, ++ __BTREE_INSERT_NOUNLOCK, ++ __BTREE_INSERT_NOFAIL, ++ __BTREE_INSERT_NOCHECK_RW, ++ __BTREE_INSERT_LAZY_RW, ++ __BTREE_INSERT_USE_RESERVE, ++ __BTREE_INSERT_USE_ALLOC_RESERVE, ++ __BTREE_INSERT_JOURNAL_REPLAY, ++ __BTREE_INSERT_JOURNAL_RESERVED, ++ __BTREE_INSERT_NOMARK_OVERWRITES, ++ __BTREE_INSERT_NOMARK, ++ __BTREE_INSERT_NO_CLEAR_REPLICAS, ++ __BTREE_INSERT_BUCKET_INVALIDATE, ++ __BTREE_INSERT_NOWAIT, ++ __BTREE_INSERT_GC_LOCK_HELD, ++ __BCH_HASH_SET_MUST_CREATE, ++ __BCH_HASH_SET_MUST_REPLACE, ++}; ++ ++/* ++ * Don't drop/retake locks before doing btree update, instead return -EINTR if ++ * we had to drop locks for any reason ++ */ ++#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC) ++ ++/* ++ * Don't drop locks _after_ successfully updating btree: ++ */ ++#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) ++ ++/* Don't check for -ENOSPC: */ ++#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) ++ ++#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) ++#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) ++ ++/* for copygc, or when merging btree nodes */ ++#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) ++#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) ++ ++/* Insert is for journal replay - don't get journal reservations: */ ++#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) ++ ++#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) ++ ++/* Don't mark overwrites, just new key: */ ++#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) ++ ++/* Don't call mark new key at all: */ ++#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) ++ ++#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) ++ ++#define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) ++ ++/* Don't block on allocation failure (for new btree nodes: */ ++#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) ++#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) ++ ++#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) ++#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) ++ ++int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); ++ ++int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, ++ struct disk_reservation *, u64 *, int flags); ++ ++int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64 *); ++int bch2_btree_delete_range(struct bch_fs *, enum btree_id, ++ struct bpos, struct bpos, u64 *); ++ ++int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, ++ __le64, unsigned); ++int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, ++ struct btree *, struct bkey_i_btree_ptr *); ++ ++int 
__bch2_trans_commit(struct btree_trans *); ++ ++/** ++ * bch2_trans_commit - insert keys at given iterator positions ++ * ++ * This is main entry point for btree updates. ++ * ++ * Return values: ++ * -EINTR: locking changed, this function should be called again. Only returned ++ * if passed BTREE_INSERT_ATOMIC. ++ * -EROFS: filesystem read only ++ * -EIO: journal or btree node IO error ++ */ ++static inline int bch2_trans_commit(struct btree_trans *trans, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ unsigned flags) ++{ ++ trans->disk_res = disk_res; ++ trans->journal_seq = journal_seq; ++ trans->flags = flags; ++ ++ return __bch2_trans_commit(trans); ++} ++ ++static inline void bch2_trans_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *k) ++{ ++ EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ trans->updates[trans->nr_updates++] = (struct btree_insert_entry) { ++ .iter = iter, .k = k ++ }; ++} ++ ++#define bch2_trans_do(_c, _journal_seq, _flags, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ \ ++ do { \ ++ bch2_trans_begin(&trans); \ ++ \ ++ _ret = (_do) ?: bch2_trans_commit(&trans, NULL, \ ++ (_journal_seq), (_flags)); \ ++ } while (_ret == -EINTR); \ ++ \ ++ bch2_trans_exit(&trans); \ ++ _ret; \ ++}) ++ ++#define trans_for_each_update(_trans, _i) \ ++ for ((_i) = (_trans)->updates; \ ++ (_i) < (_trans)->updates + (_trans)->nr_updates; \ ++ (_i)++) ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_H */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +new file mode 100644 +index 000000000000..40d801e1094f +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.c +@@ -0,0 +1,2234 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "extents.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static void btree_node_will_make_reachable(struct btree_update *, ++ struct btree *); ++static void btree_update_drop_new_node(struct bch_fs *, struct btree *); ++static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int); ++ ++/* Debug code: */ ++ ++static void btree_node_interior_verify(struct btree *b) ++{ ++ struct btree_node_iter iter; ++ struct bkey_packed *k; ++ ++ BUG_ON(!b->level); ++ ++ bch2_btree_node_iter_init(&iter, b, &b->key.k.p); ++#if 1 ++ BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || ++ bkey_cmp_left_packed(b, k, &b->key.k.p)); ++ ++ BUG_ON((bch2_btree_node_iter_advance(&iter, b), ++ !bch2_btree_node_iter_end(&iter))); ++#else ++ const char *msg; ++ ++ msg = "not found"; ++ k = bch2_btree_node_iter_peek(&iter, b); ++ if (!k) ++ goto err; ++ ++ msg = "isn't what it should be"; ++ if (bkey_cmp_left_packed(b, k, &b->key.k.p)) ++ goto err; ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ msg = "isn't last key"; ++ if (!bch2_btree_node_iter_end(&iter)) ++ goto err; ++ return; ++err: ++ bch2_dump_btree_node(b); ++ printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode, ++ b->key.k.p.offset, msg); ++ BUG(); ++#endif ++} ++ ++/* Calculate ideal packed bkey format 
for new btree nodes: */ ++ ++void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) ++{ ++ struct bkey_packed *k; ++ struct bset_tree *t; ++ struct bkey uk; ++ ++ bch2_bkey_format_add_pos(s, b->data->min_key); ++ ++ for_each_bset(b, t) ++ for (k = btree_bkey_first(b, t); ++ k != btree_bkey_last(b, t); ++ k = bkey_next(k)) ++ if (!bkey_whiteout(k)) { ++ uk = bkey_unpack_key(b, k); ++ bch2_bkey_format_add_key(s, &uk); ++ } ++} ++ ++static struct bkey_format bch2_btree_calc_format(struct btree *b) ++{ ++ struct bkey_format_state s; ++ ++ bch2_bkey_format_init(&s); ++ __bch2_btree_calc_format(&s, b); ++ ++ return bch2_bkey_format_done(&s); ++} ++ ++static size_t btree_node_u64s_with_format(struct btree *b, ++ struct bkey_format *new_f) ++{ ++ struct bkey_format *old_f = &b->format; ++ ++ /* stupid integer promotion rules */ ++ ssize_t delta = ++ (((int) new_f->key_u64s - old_f->key_u64s) * ++ (int) b->nr.packed_keys) + ++ (((int) new_f->key_u64s - BKEY_U64s) * ++ (int) b->nr.unpacked_keys); ++ ++ BUG_ON(delta + b->nr.live_u64s < 0); ++ ++ return b->nr.live_u64s + delta; ++} ++ ++/** ++ * btree_node_format_fits - check if we could rewrite node with a new format ++ * ++ * This assumes all keys can pack with the new format -- it just checks if ++ * the re-packed keys would fit inside the node itself. ++ */ ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, ++ struct bkey_format *new_f) ++{ ++ size_t u64s = btree_node_u64s_with_format(b, new_f); ++ ++ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); ++} ++ ++/* Btree node freeing/allocation: */ ++ ++static bool btree_key_matches(struct bch_fs *c, ++ struct bkey_s_c l, ++ struct bkey_s_c r) ++{ ++ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l); ++ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r); ++ const struct bch_extent_ptr *ptr1, *ptr2; ++ ++ bkey_for_each_ptr(ptrs1, ptr1) ++ bkey_for_each_ptr(ptrs2, ptr2) ++ if (ptr1->dev == ptr2->dev && ++ ptr1->gen == ptr2->gen && ++ ptr1->offset == ptr2->offset) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * We're doing the index update that makes @b unreachable, update stuff to ++ * reflect that: ++ * ++ * Must be called _before_ btree_update_updated_root() or ++ * btree_update_updated_node: ++ */ ++static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, ++ struct bkey_s_c k, ++ struct bch_fs_usage *stats) ++{ ++ struct bch_fs *c = as->c; ++ struct pending_btree_node_free *d; ++ ++ for (d = as->pending; d < as->pending + as->nr_pending; d++) ++ if (!bkey_cmp(k.k->p, d->key.k.p) && ++ btree_key_matches(c, k, bkey_i_to_s_c(&d->key))) ++ goto found; ++ BUG(); ++found: ++ BUG_ON(d->index_update_done); ++ d->index_update_done = true; ++ ++ /* ++ * We're dropping @k from the btree, but it's still live until the ++ * index update is persistent so we need to keep a reference around for ++ * mark and sweep to find - that's primarily what the ++ * btree_node_pending_free list is for. ++ * ++ * So here (when we set index_update_done = true), we're moving an ++ * existing reference to a different part of the larger "gc keyspace" - ++ * and the new position comes after the old position, since GC marks ++ * the pending free list after it walks the btree. 
++ * ++ * If we move the reference while mark and sweep is _between_ the old ++ * and the new position, mark and sweep will see the reference twice ++ * and it'll get double accounted - so check for that here and subtract ++ * to cancel out one of mark and sweep's markings if necessary: ++ */ ++ ++ if (gc_pos_cmp(c->gc_pos, b ++ ? gc_pos_btree_node(b) ++ : gc_pos_btree_root(as->btree_id)) >= 0 && ++ gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) ++ bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), ++ 0, 0, NULL, 0, ++ BCH_BUCKET_MARK_OVERWRITE| ++ BCH_BUCKET_MARK_GC); ++} ++ ++static void __btree_node_free(struct bch_fs *c, struct btree *b) ++{ ++ trace_btree_node_free(c, b); ++ ++ BUG_ON(btree_node_dirty(b)); ++ BUG_ON(btree_node_need_write(b)); ++ BUG_ON(b == btree_node_root(c, b)); ++ BUG_ON(b->ob.nr); ++ BUG_ON(!list_empty(&b->write_blocked)); ++ BUG_ON(b->will_make_reachable); ++ ++ clear_btree_node_noevict(b); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++} ++ ++void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) ++{ ++ struct open_buckets ob = b->ob; ++ ++ btree_update_drop_new_node(c, b); ++ ++ b->ob.nr = 0; ++ ++ clear_btree_node_dirty(b); ++ ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->lock); ++ ++ bch2_open_buckets_put(c, &ob); ++} ++ ++void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter(iter->trans, linked) ++ BUG_ON(linked->l[b->level].b == b); ++ ++ /* ++ * Is this a node that isn't reachable on disk yet? ++ * ++ * Nodes that aren't reachable yet have writes blocked until they're ++ * reachable - now that we've cancelled any pending writes and moved ++ * things waiting on that write to wait on this update, we can drop this ++ * node from the list of nodes that the other update is making ++ * reachable, prior to freeing it: ++ */ ++ btree_update_drop_new_node(c, b); ++ ++ six_lock_write(&b->lock, NULL, NULL); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); ++} ++ ++static void bch2_btree_node_free_ondisk(struct bch_fs *c, ++ struct pending_btree_node_free *pending) ++{ ++ BUG_ON(!pending->index_update_done); ++ ++ bch2_mark_key(c, bkey_i_to_s_c(&pending->key), ++ 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE); ++ ++ if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) ++ bch2_mark_key(c, bkey_i_to_s_c(&pending->key), ++ 0, 0, NULL, 0, ++ BCH_BUCKET_MARK_OVERWRITE| ++ BCH_BUCKET_MARK_GC); ++} ++ ++static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, ++ struct disk_reservation *res, ++ struct closure *cl, ++ unsigned flags) ++{ ++ struct write_point *wp; ++ struct btree *b; ++ BKEY_PADDED(k) tmp; ++ struct open_buckets ob = { .nr = 0 }; ++ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; ++ unsigned nr_reserve; ++ enum alloc_reserve alloc_reserve; ++ ++ if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { ++ nr_reserve = 0; ++ alloc_reserve = RESERVE_ALLOC; ++ } else if (flags & BTREE_INSERT_USE_RESERVE) { ++ nr_reserve = BTREE_NODE_RESERVE / 2; ++ alloc_reserve = RESERVE_BTREE; ++ } else { ++ nr_reserve = BTREE_NODE_RESERVE; ++ alloc_reserve = RESERVE_NONE; ++ } ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ if (c->btree_reserve_cache_nr > nr_reserve) { ++ struct btree_alloc *a = ++ 
&c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ ob = a->ob; ++ bkey_copy(&tmp.k, &a->k); ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ goto mem_alloc; ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++retry: ++ wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, ++ writepoint_ptr(&c->btree_write_point), ++ &devs_have, ++ res->nr_replicas, ++ c->opts.metadata_replicas_required, ++ alloc_reserve, 0, cl); ++ if (IS_ERR(wp)) ++ return ERR_CAST(wp); ++ ++ if (wp->sectors_free < c->opts.btree_node_size) { ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ob->sectors_free < c->opts.btree_node_size) ++ ob->sectors_free = 0; ++ ++ bch2_alloc_sectors_done(c, wp); ++ goto retry; ++ } ++ ++ bkey_btree_ptr_init(&tmp.k); ++ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); ++ ++ bch2_open_bucket_get(c, wp, &ob); ++ bch2_alloc_sectors_done(c, wp); ++mem_alloc: ++ b = bch2_btree_node_mem_alloc(c); ++ ++ /* we hold cannibalize_lock: */ ++ BUG_ON(IS_ERR(b)); ++ BUG_ON(b->ob.nr); ++ ++ bkey_copy(&b->key, &tmp.k); ++ b->ob = ob; ++ ++ return b; ++} ++ ++static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ BUG_ON(!as->reserve->nr); ++ ++ b = as->reserve->b[--as->reserve->nr]; ++ ++ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id)); ++ ++ set_btree_node_accessed(b); ++ set_btree_node_dirty(b); ++ set_btree_node_need_write(b); ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ memset(&b->nr, 0, sizeof(b->nr)); ++ b->data->magic = cpu_to_le64(bset_magic(c)); ++ b->data->flags = 0; ++ SET_BTREE_NODE_ID(b->data, as->btree_id); ++ SET_BTREE_NODE_LEVEL(b->data, level); ++ b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0]; ++ ++ bch2_btree_build_aux_trees(b); ++ ++ btree_node_will_make_reachable(as, b); ++ ++ trace_btree_node_alloc(c, b); ++ return b; ++} ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b, ++ struct bkey_format format) ++{ ++ struct btree *n; ++ ++ n = bch2_btree_node_alloc(as, b->level); ++ ++ n->data->min_key = b->data->min_key; ++ n->data->max_key = b->data->max_key; ++ n->data->format = format; ++ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); ++ ++ btree_node_set_format(n, format); ++ ++ bch2_btree_sort_into(as->c, n, b); ++ ++ btree_node_reset_sib_u64s(n); ++ ++ n->key.k.p = b->key.k.p; ++ return n; ++} ++ ++static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bkey_format new_f = bch2_btree_calc_format(b); ++ ++ /* ++ * The keys might expand with the new format - if they wouldn't fit in ++ * the btree node anymore, use the old format for now: ++ */ ++ if (!bch2_btree_node_format_fits(as->c, b, &new_f)) ++ new_f = b->format; ++ ++ return __bch2_btree_node_alloc_replacement(as, b, new_f); ++} ++ ++static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) ++{ ++ struct btree *b = bch2_btree_node_alloc(as, level); ++ ++ b->data->min_key = POS_MIN; ++ b->data->max_key = POS_MAX; ++ b->data->format = bch2_btree_calc_format(b); ++ b->key.k.p = POS_MAX; ++ ++ btree_node_set_format(b, b->data->format); ++ bch2_btree_build_aux_trees(b); ++ ++ six_unlock_write(&b->lock); ++ ++ return b; ++} ++ ++static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve) ++{ ++ 
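++	/*
++	 * Roughly: return an unused reserve.  The disk reservation is
++	 * dropped, the still-unused on-disk allocations (open buckets +
++	 * keys) are stashed in c->btree_reserve_cache while there's room
++	 * and released otherwise, and the in-memory nodes themselves are
++	 * always freed back to the btree node cache.
++	 */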
bch2_disk_reservation_put(c, &reserve->disk_res); ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ ++ while (reserve->nr) { ++ struct btree *b = reserve->b[--reserve->nr]; ++ ++ six_unlock_write(&b->lock); ++ ++ if (c->btree_reserve_cache_nr < ++ ARRAY_SIZE(c->btree_reserve_cache)) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; ++ ++ a->ob = b->ob; ++ b->ob.nr = 0; ++ bkey_copy(&a->k, &b->key); ++ } else { ++ bch2_open_buckets_put(c, &b->ob); ++ } ++ ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->lock); ++ ++ six_unlock_intent(&b->lock); ++ } ++ ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++ mempool_free(reserve, &c->btree_reserve_pool); ++} ++ ++static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, ++ unsigned nr_nodes, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct btree_reserve *reserve; ++ struct btree *b; ++ struct disk_reservation disk_res = { 0, 0 }; ++ unsigned sectors = nr_nodes * c->opts.btree_node_size; ++ int ret, disk_res_flags = 0; ++ ++ if (flags & BTREE_INSERT_NOFAIL) ++ disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; ++ ++ /* ++ * This check isn't necessary for correctness - it's just to potentially ++ * prevent us from doing a lot of work that'll end up being wasted: ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ if (bch2_disk_reservation_get(c, &disk_res, sectors, ++ c->opts.metadata_replicas, ++ disk_res_flags)) ++ return ERR_PTR(-ENOSPC); ++ ++ BUG_ON(nr_nodes > BTREE_RESERVE_MAX); ++ ++ /* ++ * Protects reaping from the btree node cache and using the btree node ++ * open bucket reserve: ++ */ ++ ret = bch2_btree_cache_cannibalize_lock(c, cl); ++ if (ret) { ++ bch2_disk_reservation_put(c, &disk_res); ++ return ERR_PTR(ret); ++ } ++ ++ reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO); ++ ++ reserve->disk_res = disk_res; ++ reserve->nr = 0; ++ ++ while (reserve->nr < nr_nodes) { ++ b = __bch2_btree_node_alloc(c, &disk_res, ++ flags & BTREE_INSERT_NOWAIT ++ ? 
NULL : cl, flags); ++ if (IS_ERR(b)) { ++ ret = PTR_ERR(b); ++ goto err_free; ++ } ++ ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); ++ if (ret) ++ goto err_free; ++ ++ reserve->b[reserve->nr++] = b; ++ } ++ ++ bch2_btree_cache_cannibalize_unlock(c); ++ return reserve; ++err_free: ++ bch2_btree_reserve_put(c, reserve); ++ bch2_btree_cache_cannibalize_unlock(c); ++ trace_btree_reserve_get_fail(c, nr_nodes, cl); ++ return ERR_PTR(ret); ++} ++ ++/* Asynchronous interior node update machinery */ ++ ++static void bch2_btree_update_free(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ ++ bch2_journal_pin_flush(&c->journal, &as->journal); ++ ++ BUG_ON(as->nr_new_nodes); ++ BUG_ON(as->nr_pending); ++ ++ if (as->reserve) ++ bch2_btree_reserve_put(c, as->reserve); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_del(&as->list); ++ ++ closure_debug_destroy(&as->cl); ++ mempool_free(as, &c->btree_interior_update_pool); ++ ++ closure_wake_up(&c->btree_interior_update_wait); ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void btree_update_nodes_reachable(struct closure *cl) ++{ ++ struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct bch_fs *c = as->c; ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ while (as->nr_new_nodes) { ++ struct btree *b = as->new_nodes[--as->nr_new_nodes]; ++ ++ BUG_ON(b->will_make_reachable != (unsigned long) as); ++ b->will_make_reachable = 0; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * b->will_make_reachable prevented it from being written, so ++ * write it now if it needs to be written: ++ */ ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); ++ six_unlock_read(&b->lock); ++ mutex_lock(&c->btree_interior_update_lock); ++ } ++ ++ while (as->nr_pending) ++ bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ closure_wake_up(&as->wait); ++ ++ bch2_btree_update_free(as); ++} ++ ++static void btree_update_wait_on_journal(struct closure *cl) ++{ ++ struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct bch_fs *c = as->c; ++ int ret; ++ ++ ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl); ++ if (ret == -EAGAIN) { ++ continue_at(cl, btree_update_wait_on_journal, system_wq); ++ return; ++ } ++ if (ret < 0) ++ goto err; ++ ++ bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl); ++err: ++ continue_at(cl, btree_update_nodes_reachable, system_wq); ++} ++ ++static void btree_update_nodes_written(struct closure *cl) ++{ ++ struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ ++ /* ++ * We did an update to a parent node where the pointers we added pointed ++ * to child nodes that weren't written yet: now, the child nodes have ++ * been written so we can write out the update to the interior node. 
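++ *
++ * A rough sketch of the usual case (BTREE_INTERIOR_UPDATING_NODE):
++ * btree_split() wrote new nodes and staged their keys in the parent @b,
++ * but blocked @b's own write via btree_update_updated_node(); once the
++ * new nodes' writes complete this closure runs, drops this update from
++ * @b's write_blocked list and issues @b's write - which is what finally
++ * makes the new nodes reachable on disk.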
++ */ ++retry: ++ mutex_lock(&c->btree_interior_update_lock); ++ as->nodes_written = true; ++ ++ switch (as->mode) { ++ case BTREE_INTERIOR_NO_UPDATE: ++ BUG(); ++ case BTREE_INTERIOR_UPDATING_NODE: ++ /* The usual case: */ ++ b = READ_ONCE(as->b); ++ ++ if (!six_trylock_read(&b->lock)) { ++ mutex_unlock(&c->btree_interior_update_lock); ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->lock); ++ goto retry; ++ } ++ ++ BUG_ON(!btree_node_dirty(b)); ++ closure_wait(&btree_current_write(b)->wait, cl); ++ ++ list_del(&as->write_blocked_list); ++ ++ /* ++ * for flush_held_btree_writes() waiting on updates to flush or ++ * nodes to be writeable: ++ */ ++ closure_wake_up(&c->btree_interior_update_wait); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * b->write_blocked prevented it from being written, so ++ * write it now if it needs to be written: ++ */ ++ bch2_btree_node_write_cond(c, b, true); ++ six_unlock_read(&b->lock); ++ break; ++ ++ case BTREE_INTERIOR_UPDATING_AS: ++ /* ++ * The btree node we originally updated has been freed and is ++ * being rewritten - so we need to write anything here, we just ++ * need to signal to that btree_update that it's ok to make the ++ * new replacement node visible: ++ */ ++ closure_put(&as->parent_as->cl); ++ ++ /* ++ * and then we have to wait on that btree_update to finish: ++ */ ++ closure_wait(&as->parent_as->wait, cl); ++ mutex_unlock(&c->btree_interior_update_lock); ++ break; ++ ++ case BTREE_INTERIOR_UPDATING_ROOT: ++ /* b is the new btree root: */ ++ b = READ_ONCE(as->b); ++ ++ if (!six_trylock_read(&b->lock)) { ++ mutex_unlock(&c->btree_interior_update_lock); ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->lock); ++ goto retry; ++ } ++ ++ BUG_ON(c->btree_roots[b->btree_id].as != as); ++ c->btree_roots[b->btree_id].as = NULL; ++ ++ bch2_btree_set_root_ondisk(c, b, WRITE); ++ ++ /* ++ * We don't have to wait anything anything here (before ++ * btree_update_nodes_reachable frees the old nodes ++ * ondisk) - we've ensured that the very next journal write will ++ * have the pointer to the new root, and before the allocator ++ * can reuse the old nodes it'll have to do a journal commit: ++ */ ++ six_unlock_read(&b->lock); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * Bit of funny circularity going on here we have to break: ++ * ++ * We have to drop our journal pin before writing the journal ++ * entry that points to the new btree root: else, we could ++ * deadlock if the journal currently happens to be full. 
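++ * (The cycle, roughly: the journal is out of space, so the write that
++ * will contain the new root can't happen until journal reclaim frees
++ * some - but reclaim is waiting on our journal pin, which we wouldn't
++ * otherwise drop until that same write has happened.)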
++ * ++ * This mean we're dropping the journal pin _before_ the new ++ * nodes are technically reachable - but this is safe, because ++ * after the bch2_btree_set_root_ondisk() call above they will ++ * be reachable as of the very next journal write: ++ */ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ ++ as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal); ++ ++ btree_update_wait_on_journal(cl); ++ return; ++ } ++ ++ continue_at(cl, btree_update_nodes_reachable, system_wq); ++} ++ ++/* ++ * We're updating @b with pointers to nodes that haven't finished writing yet: ++ * block @b from being written until @as completes ++ */ ++static void btree_update_updated_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_NODE; ++ as->b = b; ++ list_add(&as->write_blocked_list, &b->write_blocked); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * In general, when you're staging things in a journal that will later ++ * be written elsewhere, and you also want to guarantee ordering: that ++ * is, if you have updates a, b, c, after a crash you should never see c ++ * and not a or b - there's a problem: ++ * ++ * If the final destination of the update(s) (i.e. btree node) can be ++ * written/flushed _before_ the relevant journal entry - oops, that ++ * breaks ordering, since the various leaf nodes can be written in any ++ * order. ++ * ++ * Normally we use bset->journal_seq to deal with this - if during ++ * recovery we find a btree node write that's newer than the newest ++ * journal entry, we just ignore it - we don't need it, anything we're ++ * supposed to have (that we reported as completed via fsync()) will ++ * still be in the journal, and as far as the state of the journal is ++ * concerned that btree node write never happened. ++ * ++ * That breaks when we're rewriting/splitting/merging nodes, since we're ++ * mixing btree node writes that haven't happened yet with previously ++ * written data that has been reported as completed to the journal. ++ * ++ * Thus, before making the new nodes reachable, we have to wait the ++ * newest journal sequence number we have data for to be written (if it ++ * hasn't been yet). ++ */ ++ bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); ++} ++ ++static void interior_update_flush(struct journal *j, ++ struct journal_entry_pin *pin, u64 seq) ++{ ++ struct btree_update *as = ++ container_of(pin, struct btree_update, journal); ++ ++ bch2_journal_flush_seq_async(j, as->journal_seq, NULL); ++} ++ ++static void btree_update_reparent(struct btree_update *as, ++ struct btree_update *child) ++{ ++ struct bch_fs *c = as->c; ++ ++ child->b = NULL; ++ child->mode = BTREE_INTERIOR_UPDATING_AS; ++ child->parent_as = as; ++ closure_get(&as->cl); ++ ++ /* ++ * When we write a new btree root, we have to drop our journal pin ++ * _before_ the new nodes are technically reachable; see ++ * btree_update_nodes_written(). ++ * ++ * This goes for journal pins that are recursively blocked on us - so, ++ * just transfer the journal pin to the new interior update so ++ * btree_update_nodes_written() can drop it. 
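++ * (Sketch: if the child update was pinning journal seq 10 and we were
++ * pinning seq 12, after the add_if_older()/drop pair below we're left
++ * holding the older pin, seq 10 - so journal reclaim can't discard
++ * entries the child's old nodes may still depend on before this update
++ * completes.)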
++ */ ++ bch2_journal_pin_add_if_older(&c->journal, &child->journal, ++ &as->journal, interior_update_flush); ++ bch2_journal_pin_drop(&c->journal, &child->journal); ++ ++ as->journal_seq = max(as->journal_seq, child->journal_seq); ++} ++ ++static void btree_update_updated_root(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ struct btree_root *r = &c->btree_roots[as->btree_id]; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ ++ /* ++ * Old root might not be persistent yet - if so, redirect its ++ * btree_update operation to point to us: ++ */ ++ if (r->as) ++ btree_update_reparent(as, r->as); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_ROOT; ++ as->b = r->b; ++ r->as = as; ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * When we're rewriting nodes and updating interior nodes, there's an ++ * issue with updates that haven't been written in the journal getting ++ * mixed together with older data - see btree_update_updated_node() ++ * for the explanation. ++ * ++ * However, this doesn't affect us when we're writing a new btree root - ++ * because to make that new root reachable we have to write out a new ++ * journal entry, which must necessarily be newer than as->journal_seq. ++ */ ++} ++ ++static void btree_node_will_make_reachable(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); ++ BUG_ON(b->will_make_reachable); ++ ++ as->new_nodes[as->nr_new_nodes++] = b; ++ b->will_make_reachable = 1UL|(unsigned long) as; ++ ++ closure_get(&as->cl); ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_update *as; ++ unsigned long v; ++ unsigned i; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ v = xchg(&b->will_make_reachable, 0); ++ as = (struct btree_update *) (v & ~1UL); ++ ++ if (!as) { ++ mutex_unlock(&c->btree_interior_update_lock); ++ return; ++ } ++ ++ for (i = 0; i < as->nr_new_nodes; i++) ++ if (as->new_nodes[i] == b) ++ goto found; ++ ++ BUG(); ++found: ++ array_remove_item(as->new_nodes, as->nr_new_nodes, i); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (v & 1) ++ closure_put(&as->cl); ++} ++ ++static void btree_interior_update_add_node_reference(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct pending_btree_node_free *d; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ /* Add this node to the list of nodes being freed: */ ++ BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending)); ++ ++ d = &as->pending[as->nr_pending++]; ++ d->index_update_done = false; ++ d->seq = b->data->keys.seq; ++ d->btree_id = b->btree_id; ++ d->level = b->level; ++ bkey_copy(&d->key, &b->key); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++/* ++ * @b is being split/rewritten: it may have pointers to not-yet-written btree ++ * nodes and thus outstanding btree_updates - redirect @b's ++ * btree_updates to point to this btree_update: ++ */ ++void bch2_btree_interior_update_will_free_node(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct closure *cl, *cl_n; ++ struct btree_update *p, *n; ++ struct btree_write *w; ++ struct bset_tree *t; ++ ++ set_btree_node_dying(b); ++ ++ if (btree_node_fake(b)) ++ return; ++ ++ btree_interior_update_add_node_reference(as, b); ++ ++ /* ++ * Does this node have 
data that hasn't been written in the journal? ++ * ++ * If so, we have to wait for the corresponding journal entry to be ++ * written before making the new nodes reachable - we can't just carry ++ * over the bset->journal_seq tracking, since we'll be mixing those keys ++ * in with keys that aren't in the journal anymore: ++ */ ++ for_each_bset(b, t) ++ as->journal_seq = max(as->journal_seq, ++ le64_to_cpu(bset(b, t)->journal_seq)); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ /* ++ * Does this node have any btree_update operations preventing ++ * it from being written? ++ * ++ * If so, redirect them to point to this btree_update: we can ++ * write out our new nodes, but we won't make them visible until those ++ * operations complete ++ */ ++ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { ++ list_del(&p->write_blocked_list); ++ btree_update_reparent(as, p); ++ ++ /* ++ * for flush_held_btree_writes() waiting on updates to flush or ++ * nodes to be writeable: ++ */ ++ closure_wake_up(&c->btree_interior_update_wait); ++ } ++ ++ clear_btree_node_dirty(b); ++ clear_btree_node_need_write(b); ++ w = btree_current_write(b); ++ ++ /* ++ * Does this node have any btree_update operations waiting on this node ++ * to be written? ++ * ++ * If so, wake them up when this btree_update operation is reachable: ++ */ ++ llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list) ++ llist_add(&cl->list, &as->wait.list); ++ ++ /* ++ * Does this node have unwritten data that has a pin on the journal? ++ * ++ * If so, transfer that pin to the btree_update operation - ++ * note that if we're freeing multiple nodes, we only need to keep the ++ * oldest pin of any of the nodes we're freeing. We'll release the pin ++ * when the new nodes are persistent and reachable on disk: ++ */ ++ bch2_journal_pin_add_if_older(&c->journal, &w->journal, ++ &as->journal, interior_update_flush); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ w = btree_prev_write(b); ++ bch2_journal_pin_add_if_older(&c->journal, &w->journal, ++ &as->journal, interior_update_flush); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++void bch2_btree_update_done(struct btree_update *as) ++{ ++ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); ++ ++ bch2_btree_reserve_put(as->c, as->reserve); ++ as->reserve = NULL; ++ ++ continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq); ++} ++ ++struct btree_update * ++bch2_btree_update_start(struct bch_fs *c, enum btree_id id, ++ unsigned nr_nodes, unsigned flags, ++ struct closure *cl) ++{ ++ struct btree_reserve *reserve; ++ struct btree_update *as; ++ ++ reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); ++ if (IS_ERR(reserve)) ++ return ERR_CAST(reserve); ++ ++ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); ++ memset(as, 0, sizeof(*as)); ++ closure_init(&as->cl, NULL); ++ as->c = c; ++ as->mode = BTREE_INTERIOR_NO_UPDATE; ++ as->btree_id = id; ++ as->reserve = reserve; ++ INIT_LIST_HEAD(&as->write_blocked_list); ++ ++ bch2_keylist_init(&as->parent_keys, as->inline_keys); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->list, &c->btree_interior_update_list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return as; ++} ++ ++/* Btree root updates: */ ++ ++static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) ++{ ++ /* Root nodes cannot be reaped */ ++ mutex_lock(&c->btree_cache.lock); ++ 
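++	/*
++	 * i.e. take @b off the btree node cache's lists, so cache reclaim
++	 * never considers the root
++	 */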
list_del_init(&b->list); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ mutex_lock(&c->btree_root_lock); ++ BUG_ON(btree_node_root(c, b) && ++ (b->level < btree_node_root(c, b)->level || ++ !btree_node_dying(btree_node_root(c, b)))); ++ ++ btree_node_root(c, b) = b; ++ mutex_unlock(&c->btree_root_lock); ++ ++ bch2_recalc_btree_reserve(c); ++} ++ ++static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *old = btree_node_root(c, b); ++ struct bch_fs_usage *fs_usage; ++ ++ __bch2_btree_set_root_inmem(c, b); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ percpu_down_read(&c->mark_lock); ++ fs_usage = bch2_fs_usage_scratch_get(c); ++ ++ bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), ++ 0, 0, fs_usage, 0, ++ BCH_BUCKET_MARK_INSERT); ++ if (gc_visited(c, gc_pos_btree_root(b->btree_id))) ++ bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), ++ 0, 0, NULL, 0, ++ BCH_BUCKET_MARK_INSERT| ++ BCH_BUCKET_MARK_GC); ++ ++ if (old && !btree_node_fake(old)) ++ bch2_btree_node_free_index(as, NULL, ++ bkey_i_to_s_c(&old->key), ++ fs_usage); ++ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); ++ ++ bch2_fs_usage_scratch_put(c, fs_usage); ++ percpu_up_read(&c->mark_lock); ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw) ++{ ++ struct btree_root *r = &c->btree_roots[b->btree_id]; ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ BUG_ON(b != r->b); ++ bkey_copy(&r->key, &b->key); ++ r->level = b->level; ++ r->alive = true; ++ if (rw == WRITE) ++ c->btree_roots_dirty = true; ++ ++ mutex_unlock(&c->btree_root_lock); ++} ++ ++/** ++ * bch_btree_set_root - update the root in memory and on disk ++ * ++ * To ensure forward progress, the current task must not be holding any ++ * btree node write locks. However, you must hold an intent lock on the ++ * old root. ++ * ++ * Note: This allocates a journal entry but doesn't add any keys to ++ * it. All the btree roots are part of every journal write, so there ++ * is nothing new to be done. This just guarantees that there is a ++ * journal write. ++ */ ++static void bch2_btree_set_root(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *old; ++ ++ trace_btree_set_root(c, b); ++ BUG_ON(!b->written && ++ !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); ++ ++ old = btree_node_root(c, b); ++ ++ /* ++ * Ensure no one is using the old root while we switch to the ++ * new root: ++ */ ++ bch2_btree_node_lock_write(old, iter); ++ ++ bch2_btree_set_root_inmem(as, b); ++ ++ btree_update_updated_root(as); ++ ++ /* ++ * Unlock old root after new root is visible: ++ * ++ * The new root isn't persistent, but that's ok: we still have ++ * an intent lock on the new root, and any updates that would ++ * depend on the new root would have to update the new root. 
++ */ ++ bch2_btree_node_unlock_write(old, iter); ++} ++ ++/* Interior node updates: */ ++ ++static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct btree_node_iter *node_iter) ++{ ++ struct bch_fs *c = as->c; ++ struct bch_fs_usage *fs_usage; ++ struct bkey_packed *k; ++ struct bkey tmp; ++ ++ BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ percpu_down_read(&c->mark_lock); ++ fs_usage = bch2_fs_usage_scratch_get(c); ++ ++ bch2_mark_key_locked(c, bkey_i_to_s_c(insert), ++ 0, 0, fs_usage, 0, ++ BCH_BUCKET_MARK_INSERT); ++ ++ if (gc_visited(c, gc_pos_btree_node(b))) ++ bch2_mark_key_locked(c, bkey_i_to_s_c(insert), ++ 0, 0, NULL, 0, ++ BCH_BUCKET_MARK_INSERT| ++ BCH_BUCKET_MARK_GC); ++ ++ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && ++ bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) ++ bch2_btree_node_iter_advance(node_iter, b); ++ ++ /* ++ * If we're overwriting, look up pending delete and mark so that gc ++ * marks it on the pending delete list: ++ */ ++ if (k && !bkey_cmp_packed(b, k, &insert->k)) ++ bch2_btree_node_free_index(as, b, ++ bkey_disassemble(b, k, &tmp), ++ fs_usage); ++ ++ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); ++ ++ bch2_fs_usage_scratch_put(c, fs_usage); ++ percpu_up_read(&c->mark_lock); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ bch2_btree_bset_insert_key(iter, b, node_iter, insert); ++ set_btree_node_dirty(b); ++ set_btree_node_need_write(b); ++} ++ ++/* ++ * Move keys from n1 (original replacement node, now lower node) to n2 (higher ++ * node) ++ */ ++static struct btree *__btree_split_node(struct btree_update *as, ++ struct btree *n1, ++ struct btree_iter *iter) ++{ ++ size_t nr_packed = 0, nr_unpacked = 0; ++ struct btree *n2; ++ struct bset *set1, *set2; ++ struct bkey_packed *k, *prev = NULL; ++ ++ n2 = bch2_btree_node_alloc(as, n1->level); ++ ++ n2->data->max_key = n1->data->max_key; ++ n2->data->format = n1->format; ++ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); ++ n2->key.k.p = n1->key.k.p; ++ ++ btree_node_set_format(n2, n2->data->format); ++ ++ set1 = btree_bset_first(n1); ++ set2 = btree_bset_first(n2); ++ ++ /* ++ * Has to be a linear search because we don't have an auxiliary ++ * search tree yet ++ */ ++ k = set1->start; ++ while (1) { ++ if (bkey_next(k) == vstruct_last(set1)) ++ break; ++ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) ++ break; ++ ++ if (bkey_packed(k)) ++ nr_packed++; ++ else ++ nr_unpacked++; ++ ++ prev = k; ++ k = bkey_next(k); ++ } ++ ++ BUG_ON(!prev); ++ ++ n1->key.k.p = bkey_unpack_pos(n1, prev); ++ n1->data->max_key = n1->key.k.p; ++ n2->data->min_key = ++ btree_type_successor(n1->btree_id, n1->key.k.p); ++ ++ set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); ++ set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); ++ ++ set_btree_bset_end(n1, n1->set); ++ set_btree_bset_end(n2, n2->set); ++ ++ n2->nr.live_u64s = le16_to_cpu(set2->u64s); ++ n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); ++ n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; ++ n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; ++ ++ n1->nr.live_u64s = le16_to_cpu(set1->u64s); ++ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); ++ n1->nr.packed_keys = nr_packed; ++ n1->nr.unpacked_keys = nr_unpacked; ++ ++ BUG_ON(!set1->u64s); ++ BUG_ON(!set2->u64s); ++ ++ memcpy_u64s(set2->start, ++ 
vstruct_end(set1), ++ le16_to_cpu(set2->u64s)); ++ ++ btree_node_reset_sib_u64s(n1); ++ btree_node_reset_sib_u64s(n2); ++ ++ bch2_verify_btree_nr_keys(n1); ++ bch2_verify_btree_nr_keys(n2); ++ ++ if (n1->level) { ++ btree_node_interior_verify(n1); ++ btree_node_interior_verify(n2); ++ } ++ ++ return n2; ++} ++ ++/* ++ * For updates to interior nodes, we've got to do the insert before we split ++ * because the stuff we're inserting has to be inserted atomically. Post split, ++ * the keys might have to go in different nodes and the split would no longer be ++ * atomic. ++ * ++ * Worse, if the insert is from btree node coalescing, if we do the insert after ++ * we do the split (and pick the pivot) - the pivot we pick might be between ++ * nodes that were coalesced, and thus in the middle of a child node post ++ * coalescing: ++ */ ++static void btree_split_insert_keys(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, ++ struct keylist *keys) ++{ ++ struct btree_node_iter node_iter; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct bkey_packed *p; ++ struct bset *i; ++ ++ BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); ++ ++ bch2_btree_node_iter_init(&node_iter, b, &k->k.p); ++ ++ while (!bch2_keylist_empty(keys)) { ++ k = bch2_keylist_front(keys); ++ ++ BUG_ON(bch_keylist_u64s(keys) > ++ bch_btree_keys_u64s_remaining(as->c, b)); ++ BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0); ++ BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0); ++ ++ bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); ++ bch2_keylist_pop_front(keys); ++ } ++ ++ /* ++ * We can't tolerate whiteouts here - with whiteouts there can be ++ * duplicate keys, and it would be rather bad if we picked a duplicate ++ * for the pivot: ++ */ ++ i = btree_bset_first(b); ++ p = i->start; ++ while (p != vstruct_last(i)) ++ if (bkey_deleted(p)) { ++ le16_add_cpu(&i->u64s, -p->u64s); ++ set_btree_bset_end(b, b->set); ++ memmove_u64s_down(p, bkey_next(p), ++ (u64 *) vstruct_last(i) - ++ (u64 *) p); ++ } else ++ p = bkey_next(p); ++ ++ BUG_ON(b->nsets != 1 || ++ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); ++ ++ btree_node_interior_verify(b); ++} ++ ++static void btree_split(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys, ++ unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *parent = btree_node_parent(iter, b); ++ struct btree *n1, *n2 = NULL, *n3 = NULL; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(!parent && (b != btree_node_root(c, b))); ++ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n1 = bch2_btree_node_alloc_replacement(as, b); ++ ++ if (keys) ++ btree_split_insert_keys(as, n1, iter, keys); ++ ++ if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) { ++ trace_btree_split(c, b); ++ ++ n2 = __btree_split_node(as, n1, iter); ++ ++ bch2_btree_build_aux_trees(n2); ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n2->lock); ++ six_unlock_write(&n1->lock); ++ ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent); ++ ++ /* ++ * Note that on recursive parent_keys == keys, so we ++ * can't start adding new keys to parent_keys before emptying it ++ * out (which we did with btree_split_insert_keys() above) ++ */ ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ bch2_keylist_add(&as->parent_keys, &n2->key); ++ ++ if (!parent) { ++ /* Depth increases, make a new root */ ++ n3 = __btree_root_alloc(as, b->level + 1); ++ ++ n3->sib_u64s[0] 
= U16_MAX; ++ n3->sib_u64s[1] = U16_MAX; ++ ++ btree_split_insert_keys(as, n3, iter, &as->parent_keys); ++ ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent); ++ } ++ } else { ++ trace_btree_compact(c, b); ++ ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n1->lock); ++ ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ } ++ ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent); ++ ++ /* New nodes all written, now make them visible: */ ++ ++ if (parent) { ++ /* Split a non root node */ ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ } else if (n3) { ++ bch2_btree_set_root(as, n3, iter); ++ } else { ++ /* Root filled up but didn't need to be split */ ++ bch2_btree_set_root(as, n1, iter); ++ } ++ ++ bch2_open_buckets_put(c, &n1->ob); ++ if (n2) ++ bch2_open_buckets_put(c, &n2->ob); ++ if (n3) ++ bch2_open_buckets_put(c, &n3->ob); ++ ++ /* Successful split, update the iterator to point to the new nodes: */ ++ ++ six_lock_increment(&b->lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ if (n3) ++ bch2_btree_iter_node_replace(iter, n3); ++ if (n2) ++ bch2_btree_iter_node_replace(iter, n2); ++ bch2_btree_iter_node_replace(iter, n1); ++ ++ /* ++ * The old node must be freed (in memory) _before_ unlocking the new ++ * nodes - else another thread could re-acquire a read lock on the old ++ * node after another thread has locked and updated the new node, thus ++ * seeing stale data: ++ */ ++ bch2_btree_node_free_inmem(c, b, iter); ++ ++ if (n3) ++ six_unlock_intent(&n3->lock); ++ if (n2) ++ six_unlock_intent(&n2->lock); ++ six_unlock_intent(&n1->lock); ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], ++ start_time); ++} ++ ++static void ++bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys) ++{ ++ struct btree_iter *linked; ++ struct btree_node_iter node_iter; ++ struct bkey_i *insert = bch2_keylist_front(keys); ++ struct bkey_packed *k; ++ ++ /* Don't screw up @iter's position: */ ++ node_iter = iter->l[b->level].iter; ++ ++ /* ++ * btree_split(), btree_gc_coalesce() will insert keys before ++ * the iterator's current position - they know the keys go in ++ * the node the iterator points to: ++ */ ++ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && ++ (bkey_cmp_packed(b, k, &insert->k) >= 0)) ++ ; ++ ++ while (!bch2_keylist_empty(keys)) { ++ insert = bch2_keylist_front(keys); ++ ++ bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); ++ bch2_keylist_pop_front(keys); ++ } ++ ++ btree_update_updated_node(as, b); ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ bch2_btree_node_iter_peek(&linked->l[b->level].iter, b); ++ ++ bch2_btree_iter_verify(iter, b); ++} ++ ++/** ++ * bch_btree_insert_node - insert bkeys into a given btree node ++ * ++ * @iter: btree iterator ++ * @keys: list of keys to insert ++ * @hook: insert callback ++ * @persistent: if not null, @persistent will wait on journal write ++ * ++ * Inserts as many keys as it can into a given btree node, splitting it if full. ++ * If a split occurred, this function will return early. This can only happen ++ * for leaf nodes -- inserts into interior nodes have to be atomic. 
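++ *
++ * Typical caller, roughly (see btree_split() above): the keys for the
++ * newly written replacement node(s) are collected in as->parent_keys
++ * and inserted into @b (the parent) here; if @b itself overflows, we
++ * fall through to btree_split() again via the split: label below.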
++ */ ++void bch2_btree_insert_node(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys, ++ unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); ++ BUG_ON(!b->level); ++ BUG_ON(!as || as->b); ++ bch2_verify_keylist_sorted(keys); ++ ++ if (as->must_rewrite) ++ goto split; ++ ++ bch2_btree_node_lock_for_insert(c, b, iter); ++ ++ if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) { ++ bch2_btree_node_unlock_write(b, iter); ++ goto split; ++ } ++ ++ bch2_btree_insert_keys_interior(as, b, iter, keys); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ bch2_btree_node_unlock_write(b, iter); ++ ++ btree_node_interior_verify(b); ++ ++ /* ++ * when called from the btree_split path the new nodes aren't added to ++ * the btree iterator yet, so the merge path's unlock/wait/relock dance ++ * won't work: ++ */ ++ bch2_foreground_maybe_merge(c, iter, b->level, ++ flags|BTREE_INSERT_NOUNLOCK); ++ return; ++split: ++ btree_split(as, b, iter, keys, flags); ++} ++ ++int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, ++ unsigned flags) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b = iter->l[0].b; ++ struct btree_update *as; ++ struct closure cl; ++ int ret = 0; ++ struct btree_iter *linked; ++ ++ /* ++ * We already have a disk reservation and open buckets pinned; this ++ * allocation must not block: ++ */ ++ trans_for_each_iter(trans, linked) ++ if (linked->btree_id == BTREE_ID_EXTENTS) ++ flags |= BTREE_INSERT_USE_RESERVE; ++ ++ closure_init_stack(&cl); ++ ++ /* Hack, because gc and splitting nodes doesn't mix yet: */ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && ++ !down_read_trylock(&c->gc_lock)) { ++ if (flags & BTREE_INSERT_NOUNLOCK) ++ return -EINTR; ++ ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(trans)) ++ ret = -EINTR; ++ } ++ ++ /* ++ * XXX: figure out how far we might need to split, ++ * instead of locking/reserving all the way to the root: ++ */ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ++ trace_trans_restart_iter_upgrade(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ ++ as = bch2_btree_update_start(c, iter->btree_id, ++ btree_update_reserve_required(c, b), flags, ++ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ if (ret == -EAGAIN) { ++ BUG_ON(flags & BTREE_INSERT_NOUNLOCK); ++ bch2_trans_unlock(trans); ++ ret = -EINTR; ++ } ++ goto out; ++ } ++ ++ btree_split(as, b, iter, NULL, flags); ++ bch2_btree_update_done(as); ++ ++ /* ++ * We haven't successfully inserted yet, so don't downgrade all the way ++ * back to read locks; ++ */ ++ __bch2_btree_iter_downgrade(iter, 1); ++out: ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ return ret; ++} ++ ++void __bch2_foreground_maybe_merge(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, ++ unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_update *as; ++ struct bkey_format_state new_s; ++ struct bkey_format new_f; ++ struct bkey_i delete; ++ struct btree *b, *m, *n, *prev, *next, *parent; ++ struct closure cl; ++ size_t sib_u64s; ++ int ret = 0; ++ ++ closure_init_stack(&cl); ++retry: ++ BUG_ON(!btree_node_locked(iter, level)); ++ ++ b = iter->l[level].b; ++ ++ parent = btree_node_parent(iter, b); ++ if (!parent) ++ goto out; ++ ++ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) ++ goto out; ++ ++ /* XXX: can't be holding read locks */ ++ m = bch2_btree_node_get_sibling(c, iter, b, sib); ++ if (IS_ERR(m)) { ++ ret = PTR_ERR(m); ++ goto err; ++ } ++ ++ /* NULL means no sibling: */ ++ if (!m) { ++ b->sib_u64s[sib] = U16_MAX; ++ goto out; ++ } ++ ++ if (sib == btree_prev_sib) { ++ prev = m; ++ next = b; ++ } else { ++ prev = b; ++ next = m; ++ } ++ ++ bch2_bkey_format_init(&new_s); ++ __bch2_btree_calc_format(&new_s, b); ++ __bch2_btree_calc_format(&new_s, m); ++ new_f = bch2_bkey_format_done(&new_s); ++ ++ sib_u64s = btree_node_u64s_with_format(b, &new_f) + ++ btree_node_u64s_with_format(m, &new_f); ++ ++ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { ++ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ sib_u64s /= 2; ++ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ } ++ ++ sib_u64s = min(sib_u64s, btree_max_u64s(c)); ++ b->sib_u64s[sib] = sib_u64s; ++ ++ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { ++ six_unlock_intent(&m->lock); ++ goto out; ++ } ++ ++ /* We're changing btree topology, doesn't mix with gc: */ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && ++ !down_read_trylock(&c->gc_lock)) ++ goto err_cycle_gc_lock; ++ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ++ ret = -EINTR; ++ goto err_unlock; ++ } ++ ++ as = bch2_btree_update_start(c, iter->btree_id, ++ btree_update_reserve_required(c, parent) + 1, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE, ++ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ goto err_unlock; ++ } ++ ++ trace_btree_merge(c, b); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ bch2_btree_interior_update_will_free_node(as, m); ++ ++ n = bch2_btree_node_alloc(as, b->level); ++ ++ n->data->min_key = prev->data->min_key; ++ n->data->max_key = next->data->max_key; ++ n->data->format = new_f; ++ n->key.k.p = next->key.k.p; ++ ++ btree_node_set_format(n, new_f); ++ ++ bch2_btree_sort_into(c, n, prev); ++ bch2_btree_sort_into(c, n, next); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->lock); ++ ++ bkey_init(&delete.k); ++ delete.k.p = prev->key.k.p; ++ bch2_keylist_add(&as->parent_keys, &delete); ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ ++ bch2_open_buckets_put(c, &n->ob); ++ ++ six_lock_increment(&b->lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ bch2_btree_iter_node_drop(iter, m); ++ ++ bch2_btree_iter_node_replace(iter, n); ++ ++ bch2_btree_iter_verify(iter, n); ++ ++ bch2_btree_node_free_inmem(c, b, iter); ++ bch2_btree_node_free_inmem(c, m, iter); ++ ++ six_unlock_intent(&n->lock); ++ ++ bch2_btree_update_done(as); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++out: ++ bch2_btree_trans_verify_locks(trans); ++ ++ /* ++ * Don't downgrade locks here: we're called after successful insert, ++ * and the caller will downgrade locks after a successful insert ++ * anyways (in case e.g. a split was required first) ++ * ++ * And we're also called when inserting into interior nodes in the ++ * split path, and downgrading to read locks in there is potentially ++ * confusing: ++ */ ++ closure_sync(&cl); ++ return; ++ ++err_cycle_gc_lock: ++ six_unlock_intent(&m->lock); ++ ++ if (flags & BTREE_INSERT_NOUNLOCK) ++ goto out; ++ ++ bch2_trans_unlock(trans); ++ ++ down_read(&c->gc_lock); ++ up_read(&c->gc_lock); ++ ret = -EINTR; ++ goto err; ++ ++err_unlock: ++ six_unlock_intent(&m->lock); ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++err: ++ BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); ++ ++ if ((ret == -EAGAIN || ret == -EINTR) && ++ !(flags & BTREE_INSERT_NOUNLOCK)) { ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto out; ++ ++ goto retry; ++ } ++ ++ goto out; ++} ++ ++static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *b, unsigned flags, ++ struct closure *cl) ++{ ++ struct btree *n, *parent = btree_node_parent(iter, b); ++ struct btree_update *as; ++ ++ as = bch2_btree_update_start(c, iter->btree_id, ++ (parent ++ ? 
btree_update_reserve_required(c, parent) ++ : 0) + 1, ++ flags, cl); ++ if (IS_ERR(as)) { ++ trace_btree_gc_rewrite_node_fail(c, b); ++ return PTR_ERR(as); ++ } ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n = bch2_btree_node_alloc_replacement(as, b); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->lock); ++ ++ trace_btree_gc_rewrite_node(c, b); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ ++ if (parent) { ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ } else { ++ bch2_btree_set_root(as, n, iter); ++ } ++ ++ bch2_open_buckets_put(c, &n->ob); ++ ++ six_lock_increment(&b->lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ bch2_btree_iter_node_replace(iter, n); ++ bch2_btree_node_free_inmem(c, b, iter); ++ six_unlock_intent(&n->lock); ++ ++ bch2_btree_update_done(as); ++ return 0; ++} ++ ++/** ++ * bch_btree_node_rewrite - Rewrite/move a btree node ++ * ++ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. ++ * btree_check_reserve() has to wait) ++ */ ++int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ++ __le64 seq, unsigned flags) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ flags |= BTREE_INSERT_NOFAIL; ++ ++ closure_init_stack(&cl); ++ ++ bch2_btree_iter_upgrade(iter, U8_MAX); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { ++ if (!down_read_trylock(&c->gc_lock)) { ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ } ++ } ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ break; ++ ++ b = bch2_btree_iter_peek_node(iter); ++ if (!b || b->data->keys.seq != seq) ++ break; ++ ++ ret = __btree_node_rewrite(c, iter, b, flags, &cl); ++ if (ret != -EAGAIN && ++ ret != -EINTR) ++ break; ++ ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ } ++ ++ bch2_btree_iter_downgrade(iter); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++ ++ closure_sync(&cl); ++ return ret; ++} ++ ++static void __bch2_btree_node_update_key(struct bch_fs *c, ++ struct btree_update *as, ++ struct btree_iter *iter, ++ struct btree *b, struct btree *new_hash, ++ struct bkey_i_btree_ptr *new_key) ++{ ++ struct btree *parent; ++ int ret; ++ ++ /* ++ * Two corner cases that need to be thought about here: ++ * ++ * @b may not be reachable yet - there might be another interior update ++ * operation waiting on @b to be written, and we're gonna deliver the ++ * write completion to that interior update operation _before_ ++ * persisting the new_key update ++ * ++ * That ends up working without us having to do anything special here: ++ * the reason is, we do kick off (and do the in memory updates) for the ++ * update for @new_key before we return, creating a new interior_update ++ * operation here. ++ * ++ * The new interior update operation here will in effect override the ++ * previous one. The previous one was going to terminate - make @b ++ * reachable - in one of two ways: ++ * - updating the btree root pointer ++ * In that case, ++ * no, this doesn't work. argh. 
++ */ ++ ++ if (b->will_make_reachable) ++ as->must_rewrite = true; ++ ++ btree_interior_update_add_node_reference(as, b); ++ ++ /* ++ * XXX: the rest of the update path treats this like we're actually ++ * inserting a new node and deleting the existing node, so the ++ * reservation needs to include enough space for @b ++ * ++ * that is actually sketch as fuck though and I am surprised the code ++ * seems to work like that, definitely need to go back and rework it ++ * into something saner. ++ * ++ * (I think @b is just getting double counted until the btree update ++ * finishes and "deletes" @b on disk) ++ */ ++ ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, ++ c->opts.btree_node_size * ++ bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)), ++ BCH_DISK_RESERVATION_NOFAIL); ++ BUG_ON(ret); ++ ++ parent = btree_node_parent(iter, b); ++ if (parent) { ++ if (new_hash) { ++ bkey_copy(&new_hash->key, &new_key->k_i); ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, ++ new_hash, b->level, b->btree_id); ++ BUG_ON(ret); ++ } ++ ++ bch2_keylist_add(&as->parent_keys, &new_key->k_i); ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); ++ ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, &new_key->k_i); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ } else { ++ bkey_copy(&b->key, &new_key->k_i); ++ } ++ } else { ++ struct bch_fs_usage *fs_usage; ++ ++ BUG_ON(btree_node_root(c, b) != b); ++ ++ bch2_btree_node_lock_write(b, iter); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ percpu_down_read(&c->mark_lock); ++ fs_usage = bch2_fs_usage_scratch_get(c); ++ ++ bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), ++ 0, 0, fs_usage, 0, ++ BCH_BUCKET_MARK_INSERT); ++ if (gc_visited(c, gc_pos_btree_root(b->btree_id))) ++ bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), ++ 0, 0, NULL, 0, ++ BCH_BUCKET_MARK_INSERT|| ++ BCH_BUCKET_MARK_GC); ++ ++ bch2_btree_node_free_index(as, NULL, ++ bkey_i_to_s_c(&b->key), ++ fs_usage); ++ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); ++ ++ bch2_fs_usage_scratch_put(c, fs_usage); ++ percpu_up_read(&c->mark_lock); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, &new_key->k_i); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ } else { ++ bkey_copy(&b->key, &new_key->k_i); ++ } ++ ++ btree_update_updated_root(as); ++ bch2_btree_node_unlock_write(b, iter); ++ } ++ ++ bch2_btree_update_done(as); ++} ++ ++int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_i_btree_ptr *new_key) ++{ ++ struct btree *parent = btree_node_parent(iter, b); ++ struct btree_update *as = NULL; ++ struct btree *new_hash = NULL; ++ struct closure cl; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) ++ return -EINTR; ++ ++ if (!down_read_trylock(&c->gc_lock)) { ++ bch2_trans_unlock(iter->trans); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(iter->trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */ ++ if (PTR_HASH(&new_key->k_i) 
!= PTR_HASH(&b->key)) { ++ /* bch2_btree_reserve_get will unlock */ ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ if (ret) { ++ bch2_trans_unlock(iter->trans); ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(iter->trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ new_hash = bch2_btree_node_mem_alloc(c); ++ } ++ ++ as = bch2_btree_update_start(c, iter->btree_id, ++ parent ? btree_update_reserve_required(c, parent) : 0, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE, ++ &cl); ++ ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ if (ret == -EAGAIN) ++ ret = -EINTR; ++ ++ if (ret != -EINTR) ++ goto err; ++ ++ bch2_trans_unlock(iter->trans); ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(iter->trans)) ++ goto err; ++ } ++ ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i)); ++ if (ret) ++ goto err_free_update; ++ ++ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); ++ ++ bch2_btree_iter_downgrade(iter); ++err: ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&new_hash->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ six_unlock_write(&new_hash->lock); ++ six_unlock_intent(&new_hash->lock); ++ } ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ return ret; ++err_free_update: ++ bch2_btree_update_free(as); ++ goto err; ++} ++ ++/* Init code: */ ++ ++/* ++ * Only for filesystem bringup, when first reading the btree roots or allocating ++ * btree roots when initializing a new filesystem: ++ */ ++void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) ++{ ++ BUG_ON(btree_node_root(c, b)); ++ ++ __bch2_btree_set_root_inmem(c, b); ++} ++ ++void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ set_btree_node_fake(b); ++ b->level = 0; ++ b->btree_id = id; ++ ++ bkey_btree_ptr_init(&b->key); ++ b->key.k.p = POS_MAX; ++ PTR_HASH(&b->key) = U64_MAX - id; ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ bch2_btree_build_aux_trees(b); ++ ++ b->data->flags = 0; ++ b->data->min_key = POS_MIN; ++ b->data->max_key = POS_MAX; ++ b->data->format = bch2_btree_calc_format(b); ++ btree_node_set_format(b, b->data->format); ++ ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id); ++ BUG_ON(ret); ++ ++ __bch2_btree_set_root_inmem(c, b); ++ ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); ++} ++ ++ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct btree_update *as; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_for_each_entry(as, &c->btree_interior_update_list, list) ++ pr_buf(&out, "%p m %u w %u r %u j %llu\n", ++ as, ++ as->mode, ++ as->nodes_written, ++ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, ++ as->journal.seq); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return out.pos - buf; ++} ++ ++size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) ++{ ++ size_t ret = 0; ++ struct list_head *i; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_for_each(i, &c->btree_interior_update_list) ++ ret++; ++ 
mutex_unlock(&c->btree_interior_update_lock); ++ ++ return ret; ++} +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +new file mode 100644 +index 000000000000..c5a0ab5d7bb8 +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.h +@@ -0,0 +1,341 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++ ++#include "btree_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++ ++struct btree_reserve { ++ struct disk_reservation disk_res; ++ unsigned nr; ++ struct btree *b[BTREE_RESERVE_MAX]; ++}; ++ ++void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, ++ struct bkey_format *); ++ ++/* Btree node freeing/allocation: */ ++ ++/* ++ * Tracks a btree node that has been (or is about to be) freed in memory, but ++ * has _not_ yet been freed on disk (because the write that makes the new ++ * node(s) visible and frees the old hasn't completed yet) ++ */ ++struct pending_btree_node_free { ++ bool index_update_done; ++ ++ __le64 seq; ++ enum btree_id btree_id; ++ unsigned level; ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++}; ++ ++/* ++ * Tracks an in progress split/rewrite of a btree node and the update to the ++ * parent node: ++ * ++ * When we split/rewrite a node, we do all the updates in memory without ++ * waiting for any writes to complete - we allocate the new node(s) and update ++ * the parent node, possibly recursively up to the root. ++ * ++ * The end result is that we have one or more new nodes being written - ++ * possibly several, if there were multiple splits - and then a write (updating ++ * an interior node) which will make all these new nodes visible. ++ * ++ * Additionally, as we split/rewrite nodes we free the old nodes - but the old ++ * nodes can't be freed (their space on disk can't be reclaimed) until the ++ * update to the interior node that makes the new node visible completes - ++ * until then, the old nodes are still reachable on disk. ++ * ++ */ ++struct btree_update { ++ struct closure cl; ++ struct bch_fs *c; ++ ++ struct list_head list; ++ ++ /* What kind of update are we doing? */ ++ enum { ++ BTREE_INTERIOR_NO_UPDATE, ++ BTREE_INTERIOR_UPDATING_NODE, ++ BTREE_INTERIOR_UPDATING_ROOT, ++ BTREE_INTERIOR_UPDATING_AS, ++ } mode; ++ ++ unsigned must_rewrite:1; ++ unsigned nodes_written:1; ++ ++ enum btree_id btree_id; ++ ++ struct btree_reserve *reserve; ++ ++ /* ++ * BTREE_INTERIOR_UPDATING_NODE: ++ * The update that made the new nodes visible was a regular update to an ++ * existing interior node - @b. 
We can't write out the update to @b ++ * until the new nodes we created are finished writing, so we block @b ++ * from writing by putting this btree_interior update on the ++ * @b->write_blocked list with @write_blocked_list: ++ */ ++ struct btree *b; ++ struct list_head write_blocked_list; ++ ++ /* ++ * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now ++ * we're now blocking another btree_update ++ * @parent_as - btree_update that's waiting on our nodes to finish ++ * writing, before it can make new nodes visible on disk ++ * @wait - list of child btree_updates that are waiting on this ++ * btree_update to make all the new nodes visible before they can free ++ * their old btree nodes ++ */ ++ struct btree_update *parent_as; ++ struct closure_waitlist wait; ++ ++ /* ++ * We may be freeing nodes that were dirty, and thus had journal entries ++ * pinned: we need to transfer the oldest of those pins to the ++ * btree_update operation, and release it when the new node(s) ++ * are all persistent and reachable: ++ */ ++ struct journal_entry_pin journal; ++ ++ u64 journal_seq; ++ ++ /* ++ * Nodes being freed: ++ * Protected by c->btree_node_pending_free_lock ++ */ ++ struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES]; ++ unsigned nr_pending; ++ ++ /* New nodes, that will be made reachable by this update: */ ++ struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; ++ unsigned nr_new_nodes; ++ ++ /* Only here to reduce stack usage on recursive splits: */ ++ struct keylist parent_keys; ++ /* ++ * Enough room for btree_split's keys without realloc - btree node ++ * pointers never have crc/compression info, so we only need to acount ++ * for the pointers for three keys ++ */ ++ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; ++}; ++ ++#define for_each_pending_btree_node_free(c, as, p) \ ++ list_for_each_entry(as, &c->btree_interior_update_list, list) \ ++ for (p = as->pending; p < as->pending + as->nr_pending; p++) ++ ++void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, ++ struct btree *, ++ struct bkey_format); ++ ++void bch2_btree_update_done(struct btree_update *); ++struct btree_update * ++bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned, ++ unsigned, struct closure *); ++ ++void bch2_btree_interior_update_will_free_node(struct btree_update *, ++ struct btree *); ++ ++void bch2_btree_insert_node(struct btree_update *, struct btree *, ++ struct btree_iter *, struct keylist *, ++ unsigned); ++int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); ++ ++void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, ++ unsigned, unsigned, enum btree_node_sibling); ++ ++static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct btree *b; ++ ++ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) ++ return; ++ ++ if (!bch2_btree_node_relock(iter, level)) ++ return; ++ ++ b = iter->l[level].b; ++ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) ++ return; ++ ++ __bch2_foreground_maybe_merge(c, iter, level, flags, sib); ++} ++ ++static inline void bch2_foreground_maybe_merge(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, ++ unsigned flags) ++{ ++ 
bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ btree_prev_sib); ++ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ btree_next_sib); ++} ++ ++void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); ++void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); ++ ++static inline unsigned btree_update_reserve_required(struct bch_fs *c, ++ struct btree *b) ++{ ++ unsigned depth = btree_node_root(c, b)->level + 1; ++ ++ /* ++ * Number of nodes we might have to allocate in a worst case btree ++ * split operation - we split all the way up to the root, then allocate ++ * a new root, unless we're already at max depth: ++ */ ++ if (depth < BTREE_MAX_DEPTH) ++ return (depth - b->level) * 2 + 1; ++ else ++ return (depth - b->level) * 2 - 1; ++} ++ ++static inline void btree_node_reset_sib_u64s(struct btree *b) ++{ ++ b->sib_u64s[0] = b->nr.live_u64s; ++ b->sib_u64s[1] = b->nr.live_u64s; ++} ++ ++static inline void *btree_data_end(struct bch_fs *c, struct btree *b) ++{ ++ return (void *) b->data + btree_bytes(c); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, ++ struct btree *b) ++{ ++ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, ++ struct btree *b) ++{ ++ return btree_data_end(c, b); ++} ++ ++static inline void *write_block(struct btree *b) ++{ ++ return (void *) b->data + (b->written << 9); ++} ++ ++static inline bool __btree_addr_written(struct btree *b, void *p) ++{ ++ return p < write_block(b); ++} ++ ++static inline bool bset_written(struct btree *b, struct bset *i) ++{ ++ return __btree_addr_written(b, i); ++} ++ ++static inline bool bkey_written(struct btree *b, struct bkey_packed *k) ++{ ++ return __btree_addr_written(b, k); ++} ++ ++static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, ++ struct btree *b, ++ void *end) ++{ ++ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + ++ b->whiteout_u64s + ++ b->uncompacted_whiteout_u64s; ++ ssize_t total = c->opts.btree_node_size << 6; ++ ++ return total - used; ++} ++ ++static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, ++ struct btree *b) ++{ ++ ssize_t remaining = __bch_btree_u64s_remaining(c, b, ++ btree_bkey_last(b, bset_tree_last(b))); ++ ++ BUG_ON(remaining < 0); ++ ++ if (bset_written(b, btree_bset_last(b))) ++ return 0; ++ ++ return remaining; ++} ++ ++static inline unsigned btree_write_set_buffer(struct btree *b) ++{ ++ /* ++ * Could buffer up larger amounts of keys for btrees with larger keys, ++ * pending benchmarking: ++ */ ++ return 4 << 10; ++} ++ ++static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, ++ struct btree *b) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ struct btree_node_entry *bne = max(write_block(b), ++ (void *) btree_bkey_last(b, bset_tree_last(b))); ++ ssize_t remaining_space = ++ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); ++ ++ if (unlikely(bset_written(b, bset(b, t)))) { ++ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) ++ return bne; ++ } else { ++ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && ++ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) ++ return bne; ++ } ++ ++ return NULL; ++} ++ ++static inline void unreserve_whiteout(struct btree *b, struct bkey_packed *k) ++{ ++ if (bkey_written(b, k)) { ++ EBUG_ON(b->uncompacted_whiteout_u64s < ++ bkeyp_key_u64s(&b->format, k)); ++ b->uncompacted_whiteout_u64s -= ++ 
bkeyp_key_u64s(&b->format, k); ++ } ++} ++ ++static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k) ++{ ++ if (bkey_written(b, k)) { ++ BUG_ON(!k->needs_whiteout); ++ b->uncompacted_whiteout_u64s += ++ bkeyp_key_u64s(&b->format, k); ++ } ++} ++ ++/* ++ * write lock must be held on @b (else the dirty bset that we were going to ++ * insert into could be written out from under us) ++ */ ++static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, ++ struct btree *b, unsigned u64s) ++{ ++ if (unlikely(btree_node_fake(b))) ++ return false; ++ ++ return u64s <= bch_btree_keys_u64s_remaining(c, b); ++} ++ ++ssize_t bch2_btree_updates_print(struct bch_fs *, char *); ++ ++size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +new file mode 100644 +index 000000000000..5f5574ecc176 +--- /dev/null ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -0,0 +1,952 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "replicas.h" ++ ++#include ++#include ++#include ++ ++static inline bool same_leaf_as_prev(struct btree_trans *trans, ++ unsigned idx) ++{ ++ return idx && ++ trans->updates[trans->updates_sorted[idx]].iter->l[0].b == ++ trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b; ++} ++ ++#define trans_for_each_update_sorted(_trans, _i, _iter) \ ++ for (_iter = 0; \ ++ _iter < _trans->nr_updates && \ ++ (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \ ++ _iter++) ++ ++inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ bch2_btree_node_lock_write(b, iter); ++ ++ if (unlikely(btree_node_just_written(b)) && ++ bch2_btree_post_write_cleanup(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ /* ++ * If the last bset has been written, or if it's gotten too big - start ++ * a new bset to insert into: ++ */ ++ if (want_new_bset(c, b)) ++ bch2_btree_init_next(c, b, iter); ++} ++ ++static inline void btree_trans_sort_updates(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *l, *r; ++ unsigned nr = 0, pos; ++ ++ trans_for_each_update(trans, l) { ++ for (pos = 0; pos < nr; pos++) { ++ r = trans->updates + trans->updates_sorted[pos]; ++ ++ if (btree_iter_cmp(l->iter, r->iter) <= 0) ++ break; ++ } ++ ++ memmove(&trans->updates_sorted[pos + 1], ++ &trans->updates_sorted[pos], ++ (nr - pos) * sizeof(trans->updates_sorted[0])); ++ ++ trans->updates_sorted[pos] = l - trans->updates; ++ nr++; ++ } ++} ++ ++/* Inserting into a given leaf node (last stage of insert): */ ++ ++/* Handle overwrites and do insert, for non extents: */ ++bool bch2_btree_bset_insert_key(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bkey_packed *k; ++ unsigned clobber_u64s; ++ ++ EBUG_ON(btree_node_just_written(b)); ++ EBUG_ON(bset_written(b, btree_bset_last(b))); ++ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); ++ EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || ++ 
bkey_cmp(insert->k.p, b->data->max_key) > 0); ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ if (k && !bkey_cmp_packed(b, k, &insert->k)) { ++ BUG_ON(bkey_whiteout(k)); ++ ++ if (!bkey_written(b, k) && ++ bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) && ++ !bkey_whiteout(&insert->k)) { ++ k->type = insert->k.type; ++ memcpy_u64s(bkeyp_val(f, k), &insert->v, ++ bkey_val_u64s(&insert->k)); ++ return true; ++ } ++ ++ insert->k.needs_whiteout = k->needs_whiteout; ++ ++ btree_account_key_drop(b, k); ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ ++ /* ++ * If we're deleting, and the key we're deleting doesn't ++ * need a whiteout (it wasn't overwriting a key that had ++ * been written to disk) - just delete it: ++ */ ++ if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { ++ bch2_bset_delete(b, k, clobber_u64s); ++ bch2_btree_node_iter_fix(iter, b, node_iter, ++ k, clobber_u64s, 0); ++ return true; ++ } ++ ++ goto overwrite; ++ } ++ ++ k->type = KEY_TYPE_deleted; ++ bch2_btree_node_iter_fix(iter, b, node_iter, k, ++ k->u64s, k->u64s); ++ ++ if (bkey_whiteout(&insert->k)) { ++ reserve_whiteout(b, k); ++ return true; ++ } else { ++ k->needs_whiteout = false; ++ } ++ } else { ++ /* ++ * Deleting, but the key to delete wasn't found - nothing to do: ++ */ ++ if (bkey_whiteout(&insert->k)) ++ return false; ++ ++ insert->k.needs_whiteout = false; ++ } ++ ++ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); ++ clobber_u64s = 0; ++overwrite: ++ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); ++ bch2_btree_node_iter_fix(iter, b, node_iter, k, ++ clobber_u64s, k->u64s); ++ return true; ++} ++ ++static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, ++ unsigned i, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct btree_write *w = container_of(pin, struct btree_write, journal); ++ struct btree *b = container_of(w, struct btree, writes[i]); ++ ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ bch2_btree_node_write_cond(c, b, ++ (btree_current_write(b) == w && w->journal.seq == seq)); ++ six_unlock_read(&b->lock); ++} ++ ++static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 0, seq); ++} ++ ++static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 1, seq); ++} ++ ++static inline void __btree_journal_key(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bkey_i *insert) ++{ ++ struct journal *j = &trans->c->journal; ++ u64 seq = trans->journal_res.seq; ++ bool needs_whiteout = insert->k.needs_whiteout; ++ ++ /* ick */ ++ insert->k.needs_whiteout = false; ++ bch2_journal_add_keys(j, &trans->journal_res, ++ btree_id, insert); ++ insert->k.needs_whiteout = needs_whiteout; ++ ++ bch2_journal_set_has_inode(j, &trans->journal_res, ++ insert->k.p.inode); ++ ++ if (trans->journal_seq) ++ *trans->journal_seq = seq; ++} ++ ++void bch2_btree_journal_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct btree *b = iter->l[0].b; ++ struct btree_write *w = btree_current_write(b); ++ ++ EBUG_ON(iter->level || b->level); ++ EBUG_ON(trans->journal_res.ref != ++ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ __btree_journal_key(trans, iter->btree_id, 
insert); ++ btree_bset_last(b)->journal_seq = ++ cpu_to_le64(trans->journal_res.seq); ++ } ++ ++ if (unlikely(!journal_pin_active(&w->journal))) { ++ u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) ++ ? trans->journal_res.seq ++ : j->replay_journal_seq; ++ ++ bch2_journal_pin_add(j, seq, &w->journal, ++ btree_node_write_idx(b) == 0 ++ ? btree_node_flush0 ++ : btree_node_flush1); ++ } ++ ++ if (unlikely(!btree_node_dirty(b))) ++ set_btree_node_dirty(b); ++} ++ ++static void bch2_insert_fixup_key(struct btree_trans *trans, ++ struct btree_insert_entry *insert) ++{ ++ struct btree_iter *iter = insert->iter; ++ struct btree_iter_level *l = &iter->l[0]; ++ ++ EBUG_ON(iter->level); ++ EBUG_ON(insert->k->k.u64s > ++ bch_btree_keys_u64s_remaining(trans->c, l->b)); ++ ++ if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, ++ insert->k))) ++ bch2_btree_journal_key(trans, iter, insert->k); ++} ++ ++/** ++ * btree_insert_key - insert a key one key into a leaf node ++ */ ++static void btree_insert_key_leaf(struct btree_trans *trans, ++ struct btree_insert_entry *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter = insert->iter; ++ struct btree *b = iter->l[0].b; ++ struct bset_tree *t = bset_tree_last(b); ++ int old_u64s = bset_u64s(t); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ if (!btree_node_is_extents(b)) ++ bch2_insert_fixup_key(trans, insert); ++ else ++ bch2_insert_fixup_extent(trans, insert); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) bset_u64s(t) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ trace_btree_insert_key(c, b, insert->k); ++} ++ ++/* Normal update interface: */ ++ ++static inline void btree_insert_entry_checks(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ struct bch_fs *c = trans->c; ++ ++ BUG_ON(i->iter->level); ++ BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); ++ EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && ++ bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); ++ EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && ++ !(trans->flags & BTREE_INSERT_ATOMIC)); ++ ++ BUG_ON(debug_check_bkeys(c) && ++ !bkey_deleted(&i->k->k) && ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id)); ++} ++ ++static noinline int ++bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, ++ &trans->journal_preres, u64s, 0); ++ if (ret) ++ return ret; ++ ++ if (!bch2_trans_relock(trans)) { ++ trace_trans_restart_journal_preres_get(trans->ip); ++ return -EINTR; ++ } ++ ++ return 0; ++} ++ ++static inline int bch2_trans_journal_res_get(struct btree_trans *trans, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) ++ flags |= JOURNAL_RES_GET_RESERVED; ++ ++ ret = bch2_journal_res_get(&c->journal, &trans->journal_res, ++ trans->journal_u64s, flags); ++ ++ return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; ++} ++ ++static enum btree_insert_ret ++btree_key_can_insert(struct btree_trans *trans, ++ struct btree_insert_entry *insert, ++ unsigned *u64s) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = insert->iter->l[0].b; ++ static enum btree_insert_ret ret; ++ ++ if (unlikely(btree_node_fake(b))) ++ return BTREE_INSERT_BTREE_NODE_FULL; ++ ++ ret = !btree_node_is_extents(b) ++ ? BTREE_INSERT_OK ++ : bch2_extent_can_insert(trans, insert, u64s); ++ if (ret) ++ return ret; ++ ++ if (*u64s > bch_btree_keys_u64s_remaining(c, b)) ++ return BTREE_INSERT_BTREE_NODE_FULL; ++ ++ return BTREE_INSERT_OK; ++} ++ ++static inline void do_btree_insert_one(struct btree_trans *trans, ++ struct btree_insert_entry *insert) ++{ ++ btree_insert_key_leaf(trans, insert); ++} ++ ++static inline bool update_has_trans_triggers(struct btree_insert_entry *i) ++{ ++ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id); ++} ++ ++static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i) ++{ ++ return (BTREE_NODE_TYPE_HAS_TRIGGERS & ++ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & ++ (1U << i->iter->btree_id); ++} ++ ++static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) ++{ ++ __bch2_btree_iter_unlock(iter); ++} ++ ++static noinline void bch2_trans_mark_gc(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ++ ? BCH_BUCKET_MARK_BUCKET_INVALIDATE ++ : 0; ++ ++ if (unlikely(trans->flags & BTREE_INSERT_NOMARK)) ++ return; ++ ++ trans_for_each_update(trans, i) ++ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) ++ bch2_mark_update(trans, i, NULL, ++ mark_flags|BCH_BUCKET_MARK_GC); ++} ++ ++static inline int ++bch2_trans_commit_write_locked(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_fs_usage *fs_usage = NULL; ++ struct btree_insert_entry *i; ++ unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ++ ? 
BCH_BUCKET_MARK_BUCKET_INVALIDATE ++ : 0; ++ unsigned iter, u64s = 0; ++ bool marking = false; ++ int ret; ++ ++ if (race_fault()) { ++ trace_trans_restart_fault_inject(trans->ip); ++ return -EINTR; ++ } ++ ++ /* ++ * Check if the insert will fit in the leaf node with the write lock ++ * held, otherwise another thread could write the node changing the ++ * amount of space available: ++ */ ++ ++ prefetch(&trans->c->journal.flags); ++ ++ trans_for_each_update_sorted(trans, i, iter) { ++ /* Multiple inserts might go to same leaf: */ ++ if (!same_leaf_as_prev(trans, iter)) ++ u64s = 0; ++ ++ u64s += i->k->k.u64s; ++ ret = btree_key_can_insert(trans, i, &u64s); ++ if (ret) { ++ *stopped_at = i; ++ return ret; ++ } ++ ++ if (btree_node_type_needs_gc(i->iter->btree_id)) ++ marking = true; ++ } ++ ++ if (marking) { ++ percpu_down_read(&c->mark_lock); ++ fs_usage = bch2_fs_usage_scratch_get(c); ++ } ++ ++ /* ++ * Don't get journal reservation until after we know insert will ++ * succeed: ++ */ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ ret = bch2_trans_journal_res_get(trans, ++ JOURNAL_RES_GET_NONBLOCK); ++ if (ret) ++ goto err; ++ } ++ ++ /* ++ * Not allowed to fail after we've gotten our journal reservation - we ++ * have to use it: ++ */ ++ ++ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { ++ if (journal_seq_verify(c)) ++ trans_for_each_update(trans, i) ++ i->k->k.version.lo = trans->journal_res.seq; ++ else if (inject_invalid_keys(c)) ++ trans_for_each_update(trans, i) ++ i->k->k.version = MAX_VERSION; ++ } ++ ++ /* Must be called under mark_lock: */ ++ if (marking && trans->fs_usage_deltas && ++ bch2_replicas_delta_list_apply(c, fs_usage, ++ trans->fs_usage_deltas)) { ++ ret = BTREE_INSERT_NEED_MARK_REPLICAS; ++ goto err; ++ } ++ ++ trans_for_each_update(trans, i) ++ if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && ++ update_has_nontrans_triggers(i)) ++ bch2_mark_update(trans, i, fs_usage, mark_flags); ++ ++ if (marking) ++ bch2_trans_fs_usage_apply(trans, fs_usage); ++ ++ if (unlikely(c->gc_pos.phase)) ++ bch2_trans_mark_gc(trans); ++ ++ trans_for_each_update(trans, i) ++ do_btree_insert_one(trans, i); ++err: ++ if (marking) { ++ bch2_fs_usage_scratch_put(c, fs_usage); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ return ret; ++} ++ ++/* ++ * Get journal reservation, take write locks, and attempt to do btree update(s): ++ */ ++static inline int do_bch2_trans_commit(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at) ++{ ++ struct btree_insert_entry *i; ++ struct btree_iter *iter; ++ unsigned idx, u64s, journal_preres_u64s = 0; ++ int ret; ++ ++ /* ++ * note: running triggers will append more updates to the list of ++ * updates as we're walking it: ++ */ ++ trans_for_each_update(trans, i) { ++ /* we know trans->nounlock won't be set here: */ ++ if (unlikely(!(i->iter->locks_want < 1 ++ ? 
__bch2_btree_iter_upgrade(i->iter, 1) ++ : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { ++ trace_trans_restart_upgrade(trans->ip); ++ return -EINTR; ++ } ++ ++ if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && ++ update_has_trans_triggers(i)) { ++ ret = bch2_trans_mark_update(trans, i->iter, i->k); ++ if (unlikely(ret)) { ++ if (ret == -EINTR) ++ trace_trans_restart_mark(trans->ip); ++ return ret; ++ } ++ } ++ ++ u64s = jset_u64s(i->k->k.u64s); ++ if (0) ++ journal_preres_u64s += u64s; ++ trans->journal_u64s += u64s; ++ } ++ ++ ret = bch2_journal_preres_get(&trans->c->journal, ++ &trans->journal_preres, journal_preres_u64s, ++ JOURNAL_RES_GET_NONBLOCK); ++ if (unlikely(ret == -EAGAIN)) ++ ret = bch2_trans_journal_preres_get_cold(trans, ++ journal_preres_u64s); ++ if (unlikely(ret)) ++ return ret; ++ ++ /* ++ * Can't be holding any read locks when we go to take write locks: ++ * ++ * note - this must be done after bch2_trans_journal_preres_get_cold() ++ * or anything else that might call bch2_trans_relock(), since that ++ * would just retake the read locks: ++ */ ++ trans_for_each_iter_all(trans, iter) { ++ if (iter->nodes_locked != iter->nodes_intent_locked) { ++ EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); ++ EBUG_ON(trans->iters_live & (1ULL << iter->idx)); ++ bch2_btree_iter_unlock_noinline(iter); ++ } ++ } ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ trans_for_each_update(trans, i) ++ btree_insert_entry_checks(trans, i); ++ bch2_btree_trans_verify_locks(trans); ++ ++ /* ++ * No more updates can be added - sort updates so we can take write ++ * locks in the correct order: ++ */ ++ btree_trans_sort_updates(trans); ++ ++ trans_for_each_update_sorted(trans, i, idx) ++ if (!same_leaf_as_prev(trans, idx)) ++ bch2_btree_node_lock_for_insert(trans->c, ++ i->iter->l[0].b, i->iter); ++ ++ ret = bch2_trans_commit_write_locked(trans, stopped_at); ++ ++ trans_for_each_update_sorted(trans, i, idx) ++ if (!same_leaf_as_prev(trans, idx)) ++ bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, ++ i->iter); ++ ++ /* ++ * Drop journal reservation after dropping write locks, since dropping ++ * the journal reservation may kick off a journal write: ++ */ ++ bch2_journal_res_put(&trans->c->journal, &trans->journal_res); ++ ++ if (unlikely(ret)) ++ return ret; ++ ++ if (trans->flags & BTREE_INSERT_NOUNLOCK) ++ trans->nounlock = true; ++ ++ trans_for_each_update_sorted(trans, i, idx) ++ if (!same_leaf_as_prev(trans, idx)) ++ bch2_foreground_maybe_merge(trans->c, i->iter, ++ 0, trans->flags); ++ ++ trans->nounlock = false; ++ ++ trans_for_each_update(trans, i) ++ bch2_btree_iter_downgrade(i->iter); ++ ++ return 0; ++} ++ ++static noinline ++int bch2_trans_commit_error(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ int ret) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned flags = trans->flags; ++ ++ /* ++ * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree ++ * update; if we haven't done anything yet it doesn't apply ++ */ ++ flags &= ~BTREE_INSERT_NOUNLOCK; ++ ++ switch (ret) { ++ case BTREE_INSERT_BTREE_NODE_FULL: ++ ret = bch2_btree_split_leaf(c, i->iter, flags); ++ ++ /* ++ * if the split succeeded without dropping locks the insert will ++ * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the ++ * caller peeked() and is overwriting won't have changed) ++ */ ++#if 0 ++ /* ++ * XXX: ++ * split -> btree node merging (of parent node) might still drop ++ * locks when we're not passing it BTREE_INSERT_NOUNLOCK ++ * ++ * we don't want to pass 
BTREE_INSERT_NOUNLOCK to split as that ++ * will inhibit merging - but we don't have a reliable way yet ++ * (do we?) of checking if we dropped locks in this path ++ */ ++ if (!ret) ++ goto retry; ++#endif ++ ++ /* ++ * don't care if we got ENOSPC because we told split it ++ * couldn't block: ++ */ ++ if (!ret || ++ ret == -EINTR || ++ (flags & BTREE_INSERT_NOUNLOCK)) { ++ trace_trans_restart_btree_node_split(trans->ip); ++ ret = -EINTR; ++ } ++ break; ++ case BTREE_INSERT_ENOSPC: ++ ret = -ENOSPC; ++ break; ++ case BTREE_INSERT_NEED_MARK_REPLICAS: ++ bch2_trans_unlock(trans); ++ ++ trans_for_each_update(trans, i) { ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); ++ if (ret) ++ return ret; ++ } ++ ++ if (bch2_trans_relock(trans)) ++ return 0; ++ ++ trace_trans_restart_mark_replicas(trans->ip); ++ ret = -EINTR; ++ break; ++ case BTREE_INSERT_NEED_JOURNAL_RES: ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); ++ if (ret) ++ return ret; ++ ++ if (bch2_trans_relock(trans)) ++ return 0; ++ ++ trace_trans_restart_journal_res_get(trans->ip); ++ ret = -EINTR; ++ break; ++ default: ++ BUG_ON(ret >= 0); ++ break; ++ } ++ ++ if (ret == -EINTR) { ++ int ret2 = bch2_btree_iter_traverse_all(trans); ++ ++ if (ret2) { ++ trace_trans_restart_traverse(trans->ip); ++ return ret2; ++ } ++ ++ /* ++ * BTREE_ITER_ATOMIC means we have to return -EINTR if we ++ * dropped locks: ++ */ ++ if (!(flags & BTREE_INSERT_ATOMIC)) ++ return 0; ++ ++ trace_trans_restart_atomic(trans->ip); ++ } ++ ++ return ret; ++} ++ ++static noinline int ++bch2_trans_commit_get_rw_cold(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) ++ return -EROFS; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_fs_read_write_early(c); ++ if (ret) ++ return ret; ++ ++ percpu_ref_get(&c->writes); ++ return 0; ++} ++ ++int __bch2_trans_commit(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL; ++ struct btree_iter *iter; ++ unsigned orig_nr_updates = trans->nr_updates; ++ unsigned orig_mem_top = trans->mem_top; ++ int ret = 0; ++ ++ if (!trans->nr_updates) ++ goto out_noupdates; ++ ++ /* for the sake of sanity: */ ++ EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); ++ ++ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&trans->c->gc_lock); ++ ++ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); ++ ++ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && ++ unlikely(!percpu_ref_tryget(&trans->c->writes))) { ++ ret = bch2_trans_commit_get_rw_cold(trans); ++ if (ret) ++ return ret; ++ } ++retry: ++ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ++ trans->journal_u64s = 0; ++ ++ ret = do_bch2_trans_commit(trans, &i); ++ ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset(&trans->fs_usage_deltas->memset_start, 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } ++ ++ /* make sure we didn't drop or screw up locks: */ ++ bch2_btree_trans_verify_locks(trans); ++ ++ if (ret) ++ goto err; ++out: ++ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) ++ percpu_ref_put(&trans->c->writes); ++out_noupdates: ++ EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); ++ ++ trans_for_each_iter_all(trans, iter) ++ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ if (!ret) 
{ ++ bch2_trans_unlink_iters(trans); ++ trans->iters_touched = 0; ++ } ++ trans->nr_updates = 0; ++ trans->mem_top = 0; ++ ++ return ret; ++err: ++ ret = bch2_trans_commit_error(trans, i, ret); ++ ++ /* can't loop if it was passed in and we changed it: */ ++ if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) ++ ret = -EINTR; ++ if (ret) ++ goto out; ++ ++ /* free updates and memory used by triggers, they'll be reexecuted: */ ++ trans->nr_updates = orig_nr_updates; ++ trans->mem_top = orig_mem_top; ++ goto retry; ++} ++ ++/** ++ * bch2_btree_insert - insert keys into the extent btree ++ * @c: pointer to struct bch_fs ++ * @id: btree to insert into ++ * @insert_keys: list of keys to insert ++ * @hook: insert callback ++ */ ++int bch2_btree_insert(struct bch_fs *c, enum btree_id id, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, int flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ ++ bch2_trans_update(&trans, iter, k); ++ ++ ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); ++ if (ret == -EINTR) ++ goto retry; ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++int bch2_btree_delete_at_range(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos end, ++ u64 *journal_seq) ++{ ++ struct bkey_s_c k; ++ int ret = 0; ++retry: ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(iter->pos, end) < 0) { ++ struct bkey_i delete; ++ ++ bkey_init(&delete.k); ++ ++ /* ++ * For extents, iter.pos won't necessarily be the same as ++ * bkey_start_pos(k.k) (for non extents they always will be the ++ * same). It's important that we delete starting from iter.pos ++ * because the range we want to delete could start in the middle ++ * of k. ++ * ++ * (bch2_btree_iter_peek() does guarantee that iter.pos >= ++ * bkey_start_pos(k.k)). 
++ */ ++ delete.k.p = iter->pos; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) { ++ unsigned max_sectors = ++ KEY_SIZE_MAX & (~0 << trans->c->block_bits); ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete.k); ++ ++ ret = bch2_extent_trim_atomic(&delete, iter); ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_update(trans, iter, &delete); ++ ret = bch2_trans_commit(trans, NULL, journal_seq, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL); ++ if (ret) ++ break; ++ ++ bch2_trans_cond_resched(trans); ++ } ++ ++ if (ret == -EINTR) { ++ ret = 0; ++ goto retry; ++ } ++ ++ return ret; ++ ++} ++ ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned flags) ++{ ++ struct bkey_i k; ++ ++ bkey_init(&k.k); ++ k.k.p = iter->pos; ++ ++ bch2_trans_update(trans, iter, &k); ++ return bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE|flags); ++} ++ ++/* ++ * bch_btree_delete_range - delete everything within a given range ++ * ++ * Range is a half open interval - [start, end) ++ */ ++int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, ++ struct bpos start, struct bpos end, ++ u64 *journal_seq) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret = 0; ++ ++ /* ++ * XXX: whether we need mem/more iters depends on whether this btree id ++ * has triggers ++ */ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); ++ ++ iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ BUG_ON(ret == -EINTR); ++ return ret; ++} +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +new file mode 100644 +index 000000000000..c418398266a3 +--- /dev/null ++++ b/fs/bcachefs/buckets.c +@@ -0,0 +1,2095 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. ++ * ++ * Bucket states: ++ * - free bucket: mark == 0 ++ * The bucket contains no data and will not be read ++ * ++ * - allocator bucket: owned_by_allocator == 1 ++ * The bucket is on a free list, or it is an open bucket ++ * ++ * - cached bucket: owned_by_allocator == 0 && ++ * dirty_sectors == 0 && ++ * cached_sectors > 0 ++ * The bucket contains data but may be safely discarded as there are ++ * enough replicas of the data on other cache devices, or it has been ++ * written back to the backing device ++ * ++ * - dirty bucket: owned_by_allocator == 0 && ++ * dirty_sectors > 0 ++ * The bucket contains data that we must not discard (either only copy, ++ * or one of the 'main copies' for data requiring multiple replicas) ++ * ++ * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 ++ * This is a btree node, journal or gen/prio bucket ++ * ++ * Lifecycle: ++ * ++ * bucket invalidated => bucket on freelist => open bucket => ++ * [dirty bucket =>] cached bucket => bucket invalidated => ... ++ * ++ * Note that cache promotion can skip the dirty bucket step, as data ++ * is copied from a deeper tier to a shallower tier, onto a cached ++ * bucket. ++ * Note also that a cached bucket can spontaneously become dirty -- ++ * see below. ++ * ++ * Only a traversal of the key space can determine whether a bucket is ++ * truly dirty or cached. 
++ * ++ * Transitions: ++ * ++ * - free => allocator: bucket was invalidated ++ * - cached => allocator: bucket was invalidated ++ * ++ * - allocator => dirty: open bucket was filled up ++ * - allocator => cached: open bucket was filled up ++ * - allocator => metadata: metadata was allocated ++ * ++ * - dirty => cached: dirty sectors were copied to a deeper tier ++ * - dirty => free: dirty sectors were overwritten or moved (copy gc) ++ * - cached => free: cached sectors were overwritten ++ * ++ * - metadata => free: metadata was freed ++ * ++ * Oddities: ++ * - cached => dirty: a device was removed so formerly replicated data ++ * is no longer sufficiently replicated ++ * - free => cached: cannot happen ++ * - free => dirty: cannot happen ++ * - free => metadata: cannot happen ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "ec.h" ++#include "error.h" ++#include "movinggc.h" ++#include "replicas.h" ++ ++#include ++#include ++ ++/* ++ * Clear journal_seq_valid for buckets for which it's not needed, to prevent ++ * wraparound: ++ */ ++void bch2_bucket_seq_cleanup(struct bch_fs *c) ++{ ++ u64 journal_seq = atomic64_read(&c->journal.seq); ++ u16 last_seq_ondisk = c->journal.last_seq_ondisk; ++ struct bch_dev *ca; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ struct bucket_mark m; ++ unsigned i; ++ ++ if (journal_seq - c->last_bucket_seq_cleanup < ++ (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) ++ return; ++ ++ c->last_bucket_seq_cleanup = journal_seq; ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) { ++ bucket_cmpxchg(g, m, ({ ++ if (!m.journal_seq_valid || ++ bucket_needs_journal_commit(m, last_seq_ondisk)) ++ break; ++ ++ m.journal_seq_valid = 0; ++ })); ++ } ++ up_read(&ca->bucket_lock); ++ } ++} ++ ++void bch2_fs_usage_initialize(struct bch_fs *c) ++{ ++ struct bch_fs_usage *usage; ++ unsigned i; ++ ++ percpu_down_write(&c->mark_lock); ++ usage = c->usage_base; ++ ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ usage->reserved += usage->persistent_reserved[i]; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ switch (e->data_type) { ++ case BCH_DATA_BTREE: ++ usage->btree += usage->replicas[i]; ++ break; ++ case BCH_DATA_USER: ++ usage->data += usage->replicas[i]; ++ break; ++ case BCH_DATA_CACHED: ++ usage->cached += usage->replicas[i]; ++ break; ++ } ++ } ++ ++ percpu_up_write(&c->mark_lock); ++} ++ ++void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) ++{ ++ if (fs_usage == c->usage_scratch) ++ mutex_unlock(&c->usage_scratch_lock); ++ else ++ kfree(fs_usage); ++} ++ ++struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) ++{ ++ struct bch_fs_usage *ret; ++ unsigned bytes = fs_usage_u64s(c) * sizeof(u64); ++ ++ ret = kzalloc(bytes, GFP_NOWAIT); ++ if (ret) ++ return ret; ++ ++ if (mutex_trylock(&c->usage_scratch_lock)) ++ goto out_pool; ++ ++ ret = kzalloc(bytes, GFP_NOFS); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->usage_scratch_lock); ++out_pool: ++ ret = c->usage_scratch; ++ memset(ret, 0, bytes); ++ return ret; ++} ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_dev_usage ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ 
acc_u64s_percpu((u64 *) &ret, ++ (u64 __percpu *) ca->usage[0], ++ sizeof(ret) / sizeof(u64)); ++ ++ return ret; ++} ++ ++static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, ++ unsigned journal_seq, ++ bool gc) ++{ ++ return this_cpu_ptr(gc ++ ? c->usage_gc ++ : c->usage[journal_seq & 1]); ++} ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) ++{ ++ ssize_t offset = v - (u64 *) c->usage_base; ++ unsigned seq; ++ u64 ret; ++ ++ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ ret = *v + ++ percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + ++ percpu_u64_get((u64 __percpu *) c->usage[1] + offset); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) ++{ ++ struct bch_fs_usage *ret; ++ unsigned seq, v, u64s = fs_usage_u64s(c); ++retry: ++ ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); ++ if (unlikely(!ret)) ++ return NULL; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ v = fs_usage_u64s(c); ++ if (unlikely(u64s != v)) { ++ u64s = v; ++ percpu_up_read(&c->mark_lock); ++ kfree(ret); ++ goto retry; ++ } ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ memcpy(ret, c->usage_base, u64s * sizeof(u64)); ++ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); ++ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) ++{ ++ unsigned u64s = fs_usage_u64s(c); ++ ++ BUG_ON(idx >= 2); ++ ++ write_seqcount_begin(&c->usage_lock); ++ ++ acc_u64s_percpu((u64 *) c->usage_base, ++ (u64 __percpu *) c->usage[idx], u64s); ++ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); ++ ++ write_seqcount_end(&c->usage_lock); ++} ++ ++void bch2_fs_usage_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_fs_usage *fs_usage) ++{ ++ unsigned i; ++ ++ pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); ++ ++ pr_buf(out, "hidden:\t\t\t\t%llu\n", ++ fs_usage->hidden); ++ pr_buf(out, "data:\t\t\t\t%llu\n", ++ fs_usage->data); ++ pr_buf(out, "cached:\t\t\t\t%llu\n", ++ fs_usage->cached); ++ pr_buf(out, "reserved:\t\t\t%llu\n", ++ fs_usage->reserved); ++ pr_buf(out, "nr_inodes:\t\t\t%llu\n", ++ fs_usage->nr_inodes); ++ pr_buf(out, "online reserved:\t\t%llu\n", ++ fs_usage->online_reserved); ++ ++ for (i = 0; ++ i < ARRAY_SIZE(fs_usage->persistent_reserved); ++ i++) { ++ pr_buf(out, "%u replicas:\n", i + 1); ++ pr_buf(out, "\treserved:\t\t%llu\n", ++ fs_usage->persistent_reserved[i]); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ pr_buf(out, "\t"); ++ bch2_replicas_entry_to_text(out, e); ++ pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); ++ } ++} ++ ++#define RESERVE_FACTOR 6 ++ ++static u64 reserve_factor(u64 r) ++{ ++ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); ++} ++ ++static u64 avail_factor(u64 r) ++{ ++ return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); ++} ++ ++u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) ++{ ++ return min(fs_usage->hidden + ++ fs_usage->btree + ++ fs_usage->data + ++ reserve_factor(fs_usage->reserved + ++ fs_usage->online_reserved), ++ c->capacity); ++} ++ ++static struct bch_fs_usage_short ++__bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct 
bch_fs_usage_short ret; ++ u64 data, reserved; ++ ++ ret.capacity = c->capacity - ++ bch2_fs_usage_read_one(c, &c->usage_base->hidden); ++ ++ data = bch2_fs_usage_read_one(c, &c->usage_base->data) + ++ bch2_fs_usage_read_one(c, &c->usage_base->btree); ++ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + ++ bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); ++ ++ ret.used = min(ret.capacity, data + reserve_factor(reserved)); ++ ret.free = ret.capacity - ret.used; ++ ++ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); ++ ++ return ret; ++} ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct bch_fs_usage_short ret; ++ ++ percpu_down_read(&c->mark_lock); ++ ret = __bch2_fs_usage_read_short(c); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++static inline int is_unavailable_bucket(struct bucket_mark m) ++{ ++ return !is_available_bucket(m); ++} ++ ++static inline int is_fragmented_bucket(struct bucket_mark m, ++ struct bch_dev *ca) ++{ ++ if (!m.owned_by_allocator && ++ m.data_type == BCH_DATA_USER && ++ bucket_sectors_used(m)) ++ return max_t(int, 0, (int) ca->mi.bucket_size - ++ bucket_sectors_used(m)); ++ return 0; ++} ++ ++static inline enum bch_data_type bucket_type(struct bucket_mark m) ++{ ++ return m.cached_sectors && !m.dirty_sectors ++ ? BCH_DATA_CACHED ++ : m.data_type; ++} ++ ++static bool bucket_became_unavailable(struct bucket_mark old, ++ struct bucket_mark new) ++{ ++ return is_available_bucket(old) && ++ !is_available_bucket(new); ++} ++ ++int bch2_fs_usage_apply(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct disk_reservation *disk_res, ++ unsigned journal_seq) ++{ ++ s64 added = fs_usage->data + fs_usage->reserved; ++ s64 should_not_have_added; ++ int ret = 0; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ /* ++ * Not allowed to reduce sectors_available except by getting a ++ * reservation: ++ */ ++ should_not_have_added = added - (s64) (disk_res ? 
disk_res->sectors : 0); ++ if (WARN_ONCE(should_not_have_added > 0, ++ "disk usage increased by %lli without a reservation", ++ should_not_have_added)) { ++ atomic64_sub(should_not_have_added, &c->sectors_available); ++ added -= should_not_have_added; ++ ret = -1; ++ } ++ ++ if (added > 0) { ++ disk_res->sectors -= added; ++ fs_usage->online_reserved -= added; ++ } ++ ++ preempt_disable(); ++ acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), ++ (u64 *) fs_usage, fs_usage_u64s(c)); ++ preempt_enable(); ++ ++ return ret; ++} ++ ++static inline void account_bucket(struct bch_fs_usage *fs_usage, ++ struct bch_dev_usage *dev_usage, ++ enum bch_data_type type, ++ int nr, s64 size) ++{ ++ if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) ++ fs_usage->hidden += size; ++ ++ dev_usage->buckets[type] += nr; ++} ++ ++static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ++ struct bch_fs_usage *fs_usage, ++ struct bucket_mark old, struct bucket_mark new, ++ bool gc) ++{ ++ struct bch_dev_usage *dev_usage; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ preempt_disable(); ++ dev_usage = this_cpu_ptr(ca->usage[gc]); ++ ++ if (bucket_type(old)) ++ account_bucket(fs_usage, dev_usage, bucket_type(old), ++ -1, -ca->mi.bucket_size); ++ ++ if (bucket_type(new)) ++ account_bucket(fs_usage, dev_usage, bucket_type(new), ++ 1, ca->mi.bucket_size); ++ ++ dev_usage->buckets_alloc += ++ (int) new.owned_by_allocator - (int) old.owned_by_allocator; ++ dev_usage->buckets_ec += ++ (int) new.stripe - (int) old.stripe; ++ dev_usage->buckets_unavailable += ++ is_unavailable_bucket(new) - is_unavailable_bucket(old); ++ ++ dev_usage->sectors[old.data_type] -= old.dirty_sectors; ++ dev_usage->sectors[new.data_type] += new.dirty_sectors; ++ dev_usage->sectors[BCH_DATA_CACHED] += ++ (int) new.cached_sectors - (int) old.cached_sectors; ++ dev_usage->sectors_fragmented += ++ is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); ++ preempt_enable(); ++ ++ if (!is_available_bucket(old) && is_available_bucket(new)) ++ bch2_wake_allocator(ca); ++} ++ ++void bch2_dev_usage_from_buckets(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct bucket_mark old = { .v.counter = 0 }; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ unsigned i; ++ int cpu; ++ ++ c->usage_base->hidden = 0; ++ ++ for_each_member_device(ca, c, i) { ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(ca->usage[0], cpu), 0, ++ sizeof(*ca->usage[0])); ++ ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ bch2_dev_usage_update(c, ca, c->usage_base, ++ old, g->mark, false); ++ } ++} ++ ++static inline int update_replicas(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ int idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) ++ return -1; ++ ++ if (!fs_usage) ++ return 0; ++ ++ switch (r->data_type) { ++ case BCH_DATA_BTREE: ++ fs_usage->btree += sectors; ++ break; ++ case BCH_DATA_USER: ++ fs_usage->data += sectors; ++ break; ++ case BCH_DATA_CACHED: ++ fs_usage->cached += sectors; ++ break; ++ } ++ fs_usage->replicas[idx] += sectors; ++ return 0; ++} ++ ++static inline void update_cached_sectors(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ unsigned dev, s64 sectors) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ update_replicas(c, fs_usage, &r.e, sectors); ++} ++ ++static struct replicas_delta_list * ++replicas_deltas_realloc(struct btree_trans *trans, unsigned more) ++{ ++ struct 
replicas_delta_list *d = trans->fs_usage_deltas; ++ unsigned new_size = d ? (d->size + more) * 2 : 128; ++ ++ if (!d || d->used + more > d->size) { ++ d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); ++ BUG_ON(!d); ++ ++ d->size = new_size; ++ trans->fs_usage_deltas = d; ++ } ++ return d; ++} ++ ++static inline void update_replicas_list(struct btree_trans *trans, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ struct replicas_delta_list *d; ++ struct replicas_delta *n; ++ unsigned b; ++ ++ if (!sectors) ++ return; ++ ++ b = replicas_entry_bytes(r) + 8; ++ d = replicas_deltas_realloc(trans, b); ++ ++ n = (void *) d->d + d->used; ++ n->delta = sectors; ++ memcpy(&n->r, r, replicas_entry_bytes(r)); ++ d->used += b; ++} ++ ++static inline void update_cached_sectors_list(struct btree_trans *trans, ++ unsigned dev, s64 sectors) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ update_replicas_list(trans, &r.e, sectors); ++} ++ ++static inline struct replicas_delta * ++replicas_delta_next(struct replicas_delta *d) ++{ ++ return (void *) d + replicas_entry_bytes(&d->r) + 8; ++} ++ ++int bch2_replicas_delta_list_apply(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct replicas_delta_list *r) ++{ ++ struct replicas_delta *d = r->d; ++ struct replicas_delta *top = (void *) r->d + r->used; ++ unsigned i; ++ ++ for (d = r->d; d != top; d = replicas_delta_next(d)) ++ if (update_replicas(c, fs_usage, &d->r, d->delta)) { ++ top = d; ++ goto unwind; ++ } ++ ++ if (!fs_usage) ++ return 0; ++ ++ fs_usage->nr_inodes += r->nr_inodes; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ fs_usage->reserved += r->persistent_reserved[i]; ++ fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; ++ } ++ ++ return 0; ++unwind: ++ for (d = r->d; d != top; d = replicas_delta_next(d)) ++ update_replicas(c, fs_usage, &d->r, -d->delta); ++ return -1; ++} ++ ++#define do_mark_fn(fn, c, pos, flags, ...) 
\ ++({ \ ++ int gc, ret = 0; \ ++ \ ++ percpu_rwsem_assert_held(&c->mark_lock); \ ++ \ ++ for (gc = 0; gc < 2 && !ret; gc++) \ ++ if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \ ++ (gc && gc_visited(c, pos))) \ ++ ret = fn(c, __VA_ARGS__, gc); \ ++ ret; \ ++}) ++ ++static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark *ret, ++ bool gc) ++{ ++ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ BUG_ON(!is_available_bucket(new)); ++ ++ new.owned_by_allocator = true; ++ new.data_type = 0; ++ new.cached_sectors = 0; ++ new.dirty_sectors = 0; ++ new.gen++; ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ if (old.cached_sectors) ++ update_cached_sectors(c, fs_usage, ca->dev_idx, ++ -((s64) old.cached_sectors)); ++ ++ if (!gc) ++ *ret = old; ++ return 0; ++} ++ ++void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark *old) ++{ ++ do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, ++ ca, b, old); ++ ++ if (!old->owned_by_allocator && old->cached_sectors) ++ trace_invalidate(ca, bucket_to_sector(ca, b), ++ old->cached_sectors); ++} ++ ++static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, bool owned_by_allocator, ++ bool gc) ++{ ++ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.owned_by_allocator = owned_by_allocator; ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ BUG_ON(!gc && ++ !owned_by_allocator && !old.owned_by_allocator); ++ ++ return 0; ++} ++ ++void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, bool owned_by_allocator, ++ struct gc_pos pos, unsigned flags) ++{ ++ preempt_disable(); ++ ++ do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, ++ ca, b, owned_by_allocator); ++ ++ preempt_enable(); ++} ++ ++static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BCH_BUCKET_MARK_GC; ++ struct bkey_alloc_unpacked u; ++ struct bch_dev *ca; ++ struct bucket *g; ++ struct bucket_mark old, m; ++ ++ /* ++ * alloc btree is read in by bch2_alloc_read, not gc: ++ */ ++ if ((flags & BCH_BUCKET_MARK_GC) && ++ !(flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE)) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ if (k.k->p.offset >= ca->mi.nbuckets) ++ return 0; ++ ++ g = __bucket(ca, k.k->p.offset, gc); ++ u = bch2_alloc_unpack(k); ++ ++ old = bucket_cmpxchg(g, m, ({ ++ m.gen = u.gen; ++ m.data_type = u.data_type; ++ m.dirty_sectors = u.dirty_sectors; ++ m.cached_sectors = u.cached_sectors; ++ ++ if (journal_seq) { ++ m.journal_seq_valid = 1; ++ m.journal_seq = journal_seq; ++ } ++ })); ++ ++ if (!(flags & BCH_BUCKET_MARK_ALLOC_READ)) ++ bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); ++ ++ g->io_time[READ] = u.read_time; ++ g->io_time[WRITE] = u.write_time; ++ g->oldest_gen = u.oldest_gen; ++ g->gen_valid = 1; ++ ++ /* ++ * need to know if we're getting called from the invalidate path or ++ * not: ++ */ ++ ++ if ((flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE) && ++ old.cached_sectors) { ++ update_cached_sectors(c, fs_usage, ca->dev_idx, ++ -old.cached_sectors); ++ trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), ++ 
old.cached_sectors); ++ } ++ ++ return 0; ++} ++ ++#define checked_add(a, b) \ ++({ \ ++ unsigned _res = (unsigned) (a) + (b); \ ++ bool overflow = _res > U16_MAX; \ ++ if (overflow) \ ++ _res = U16_MAX; \ ++ (a) = _res; \ ++ overflow; \ ++}) ++ ++static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type type, ++ unsigned sectors, bool gc) ++{ ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ bool overflow; ++ ++ BUG_ON(type != BCH_DATA_SB && ++ type != BCH_DATA_JOURNAL); ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.data_type = type; ++ overflow = checked_add(new.dirty_sectors, sectors); ++ })); ++ ++ bch2_fs_inconsistent_on(old.data_type && ++ old.data_type != type, c, ++ "different types of data in same bucket: %s, %s", ++ bch2_data_types[old.data_type], ++ bch2_data_types[type]); ++ ++ bch2_fs_inconsistent_on(overflow, c, ++ "bucket sector count overflow: %u + %u > U16_MAX", ++ old.dirty_sectors, sectors); ++ ++ if (c) ++ bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), ++ old, new, gc); ++ ++ return 0; ++} ++ ++void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type type, ++ unsigned sectors, struct gc_pos pos, ++ unsigned flags) ++{ ++ BUG_ON(type != BCH_DATA_SB && ++ type != BCH_DATA_JOURNAL); ++ ++ preempt_disable(); ++ ++ if (likely(c)) { ++ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, ++ ca, b, type, sectors); ++ } else { ++ __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); ++ } ++ ++ preempt_enable(); ++} ++ ++static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) ++{ ++ return DIV_ROUND_UP(sectors * n, d); ++} ++ ++static s64 __ptr_disk_sectors_delta(unsigned old_size, ++ unsigned offset, s64 delta, ++ unsigned flags, ++ unsigned n, unsigned d) ++{ ++ BUG_ON(!n || !d); ++ ++ if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) { ++ BUG_ON(offset + -delta > old_size); ++ ++ return -disk_sectors_scaled(n, d, old_size) + ++ disk_sectors_scaled(n, d, offset) + ++ disk_sectors_scaled(n, d, old_size - offset + delta); ++ } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { ++ BUG_ON(offset + -delta > old_size); ++ ++ return -disk_sectors_scaled(n, d, old_size) + ++ disk_sectors_scaled(n, d, old_size + delta); ++ } else { ++ return disk_sectors_scaled(n, d, delta); ++ } ++} ++ ++static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, ++ unsigned offset, s64 delta, ++ unsigned flags) ++{ ++ return __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, delta, flags, ++ p.crc.compressed_size, ++ p.crc.uncompressed_size); ++} ++ ++static void bucket_set_stripe(struct bch_fs *c, ++ const struct bch_stripe *v, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, ++ unsigned flags) ++{ ++ bool enabled = !(flags & BCH_BUCKET_MARK_OVERWRITE); ++ bool gc = flags & BCH_BUCKET_MARK_GC; ++ unsigned i; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ const struct bch_extent_ptr *ptr = v->ptrs + i; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, gc); ++ struct bucket_mark new, old; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.stripe = enabled; ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; ++ } ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ /* ++ * XXX write repair code for these, flag stripe as possibly bad ++ */ ++ if (old.gen != ptr->gen) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "stripe with stale pointer"); ++#if 0 
++ /* ++ * We'd like to check for these, but these checks don't work ++ * yet: ++ */ ++ if (old.stripe && enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "multiple stripes using same bucket"); ++ ++ if (!old.stripe && !enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "deleting stripe but bucket not marked as stripe bucket"); ++#endif ++ } ++} ++ ++static bool bch2_mark_pointer(struct bch_fs *c, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BCH_BUCKET_MARK_GC; ++ struct bucket_mark old, new; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); ++ bool overflow; ++ u64 v; ++ ++ v = atomic64_read(&g->_mark.v); ++ do { ++ new.v.counter = old.v.counter = v; ++ ++ /* ++ * Check this after reading bucket mark to guard against ++ * the allocator invalidating a bucket after we've already ++ * checked the gen ++ */ ++ if (gen_after(p.ptr.gen, new.gen)) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "pointer gen in the future"); ++ return true; ++ } ++ ++ if (new.gen != p.ptr.gen) { ++ /* XXX write repair code for this */ ++ if (!p.ptr.cached && ++ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "stale dirty pointer"); ++ return true; ++ } ++ ++ if (!p.ptr.cached) ++ overflow = checked_add(new.dirty_sectors, sectors); ++ else ++ overflow = checked_add(new.cached_sectors, sectors); ++ ++ if (!new.dirty_sectors && ++ !new.cached_sectors) { ++ new.data_type = 0; ++ ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; ++ } ++ } else { ++ new.data_type = data_type; ++ } ++ ++ if (flags & BCH_BUCKET_MARK_NOATOMIC) { ++ g->_mark = new; ++ break; ++ } ++ } while ((v = atomic64_cmpxchg(&g->_mark.v, ++ old.v.counter, ++ new.v.counter)) != old.v.counter); ++ ++ if (old.data_type && old.data_type != data_type) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ new.gen, ++ bch2_data_types[old.data_type], ++ bch2_data_types[data_type]); ++ ++ bch2_fs_inconsistent_on(overflow, c, ++ "bucket sector count overflow: %u + %lli > U16_MAX", ++ !p.ptr.cached ++ ? 
old.dirty_sectors ++ : old.cached_sectors, sectors); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ BUG_ON(!gc && bucket_became_unavailable(old, new)); ++ ++ return false; ++} ++ ++static int bch2_mark_stripe_ptr(struct bch_fs *c, ++ struct bch_extent_stripe_ptr p, ++ enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ s64 sectors, unsigned flags, ++ struct bch_replicas_padded *r, ++ unsigned *nr_data, ++ unsigned *nr_parity) ++{ ++ bool gc = flags & BCH_BUCKET_MARK_GC; ++ struct stripe *m; ++ unsigned old, new; ++ int blocks_nonempty_delta; ++ ++ m = genradix_ptr(&c->stripes[gc], p.idx); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ if (!m || !m->alive) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ return -EIO; ++ } ++ ++ BUG_ON(m->r.e.data_type != data_type); ++ ++ *nr_data = m->nr_blocks - m->nr_redundant; ++ *nr_parity = m->nr_redundant; ++ *r = m->r; ++ ++ old = m->block_sectors[p.block]; ++ m->block_sectors[p.block] += sectors; ++ new = m->block_sectors[p.block]; ++ ++ blocks_nonempty_delta = (int) !!new - (int) !!old; ++ if (blocks_nonempty_delta) { ++ m->blocks_nonempty += blocks_nonempty_delta; ++ ++ if (!gc) ++ bch2_stripes_heap_update(c, m, p.idx); ++ } ++ ++ m->dirty = true; ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ return 0; ++} ++ ++static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, ++ unsigned offset, s64 sectors, ++ enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ unsigned journal_seq, unsigned flags) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ s64 dirty_sectors = 0; ++ int ret; ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ BUG_ON(!sectors); ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = data_type == BCH_DATA_BTREE ++ ? 
sectors ++ : ptr_disk_sectors_delta(p, offset, sectors, flags); ++ bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, ++ fs_usage, journal_seq, flags); ++ ++ if (p.ptr.cached) { ++ if (!stale) ++ update_cached_sectors(c, fs_usage, p.ptr.dev, ++ disk_sectors); ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ struct bch_replicas_padded ec_r; ++ unsigned nr_data, nr_parity; ++ s64 parity_sectors; ++ ++ ret = bch2_mark_stripe_ptr(c, p.ec, data_type, ++ fs_usage, disk_sectors, flags, ++ &ec_r, &nr_data, &nr_parity); ++ if (ret) ++ return ret; ++ ++ parity_sectors = ++ __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, sectors, flags, ++ p.crc.compressed_size * nr_parity, ++ p.crc.uncompressed_size * nr_data); ++ ++ update_replicas(c, fs_usage, &ec_r.e, ++ disk_sectors + parity_sectors); ++ ++ /* ++ * There may be other dirty pointers in this extent, but ++ * if so they're not required for mounting if we have an ++ * erasure coded pointer in this extent: ++ */ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) ++ update_replicas(c, fs_usage, &r.e, dirty_sectors); ++ ++ return 0; ++} ++ ++static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BCH_BUCKET_MARK_GC; ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ size_t idx = s.k->p.offset; ++ struct stripe *m = genradix_ptr(&c->stripes[gc], idx); ++ unsigned i; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ if (!m || ((flags & BCH_BUCKET_MARK_OVERWRITE) && !m->alive)) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ bch_err_ratelimited(c, "error marking nonexistent stripe %zu", ++ idx); ++ return -1; ++ } ++ ++ if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) { ++ m->sectors = le16_to_cpu(s.v->sectors); ++ m->algorithm = s.v->algorithm; ++ m->nr_blocks = s.v->nr_blocks; ++ m->nr_redundant = s.v->nr_redundant; ++ ++ bch2_bkey_to_replicas(&m->r.e, k); ++ ++ /* ++ * XXX: account for stripes somehow here ++ */ ++#if 0 ++ update_replicas(c, fs_usage, &m->r.e, stripe_sectors); ++#endif ++ ++ /* gc recalculates these fields: */ ++ if (!(flags & BCH_BUCKET_MARK_GC)) { ++ for (i = 0; i < s.v->nr_blocks; i++) { ++ m->block_sectors[i] = ++ stripe_blockcount_get(s.v, i); ++ m->blocks_nonempty += !!m->block_sectors[i]; ++ } ++ } ++ ++ if (!gc) ++ bch2_stripes_heap_update(c, m, idx); ++ m->alive = true; ++ } else { ++ if (!gc) ++ bch2_stripes_heap_del(c, m, idx); ++ memset(m, 0, sizeof(*m)); ++ } ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ bucket_set_stripe(c, s.v, fs_usage, 0, flags); ++ return 0; ++} ++ ++int bch2_mark_key_locked(struct bch_fs *c, ++ struct bkey_s_c k, ++ unsigned offset, s64 sectors, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ int ret = 0; ++ ++ preempt_disable(); ++ ++ if (!fs_usage || (flags & BCH_BUCKET_MARK_GC)) ++ fs_usage = fs_usage_ptr(c, journal_seq, ++ flags & BCH_BUCKET_MARK_GC); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_alloc: ++ ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_btree_ptr: ++ sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) ++ ? 
c->opts.btree_node_size ++ : -c->opts.btree_node_size; ++ ++ ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE, ++ fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, ++ fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_stripe: ++ ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_inode: ++ if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) ++ fs_usage->nr_inodes++; ++ else ++ fs_usage->nr_inodes--; ++ break; ++ case KEY_TYPE_reservation: { ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ ++ sectors *= replicas; ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(fs_usage->persistent_reserved)); ++ ++ fs_usage->reserved += sectors; ++ fs_usage->persistent_reserved[replicas - 1] += sectors; ++ break; ++ } ++ } ++ ++ preempt_enable(); ++ ++ return ret; ++} ++ ++int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, ++ unsigned offset, s64 sectors, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ int ret; ++ ++ percpu_down_read(&c->mark_lock); ++ ret = bch2_mark_key_locked(c, k, offset, sectors, ++ fs_usage, journal_seq, flags); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++inline int bch2_mark_overwrite(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ struct bch_fs_usage *fs_usage, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = iter->l[0].b; ++ unsigned offset = 0; ++ s64 sectors = 0; ++ ++ flags |= BCH_BUCKET_MARK_OVERWRITE; ++ ++ if (btree_node_is_extents(b) ++ ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 ++ : bkey_cmp(new->k.p, old.k->p)) ++ return 0; ++ ++ if (btree_node_is_extents(b)) { ++ switch (bch2_extent_overlap(&new->k, old.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ offset = 0; ++ sectors = -((s64) old.k->size); ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = bkey_start_offset(&new->k) - ++ old.k->p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_FRONT: ++ offset = 0; ++ sectors = bkey_start_offset(old.k) - ++ new->k.p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = -((s64) new->k.size); ++ flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; ++ break; ++ } ++ ++ BUG_ON(sectors >= 0); ++ } ++ ++ return bch2_mark_key_locked(c, old, offset, sectors, fs_usage, ++ trans->journal_res.seq, flags) ?: 1; ++} ++ ++int bch2_mark_update(struct btree_trans *trans, ++ struct btree_insert_entry *insert, ++ struct bch_fs_usage *fs_usage, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter = insert->iter; ++ struct btree *b = iter->l[0].b; ++ struct btree_node_iter node_iter = iter->l[0].iter; ++ struct bkey_packed *_k; ++ int ret = 0; ++ ++ if (!btree_node_type_needs_gc(iter->btree_id)) ++ return 0; ++ ++ bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), ++ 0, insert->k->k.size, ++ fs_usage, trans->journal_res.seq, ++ BCH_BUCKET_MARK_INSERT|flags); ++ ++ if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) ++ return 0; ++ ++ /* ++ * For non extents, we only mark the new key, not the key being ++ * overwritten - unless we're actually deleting: ++ */ ++ if ((iter->btree_id == BTREE_ID_ALLOC || ++ iter->btree_id == BTREE_ID_EC) && ++ !bkey_deleted(&insert->k->k)) ++ return 0; ++ ++ while ((_k = 
bch2_btree_node_iter_peek_filter(&node_iter, b, ++ KEY_TYPE_discard))) { ++ struct bkey unpacked; ++ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ++ ++ ret = bch2_mark_overwrite(trans, iter, k, insert->k, ++ fs_usage, flags); ++ if (ret <= 0) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return ret; ++} ++ ++void bch2_trans_fs_usage_apply(struct btree_trans *trans, ++ struct bch_fs_usage *fs_usage) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ static int warned_disk_usage = 0; ++ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; ++ char buf[200]; ++ ++ if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, ++ trans->journal_res.seq) || ++ warned_disk_usage || ++ xchg(&warned_disk_usage, 1)) ++ return; ++ ++ bch_err(c, "disk usage increased more than %llu sectors reserved", ++ disk_res_sectors); ++ ++ trans_for_each_update(trans, i) { ++ struct btree_iter *iter = i->iter; ++ struct btree *b = iter->l[0].b; ++ struct btree_node_iter node_iter = iter->l[0].iter; ++ struct bkey_packed *_k; ++ ++ pr_err("while inserting"); ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); ++ pr_err("%s", buf); ++ pr_err("overlapping with"); ++ ++ node_iter = iter->l[0].iter; ++ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, ++ KEY_TYPE_discard))) { ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ ++ k = bkey_disassemble(b, _k, &unpacked); ++ ++ if (btree_node_is_extents(b) ++ ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 ++ : bkey_cmp(i->k->k.p, k.k->p)) ++ break; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ pr_err("%s", buf); ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ } ++} ++ ++/* trans_mark: */ ++ ++static int trans_get_key(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ struct btree_iter **iter, ++ struct bkey_s_c *k) ++{ ++ struct btree_insert_entry *i; ++ int ret; ++ ++ trans_for_each_update(trans, i) ++ if (i->iter->btree_id == btree_id && ++ (btree_node_type_is_extents(btree_id) ++ ? 
bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && ++ bkey_cmp(pos, i->k->k.p) < 0 ++ : !bkey_cmp(pos, i->iter->pos))) { ++ *iter = i->iter; ++ *k = bkey_i_to_s_c(i->k); ++ return 1; ++ } ++ ++ *iter = bch2_trans_get_iter(trans, btree_id, pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ if (IS_ERR(*iter)) ++ return PTR_ERR(*iter); ++ ++ *k = bch2_btree_iter_peek_slot(*iter); ++ ret = bkey_err(*k); ++ if (ret) ++ bch2_trans_iter_put(trans, *iter); ++ return ret; ++} ++ ++static void *trans_update_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned u64s) ++{ ++ struct btree_insert_entry *i; ++ struct bkey_i *new_k; ++ ++ new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(new_k)) ++ return new_k; ++ ++ bkey_init(&new_k->k); ++ new_k->k.p = iter->pos; ++ ++ trans_for_each_update(trans, i) ++ if (i->iter == iter) { ++ i->k = new_k; ++ return new_k; ++ } ++ ++ bch2_trans_update(trans, iter, new_k); ++ return new_k; ++} ++ ++static int bch2_trans_mark_pointer(struct btree_trans *trans, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_alloc_unpacked u; ++ struct bkey_i_alloc *a; ++ unsigned old; ++ bool overflow; ++ int ret; ++ ++ ret = trans_get_key(trans, BTREE_ID_ALLOC, ++ POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), ++ &iter, &k); ++ if (ret < 0) ++ return ret; ++ ++ if (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) { ++ /* ++ * During journal replay, and if gc repairs alloc info at ++ * runtime, the alloc info in the btree might not be up to date ++ * yet - so, trust the in memory mark: ++ */ ++ struct bucket *g; ++ struct bucket_mark m; ++ ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, iter->pos.offset); ++ m = READ_ONCE(g->mark); ++ u = alloc_mem_to_key(g, m); ++ percpu_up_read(&c->mark_lock); ++ } else { ++ /* ++ * Unless we're already updating that key: ++ */ ++ if (k.k->type != KEY_TYPE_alloc) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "pointer to nonexistent bucket %llu:%llu", ++ iter->pos.inode, iter->pos.offset); ++ ret = -1; ++ goto out; ++ } ++ ++ u = bch2_alloc_unpack(k); ++ } ++ ++ if (gen_after(u.gen, p.ptr.gen)) { ++ ret = 1; ++ goto out; ++ } ++ ++ if (u.data_type && u.data_type != data_type) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s", ++ iter->pos.inode, iter->pos.offset, ++ u.gen, ++ bch2_data_types[u.data_type], ++ bch2_data_types[data_type]); ++ ret = -1; ++ goto out; ++ } ++ ++ if (!p.ptr.cached) { ++ old = u.dirty_sectors; ++ overflow = checked_add(u.dirty_sectors, sectors); ++ } else { ++ old = u.cached_sectors; ++ overflow = checked_add(u.cached_sectors, sectors); ++ } ++ ++ u.data_type = u.dirty_sectors || u.cached_sectors ++ ? 
data_type : 0; ++ ++ bch2_fs_inconsistent_on(overflow, c, ++ "bucket sector count overflow: %u + %lli > U16_MAX", ++ old, sectors); ++ BUG_ON(overflow); ++ ++ a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ bkey_alloc_init(&a->k_i); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, u); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, ++ struct bch_extent_stripe_ptr p, ++ s64 sectors, enum bch_data_type data_type, ++ struct bch_replicas_padded *r, ++ unsigned *nr_data, ++ unsigned *nr_parity) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_i *new_k; ++ struct bkey_s_c k; ++ struct bkey_s_stripe s; ++ int ret = 0; ++ ++ ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); ++ if (ret < 0) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_stripe) { ++ bch2_fs_inconsistent(c, ++ "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ ret = -EIO; ++ goto out; ++ } ++ ++ new_k = trans_update_key(trans, iter, k.k->u64s); ++ ret = PTR_ERR_OR_ZERO(new_k); ++ if (ret) ++ goto out; ++ ++ bkey_reassemble(new_k, k); ++ s = bkey_i_to_s_stripe(new_k); ++ ++ stripe_blockcount_set(s.v, p.block, ++ stripe_blockcount_get(s.v, p.block) + ++ sectors); ++ ++ *nr_data = s.v->nr_blocks - s.v->nr_redundant; ++ *nr_parity = s.v->nr_redundant; ++ bch2_bkey_to_replicas(&r->e, s.s_c); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c k, unsigned offset, ++ s64 sectors, unsigned flags, ++ enum bch_data_type data_type) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ BUG_ON(!sectors); ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = data_type == BCH_DATA_BTREE ++ ? 
sectors ++ : ptr_disk_sectors_delta(p, offset, sectors, flags); ++ ++ ret = bch2_trans_mark_pointer(trans, p, disk_sectors, ++ data_type); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) ++ update_cached_sectors_list(trans, p.ptr.dev, ++ disk_sectors); ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ struct bch_replicas_padded ec_r; ++ unsigned nr_data, nr_parity; ++ s64 parity_sectors; ++ ++ ret = bch2_trans_mark_stripe_ptr(trans, p.ec, ++ disk_sectors, data_type, ++ &ec_r, &nr_data, &nr_parity); ++ if (ret) ++ return ret; ++ ++ parity_sectors = ++ __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, sectors, flags, ++ p.crc.compressed_size * nr_parity, ++ p.crc.uncompressed_size * nr_data); ++ ++ update_replicas_list(trans, &ec_r.e, ++ disk_sectors + parity_sectors); ++ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) ++ update_replicas_list(trans, &r.e, dirty_sectors); ++ ++ return 0; ++} ++ ++static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 idx, unsigned sectors, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_i *new_k; ++ struct bkey_s_c k; ++ struct bkey_i_reflink_v *r_v; ++ s64 ret; ++ ++ ret = trans_get_key(trans, BTREE_ID_REFLINK, ++ POS(0, idx), &iter, &k); ++ if (ret < 0) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_reflink_v) { ++ bch2_fs_inconsistent(c, ++ "%llu:%llu len %u points to nonexistent indirect extent %llu", ++ p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if ((flags & BCH_BUCKET_MARK_OVERWRITE) && ++ (bkey_start_offset(k.k) < idx || ++ k.k->p.offset > idx + sectors)) ++ goto out; ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); ++ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ ++ new_k = trans_update_key(trans, iter, k.k->u64s); ++ ret = PTR_ERR_OR_ZERO(new_k); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(new_k, k); ++ r_v = bkey_i_to_reflink_v(new_k); ++ ++ le64_add_cpu(&r_v->v.refcount, ++ !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 1 : -1); ++ ++ if (!r_v->v.refcount) { ++ r_v->k.type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(&r_v->k, 0); ++ } ++out: ++ ret = k.k->p.offset - idx; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, unsigned offset, ++ s64 sectors, unsigned flags) ++{ ++ u64 idx = le64_to_cpu(p.v->idx) + offset; ++ s64 ret = 0; ++ ++ sectors = abs(sectors); ++ BUG_ON(offset + sectors > p.k->size); ++ ++ while (sectors) { ++ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); ++ if (ret < 0) ++ break; ++ ++ idx += ret; ++ sectors = max_t(s64, 0LL, sectors - ret); ++ ret = 0; ++ } ++ ++ return ret; ++} ++ ++int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, ++ unsigned offset, s64 sectors, unsigned flags) ++{ ++ struct replicas_delta_list *d; ++ struct bch_fs *c = trans->c; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) ++ ? 
c->opts.btree_node_size ++ : -c->opts.btree_node_size; ++ ++ return bch2_trans_mark_extent(trans, k, offset, sectors, ++ flags, BCH_DATA_BTREE); ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return bch2_trans_mark_extent(trans, k, offset, sectors, ++ flags, BCH_DATA_USER); ++ case KEY_TYPE_inode: ++ d = replicas_deltas_realloc(trans, 0); ++ ++ if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) ++ d->nr_inodes++; ++ else ++ d->nr_inodes--; ++ return 0; ++ case KEY_TYPE_reservation: { ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ ++ d = replicas_deltas_realloc(trans, 0); ++ ++ sectors *= replicas; ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(d->persistent_reserved)); ++ ++ d->persistent_reserved[replicas - 1] += sectors; ++ return 0; ++ } ++ case KEY_TYPE_reflink_p: ++ return bch2_trans_mark_reflink_p(trans, ++ bkey_s_c_to_reflink_p(k), ++ offset, sectors, flags); ++ default: ++ return 0; ++ } ++} ++ ++int bch2_trans_mark_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct btree *b = iter->l[0].b; ++ struct btree_node_iter node_iter = iter->l[0].iter; ++ struct bkey_packed *_k; ++ int ret; ++ ++ if (!btree_node_type_needs_gc(iter->btree_id)) ++ return 0; ++ ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), ++ 0, insert->k.size, BCH_BUCKET_MARK_INSERT); ++ if (ret) ++ return ret; ++ ++ if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) ++ return 0; ++ ++ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, ++ KEY_TYPE_discard))) { ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ unsigned offset = 0; ++ s64 sectors = 0; ++ unsigned flags = BCH_BUCKET_MARK_OVERWRITE; ++ ++ k = bkey_disassemble(b, _k, &unpacked); ++ ++ if (btree_node_is_extents(b) ++ ? 
bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 ++ : bkey_cmp(insert->k.p, k.k->p)) ++ break; ++ ++ if (btree_node_is_extents(b)) { ++ switch (bch2_extent_overlap(&insert->k, k.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ offset = 0; ++ sectors = -((s64) k.k->size); ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ sectors = bkey_start_offset(&insert->k) - ++ k.k->p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_FRONT: ++ offset = 0; ++ sectors = bkey_start_offset(k.k) - ++ insert->k.p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ sectors = -((s64) insert->k.size); ++ flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; ++ break; ++ } ++ ++ BUG_ON(sectors >= 0); ++ } ++ ++ ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); ++ if (ret) ++ return ret; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return 0; ++} ++ ++/* Disk reservations: */ ++ ++static u64 bch2_recalc_sectors_available(struct bch_fs *c) ++{ ++ percpu_u64_set(&c->pcpu->sectors_available, 0); ++ ++ return avail_factor(__bch2_fs_usage_read_short(c).free); ++} ++ ++void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) ++{ ++ percpu_down_read(&c->mark_lock); ++ this_cpu_sub(c->usage[0]->online_reserved, ++ res->sectors); ++ percpu_up_read(&c->mark_lock); ++ ++ res->sectors = 0; ++} ++ ++#define SECTORS_CACHE 1024 ++ ++int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, ++ unsigned sectors, int flags) ++{ ++ struct bch_fs_pcpu *pcpu; ++ u64 old, v, get; ++ s64 sectors_available; ++ int ret; ++ ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ pcpu = this_cpu_ptr(c->pcpu); ++ ++ if (sectors <= pcpu->sectors_available) ++ goto out; ++ ++ v = atomic64_read(&c->sectors_available); ++ do { ++ old = v; ++ get = min((u64) sectors + SECTORS_CACHE, old); ++ ++ if (get < sectors) { ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ goto recalculate; ++ } ++ } while ((v = atomic64_cmpxchg(&c->sectors_available, ++ old, old - get)) != old); ++ ++ pcpu->sectors_available += get; ++ ++out: ++ pcpu->sectors_available -= sectors; ++ this_cpu_add(c->usage[0]->online_reserved, sectors); ++ res->sectors += sectors; ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ return 0; ++ ++recalculate: ++ percpu_down_write(&c->mark_lock); ++ ++ sectors_available = bch2_recalc_sectors_available(c); ++ ++ if (sectors <= sectors_available || ++ (flags & BCH_DISK_RESERVATION_NOFAIL)) { ++ atomic64_set(&c->sectors_available, ++ max_t(s64, 0, sectors_available - sectors)); ++ this_cpu_add(c->usage[0]->online_reserved, sectors); ++ res->sectors += sectors; ++ ret = 0; ++ } else { ++ atomic64_set(&c->sectors_available, sectors_available); ++ ret = -ENOSPC; ++ } ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return ret; ++} ++ ++/* Startup/shutdown: */ ++ ++static void buckets_free_rcu(struct rcu_head *rcu) ++{ ++ struct bucket_array *buckets = ++ container_of(rcu, struct bucket_array, rcu); ++ ++ kvpfree(buckets, ++ sizeof(struct bucket_array) + ++ buckets->nbuckets * sizeof(struct bucket)); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bucket_array *buckets = NULL, *old_buckets = NULL; ++ unsigned long *buckets_nouse = NULL; ++ alloc_fifo free[RESERVE_NR]; ++ alloc_fifo free_inc; ++ alloc_heap alloc_heap; ++ copygc_heap copygc_heap; ++ ++ size_t btree_reserve = 
DIV_ROUND_UP(BTREE_NODE_RESERVE, ++ ca->mi.bucket_size / c->opts.btree_node_size); ++ /* XXX: these should be tunable */ ++ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); ++ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); ++ size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), ++ btree_reserve * 2); ++ bool resize = ca->buckets[0] != NULL, ++ start_copygc = ca->copygc_thread != NULL; ++ int ret = -ENOMEM; ++ unsigned i; ++ ++ memset(&free, 0, sizeof(free)); ++ memset(&free_inc, 0, sizeof(free_inc)); ++ memset(&alloc_heap, 0, sizeof(alloc_heap)); ++ memset(©gc_heap, 0, sizeof(copygc_heap)); ++ ++ if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + ++ nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO)) || ++ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * ++ sizeof(unsigned long), ++ GFP_KERNEL|__GFP_ZERO)) || ++ !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || ++ !init_fifo(&free[RESERVE_MOVINGGC], ++ copygc_reserve, GFP_KERNEL) || ++ !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || ++ !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || ++ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || ++ !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL)) ++ goto err; ++ ++ buckets->first_bucket = ca->mi.first_bucket; ++ buckets->nbuckets = nbuckets; ++ ++ bch2_copygc_stop(ca); ++ ++ if (resize) { ++ down_write(&c->gc_lock); ++ down_write(&ca->bucket_lock); ++ percpu_down_write(&c->mark_lock); ++ } ++ ++ old_buckets = bucket_array(ca); ++ ++ if (resize) { ++ size_t n = min(buckets->nbuckets, old_buckets->nbuckets); ++ ++ memcpy(buckets->b, ++ old_buckets->b, ++ n * sizeof(struct bucket)); ++ memcpy(buckets_nouse, ++ ca->buckets_nouse, ++ BITS_TO_LONGS(n) * sizeof(unsigned long)); ++ } ++ ++ rcu_assign_pointer(ca->buckets[0], buckets); ++ buckets = old_buckets; ++ ++ swap(ca->buckets_nouse, buckets_nouse); ++ ++ if (resize) ++ percpu_up_write(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) { ++ fifo_move(&free[i], &ca->free[i]); ++ swap(ca->free[i], free[i]); ++ } ++ fifo_move(&free_inc, &ca->free_inc); ++ swap(ca->free_inc, free_inc); ++ spin_unlock(&c->freelist_lock); ++ ++ /* with gc lock held, alloc_heap can't be in use: */ ++ swap(ca->alloc_heap, alloc_heap); ++ ++ /* and we shut down copygc: */ ++ swap(ca->copygc_heap, copygc_heap); ++ ++ nbuckets = ca->mi.nbuckets; ++ ++ if (resize) { ++ up_write(&ca->bucket_lock); ++ up_write(&c->gc_lock); ++ } ++ ++ if (start_copygc && ++ bch2_copygc_start(c, ca)) ++ bch_err(ca, "error restarting copygc thread"); ++ ++ ret = 0; ++err: ++ free_heap(©gc_heap); ++ free_heap(&alloc_heap); ++ free_fifo(&free_inc); ++ for (i = 0; i < RESERVE_NR; i++) ++ free_fifo(&free[i]); ++ kvpfree(buckets_nouse, ++ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); ++ if (buckets) ++ call_rcu(&old_buckets->rcu, buckets_free_rcu); ++ ++ return ret; ++} ++ ++void bch2_dev_buckets_free(struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ free_heap(&ca->copygc_heap); ++ free_heap(&ca->alloc_heap); ++ free_fifo(&ca->free_inc); ++ for (i = 0; i < RESERVE_NR; i++) ++ free_fifo(&ca->free[i]); ++ kvpfree(ca->buckets_nouse, ++ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); ++ kvpfree(rcu_dereference_protected(ca->buckets[0], 1), ++ sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket)); ++ ++ free_percpu(ca->usage[0]); ++} ++ ++int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ if (!(ca->usage[0] = alloc_percpu(struct 
bch_dev_usage))) ++ return -ENOMEM; ++ ++ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; ++} +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +new file mode 100644 +index 000000000000..ad6f731b1cea +--- /dev/null ++++ b/fs/bcachefs/buckets.h +@@ -0,0 +1,337 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. ++ */ ++ ++#ifndef _BUCKETS_H ++#define _BUCKETS_H ++ ++#include "buckets_types.h" ++#include "super.h" ++ ++#define for_each_bucket(_b, _buckets) \ ++ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ ++ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) ++ ++#define bucket_cmpxchg(g, new, expr) \ ++({ \ ++ struct bucket *_g = g; \ ++ u64 _v = atomic64_read(&(g)->_mark.v); \ ++ struct bucket_mark _old; \ ++ \ ++ do { \ ++ (new).v.counter = _old.v.counter = _v; \ ++ expr; \ ++ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ ++ _old.v.counter, \ ++ (new).v.counter)) != _old.v.counter);\ ++ _old; \ ++}) ++ ++static inline struct bucket_array *__bucket_array(struct bch_dev *ca, ++ bool gc) ++{ ++ return rcu_dereference_check(ca->buckets[gc], ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ lockdep_is_held(&ca->bucket_lock)); ++} ++ ++static inline struct bucket_array *bucket_array(struct bch_dev *ca) ++{ ++ return __bucket_array(ca, false); ++} ++ ++static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) ++{ ++ struct bucket_array *buckets = __bucket_array(ca, gc); ++ ++ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); ++ return buckets->b + b; ++} ++ ++static inline struct bucket *bucket(struct bch_dev *ca, size_t b) ++{ ++ return __bucket(ca, b, false); ++} ++ ++static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, int rw) ++{ ++ bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand; ++} ++ ++static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) ++{ ++ return c->bucket_clock[rw].hand - g->io_time[rw]; ++} ++ ++/* ++ * bucket_gc_gen() returns the difference between the bucket's current gen and ++ * the oldest gen of any pointer into that bucket in the btree. ++ */ ++ ++static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) ++{ ++ struct bucket *g = bucket(ca, b); ++ ++ return g->mark.gen - g->oldest_gen; ++} ++ ++static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return sector_to_bucket(ca, ptr->offset); ++} ++ ++static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr, ++ bool gc) ++{ ++ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); ++} ++ ++static inline enum bch_data_type ptr_data_type(const struct bkey *k, ++ const struct bch_extent_ptr *ptr) ++{ ++ if (k->type == KEY_TYPE_btree_ptr) ++ return BCH_DATA_BTREE; ++ ++ return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER; ++} ++ ++static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ struct bucket_mark m; ++ ++ rcu_read_lock(); ++ m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); ++ rcu_read_unlock(); ++ ++ return m; ++} ++ ++static inline int gen_cmp(u8 a, u8 b) ++{ ++ return (s8) (a - b); ++} ++ ++static inline int gen_after(u8 a, u8 b) ++{ ++ int r = gen_cmp(a, b); ++ ++ return r > 0 ? r : 0; ++} ++ ++/** ++ * ptr_stale() - check if a pointer points into a bucket that has been ++ * invalidated. 
++ */ ++static inline u8 ptr_stale(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); ++} ++ ++static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, ++ unsigned live_size) ++{ ++ return live_size && p.crc.compression_type ++ ? max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, ++ p.crc.uncompressed_size)) ++ : live_size; ++} ++ ++static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) ++{ ++ return __ptr_disk_sectors(p, p.crc.live_size); ++} ++ ++/* bucket gc marks */ ++ ++static inline unsigned bucket_sectors_used(struct bucket_mark mark) ++{ ++ return mark.dirty_sectors + mark.cached_sectors; ++} ++ ++static inline bool bucket_unused(struct bucket_mark mark) ++{ ++ return !mark.owned_by_allocator && ++ !mark.data_type && ++ !bucket_sectors_used(mark); ++} ++ ++static inline bool is_available_bucket(struct bucket_mark mark) ++{ ++ return (!mark.owned_by_allocator && ++ !mark.dirty_sectors && ++ !mark.stripe); ++} ++ ++static inline bool bucket_needs_journal_commit(struct bucket_mark m, ++ u16 last_seq_ondisk) ++{ ++ return m.journal_seq_valid && ++ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); ++} ++ ++/* Device usage: */ ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); ++ ++void bch2_dev_usage_from_buckets(struct bch_fs *); ++ ++static inline u64 __dev_buckets_available(struct bch_dev *ca, ++ struct bch_dev_usage stats) ++{ ++ u64 total = ca->mi.nbuckets - ca->mi.first_bucket; ++ ++ if (WARN_ONCE(stats.buckets_unavailable > total, ++ "buckets_unavailable overflow (%llu > %llu)\n", ++ stats.buckets_unavailable, total)) ++ return 0; ++ ++ return total - stats.buckets_unavailable; ++} ++ ++/* ++ * Number of reclaimable buckets - only for use by the allocator thread: ++ */ ++static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca) ++{ ++ return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca)); ++} ++ ++static inline u64 __dev_buckets_free(struct bch_dev *ca, ++ struct bch_dev_usage stats) ++{ ++ return __dev_buckets_available(ca, stats) + ++ fifo_used(&ca->free[RESERVE_NONE]) + ++ fifo_used(&ca->free_inc); ++} ++ ++static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) ++{ ++ return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca)); ++} ++ ++/* Filesystem usage: */ ++ ++static inline unsigned fs_usage_u64s(struct bch_fs *c) ++{ ++ ++ return sizeof(struct bch_fs_usage) / sizeof(u64) + ++ READ_ONCE(c->replicas.nr); ++} ++ ++void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); ++struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); ++ ++struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); ++ ++void bch2_fs_usage_to_text(struct printbuf *, ++ struct bch_fs *, struct bch_fs_usage *); ++ ++u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *); ++ ++/* key/bucket marking: */ ++ ++void bch2_bucket_seq_cleanup(struct bch_fs *); ++void bch2_fs_usage_initialize(struct bch_fs *); ++ ++void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, struct bucket_mark *); ++void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, bool, struct gc_pos, unsigned); ++void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, enum bch_data_type, 
unsigned, ++ struct gc_pos, unsigned); ++ ++#define BCH_BUCKET_MARK_INSERT (1 << 0) ++#define BCH_BUCKET_MARK_OVERWRITE (1 << 1) ++#define BCH_BUCKET_MARK_OVERWRITE_SPLIT (1 << 2) ++#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 3) ++#define BCH_BUCKET_MARK_GC (1 << 4) ++#define BCH_BUCKET_MARK_ALLOC_READ (1 << 5) ++#define BCH_BUCKET_MARK_NOATOMIC (1 << 6) ++ ++int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64, ++ struct bch_fs_usage *, u64, unsigned); ++int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, ++ struct bch_fs_usage *, u64, unsigned); ++int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, ++ struct disk_reservation *, unsigned); ++ ++int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, ++ struct bkey_s_c, struct bkey_i *, ++ struct bch_fs_usage *, unsigned); ++int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, ++ struct bch_fs_usage *, unsigned); ++ ++int bch2_replicas_delta_list_apply(struct bch_fs *, ++ struct bch_fs_usage *, ++ struct replicas_delta_list *); ++int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, ++ unsigned, s64, unsigned); ++int bch2_trans_mark_update(struct btree_trans *, ++ struct btree_iter *iter, ++ struct bkey_i *insert); ++void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); ++ ++/* disk reservations: */ ++ ++void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); ++ ++static inline void bch2_disk_reservation_put(struct bch_fs *c, ++ struct disk_reservation *res) ++{ ++ if (res->sectors) ++ __bch2_disk_reservation_put(c, res); ++} ++ ++#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) ++ ++int bch2_disk_reservation_add(struct bch_fs *, ++ struct disk_reservation *, ++ unsigned, int); ++ ++static inline struct disk_reservation ++bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) ++{ ++ return (struct disk_reservation) { ++ .sectors = 0, ++#if 0 ++ /* not used yet: */ ++ .gen = c->capacity_gen, ++#endif ++ .nr_replicas = nr_replicas, ++ }; ++} ++ ++static inline int bch2_disk_reservation_get(struct bch_fs *c, ++ struct disk_reservation *res, ++ unsigned sectors, ++ unsigned nr_replicas, ++ int flags) ++{ ++ *res = bch2_disk_reservation_init(c, nr_replicas); ++ ++ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); ++void bch2_dev_buckets_free(struct bch_dev *); ++int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); ++ ++#endif /* _BUCKETS_H */ +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +new file mode 100644 +index 000000000000..f3ff4a18b1fd +--- /dev/null ++++ b/fs/bcachefs/buckets_types.h +@@ -0,0 +1,130 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_TYPES_H ++#define _BUCKETS_TYPES_H ++ ++#include "bcachefs_format.h" ++#include "util.h" ++ ++#define BUCKET_JOURNAL_SEQ_BITS 16 ++ ++struct bucket_mark { ++ union { ++ atomic64_t v; ++ ++ struct { ++ u8 gen; ++ u8 data_type:3, ++ owned_by_allocator:1, ++ journal_seq_valid:1, ++ stripe:1; ++ u16 dirty_sectors; ++ u16 cached_sectors; ++ ++ /* ++ * low bits of journal sequence number when this bucket was most ++ * recently modified: if journal_seq_valid is set, this bucket can't be ++ * reused until the journal sequence number written to disk is >= the ++ * bucket's journal sequence number: ++ */ ++ u16 journal_seq; ++ }; ++ }; ++}; ++ ++struct bucket { ++ union { ++ struct bucket_mark _mark; ++ const struct 
bucket_mark mark; ++ }; ++ ++ u16 io_time[2]; ++ u8 oldest_gen; ++ unsigned gen_valid:1; ++}; ++ ++struct bucket_array { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ struct bucket b[]; ++}; ++ ++struct bch_dev_usage { ++ u64 buckets[BCH_DATA_NR]; ++ u64 buckets_alloc; ++ u64 buckets_ec; ++ u64 buckets_unavailable; ++ ++ /* _compressed_ sectors: */ ++ u64 sectors[BCH_DATA_NR]; ++ u64 sectors_fragmented; ++}; ++ ++struct bch_fs_usage { ++ /* all fields are in units of 512 byte sectors: */ ++ ++ u64 online_reserved; ++ ++ /* fields after online_reserved are cleared/recalculated by gc: */ ++ u64 gc_start[0]; ++ ++ u64 hidden; ++ u64 btree; ++ u64 data; ++ u64 cached; ++ u64 reserved; ++ u64 nr_inodes; ++ ++ /* XXX: add stats for compression ratio */ ++#if 0 ++ u64 uncompressed; ++ u64 compressed; ++#endif ++ ++ /* broken out: */ ++ ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ u64 replicas[]; ++}; ++ ++struct bch_fs_usage_short { ++ u64 capacity; ++ u64 used; ++ u64 free; ++ u64 nr_inodes; ++}; ++ ++struct replicas_delta { ++ s64 delta; ++ struct bch_replicas_entry r; ++} __packed; ++ ++struct replicas_delta_list { ++ unsigned size; ++ unsigned used; ++ ++ struct {} memset_start; ++ u64 nr_inodes; ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ struct {} memset_end; ++ struct replicas_delta d[0]; ++}; ++ ++/* ++ * A reservation for space on disk: ++ */ ++struct disk_reservation { ++ u64 sectors; ++ u32 gen; ++ unsigned nr_replicas; ++}; ++ ++struct copygc_heap_entry { ++ u8 gen; ++ u32 sectors; ++ u64 offset; ++}; ++ ++typedef HEAP(struct copygc_heap_entry) copygc_heap; ++ ++#endif /* _BUCKETS_TYPES_H */ +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +new file mode 100644 +index 000000000000..059eca01ccc4 +--- /dev/null ++++ b/fs/bcachefs/chardev.c +@@ -0,0 +1,671 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_CHARDEV ++ ++#include "bcachefs.h" ++#include "bcachefs_ioctl.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "move.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* returns with ref on ca->ref */ ++static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, ++ unsigned flags) ++{ ++ struct bch_dev *ca; ++ ++ if (flags & BCH_BY_INDEX) { ++ if (dev >= c->sb.nr_devices) ++ return ERR_PTR(-EINVAL); ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ if (!ca) ++ return ERR_PTR(-EINVAL); ++ } else { ++ char *path; ++ ++ path = strndup_user((const char __user *) ++ (unsigned long) dev, PATH_MAX); ++ if (IS_ERR(path)) ++ return ERR_CAST(path); ++ ++ ca = bch2_dev_lookup(c, path); ++ kfree(path); ++ } ++ ++ return ca; ++} ++ ++#if 0 ++static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) ++{ ++ struct bch_ioctl_assemble arg; ++ struct bch_fs *c; ++ u64 *user_devs = NULL; ++ char **devs = NULL; ++ unsigned i; ++ int ret = -EFAULT; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); ++ if (!user_devs) ++ return -ENOMEM; ++ ++ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); ++ ++ if (copy_from_user(user_devs, user_arg->devs, ++ sizeof(u64) * arg.nr_devs)) ++ goto err; ++ ++ for (i = 0; i < arg.nr_devs; i++) { ++ devs[i] = strndup_user((const char 
__user *)(unsigned long) ++ user_devs[i], ++ PATH_MAX); ++ if (!devs[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); ++ ret = PTR_ERR_OR_ZERO(c); ++ if (!ret) ++ closure_put(&c->cl); ++err: ++ if (devs) ++ for (i = 0; i < arg.nr_devs; i++) ++ kfree(devs[i]); ++ kfree(devs); ++ return ret; ++} ++ ++static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) ++{ ++ struct bch_ioctl_incremental arg; ++ const char *err; ++ char *path; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ err = bch2_fs_open_incremental(path); ++ kfree(path); ++ ++ if (err) { ++ pr_err("Could not register bcachefs devices: %s", err); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++#endif ++ ++static long bch2_global_ioctl(unsigned cmd, void __user *arg) ++{ ++ switch (cmd) { ++#if 0 ++ case BCH_IOCTL_ASSEMBLE: ++ return bch2_ioctl_assemble(arg); ++ case BCH_IOCTL_INCREMENTAL: ++ return bch2_ioctl_incremental(arg); ++#endif ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static long bch2_ioctl_query_uuid(struct bch_fs *c, ++ struct bch_ioctl_query_uuid __user *user_arg) ++{ ++ return copy_to_user(&user_arg->uuid, ++ &c->sb.user_uuid, ++ sizeof(c->sb.user_uuid)); ++} ++ ++#if 0 ++static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) ++{ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ return bch2_fs_start(c); ++} ++ ++static long bch2_ioctl_stop(struct bch_fs *c) ++{ ++ bch2_fs_stop(c); ++ return 0; ++} ++#endif ++ ++static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ char *path; ++ int ret; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_add(c, path); ++ kfree(path); ++ ++ return ret; ++} ++ ++static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ return bch2_dev_remove(c, ca, arg.flags); ++} ++ ++static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ char *path; ++ int ret; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_online(c, path); ++ kfree(path); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_offline(c, ca, arg.flags); ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_set_state(struct bch_fs *c, ++ struct bch_ioctl_disk_set_state arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ 
BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad[0] || arg.pad[1] || arg.pad[2]) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++struct bch_data_ctx { ++ struct bch_fs *c; ++ struct bch_ioctl_data arg; ++ struct bch_move_stats stats; ++ ++ int ret; ++ ++ struct task_struct *thread; ++}; ++ ++static int bch2_data_thread(void *arg) ++{ ++ struct bch_data_ctx *ctx = arg; ++ ++ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); ++ ++ ctx->stats.data_type = U8_MAX; ++ return 0; ++} ++ ++static int bch2_data_job_release(struct inode *inode, struct file *file) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ ++ kthread_stop(ctx->thread); ++ put_task_struct(ctx->thread); ++ kfree(ctx); ++ return 0; ++} ++ ++static ssize_t bch2_data_job_read(struct file *file, char __user *buf, ++ size_t len, loff_t *ppos) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ struct bch_fs *c = ctx->c; ++ struct bch_ioctl_data_event e = { ++ .type = BCH_DATA_EVENT_PROGRESS, ++ .p.data_type = ctx->stats.data_type, ++ .p.btree_id = ctx->stats.btree_id, ++ .p.pos = ctx->stats.pos, ++ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), ++ .p.sectors_total = bch2_fs_usage_read_short(c).used, ++ }; ++ ++ if (len < sizeof(e)) ++ return -EINVAL; ++ ++ return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); ++} ++ ++static const struct file_operations bcachefs_data_ops = { ++ .release = bch2_data_job_release, ++ .read = bch2_data_job_read, ++ .llseek = no_llseek, ++}; ++ ++static long bch2_ioctl_data(struct bch_fs *c, ++ struct bch_ioctl_data arg) ++{ ++ struct bch_data_ctx *ctx = NULL; ++ struct file *file = NULL; ++ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; ++ int ret, fd = -1; ++ ++ if (arg.op >= BCH_DATA_OP_NR || arg.flags) ++ return -EINVAL; ++ ++ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); ++ if (!ctx) ++ return -ENOMEM; ++ ++ ctx->c = c; ++ ctx->arg = arg; ++ ++ ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); ++ if (IS_ERR(ctx->thread)) { ++ ret = PTR_ERR(ctx->thread); ++ goto err; ++ } ++ ++ ret = get_unused_fd_flags(flags); ++ if (ret < 0) ++ goto err; ++ fd = ret; ++ ++ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); ++ if (IS_ERR(file)) { ++ ret = PTR_ERR(file); ++ goto err; ++ } ++ ++ fd_install(fd, file); ++ ++ get_task_struct(ctx->thread); ++ wake_up_process(ctx->thread); ++ ++ return fd; ++err: ++ if (fd >= 0) ++ put_unused_fd(fd); ++ if (!IS_ERR_OR_NULL(ctx->thread)) ++ kthread_stop(ctx->thread); ++ kfree(ctx); ++ return ret; ++} ++ ++static long bch2_ioctl_usage(struct bch_fs *c, ++ struct bch_ioctl_usage __user *user_arg) ++{ ++ struct bch_ioctl_usage arg; ++ struct bch_dev *ca; ++ unsigned i, j; ++ int ret; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ for (i = 0; i < arg.nr_devices; i++) { ++ struct bch_ioctl_dev_usage dst = { .alive = 0 }; ++ ++ ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)); ++ if (ret) ++ return ret; ++ } ++ ++ { ++ struct bch_fs_usage *src; ++ struct bch_ioctl_fs_usage dst = { ++ .capacity = c->capacity, ++ }; ++ ++ src = bch2_fs_usage_read(c); ++ if (!src) ++ return -ENOMEM; ++ ++ dst.used = bch2_fs_sectors_used(c, src); ++ dst.online_reserved = src->online_reserved; ++ ++ percpu_up_read(&c->mark_lock); ++ ++ for 
(i = 0; i < BCH_REPLICAS_MAX; i++) { ++ dst.persistent_reserved[i] = ++ src->persistent_reserved[i]; ++#if 0 ++ for (j = 0; j < BCH_DATA_NR; j++) ++ dst.sectors[j][i] = src.replicas[i].data[j]; ++#endif ++ } ++ ++ kfree(src); ++ ++ ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst)); ++ if (ret) ++ return ret; ++ } ++ ++ for_each_member_device(ca, c, i) { ++ struct bch_dev_usage src = bch2_dev_usage_read(c, ca); ++ struct bch_ioctl_dev_usage dst = { ++ .alive = 1, ++ .state = ca->mi.state, ++ .bucket_size = ca->mi.bucket_size, ++ .nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket, ++ }; ++ ++ if (ca->dev_idx >= arg.nr_devices) { ++ percpu_ref_put(&ca->ref); ++ return -ERANGE; ++ } ++ ++ if (percpu_ref_tryget(&ca->io_ref)) { ++ dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ for (j = 0; j < BCH_DATA_NR; j++) { ++ dst.buckets[j] = src.buckets[j]; ++ dst.sectors[j] = src.sectors[j]; ++ } ++ ++ ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static long bch2_ioctl_read_super(struct bch_fs *c, ++ struct bch_ioctl_read_super arg) ++{ ++ struct bch_dev *ca = NULL; ++ struct bch_sb *sb; ++ int ret = 0; ++ ++ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || ++ arg.pad) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (arg.flags & BCH_READ_DEV) { ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ ++ if (IS_ERR(ca)) { ++ ret = PTR_ERR(ca); ++ goto err; ++ } ++ ++ sb = ca->disk_sb.sb; ++ } else { ++ sb = c->disk_sb.sb; ++ } ++ ++ if (vstruct_bytes(sb) > arg.size) { ++ ret = -ERANGE; ++ goto err; ++ } ++ ++ ret = copy_to_user((void __user *)(unsigned long)arg.sb, ++ sb, vstruct_bytes(sb)); ++err: ++ if (ca) ++ percpu_ref_put(&ca->ref); ++ mutex_unlock(&c->sb_lock); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_get_idx(struct bch_fs *c, ++ struct bch_ioctl_disk_get_idx arg) ++{ ++ dev_t dev = huge_decode_dev(arg.dev); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ if (ca->disk_sb.bdev->bd_dev == dev) { ++ percpu_ref_put(&ca->io_ref); ++ return i; ++ } ++ ++ return -ENOENT; ++} ++ ++static long bch2_ioctl_disk_resize(struct bch_fs *c, ++ struct bch_ioctl_disk_resize arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_resize(c, ca, arg.nbuckets); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++#define BCH_IOCTL(_name, _argtype) \ ++do { \ ++ _argtype i; \ ++ \ ++ if (copy_from_user(&i, arg, sizeof(i))) \ ++ return -EFAULT; \ ++ return bch2_ioctl_##_name(c, i); \ ++} while (0) ++ ++long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) ++{ ++ /* ioctls that don't require admin cap: */ ++ switch (cmd) { ++ case BCH_IOCTL_QUERY_UUID: ++ return bch2_ioctl_query_uuid(c, arg); ++ case BCH_IOCTL_USAGE: ++ return bch2_ioctl_usage(c, arg); ++ } ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ switch (cmd) { ++#if 0 ++ case BCH_IOCTL_START: ++ BCH_IOCTL(start, struct bch_ioctl_start); ++ case BCH_IOCTL_STOP: ++ return bch2_ioctl_stop(c); ++#endif ++ case BCH_IOCTL_READ_SUPER: ++ BCH_IOCTL(read_super, struct bch_ioctl_read_super); ++ case BCH_IOCTL_DISK_GET_IDX: ++ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); ++ } ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ /* ioctls that do require admin cap: */ ++ 
switch (cmd) { ++ case BCH_IOCTL_DISK_ADD: ++ BCH_IOCTL(disk_add, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_REMOVE: ++ BCH_IOCTL(disk_remove, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_ONLINE: ++ BCH_IOCTL(disk_online, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_OFFLINE: ++ BCH_IOCTL(disk_offline, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_SET_STATE: ++ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); ++ case BCH_IOCTL_DATA: ++ BCH_IOCTL(data, struct bch_ioctl_data); ++ case BCH_IOCTL_DISK_RESIZE: ++ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); ++ ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static DEFINE_IDR(bch_chardev_minor); ++ ++static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) ++{ ++ unsigned minor = iminor(file_inode(filp)); ++ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; ++ void __user *arg = (void __user *) v; ++ ++ return c ++ ? bch2_fs_ioctl(c, cmd, arg) ++ : bch2_global_ioctl(cmd, arg); ++} ++ ++static const struct file_operations bch_chardev_fops = { ++ .owner = THIS_MODULE, ++ .unlocked_ioctl = bch2_chardev_ioctl, ++ .open = nonseekable_open, ++}; ++ ++static int bch_chardev_major; ++static struct class *bch_chardev_class; ++static struct device *bch_chardev; ++ ++void bch2_fs_chardev_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->chardev)) ++ device_unregister(c->chardev); ++ if (c->minor >= 0) ++ idr_remove(&bch_chardev_minor, c->minor); ++} ++ ++int bch2_fs_chardev_init(struct bch_fs *c) ++{ ++ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); ++ if (c->minor < 0) ++ return c->minor; ++ ++ c->chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, c->minor), c, ++ "bcachefs%u-ctl", c->minor); ++ if (IS_ERR(c->chardev)) ++ return PTR_ERR(c->chardev); ++ ++ return 0; ++} ++ ++void bch2_chardev_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ device_destroy(bch_chardev_class, ++ MKDEV(bch_chardev_major, U8_MAX)); ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ class_destroy(bch_chardev_class); ++ if (bch_chardev_major > 0) ++ unregister_chrdev(bch_chardev_major, "bcachefs"); ++} ++ ++int __init bch2_chardev_init(void) ++{ ++ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); ++ if (bch_chardev_major < 0) ++ return bch_chardev_major; ++ ++ bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); ++ if (IS_ERR(bch_chardev_class)) ++ return PTR_ERR(bch_chardev_class); ++ ++ bch_chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, U8_MAX), ++ NULL, "bcachefs-ctl"); ++ if (IS_ERR(bch_chardev)) ++ return PTR_ERR(bch_chardev); ++ ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_CHARDEV */ +diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h +new file mode 100644 +index 000000000000..3a4890d39ff9 +--- /dev/null ++++ b/fs/bcachefs/chardev.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHARDEV_H ++#define _BCACHEFS_CHARDEV_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); ++ ++void bch2_fs_chardev_exit(struct bch_fs *); ++int bch2_fs_chardev_init(struct bch_fs *); ++ ++void bch2_chardev_exit(void); ++int __init bch2_chardev_init(void); ++ ++#else ++ ++static inline long bch2_fs_ioctl(struct bch_fs *c, ++ unsigned cmd, void __user * arg) ++{ ++ return -ENOSYS; ++} ++ ++static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} ++static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } ++ 
++static inline void bch2_chardev_exit(void) {} ++static inline int __init bch2_chardev_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_CHARDEV_H */ +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +new file mode 100644 +index 000000000000..2e1dfdc68e15 +--- /dev/null ++++ b/fs/bcachefs/checksum.c +@@ -0,0 +1,617 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static u64 bch2_checksum_init(unsigned type) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ return U32_MAX; ++ case BCH_CSUM_CRC64_NONZERO: ++ return U64_MAX; ++ case BCH_CSUM_CRC32C: ++ return 0; ++ case BCH_CSUM_CRC64: ++ return 0; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 bch2_checksum_final(unsigned type, u64 crc) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ return crc ^ U32_MAX; ++ case BCH_CSUM_CRC64_NONZERO: ++ return crc ^ U64_MAX; ++ case BCH_CSUM_CRC32C: ++ return crc; ++ case BCH_CSUM_CRC64: ++ return crc; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC32C: ++ return crc32c(crc, data, len); ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC64: ++ return crc64_be(crc, data, len); ++ default: ++ BUG(); ++ } ++} ++ ++static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ struct scatterlist *sg, size_t len) ++{ ++ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); ++ int ret; ++ ++ skcipher_request_set_sync_tfm(req, tfm); ++ skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ++ ++ ret = crypto_skcipher_encrypt(req); ++ BUG_ON(ret); ++} ++ ++static inline void do_encrypt(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ void *buf, size_t len) ++{ ++ struct scatterlist sg; ++ ++ sg_init_one(&sg, buf, len); ++ do_encrypt_sg(tfm, nonce, &sg, len); ++} ++ ++int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, ++ void *buf, size_t len) ++{ ++ struct crypto_sync_skcipher *chacha20 = ++ crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ int ret; ++ ++ if (!chacha20) { ++ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); ++ return PTR_ERR(chacha20); ++ } ++ ++ ret = crypto_skcipher_setkey(&chacha20->base, ++ (void *) key, sizeof(*key)); ++ if (ret) { ++ pr_err("crypto_skcipher_setkey() error: %i", ret); ++ goto err; ++ } ++ ++ do_encrypt(chacha20, nonce, buf, len); ++err: ++ crypto_free_sync_skcipher(chacha20); ++ return ret; ++} ++ ++static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, ++ struct nonce nonce) ++{ ++ u8 key[POLY1305_KEY_SIZE]; ++ ++ nonce.d[3] ^= BCH_NONCE_POLY; ++ ++ memset(key, 0, sizeof(key)); ++ do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ ++ desc->tfm = c->poly1305; ++ crypto_shash_init(desc); ++ crypto_shash_update(desc, key, sizeof(key)); ++} ++ ++struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, ++ struct nonce nonce, const void *data, size_t len) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: { ++ u64 crc = bch2_checksum_init(type); ++ ++ crc = 
bch2_checksum_update(type, crc, data, len); ++ crc = bch2_checksum_final(type, crc); ++ ++ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; ++ } ++ ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++ crypto_shash_update(desc, data, len); ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_encrypt(struct bch_fs *c, unsigned type, ++ struct nonce nonce, void *data, size_t len) ++{ ++ if (!bch2_csum_type_is_encryption(type)) ++ return; ++ ++ do_encrypt(c->chacha20, nonce, data, len); ++} ++ ++static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio, ++ struct bvec_iter *iter) ++{ ++ struct bio_vec bv; ++ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return (struct bch_csum) { 0 }; ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: { ++ u64 crc = bch2_checksum_init(type); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ crc = bch2_checksum_update(type, ++ crc, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ crc = bch2_checksum_update(type, crc, ++ page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ crc = bch2_checksum_final(type, crc); ++ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; ++ } ++ ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ ++ crypto_shash_update(desc, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ crypto_shash_update(desc, ++ page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ ++ return __bch2_checksum_bio(c, type, nonce, bio, &iter); ++} ++ ++void bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ struct scatterlist sgl[16], *sg = sgl; ++ size_t bytes = 0; ++ ++ if (!bch2_csum_type_is_encryption(type)) ++ return; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ ++ bio_for_each_segment(bv, bio, iter) { ++ if (sg == sgl + ARRAY_SIZE(sgl)) { ++ sg_mark_end(sg - 1); ++ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ ++ nonce = nonce_add(nonce, bytes); ++ bytes = 0; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ sg = sgl; ++ } ++ ++ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); ++ bytes += bv.bv_len; ++ } ++ ++ sg_mark_end(sg - 1); ++ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++} ++ ++struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, ++ struct bch_csum b, size_t b_len) ++{ ++ BUG_ON(!bch2_checksum_mergeable(type)); ++ ++ while (b_len) { ++ 
unsigned b = min_t(unsigned, b_len, PAGE_SIZE); ++ ++ a.lo = bch2_checksum_update(type, a.lo, ++ page_address(ZERO_PAGE(0)), b); ++ b_len -= b; ++ } ++ ++ a.lo ^= b.lo; ++ a.hi ^= b.hi; ++ return a; ++} ++ ++int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc_old, ++ struct bch_extent_crc_unpacked *crc_a, ++ struct bch_extent_crc_unpacked *crc_b, ++ unsigned len_a, unsigned len_b, ++ unsigned new_csum_type) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ struct nonce nonce = extent_nonce(version, crc_old); ++ struct bch_csum merged = { 0 }; ++ struct crc_split { ++ struct bch_extent_crc_unpacked *crc; ++ unsigned len; ++ unsigned csum_type; ++ struct bch_csum csum; ++ } splits[3] = { ++ { crc_a, len_a, new_csum_type }, ++ { crc_b, len_b, new_csum_type }, ++ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, ++ }, *i; ++ bool mergeable = crc_old.csum_type == new_csum_type && ++ bch2_checksum_mergeable(new_csum_type); ++ unsigned crc_nonce = crc_old.nonce; ++ ++ BUG_ON(len_a + len_b > bio_sectors(bio)); ++ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); ++ BUG_ON(crc_old.compression_type); ++ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)); ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ iter.bi_size = i->len << 9; ++ if (mergeable || i->crc) ++ i->csum = __bch2_checksum_bio(c, i->csum_type, ++ nonce, bio, &iter); ++ else ++ bio_advance_iter(bio, &iter, i->len << 9); ++ nonce = nonce_add(nonce, i->len << 9); ++ } ++ ++ if (mergeable) ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) ++ merged = bch2_checksum_merge(new_csum_type, merged, ++ i->csum, i->len << 9); ++ else ++ merged = bch2_checksum_bio(c, crc_old.csum_type, ++ extent_nonce(version, crc_old), bio); ++ ++ if (bch2_crc_cmp(merged, crc_old.csum)) ++ return -EIO; ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ if (i->crc) ++ *i->crc = (struct bch_extent_crc_unpacked) { ++ .csum_type = i->csum_type, ++ .compressed_size = i->len, ++ .uncompressed_size = i->len, ++ .offset = 0, ++ .live_size = i->len, ++ .nonce = crc_nonce, ++ .csum = i->csum, ++ }; ++ ++ if (bch2_csum_type_is_encryption(new_csum_type)) ++ crc_nonce += i->len; ++ } ++ ++ return 0; ++} ++ ++#ifdef __KERNEL__ ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ char key_description[60]; ++ struct key *keyring_key; ++ const struct user_key_payload *ukp; ++ int ret; ++ ++ snprintf(key_description, sizeof(key_description), ++ "bcachefs:%pUb", &sb->user_uuid); ++ ++ keyring_key = request_key(&key_type_logon, key_description, NULL); ++ if (IS_ERR(keyring_key)) ++ return PTR_ERR(keyring_key); ++ ++ down_read(&keyring_key->sem); ++ ukp = dereference_key_locked(keyring_key); ++ if (ukp->datalen == sizeof(*key)) { ++ memcpy(key, ukp->data, ukp->datalen); ++ ret = 0; ++ } else { ++ ret = -EINVAL; ++ } ++ up_read(&keyring_key->sem); ++ key_put(keyring_key); ++ ++ return ret; ++} ++#else ++#include ++#include ++ ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ key_serial_t key_id; ++ char key_description[60]; ++ char uuid[40]; ++ ++ uuid_unparse_lower(sb->user_uuid.b, uuid); ++ sprintf(key_description, "bcachefs:%s", uuid); ++ ++ key_id = request_key("user", key_description, NULL, ++ KEY_SPEC_USER_KEYRING); ++ if (key_id < 0) ++ return -errno; ++ ++ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) ++ return -1; ++ ++ return 0; ++} ++#endif 
++ ++int bch2_decrypt_sb_key(struct bch_fs *c, ++ struct bch_sb_field_crypt *crypt, ++ struct bch_key *key) ++{ ++ struct bch_encrypted_key sb_key = crypt->key; ++ struct bch_key user_key; ++ int ret = 0; ++ ++ /* is key encrypted? */ ++ if (!bch2_key_is_encrypted(&sb_key)) ++ goto out; ++ ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %i", ret); ++ goto err; ++ } ++ ++ /* decrypt real key: */ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &sb_key, sizeof(sb_key)); ++ if (ret) ++ goto err; ++ ++ if (bch2_key_is_encrypted(&sb_key)) { ++ bch_err(c, "incorrect encryption key"); ++ ret = -EINVAL; ++ goto err; ++ } ++out: ++ *key = sb_key.key; ++err: ++ memzero_explicit(&sb_key, sizeof(sb_key)); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ return ret; ++} ++ ++static int bch2_alloc_ciphers(struct bch_fs *c) ++{ ++ if (!c->chacha20) ++ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ if (IS_ERR(c->chacha20)) { ++ bch_err(c, "error requesting chacha20 module: %li", ++ PTR_ERR(c->chacha20)); ++ return PTR_ERR(c->chacha20); ++ } ++ ++ if (!c->poly1305) ++ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); ++ if (IS_ERR(c->poly1305)) { ++ bch_err(c, "error requesting poly1305 module: %li", ++ PTR_ERR(c->poly1305)); ++ return PTR_ERR(c->poly1305); ++ } ++ ++ return 0; ++} ++ ++int bch2_disable_encryption(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ /* is key encrypted? */ ++ ret = 0; ++ if (bch2_key_is_encrypted(&crypt->key)) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ crypt->key.magic = BCH_KEY_MAGIC; ++ crypt->key.key = key; ++ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_enable_encryption(struct bch_fs *c, bool keyed) ++{ ++ struct bch_encrypted_key key; ++ struct bch_key user_key; ++ struct bch_sb_field_crypt *crypt; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ /* Do we already have an encryption key? 
*/ ++ if (bch2_sb_get_crypt(c->disk_sb.sb)) ++ goto err; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto err; ++ ++ key.magic = BCH_KEY_MAGIC; ++ get_random_bytes(&key.key, sizeof(key.key)); ++ ++ if (keyed) { ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %i", ret); ++ goto err; ++ } ++ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &key, sizeof(key)); ++ if (ret) ++ goto err; ++ } ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto err; ++ ++ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); ++ if (!crypt) { ++ ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ ++ goto err; ++ } ++ ++ crypt->key = key; ++ ++ /* write superblock */ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); ++ bch2_write_super(c); ++err: ++ mutex_unlock(&c->sb_lock); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ memzero_explicit(&key, sizeof(key)); ++ return ret; ++} ++ ++void bch2_fs_encryption_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->poly1305)) ++ crypto_free_shash(c->poly1305); ++ if (!IS_ERR_OR_NULL(c->chacha20)) ++ crypto_free_sync_skcipher(c->chacha20); ++ if (!IS_ERR_OR_NULL(c->sha256)) ++ crypto_free_shash(c->sha256); ++} ++ ++int bch2_fs_encryption_init(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->sha256 = crypto_alloc_shash("sha256", 0, 0); ++ if (IS_ERR(c->sha256)) { ++ bch_err(c, "error requesting sha256 module"); ++ ret = PTR_ERR(c->sha256); ++ goto out; ++ } ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto out; ++out: ++ memzero_explicit(&key, sizeof(key)); ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +new file mode 100644 +index 000000000000..b84e81bac8ff +--- /dev/null ++++ b/fs/bcachefs/checksum.h +@@ -0,0 +1,199 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHECKSUM_H ++#define _BCACHEFS_CHECKSUM_H ++ ++#include "bcachefs.h" ++#include "extents_types.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static inline bool bch2_checksum_mergeable(unsigned type) ++{ ++ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, ++ struct bch_csum, size_t); ++ ++#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) ++#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) ++#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) ++#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) ++#define BCH_NONCE_POLY cpu_to_le32(1 << 31) ++ ++struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, ++ const void *, size_t); ++ ++/* ++ * This is used for various on disk data structures - bch_sb, prio_set, bset, ++ * jset: The checksum is _always_ the first field of these structs ++ */ ++#define csum_vstruct(_c, _type, _nonce, _i) \ ++({ \ ++ const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ ++ const void *end = vstruct_end(_i); \ ++ \ ++ bch2_checksum(_c, _type, _nonce, start, end - start); \ ++}) 
++ ++int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); ++int bch2_request_key(struct bch_sb *, struct bch_key *); ++ ++void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, ++ void *data, size_t); ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, ++ struct bch_extent_crc_unpacked, ++ struct bch_extent_crc_unpacked *, ++ struct bch_extent_crc_unpacked *, ++ unsigned, unsigned, unsigned); ++ ++void bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, ++ struct bch_key *); ++ ++int bch2_disable_encryption(struct bch_fs *); ++int bch2_enable_encryption(struct bch_fs *, bool); ++ ++void bch2_fs_encryption_exit(struct bch_fs *); ++int bch2_fs_encryption_init(struct bch_fs *); ++ ++static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, ++ bool data) ++{ ++ switch (type) { ++ case BCH_CSUM_OPT_NONE: ++ return BCH_CSUM_NONE; ++ case BCH_CSUM_OPT_CRC32C: ++ return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; ++ case BCH_CSUM_OPT_CRC64: ++ return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; ++ default: ++ BUG(); ++ } ++} ++ ++static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, ++ unsigned opt) ++{ ++ if (c->sb.encryption_type) ++ return c->opts.wide_macs ++ ? BCH_CSUM_CHACHA20_POLY1305_128 ++ : BCH_CSUM_CHACHA20_POLY1305_80; ++ ++ return bch2_csum_opt_to_type(opt, true); ++} ++ ++static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) ++{ ++ if (c->sb.encryption_type) ++ return BCH_CSUM_CHACHA20_POLY1305_128; ++ ++ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); ++} ++ ++static const unsigned bch2_compression_opt_to_type[] = { ++#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t, ++ BCH_COMPRESSION_TYPES() ++#undef x ++}; ++ ++static inline bool bch2_checksum_type_valid(const struct bch_fs *c, ++ unsigned type) ++{ ++ if (type >= BCH_CSUM_NR) ++ return false; ++ ++ if (bch2_csum_type_is_encryption(type) && !c->chacha20) ++ return false; ++ ++ return true; ++} ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) ++{ ++ /* ++ * XXX: need some way of preventing the compiler from optimizing this ++ * into a form that isn't constant time.. ++ */ ++ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; ++} ++ ++/* for skipping ahead and encrypting/decrypting at an offset: */ ++static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) ++{ ++ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); ++ ++ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); ++ return nonce; ++} ++ ++static inline struct nonce null_nonce(void) ++{ ++ struct nonce ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ return ret; ++} ++ ++static inline struct nonce extent_nonce(struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ unsigned size = crc.compression_type ? 
crc.uncompressed_size : 0; ++ struct nonce nonce = (struct nonce) {{ ++ [0] = cpu_to_le32(size << 22), ++ [1] = cpu_to_le32(version.lo), ++ [2] = cpu_to_le32(version.lo >> 32), ++ [3] = cpu_to_le32(version.hi| ++ (crc.compression_type << 24))^BCH_NONCE_EXTENT, ++ }}; ++ ++ return nonce_add(nonce, crc.nonce << 9); ++} ++ ++static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) ++{ ++ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; ++} ++ ++static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) ++{ ++ __le64 magic = __bch2_sb_magic(sb); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) ++{ ++ __le64 magic = bch2_sb_magic(c); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++#endif /* _BCACHEFS_CHECKSUM_H */ +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +new file mode 100644 +index 000000000000..8ac6990c6971 +--- /dev/null ++++ b/fs/bcachefs/clock.c +@@ -0,0 +1,180 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "clock.h" ++ ++#include ++#include ++#include ++ ++static inline long io_timer_cmp(io_timer_heap *h, ++ struct io_timer *l, ++ struct io_timer *r) ++{ ++ return l->expire - r->expire; ++} ++ ++void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) ++ goto out; ++ ++ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); ++out: ++ spin_unlock(&clock->timer_lock); ++} ++ ++void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) { ++ heap_del(&clock->timers, i, io_timer_cmp, NULL); ++ break; ++ } ++ ++ spin_unlock(&clock->timer_lock); ++} ++ ++struct io_clock_wait { ++ struct io_timer io_timer; ++ struct timer_list cpu_timer; ++ struct task_struct *task; ++ int expired; ++}; ++ ++static void io_clock_wait_fn(struct io_timer *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, io_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++static void io_clock_cpu_timeout(struct timer_list *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, cpu_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) ++{ ++ struct io_clock_wait wait; ++ ++ /* XXX: calculate sleep time rigorously */ ++ wait.io_timer.expire = until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ bch2_io_timer_add(clock, &wait.io_timer); ++ ++ schedule(); ++ ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++void bch2_kthread_io_clock_wait(struct io_clock *clock, ++ unsigned long io_until, ++ unsigned long cpu_timeout) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct io_clock_wait wait; ++ ++ wait.io_timer.expire = io_until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ bch2_io_timer_add(clock, &wait.io_timer); ++ ++ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); ++ ++ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) ++ 
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ if (wait.expired) ++ break; ++ ++ schedule(); ++ try_to_freeze(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ del_singleshot_timer_sync(&wait.cpu_timer); ++ destroy_timer_on_stack(&wait.cpu_timer); ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++static struct io_timer *get_expired_timer(struct io_clock *clock, ++ unsigned long now) ++{ ++ struct io_timer *ret = NULL; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (clock->timers.used && ++ time_after_eq(now, clock->timers.data[0]->expire)) ++ heap_pop(&clock->timers, ret, io_timer_cmp, NULL); ++ ++ spin_unlock(&clock->timer_lock); ++ ++ return ret; ++} ++ ++void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw) ++{ ++ struct io_clock *clock = &c->io_clock[rw]; ++ struct io_timer *timer; ++ unsigned long now; ++ ++ /* Buffer up one megabyte worth of IO in the percpu counter */ ++ preempt_disable(); ++ ++ if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) < ++ IO_CLOCK_PCPU_SECTORS)) { ++ preempt_enable(); ++ return; ++ } ++ ++ sectors = this_cpu_xchg(*clock->pcpu_buf, 0); ++ preempt_enable(); ++ now = atomic_long_add_return(sectors, &clock->now); ++ ++ while ((timer = get_expired_timer(clock, now))) ++ timer->fn(timer); ++} ++ ++void bch2_io_clock_exit(struct io_clock *clock) ++{ ++ free_heap(&clock->timers); ++ free_percpu(clock->pcpu_buf); ++} ++ ++int bch2_io_clock_init(struct io_clock *clock) ++{ ++ atomic_long_set(&clock->now, 0); ++ spin_lock_init(&clock->timer_lock); ++ ++ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); ++ if (!clock->pcpu_buf) ++ return -ENOMEM; ++ ++ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h +new file mode 100644 +index 000000000000..5cb043c579d8 +--- /dev/null ++++ b/fs/bcachefs/clock.h +@@ -0,0 +1,25 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_H ++#define _BCACHEFS_CLOCK_H ++ ++void bch2_io_timer_add(struct io_clock *, struct io_timer *); ++void bch2_io_timer_del(struct io_clock *, struct io_timer *); ++void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, ++ unsigned long); ++void bch2_increment_clock(struct bch_fs *, unsigned, int); ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); ++ ++#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ ++({ \ ++ long __ret = timeout; \ ++ might_sleep(); \ ++ if (!___wait_cond_timeout(condition)) \ ++ __ret = __wait_event_timeout(wq, condition, timeout); \ ++ __ret; \ ++}) ++ ++void bch2_io_clock_exit(struct io_clock *); ++int bch2_io_clock_init(struct io_clock *); ++ ++#endif /* _BCACHEFS_CLOCK_H */ +diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h +new file mode 100644 +index 000000000000..2b5e499e12b4 +--- /dev/null ++++ b/fs/bcachefs/clock_types.h +@@ -0,0 +1,36 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_TYPES_H ++#define _BCACHEFS_CLOCK_TYPES_H ++ ++#include "util.h" ++ ++#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) ++ ++/* ++ * Clocks/timers in units of sectors of IO: ++ * ++ * Note - they use percpu batching, so they're only approximate. 
++ */ ++ ++struct io_timer; ++typedef void (*io_timer_fn)(struct io_timer *); ++ ++struct io_timer { ++ io_timer_fn fn; ++ unsigned long expire; ++}; ++ ++/* Amount to buffer up on a percpu counter */ ++#define IO_CLOCK_PCPU_SECTORS 128 ++ ++typedef HEAP(struct io_timer *) io_timer_heap; ++ ++struct io_clock { ++ atomic_long_t now; ++ u16 __percpu *pcpu_buf; ++ ++ spinlock_t timer_lock; ++ io_timer_heap timers; ++}; ++ ++#endif /* _BCACHEFS_CLOCK_TYPES_H */ +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +new file mode 100644 +index 000000000000..8f0f35b13c79 +--- /dev/null ++++ b/fs/bcachefs/compress.c +@@ -0,0 +1,623 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "compress.h" ++#include "extents.h" ++#include "io.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++ ++/* Bounce buffer: */ ++struct bbuf { ++ void *b; ++ enum { ++ BB_NONE, ++ BB_VMAP, ++ BB_KMALLOC, ++ BB_VMALLOC, ++ BB_MEMPOOL, ++ } type; ++ int rw; ++}; ++ ++static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) ++{ ++ void *b; ++ ++ BUG_ON(size > c->sb.encoded_extent_max << 9); ++ ++ b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; ++ ++ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT); ++ b = b ? page_address(b) : NULL; ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; ++ ++ b = vmalloc(size); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw }; ++ ++ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); ++ b = b ? page_address(b) : NULL; ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; ++ ++ BUG(); ++} ++ ++static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, ++ struct bvec_iter start, int rw) ++{ ++ struct bbuf ret; ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ unsigned nr_pages = 0; ++ struct page *stack_pages[16]; ++ struct page **pages = NULL; ++ bool first = true; ++ unsigned prev_end = PAGE_SIZE; ++ void *data; ++ ++ BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); ++ ++#ifndef CONFIG_HIGHMEM ++ __bio_for_each_bvec(bv, bio, iter, start) { ++ if (bv.bv_len == start.bi_size) ++ return (struct bbuf) { ++ .b = page_address(bv.bv_page) + bv.bv_offset, ++ .type = BB_NONE, .rw = rw ++ }; ++ } ++#endif ++ __bio_for_each_segment(bv, bio, iter, start) { ++ if ((!first && bv.bv_offset) || ++ prev_end != PAGE_SIZE) ++ goto bounce; ++ ++ prev_end = bv.bv_offset + bv.bv_len; ++ nr_pages++; ++ } ++ ++ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); ++ ++ pages = nr_pages > ARRAY_SIZE(stack_pages) ++ ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) ++ : stack_pages; ++ if (!pages) ++ goto bounce; ++ ++ nr_pages = 0; ++ __bio_for_each_segment(bv, bio, iter, start) ++ pages[nr_pages++] = bv.bv_page; ++ ++ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); ++ if (pages != stack_pages) ++ kfree(pages); ++ ++ if (data) ++ return (struct bbuf) { ++ .b = data + bio_iter_offset(bio, start), ++ .type = BB_VMAP, .rw = rw ++ }; ++bounce: ++ ret = __bounce_alloc(c, start.bi_size, rw); ++ ++ if (rw == READ) ++ memcpy_from_bio(ret.b, bio, start); ++ ++ return ret; ++} ++ ++static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) ++{ ++ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); ++} ++ ++static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) ++{ ++ switch (buf.type) { ++ case BB_NONE: ++ break; ++ case BB_VMAP: ++ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); ++ break; ++ case BB_KMALLOC: ++ kfree(buf.b); ++ break; ++ case BB_VMALLOC: ++ vfree(buf.b); ++ break; ++ case BB_MEMPOOL: ++ mempool_free(virt_to_page(buf.b), ++ &c->compression_bounce[buf.rw]); ++ break; ++ } ++} ++ ++static inline void zlib_set_workspace(z_stream *strm, void *workspace) ++{ ++#ifdef __KERNEL__ ++ strm->workspace = workspace; ++#endif ++} ++ ++static int __bio_uncompress(struct bch_fs *c, struct bio *src, ++ void *dst_data, struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf src_data = { NULL }; ++ size_t src_len = src->bi_iter.bi_size; ++ size_t dst_len = crc.uncompressed_size << 9; ++ void *workspace; ++ int ret; ++ ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ switch (crc.compression_type) { ++ case BCH_COMPRESSION_LZ4_OLD: ++ case BCH_COMPRESSION_LZ4: ++ ret = LZ4_decompress_safe_partial(src_data.b, dst_data, ++ src_len, dst_len, dst_len); ++ if (ret != dst_len) ++ goto err; ++ break; ++ case BCH_COMPRESSION_GZIP: { ++ z_stream strm = { ++ .next_in = src_data.b, ++ .avail_in = src_len, ++ .next_out = dst_data, ++ .avail_out = dst_len, ++ }; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_inflateInit2(&strm, -MAX_WBITS); ++ ret = zlib_inflate(&strm, Z_FINISH); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != Z_STREAM_END) ++ goto err; ++ break; ++ } ++ case BCH_COMPRESSION_ZSTD: { ++ ZSTD_DCtx *ctx; ++ size_t len; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); ++ ++ src_len = le32_to_cpup(src_data.b); ++ ++ ret = zstd_decompress_dctx(ctx, ++ dst_data, dst_len, ++ src_data.b + 4, src_len); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (len != dst_len) ++ goto err; ++ break; ++ } ++ default: ++ BUG(); ++ } ++ ret = 0; ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ return ret; ++err: ++ ret = -EIO; ++ goto out; ++} ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, ++ struct bch_extent_crc_unpacked *crc) ++{ ++ struct bbuf data = { NULL }; ++ size_t dst_len = crc->uncompressed_size << 9; ++ ++ /* bio must own its pages: */ ++ BUG_ON(!bio->bi_vcnt); ++ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); ++ ++ if (crc->uncompressed_size > c->sb.encoded_extent_max || ++ crc->compressed_size > c->sb.encoded_extent_max) { ++ bch_err(c, "error rewriting existing data: extent too big"); ++ return -EIO; ++ } ++ ++ data = __bounce_alloc(c, dst_len, WRITE); ++ ++ if (__bio_uncompress(c, bio, data.b, *crc)) { ++ 
bch_err(c, "error rewriting existing data: decompression error"); ++ bio_unmap_or_unbounce(c, data); ++ return -EIO; ++ } ++ ++ /* ++ * XXX: don't have a good way to assert that the bio was allocated with ++ * enough space, we depend on bch2_move_extent doing the right thing ++ */ ++ bio->bi_iter.bi_size = crc->live_size << 9; ++ ++ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); ++ ++ crc->csum_type = 0; ++ crc->compression_type = 0; ++ crc->compressed_size = crc->live_size; ++ crc->uncompressed_size = crc->live_size; ++ crc->offset = 0; ++ crc->csum = (struct bch_csum) { 0, 0 }; ++ ++ bio_unmap_or_unbounce(c, data); ++ return 0; ++} ++ ++int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, ++ struct bio *dst, struct bvec_iter dst_iter, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf dst_data = { NULL }; ++ size_t dst_len = crc.uncompressed_size << 9; ++ int ret = -ENOMEM; ++ ++ if (crc.uncompressed_size > c->sb.encoded_extent_max || ++ crc.compressed_size > c->sb.encoded_extent_max) ++ return -EIO; ++ ++ dst_data = dst_len == dst_iter.bi_size ++ ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) ++ : __bounce_alloc(c, dst_len, WRITE); ++ ++ ret = __bio_uncompress(c, src, dst_data.b, crc); ++ if (ret) ++ goto err; ++ ++ if (dst_data.type != BB_NONE) ++ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); ++err: ++ bio_unmap_or_unbounce(c, dst_data); ++ return ret; ++} ++ ++static int attempt_compress(struct bch_fs *c, ++ void *workspace, ++ void *dst, size_t dst_len, ++ void *src, size_t src_len, ++ unsigned compression_type) ++{ ++ switch (compression_type) { ++ case BCH_COMPRESSION_LZ4: { ++ int len = src_len; ++ int ret = LZ4_compress_destSize( ++ src, dst, ++ &len, dst_len, ++ workspace); ++ ++ if (len < src_len) ++ return -len; ++ ++ return ret; ++ } ++ case BCH_COMPRESSION_GZIP: { ++ z_stream strm = { ++ .next_in = src, ++ .avail_in = src_len, ++ .next_out = dst, ++ .avail_out = dst_len, ++ }; ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, ++ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, ++ Z_DEFAULT_STRATEGY); ++ ++ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) ++ return 0; ++ ++ if (zlib_deflateEnd(&strm) != Z_OK) ++ return 0; ++ ++ return strm.total_out; ++ } ++ case BCH_COMPRESSION_ZSTD: { ++ ZSTD_CCtx *ctx = zstd_init_cctx(workspace, ++ zstd_cctx_workspace_bound(&c->zstd_params.cParams)); ++ ++ size_t len = zstd_compress_cctx(ctx, ++ dst + 4, dst_len - 4, ++ src, src_len, ++ &c->zstd_params); ++ if (zstd_is_error(len)) ++ return 0; ++ ++ *((__le32 *) dst) = cpu_to_le32(len); ++ return len + 4; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned __bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ unsigned compression_type) ++{ ++ struct bbuf src_data = { NULL }, dst_data = { NULL }; ++ void *workspace; ++ unsigned pad; ++ int ret = 0; ++ ++ BUG_ON(compression_type >= BCH_COMPRESSION_NR); ++ BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); ++ ++ /* If it's only one block, don't bother trying to compress: */ ++ if (bio_sectors(src) <= c->opts.block_size) ++ return 0; ++ ++ dst_data = bio_map_or_bounce(c, dst, WRITE); ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); ++ ++ *src_len = src->bi_iter.bi_size; ++ *dst_len = dst->bi_iter.bi_size; ++ ++ /* ++ * XXX: this algorithm sucks when the compression code doesn't tell us ++ * how much 
would fit, like LZ4 does: ++ */ ++ while (1) { ++ if (*src_len <= block_bytes(c)) { ++ ret = -1; ++ break; ++ } ++ ++ ret = attempt_compress(c, workspace, ++ dst_data.b, *dst_len, ++ src_data.b, *src_len, ++ compression_type); ++ if (ret > 0) { ++ *dst_len = ret; ++ ret = 0; ++ break; ++ } ++ ++ /* Didn't fit: should we retry with a smaller amount? */ ++ if (*src_len <= *dst_len) { ++ ret = -1; ++ break; ++ } ++ ++ /* ++ * If ret is negative, it's a hint as to how much data would fit ++ */ ++ BUG_ON(-ret >= *src_len); ++ ++ if (ret < 0) ++ *src_len = -ret; ++ else ++ *src_len -= (*src_len - *dst_len) / 2; ++ *src_len = round_down(*src_len, block_bytes(c)); ++ } ++ ++ mempool_free(workspace, &c->compress_workspace[compression_type]); ++ ++ if (ret) ++ goto err; ++ ++ /* Didn't get smaller: */ ++ if (round_up(*dst_len, block_bytes(c)) >= *src_len) ++ goto err; ++ ++ pad = round_up(*dst_len, block_bytes(c)) - *dst_len; ++ ++ memset(dst_data.b + *dst_len, 0, pad); ++ *dst_len += pad; ++ ++ if (dst_data.type != BB_NONE) ++ memcpy_to_bio(dst, dst->bi_iter, dst_data.b); ++ ++ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); ++ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); ++ BUG_ON(*dst_len & (block_bytes(c) - 1)); ++ BUG_ON(*src_len & (block_bytes(c) - 1)); ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ bio_unmap_or_unbounce(c, dst_data); ++ return compression_type; ++err: ++ compression_type = 0; ++ goto out; ++} ++ ++unsigned bch2_bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ unsigned compression_type) ++{ ++ unsigned orig_dst = dst->bi_iter.bi_size; ++ unsigned orig_src = src->bi_iter.bi_size; ++ ++ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ ++ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, ++ c->sb.encoded_extent_max << 9); ++ /* Don't generate a bigger output than input: */ ++ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ ++ if (compression_type == BCH_COMPRESSION_LZ4_OLD) ++ compression_type = BCH_COMPRESSION_LZ4; ++ ++ compression_type = ++ __bio_compress(c, dst, dst_len, src, src_len, compression_type); ++ ++ dst->bi_iter.bi_size = orig_dst; ++ src->bi_iter.bi_size = orig_src; ++ return compression_type; ++} ++ ++static int __bch2_fs_compress_init(struct bch_fs *, u64); ++ ++#define BCH_FEATURE_NONE 0 ++ ++static const unsigned bch2_compression_opt_to_feature[] = { ++#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, ++ BCH_COMPRESSION_TYPES() ++#undef x ++}; ++ ++#undef BCH_FEATURE_NONE ++ ++static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) ++{ ++ int ret = 0; ++ ++ if ((c->sb.features & f) == f) ++ return 0; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if ((c->sb.features & f) == f) { ++ mutex_unlock(&c->sb_lock); ++ return 0; ++ } ++ ++ ret = __bch2_fs_compress_init(c, c->sb.features|f); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ret; ++ } ++ ++ c->disk_sb.sb->features[0] |= cpu_to_le64(f); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *c, ++ unsigned compression_type) ++{ ++ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); ++ ++ return compression_type ++ ? 
__bch2_check_set_has_compressed_data(c, ++ 1ULL << bch2_compression_opt_to_feature[compression_type]) ++ : 0; ++} ++ ++void bch2_fs_compress_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ mempool_exit(&c->decompress_workspace); ++ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) ++ mempool_exit(&c->compress_workspace[i]); ++ mempool_exit(&c->compression_bounce[WRITE]); ++ mempool_exit(&c->compression_bounce[READ]); ++} ++ ++static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ++{ ++ size_t max_extent = c->sb.encoded_extent_max << 9; ++ size_t order = get_order(max_extent); ++ size_t decompress_workspace_size = 0; ++ bool decompress_workspace_needed; ++ ZSTD_parameters params = zstd_get_params(0, max_extent); ++ struct { ++ unsigned feature; ++ unsigned type; ++ size_t compress_workspace; ++ size_t decompress_workspace; ++ } compression_types[] = { ++ { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 }, ++ { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP, ++ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), ++ zlib_inflate_workspacesize(), }, ++ { BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD, ++ zstd_cctx_workspace_bound(¶ms.cParams), ++ zstd_dctx_workspace_bound() }, ++ }, *i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->zstd_params = params; ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) ++ if (features & (1 << i->feature)) ++ goto have_compressed; ++ ++ goto out; ++have_compressed: ++ ++ if (!mempool_initialized(&c->compression_bounce[READ])) { ++ ret = mempool_init_page_pool(&c->compression_bounce[READ], ++ 1, order); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->compression_bounce[WRITE])) { ++ ret = mempool_init_page_pool(&c->compression_bounce[WRITE], ++ 1, order); ++ if (ret) ++ goto out; ++ } ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) { ++ decompress_workspace_size = ++ max(decompress_workspace_size, i->decompress_workspace); ++ ++ if (!(features & (1 << i->feature))) ++ continue; ++ ++ if (i->decompress_workspace) ++ decompress_workspace_needed = true; ++ ++ if (mempool_initialized(&c->compress_workspace[i->type])) ++ continue; ++ ++ ret = mempool_init_kvpmalloc_pool( ++ &c->compress_workspace[i->type], ++ 1, i->compress_workspace); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->decompress_workspace)) { ++ ret = mempool_init_kmalloc_pool( ++ &c->decompress_workspace, ++ 1, decompress_workspace_size); ++ if (ret) ++ goto out; ++ } ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++int bch2_fs_compress_init(struct bch_fs *c) ++{ ++ u64 f = c->sb.features; ++ ++ if (c->opts.compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; ++ ++ if (c->opts.background_compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; ++ ++ return __bch2_fs_compress_init(c, f); ++ ++} +diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h +new file mode 100644 +index 000000000000..4bab1f61b3b5 +--- /dev/null ++++ b/fs/bcachefs/compress.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_COMPRESS_H ++#define _BCACHEFS_COMPRESS_H ++ ++#include "extents_types.h" ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, ++ struct bch_extent_crc_unpacked *); ++int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, ++ struct bvec_iter, struct bch_extent_crc_unpacked); ++unsigned 
bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, ++ struct bio *, size_t *, unsigned); ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); ++void bch2_fs_compress_exit(struct bch_fs *); ++int bch2_fs_compress_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_COMPRESS_H */ +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +new file mode 100644 +index 000000000000..69b123bad83b +--- /dev/null ++++ b/fs/bcachefs/debug.c +@@ -0,0 +1,432 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Assorted bcachefs debug code ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "super.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++static struct dentry *bch_debug; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ struct btree *v = c->verify_data; ++ struct btree_node *n_ondisk, *n_sorted, *n_inmemory; ++ struct bset *sorted, *inmemory; ++ struct extent_ptr_decoded pick; ++ struct bch_dev *ca; ++ struct bio *bio; ++ ++ if (c->opts.nochanges) ++ return; ++ ++ btree_node_io_lock(b); ++ mutex_lock(&c->verify_lock); ++ ++ n_ondisk = c->verify_ondisk; ++ n_sorted = c->verify_data->data; ++ n_inmemory = b->data; ++ ++ bkey_copy(&v->key, &b->key); ++ v->written = 0; ++ v->level = b->level; ++ v->btree_id = b->btree_id; ++ bch2_btree_keys_init(v, &c->expensive_debug_checks); ++ ++ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ++ NULL, &pick) <= 0) ++ return; ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ if (!bch2_dev_get_ioref(ca, READ)) ++ return; ++ ++ bio = bio_alloc_bioset(GFP_NOIO, ++ buf_pages(n_sorted, btree_bytes(c)), ++ &c->btree_bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_opf = REQ_OP_READ|REQ_META; ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bch2_bio_map(bio, n_sorted, btree_bytes(c)); ++ ++ submit_bio_wait(bio); ++ ++ bio_put(bio); ++ percpu_ref_put(&ca->io_ref); ++ ++ memcpy(n_ondisk, n_sorted, btree_bytes(c)); ++ ++ if (bch2_btree_node_read_done(c, v, false)) ++ goto out; ++ ++ n_sorted = c->verify_data->data; ++ sorted = &n_sorted->keys; ++ inmemory = &n_inmemory->keys; ++ ++ if (inmemory->u64s != sorted->u64s || ++ memcmp(inmemory->start, ++ sorted->start, ++ vstruct_end(inmemory) - (void *) inmemory->start)) { ++ unsigned offset = 0, sectors; ++ struct bset *i; ++ unsigned j; ++ ++ console_lock(); ++ ++ printk(KERN_ERR "*** in memory:\n"); ++ bch2_dump_bset(b, inmemory, 0); ++ ++ printk(KERN_ERR "*** read back in:\n"); ++ bch2_dump_bset(v, sorted, 0); ++ ++ while (offset < b->written) { ++ if (!offset ) { ++ i = &n_ondisk->keys; ++ sectors = vstruct_blocks(n_ondisk, c->block_bits) << ++ c->block_bits; ++ } else { ++ struct btree_node_entry *bne = ++ (void *) n_ondisk + (offset << 9); ++ i = &bne->keys; ++ ++ sectors = vstruct_blocks(bne, c->block_bits) << ++ c->block_bits; ++ } ++ ++ printk(KERN_ERR "*** on disk block %u:\n", offset); ++ bch2_dump_bset(b, i, offset); ++ ++ offset += sectors; ++ } ++ ++ printk(KERN_ERR "*** block %u/%u not written\n", ++ offset >> c->block_bits, btree_blocks(c)); ++ ++ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) ++ if (inmemory->_data[j] != sorted->_data[j]) ++ break; ++ ++ 
printk(KERN_ERR "b->written %u\n", b->written); ++ ++ console_unlock(); ++ panic("verify failed at %u\n", j); ++ } ++out: ++ mutex_unlock(&c->verify_lock); ++ btree_node_io_unlock(b); ++} ++ ++#endif ++ ++#ifdef CONFIG_DEBUG_FS ++ ++/* XXX: bch_fs refcounting */ ++ ++struct dump_iter { ++ struct bpos from; ++ struct bch_fs *c; ++ enum btree_id id; ++ ++ char buf[PAGE_SIZE]; ++ size_t bytes; /* what's currently in buf */ ++ ++ char __user *ubuf; /* destination user buffer */ ++ size_t size; /* size of requested read */ ++ ssize_t ret; /* bytes read so far */ ++}; ++ ++static int flush_buf(struct dump_iter *i) ++{ ++ if (i->bytes) { ++ size_t bytes = min(i->bytes, i->size); ++ int err = copy_to_user(i->ubuf, i->buf, bytes); ++ ++ if (err) ++ return err; ++ ++ i->ret += bytes; ++ i->ubuf += bytes; ++ i->size -= bytes; ++ i->bytes -= bytes; ++ memmove(i->buf, i->buf + bytes, i->bytes); ++ } ++ ++ return 0; ++} ++ ++static int bch2_dump_open(struct inode *inode, struct file *file) ++{ ++ struct btree_debug *bd = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ if (!i) ++ return -ENOMEM; ++ ++ file->private_data = i; ++ i->from = POS_MIN; ++ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); ++ i->id = bd->id; ++ ++ return 0; ++} ++ ++static int bch2_dump_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static ssize_t bch2_read_btree(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); ++ k = bch2_btree_iter_peek(iter); ++ ++ while (k.k && !(err = bkey_err(k))) { ++ bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); ++ i->bytes = strlen(i->buf); ++ BUG_ON(i->bytes >= PAGE_SIZE); ++ i->buf[i->bytes] = '\n'; ++ i->bytes++; ++ ++ k = bch2_btree_iter_next(iter); ++ i->from = iter->pos; ++ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? err : i->ret; ++} ++ ++static const struct file_operations btree_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree, ++}; ++ ++static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size || !bkey_cmp(POS_MAX, i->from)) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { ++ bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ /* ++ * can't easily correctly restart a btree node traversal across ++ * all nodes, meh ++ */ ++ i->from = bkey_cmp(POS_MAX, b->key.k.p) ++ ? bkey_successor(b->key.k.p) ++ : b->key.k.p; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? 
err : i->ret; ++} ++ ++static const struct file_operations btree_format_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree_formats, ++}; ++ ++static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct btree *prev_node = NULL; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(err = bkey_err(k))) { ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_packed *_k = ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++ ++ if (l->b != prev_node) { ++ bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ } ++ prev_node = l->b; ++ ++ bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ bch2_btree_iter_next(iter); ++ i->from = iter->pos; ++ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? err : i->ret; ++} ++ ++static const struct file_operations bfloat_failed_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_bfloat_failed, ++}; ++ ++void bch2_fs_debug_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->debug)) ++ debugfs_remove_recursive(c->debug); ++} ++ ++void bch2_fs_debug_init(struct bch_fs *c) ++{ ++ struct btree_debug *bd; ++ char name[100]; ++ ++ if (IS_ERR_OR_NULL(bch_debug)) ++ return; ++ ++ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); ++ c->debug = debugfs_create_dir(name, bch_debug); ++ if (IS_ERR_OR_NULL(c->debug)) ++ return; ++ ++ for (bd = c->btree_debug; ++ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); ++ bd++) { ++ bd->id = bd - c->btree_debug; ++ bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], ++ 0400, c->debug, bd, ++ &btree_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-formats", ++ bch2_btree_ids[bd->id]); ++ ++ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, ++ &btree_format_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-bfloat-failed", ++ bch2_btree_ids[bd->id]); ++ ++ bd->failed = debugfs_create_file(name, 0400, c->debug, bd, ++ &bfloat_failed_debug_ops); ++ } ++} ++ ++#endif ++ ++void bch2_debug_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_debug)) ++ debugfs_remove_recursive(bch_debug); ++} ++ ++int __init bch2_debug_init(void) ++{ ++ int ret = 0; ++ ++ bch_debug = debugfs_create_dir("bcachefs", NULL); ++ return ret; ++} +diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h +new file mode 100644 +index 000000000000..56c2d1ab5f63 +--- /dev/null ++++ b/fs/bcachefs/debug.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DEBUG_H ++#define _BCACHEFS_DEBUG_H ++ ++#include "bcachefs.h" ++ ++struct bio; ++struct btree; ++struct bch_fs; ++ ++#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) \ ++ { return bch2_##name 
|| c->name; } ++BCH_DEBUG_PARAMS_ALWAYS() ++#undef BCH_DEBUG_PARAM ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) \ ++ { return bch2_##name || c->name; } ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++ ++void __bch2_btree_verify(struct bch_fs *, struct btree *); ++ ++#define bypass_torture_test(d) ((d)->bypass_torture_test) ++ ++#else /* DEBUG */ ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) { return false; } ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++ ++static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} ++ ++#define bypass_torture_test(d) 0 ++ ++#endif ++ ++static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ if (verify_btree_ondisk(c)) ++ __bch2_btree_verify(c, b); ++} ++ ++#ifdef CONFIG_DEBUG_FS ++void bch2_fs_debug_exit(struct bch_fs *); ++void bch2_fs_debug_init(struct bch_fs *); ++#else ++static inline void bch2_fs_debug_exit(struct bch_fs *c) {} ++static inline void bch2_fs_debug_init(struct bch_fs *c) {} ++#endif ++ ++void bch2_debug_exit(void); ++int bch2_debug_init(void); ++ ++#endif /* _BCACHEFS_DEBUG_H */ +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +new file mode 100644 +index 000000000000..38017699c04a +--- /dev/null ++++ b/fs/bcachefs/dirent.c +@@ -0,0 +1,386 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "dirent.h" ++#include "fs.h" ++#include "keylist.h" ++#include "str_hash.h" ++ ++#include ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) ++{ ++ unsigned len = bkey_val_bytes(d.k) - ++ offsetof(struct bch_dirent, d_name); ++ ++ return strnlen(d.v->d_name, len); ++} ++ ++static u64 bch2_dirent_hash(const struct bch_hash_info *info, ++ const struct qstr *name) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, name->name, name->len); ++ ++ /* [0,2) reserved for dots */ ++ return max_t(u64, bch2_str_hash_end(&ctx, info), 2); ++} ++ ++static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_dirent_hash(info, key); ++} ++ ++static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); ++ ++ return bch2_dirent_hash(info, &name); ++} ++ ++static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ int len = bch2_dirent_name_bytes(l); ++ const struct qstr *r = _r; ++ ++ return len - r->len ?: memcmp(l.v->d_name, r->name, len); ++} ++ ++static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); ++ int l_len = bch2_dirent_name_bytes(l); ++ int r_len = bch2_dirent_name_bytes(r); ++ ++ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); ++} ++ ++const struct bch_hash_desc bch2_dirent_hash_desc = { ++ .btree_id = BTREE_ID_DIRENTS, ++ .key_type = KEY_TYPE_dirent, ++ .hash_key = dirent_hash_key, ++ .hash_bkey = dirent_hash_bkey, ++ .cmp_key = dirent_cmp_key, ++ .cmp_bkey = dirent_cmp_bkey, ++}; ++ ++const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = 
bkey_s_c_to_dirent(k); ++ unsigned len; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) ++ return "value too small"; ++ ++ len = bch2_dirent_name_bytes(d); ++ if (!len) ++ return "empty name"; ++ ++ /* ++ * older versions of bcachefs were buggy and creating dirent ++ * keys that were bigger than necessary: ++ */ ++ if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7)) ++ return "value too big"; ++ ++ if (len > BCH_NAME_MAX) ++ return "dirent name too big"; ++ ++ return NULL; ++} ++ ++void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ ++ bch_scnmemcpy(out, d.v->d_name, ++ bch2_dirent_name_bytes(d)); ++ pr_buf(out, " -> %llu", d.v->d_inum); ++} ++ ++static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, ++ u8 type, const struct qstr *name, u64 dst) ++{ ++ struct bkey_i_dirent *dirent; ++ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); ++ ++ if (name->len > BCH_NAME_MAX) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ BUG_ON(u64s > U8_MAX); ++ ++ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(dirent)) ++ return dirent; ++ ++ bkey_dirent_init(&dirent->k_i); ++ dirent->k.u64s = u64s; ++ dirent->v.d_inum = cpu_to_le64(dst); ++ dirent->v.d_type = type; ++ ++ memcpy(dirent->v.d_name, name->name, name->len); ++ memset(dirent->v.d_name + name->len, 0, ++ bkey_val_bytes(&dirent->k) - ++ offsetof(struct bch_dirent, d_name) - ++ name->len); ++ ++ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); ++ ++ return dirent; ++} ++ ++int bch2_dirent_create(struct btree_trans *trans, ++ u64 dir_inum, const struct bch_hash_info *hash_info, ++ u8 type, const struct qstr *name, u64 dst_inum, ++ int flags) ++{ ++ struct bkey_i_dirent *dirent; ++ int ret; ++ ++ dirent = dirent_create_key(trans, type, name, dst_inum); ++ ret = PTR_ERR_OR_ZERO(dirent); ++ if (ret) ++ return ret; ++ ++ return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, ++ dir_inum, &dirent->k_i, flags); ++} ++ ++static void dirent_copy_target(struct bkey_i_dirent *dst, ++ struct bkey_s_c_dirent src) ++{ ++ dst->v.d_inum = src.v->d_inum; ++ dst->v.d_type = src.v->d_type; ++} ++ ++int bch2_dirent_rename(struct btree_trans *trans, ++ u64 src_dir, struct bch_hash_info *src_hash, ++ u64 dst_dir, struct bch_hash_info *dst_hash, ++ const struct qstr *src_name, u64 *src_inum, ++ const struct qstr *dst_name, u64 *dst_inum, ++ enum bch_rename_mode mode) ++{ ++ struct btree_iter *src_iter, *dst_iter; ++ struct bkey_s_c old_src, old_dst; ++ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; ++ struct bpos dst_pos = ++ POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); ++ int ret; ++ ++ *src_inum = *dst_inum = 0; ++ ++ /* ++ * Lookup dst: ++ * ++ * Note that in BCH_RENAME mode, we're _not_ checking if ++ * the target already exists - we're relying on the VFS ++ * to do that check for us for correctness: ++ */ ++ dst_iter = mode == BCH_RENAME ++ ? 
bch2_hash_hole(trans, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name) ++ : bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(dst_iter)) ++ return PTR_ERR(dst_iter); ++ old_dst = bch2_btree_iter_peek_slot(dst_iter); ++ ++ if (mode != BCH_RENAME) ++ *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); ++ ++ /* Lookup src: */ ++ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ src_hash, src_dir, src_name, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(src_iter)) ++ return PTR_ERR(src_iter); ++ old_src = bch2_btree_iter_peek_slot(src_iter); ++ *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); ++ ++ /* Create new dst key: */ ++ new_dst = dirent_create_key(trans, 0, dst_name, 0); ++ if (IS_ERR(new_dst)) ++ return PTR_ERR(new_dst); ++ ++ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); ++ new_dst->k.p = dst_iter->pos; ++ ++ /* Create new src key: */ ++ if (mode == BCH_RENAME_EXCHANGE) { ++ new_src = dirent_create_key(trans, 0, src_name, 0); ++ if (IS_ERR(new_src)) ++ return PTR_ERR(new_src); ++ ++ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); ++ new_src->k.p = src_iter->pos; ++ } else { ++ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ if (IS_ERR(new_src)) ++ return PTR_ERR(new_src); ++ bkey_init(&new_src->k); ++ new_src->k.p = src_iter->pos; ++ ++ if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && ++ bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { ++ /* ++ * We have a hash collision for the new dst key, ++ * and new_src - the key we're deleting - is between ++ * new_dst's hashed slot and the slot we're going to be ++ * inserting it into - oops. This will break the hash ++ * table if we don't deal with it: ++ */ ++ if (mode == BCH_RENAME) { ++ /* ++ * If we're not overwriting, we can just insert ++ * new_dst at the src position: ++ */ ++ new_dst->k.p = src_iter->pos; ++ bch2_trans_update(trans, src_iter, ++ &new_dst->k_i); ++ return 0; ++ } else { ++ /* If we're overwriting, we can't insert new_dst ++ * at a different slot because it has to ++ * overwrite old_dst - just make sure to use a ++ * whiteout when deleting src: ++ */ ++ new_src->k.type = KEY_TYPE_whiteout; ++ } ++ } else { ++ /* Check if we need a whiteout to delete src: */ ++ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, ++ src_hash, src_iter); ++ if (ret < 0) ++ return ret; ++ ++ if (ret) ++ new_src->k.type = KEY_TYPE_whiteout; ++ } ++ } ++ ++ bch2_trans_update(trans, src_iter, &new_src->k_i); ++ bch2_trans_update(trans, dst_iter, &new_dst->k_i); ++ return 0; ++} ++ ++int bch2_dirent_delete_at(struct btree_trans *trans, ++ const struct bch_hash_info *hash_info, ++ struct btree_iter *iter) ++{ ++ return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ hash_info, iter); ++} ++ ++int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, ++ u64 *journal_seq) ++{ ++ return bch2_trans_do(c, journal_seq, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL, ++ bch2_hash_delete(&trans, bch2_dirent_hash_desc, hash_info, ++ dir_inum, name)); ++} ++ ++struct btree_iter * ++__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, unsigned flags) ++{ ++ return bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ hash_info, dir_inum, name, flags); ++} ++ ++u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, ++ const struct bch_hash_info *hash_info, ++ const struct 
qstr *name) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 inum = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = __bch2_dirent_lookup_trans(&trans, dir_inum, ++ hash_info, name, 0); ++ if (IS_ERR(iter)) { ++ BUG_ON(PTR_ERR(iter) == -EINTR); ++ goto out; ++ } ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); ++out: ++ bch2_trans_exit(&trans); ++ return inum; ++} ++ ++int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, ++ POS(dir_inum, 0), 0, k, ret) { ++ if (k.k->p.inode > dir_inum) ++ break; ++ ++ if (k.k->type == KEY_TYPE_dirent) { ++ ret = -ENOTEMPTY; ++ break; ++ } ++ } ++ bch2_trans_iter_put(trans, iter); ++ ++ return ret; ++} ++ ++int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent dirent; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, ++ POS(inum, ctx->pos), 0, k, ret) { ++ if (k.k->p.inode > inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ dirent = bkey_s_c_to_dirent(k); ++ ++ /* ++ * XXX: dir_emit() can fault and block, while we're holding ++ * locks ++ */ ++ ctx->pos = dirent.k->p.offset; ++ if (!dir_emit(ctx, dirent.v->d_name, ++ bch2_dirent_name_bytes(dirent), ++ le64_to_cpu(dirent.v->d_inum), ++ dirent.v->d_type)) ++ break; ++ ctx->pos = dirent.k->p.offset + 1; ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ return ret; ++} +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +new file mode 100644 +index 000000000000..e6184dc796d3 +--- /dev/null ++++ b/fs/bcachefs/dirent.h +@@ -0,0 +1,65 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DIRENT_H ++#define _BCACHEFS_DIRENT_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_dirent_hash_desc; ++ ++const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_dirent (struct bkey_ops) { \ ++ .key_invalid = bch2_dirent_invalid, \ ++ .val_to_text = bch2_dirent_to_text, \ ++} ++ ++struct qstr; ++struct file; ++struct dir_context; ++struct bch_fs; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); ++ ++static inline unsigned dirent_val_u64s(unsigned len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, ++ sizeof(u64)); ++} ++ ++int bch2_dirent_create(struct btree_trans *, u64, ++ const struct bch_hash_info *, u8, ++ const struct qstr *, u64, int); ++ ++int bch2_dirent_delete_at(struct btree_trans *, ++ const struct bch_hash_info *, ++ struct btree_iter *); ++int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, ++ const struct qstr *, u64 *); ++ ++enum bch_rename_mode { ++ BCH_RENAME, ++ BCH_RENAME_OVERWRITE, ++ BCH_RENAME_EXCHANGE, ++}; ++ ++int bch2_dirent_rename(struct btree_trans *, ++ u64, struct bch_hash_info *, ++ u64, struct bch_hash_info *, ++ const struct qstr *, u64 *, ++ const struct qstr *, u64 *, ++ enum bch_rename_mode); ++ ++struct btree_iter * ++__bch2_dirent_lookup_trans(struct btree_trans *, u64, ++ const struct bch_hash_info *, ++ const struct qstr *, unsigned); ++u64 bch2_dirent_lookup(struct bch_fs *, u64, 
const struct bch_hash_info *, ++ const struct qstr *); ++ ++int bch2_empty_dir_trans(struct btree_trans *, u64); ++int bch2_readdir(struct bch_fs *, u64, struct dir_context *); ++ ++#endif /* _BCACHEFS_DIRENT_H */ +diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c +new file mode 100644 +index 000000000000..4a4ec8f46108 +--- /dev/null ++++ b/fs/bcachefs/disk_groups.c +@@ -0,0 +1,481 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "disk_groups.h" ++#include "super-io.h" ++ ++#include ++ ++static int group_cmp(const void *_l, const void *_r) ++{ ++ const struct bch_disk_group *l = _l; ++ const struct bch_disk_group *r = _r; ++ ++ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - ++ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: ++ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - ++ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: ++ strncmp(l->label, r->label, sizeof(l->label)); ++} ++ ++static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g, *sorted = NULL; ++ struct bch_sb_field_members *mi; ++ struct bch_member *m; ++ unsigned i, nr_groups, len; ++ const char *err = NULL; ++ ++ mi = bch2_sb_get_members(sb); ++ groups = bch2_sb_get_disk_groups(sb); ++ nr_groups = disk_groups_nr(groups); ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) { ++ unsigned g; ++ ++ if (!BCH_MEMBER_GROUP(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (g >= nr_groups || ++ BCH_GROUP_DELETED(&groups->entries[g])) ++ return "disk has invalid group"; ++ } ++ ++ if (!nr_groups) ++ return NULL; ++ ++ for (g = groups->entries; ++ g < groups->entries + nr_groups; ++ g++) { ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ len = strnlen(g->label, sizeof(g->label)); ++ if (!len) { ++ err = "group with empty label"; ++ goto err; ++ } ++ } ++ ++ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); ++ if (!sorted) ++ return "cannot allocate memory"; ++ ++ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); ++ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); ++ ++ for (i = 0; i + 1 < nr_groups; i++) ++ if (!BCH_GROUP_DELETED(sorted + i) && ++ !group_cmp(sorted + i, sorted + i + 1)) { ++ err = "duplicate groups"; ++ goto err; ++ } ++ ++ err = NULL; ++err: ++ kfree(sorted); ++ return err; ++} ++ ++static void bch2_sb_disk_groups_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g; ++ unsigned nr_groups = disk_groups_nr(groups); ++ ++ for (g = groups->entries; ++ g < groups->entries + nr_groups; ++ g++) { ++ if (g != groups->entries) ++ pr_buf(out, " "); ++ ++ if (BCH_GROUP_DELETED(g)) ++ pr_buf(out, "[deleted]"); ++ else ++ pr_buf(out, "[parent %llu name %s]", ++ BCH_GROUP_PARENT(g), g->label); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { ++ .validate = bch2_sb_disk_groups_validate, ++ .to_text = bch2_sb_disk_groups_to_text ++}; ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field_disk_groups *groups; ++ struct bch_disk_groups_cpu *cpu_g, *old_g; ++ unsigned i, g, nr_groups; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ groups = bch2_sb_get_disk_groups(c->disk_sb.sb); ++ nr_groups = 
disk_groups_nr(groups); ++ ++ if (!groups) ++ return 0; ++ ++ cpu_g = kzalloc(sizeof(*cpu_g) + ++ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); ++ if (!cpu_g) ++ return -ENOMEM; ++ ++ cpu_g->nr = nr_groups; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct bch_disk_group *src = &groups->entries[i]; ++ struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; ++ ++ dst->deleted = BCH_GROUP_DELETED(src); ++ dst->parent = BCH_GROUP_PARENT(src); ++ } ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ struct bch_disk_group_cpu *dst = ++ &cpu_g->entries[BCH_MEMBER_GROUP(m)]; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m); ++ while (g) { ++ dst = &cpu_g->entries[g - 1]; ++ __set_bit(i, dst->devs.d); ++ g = dst->parent; ++ } ++ } ++ ++ old_g = rcu_dereference_protected(c->disk_groups, ++ lockdep_is_held(&c->sb_lock)); ++ rcu_assign_pointer(c->disk_groups, cpu_g); ++ if (old_g) ++ kfree_rcu(old_g, rcu); ++ ++ return 0; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return NULL; ++ case TARGET_DEV: { ++ struct bch_dev *ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ return ca ? &ca->self : NULL; ++ } ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); ++ ++ return t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return false; ++ case TARGET_DEV: ++ return dev == t.dev; ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g; ++ const struct bch_devs_mask *m; ++ bool ret; ++ ++ rcu_read_lock(); ++ g = rcu_dereference(c->disk_groups); ++ m = t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ ++ ret = m ? 
test_bit(dev, m->d) : false; ++ rcu_read_unlock(); ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, ++ unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct bch_disk_group *g = groups->entries + i; ++ ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ if (!BCH_GROUP_DELETED(g) && ++ BCH_GROUP_PARENT(g) == parent && ++ strnlen(g->label, sizeof(g->label)) == namelen && ++ !memcmp(name, g->label, namelen)) ++ return i; ++ } ++ ++ return -1; ++} ++ ++static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ struct bch_disk_group *g; ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; ++ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); ++ i++) ++ ; ++ ++ if (i == nr_groups) { ++ unsigned u64s = ++ (sizeof(struct bch_sb_field_disk_groups) + ++ sizeof(struct bch_disk_group) * (nr_groups + 1)) / ++ sizeof(u64); ++ ++ groups = bch2_sb_resize_disk_groups(sb, u64s); ++ if (!groups) ++ return -ENOSPC; ++ ++ nr_groups = disk_groups_nr(groups); ++ } ++ ++ BUG_ON(i >= nr_groups); ++ ++ g = &groups->entries[i]; ++ ++ memcpy(g->label, name, namelen); ++ if (namelen < sizeof(g->label)) ++ g->label[namelen] = '\0'; ++ SET_BCH_GROUP_DELETED(g, 0); ++ SET_BCH_GROUP_PARENT(g, parent); ++ SET_BCH_GROUP_DATA_ALLOWED(g, ~0); ++ ++ return i; ++} ++ ++int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ v = __bch2_disk_group_find(groups, v + 1, name, len); ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups; ++ unsigned parent = 0; ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ groups = bch2_sb_get_disk_groups(sb->sb); ++ ++ v = __bch2_disk_group_find(groups, parent, name, len); ++ if (v < 0) ++ v = __bch2_disk_group_add(sb, parent, name, len); ++ if (v < 0) ++ return v; ++ ++ parent = v + 1; ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++void bch2_disk_path_to_text(struct printbuf *out, ++ struct bch_sb_handle *sb, ++ unsigned v) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ struct bch_disk_group *g; ++ unsigned nr = 0; ++ u16 path[32]; ++ ++ while (1) { ++ if (nr == ARRAY_SIZE(path)) ++ goto inval; ++ ++ if (v >= disk_groups_nr(groups)) ++ goto inval; ++ ++ g = groups->entries + v; ++ ++ if (BCH_GROUP_DELETED(g)) ++ goto inval; ++ ++ path[nr++] = v; ++ ++ if (!BCH_GROUP_PARENT(g)) ++ break; ++ ++ v = BCH_GROUP_PARENT(g) - 1; ++ } ++ ++ while (nr) { ++ v = path[--nr]; ++ g = groups->entries + v; ++ ++ bch_scnmemcpy(out, g->label, ++ strnlen(g->label, sizeof(g->label))); ++ ++ if (nr) ++ pr_buf(out, "."); ++ } ++ return; ++inval: ++ pr_buf(out, "invalid group %u", v); ++} ++ ++int 
bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) ++{ ++ struct bch_member *mi; ++ int v = -1; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (!strlen(name) || !strcmp(name, "none")) ++ goto write_sb; ++ ++ v = bch2_disk_path_find_or_create(&c->disk_sb, name); ++ if (v < 0) { ++ mutex_unlock(&c->sb_lock); ++ return v; ++ } ++ ++write_sb: ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ SET_BCH_MEMBER_GROUP(mi, v + 1); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) ++{ ++ struct bch_dev *ca; ++ int g; ++ ++ if (!strlen(buf) || !strcmp(buf, "none")) { ++ *v = 0; ++ return 0; ++ } ++ ++ /* Is it a device? */ ++ ca = bch2_dev_lookup(c, buf); ++ if (!IS_ERR(ca)) { ++ *v = dev_to_target(ca->dev_idx); ++ percpu_ref_put(&ca->ref); ++ return 0; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ g = bch2_disk_path_find(&c->disk_sb, buf); ++ mutex_unlock(&c->sb_lock); ++ ++ if (g >= 0) { ++ *v = group_to_target(g); ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) ++{ ++ struct target t = target_decode(v); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ pr_buf(out, "none"); ++ break; ++ case TARGET_DEV: { ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ ++ if (ca && percpu_ref_tryget(&ca->io_ref)) { ++ char b[BDEVNAME_SIZE]; ++ ++ pr_buf(out, "/dev/%s", ++ bdevname(ca->disk_sb.bdev, b)); ++ percpu_ref_put(&ca->io_ref); ++ } else if (ca) { ++ pr_buf(out, "offline device %u", t.dev); ++ } else { ++ pr_buf(out, "invalid device %u", t.dev); ++ } ++ ++ rcu_read_unlock(); ++ break; ++ } ++ case TARGET_GROUP: ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, &c->disk_sb, t.group); ++ mutex_unlock(&c->sb_lock); ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h +new file mode 100644 +index 000000000000..c8e0c37a5e1a +--- /dev/null ++++ b/fs/bcachefs/disk_groups.h +@@ -0,0 +1,88 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DISK_GROUPS_H ++#define _BCACHEFS_DISK_GROUPS_H ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; ++ ++static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) ++{ ++ return groups ++ ? 
(vstruct_end(&groups->field) - ++ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) ++ : 0; ++} ++ ++struct target { ++ enum { ++ TARGET_NULL, ++ TARGET_DEV, ++ TARGET_GROUP, ++ } type; ++ union { ++ unsigned dev; ++ unsigned group; ++ }; ++}; ++ ++#define TARGET_DEV_START 1 ++#define TARGET_GROUP_START (256 + TARGET_DEV_START) ++ ++static inline u16 dev_to_target(unsigned dev) ++{ ++ return TARGET_DEV_START + dev; ++} ++ ++static inline u16 group_to_target(unsigned group) ++{ ++ return TARGET_GROUP_START + group; ++} ++ ++static inline struct target target_decode(unsigned target) ++{ ++ if (target >= TARGET_GROUP_START) ++ return (struct target) { ++ .type = TARGET_GROUP, ++ .group = target - TARGET_GROUP_START ++ }; ++ ++ if (target >= TARGET_DEV_START) ++ return (struct target) { ++ .type = TARGET_DEV, ++ .group = target - TARGET_DEV_START ++ }; ++ ++ return (struct target) { .type = TARGET_NULL }; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); ++ ++static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, ++ enum bch_data_type data_type, ++ u16 target) ++{ ++ struct bch_devs_mask devs = c->rw_devs[data_type]; ++ const struct bch_devs_mask *t = bch2_target_to_mask(c, target); ++ ++ if (t) ++ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); ++ return devs; ++} ++ ++bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); ++ ++int bch2_disk_path_find(struct bch_sb_handle *, const char *); ++int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); ++void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, ++ unsigned); ++ ++int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); ++void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *); ++ ++int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); ++ ++const char *bch2_sb_validate_disk_groups(struct bch_sb *, ++ struct bch_sb_field *); ++ ++#endif /* _BCACHEFS_DISK_GROUPS_H */ +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +new file mode 100644 +index 000000000000..47a11a2d69dd +--- /dev/null ++++ b/fs/bcachefs/ec.c +@@ -0,0 +1,1401 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++/* erasure coding */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "keylist.h" ++#include "recovery.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++ ++static void raid5_recov(unsigned disks, unsigned failed_idx, ++ size_t size, void **data) ++{ ++ unsigned i = 2, nr; ++ ++ BUG_ON(failed_idx >= disks); ++ ++ swap(data[0], data[failed_idx]); ++ memcpy(data[0], data[1], size); ++ ++ while (i < disks) { ++ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); ++ xor_blocks(nr, size, data[0], data + i); ++ i += nr; ++ } ++ ++ swap(data[0], data[failed_idx]); ++} ++ ++static void raid_gen(int nd, int np, size_t size, void **v) ++{ ++ if (np >= 1) ++ raid5_recov(nd + np, nd, size, v); ++ if (np >= 2) ++ raid6_call.gen_syndrome(nd + np, size, v); ++ BUG_ON(np > 2); ++} ++ ++static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) ++{ ++ switch (nr) { ++ case 0: ++ break; ++ case 1: ++ if (ir[0] < nd + 1) ++ raid5_recov(nd + 1, ir[0], size, v); ++ else ++ raid6_call.gen_syndrome(nd + np, size, v); ++ break; 
++ case 2: ++ if (ir[1] < nd) { ++ /* data+data failure. */ ++ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); ++ } else if (ir[0] < nd) { ++ /* data + p/q failure */ ++ ++ if (ir[1] == nd) /* data + p failure */ ++ raid6_datap_recov(nd + np, size, ir[0], v); ++ else { /* data + q failure */ ++ raid5_recov(nd + 1, ir[0], size, v); ++ raid6_call.gen_syndrome(nd + np, size, v); ++ } ++ } else { ++ raid_gen(nd, np, size, v); ++ } ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++#else ++ ++#include ++ ++#endif ++ ++struct ec_bio { ++ struct bch_dev *ca; ++ struct ec_stripe_buf *buf; ++ size_t idx; ++ struct bio bio; ++}; ++ ++/* Stripes btree keys: */ ++ ++const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ ++ if (k.k->p.inode) ++ return "invalid stripe key"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*s)) ++ return "incorrect value size"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*s) || ++ bkey_val_u64s(k.k) < stripe_val_u64s(s)) ++ return "incorrect value size"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ unsigned i; ++ ++ pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", ++ s->algorithm, ++ le16_to_cpu(s->sectors), ++ s->nr_blocks - s->nr_redundant, ++ s->nr_redundant, ++ s->csum_type, ++ 1U << s->csum_granularity_bits); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, ++ (u64) s->ptrs[i].offset, ++ stripe_blockcount_get(s, i)); ++} ++ ++static int ptr_matches_stripe(struct bch_fs *c, ++ struct bch_stripe *v, ++ const struct bch_extent_ptr *ptr) ++{ ++ unsigned i; ++ ++ for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { ++ const struct bch_extent_ptr *ptr2 = v->ptrs + i; ++ ++ if (ptr->dev == ptr2->dev && ++ ptr->gen == ptr2->gen && ++ ptr->offset >= ptr2->offset && ++ ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) ++ return i; ++ } ++ ++ return -1; ++} ++ ++static int extent_matches_stripe(struct bch_fs *c, ++ struct bch_stripe *v, ++ struct bkey_s_c k) ++{ ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const struct bch_extent_ptr *ptr; ++ int idx; ++ ++ extent_for_each_ptr(e, ptr) { ++ idx = ptr_matches_stripe(c, v, ptr); ++ if (idx >= 0) ++ return idx; ++ } ++ break; ++ } ++ } ++ ++ return -1; ++} ++ ++static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ ++ extent_for_each_entry(e, entry) ++ if (extent_entry_type(entry) == ++ BCH_EXTENT_ENTRY_stripe_ptr && ++ entry->stripe_ptr.idx == idx) ++ return true; ++ ++ break; ++ } ++ } ++ ++ return false; ++} ++ ++static void ec_stripe_key_init(struct bch_fs *c, ++ struct bkey_i_stripe *s, ++ struct open_buckets *blocks, ++ struct open_buckets *parity, ++ unsigned stripe_size) ++{ ++ struct open_bucket *ob; ++ unsigned i, u64s; ++ ++ bkey_stripe_init(&s->k_i); ++ s->v.sectors = cpu_to_le16(stripe_size); ++ s->v.algorithm = 0; ++ s->v.nr_blocks = parity->nr + blocks->nr; ++ s->v.nr_redundant = parity->nr; ++ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); ++ s->v.csum_type = BCH_CSUM_CRC32C; ++ s->v.pad = 0; ++ ++ open_bucket_for_each(c, blocks, ob, i) ++ s->v.ptrs[i] = ob->ptr; ++ ++ open_bucket_for_each(c, parity, ob, 
i) ++ s->v.ptrs[blocks->nr + i] = ob->ptr; ++ ++ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { ++ BUG_ON(1 << s->v.csum_granularity_bits >= ++ le16_to_cpu(s->v.sectors) || ++ s->v.csum_granularity_bits == U8_MAX); ++ s->v.csum_granularity_bits++; ++ } ++ ++ set_bkey_val_u64s(&s->k, u64s); ++} ++ ++/* Checksumming: */ ++ ++static void ec_generate_checksums(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned csums_per_device = stripe_csums_per_device(v); ++ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; ++ unsigned i, j; ++ ++ if (!csum_bytes) ++ return; ++ ++ BUG_ON(buf->offset); ++ BUG_ON(buf->size != le16_to_cpu(v->sectors)); ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ for (j = 0; j < csums_per_device; j++) { ++ unsigned offset = j << v->csum_granularity_bits; ++ unsigned len = min(csum_granularity, buf->size - offset); ++ ++ struct bch_csum csum = ++ bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[i] + (offset << 9), ++ len << 9); ++ ++ memcpy(stripe_csum(v, i, j), &csum, csum_bytes); ++ } ++ } ++} ++ ++static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; ++ unsigned i; ++ ++ if (!csum_bytes) ++ return; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ unsigned offset = buf->offset; ++ unsigned end = buf->offset + buf->size; ++ ++ if (!test_bit(i, buf->valid)) ++ continue; ++ ++ while (offset < end) { ++ unsigned j = offset >> v->csum_granularity_bits; ++ unsigned len = min(csum_granularity, end - offset); ++ struct bch_csum csum; ++ ++ BUG_ON(offset & (csum_granularity - 1)); ++ BUG_ON(offset + len != le16_to_cpu(v->sectors) && ++ ((offset + len) & (csum_granularity - 1))); ++ ++ csum = bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[i] + ((offset - buf->offset) << 9), ++ len << 9); ++ ++ if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { ++ __bcache_io_error(c, ++ "checksum error while doing reconstruct read (%u:%u)", ++ i, j); ++ clear_bit(i, buf->valid); ++ break; ++ } ++ ++ offset += len; ++ } ++ } ++} ++ ++/* Erasure coding: */ ++ ++static void ec_generate_ec(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = le16_to_cpu(v->sectors) << 9; ++ ++ raid_gen(nr_data, v->nr_redundant, bytes, buf->data); ++} ++ ++static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) ++{ ++ return nr - bitmap_weight(buf->valid, nr); ++} ++ ++static unsigned ec_nr_failed(struct ec_stripe_buf *buf) ++{ ++ return __ec_nr_failed(buf, buf->key.v.nr_blocks); ++} ++ ++static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = buf->size << 9; ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ return -1; ++ } ++ ++ for (i = 0; i < nr_data; i++) ++ if (!test_bit(i, buf->valid)) ++ failed[nr_failed++] = i; ++ ++ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); ++ return 0; ++} ++ ++/* IO: */ ++ ++static void ec_block_endio(struct bio *bio) ++{ ++ struct ec_bio *ec_bio = container_of(bio, struct 
ec_bio, bio); ++ struct bch_dev *ca = ec_bio->ca; ++ struct closure *cl = bio->bi_private; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding")) ++ clear_bit(ec_bio->idx, ec_bio->buf->valid); ++ ++ bio_put(&ec_bio->bio); ++ percpu_ref_put(&ca->io_ref); ++ closure_put(cl); ++} ++ ++static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ++ unsigned rw, unsigned idx, struct closure *cl) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned offset = 0, bytes = buf->size << 9; ++ struct bch_extent_ptr *ptr = &v->ptrs[idx]; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (!bch2_dev_get_ioref(ca, rw)) { ++ clear_bit(idx, buf->valid); ++ return; ++ } ++ ++ while (offset < bytes) { ++ unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, ++ DIV_ROUND_UP(bytes, PAGE_SIZE)); ++ unsigned b = min_t(size_t, bytes - offset, ++ nr_iovecs << PAGE_SHIFT); ++ struct ec_bio *ec_bio; ++ ++ ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs, ++ &c->ec_bioset), ++ struct ec_bio, bio); ++ ++ ec_bio->ca = ca; ++ ec_bio->buf = buf; ++ ec_bio->idx = idx; ++ ++ bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev); ++ bio_set_op_attrs(&ec_bio->bio, rw, 0); ++ ++ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ++ ec_bio->bio.bi_end_io = ec_block_endio; ++ ec_bio->bio.bi_private = cl; ++ ++ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); ++ ++ closure_get(cl); ++ percpu_ref_get(&ca->io_ref); ++ ++ submit_bio(&ec_bio->bio); ++ ++ offset += b; ++ } ++ ++ percpu_ref_put(&ca->io_ref); ++} ++ ++/* recovery read path: */ ++int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct ec_stripe_buf *buf; ++ struct closure cl; ++ struct bkey_s_c k; ++ struct bch_stripe *v; ++ unsigned stripe_idx; ++ unsigned offset, end; ++ unsigned i, nr_data, csum_granularity; ++ int ret = 0, idx; ++ ++ closure_init_stack(&cl); ++ ++ BUG_ON(!rbio->pick.has_ec); ++ ++ stripe_idx = rbio->pick.ec.idx; ++ ++ buf = kzalloc(sizeof(*buf), GFP_NOIO); ++ if (!buf) ++ return -ENOMEM; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, ++ POS(0, stripe_idx), ++ BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: stripe not found"); ++ kfree(buf); ++ return bch2_trans_exit(&trans) ?: -EIO; ++ } ++ ++ bkey_reassemble(&buf->key.k_i, k); ++ bch2_trans_exit(&trans); ++ ++ v = &buf->key.v; ++ ++ nr_data = v->nr_blocks - v->nr_redundant; ++ ++ idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); ++ BUG_ON(idx < 0); ++ ++ csum_granularity = 1U << v->csum_granularity_bits; ++ ++ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; ++ end = offset + bio_sectors(&rbio->bio); ++ ++ BUG_ON(end > le16_to_cpu(v->sectors)); ++ ++ buf->offset = round_down(offset, csum_granularity); ++ buf->size = min_t(unsigned, le16_to_cpu(v->sectors), ++ round_up(end, csum_granularity)) - buf->offset; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); ++ if (!buf->data[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ memset(buf->valid, 0xFF, sizeof(buf->valid)); ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ struct bch_extent_ptr *ptr = v->ptrs + i; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ptr_stale(ca, ptr)) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: stale pointer"); ++ 
clear_bit(i, buf->valid); ++ continue; ++ } ++ ++ ec_block_io(c, buf, REQ_OP_READ, i, &cl); ++ } ++ ++ closure_sync(&cl); ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ec_validate_checksums(c, buf); ++ ++ ret = ec_do_recov(c, buf); ++ if (ret) ++ goto err; ++ ++ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, ++ buf->data[idx] + ((offset - buf->offset) << 9)); ++err: ++ for (i = 0; i < v->nr_blocks; i++) ++ kfree(buf->data[i]); ++ kfree(buf); ++ return ret; ++} ++ ++/* stripe bucket accounting: */ ++ ++static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) ++{ ++ ec_stripes_heap n, *h = &c->ec_stripes_heap; ++ ++ if (idx >= h->size) { ++ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) ++ return -ENOMEM; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ if (n.size > h->size) { ++ memcpy(n.data, h->data, h->used * sizeof(h->data[0])); ++ n.used = h->used; ++ swap(*h, n); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ free_heap(&n); ++ } ++ ++ if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) ++ return -ENOMEM; ++ ++ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && ++ !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int ec_stripe_mem_alloc(struct bch_fs *c, ++ struct btree_iter *iter) ++{ ++ size_t idx = iter->pos.offset; ++ int ret = 0; ++ ++ if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) ++ return ret; ++ ++ bch2_trans_unlock(iter->trans); ++ ret = -EINTR; ++ ++ if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) ++ return ret; ++ ++ return -ENOMEM; ++} ++ ++static ssize_t stripe_idx_to_delete(struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ ++ return h->used && h->data[0].blocks_nonempty == 0 ++ ? 
h->data[0].idx : -1; ++} ++ ++static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, ++ struct ec_stripe_heap_entry l, ++ struct ec_stripe_heap_entry r) ++{ ++ return ((l.blocks_nonempty > r.blocks_nonempty) - ++ (l.blocks_nonempty < r.blocks_nonempty)); ++} ++ ++static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, ++ size_t i) ++{ ++ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); ++ ++ genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; ++} ++ ++static void heap_verify_backpointer(struct bch_fs *c, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m = genradix_ptr(&c->stripes[0], idx); ++ ++ BUG_ON(!m->alive); ++ BUG_ON(m->heap_idx >= h->used); ++ BUG_ON(h->data[m->heap_idx].idx != idx); ++} ++ ++void bch2_stripes_heap_update(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ size_t i; ++ ++ if (m->alive) { ++ heap_verify_backpointer(c, idx); ++ ++ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; ++ ++ i = m->heap_idx; ++ heap_sift_up(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ heap_sift_down(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++ } else { ++ bch2_stripes_heap_insert(c, m, idx); ++ } ++ ++ if (stripe_idx_to_delete(c) >= 0 && ++ !percpu_ref_is_dying(&c->writes)) ++ schedule_work(&c->ec_stripe_delete_work); ++} ++ ++void bch2_stripes_heap_del(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ heap_verify_backpointer(c, idx); ++ ++ m->alive = false; ++ heap_del(&c->ec_stripes_heap, m->heap_idx, ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++} ++ ++void bch2_stripes_heap_insert(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ BUG_ON(heap_full(&c->ec_stripes_heap)); ++ ++ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { ++ .idx = idx, ++ .blocks_nonempty = m->blocks_nonempty, ++ }), ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ m->alive = true; ++ ++ heap_verify_backpointer(c, idx); ++} ++ ++/* stripe deletion */ ++ ++static int ec_stripe_delete(struct bch_fs *c, size_t idx) ++{ ++ return bch2_btree_delete_range(c, BTREE_ID_EC, ++ POS(0, idx), ++ POS(0, idx + 1), ++ NULL); ++} ++ ++static void ec_stripe_delete_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, ec_stripe_delete_work); ++ ssize_t idx; ++ ++ down_read(&c->gc_lock); ++ mutex_lock(&c->ec_stripe_create_lock); ++ ++ while (1) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ idx = stripe_idx_to_delete(c); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ if (idx < 0) ++ break; ++ ++ if (ec_stripe_delete(c, idx)) ++ break; ++ } ++ ++ mutex_unlock(&c->ec_stripe_create_lock); ++ up_read(&c->gc_lock); ++} ++ ++/* stripe creation: */ ++ ++static int ec_stripe_bkey_insert(struct bch_fs *c, ++ struct bkey_i_stripe *stripe) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bpos start_pos = POS(0, c->ec_stripe_hint); ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { ++ if (start_pos.offset) { ++ start_pos = POS_MIN; ++ bch2_btree_iter_set_pos(iter, start_pos); ++ continue; ++ } ++ ++ ret = -ENOSPC; ++ break; ++ } ++ ++ if (bkey_deleted(k.k)) ++ goto found_slot; ++ } ++ ++ goto 
err; ++found_slot: ++ start_pos = iter->pos; ++ ++ ret = ec_stripe_mem_alloc(c, iter); ++ if (ret) ++ goto err; ++ ++ stripe->k.p = iter->pos; ++ ++ bch2_trans_update(&trans, iter, &stripe->k_i); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1; ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static void extent_stripe_ptr_add(struct bkey_s_extent e, ++ struct ec_stripe_buf *s, ++ struct bch_extent_ptr *ptr, ++ unsigned block) ++{ ++ struct bch_extent_stripe_ptr *dst = (void *) ptr; ++ union bch_extent_entry *end = extent_entry_last(e); ++ ++ memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); ++ e.k->u64s += sizeof(*dst) / sizeof(u64); ++ ++ *dst = (struct bch_extent_stripe_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, ++ .block = block, ++ .idx = s->key.k.p.offset, ++ }; ++} ++ ++static int ec_stripe_update_ptrs(struct bch_fs *c, ++ struct ec_stripe_buf *s, ++ struct bkey *pos) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_extent e; ++ struct bch_extent_ptr *ptr; ++ BKEY_PADDED(k) tmp; ++ int ret = 0, dev, idx; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ bkey_start_pos(pos), ++ BTREE_ITER_INTENT); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { ++ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ idx = extent_matches_stripe(c, &s->key.v, k); ++ if (idx < 0) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); ++ ++ dev = s->key.v.ptrs[idx].dev; ++ ++ bkey_reassemble(&tmp.k, k); ++ e = bkey_i_to_s_extent(&tmp.k); ++ ++ extent_for_each_ptr(e, ptr) ++ if (ptr->dev != dev) ++ ptr->cached = true; ++ ++ ptr = (void *) bch2_extent_has_device(e.c, dev); ++ BUG_ON(!ptr); ++ ++ extent_stripe_ptr_add(e, s, ptr, idx); ++ ++ bch2_trans_update(&trans, iter, &tmp.k); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++/* ++ * data buckets of new stripe all written: create the stripe ++ */ ++static void ec_stripe_create(struct ec_stripe_new *s) ++{ ++ struct bch_fs *c = s->c; ++ struct open_bucket *ob; ++ struct bkey_i *k; ++ struct bch_stripe *v = &s->stripe.key.v; ++ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; ++ struct closure cl; ++ int ret; ++ ++ BUG_ON(s->h->s == s); ++ ++ closure_init_stack(&cl); ++ ++ if (s->err) { ++ bch_err(c, "error creating stripe: error writing data buckets"); ++ goto err; ++ } ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ goto err; ++ ++ BUG_ON(bitmap_weight(s->blocks_allocated, ++ s->blocks.nr) != s->blocks.nr); ++ ++ ec_generate_ec(&s->stripe); ++ ++ ec_generate_checksums(&s->stripe); ++ ++ /* write p/q: */ ++ for (i = nr_data; i < v->nr_blocks; i++) ++ ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); ++ ++ closure_sync(&cl); ++ ++ for (i = nr_data; i < v->nr_blocks; i++) ++ if (!test_bit(i, s->stripe.valid)) { ++ bch_err(c, "error creating stripe: error writing redundancy buckets"); ++ goto err_put_writes; ++ } ++ ++ mutex_lock(&c->ec_stripe_create_lock); ++ ++ ret = 
ec_stripe_bkey_insert(c, &s->stripe.key); ++ if (ret) { ++ bch_err(c, "error creating stripe: error creating stripe key"); ++ goto err_unlock; ++ } ++ ++ for_each_keylist_key(&s->keys, k) { ++ ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); ++ if (ret) ++ break; ++ } ++ ++err_unlock: ++ mutex_unlock(&c->ec_stripe_create_lock); ++err_put_writes: ++ percpu_ref_put(&c->writes); ++err: ++ open_bucket_for_each(c, &s->blocks, ob, i) { ++ ob->ec = NULL; ++ __bch2_open_bucket_put(c, ob); ++ } ++ ++ bch2_open_buckets_put(c, &s->parity); ++ ++ bch2_keylist_free(&s->keys, s->inline_keys); ++ ++ mutex_lock(&s->h->lock); ++ list_del(&s->list); ++ mutex_unlock(&s->h->lock); ++ ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) ++ kvpfree(s->stripe.data[i], s->stripe.size << 9); ++ kfree(s); ++} ++ ++static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s = h->s; ++ ++ list_add(&s->list, &h->stripes); ++ h->s = NULL; ++ ++ return s; ++} ++ ++static void ec_stripe_new_put(struct ec_stripe_new *s) ++{ ++ BUG_ON(atomic_read(&s->pin) <= 0); ++ if (atomic_dec_and_test(&s->pin)) ++ ec_stripe_create(s); ++} ++ ++/* have a full bucket - hand it off to be erasure coded: */ ++void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ if (ob->sectors_free) ++ s->err = -1; ++ ++ ec_stripe_new_put(s); ++} ++ ++void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ s->err = -EIO; ++} ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); ++ struct bch_dev *ca; ++ unsigned offset; ++ ++ if (!ob) ++ return NULL; ++ ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ offset = ca->mi.bucket_size - ob->sectors_free; ++ ++ return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); ++} ++ ++void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, ++ struct bpos pos, unsigned sectors) ++{ ++ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); ++ struct ec_stripe_new *ec; ++ ++ if (!ob) ++ return; ++ ++ ec = ob->ec; ++ mutex_lock(&ec->lock); ++ ++ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, ++ ARRAY_SIZE(ec->inline_keys), ++ BKEY_U64s)) { ++ BUG(); ++ } ++ ++ bkey_init(&ec->keys.top->k); ++ ec->keys.top->k.p = pos; ++ bch2_key_resize(&ec->keys.top->k, sectors); ++ bch2_keylist_push(&ec->keys); ++ ++ mutex_unlock(&ec->lock); ++} ++ ++static int unsigned_cmp(const void *_l, const void *_r) ++{ ++ unsigned l = *((const unsigned *) _l); ++ unsigned r = *((const unsigned *) _r); ++ ++ return cmp_int(l, r); ++} ++ ++/* pick most common bucket size: */ ++static unsigned pick_blocksize(struct bch_fs *c, ++ struct bch_devs_mask *devs) ++{ ++ struct bch_dev *ca; ++ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; ++ struct { ++ unsigned nr, size; ++ } cur = { 0, 0 }, best = { 0, 0 }; ++ ++ for_each_member_device_rcu(ca, c, i, devs) ++ sizes[nr++] = ca->mi.bucket_size; ++ ++ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); ++ ++ for (i = 0; i < nr; i++) { ++ if (sizes[i] != cur.size) { ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ cur.nr = 0; ++ cur.size = sizes[i]; ++ } ++ ++ cur.nr++; ++ } ++ ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ return best.size; ++} ++ ++int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s; ++ unsigned i; ++ ++ BUG_ON(h->parity.nr != h->redundancy); ++ BUG_ON(!h->blocks.nr); ++ 
BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX); ++ lockdep_assert_held(&h->lock); ++ ++ s = kzalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ mutex_init(&s->lock); ++ atomic_set(&s->pin, 1); ++ s->c = c; ++ s->h = h; ++ s->blocks = h->blocks; ++ s->parity = h->parity; ++ ++ memset(&h->blocks, 0, sizeof(h->blocks)); ++ memset(&h->parity, 0, sizeof(h->parity)); ++ ++ bch2_keylist_init(&s->keys, s->inline_keys); ++ ++ s->stripe.offset = 0; ++ s->stripe.size = h->blocksize; ++ memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); ++ ++ ec_stripe_key_init(c, &s->stripe.key, ++ &s->blocks, &s->parity, ++ h->blocksize); ++ ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { ++ s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); ++ if (!s->stripe.data[i]) ++ goto err; ++ } ++ ++ h->s = s; ++ ++ return 0; ++err: ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) ++ kvpfree(s->stripe.data[i], s->stripe.size << 9); ++ kfree(s); ++ return -ENOMEM; ++} ++ ++static struct ec_stripe_head * ++ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, ++ unsigned algo, unsigned redundancy) ++{ ++ struct ec_stripe_head *h; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ h = kzalloc(sizeof(*h), GFP_KERNEL); ++ if (!h) ++ return NULL; ++ ++ mutex_init(&h->lock); ++ mutex_lock(&h->lock); ++ INIT_LIST_HEAD(&h->stripes); ++ ++ h->target = target; ++ h->algo = algo; ++ h->redundancy = redundancy; ++ ++ rcu_read_lock(); ++ h->devs = target_rw_devs(c, BCH_DATA_USER, target); ++ ++ for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (!ca->mi.durability) ++ __clear_bit(i, h->devs.d); ++ ++ h->blocksize = pick_blocksize(c, &h->devs); ++ ++ for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (ca->mi.bucket_size == h->blocksize) ++ h->nr_active_devs++; ++ ++ rcu_read_unlock(); ++ list_add(&h->list, &c->ec_new_stripe_list); ++ return h; ++} ++ ++void bch2_ec_stripe_head_put(struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s = NULL; ++ ++ if (h->s && ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->blocks.nr) == h->s->blocks.nr) ++ s = ec_stripe_set_pending(h); ++ ++ mutex_unlock(&h->lock); ++ ++ if (s) ++ ec_stripe_new_put(s); ++} ++ ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy) ++{ ++ struct ec_stripe_head *h; ++ ++ if (!redundancy) ++ return NULL; ++ ++ mutex_lock(&c->ec_new_stripe_lock); ++ list_for_each_entry(h, &c->ec_new_stripe_list, list) ++ if (h->target == target && ++ h->algo == algo && ++ h->redundancy == redundancy) { ++ mutex_lock(&h->lock); ++ goto found; ++ } ++ ++ h = ec_new_stripe_head_alloc(c, target, algo, redundancy); ++found: ++ mutex_unlock(&c->ec_new_stripe_lock); ++ return h; ++} ++ ++void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ mutex_lock(&c->ec_new_stripe_lock); ++ list_for_each_entry(h, &c->ec_new_stripe_list, list) { ++ struct ec_stripe_new *s = NULL; ++ ++ mutex_lock(&h->lock); ++ bch2_open_buckets_stop_dev(c, ca, &h->blocks); ++ bch2_open_buckets_stop_dev(c, ca, &h->parity); ++ ++ if (!h->s) ++ goto unlock; ++ ++ open_bucket_for_each(c, &h->s->blocks, ob, i) ++ if (ob->ptr.dev == ca->dev_idx) ++ goto found; ++ open_bucket_for_each(c, &h->s->parity, ob, i) ++ if (ob->ptr.dev == ca->dev_idx) ++ goto found; ++ goto unlock; ++found: ++ h->s->err = -1; ++ s = ec_stripe_set_pending(h); ++unlock: ++ mutex_unlock(&h->lock); ++ ++ if (s) ++ ec_stripe_new_put(s); ++ } ++ 
mutex_unlock(&c->ec_new_stripe_lock); ++} ++ ++static int __bch2_stripe_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct stripe *m, ++ size_t idx, ++ struct bkey_i_stripe *new_key, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ unsigned i; ++ int ret; ++ ++ bch2_btree_iter_set_pos(iter, POS(0, idx)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_stripe) ++ return -EIO; ++ ++ bkey_reassemble(&new_key->k_i, k); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ for (i = 0; i < new_key->v.nr_blocks; i++) ++ stripe_blockcount_set(&new_key->v, i, ++ m->block_sectors[i]); ++ m->dirty = false; ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ bch2_trans_update(trans, iter, &new_key->k_i); ++ ++ return bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|flags); ++} ++ ++int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct genradix_iter giter; ++ struct bkey_i_stripe *new_key; ++ struct stripe *m; ++ int ret = 0; ++ ++ new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); ++ BUG_ON(!new_key); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ genradix_for_each(&c->stripes[0], giter, m) { ++ if (!m->dirty) ++ continue; ++ ++ ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos, ++ new_key, flags); ++ if (ret) ++ break; ++ ++ *wrote = true; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ kfree(new_key); ++ ++ return ret; ++} ++ ++int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) ++{ ++ struct btree_trans trans; ++ struct btree_iter *btree_iter; ++ struct journal_iter journal_iter; ++ struct bkey_s_c btree_k, journal_k; ++ int ret; ++ ++ ret = bch2_fs_ec_start(c); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ btree_iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0); ++ journal_iter = bch2_journal_iter_init(journal_keys, BTREE_ID_EC); ++ ++ btree_k = bch2_btree_iter_peek(btree_iter); ++ journal_k = bch2_journal_iter_peek(&journal_iter); ++ ++ while (1) { ++ bool btree; ++ ++ if (btree_k.k && journal_k.k) { ++ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); ++ ++ if (!cmp) ++ btree_k = bch2_btree_iter_next(btree_iter); ++ btree = cmp < 0; ++ } else if (btree_k.k) { ++ btree = true; ++ } else if (journal_k.k) { ++ btree = false; ++ } else { ++ break; ++ } ++ ++ bch2_mark_key(c, btree ? 
btree_k : journal_k, ++ 0, 0, NULL, 0, ++ BCH_BUCKET_MARK_ALLOC_READ| ++ BCH_BUCKET_MARK_NOATOMIC); ++ ++ if (btree) ++ btree_k = bch2_btree_iter_next(btree_iter); ++ else ++ journal_k = bch2_journal_iter_next(&journal_iter); ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) { ++ bch_err(c, "error reading stripes: %i", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ size_t i, idx = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); ++ ++ k = bch2_btree_iter_prev(iter); ++ if (!IS_ERR_OR_NULL(k.k)) ++ idx = k.k->p.offset + 1; ++ ret = bch2_trans_exit(&trans); ++ if (ret) ++ return ret; ++ ++ if (!idx) ++ return 0; ++ ++ if (!gc && ++ !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), ++ GFP_KERNEL)) ++ return -ENOMEM; ++#if 0 ++ ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); ++#else ++ for (i = 0; i < idx; i++) ++ if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) ++ return -ENOMEM; ++#endif ++ return 0; ++} ++ ++int bch2_fs_ec_start(struct bch_fs *c) ++{ ++ return bch2_ec_mem_alloc(c, false); ++} ++ ++void bch2_fs_ec_exit(struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ ++ while (1) { ++ mutex_lock(&c->ec_new_stripe_lock); ++ h = list_first_entry_or_null(&c->ec_new_stripe_list, ++ struct ec_stripe_head, list); ++ if (h) ++ list_del(&h->list); ++ mutex_unlock(&c->ec_new_stripe_lock); ++ if (!h) ++ break; ++ ++ BUG_ON(h->s); ++ BUG_ON(!list_empty(&h->stripes)); ++ kfree(h); ++ } ++ ++ free_heap(&c->ec_stripes_heap); ++ genradix_free(&c->stripes[0]); ++ bioset_exit(&c->ec_bioset); ++} ++ ++int bch2_fs_ec_init(struct bch_fs *c) ++{ ++ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); ++ ++ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), ++ BIOSET_NEED_BVECS); ++} +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +new file mode 100644 +index 000000000000..8d9fbfd19f66 +--- /dev/null ++++ b/fs/bcachefs/ec.h +@@ -0,0 +1,164 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_H ++#define _BCACHEFS_EC_H ++ ++#include "ec_types.h" ++#include "keylist_types.h" ++ ++const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++#define bch2_bkey_ops_stripe (struct bkey_ops) { \ ++ .key_invalid = bch2_stripe_invalid, \ ++ .val_to_text = bch2_stripe_to_text, \ ++} ++ ++static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(le16_to_cpu(s->sectors), ++ 1 << s->csum_granularity_bits); ++} ++ ++static inline unsigned stripe_csum_offset(const struct bch_stripe *s, ++ unsigned dev, unsigned csum_idx) ++{ ++ unsigned csum_bytes = bch_crc_bytes[s->csum_type]; ++ ++ return sizeof(struct bch_stripe) + ++ sizeof(struct bch_extent_ptr) * s->nr_blocks + ++ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; ++} ++ ++static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return stripe_csum_offset(s, s->nr_blocks, 0) + ++ sizeof(u16) * idx; ++} ++ ++static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); ++} ++ ++static inline void stripe_blockcount_set(struct bch_stripe *s, ++ unsigned idx, unsigned v) ++{ ++ 
__le16 *p = (void *) s + stripe_blockcount_offset(s, idx); ++ ++ *p = cpu_to_le16(v); ++} ++ ++static inline unsigned stripe_val_u64s(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), ++ sizeof(u64)); ++} ++ ++static inline void *stripe_csum(struct bch_stripe *s, ++ unsigned dev, unsigned csum_idx) ++{ ++ return (void *) s + stripe_csum_offset(s, dev, csum_idx); ++} ++ ++struct bch_read_bio; ++ ++struct ec_stripe_buf { ++ /* might not be buffering the entire stripe: */ ++ unsigned offset; ++ unsigned size; ++ unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; ++ ++ void *data[EC_STRIPE_MAX]; ++ ++ union { ++ struct bkey_i_stripe key; ++ u64 pad[255]; ++ }; ++}; ++ ++struct ec_stripe_head; ++ ++struct ec_stripe_new { ++ struct bch_fs *c; ++ struct ec_stripe_head *h; ++ struct mutex lock; ++ struct list_head list; ++ ++ /* counts in flight writes, stripe is created when pin == 0 */ ++ atomic_t pin; ++ ++ int err; ++ ++ unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; ++ ++ struct open_buckets blocks; ++ struct open_buckets parity; ++ ++ struct keylist keys; ++ u64 inline_keys[BKEY_U64s * 8]; ++ ++ struct ec_stripe_buf stripe; ++}; ++ ++struct ec_stripe_head { ++ struct list_head list; ++ struct mutex lock; ++ ++ struct list_head stripes; ++ ++ unsigned target; ++ unsigned algo; ++ unsigned redundancy; ++ ++ struct bch_devs_mask devs; ++ unsigned nr_active_devs; ++ ++ unsigned blocksize; ++ ++ struct dev_stripe_state block_stripe; ++ struct dev_stripe_state parity_stripe; ++ ++ struct open_buckets blocks; ++ struct open_buckets parity; ++ ++ struct ec_stripe_new *s; ++}; ++ ++int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); ++void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, ++ struct bpos, unsigned); ++ ++void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); ++void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); ++ ++int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); ++ ++void bch2_ec_stripe_head_put(struct ec_stripe_head *); ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, ++ unsigned, unsigned); ++ ++void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); ++void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); ++void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); ++ ++void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); ++ ++void bch2_ec_flush_new_stripes(struct bch_fs *); ++ ++struct journal_keys; ++int bch2_stripes_read(struct bch_fs *, struct journal_keys *); ++int bch2_stripes_write(struct bch_fs *, unsigned, bool *); ++ ++int bch2_ec_mem_alloc(struct bch_fs *, bool); ++ ++int bch2_fs_ec_start(struct bch_fs *); ++ ++void bch2_fs_ec_exit(struct bch_fs *); ++int bch2_fs_ec_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_EC_H */ +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +new file mode 100644 +index 000000000000..5c3f77c8aac7 +--- /dev/null ++++ b/fs/bcachefs/ec_types.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_TYPES_H ++#define _BCACHEFS_EC_TYPES_H ++ ++#include ++ ++#define EC_STRIPE_MAX 16 ++ ++struct bch_replicas_padded { ++ struct bch_replicas_entry e; ++ u8 pad[EC_STRIPE_MAX]; ++}; ++ ++struct stripe { ++ size_t heap_idx; ++ ++ u16 sectors; ++ u8 algorithm; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; ++ 
unsigned dirty:1; ++ u8 blocks_nonempty; ++ u16 block_sectors[EC_STRIPE_MAX]; ++ ++ struct bch_replicas_padded r; ++}; ++ ++struct ec_stripe_heap_entry { ++ size_t idx; ++ unsigned blocks_nonempty; ++}; ++ ++typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; ++ ++#endif /* _BCACHEFS_EC_TYPES_H */ +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +new file mode 100644 +index 000000000000..304ff92500be +--- /dev/null ++++ b/fs/bcachefs/error.c +@@ -0,0 +1,167 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "error.h" ++#include "io.h" ++#include "super.h" ++ ++#define FSCK_ERR_RATELIMIT_NR 10 ++ ++bool bch2_inconsistent_error(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_ERROR, &c->flags); ++ ++ switch (c->opts.errors) { ++ case BCH_ON_ERROR_CONTINUE: ++ return false; ++ case BCH_ON_ERROR_RO: ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only"); ++ return true; ++ case BCH_ON_ERROR_PANIC: ++ panic(bch2_fmt(c, "panic after error")); ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_fatal_error(struct bch_fs *c) ++{ ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only"); ++} ++ ++void bch2_io_error_work(struct work_struct *work) ++{ ++ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); ++ struct bch_fs *c = ca->fs; ++ bool dev; ++ ++ mutex_lock(&c->state_lock); ++ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, ++ BCH_FORCE_IF_DEGRADED); ++ if (dev ++ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, ++ BCH_FORCE_IF_DEGRADED) ++ : bch2_fs_emergency_read_only(c)) ++ bch_err(ca, ++ "too many IO errors, setting %s RO", ++ dev ? "device" : "filesystem"); ++ mutex_unlock(&c->state_lock); ++} ++ ++void bch2_io_error(struct bch_dev *ca) ++{ ++ //queue_work(system_long_wq, &ca->io_error_work); ++} ++ ++#ifdef __KERNEL__ ++#define ask_yn() false ++#else ++#include "tools-util.h" ++#endif ++ ++enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, ++ const char *fmt, ...) ++{ ++ struct fsck_err_state *s; ++ va_list args; ++ bool fix = false, print = true, suppressing = false; ++ char _buf[sizeof(s->buf)], *buf = _buf; ++ ++ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { ++ va_start(args, fmt); ++ vprintk(fmt, args); ++ va_end(args); ++ ++ return bch2_inconsistent_error(c) ++ ? 
FSCK_ERR_EXIT ++ : FSCK_ERR_FIX; ++ } ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry(s, &c->fsck_errors, list) ++ if (s->fmt == fmt) ++ goto found; ++ ++ s = kzalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) { ++ if (!c->fsck_alloc_err) ++ bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); ++ c->fsck_alloc_err = true; ++ buf = _buf; ++ goto print; ++ } ++ ++ INIT_LIST_HEAD(&s->list); ++ s->fmt = fmt; ++found: ++ list_move(&s->list, &c->fsck_errors); ++ s->nr++; ++ suppressing = s->nr == FSCK_ERR_RATELIMIT_NR; ++ print = s->nr <= FSCK_ERR_RATELIMIT_NR; ++ buf = s->buf; ++print: ++ va_start(args, fmt); ++ vscnprintf(buf, sizeof(_buf), fmt, args); ++ va_end(args); ++ ++ if (c->opts.fix_errors == FSCK_OPT_EXIT) { ++ bch_err(c, "%s, exiting", buf); ++ } else if (flags & FSCK_CAN_FIX) { ++ if (c->opts.fix_errors == FSCK_OPT_ASK) { ++ printk(KERN_ERR "%s: fix?", buf); ++ fix = ask_yn(); ++ } else if (c->opts.fix_errors == FSCK_OPT_YES || ++ (c->opts.nochanges && ++ !(flags & FSCK_CAN_IGNORE))) { ++ if (print) ++ bch_err(c, "%s, fixing", buf); ++ fix = true; ++ } else { ++ if (print) ++ bch_err(c, "%s, not fixing", buf); ++ fix = false; ++ } ++ } else if (flags & FSCK_NEED_FSCK) { ++ if (print) ++ bch_err(c, "%s (run fsck to correct)", buf); ++ } else { ++ if (print) ++ bch_err(c, "%s (repair unimplemented)", buf); ++ } ++ ++ if (suppressing) ++ bch_err(c, "Ratelimiting new instances of previous error"); ++ ++ mutex_unlock(&c->fsck_error_lock); ++ ++ if (fix) { ++ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); ++ return FSCK_ERR_FIX; ++ } else { ++ set_bit(BCH_FS_ERROR, &c->flags); ++ return c->opts.fix_errors == FSCK_OPT_EXIT || ++ !(flags & FSCK_CAN_IGNORE) ++ ? FSCK_ERR_EXIT ++ : FSCK_ERR_IGNORE; ++ } ++} ++ ++void bch2_flush_fsck_errs(struct bch_fs *c) ++{ ++ struct fsck_err_state *s, *n; ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { ++ if (s->nr > FSCK_ERR_RATELIMIT_NR) ++ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); ++ ++ list_del(&s->list); ++ kfree(s); ++ } ++ ++ mutex_unlock(&c->fsck_error_lock); ++} +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +new file mode 100644 +index 000000000000..2591e12305b7 +--- /dev/null ++++ b/fs/bcachefs/error.h +@@ -0,0 +1,229 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERROR_H ++#define _BCACHEFS_ERROR_H ++ ++#include ++#include ++ ++struct bch_dev; ++struct bch_fs; ++struct work_struct; ++ ++/* ++ * XXX: separate out errors that indicate on disk data is inconsistent, and flag ++ * superblock as such ++ */ ++ ++/* Error messages: */ ++ ++/* ++ * Very fatal logic/inconsistency errors: these indicate that we've majorly ++ * screwed up at runtime, i.e. it's not likely that it was just caused by the ++ * data on disk being inconsistent. These BUG(): ++ * ++ * XXX: audit and convert to inconsistent() checks ++ */ ++ ++#define bch2_fs_bug(c, ...) \ ++do { \ ++ bch_err(c, __VA_ARGS__); \ ++ BUG(); \ ++} while (0) ++ ++#define bch2_fs_bug_on(cond, c, ...) \ ++do { \ ++ if (cond) \ ++ bch2_fs_bug(c, __VA_ARGS__); \ ++} while (0) ++ ++/* ++ * Inconsistency errors: The on disk data is inconsistent. If these occur during ++ * initial recovery, they don't indicate a bug in the running code - we walk all ++ * the metadata before modifying anything. If they occur at runtime, they ++ * indicate either a bug in the running code or (less likely) data is being ++ * silently corrupted under us. 
++ * ++ * XXX: audit all inconsistent errors and make sure they're all recoverable, in ++ * BCH_ON_ERROR_CONTINUE mode ++ */ ++ ++bool bch2_inconsistent_error(struct bch_fs *); ++ ++#define bch2_fs_inconsistent(c, ...) \ ++({ \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_inconsistent_error(c); \ ++}) ++ ++#define bch2_fs_inconsistent_on(cond, c, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_fs_inconsistent(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Later we might want to mark only the particular device inconsistent, not the ++ * entire filesystem: ++ */ ++ ++#define bch2_dev_inconsistent(ca, ...) \ ++do { \ ++ bch_err(ca, __VA_ARGS__); \ ++ bch2_inconsistent_error((ca)->fs); \ ++} while (0) ++ ++#define bch2_dev_inconsistent_on(cond, ca, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_inconsistent(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Fsck errors: inconsistency errors we detect at mount time, and should ideally ++ * be able to repair: ++ */ ++ ++enum { ++ BCH_FSCK_OK = 0, ++ BCH_FSCK_ERRORS_NOT_FIXED = 1, ++ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, ++ BCH_FSCK_REPAIR_IMPOSSIBLE = 3, ++ BCH_FSCK_UNKNOWN_VERSION = 4, ++}; ++ ++enum fsck_err_opts { ++ FSCK_OPT_EXIT, ++ FSCK_OPT_YES, ++ FSCK_OPT_NO, ++ FSCK_OPT_ASK, ++}; ++ ++enum fsck_err_ret { ++ FSCK_ERR_IGNORE = 0, ++ FSCK_ERR_FIX = 1, ++ FSCK_ERR_EXIT = 2, ++}; ++ ++struct fsck_err_state { ++ struct list_head list; ++ const char *fmt; ++ u64 nr; ++ char buf[512]; ++}; ++ ++#define FSCK_CAN_FIX (1 << 0) ++#define FSCK_CAN_IGNORE (1 << 1) ++#define FSCK_NEED_FSCK (1 << 2) ++ ++enum fsck_err_ret bch2_fsck_err(struct bch_fs *, ++ unsigned, const char *, ...); ++void bch2_flush_fsck_errs(struct bch_fs *); ++ ++#define __fsck_err(c, _flags, msg, ...) \ ++({ \ ++ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ ++ \ ++ if (_fix == FSCK_ERR_EXIT) { \ ++ bch_err(c, "Unable to continue, halting"); \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ \ ++ _fix; \ ++}) ++ ++/* These macros return true if error should be fixed: */ ++ ++/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ ++ ++#define __fsck_err_on(cond, c, _flags, ...) \ ++ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) ++ ++#define need_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define need_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++#define fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++/* ++ * Fatal errors: these don't indicate a bug, but we can't continue running in RW ++ * mode - pretty much just due to metadata IO errors: ++ */ ++ ++void bch2_fatal_error(struct bch_fs *); ++ ++#define bch2_fs_fatal_error(c, ...) \ ++do { \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_fatal_error(c); \ ++} while (0) ++ ++#define bch2_fs_fatal_err_on(cond, c, ...) 
\ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_fs_fatal_error(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * IO errors: either recoverable metadata IO (because we have replicas), or data ++ * IO - we need to log it and print out a message, but we don't (necessarily) ++ * want to shut down the fs: ++ */ ++ ++void bch2_io_error_work(struct work_struct *); ++ ++/* Does the error handling without logging a message */ ++void bch2_io_error(struct bch_dev *); ++ ++/* Logs message and handles the error: */ ++#define bch2_dev_io_error(ca, fmt, ...) \ ++do { \ ++ printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ ++ "IO error on %s for " fmt), \ ++ (ca)->name, ##__VA_ARGS__); \ ++ bch2_io_error(ca); \ ++} while (0) ++ ++#define bch2_dev_io_err_on(cond, ca, ...) \ ++({ \ ++ bool _ret = (cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_io_error(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* kill? */ ++ ++#define __bcache_io_error(c, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt(c, \ ++ "IO error: " fmt), ##__VA_ARGS__) ++ ++#define bcache_io_error(c, bio, fmt, ...) \ ++do { \ ++ __bcache_io_error(c, fmt, ##__VA_ARGS__); \ ++ (bio)->bi_status = BLK_STS_IOERR; \ ++} while (0) ++ ++#endif /* _BCACHEFS_ERROR_H */ +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +new file mode 100644 +index 000000000000..4cc2a4b13199 +--- /dev/null ++++ b/fs/bcachefs/extents.c +@@ -0,0 +1,1752 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * ++ * Code for managing the extent btree and dynamically updating the writeback ++ * dirty sector count. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "dirent.h" ++#include "disk_groups.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++#include "util.h" ++#include "xattr.h" ++ ++#include ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ unsigned nr_ptrs = 0; ++ ++ bkey_for_each_ptr(p, ptr) ++ nr_ptrs++; ++ ++ return nr_ptrs; ++} ++ ++unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) ++{ ++ unsigned nr_ptrs = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: { ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ nr_ptrs += !ptr->cached; ++ BUG_ON(!nr_ptrs); ++ break; ++ } ++ case KEY_TYPE_reservation: ++ nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; ++ break; ++ } ++ ++ return nr_ptrs; ++} ++ ++static unsigned bch2_extent_ptr_durability(struct bch_fs *c, ++ struct extent_ptr_decoded p) ++{ ++ unsigned durability = 0; ++ struct bch_dev *ca; ++ ++ if (p.ptr.cached) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_FAILED) ++ durability = max_t(unsigned, durability, ca->mi.durability); ++ ++ if (p.has_ec) { ++ struct stripe *s = ++ genradix_ptr(&c->stripes[0], p.ec.idx); ++ ++ if (WARN_ON(!s)) ++ goto out; ++ ++ durability = max_t(unsigned, durability, s->nr_redundant); ++ } ++out: ++ return durability; ++} ++ ++unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry 
*entry; ++ struct extent_ptr_decoded p; ++ unsigned durability = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ durability += bch2_extent_ptr_durability(c, p); ++ ++ return durability; ++} ++ ++static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, ++ unsigned dev) ++{ ++ struct bch_dev_io_failures *i; ++ ++ for (i = f->devs; i < f->devs + f->nr; i++) ++ if (i->dev == dev) ++ return i; ++ ++ return NULL; ++} ++ ++void bch2_mark_io_failure(struct bch_io_failures *failed, ++ struct extent_ptr_decoded *p) ++{ ++ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); ++ ++ if (!f) { ++ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); ++ ++ f = &failed->devs[failed->nr++]; ++ f->dev = p->ptr.dev; ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else if (p->idx != f->idx) { ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else { ++ f->nr_failed++; ++ } ++} ++ ++/* ++ * returns true if p1 is better than p2: ++ */ ++static inline bool ptr_better(struct bch_fs *c, ++ const struct extent_ptr_decoded p1, ++ const struct extent_ptr_decoded p2) ++{ ++ if (likely(!p1.idx && !p2.idx)) { ++ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); ++ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); ++ ++ u64 l1 = atomic64_read(&dev1->cur_latency[READ]); ++ u64 l2 = atomic64_read(&dev2->cur_latency[READ]); ++ ++ /* Pick at random, biased in favor of the faster device: */ ++ ++ return bch2_rand_range(l1 + l2) > l1; ++ } ++ ++ if (force_reconstruct_read(c)) ++ return p1.idx > p2.idx; ++ ++ return p1.idx < p2.idx; ++} ++ ++/* ++ * This picks a non-stale pointer, preferably from a device other than @avoid. ++ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to ++ * other devices, it will still pick a pointer from avoid. ++ */ ++int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_io_failures *failed, ++ struct extent_ptr_decoded *pick) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_dev_io_failures *f; ++ struct bch_dev *ca; ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_error) ++ return -EIO; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ /* ++ * If there are any dirty pointers it's an error if we can't ++ * read: ++ */ ++ if (!ret && !p.ptr.cached) ++ ret = -EIO; ++ ++ if (p.ptr.cached && ptr_stale(ca, &p.ptr)) ++ continue; ++ ++ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; ++ if (f) ++ p.idx = f->nr_failed < f->nr_retries ++ ? 
f->idx ++ : f->idx + 1; ++ ++ if (!p.idx && ++ !bch2_dev_is_readable(ca)) ++ p.idx++; ++ ++ if (force_reconstruct_read(c) && ++ !p.idx && p.has_ec) ++ p.idx++; ++ ++ if (p.idx >= (unsigned) p.has_ec + 1) ++ continue; ++ ++ if (ret > 0 && !ptr_better(c, p, *pick)) ++ continue; ++ ++ *pick = p; ++ ret = 1; ++ } ++ ++ return ret; ++} ++ ++void bch2_bkey_append_ptr(struct bkey_i *k, ++ struct bch_extent_ptr ptr) ++{ ++ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); ++ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_extent: ++ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); ++ ++ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ ++ memcpy((void *) &k->v + bkey_val_bytes(&k->k), ++ &ptr, ++ sizeof(ptr)); ++ k->u64s++; ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); ++} ++ ++const struct bch_extent_ptr * ++bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (ptr->dev == dev) ++ return ptr; ++ ++ return NULL; ++} ++ ++bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (bch2_dev_in_target(c, ptr->dev, target) && ++ (!ptr->cached || ++ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) ++ return true; ++ ++ return false; ++} ++ ++/* extent specific utility code */ ++ ++const struct bch_extent_ptr * ++bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) ++{ ++ const struct bch_extent_ptr *ptr; ++ ++ extent_for_each_ptr(e, ptr) ++ if (ptr->dev == dev) ++ return ptr; ++ ++ return NULL; ++} ++ ++const struct bch_extent_ptr * ++bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) ++{ ++ const struct bch_extent_ptr *ptr; ++ ++ extent_for_each_ptr(e, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ca->mi.group && ++ ca->mi.group - 1 == group) ++ return ptr; ++ } ++ ++ return NULL; ++} ++ ++unsigned bch2_extent_is_compressed(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned ret = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ p.crc.compression_type != BCH_COMPRESSION_NONE) ++ ret += p.crc.compressed_size; ++ ++ return ret; ++} ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_extent_ptr m, u64 offset) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (p.ptr.dev == m.dev && ++ p.ptr.gen == m.gen && ++ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == ++ (s64) m.offset - offset) ++ return true; ++ ++ return false; ++} ++ ++static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, ++ union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *i = ptrs.start; ++ ++ if (i == entry) ++ return NULL; ++ ++ while (extent_entry_next(i) != entry) ++ i = extent_entry_next(i); ++ return i; ++} ++ ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union 
bch_extent_entry *dst, *src, *prev; ++ bool drop_crc = true; ++ ++ EBUG_ON(ptr < &ptrs.start->ptr || ++ ptr >= &ptrs.end->ptr); ++ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); ++ ++ src = extent_entry_next(to_entry(ptr)); ++ if (src != ptrs.end && ++ !extent_entry_is_crc(src)) ++ drop_crc = false; ++ ++ dst = to_entry(ptr); ++ while ((prev = extent_entry_prev(ptrs, dst))) { ++ if (extent_entry_is_ptr(prev)) ++ break; ++ ++ if (extent_entry_is_crc(prev)) { ++ if (drop_crc) ++ dst = prev; ++ break; ++ } ++ ++ dst = prev; ++ } ++ ++ memmove_u64s_down(dst, src, ++ (u64 *) ptrs.end - (u64 *) src); ++ k.k->u64s -= (u64 *) src - (u64 *) dst; ++ ++ return dst; ++} ++ ++static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, ++ struct bch_extent_crc_unpacked n) ++{ ++ return !u.compression_type && ++ u.csum_type && ++ u.uncompressed_size > u.live_size && ++ bch2_csum_type_is_encryption(u.csum_type) == ++ bch2_csum_type_is_encryption(n.csum_type); ++} ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, ++ struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ if (!n.csum_type) ++ return false; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (can_narrow_crc(crc, n)) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * We're writing another replica for this extent, so while we've got the data in ++ * memory we'll be computing a new checksum for the currently live data. ++ * ++ * If there are other replicas we aren't moving, and they are checksummed but ++ * not compressed, we can modify them to point to only the data that is ++ * currently live (so that readers won't have to bounce) while we've got the ++ * checksum we need: ++ */ ++bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked u; ++ struct extent_ptr_decoded p; ++ union bch_extent_entry *i; ++ bool ret = false; ++ ++ /* Find a checksum entry that covers only live data: */ ++ if (!n.csum_type) { ++ bkey_for_each_crc(&k->k, ptrs, u, i) ++ if (!u.compression_type && ++ u.csum_type && ++ u.live_size == u.uncompressed_size) { ++ n = u; ++ goto found; ++ } ++ return false; ++ } ++found: ++ BUG_ON(n.compression_type); ++ BUG_ON(n.offset); ++ BUG_ON(n.live_size != k->k.size); ++ ++restart_narrow_pointers: ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ ++ bkey_for_each_ptr_decode(&k->k, ptrs, p, i) ++ if (can_narrow_crc(p.crc, n)) { ++ bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); ++ p.ptr.offset += p.crc.offset; ++ p.crc = n; ++ bch2_extent_ptr_decoded_append(k, &p); ++ ret = true; ++ goto restart_narrow_pointers; ++ } ++ ++ return ret; ++} ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, ++ struct bch_extent_crc_unpacked r) ++{ ++ return (l.csum_type != r.csum_type || ++ l.compression_type != r.compression_type || ++ l.compressed_size != r.compressed_size || ++ l.uncompressed_size != r.uncompressed_size || ++ l.offset != r.offset || ++ l.live_size != r.live_size || ++ l.nonce != r.nonce || ++ bch2_crc_cmp(l.csum, r.csum)); ++} ++ ++void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) ++{ ++ union bch_extent_entry *entry; ++ u64 *d = (u64 *) bkeyp_val(f, k); ++ unsigned i; ++ ++ for (i = 0; i < bkeyp_val_u64s(f, k); i++) ++ d[i] = swab64(d[i]); ++ ++ for (entry = (union bch_extent_entry *) d; ++ entry < (union 
bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); ++ entry = extent_entry_next(entry)) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.csum = swab32(entry->crc32.csum); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); ++ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.csum.hi = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.hi); ++ entry->crc128.csum.lo = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++} ++ ++void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ const struct bch_extent_ptr *ptr; ++ const struct bch_extent_stripe_ptr *ec; ++ struct bch_dev *ca; ++ bool first = true; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (!first) ++ pr_buf(out, " "); ++ ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ ptr = entry_to_ptr(entry); ++ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ? bch_dev_bkey_exists(c, ptr->dev) ++ : NULL; ++ ++ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, ++ (u64) ptr->offset, ptr->gen, ++ ptr->cached ? " cached" : "", ++ ca && ptr_stale(ca, ptr) ++ ? " stale" : ""); ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", ++ crc.compressed_size, ++ crc.uncompressed_size, ++ crc.offset, crc.nonce, ++ crc.csum_type, ++ crc.compression_type); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ec = &entry->stripe_ptr; ++ ++ pr_buf(out, "ec: idx %llu block %u", ++ (u64) ec->idx, ec->block); ++ break; ++ default: ++ pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); ++ return; ++ } ++ ++ first = false; ++ } ++} ++ ++static const char *extent_ptr_invalid(const struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ unsigned size_ondisk, ++ bool metadata) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr2; ++ struct bch_dev *ca; ++ ++ if (!bch2_dev_exists2(c, ptr->dev)) ++ return "pointer to invalid device"; ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!ca) ++ return "pointer to invalid device"; ++ ++ bkey_for_each_ptr(ptrs, ptr2) ++ if (ptr != ptr2 && ptr->dev == ptr2->dev) ++ return "multiple pointers to same device"; ++ ++ if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) ++ return "offset past end of device"; ++ ++ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) ++ return "offset before first bucket"; ++ ++ if (bucket_remainder(ca, ptr->offset) + ++ size_ondisk > ca->mi.bucket_size) ++ return "spans multiple buckets"; ++ ++ return NULL; ++} ++ ++const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ unsigned size_ondisk = k.k->size; ++ const char *reason; ++ unsigned nonce = UINT_MAX; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr) ++ size_ondisk = c->opts.btree_node_size; ++ ++ bkey_extent_entry_for_each(ptrs, 
entry) { ++ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) ++ return "invalid extent entry type"; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr && ++ !extent_entry_is_ptr(entry)) ++ return "has non ptr field"; ++ ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ reason = extent_ptr_invalid(c, k, &entry->ptr, ++ size_ondisk, false); ++ if (reason) ++ return reason; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ if (crc.offset + crc.live_size > ++ crc.uncompressed_size) ++ return "checksum offset + key size > uncompressed size"; ++ ++ size_ondisk = crc.compressed_size; ++ ++ if (!bch2_checksum_type_valid(c, crc.csum_type)) ++ return "invalid checksum type"; ++ ++ if (crc.compression_type >= BCH_COMPRESSION_NR) ++ return "invalid compression type"; ++ ++ if (bch2_csum_type_is_encryption(crc.csum_type)) { ++ if (nonce == UINT_MAX) ++ nonce = crc.offset + crc.nonce; ++ else if (nonce != crc.offset + crc.nonce) ++ return "incorrect nonce"; ++ } ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++ ++ return NULL; ++} ++ ++/* Btree ptrs */ ++ ++const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) ++ return "value too big"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ const char *err; ++ char buf[160]; ++ struct bucket_mark mark; ++ struct bch_dev *ca; ++ ++ bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !bch2_bkey_replicas_marked(c, k, false), c, ++ "btree key bad (replicas not marked in superblock):\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ ++ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ mark = ptr_bucket_mark(ca, ptr); ++ ++ err = "stale"; ++ if (gen_after(mark.gen, ptr->gen)) ++ goto err; ++ ++ err = "inconsistent"; ++ if (mark.data_type != BCH_DATA_BTREE || ++ mark.dirty_sectors < c->opts.btree_node_size) ++ goto err; ++ } ++ ++ return; ++err: ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", ++ err, buf, PTR_BUCKET_NR(ca, ptr), ++ mark.gen, (unsigned) mark.v.counter); ++} ++ ++void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++/* Extents */ ++ ++void __bch2_cut_front(struct bpos where, struct bkey_s k) ++{ ++ u64 sub; ++ ++ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) ++ return; ++ ++ EBUG_ON(bkey_cmp(where, k.k->p) > 0); ++ ++ sub = where.offset - bkey_start_offset(k.k); ++ ++ k.k->size -= sub; ++ ++ if (!k.k->size) ++ k.k->type = KEY_TYPE_deleted; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_deleted: ++ case KEY_TYPE_discard: ++ case KEY_TYPE_error: ++ case KEY_TYPE_cookie: ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: { ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ bool seen_crc = false; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ if (!seen_crc) ++ entry->ptr.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.offset += sub; ++ break; 
++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ ++ if (extent_entry_is_crc(entry)) ++ seen_crc = true; ++ } ++ ++ break; ++ } ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); ++ ++ le64_add_cpu(&p.v->idx, sub); ++ break; ++ } ++ case KEY_TYPE_reservation: ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++bool bch2_cut_back(struct bpos where, struct bkey *k) ++{ ++ u64 len = 0; ++ ++ if (bkey_cmp(where, k->p) >= 0) ++ return false; ++ ++ EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); ++ ++ len = where.offset - bkey_start_offset(k); ++ ++ k->p = where; ++ k->size = len; ++ ++ if (!len) ++ k->type = KEY_TYPE_deleted; ++ ++ return true; ++} ++ ++static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ unsigned ret = 0; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ret++; ++ } ++ } ++ ++ return ret; ++} ++ ++static int count_iters_for_insert(struct btree_trans *trans, ++ struct bkey_s_c k, ++ unsigned offset, ++ struct bpos *end, ++ unsigned *nr_iters, ++ unsigned max_iters, ++ bool overwrite) ++{ ++ int ret = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ break; ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx = le64_to_cpu(p.v->idx); ++ unsigned sectors = bpos_min(*end, p.k->p).offset - ++ bkey_start_offset(p.k); ++ struct btree_iter *iter; ++ struct bkey_s_c r_k; ++ ++ for_each_btree_key(trans, iter, ++ BTREE_ID_REFLINK, POS(0, idx + offset), ++ BTREE_ITER_SLOTS, r_k, ret) { ++ if (bkey_cmp(bkey_start_pos(r_k.k), ++ POS(0, idx + sectors)) >= 0) ++ break; ++ ++ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); ++ ++ if (*nr_iters >= max_iters) { ++ struct bpos pos = bkey_start_pos(k.k); ++ pos.offset += r_k.k->p.offset - idx; ++ ++ *end = bpos_min(*end, pos); ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) ++ ++int bch2_extent_atomic_end(struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bpos *end) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey_packed *_k; ++ unsigned nr_iters = 0; ++ int ret; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ b = iter->l[0].b; ++ node_iter = iter->l[0].iter; ++ ++ BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); ++ ++ *end = bpos_min(insert->k.p, b->key.k.p); ++ ++ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, ++ &nr_iters, EXTENT_ITERS_MAX / 2, false); ++ if (ret < 0) ++ return ret; ++ ++ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, ++ KEY_TYPE_discard))) { ++ struct bkey unpacked; ++ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ++ unsigned offset = 0; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_start_pos(k.k)) > 0) ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ 
++ ret = count_iters_for_insert(trans, k, offset, end, ++ &nr_iters, EXTENT_ITERS_MAX, true); ++ if (ret) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return ret < 0 ? ret : 0; ++} ++ ++int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(iter, k, &end); ++ if (ret) ++ return ret; ++ ++ bch2_cut_back(end, &k->k); ++ return 0; ++} ++ ++int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(iter, k, &end); ++ if (ret) ++ return ret; ++ ++ return !bkey_cmp(end, k->k.p); ++} ++ ++enum btree_insert_ret ++bch2_extent_can_insert(struct btree_trans *trans, ++ struct btree_insert_entry *insert, ++ unsigned *u64s) ++{ ++ struct btree_iter_level *l = &insert->iter->l[0]; ++ struct btree_node_iter node_iter = l->iter; ++ enum bch_extent_overlap overlap; ++ struct bkey_packed *_k; ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ int sectors; ++ ++ /* ++ * We avoid creating whiteouts whenever possible when deleting, but ++ * those optimizations mean we may potentially insert two whiteouts ++ * instead of one (when we overlap with the front of one extent and the ++ * back of another): ++ */ ++ if (bkey_whiteout(&insert->k->k)) ++ *u64s += BKEY_U64s; ++ ++ _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, ++ KEY_TYPE_discard); ++ if (!_k) ++ return BTREE_INSERT_OK; ++ ++ k = bkey_disassemble(l->b, _k, &unpacked); ++ ++ overlap = bch2_extent_overlap(&insert->k->k, k.k); ++ ++ /* account for having to split existing extent: */ ++ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) ++ *u64s += _k->u64s; ++ ++ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && ++ (sectors = bch2_extent_is_compressed(k))) { ++ int flags = trans->flags & BTREE_INSERT_NOFAIL ++ ? 
BCH_DISK_RESERVATION_NOFAIL : 0; ++ ++ switch (bch2_disk_reservation_add(trans->c, ++ trans->disk_res, ++ sectors, flags)) { ++ case 0: ++ break; ++ case -ENOSPC: ++ return BTREE_INSERT_ENOSPC; ++ default: ++ BUG(); ++ } ++ } ++ ++ return BTREE_INSERT_OK; ++} ++ ++static void verify_extent_nonoverlapping(struct bch_fs *c, ++ struct btree *b, ++ struct btree_node_iter *_iter, ++ struct bkey_i *insert) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree_node_iter iter; ++ struct bkey_packed *k; ++ struct bkey uk; ++ ++ if (!expensive_debug_checks(c)) ++ return; ++ ++ iter = *_iter; ++ k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); ++ BUG_ON(k && ++ (uk = bkey_unpack_key(b, k), ++ bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); ++ ++ iter = *_iter; ++ k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); ++#if 0 ++ BUG_ON(k && ++ (uk = bkey_unpack_key(b, k), ++ bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); ++#else ++ if (k && ++ (uk = bkey_unpack_key(b, k), ++ bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_bkey_to_text(&PBUF(buf1), &insert->k); ++ bch2_bkey_to_text(&PBUF(buf2), &uk); ++ ++ bch2_dump_btree_node(b); ++ panic("insert > next :\n" ++ "insert %s\n" ++ "next %s\n", ++ buf1, buf2); ++ } ++#endif ++ ++#endif ++} ++ ++static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_packed *k = ++ bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); ++ ++ BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); ++ ++ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); ++ verify_extent_nonoverlapping(c, l->b, &l->iter, insert); ++ ++ if (debug_check_bkeys(c)) ++ bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); ++ ++ bch2_bset_insert(l->b, &l->iter, k, insert, 0); ++ bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); ++} ++ ++static void ++extent_squash(struct bch_fs *c, struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bkey_packed *_k, struct bkey_s k, ++ enum bch_extent_overlap overlap) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ ++ switch (overlap) { ++ case BCH_EXTENT_OVERLAP_FRONT: ++ /* insert overlaps with start of k: */ ++ __bch2_cut_front(insert->k.p, k); ++ EBUG_ON(bkey_deleted(k.k)); ++ extent_save(l->b, _k, k.k); ++ bch2_btree_iter_fix_key_modified(iter, l->b, _k); ++ break; ++ ++ case BCH_EXTENT_OVERLAP_BACK: ++ /* insert overlaps with end of k: */ ++ bch2_cut_back(bkey_start_pos(&insert->k), k.k); ++ EBUG_ON(bkey_deleted(k.k)); ++ extent_save(l->b, _k, k.k); ++ ++ /* ++ * As the auxiliary tree is indexed by the end of the ++ * key and we've just changed the end, update the ++ * auxiliary tree. 
++ */ ++ bch2_bset_fix_invalidated_key(l->b, _k); ++ bch2_btree_node_iter_fix(iter, l->b, &l->iter, ++ _k, _k->u64s, _k->u64s); ++ break; ++ ++ case BCH_EXTENT_OVERLAP_ALL: { ++ /* The insert key completely covers k, invalidate k */ ++ if (!bkey_whiteout(k.k)) ++ btree_account_key_drop(l->b, _k); ++ ++ k.k->size = 0; ++ k.k->type = KEY_TYPE_deleted; ++ ++ if (_k >= btree_bset_last(l->b)->start) { ++ unsigned u64s = _k->u64s; ++ ++ bch2_bset_delete(l->b, _k, _k->u64s); ++ bch2_btree_node_iter_fix(iter, l->b, &l->iter, ++ _k, u64s, 0); ++ } else { ++ extent_save(l->b, _k, k.k); ++ bch2_btree_iter_fix_key_modified(iter, l->b, _k); ++ } ++ ++ break; ++ } ++ case BCH_EXTENT_OVERLAP_MIDDLE: { ++ BKEY_PADDED(k) split; ++ /* ++ * The insert key falls 'in the middle' of k ++ * The insert key splits k in 3: ++ * - start only in k, preserve ++ * - middle common section, invalidate in k ++ * - end only in k, preserve ++ * ++ * We update the old key to preserve the start, ++ * insert will be the new common section, ++ * we manually insert the end that we are preserving. ++ * ++ * modify k _before_ doing the insert (which will move ++ * what k points to) ++ */ ++ bkey_reassemble(&split.k, k.s_c); ++ split.k.k.needs_whiteout |= bkey_written(l->b, _k); ++ ++ bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); ++ BUG_ON(bkey_deleted(&split.k.k)); ++ ++ __bch2_cut_front(insert->k.p, k); ++ BUG_ON(bkey_deleted(k.k)); ++ extent_save(l->b, _k, k.k); ++ bch2_btree_iter_fix_key_modified(iter, l->b, _k); ++ ++ extent_bset_insert(c, iter, &split.k); ++ break; ++ } ++ } ++} ++ ++/** ++ * bch_extent_insert_fixup - insert a new extent and deal with overlaps ++ * ++ * this may result in not actually doing the insert, or inserting some subset ++ * of the insert key. For cmpxchg operations this is where that logic lives. ++ * ++ * All subsets of @insert that need to be inserted are inserted using ++ * bch2_btree_insert_and_journal(). If @b or @res fills up, this function ++ * returns false, setting @iter->pos for the prefix of @insert that actually got ++ * inserted. ++ * ++ * BSET INVARIANTS: this function is responsible for maintaining all the ++ * invariants for bsets of extents in memory. things get really hairy with 0 ++ * size extents ++ * ++ * within one bset: ++ * ++ * bkey_start_pos(bkey_next(k)) >= k ++ * or bkey_start_offset(bkey_next(k)) >= k->offset ++ * ++ * i.e. strict ordering, no overlapping extents. ++ * ++ * multiple bsets (i.e. full btree node): ++ * ++ * ∀ k, j ++ * k.size != 0 ∧ j.size != 0 → ++ * ¬ (k > bkey_start_pos(j) ∧ k < j) ++ * ++ * i.e. no two overlapping keys _of nonzero size_ ++ * ++ * We can't realistically maintain this invariant for zero size keys because of ++ * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j ++ * there may be another 0 size key between them in another bset, and it will ++ * thus overlap with the merged key. ++ * ++ * In addition, the end of iter->pos indicates how much has been processed. ++ * If the end of iter->pos is not the same as the end of insert, then ++ * key insertion needs to continue/be retried. 
++ */ ++void bch2_insert_fixup_extent(struct btree_trans *trans, ++ struct btree_insert_entry *insert_entry) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter = insert_entry->iter; ++ struct bkey_i *insert = insert_entry->k; ++ struct btree_iter_level *l = &iter->l[0]; ++ struct btree_node_iter node_iter = l->iter; ++ bool deleting = bkey_whiteout(&insert->k); ++ bool update_journal = !deleting; ++ bool update_btree = !deleting; ++ struct bkey_i whiteout = *insert; ++ struct bkey_packed *_k; ++ struct bkey unpacked; ++ BKEY_PADDED(k) tmp; ++ ++ EBUG_ON(iter->level); ++ EBUG_ON(!insert->k.size); ++ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); ++ ++ while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, ++ KEY_TYPE_discard))) { ++ struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); ++ struct bpos cur_end = bpos_min(insert->k.p, k.k->p); ++ enum bch_extent_overlap overlap = ++ bch2_extent_overlap(&insert->k, k.k); ++ ++ if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) ++ break; ++ ++ if (!bkey_whiteout(k.k)) ++ update_journal = true; ++ ++ if (!update_journal) { ++ bch2_cut_front(cur_end, insert); ++ bch2_cut_front(cur_end, &whiteout); ++ bch2_btree_iter_set_pos_same_leaf(iter, cur_end); ++ goto next; ++ } ++ ++ /* ++ * When deleting, if possible just do it by switching the type ++ * of the key we're deleting, instead of creating and inserting ++ * a new whiteout: ++ */ ++ if (deleting && ++ !update_btree && ++ !bkey_cmp(insert->k.p, k.k->p) && ++ !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { ++ if (!bkey_whiteout(k.k)) { ++ btree_account_key_drop(l->b, _k); ++ _k->type = KEY_TYPE_discard; ++ reserve_whiteout(l->b, _k); ++ bch2_btree_iter_fix_key_modified(iter, ++ l->b, _k); ++ } ++ break; ++ } ++ ++ if (k.k->needs_whiteout || bkey_written(l->b, _k)) { ++ insert->k.needs_whiteout = true; ++ update_btree = true; ++ } ++ ++ if (update_btree && ++ overlap == BCH_EXTENT_OVERLAP_ALL && ++ bkey_whiteout(k.k) && ++ k.k->needs_whiteout) { ++ unreserve_whiteout(l->b, _k); ++ _k->needs_whiteout = false; ++ } ++ ++ extent_squash(c, iter, insert, _k, k, overlap); ++ ++ if (!update_btree) ++ bch2_cut_front(cur_end, insert); ++next: ++ node_iter = l->iter; ++ ++ if (overlap == BCH_EXTENT_OVERLAP_FRONT || ++ overlap == BCH_EXTENT_OVERLAP_MIDDLE) ++ break; ++ } ++ ++ l->iter = node_iter; ++ bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); ++ ++ if (update_btree) { ++ bkey_copy(&tmp.k, insert); ++ ++ if (deleting) ++ tmp.k.k.type = KEY_TYPE_discard; ++ ++ EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); ++ ++ extent_bset_insert(c, iter, &tmp.k); ++ } ++ ++ if (update_journal) { ++ bkey_copy(&tmp.k, !deleting ? 
insert : &whiteout); ++ ++ if (deleting) ++ tmp.k.k.type = KEY_TYPE_discard; ++ ++ EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); ++ ++ bch2_btree_journal_key(trans, iter, &tmp.k); ++ } ++ ++ bch2_cut_front(insert->k.p, insert); ++} ++ ++const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ char buf[160]; ++ ++ /* ++ * XXX: we should be doing most/all of these checks at startup time, ++ * where we check bch2_bkey_invalid() in btree_node_read_done() ++ * ++ * But note that we can't check for stale pointers or incorrect gc marks ++ * until after journal replay is done (it might be an extent that's ++ * going to get overwritten during replay) ++ */ ++ ++ if (percpu_down_read_trylock(&c->mark_lock)) { ++ bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, ++ "extent key bad (replicas not marked in superblock):\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); ++ percpu_up_read(&c->mark_lock); ++ } ++ /* ++ * If journal replay hasn't finished, we might be seeing keys ++ * that will be overwritten by the time journal replay is done: ++ */ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ return; ++ ++ extent_for_each_ptr_decode(e, p, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); ++ unsigned stale = gen_after(mark.gen, p.ptr.gen); ++ unsigned disk_sectors = ptr_disk_sectors(p); ++ unsigned mark_sectors = p.ptr.cached ++ ? 
mark.cached_sectors ++ : mark.dirty_sectors; ++ ++ bch2_fs_bug_on(stale && !p.ptr.cached, c, ++ "stale dirty pointer (ptr gen %u bucket %u", ++ p.ptr.gen, mark.gen); ++ ++ bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); ++ ++ bch2_fs_bug_on(!stale && ++ (mark.data_type != BCH_DATA_USER || ++ mark_sectors < disk_sectors), c, ++ "extent pointer not marked: %s:\n" ++ "type %u sectors %u < %u", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), ++ mark.data_type, ++ mark_sectors, disk_sectors); ++ } ++} ++ ++void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++static unsigned bch2_crc_field_size_max[] = { ++ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, ++}; ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *dst, ++ struct bch_extent_crc_unpacked src) ++{ ++#define set_common_fields(_dst, _src) \ ++ _dst.csum_type = _src.csum_type, \ ++ _dst.compression_type = _src.compression_type, \ ++ _dst._compressed_size = _src.compressed_size - 1, \ ++ _dst._uncompressed_size = _src.uncompressed_size - 1, \ ++ _dst.offset = _src.offset ++ ++ switch (extent_entry_type(to_entry(dst))) { ++ case BCH_EXTENT_ENTRY_crc32: ++ set_common_fields(dst->crc32, src); ++ dst->crc32.csum = *((__le32 *) &src.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ set_common_fields(dst->crc64, src); ++ dst->crc64.nonce = src.nonce; ++ dst->crc64.csum_lo = src.csum.lo; ++ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ set_common_fields(dst->crc128, src); ++ dst->crc128.nonce = src.nonce; ++ dst->crc128.csum = src.csum; ++ break; ++ default: ++ BUG(); ++ } ++#undef set_common_fields ++} ++ ++void bch2_extent_crc_append(struct bkey_i *k, ++ struct bch_extent_crc_unpacked new) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ union bch_extent_crc *crc = (void *) ptrs.end; ++ ++ if (bch_crc_bytes[new.csum_type] <= 4 && ++ new.uncompressed_size - 1 <= CRC32_SIZE_MAX && ++ new.nonce <= CRC32_NONCE_MAX) ++ crc->type = 1 << BCH_EXTENT_ENTRY_crc32; ++ else if (bch_crc_bytes[new.csum_type] <= 10 && ++ new.uncompressed_size - 1 <= CRC64_SIZE_MAX && ++ new.nonce <= CRC64_NONCE_MAX) ++ crc->type = 1 << BCH_EXTENT_ENTRY_crc64; ++ else if (bch_crc_bytes[new.csum_type] <= 16 && ++ new.uncompressed_size - 1 <= CRC128_SIZE_MAX && ++ new.nonce <= CRC128_NONCE_MAX) ++ crc->type = 1 << BCH_EXTENT_ENTRY_crc128; ++ else ++ BUG(); ++ ++ bch2_extent_crc_pack(crc, new); ++ ++ k->k.u64s += extent_entry_u64s(ptrs.end); ++ ++ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); ++} ++ ++static inline void __extent_entry_insert(struct bkey_i *k, ++ union bch_extent_entry *dst, ++ union bch_extent_entry *new) ++{ ++ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); ++ ++ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), ++ dst, (u64 *) end - (u64 *) dst); ++ k->k.u64s += extent_entry_u64s(new); ++ memcpy(dst, new, extent_entry_bytes(new)); ++} ++ ++void bch2_extent_ptr_decoded_append(struct bkey_i *k, ++ struct extent_ptr_decoded *p) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked crc = ++ bch2_extent_crc_unpack(&k->k, NULL); ++ union bch_extent_entry *pos; ++ ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = ptrs.start; ++ goto found; ++ } ++ ++ bkey_for_each_crc(&k->k, ptrs, crc, pos) ++ if 
(!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = extent_entry_next(pos); ++ goto found; ++ } ++ ++ bch2_extent_crc_append(k, p->crc); ++ pos = bkey_val_end(bkey_i_to_s(k)); ++found: ++ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ptr)); ++ ++ if (p->has_ec) { ++ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ec)); ++ } ++} ++ ++/* ++ * bch_extent_normalize - clean up an extent, dropping stale pointers etc. ++ * ++ * Returns true if @k should be dropped entirely ++ * ++ * For existing keys, only called when btree nodes are being rewritten, not when ++ * they're merely being compacted/resorted in memory. ++ */ ++bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ++ ptr->cached && ++ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); ++ ++ /* will only happen if all pointers were cached: */ ++ if (!bkey_val_u64s(k.k)) ++ k.k->type = KEY_TYPE_discard; ++ ++ return bkey_whiteout(k.k); ++} ++ ++void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, ++ unsigned target, ++ unsigned nr_desired_replicas) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; ++ ++ if (target && extra > 0) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int n = bch2_extent_ptr_durability(c, p); ++ ++ if (n && n <= extra && ++ !bch2_dev_in_target(c, p.ptr.dev, target)) { ++ entry->ptr.cached = true; ++ extra -= n; ++ } ++ } ++ ++ if (extra > 0) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int n = bch2_extent_ptr_durability(c, p); ++ ++ if (n && n <= extra) { ++ entry->ptr.cached = true; ++ extra -= n; ++ } ++ } ++} ++ ++enum merge_result bch2_extent_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_extent l = bkey_s_to_extent(_l); ++ struct bkey_s_extent r = bkey_s_to_extent(_r); ++ union bch_extent_entry *en_l = l.v->start; ++ union bch_extent_entry *en_r = r.v->start; ++ struct bch_extent_crc_unpacked crc_l, crc_r; ++ ++ if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) ++ return BCH_MERGE_NOMERGE; ++ ++ crc_l = bch2_extent_crc_unpack(l.k, NULL); ++ ++ extent_for_each_entry(l, en_l) { ++ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ ++ if (extent_entry_type(en_l) != extent_entry_type(en_r)) ++ return BCH_MERGE_NOMERGE; ++ ++ switch (extent_entry_type(en_l)) { ++ case BCH_EXTENT_ENTRY_ptr: { ++ const struct bch_extent_ptr *lp = &en_l->ptr; ++ const struct bch_extent_ptr *rp = &en_r->ptr; ++ struct bch_dev *ca; ++ ++ if (lp->offset + crc_l.compressed_size != rp->offset || ++ lp->dev != rp->dev || ++ lp->gen != rp->gen) ++ return BCH_MERGE_NOMERGE; ++ ++ /* We don't allow extents to straddle buckets: */ ++ ca = bch_dev_bkey_exists(c, lp->dev); ++ ++ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) ++ return BCH_MERGE_NOMERGE; ++ ++ break; ++ } ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || ++ en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) ++ return BCH_MERGE_NOMERGE; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ if (crc_l.csum_type != crc_r.csum_type || ++ crc_l.compression_type != crc_r.compression_type || ++ crc_l.nonce != 
crc_r.nonce) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || ++ crc_r.offset) ++ return BCH_MERGE_NOMERGE; ++ ++ if (!bch2_checksum_mergeable(crc_l.csum_type)) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.compression_type) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.csum_type && ++ crc_l.uncompressed_size + ++ crc_r.uncompressed_size > c->sb.encoded_extent_max) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > ++ bch2_crc_field_size_max[extent_entry_type(en_l)]) ++ return BCH_MERGE_NOMERGE; ++ ++ break; ++ default: ++ return BCH_MERGE_NOMERGE; ++ } ++ } ++ ++ extent_for_each_entry(l, en_l) { ++ struct bch_extent_crc_unpacked crc_l, crc_r; ++ ++ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ ++ if (!extent_entry_is_crc(en_l)) ++ continue; ++ ++ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, ++ crc_l.csum, ++ crc_r.csum, ++ crc_r.uncompressed_size << 9); ++ ++ crc_l.uncompressed_size += crc_r.uncompressed_size; ++ crc_l.compressed_size += crc_r.compressed_size; ++ ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l); ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, ++ unsigned nr_replicas) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bpos end = pos; ++ struct bkey_s_c k; ++ bool ret = true; ++ int err; ++ ++ end.offset += size; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, ++ BTREE_ITER_SLOTS, k, err) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) { ++ ret = false; ++ break; ++ } ++ } ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) ++{ ++ unsigned ret = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ extent_for_each_ptr_decode(e, p, entry) ++ ret += !p.ptr.cached && ++ p.crc.compression_type == BCH_COMPRESSION_NONE; ++ break; ++ } ++ case KEY_TYPE_reservation: ++ ret = bkey_s_c_to_reservation(k).v->nr_replicas; ++ break; ++ } ++ ++ return ret; ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) ++ return "incorrect value size"; ++ ++ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) ++ return "invalid nr_replicas"; ++ ++ return NULL; ++} ++ ++void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ pr_buf(out, "generation %u replicas %u", ++ le32_to_cpu(r.v->generation), ++ r.v->nr_replicas); ++} ++ ++enum merge_result bch2_reservation_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_reservation l = bkey_s_to_reservation(_l); ++ struct bkey_s_reservation r = bkey_s_to_reservation(_r); ++ ++ if (l.v->generation != r.v->generation || ++ l.v->nr_replicas != r.v->nr_replicas) ++ return BCH_MERGE_NOMERGE; ++ ++ if ((u64) l.k->size + r.k->size > 
KEY_SIZE_MAX) { ++ bch2_key_resize(l.k, KEY_SIZE_MAX); ++ __bch2_cut_front(l.k->p, r.s); ++ return BCH_MERGE_PARTIAL; ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +new file mode 100644 +index 000000000000..cc7ee9067b50 +--- /dev/null ++++ b/fs/bcachefs/extents.h +@@ -0,0 +1,582 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENTS_H ++#define _BCACHEFS_EXTENTS_H ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "extents_types.h" ++ ++struct bch_fs; ++struct btree_trans; ++struct btree_insert_entry; ++ ++/* extent entries: */ ++ ++#define extent_entry_last(_e) \ ++ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) ++ ++#define entry_to_ptr(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ ++ \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const struct bch_extent_ptr *) (_entry), \ ++ (struct bch_extent_ptr *) (_entry)); \ ++}) ++ ++/* downcast, preserves const */ ++#define to_entry(_entry) \ ++({ \ ++ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ ++ !type_is(_entry, struct bch_extent_ptr *) && \ ++ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ ++ \ ++ __builtin_choose_expr( \ ++ (type_is_exact(_entry, const union bch_extent_crc *) || \ ++ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ ++ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ ++ (const union bch_extent_entry *) (_entry), \ ++ (union bch_extent_entry *) (_entry)); \ ++}) ++ ++static inline unsigned ++__extent_entry_type(const union bch_extent_entry *e) ++{ ++ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; ++} ++ ++static inline enum bch_extent_entry_type ++extent_entry_type(const union bch_extent_entry *e) ++{ ++ int ret = __ffs(e->type); ++ ++ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); ++ ++ return ret; ++} ++ ++static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) ++{ ++ switch (extent_entry_type(entry)) { ++#define x(f, n) \ ++ case BCH_EXTENT_ENTRY_##f: \ ++ return sizeof(struct bch_extent_##f); ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) ++{ ++ return extent_entry_bytes(entry) / sizeof(u64); ++} ++ ++static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) ++{ ++ switch (extent_entry_type(e)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool extent_entry_is_crc(const union bch_extent_entry *e) ++{ ++ switch (extent_entry_type(e)) { ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++union bch_extent_crc { ++ u8 type; ++ struct bch_extent_crc32 crc32; ++ struct bch_extent_crc64 crc64; ++ struct bch_extent_crc128 crc128; ++}; ++ ++#define __entry_to_crc(_entry) \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const union bch_extent_crc *) (_entry), \ ++ (union bch_extent_crc *) (_entry)) ++ ++#define entry_to_crc(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ ++ \ ++ __entry_to_crc(_entry); \ ++}) ++ ++static inline struct bch_extent_crc_unpacked ++bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) ++{ ++#define common_fields(_crc) \ ++ .csum_type = _crc.csum_type, \ ++ 
.compression_type = _crc.compression_type, \ ++ .compressed_size = _crc._compressed_size + 1, \ ++ .uncompressed_size = _crc._uncompressed_size + 1, \ ++ .offset = _crc.offset, \ ++ .live_size = k->size ++ ++ if (!crc) ++ return (struct bch_extent_crc_unpacked) { ++ .compressed_size = k->size, ++ .uncompressed_size = k->size, ++ .live_size = k->size, ++ }; ++ ++ switch (extent_entry_type(to_entry(crc))) { ++ case BCH_EXTENT_ENTRY_crc32: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc32), ++ }; ++ ++ *((__le32 *) &ret.csum.lo) = crc->crc32.csum; ++ ++ memcpy(&ret.csum.lo, &crc->crc32.csum, ++ sizeof(crc->crc32.csum)); ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc64: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc64), ++ .nonce = crc->crc64.nonce, ++ .csum.lo = (__force __le64) crc->crc64.csum_lo, ++ }; ++ ++ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc128: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc128), ++ .nonce = crc->crc128.nonce, ++ .csum = crc->crc128.csum, ++ }; ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++#undef common_fields ++} ++ ++/* bkey_ptrs: generically over any key type that has ptrs */ ++ ++struct bkey_ptrs_c { ++ const union bch_extent_entry *start; ++ const union bch_extent_entry *end; ++}; ++ ++struct bkey_ptrs { ++ union bch_extent_entry *start; ++ union bch_extent_entry *end; ++}; ++ ++/* iterate over bkey ptrs */ ++ ++#define extent_entry_next(_entry) \ ++ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) ++ ++#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ ++ for ((_entry) = (_start); \ ++ (_entry) < (_end); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define __bkey_ptr_next(_ptr, _end) \ ++({ \ ++ typeof(_end) _entry; \ ++ \ ++ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ ++ if (extent_entry_is_ptr(_entry)) \ ++ break; \ ++ \ ++ _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ ++}) ++ ++#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) ++ ++#define bkey_extent_entry_for_each(_p, _entry) \ ++ bkey_extent_entry_for_each_from(_p, _entry, _p.start) ++ ++#define __bkey_for_each_ptr(_start, _end, _ptr) \ ++ for ((_ptr) = (_start); \ ++ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ ++ (_ptr)++) ++ ++#define bkey_ptr_next(_p, _ptr) \ ++ __bkey_ptr_next(_ptr, (_p).end) ++ ++#define bkey_for_each_ptr(_p, _ptr) \ ++ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) ++ ++#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ ++({ \ ++ __label__ out; \ ++ \ ++ (_ptr).idx = 0; \ ++ (_ptr).has_ec = false; \ ++ \ ++ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ ++ switch (extent_entry_type(_entry)) { \ ++ case BCH_EXTENT_ENTRY_ptr: \ ++ (_ptr).ptr = _entry->ptr; \ ++ goto out; \ ++ case BCH_EXTENT_ENTRY_crc32: \ ++ case BCH_EXTENT_ENTRY_crc64: \ ++ case BCH_EXTENT_ENTRY_crc128: \ ++ (_ptr).crc = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_entry)); \ ++ break; \ ++ case BCH_EXTENT_ENTRY_stripe_ptr: \ ++ (_ptr).ec = _entry->stripe_ptr; \ ++ (_ptr).has_ec = true; \ ++ break; \ ++ } \ ++out: \ ++ _entry < (_end); \ ++}) ++ ++#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ ++ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ ++ (_entry) = _start; \ ++ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ ++ _ptr, _entry) ++ ++#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ ++({ \ ++ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ ++ if (extent_entry_is_crc(_iter)) { \ ++ (_crc) = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_iter)); \ ++ break; \ ++ } \ ++ \ ++ (_iter) < (_end); \ ++}) ++ ++#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ ++ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ ++ (_iter) = (_start); \ ++ bkey_crc_next(_k, _start, _end, _crc, _iter); \ ++ (_iter) = extent_entry_next(_iter)) ++ ++#define bkey_for_each_crc(_k, _p, _crc, _iter) \ ++ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) ++ ++/* utility code common to all keys with pointers: */ ++ ++static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: { ++ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ return (struct bkey_ptrs_c) { ++ e.v->start, ++ extent_entry_last(e) ++ }; ++ } ++ case KEY_TYPE_stripe: { ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&s.v->ptrs[0]), ++ to_entry(&s.v->ptrs[s.v->nr_blocks]), ++ }; ++ } ++ case KEY_TYPE_reflink_v: { ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ return (struct bkey_ptrs_c) { ++ r.v->start, ++ bkey_val_end(r), ++ }; ++ } ++ default: ++ return (struct bkey_ptrs_c) { NULL, NULL }; ++ } ++} ++ ++static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ++{ ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); ++ ++ return (struct bkey_ptrs) { ++ (void *) p.start, ++ (void *) p.end ++ }; ++} ++ ++static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = 
(struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (!ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); ++unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c); ++unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); ++ ++void bch2_mark_io_failure(struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++ ++void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); ++void bch2_bkey_drop_device(struct bkey_s, unsigned); ++const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); ++bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); ++ ++void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); ++ ++/* bch_btree_ptr: */ ++ ++const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); ++void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); ++ ++#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_invalid, \ ++ .key_debugcheck = bch2_btree_ptr_debugcheck, \ ++ .val_to_text = bch2_btree_ptr_to_text, \ ++ .swab = bch2_ptr_swab, \ ++} ++ ++/* bch_extent: */ ++ ++const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); ++void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); ++enum merge_result bch2_extent_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_extent (struct bkey_ops) { \ ++ .key_invalid = bch2_extent_invalid, \ ++ .key_debugcheck = bch2_extent_debugcheck, \ ++ .val_to_text = bch2_extent_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .key_normalize = bch2_extent_normalize, \ ++ .key_merge = bch2_extent_merge, \ ++} ++ ++/* bch_reservation: */ ++ ++const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++enum merge_result bch2_reservation_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_reservation (struct bkey_ops) { \ ++ .key_invalid = bch2_reservation_invalid, \ ++ .val_to_text = bch2_reservation_to_text, \ ++ .key_merge = bch2_reservation_merge, \ ++} ++ ++int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, ++ struct bpos *); 
++int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); ++int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); ++ ++enum btree_insert_ret ++bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, ++ unsigned *); ++void bch2_insert_fixup_extent(struct btree_trans *, ++ struct btree_insert_entry *); ++ ++void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, ++ unsigned, unsigned); ++ ++const struct bch_extent_ptr * ++bch2_extent_has_device(struct bkey_s_c_extent, unsigned); ++ ++unsigned bch2_extent_is_compressed(struct bkey_s_c); ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, ++ struct bch_extent_ptr, u64); ++ ++static inline bool bkey_extent_is_direct_data(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool bkey_extent_is_data(const struct bkey *k) ++{ ++ return bkey_extent_is_direct_data(k) || ++ k->type == KEY_TYPE_reflink_p; ++} ++ ++/* ++ * Should extent be counted under inode->i_sectors? ++ */ ++static inline bool bkey_extent_is_allocation(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reservation: ++ case KEY_TYPE_reflink_p: ++ case KEY_TYPE_reflink_v: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++/* Extent entry iteration: */ ++ ++#define extent_for_each_entry_from(_e, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, \ ++ extent_entry_last(_e),_entry) ++ ++#define extent_for_each_entry(_e, _entry) \ ++ extent_for_each_entry_from(_e, _entry, (_e).v->start) ++ ++#define extent_ptr_next(_e, _ptr) \ ++ __bkey_ptr_next(_ptr, extent_entry_last(_e)) ++ ++#define extent_for_each_ptr(_e, _ptr) \ ++ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) ++ ++#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ ++ extent_entry_last(_e), _ptr, _entry) ++ ++void bch2_extent_crc_append(struct bkey_i *, ++ struct bch_extent_crc_unpacked); ++void bch2_extent_ptr_decoded_append(struct bkey_i *, ++ struct extent_ptr_decoded *); ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c, ++ struct bch_extent_crc_unpacked); ++bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); ++ ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, ++ struct bch_extent_ptr *); ++ ++#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ ++do { \ ++ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ ++ \ ++ _ptr = &_ptrs.start->ptr; \ ++ \ ++ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ ++ if (_cond) { \ ++ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ ++ _ptrs = bch2_bkey_ptrs(_k); \ ++ continue; \ ++ } \ ++ \ ++ (_ptr)++; \ ++ } \ ++} while (0) ++ ++void __bch2_cut_front(struct bpos, struct bkey_s); ++ ++static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) ++{ ++ __bch2_cut_front(where, bkey_i_to_s(k)); ++} ++ ++bool bch2_cut_back(struct bpos, struct bkey *); ++ ++/** ++ * bch_key_resize - adjust size of @k ++ * ++ * bkey_start_offset(k) will be preserved, modifies where the extent ends ++ */ ++static inline void bch2_key_resize(struct bkey *k, unsigned new_size) ++{ ++ k->p.offset -= k->size; ++ k->p.offset += new_size; ++ k->size = new_size; ++} ++ ++/* ++ * In extent_sort_fix_overlapping(), insert_fixup_extent(), ++ * extent_merge_inline() - we're modifying keys in place that are packed. 
To do ++ * that we have to unpack the key, modify the unpacked key - then this ++ * copies/repacks the unpacked to the original as necessary. ++ */ ++static inline void extent_save(struct btree *b, struct bkey_packed *dst, ++ struct bkey *src) ++{ ++ struct bkey_format *f = &b->format; ++ struct bkey_i *dst_unpacked; ++ ++ if ((dst_unpacked = packed_to_bkey(dst))) ++ dst_unpacked->k = *src; ++ else ++ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); ++} ++ ++bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); ++ ++#endif /* _BCACHEFS_EXTENTS_H */ +diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h +new file mode 100644 +index 000000000000..43d6c341ecca +--- /dev/null ++++ b/fs/bcachefs/extents_types.h +@@ -0,0 +1,40 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENTS_TYPES_H ++#define _BCACHEFS_EXTENTS_TYPES_H ++ ++#include "bcachefs_format.h" ++ ++struct bch_extent_crc_unpacked { ++ u32 compressed_size; ++ u32 uncompressed_size; ++ u32 live_size; ++ ++ u8 csum_type; ++ u8 compression_type; ++ ++ u16 offset; ++ ++ u16 nonce; ++ ++ struct bch_csum csum; ++}; ++ ++struct extent_ptr_decoded { ++ unsigned idx; ++ bool has_ec; ++ struct bch_extent_crc_unpacked crc; ++ struct bch_extent_ptr ptr; ++ struct bch_extent_stripe_ptr ec; ++}; ++ ++struct bch_io_failures { ++ u8 nr; ++ struct bch_dev_io_failures { ++ u8 dev; ++ u8 idx; ++ u8 nr_failed; ++ u8 nr_retries; ++ } devs[BCH_REPLICAS_MAX]; ++}; ++ ++#endif /* _BCACHEFS_EXTENTS_TYPES_H */ +diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h +new file mode 100644 +index 000000000000..26d5cad7e6a5 +--- /dev/null ++++ b/fs/bcachefs/eytzinger.h +@@ -0,0 +1,285 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _EYTZINGER_H ++#define _EYTZINGER_H ++ ++#include ++#include ++ ++#include "util.h" ++ ++/* ++ * Traversal for trees in eytzinger layout - a full binary tree layed out in an ++ * array ++ */ ++ ++/* ++ * One based indexing version: ++ * ++ * With one based indexing each level of the tree starts at a power of two - ++ * good for cacheline alignment: ++ * ++ * Size parameter is treated as if we were using 0 based indexing, however: ++ * valid nodes, and inorder indices, are in the range [1..size) - that is, there ++ * are actually size - 1 elements ++ */ ++ ++static inline unsigned eytzinger1_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + child; ++} ++ ++static inline unsigned eytzinger1_left_child(unsigned i) ++{ ++ return eytzinger1_child(i, 0); ++} ++ ++static inline unsigned eytzinger1_right_child(unsigned i) ++{ ++ return eytzinger1_child(i, 1); ++} ++ ++static inline unsigned eytzinger1_first(unsigned size) ++{ ++ return rounddown_pow_of_two(size - 1); ++} ++ ++static inline unsigned eytzinger1_last(unsigned size) ++{ ++ return rounddown_pow_of_two(size) - 1; ++} ++ ++/* ++ * eytzinger1_next() and eytzinger1_prev() have the nice properties that ++ * ++ * eytzinger1_next(0) == eytzinger1_first()) ++ * eytzinger1_prev(0) == eytzinger1_last()) ++ * ++ * eytzinger1_prev(eytzinger1_first()) == 0 ++ * eytzinger1_next(eytzinger1_last()) == 0 ++ */ ++ ++static inline unsigned eytzinger1_next(unsigned i, unsigned size) ++{ ++ EBUG_ON(i >= size); ++ ++ if (eytzinger1_right_child(i) < size) { ++ i = eytzinger1_right_child(i); ++ ++ i <<= __fls(size) - __fls(i); ++ i >>= i >= size; ++ } else { ++ i >>= ffz(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned 
eytzinger1_prev(unsigned i, unsigned size) ++{ ++ EBUG_ON(i >= size); ++ ++ if (eytzinger1_left_child(i) < size) { ++ i = eytzinger1_left_child(i) + 1; ++ ++ i <<= __fls(size) - __fls(i); ++ i -= 1; ++ i >>= i >= size; ++ } else { ++ i >>= __ffs(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_extra(unsigned size) ++{ ++ return (size - rounddown_pow_of_two(size - 1)) << 1; ++} ++ ++static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ unsigned b = __fls(i); ++ unsigned shift = __fls(size - 1) - b; ++ int s; ++ ++ EBUG_ON(!i || i >= size); ++ ++ i ^= 1U << b; ++ i <<= 1; ++ i |= 1; ++ i <<= shift; ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i -= (i - extra) >> 1; ++ */ ++ s = extra - i; ++ i += (s >> 1) & (s >> 31); ++ ++ return i; ++} ++ ++static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ unsigned shift; ++ int s; ++ ++ EBUG_ON(!i || i >= size); ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i += i - extra; ++ */ ++ s = extra - i; ++ i -= s & (s >> 31); ++ ++ shift = __ffs(i); ++ ++ i >>= shift + 1; ++ i |= 1U << (__fls(size - 1) - shift); ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); ++} ++ ++#define eytzinger1_for_each(_i, _size) \ ++ for ((_i) = eytzinger1_first((_size)); \ ++ (_i) != 0; \ ++ (_i) = eytzinger1_next((_i), (_size))) ++ ++/* Zero based indexing version: */ ++ ++static inline unsigned eytzinger0_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + 1 + child; ++} ++ ++static inline unsigned eytzinger0_left_child(unsigned i) ++{ ++ return eytzinger0_child(i, 0); ++} ++ ++static inline unsigned eytzinger0_right_child(unsigned i) ++{ ++ return eytzinger0_child(i, 1); ++} ++ ++static inline unsigned eytzinger0_first(unsigned size) ++{ ++ return eytzinger1_first(size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_last(unsigned size) ++{ ++ return eytzinger1_last(size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_next(unsigned i, unsigned size) ++{ ++ return eytzinger1_next(i + 1, size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_prev(unsigned i, unsigned size) ++{ ++ return eytzinger1_prev(i + 1, size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_extra(unsigned size) ++{ ++ return eytzinger1_extra(size + 1); ++} ++ ++static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; ++} ++ ++static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; ++} ++ ++static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); ++} ++ ++#define eytzinger0_for_each(_i, _size) \ ++ for ((_i) = eytzinger0_first((_size)); \ ++ (_i) != -1; \ ++ (_i) = eytzinger0_next((_i), (_size))) ++ ++typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); ++ ++/* return greatest node <= 
@search, or -1 if not found */ ++static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, ++ eytzinger_cmp_fn cmp, const void *search) ++{ ++ unsigned i, n = 0; ++ ++ if (!nr) ++ return -1; ++ ++ do { ++ i = n; ++ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); ++ } while (n < nr); ++ ++ if (n & 1) { ++ /* @i was greater than @search, return previous node: */ ++ ++ if (i == eytzinger0_first(nr)) ++ return -1; ++ ++ return eytzinger0_prev(i, nr); ++ } else { ++ return i; ++ } ++} ++ ++#define eytzinger0_find(base, nr, size, _cmp, search) \ ++({ \ ++ void *_base = (base); \ ++ void *_search = (search); \ ++ size_t _nr = (nr); \ ++ size_t _size = (size); \ ++ size_t _i = 0; \ ++ int _res; \ ++ \ ++ while (_i < _nr && \ ++ (_res = _cmp(_search, _base + _i * _size, _size))) \ ++ _i = eytzinger0_child(_i, _res > 0); \ ++ _i; \ ++}) ++ ++void eytzinger0_sort(void *, size_t, size_t, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++#endif /* _EYTZINGER_H */ +diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h +new file mode 100644 +index 000000000000..cdb272708a4b +--- /dev/null ++++ b/fs/bcachefs/fifo.h +@@ -0,0 +1,127 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FIFO_H ++#define _BCACHEFS_FIFO_H ++ ++#include "util.h" ++ ++#define FIFO(type) \ ++struct { \ ++ size_t front, back, size, mask; \ ++ type *data; \ ++} ++ ++#define DECLARE_FIFO(type, name) FIFO(type) name ++ ++#define fifo_buf_size(fifo) \ ++ ((fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ ++ : 0) ++ ++#define init_fifo(fifo, _size, _gfp) \ ++({ \ ++ (fifo)->front = (fifo)->back = 0; \ ++ (fifo)->size = (_size); \ ++ (fifo)->mask = (fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) - 1 \ ++ : 0; \ ++ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ ++}) ++ ++#define free_fifo(fifo) \ ++do { \ ++ kvpfree((fifo)->data, fifo_buf_size(fifo)); \ ++ (fifo)->data = NULL; \ ++} while (0) ++ ++#define fifo_swap(l, r) \ ++do { \ ++ swap((l)->front, (r)->front); \ ++ swap((l)->back, (r)->back); \ ++ swap((l)->size, (r)->size); \ ++ swap((l)->mask, (r)->mask); \ ++ swap((l)->data, (r)->data); \ ++} while (0) ++ ++#define fifo_move(dest, src) \ ++do { \ ++ typeof(*((dest)->data)) _t; \ ++ while (!fifo_full(dest) && \ ++ fifo_pop(src, _t)) \ ++ fifo_push(dest, _t); \ ++} while (0) ++ ++#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) ++#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) ++ ++#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) ++#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) ++ ++#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) ++#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) ++ ++#define fifo_entry_idx_abs(fifo, p) \ ++ ((((p) >= &fifo_peek_front(fifo) \ ++ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ ++ (((p) - (fifo)->data))) ++ ++#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) ++#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] ++ ++#define fifo_push_back_ref(f) \ ++ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) ++ ++#define fifo_push_front_ref(f) \ ++ (fifo_full((f)) ? 
NULL : &(f)->data[--(f)->front & (f)->mask]) ++ ++#define fifo_push_back(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_push_front(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_pop_front(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_pop_back(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) ++#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) ++#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) ++#define fifo_peek(fifo) fifo_peek_front(fifo) ++ ++#define fifo_for_each_entry(_entry, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#endif /* _BCACHEFS_FIFO_H */ +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +new file mode 100644 +index 000000000000..a4497eeb1f1b +--- /dev/null ++++ b/fs/bcachefs/fs-common.c +@@ -0,0 +1,281 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "fs-common.h" ++#include "inode.h" ++#include "xattr.h" ++ ++#include ++ ++int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, ++ struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *new_inode, ++ const struct qstr *name, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct posix_acl *default_acl, ++ struct posix_acl *acl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *dir_iter; ++ struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); ++ u64 now = bch2_current_time(trans->c); ++ int ret; ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); ++ if (IS_ERR(dir_iter)) ++ return PTR_ERR(dir_iter); ++ ++ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); ++ ++ if (!name) ++ new_inode->bi_flags |= BCH_INODE_UNLINKED; ++ ++ ret = bch2_inode_create(trans, new_inode, ++ BLOCKDEV_INODE_MAX, 0, ++ &c->unused_inode_hint); ++ if (ret) ++ return ret; ++ ++ if (default_acl) { ++ ret = bch2_set_acl_trans(trans, new_inode, &hash, ++ default_acl, ACL_TYPE_DEFAULT); ++ if (ret) ++ return ret; ++ } ++ ++ if (acl) { ++ ret = bch2_set_acl_trans(trans, new_inode, &hash, ++ acl, ACL_TYPE_ACCESS); ++ if (ret) ++ return ret; ++ } ++ ++ if (name) { ++ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ if (S_ISDIR(new_inode->bi_mode)) ++ dir_u->bi_nlink++; ++ ++ ret = bch2_inode_write(trans, dir_iter, dir_u); ++ if (ret) ++ return ret; ++ ++ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, ++ mode_to_type(new_inode->bi_mode), ++ name, new_inode->bi_inum, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, ++ u64 inum, struct 
bch_inode_unpacked *inode_u, ++ const struct qstr *name) ++{ ++ struct btree_iter *dir_iter, *inode_iter; ++ struct bch_inode_unpacked dir_u; ++ struct bch_hash_info dir_hash; ++ u64 now = bch2_current_time(trans->c); ++ ++ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); ++ if (IS_ERR(inode_iter)) ++ return PTR_ERR(inode_iter); ++ ++ inode_u->bi_ctime = now; ++ bch2_inode_nlink_inc(inode_u); ++ ++ dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); ++ if (IS_ERR(dir_iter)) ++ return PTR_ERR(dir_iter); ++ ++ /* XXX: shouldn't we be updating mtime/ctime on the directory? */ ++ ++ dir_hash = bch2_hash_info_init(trans->c, &dir_u); ++ bch2_trans_iter_put(trans, dir_iter); ++ ++ return bch2_dirent_create(trans, dir_inum, &dir_hash, ++ mode_to_type(inode_u->bi_mode), ++ name, inum, BCH_HASH_SET_MUST_CREATE) ?: ++ bch2_inode_write(trans, inode_iter, inode_u); ++} ++ ++int bch2_unlink_trans(struct btree_trans *trans, ++ u64 dir_inum, struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *inode_u, ++ const struct qstr *name) ++{ ++ struct btree_iter *dir_iter, *dirent_iter, *inode_iter; ++ struct bch_hash_info dir_hash; ++ u64 inum, now = bch2_current_time(trans->c); ++ struct bkey_s_c k; ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); ++ if (IS_ERR(dir_iter)) ++ return PTR_ERR(dir_iter); ++ ++ dir_hash = bch2_hash_info_init(trans->c, dir_u); ++ ++ dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, ++ name, BTREE_ITER_INTENT); ++ if (IS_ERR(dirent_iter)) ++ return PTR_ERR(dirent_iter); ++ ++ k = bch2_btree_iter_peek_slot(dirent_iter); ++ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); ++ ++ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); ++ if (IS_ERR(inode_iter)) ++ return PTR_ERR(inode_iter); ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; ++ dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); ++ bch2_inode_nlink_dec(inode_u); ++ ++ return (S_ISDIR(inode_u->bi_mode) ++ ? 
bch2_empty_dir_trans(trans, inum) ++ : 0) ?: ++ bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: ++ bch2_inode_write(trans, dir_iter, dir_u) ?: ++ bch2_inode_write(trans, inode_iter, inode_u); ++} ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, ++ struct bch_inode_unpacked *src_u) ++{ ++ u64 src, dst; ++ unsigned id; ++ bool ret = false; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ if (dst_u->bi_fields_set & (1 << id)) ++ continue; ++ ++ src = bch2_inode_opt_get(src_u, id); ++ dst = bch2_inode_opt_get(dst_u, id); ++ ++ if (src == dst) ++ continue; ++ ++ bch2_inode_opt_set(dst_u, id, src); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++int bch2_rename_trans(struct btree_trans *trans, ++ u64 src_dir, struct bch_inode_unpacked *src_dir_u, ++ u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, ++ struct bch_inode_unpacked *src_inode_u, ++ struct bch_inode_unpacked *dst_inode_u, ++ const struct qstr *src_name, ++ const struct qstr *dst_name, ++ enum bch_rename_mode mode) ++{ ++ struct btree_iter *src_dir_iter, *dst_dir_iter = NULL; ++ struct btree_iter *src_inode_iter, *dst_inode_iter = NULL; ++ struct bch_hash_info src_hash, dst_hash; ++ u64 src_inode, dst_inode, now = bch2_current_time(trans->c); ++ int ret; ++ ++ src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(src_dir_iter)) ++ return PTR_ERR(src_dir_iter); ++ ++ src_hash = bch2_hash_info_init(trans->c, src_dir_u); ++ ++ if (dst_dir != src_dir) { ++ dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(dst_dir_iter)) ++ return PTR_ERR(dst_dir_iter); ++ ++ dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); ++ } else { ++ dst_dir_u = src_dir_u; ++ dst_hash = src_hash; ++ } ++ ++ ret = bch2_dirent_rename(trans, ++ src_dir, &src_hash, ++ dst_dir, &dst_hash, ++ src_name, &src_inode, ++ dst_name, &dst_inode, ++ mode); ++ if (ret) ++ return ret; ++ ++ src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(src_inode_iter)) ++ return PTR_ERR(src_inode_iter); ++ ++ if (dst_inode) { ++ dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(dst_inode_iter)) ++ return PTR_ERR(dst_inode_iter); ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ if (S_ISDIR(src_inode_u->bi_mode) != ++ S_ISDIR(dst_inode_u->bi_mode)) ++ return -ENOTDIR; ++ ++ if (S_ISDIR(dst_inode_u->bi_mode) && ++ bch2_empty_dir_trans(trans, dst_inode)) ++ return -ENOTEMPTY; ++ } ++ ++ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && ++ S_ISDIR(src_inode_u->bi_mode)) ++ return -EXDEV; ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && ++ S_ISDIR(dst_inode_u->bi_mode)) ++ return -EXDEV; ++ ++ if (S_ISDIR(src_inode_u->bi_mode)) { ++ src_dir_u->bi_nlink--; ++ dst_dir_u->bi_nlink++; ++ } ++ ++ if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { ++ dst_dir_u->bi_nlink--; ++ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) ++ bch2_inode_nlink_dec(dst_inode_u); ++ ++ src_dir_u->bi_mtime = now; ++ src_dir_u->bi_ctime = now; ++ ++ if (src_dir != dst_dir) { ++ dst_dir_u->bi_mtime = now; ++ dst_dir_u->bi_ctime = now; ++ } ++ ++ src_inode_u->bi_ctime = now; ++ ++ if (dst_inode) ++ dst_inode_u->bi_ctime = now; ++ ++ return bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: ++ (src_dir != dst_dir ++ ? 
bch2_inode_write(trans, dst_dir_iter, dst_dir_u) ++ : 0 ) ?: ++ bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: ++ (dst_inode ++ ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) ++ : 0 ); ++} +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +new file mode 100644 +index 000000000000..c1621485a526 +--- /dev/null ++++ b/fs/bcachefs/fs-common.h +@@ -0,0 +1,36 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_COMMON_H ++#define _BCACHEFS_FS_COMMON_H ++ ++struct posix_acl; ++ ++int bch2_create_trans(struct btree_trans *, u64, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct posix_acl *, ++ struct posix_acl *); ++ ++int bch2_link_trans(struct btree_trans *, u64, ++ u64, struct bch_inode_unpacked *, ++ const struct qstr *); ++ ++int bch2_unlink_trans(struct btree_trans *, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *); ++ ++int bch2_rename_trans(struct btree_trans *, ++ u64, struct bch_inode_unpacked *, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ const struct qstr *, ++ enum bch_rename_mode); ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *); ++ ++#endif /* _BCACHEFS_FS_COMMON_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +new file mode 100644 +index 000000000000..f8e931e01fcc +--- /dev/null ++++ b/fs/bcachefs/fs-io.c +@@ -0,0 +1,3165 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "error.h" ++#include "extents.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "journal.h" ++#include "io.h" ++#include "keylist.h" ++#include "quota.h" ++#include "reflink.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++static inline bool bio_full(struct bio *bio, unsigned len) ++{ ++ if (bio->bi_vcnt >= bio->bi_max_vecs) ++ return true; ++ if (bio->bi_iter.bi_size > UINT_MAX - len) ++ return true; ++ return false; ++} ++ ++struct quota_res { ++ u64 sectors; ++}; ++ ++struct bch_writepage_io { ++ struct closure cl; ++ struct bch_inode_info *inode; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_write { ++ struct completion done; ++ struct kiocb *req; ++ struct mm_struct *mm; ++ unsigned loop:1, ++ sync:1, ++ free_iov:1; ++ struct quota_res quota_res; ++ ++ struct iov_iter iter; ++ struct iovec inline_vecs[2]; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_read { ++ struct closure cl; ++ struct kiocb *req; ++ long ret; ++ struct bch_read_bio rbio; ++}; ++ ++/* stub version */ ++static int add_to_page_cache_lru_vec(struct address_space *mapping, ++ struct page **pages, ++ unsigned nr_pages, ++ pgoff_t offset, gfp_t gfp_mask) ++{ ++ int i, err = 0; ++ ++ for (i = 0; i < nr_pages; i++) { ++ err = add_to_page_cache_lru(pages[i], mapping, ++ offset + i, gfp_mask); ++ if (err) ++ break; ++ } ++ ++ return i ?: err; ++} ++ ++/* pagecache_block must be held */ ++static int write_invalidate_inode_pages_range(struct address_space *mapping, ++ loff_t start, loff_t end) ++{ ++ int ret; ++ ++ /* ++ * XXX: the way this is currently implemented, we can spin if a process ++ * is 
continually redirtying a specific page ++ */ ++ do { ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = filemap_write_and_wait_range(mapping, start, end); ++ if (ret) ++ break; ++ ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = invalidate_inode_pages2_range(mapping, ++ start >> PAGE_SHIFT, ++ end >> PAGE_SHIFT); ++ } while (ret == -EBUSY); ++ ++ return ret; ++} ++ ++/* quotas */ ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++ if (!res->sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ BUG_ON(res->sectors > inode->ei_quota_reserved); ++ ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, ++ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); ++ inode->ei_quota_reserved -= res->sectors; ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ res->sectors = 0; ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ int ret; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, ++ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); ++ if (likely(!ret)) { ++ inode->ei_quota_reserved += sectors; ++ res->sectors += sectors; ++ } ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++#else ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ return 0; ++} ++ ++#endif ++ ++/* i_size updates: */ ++ ++struct inode_new_size { ++ loff_t new_size; ++ u64 now; ++ unsigned fields; ++}; ++ ++static int inode_set_size(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_new_size *s = p; ++ ++ bi->bi_size = s->new_size; ++ if (s->fields & ATTR_ATIME) ++ bi->bi_atime = s->now; ++ if (s->fields & ATTR_MTIME) ++ bi->bi_mtime = s->now; ++ if (s->fields & ATTR_CTIME) ++ bi->bi_ctime = s->now; ++ ++ return 0; ++} ++ ++int __must_check bch2_write_inode_size(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ loff_t new_size, unsigned fields) ++{ ++ struct inode_new_size s = { ++ .new_size = new_size, ++ .now = bch2_current_time(c), ++ .fields = fields, ++ }; ++ ++ return bch2_write_inode(c, inode, inode_set_size, &s, fields); ++} ++ ++static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, ++ struct quota_res *quota_res, s64 sectors) ++{ ++ if (!sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++#ifdef CONFIG_BCACHEFS_QUOTA ++ if (quota_res && sectors > 0) { ++ BUG_ON(sectors > quota_res->sectors); ++ BUG_ON(sectors > inode->ei_quota_reserved); ++ ++ quota_res->sectors -= sectors; ++ inode->ei_quota_reserved -= sectors; ++ } else { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); ++ } ++#endif ++ inode->v.i_blocks += sectors; ++ mutex_unlock(&inode->ei_quota_lock); ++} ++ ++/* page state: */ ++ ++/* stored in page->private: */ ++ ++struct bch_page_sector { ++ /* Uncompressed, fully allocated replicas: */ ++ unsigned nr_replicas:3; ++ ++ /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ ++ unsigned replicas_reserved:3; ++ ++ /* i_sectors: */ ++ enum { ++ SECTOR_UNALLOCATED, ++ SECTOR_RESERVED, ++ SECTOR_DIRTY, ++ SECTOR_ALLOCATED, ++ } state:2; ++}; ++ ++struct 
bch_page_state { ++ spinlock_t lock; ++ atomic_t write_count; ++ struct bch_page_sector s[PAGE_SECTORS]; ++}; ++ ++static inline struct bch_page_state *__bch2_page_state(struct page *page) ++{ ++ return page_has_private(page) ++ ? (struct bch_page_state *) page_private(page) ++ : NULL; ++} ++ ++static inline struct bch_page_state *bch2_page_state(struct page *page) ++{ ++ EBUG_ON(!PageLocked(page)); ++ ++ return __bch2_page_state(page); ++} ++ ++/* for newly allocated pages: */ ++static void __bch2_page_state_release(struct page *page) ++{ ++ struct bch_page_state *s = __bch2_page_state(page); ++ ++ if (!s) ++ return; ++ ++ ClearPagePrivate(page); ++ set_page_private(page, 0); ++ put_page(page); ++ kfree(s); ++} ++ ++static void bch2_page_state_release(struct page *page) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ ++ if (!s) ++ return; ++ ++ ClearPagePrivate(page); ++ set_page_private(page, 0); ++ put_page(page); ++ kfree(s); ++} ++ ++/* for newly allocated pages: */ ++static struct bch_page_state *__bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ struct bch_page_state *s; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS|gfp); ++ if (!s) ++ return NULL; ++ ++ spin_lock_init(&s->lock); ++ /* ++ * migrate_page_move_mapping() assumes that pages with private data ++ * have their count elevated by 1. ++ */ ++ get_page(page); ++ set_page_private(page, (unsigned long) s); ++ SetPagePrivate(page); ++ return s; ++} ++ ++static struct bch_page_state *bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); ++} ++ ++static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) ++{ ++ /* XXX: this should not be open coded */ ++ return inode->ei_inode.bi_data_replicas ++ ? inode->ei_inode.bi_data_replicas - 1 ++ : c->opts.data_replicas; ++} ++ ++static inline unsigned sectors_to_reserve(struct bch_page_sector *s, ++ unsigned nr_replicas) ++{ ++ return max(0, (int) nr_replicas - ++ s->nr_replicas - ++ s->replicas_reserved); ++} ++ ++static int bch2_get_page_disk_reservation(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct page *page, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned nr_replicas = inode_nr_replicas(c, inode); ++ struct disk_reservation disk_res = { 0 }; ++ unsigned i, disk_res_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ if (!disk_res_sectors) ++ return 0; ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ disk_res_sectors, 1, ++ !check_enospc ++ ? 
BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ s->s[i].replicas_reserved += ++ sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ return 0; ++} ++ ++struct bch2_page_reservation { ++ struct disk_reservation disk; ++ struct quota_res quota; ++}; ++ ++static void bch2_page_reservation_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ memset(res, 0, sizeof(*res)); ++ ++ res->disk.nr_replicas = inode_nr_replicas(c, inode); ++} ++ ++static void bch2_page_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ bch2_disk_reservation_put(c, &res->disk); ++ bch2_quota_reservation_put(c, inode, &res->quota); ++} ++ ++static int bch2_page_reservation_get(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned i, disk_sectors = 0, quota_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ disk_sectors += sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; ++ } ++ ++ if (disk_sectors) { ++ ret = bch2_disk_reservation_add(c, &res->disk, ++ disk_sectors, ++ !check_enospc ++ ? BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ if (quota_sectors) { ++ ret = bch2_quota_reservation_add(c, inode, &res->quota, ++ quota_sectors, ++ check_enospc); ++ if (unlikely(ret)) { ++ struct disk_reservation tmp = { ++ .sectors = disk_sectors ++ }; ++ ++ bch2_disk_reservation_put(c, &tmp); ++ res->disk.sectors -= disk_sectors; ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_clear_page_bits(struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_page_state *s = bch2_page_state(page); ++ struct disk_reservation disk_res = { 0 }; ++ int i, dirty_sectors = 0; ++ ++ if (!s) ++ return; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(PageWriteback(page)); ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) { ++ disk_res.sectors += s->s[i].replicas_reserved; ++ s->s[i].replicas_reserved = 0; ++ ++ if (s->s[i].state == SECTOR_DIRTY) { ++ dirty_sectors++; ++ s->s[i].state = SECTOR_UNALLOCATED; ++ } ++ } ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (dirty_sectors) ++ i_sectors_acct(c, inode, NULL, -dirty_sectors); ++ ++ bch2_page_state_release(page); ++} ++ ++static void bch2_set_page_dirty(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i, dirty_sectors = 0; ++ ++ WARN_ON((u64) page_offset(page) + offset + len > ++ round_up((u64) i_size_read(&inode->v), block_bytes(c))); ++ ++ spin_lock(&s->lock); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ unsigned sectors = sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ ++ /* ++ * This can happen if we race with the error path in ++ * bch2_writepage_io_done(): ++ */ ++ sectors = min_t(unsigned, sectors, res->disk.sectors); ++ ++ s->s[i].replicas_reserved += 
sectors; ++ res->disk.sectors -= sectors; ++ ++ if (s->s[i].state == SECTOR_UNALLOCATED) ++ dirty_sectors++; ++ ++ s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); ++ } ++ ++ spin_unlock(&s->lock); ++ ++ if (dirty_sectors) ++ i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ ++ if (!PageDirty(page)) ++ __set_page_dirty_nobuffers(page); ++} ++ ++vm_fault_t bch2_page_fault(struct vm_fault *vmf) ++{ ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ int ret; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ret = filemap_fault(vmf); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return ret; ++} ++ ++vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) ++{ ++ struct page *page = vmf->page; ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation res; ++ unsigned len; ++ loff_t isize; ++ int ret = VM_FAULT_LOCKED; ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ sb_start_pagefault(inode->v.i_sb); ++ file_update_time(file); ++ ++ /* ++ * Not strictly necessary, but helps avoid dio writes livelocking in ++ * write_invalidate_inode_pages_range() - can drop this if/when we get ++ * a write_invalidate_inode_pages_range() that works without dropping ++ * page lock before invalidating page ++ */ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ lock_page(page); ++ isize = i_size_read(&inode->v); ++ ++ if (page->mapping != mapping || page_offset(page) >= isize) { ++ unlock_page(page); ++ ret = VM_FAULT_NOPAGE; ++ goto out; ++ } ++ ++ len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); ++ ++ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { ++ unlock_page(page); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ ++ bch2_set_page_dirty(c, inode, page, &res, 0, len); ++ bch2_page_reservation_put(c, inode, &res); ++ ++ wait_for_stable_page(page); ++out: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ sb_end_pagefault(inode->v.i_sb); ++ ++ return ret; ++} ++ ++void bch2_invalidatepage(struct page *page, unsigned int offset, ++ unsigned int length) ++{ ++ if (offset || length < PAGE_SIZE) ++ return; ++ ++ bch2_clear_page_bits(page); ++} ++ ++int bch2_releasepage(struct page *page, gfp_t gfp_mask) ++{ ++ if (PageDirty(page)) ++ return 0; ++ ++ bch2_clear_page_bits(page); ++ return 1; ++} ++ ++#ifdef CONFIG_MIGRATION ++int bch2_migrate_page(struct address_space *mapping, struct page *newpage, ++ struct page *page, enum migrate_mode mode) ++{ ++ int ret; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(!PageLocked(newpage)); ++ ++ ret = migrate_page_move_mapping(mapping, newpage, page, 0); ++ if (ret != MIGRATEPAGE_SUCCESS) ++ return ret; ++ ++ if (PagePrivate(page)) { ++ ClearPagePrivate(page); ++ get_page(newpage); ++ set_page_private(newpage, page_private(page)); ++ set_page_private(page, 0); ++ put_page(page); ++ SetPagePrivate(newpage); ++ } ++ ++ if (mode != MIGRATE_SYNC_NO_COPY) ++ migrate_page_copy(newpage, page); ++ else ++ migrate_page_states(newpage, page); ++ return MIGRATEPAGE_SUCCESS; ++} ++#endif ++ ++/* readpage(s): */ ++ ++static void bch2_readpages_end_io(struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) { ++ struct page *page = bv->bv_page; ++ ++ if (!bio->bi_status) { ++ SetPageUptodate(page); ++ } else { ++ 
ClearPageUptodate(page); ++ SetPageError(page); ++ } ++ unlock_page(page); ++ } ++ ++ bio_put(bio); ++} ++ ++static inline void page_state_init_for_read(struct page *page) ++{ ++ SetPagePrivate(page); ++ page->private = 0; ++} ++ ++struct readpages_iter { ++ struct address_space *mapping; ++ struct page **pages; ++ unsigned nr_pages; ++ unsigned nr_added; ++ unsigned idx; ++ pgoff_t offset; ++}; ++ ++static int readpages_iter_init(struct readpages_iter *iter, ++ struct address_space *mapping, ++ struct list_head *pages, unsigned nr_pages) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->mapping = mapping; ++ iter->offset = list_last_entry(pages, struct page, lru)->index; ++ ++ iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); ++ if (!iter->pages) ++ return -ENOMEM; ++ ++ while (!list_empty(pages)) { ++ struct page *page = list_last_entry(pages, struct page, lru); ++ ++ __bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ iter->pages[iter->nr_pages++] = page; ++ list_del(&page->lru); ++ } ++ ++ return 0; ++} ++ ++static inline struct page *readpage_iter_next(struct readpages_iter *iter) ++{ ++ struct page *page; ++ unsigned i; ++ int ret; ++ ++ BUG_ON(iter->idx > iter->nr_added); ++ BUG_ON(iter->nr_added > iter->nr_pages); ++ ++ if (iter->idx < iter->nr_added) ++ goto out; ++ ++ while (1) { ++ if (iter->idx == iter->nr_pages) ++ return NULL; ++ ++ ret = add_to_page_cache_lru_vec(iter->mapping, ++ iter->pages + iter->nr_added, ++ iter->nr_pages - iter->nr_added, ++ iter->offset + iter->nr_added, ++ GFP_NOFS); ++ if (ret > 0) ++ break; ++ ++ page = iter->pages[iter->nr_added]; ++ iter->idx++; ++ iter->nr_added++; ++ ++ __bch2_page_state_release(page); ++ put_page(page); ++ } ++ ++ iter->nr_added += ret; ++ ++ for (i = iter->idx; i < iter->nr_added; i++) ++ put_page(iter->pages[i]); ++out: ++ EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); ++ ++ return iter->pages[iter->idx]; ++} ++ ++static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) ++{ ++ struct bvec_iter iter; ++ struct bio_vec bv; ++ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ++ ? 0 : bch2_bkey_nr_ptrs_allocated(k); ++ unsigned state = k.k->type == KEY_TYPE_reservation ++ ? 
SECTOR_RESERVED ++ : SECTOR_ALLOCATED; ++ ++ bio_for_each_segment(bv, bio, iter) { ++ struct bch_page_state *s = bch2_page_state(bv.bv_page); ++ unsigned i; ++ ++ for (i = bv.bv_offset >> 9; ++ i < (bv.bv_offset + bv.bv_len) >> 9; ++ i++) { ++ s->s[i].nr_replicas = nr_ptrs; ++ s->s[i].state = state; ++ } ++ } ++} ++ ++static void readpage_bio_extend(struct readpages_iter *iter, ++ struct bio *bio, ++ unsigned sectors_this_extent, ++ bool get_more) ++{ ++ while (bio_sectors(bio) < sectors_this_extent && ++ bio->bi_vcnt < bio->bi_max_vecs) { ++ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; ++ struct page *page = readpage_iter_next(iter); ++ int ret; ++ ++ if (page) { ++ if (iter->offset + iter->idx != page_offset) ++ break; ++ ++ iter->idx++; ++ } else { ++ if (!get_more) ++ break; ++ ++ page = xa_load(&iter->mapping->i_pages, page_offset); ++ if (page && !xa_is_value(page)) ++ break; ++ ++ page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); ++ if (!page) ++ break; ++ ++ if (!__bch2_page_state_create(page, 0)) { ++ put_page(page); ++ break; ++ } ++ ++ ret = add_to_page_cache_lru(page, iter->mapping, ++ page_offset, GFP_NOFS); ++ if (ret) { ++ __bch2_page_state_release(page); ++ put_page(page); ++ break; ++ } ++ ++ put_page(page); ++ } ++ ++ BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); ++ } ++} ++ ++static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, ++ struct bch_read_bio *rbio, u64 inum, ++ struct readpages_iter *readpages_iter) ++{ ++ struct bch_fs *c = trans->c; ++ int flags = BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE; ++ int ret = 0; ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++retry: ++ while (1) { ++ BKEY_PADDED(k) tmp; ++ struct bkey_s_c k; ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bch2_btree_iter_set_pos(iter, ++ POS(inum, rbio->bio.bi_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(trans, ++ &offset_into_extent, &tmp.k); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bch2_trans_unlock(trans); ++ ++ if (readpages_iter) { ++ bool want_full_extent = false; ++ ++ if (bkey_extent_is_data(k.k)) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *i; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, i) ++ want_full_extent |= ((p.crc.csum_type != 0) | ++ (p.crc.compression_type != 0)); ++ } ++ ++ readpage_bio_extend(readpages_iter, &rbio->bio, ++ sectors, want_full_extent); ++ } ++ ++ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ ++ if (rbio->bio.bi_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ if (bkey_extent_is_allocation(k.k)) ++ bch2_add_page_sectors(&rbio->bio, k); ++ ++ bch2_read_extent(c, rbio, k, offset_into_extent, flags); ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ return; ++ ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ bio_advance(&rbio->bio, bytes); ++ } ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); ++ bio_endio(&rbio->bio); ++} ++ ++int bch2_readpages(struct file *file, struct address_space *mapping, ++ struct list_head *pages, unsigned nr_pages) ++{ ++ struct bch_inode_info *inode = 
to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct page *page; ++ struct readpages_iter readpages_iter; ++ int ret; ++ ++ ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); ++ BUG_ON(ret); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS); ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ while ((page = readpage_iter_next(&readpages_iter))) { ++ pgoff_t index = readpages_iter.offset + readpages_iter.idx; ++ unsigned n = min_t(unsigned, ++ readpages_iter.nr_pages - ++ readpages_iter.idx, ++ BIO_MAX_VECS); ++ struct bch_read_bio *rbio = ++ rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), ++ opts); ++ ++ readpages_iter.idx++; ++ ++ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); ++ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bchfs_read(&trans, iter, rbio, inode->v.i_ino, ++ &readpages_iter); ++ } ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_trans_exit(&trans); ++ kfree(readpages_iter.pages); ++ ++ return 0; ++} ++ ++static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, ++ u64 inum, struct page *page) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ ++ bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); ++ rbio->bio.bi_iter.bi_sector = ++ (sector_t) page->index << PAGE_SECTOR_SHIFT; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS); ++ ++ bchfs_read(&trans, iter, rbio, inum, NULL); ++ ++ bch2_trans_exit(&trans); ++} ++ ++int bch2_readpage(struct file *file, struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct bch_read_bio *rbio; ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ ++ __bchfs_readpage(c, rbio, inode->v.i_ino, page); ++ return 0; ++} ++ ++static void bch2_read_single_page_end_io(struct bio *bio) ++{ ++ complete(bio->bi_private); ++} ++ ++static int bch2_read_single_page(struct page *page, ++ struct address_space *mapping) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_read_bio *rbio; ++ int ret; ++ DECLARE_COMPLETION_ONSTACK(done); ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), ++ io_opts(c, &inode->ei_inode)); ++ rbio->bio.bi_private = &done; ++ rbio->bio.bi_end_io = bch2_read_single_page_end_io; ++ ++ __bchfs_readpage(c, rbio, inode->v.i_ino, page); ++ wait_for_completion(&done); ++ ++ ret = blk_status_to_errno(rbio->bio.bi_status); ++ bio_put(&rbio->bio); ++ ++ if (ret < 0) ++ return ret; ++ ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* writepages: */ ++ ++struct bch_writepage_state { ++ struct bch_writepage_io *io; ++ struct bch_io_opts opts; ++}; ++ ++static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, ++ struct bch_inode_info *inode) ++{ ++ return (struct bch_writepage_state) { ++ 
.opts = io_opts(c, &inode->ei_inode) ++ }; ++} ++ ++static void bch2_writepage_io_free(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ ++ bio_put(&io->op.wbio.bio); ++} ++ ++static void bch2_writepage_io_done(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ struct bch_fs *c = io->op.c; ++ struct bio *bio = &io->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bvec; ++ unsigned i; ++ ++ if (io->op.error) { ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ SetPageError(bvec->bv_page); ++ mapping_set_error(bvec->bv_page->mapping, -EIO); ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ /* ++ * racing with fallocate can cause us to add fewer sectors than ++ * expected - but we shouldn't add more sectors than expected: ++ */ ++ BUG_ON(io->op.i_sectors_delta > 0); ++ ++ /* ++ * (error (due to going RO) halfway through a page can screw that up ++ * slightly) ++ * XXX wtf? ++ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); ++ */ ++ ++ /* ++ * PageWriteback is effectively our ref on the inode - fixup i_blocks ++ * before calling end_page_writeback: ++ */ ++ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); ++ ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s = __bch2_page_state(bvec->bv_page); ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(bvec->bv_page); ++ } ++ ++ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); ++} ++ ++static void bch2_writepage_do_io(struct bch_writepage_state *w) ++{ ++ struct bch_writepage_io *io = w->io; ++ ++ w->io = NULL; ++ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); ++ continue_at(&io->cl, bch2_writepage_io_done, NULL); ++} ++ ++/* ++ * Get a bch_writepage_io and add @page to it - appending to an existing one if ++ * possible, else allocating a new one: ++ */ ++static void bch2_writepage_io_alloc(struct bch_fs *c, ++ struct bch_writepage_state *w, ++ struct bch_inode_info *inode, ++ u64 sector, ++ unsigned nr_replicas) ++{ ++ struct bch_write_op *op; ++ ++ w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, ++ &c->writepage_bioset), ++ struct bch_writepage_io, op.wbio.bio); ++ ++ closure_init(&w->io->cl, NULL); ++ w->io->inode = inode; ++ ++ op = &w->io->op; ++ bch2_write_op_init(op, c, w->opts); ++ op->target = w->opts.foreground_target; ++ op_journal_seq_set(op, &inode->ei_journal_seq); ++ op->nr_replicas = nr_replicas; ++ op->res.nr_replicas = nr_replicas; ++ op->write_point = writepoint_hashed(inode->ei_last_dirtied); ++ op->pos = POS(inode->v.i_ino, sector); ++ op->wbio.bio.bi_iter.bi_sector = sector; ++} ++ ++static int __bch2_writepage(struct page *page, ++ struct writeback_control *wbc, ++ void *data) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_writepage_state *w = data; ++ struct bch_page_state *s, orig; ++ unsigned i, offset, nr_replicas_this_write = U32_MAX; ++ loff_t i_size = i_size_read(&inode->v); ++ pgoff_t end_index = i_size >> PAGE_SHIFT; ++ int ret; ++ ++ EBUG_ON(!PageUptodate(page)); ++ ++ /* Is the page fully inside i_size? */ ++ if (page->index < end_index) ++ goto do_io; ++ ++ /* Is the page fully outside i_size? 
(truncate in progress) */ ++ offset = i_size & (PAGE_SIZE - 1); ++ if (page->index > end_index || !offset) { ++ unlock_page(page); ++ return 0; ++ } ++ ++ /* ++ * The page straddles i_size. It must be zeroed out on each and every ++ * writepage invocation because it may be mmapped. "A file is mapped ++ * in multiples of the page size. For a file that is not a multiple of ++ * the page size, the remaining memory is zeroed when mapped, and ++ * writes to that region are not written out to the file." ++ */ ++ zero_user_segment(page, offset, PAGE_SIZE); ++do_io: ++ s = bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ ret = bch2_get_page_disk_reservation(c, inode, page, true); ++ if (ret) { ++ SetPageError(page); ++ mapping_set_error(page->mapping, ret); ++ unlock_page(page); ++ return 0; ++ } ++ ++ /* Before unlocking the page, get copy of reservations: */ ++ orig = *s; ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ nr_replicas_this_write = ++ min_t(unsigned, nr_replicas_this_write, ++ s->s[i].nr_replicas + ++ s->s[i].replicas_reserved); ++ } ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ s->s[i].nr_replicas = w->opts.compression ++ ? 0 : nr_replicas_this_write; ++ ++ s->s[i].replicas_reserved = 0; ++ s->s[i].state = SECTOR_ALLOCATED; ++ } ++ ++ BUG_ON(atomic_read(&s->write_count)); ++ atomic_set(&s->write_count, 1); ++ ++ BUG_ON(PageWriteback(page)); ++ set_page_writeback(page); ++ ++ unlock_page(page); ++ ++ offset = 0; ++ while (1) { ++ unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; ++ u64 sector; ++ ++ while (offset < PAGE_SECTORS && ++ orig.s[offset].state < SECTOR_DIRTY) ++ offset++; ++ ++ if (offset == PAGE_SECTORS) ++ break; ++ ++ sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; ++ ++ while (offset + sectors < PAGE_SECTORS && ++ orig.s[offset + sectors].state >= SECTOR_DIRTY) ++ sectors++; ++ ++ for (i = offset; i < offset + sectors; i++) { ++ reserved_sectors += orig.s[i].replicas_reserved; ++ dirty_sectors += orig.s[i].state == SECTOR_DIRTY; ++ } ++ ++ if (w->io && ++ (w->io->op.res.nr_replicas != nr_replicas_this_write || ++ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || ++ w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) || ++ bio_end_sector(&w->io->op.wbio.bio) != sector)) ++ bch2_writepage_do_io(w); ++ ++ if (!w->io) ++ bch2_writepage_io_alloc(c, w, inode, sector, ++ nr_replicas_this_write); ++ ++ atomic_inc(&s->write_count); ++ ++ BUG_ON(inode != w->io->inode); ++ BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, ++ sectors << 9, offset << 9)); ++ ++ /* Check for writing past i_size: */ ++ WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > ++ round_up(i_size, block_bytes(c))); ++ ++ w->io->op.res.sectors += reserved_sectors; ++ w->io->op.i_sectors_delta -= dirty_sectors; ++ w->io->op.new_i_size = i_size; ++ ++ if (wbc->sync_mode == WB_SYNC_ALL) ++ w->io->op.wbio.bio.bi_opf |= REQ_SYNC; ++ ++ offset += sectors; ++ } ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(page); ++ ++ return 0; ++} ++ ++int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) ++{ ++ struct bch_fs *c = mapping->host->i_sb->s_fs_info; ++ struct bch_writepage_state w = ++ bch_writepage_state_init(c, to_bch_ei(mapping->host)); ++ struct blk_plug plug; ++ int ret; ++ ++ blk_start_plug(&plug); ++ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); ++ if (w.io) ++ bch2_writepage_do_io(&w); ++ blk_finish_plug(&plug); ++ return 
ret; ++} ++ ++int bch2_writepage(struct page *page, struct writeback_control *wbc) ++{ ++ struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; ++ struct bch_writepage_state w = ++ bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); ++ int ret; ++ ++ ret = __bch2_writepage(page, wbc, &w); ++ if (w.io) ++ bch2_writepage_do_io(&w); ++ ++ return ret; ++} ++ ++/* buffered writes: */ ++ ++int bch2_write_begin(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res; ++ pgoff_t index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ struct page *page; ++ int ret = -ENOMEM; ++ ++ res = kmalloc(sizeof(*res), GFP_KERNEL); ++ if (!res) ++ return -ENOMEM; ++ ++ bch2_page_reservation_init(c, inode, res); ++ *fsdata = res; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ page = grab_cache_page_write_begin(mapping, index, flags); ++ if (!page) ++ goto err_unlock; ++ ++ if (PageUptodate(page)) ++ goto out; ++ ++ /* If we're writing entire page, don't need to read it in first: */ ++ if (len == PAGE_SIZE) ++ goto out; ++ ++ if (!offset && pos + len >= inode->v.i_size) { ++ zero_user_segment(page, len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++ ++ if (index > inode->v.i_size >> PAGE_SHIFT) { ++ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++readpage: ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto err; ++out: ++ ret = bch2_page_reservation_get(c, inode, page, res, ++ offset, len, true); ++ if (ret) { ++ if (!PageUptodate(page)) { ++ /* ++ * If the page hasn't been read in, we won't know if we ++ * actually need a reservation - we don't actually need ++ * to read here, we just need to check if the page is ++ * fully backed by uncompressed data: ++ */ ++ goto readpage; ++ } ++ ++ goto err; ++ } ++ ++ *pagep = page; ++ return 0; ++err: ++ unlock_page(page); ++ put_page(page); ++ *pagep = NULL; ++err_unlock: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ kfree(res); ++ *fsdata = NULL; ++ return ret; ++} ++ ++int bch2_write_end(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res = fsdata; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ if (unlikely(copied < len && !PageUptodate(page))) { ++ /* ++ * The page needs to be read in, but that would destroy ++ * our partial write - simplest thing is to just force ++ * userspace to redo the write: ++ */ ++ zero_user(page, 0, PAGE_SIZE); ++ flush_dcache_page(page); ++ copied = 0; ++ } ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ if (copied) { ++ if (!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, res, offset, copied); ++ ++ inode->ei_last_dirtied = (unsigned long) current; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_page_reservation_put(c, inode, res); ++ kfree(res); ++ ++ return copied; ++} ++ ++#define 
WRITE_BATCH_PAGES 32 ++ ++static int __bch2_buffered_write(struct bch_inode_info *inode, ++ struct address_space *mapping, ++ struct iov_iter *iter, ++ loff_t pos, unsigned len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct page *pages[WRITE_BATCH_PAGES]; ++ struct bch2_page_reservation res; ++ unsigned long index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); ++ unsigned i, reserved = 0, set_dirty = 0; ++ unsigned copied = 0, nr_pages_copied = 0; ++ int ret = 0; ++ ++ BUG_ON(!len); ++ BUG_ON(nr_pages > ARRAY_SIZE(pages)); ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ for (i = 0; i < nr_pages; i++) { ++ pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); ++ if (!pages[i]) { ++ nr_pages = i; ++ if (!i) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ len = min_t(unsigned, len, ++ nr_pages * PAGE_SIZE - offset); ++ break; ++ } ++ } ++ ++ if (offset && !PageUptodate(pages[0])) { ++ ret = bch2_read_single_page(pages[0], mapping); ++ if (ret) ++ goto out; ++ } ++ ++ if ((pos + len) & (PAGE_SIZE - 1) && ++ !PageUptodate(pages[nr_pages - 1])) { ++ if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { ++ zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); ++ } else { ++ ret = bch2_read_single_page(pages[nr_pages - 1], mapping); ++ if (ret) ++ goto out; ++ } ++ } ++ ++ while (reserved < len) { ++ struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - reserved, ++ PAGE_SIZE - pg_offset); ++retry_reservation: ++ ret = bch2_page_reservation_get(c, inode, page, &res, ++ pg_offset, pg_len, true); ++ ++ if (ret && !PageUptodate(page)) { ++ ret = bch2_read_single_page(page, mapping); ++ if (!ret) ++ goto retry_reservation; ++ } ++ ++ if (ret) ++ goto out; ++ ++ reserved += pg_len; ++ } ++ ++ if (mapping_writably_mapped(mapping)) ++ for (i = 0; i < nr_pages; i++) ++ flush_dcache_page(pages[i]); ++ ++ while (copied < len) { ++ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - copied, ++ PAGE_SIZE - pg_offset); ++ unsigned pg_copied = copy_page_from_iter_atomic(page, ++ pg_offset, pg_len,iter); ++ ++ if (!pg_copied) ++ break; ++ ++ flush_dcache_page(page); ++ copied += pg_copied; ++ } ++ ++ if (!copied) ++ goto out; ++ ++ if (copied < len && ++ ((offset + copied) & (PAGE_SIZE - 1))) { ++ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; ++ ++ if (!PageUptodate(page)) { ++ zero_user(page, 0, PAGE_SIZE); ++ copied -= (offset + copied) & (PAGE_SIZE - 1); ++ } ++ } ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ while (set_dirty < copied) { ++ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, copied - set_dirty, ++ PAGE_SIZE - pg_offset); ++ ++ if (!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); ++ unlock_page(page); ++ put_page(page); ++ ++ set_dirty += pg_len; ++ } ++ ++ nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); ++ inode->ei_last_dirtied = (unsigned long) current; ++out: ++ for (i = nr_pages_copied; i < nr_pages; i++) { ++ unlock_page(pages[i]); ++ 
put_page(pages[i]); ++ } ++ ++ bch2_page_reservation_put(c, inode, &res); ++ ++ return copied ?: ret; ++} ++ ++static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ loff_t pos = iocb->ki_pos; ++ ssize_t written = 0; ++ int ret = 0; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ do { ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE * WRITE_BATCH_PAGES - offset); ++again: ++ /* ++ * Bring in the user page that we will copy from _first_. ++ * Otherwise there's a nasty deadlock on copying from the ++ * same page as we're writing to, without it being marked ++ * up-to-date. ++ * ++ * Not only is this an optimisation, but it is also required ++ * to check that the address is actually valid, when atomic ++ * usercopies are used, below. ++ */ ++ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { ++ bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE - offset); ++ ++ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { ++ ret = -EFAULT; ++ break; ++ } ++ } ++ ++ if (unlikely(fatal_signal_pending(current))) { ++ ret = -EINTR; ++ break; ++ } ++ ++ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); ++ if (unlikely(ret < 0)) ++ break; ++ ++ cond_resched(); ++ ++ if (unlikely(ret == 0)) { ++ /* ++ * If we were unable to copy any data at all, we must ++ * fall back to a single segment length write. ++ * ++ * If we didn't fallback here, we could livelock ++ * because not all segments in the iov can be copied at ++ * once without a pagefault. ++ */ ++ bytes = min_t(unsigned long, PAGE_SIZE - offset, ++ iov_iter_single_seg_count(iter)); ++ goto again; ++ } ++ pos += ret; ++ written += ret; ++ ++ balance_dirty_pages_ratelimited(mapping); ++ } while (iov_iter_count(iter)); ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return written ? 
written : ret; ++} ++ ++/* O_DIRECT reads */ ++ ++static void bch2_dio_read_complete(struct closure *cl) ++{ ++ struct dio_read *dio = container_of(cl, struct dio_read, cl); ++ ++ dio->req->ki_complete(dio->req, dio->ret); ++ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ ++} ++ ++static void bch2_direct_IO_read_endio(struct bio *bio) ++{ ++ struct dio_read *dio = bio->bi_private; ++ ++ if (bio->bi_status) ++ dio->ret = blk_status_to_errno(bio->bi_status); ++ ++ closure_put(&dio->cl); ++} ++ ++static void bch2_direct_IO_read_split_endio(struct bio *bio) ++{ ++ bch2_direct_IO_read_endio(bio); ++ bio_check_pages_dirty(bio); /* transfers ownership */ ++} ++ ++static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct dio_read *dio; ++ struct bio *bio; ++ loff_t offset = req->ki_pos; ++ bool sync = is_sync_kiocb(req); ++ size_t shorten; ++ ssize_t ret; ++ ++ if ((offset|iter->count) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ ret = min_t(loff_t, iter->count, ++ max_t(loff_t, 0, i_size_read(&inode->v) - offset)); ++ ++ if (!ret) ++ return ret; ++ ++ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); ++ iter->count -= shorten; ++ ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_VECS), ++ &c->dio_read_bioset); ++ ++ bio->bi_end_io = bch2_direct_IO_read_endio; ++ ++ dio = container_of(bio, struct dio_read, rbio.bio); ++ closure_init(&dio->cl, NULL); ++ ++ /* ++ * this is a _really_ horrible hack just to avoid an atomic sub at the ++ * end: ++ */ ++ if (!sync) { ++ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); ++ atomic_set(&dio->cl.remaining, ++ CLOSURE_REMAINING_INITIALIZER - ++ CLOSURE_RUNNING + ++ CLOSURE_DESTRUCTOR); ++ } else { ++ atomic_set(&dio->cl.remaining, ++ CLOSURE_REMAINING_INITIALIZER + 1); ++ } ++ ++ dio->req = req; ++ dio->ret = ret; ++ ++ goto start; ++ while (iter->count) { ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_VECS), ++ &c->bio_read); ++ bio->bi_end_io = bch2_direct_IO_read_split_endio; ++start: ++ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); ++ bio->bi_iter.bi_sector = offset >> 9; ++ bio->bi_private = dio; ++ ++ ret = bio_iov_iter_get_pages(bio, iter); ++ if (ret < 0) { ++ /* XXX: fault inject this path */ ++ bio->bi_status = BLK_STS_RESOURCE; ++ bio_endio(bio); ++ break; ++ } ++ ++ offset += bio->bi_iter.bi_size; ++ bio_set_pages_dirty(bio); ++ ++ if (iter->count) ++ closure_get(&dio->cl); ++ ++ bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); ++ } ++ ++ iter->count += shorten; ++ ++ if (sync) { ++ closure_sync(&dio->cl); ++ closure_debug_destroy(&dio->cl); ++ ret = dio->ret; ++ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ ++ return ret; ++ } else { ++ return -EIOCBQUEUED; ++ } ++} ++ ++ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ size_t count = iov_iter_count(iter); ++ ssize_t ret; ++ ++ if (!count) ++ return 0; /* skip atime */ ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ struct blk_plug plug; ++ ++ ret = filemap_write_and_wait_range(mapping, ++ iocb->ki_pos, ++ iocb->ki_pos + count - 1); ++ if (ret < 0) ++ return ret; ++ ++ file_accessed(file); ++ ++ 
blk_start_plug(&plug); ++ ret = bch2_direct_IO_read(iocb, iter); ++ blk_finish_plug(&plug); ++ ++ if (ret >= 0) ++ iocb->ki_pos += ret; ++ } else { ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ret = generic_file_read_iter(iocb, iter); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ } ++ ++ return ret; ++} ++ ++/* O_DIRECT writes */ ++ ++static long bch2_dio_write_loop(struct dio_write *dio) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct bch_fs *c = dio->op.c; ++ struct kiocb *req = dio->req; ++ struct address_space *mapping = req->ki_filp->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); ++ struct bio *bio = &dio->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ unsigned unaligned; ++ u64 new_i_size; ++ bool sync; ++ long ret; ++ ++ if (dio->loop) ++ goto loop; ++ ++ while (1) { ++ if (kthread) ++ kthread_use_mm(dio->mm); ++ BUG_ON(current->faults_disabled_mapping); ++ current->faults_disabled_mapping = mapping; ++ ++ ret = bio_iov_iter_get_pages(bio, &dio->iter); ++ ++ current->faults_disabled_mapping = NULL; ++ if (kthread) ++ kthread_unuse_mm(dio->mm); ++ ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); ++ bio->bi_iter.bi_size -= unaligned; ++ iov_iter_revert(&dio->iter, unaligned); ++ ++ if (!bio->bi_iter.bi_size) { ++ /* ++ * bio_iov_iter_get_pages was only able to get < ++ * blocksize worth of pages: ++ */ ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ ret = -EFAULT; ++ goto err; ++ } ++ ++ dio->op.pos = POS(inode->v.i_ino, ++ (req->ki_pos >> 9) + dio->op.written); ++ ++ task_io_account_write(bio->bi_iter.bi_size); ++ ++ if (!dio->sync && !dio->loop && dio->iter.count) { ++ struct iovec *iov = dio->inline_vecs; ++ ++ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { ++ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), ++ GFP_KERNEL); ++ if (unlikely(!iov)) { ++ dio->sync = true; ++ goto do_io; ++ } ++ ++ dio->free_iov = true; ++ } ++ ++ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); ++ dio->iter.iov = iov; ++ } ++do_io: ++ dio->loop = true; ++ closure_call(&dio->op.cl, bch2_write, NULL, NULL); ++ ++ if (dio->sync) ++ wait_for_completion(&dio->done); ++ else ++ return -EIOCBQUEUED; ++loop: ++ i_sectors_acct(c, inode, &dio->quota_res, ++ dio->op.i_sectors_delta); ++ dio->op.i_sectors_delta = 0; ++ ++ new_i_size = req->ki_pos + ((u64) dio->op.written << 9); ++ ++ spin_lock(&inode->v.i_lock); ++ if (new_i_size > inode->v.i_size) ++ i_size_write(&inode->v, new_i_size); ++ spin_unlock(&inode->v.i_lock); ++ ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ if (!dio->iter.count || dio->op.error) ++ break; ++ ++ bio_reset(bio); ++ reinit_completion(&dio->done); ++ } ++ ++ ret = dio->op.error ?: ((long) dio->op.written << 9); ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_disk_reservation_put(c, &dio->op.res); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ ++ if (dio->free_iov) ++ kfree(dio->iter.iov); ++ ++ sync = dio->sync; ++ bio_put(bio); ++ ++ /* inode->i_dio_count is our ref on inode and thus bch_fs */ ++ inode_dio_end(&inode->v); ++ ++ if (!sync) { ++ req->ki_complete(req, ret); ++ ret = -EIOCBQUEUED; ++ } ++ return ret; ++} ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *op) ++{ ++ struct dio_write *dio = container_of(op, struct dio_write, op); ++ ++ if (dio->sync) ++ complete(&dio->done); ++ else ++ bch2_dio_write_loop(dio); ++} 
++ ++static noinline ++ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct dio_write *dio; ++ struct bio *bio; ++ bool locked = true, extending; ++ ssize_t ret; ++ ++ prefetch(&c->opts); ++ prefetch((void *) &c->opts + 64); ++ prefetch(&inode->ei_inode); ++ prefetch((void *) &inode->ei_inode + 64); ++ ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(req, iter); ++ if (unlikely(ret <= 0)) ++ goto err; ++ ++ ret = file_remove_privs(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = file_update_time(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) ++ goto err; ++ ++ inode_dio_begin(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ extending = req->ki_pos + iter->count > inode->v.i_size; ++ if (!extending) { ++ inode_unlock(&inode->v); ++ locked = false; ++ } ++ ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_VECS), ++ &c->dio_write_bioset); ++ dio = container_of(bio, struct dio_write, op.wbio.bio); ++ init_completion(&dio->done); ++ dio->req = req; ++ dio->mm = current->mm; ++ dio->loop = false; ++ dio->sync = is_sync_kiocb(req) || extending; ++ dio->free_iov = false; ++ dio->quota_res.sectors = 0; ++ dio->iter = *iter; ++ ++ bch2_write_op_init(&dio->op, c, opts); ++ dio->op.end_io = bch2_dio_write_loop_async; ++ dio->op.target = opts.foreground_target; ++ op_journal_seq_set(&dio->op, &inode->ei_journal_seq); ++ dio->op.write_point = writepoint_hashed((unsigned long) current); ++ dio->op.flags |= BCH_WRITE_NOPUT_RESERVATION; ++ ++ if ((req->ki_flags & IOCB_DSYNC) && ++ !c->opts.journal_flush_disabled) ++ dio->op.flags |= BCH_WRITE_FLUSH; ++ ++ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, ++ iter->count >> 9, true); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ dio->op.nr_replicas = dio->op.opts.data_replicas; ++ ++ ret = bch2_disk_reservation_get(c, &dio->op.res, iter->count >> 9, ++ dio->op.opts.data_replicas, 0); ++ if (unlikely(ret) && ++ !bch2_check_range_allocated(c, POS(inode->v.i_ino, ++ req->ki_pos >> 9), ++ iter->count >> 9, ++ dio->op.opts.data_replicas)) ++ goto err_put_bio; ++ ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter->count - 1); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = bch2_dio_write_loop(dio); ++err: ++ if (locked) ++ inode_unlock(&inode->v); ++ if (ret > 0) ++ req->ki_pos += ret; ++ return ret; ++err_put_bio: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_disk_reservation_put(c, &dio->op.res); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ bio_put(bio); ++ inode_dio_end(&inode->v); ++ goto err; ++} ++ ++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) ++ return bch2_direct_write(iocb, from); ++ ++ /* We can write back this queue in page reclaim */ ++ current->backing_dev_info = inode_to_bdi(&inode->v); ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto unlock; ++ ++ ret = file_remove_privs(file); ++ if (ret) ++ goto unlock; ++ ++ ret = 
file_update_time(file); ++ if (ret) ++ goto unlock; ++ ++ ret = bch2_buffered_write(iocb, from); ++ if (likely(ret > 0)) ++ iocb->ki_pos += ret; ++unlock: ++ inode_unlock(&inode->v); ++ current->backing_dev_info = NULL; ++ ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++ ++ return ret; ++} ++ ++/* fsync: */ ++ ++int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret, ret2; ++ ++ ret = file_write_and_wait_range(file, start, end); ++ if (ret) ++ return ret; ++ ++ if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) ++ goto out; ++ ++ ret = sync_inode_metadata(&inode->v, 1); ++ if (ret) ++ return ret; ++out: ++ if (!c->opts.journal_flush_disabled) ++ ret = bch2_journal_flush_seq(&c->journal, ++ inode->ei_journal_seq); ++ ret2 = file_check_and_advance_wb_err(file); ++ ++ return ret ?: ret2; ++} ++ ++/* truncate: */ ++ ++static inline int range_has_data(struct bch_fs *c, ++ struct bpos start, ++ struct bpos end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (bkey_extent_is_data(k.k)) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++static int __bch2_truncate_page(struct bch_inode_info *inode, ++ pgoff_t index, loff_t start, loff_t end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_page_state *s; ++ unsigned start_offset = start & (PAGE_SIZE - 1); ++ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; ++ unsigned i; ++ struct page *page; ++ int ret = 0; ++ ++ /* Page boundary? Nothing to do */ ++ if (!((index == start >> PAGE_SHIFT && start_offset) || ++ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) ++ return 0; ++ ++ /* Above i_size? */ ++ if (index << PAGE_SHIFT >= inode->v.i_size) ++ return 0; ++ ++ page = find_lock_page(mapping, index); ++ if (!page) { ++ /* ++ * XXX: we're doing two index lookups when we end up reading the ++ * page ++ */ ++ ret = range_has_data(c, ++ POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), ++ POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); ++ if (ret <= 0) ++ return ret; ++ ++ page = find_or_create_page(mapping, index, GFP_KERNEL); ++ if (unlikely(!page)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ s = bch2_page_state_create(page, 0); ++ if (!s) { ++ ret = -ENOMEM; ++ goto unlock; ++ } ++ ++ if (!PageUptodate(page)) { ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto unlock; ++ } ++ ++ if (index != start >> PAGE_SHIFT) ++ start_offset = 0; ++ if (index != end >> PAGE_SHIFT) ++ end_offset = PAGE_SIZE; ++ ++ for (i = round_up(start_offset, block_bytes(c)) >> 9; ++ i < round_down(end_offset, block_bytes(c)) >> 9; ++ i++) { ++ s->s[i].nr_replicas = 0; ++ s->s[i].state = SECTOR_UNALLOCATED; ++ } ++ ++ zero_user_segment(page, start_offset, end_offset); ++ ++ /* ++ * Bit of a hack - we don't want truncate to fail due to -ENOSPC. ++ * ++ * XXX: because we aren't currently tracking whether the page has actual ++ * data in it (vs. just 0s, or only partially written) this wrong. ick. 
++ */ ++ ret = bch2_get_page_disk_reservation(c, inode, page, false); ++ BUG_ON(ret); ++ ++ __set_page_dirty_nobuffers(page); ++unlock: ++ unlock_page(page); ++ put_page(page); ++out: ++ return ret; ++} ++ ++static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) ++{ ++ return __bch2_truncate_page(inode, from >> PAGE_SHIFT, ++ from, round_up(from, PAGE_SIZE)); ++} ++ ++static int bch2_extend(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *inode_u, ++ struct iattr *iattr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ int ret; ++ ++ /* ++ * sync appends: ++ * ++ * this has to be done _before_ extending i_size: ++ */ ++ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); ++ if (ret) ++ return ret; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ /* ATTR_MODE will never be set here, ns argument isn't needed: */ ++ setattr_copy(NULL, &inode->v, iattr); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, inode->v.i_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static int bch2_truncate_finish_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); ++ return 0; ++} ++ ++static int bch2_truncate_start_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, void *p) ++{ ++ u64 *new_i_size = p; ++ ++ bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; ++ bi->bi_size = *new_i_size; ++ return 0; ++} ++ ++int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_inode_unpacked inode_u; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ u64 new_i_size = iattr->ia_size; ++ s64 i_sectors_delta = 0; ++ int ret = 0; ++ ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ /* ++ * fetch current on disk i_size: inode is locked, i_size can only ++ * increase underneath us: ++ */ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); ++ ret = PTR_ERR_OR_ZERO(iter); ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ goto err; ++ ++ BUG_ON(inode->v.i_size < inode_u.bi_size); ++ ++ if (iattr->ia_size > inode->v.i_size) { ++ ret = bch2_extend(inode, &inode_u, iattr); ++ goto err; ++ } ++ ++ ret = bch2_truncate_page(inode, iattr->ia_size); ++ if (unlikely(ret)) ++ goto err; ++ ++ /* ++ * When extending, we're going to write the new i_size to disk ++ * immediately so we need to flush anything above the current on disk ++ * i_size first: ++ * ++ * Also, when extending we need to flush the page that i_size currently ++ * straddles - if it's mapped to userspace, we need to ensure that ++ * userspace has to redirty it and call .mkwrite -> set_page_dirty ++ * again to allocate the part of the page that was extended. 
++ */ ++ if (iattr->ia_size > inode_u.bi_size) ++ ret = filemap_write_and_wait_range(mapping, ++ inode_u.bi_size, ++ iattr->ia_size - 1); ++ else if (iattr->ia_size & (PAGE_SIZE - 1)) ++ ret = filemap_write_and_wait_range(mapping, ++ round_down(iattr->ia_size, PAGE_SIZE), ++ iattr->ia_size - 1); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, ++ &new_i_size, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ ++ ret = bch2_fpunch(c, inode->v.i_ino, ++ round_up(iattr->ia_size, block_bytes(c)) >> 9, ++ U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ /* ATTR_MODE will never be set here, ns argument isn't needed: */ ++ setattr_copy(NULL, &inode->v, iattr); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ return ret; ++} ++ ++/* fallocate: */ ++ ++static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ u64 discard_start = round_up(offset, block_bytes(c)) >> 9; ++ u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; ++ int ret = 0; ++ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ ret = __bch2_truncate_page(inode, ++ offset >> PAGE_SHIFT, ++ offset, offset + len); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (offset >> PAGE_SHIFT != ++ (offset + len) >> PAGE_SHIFT) { ++ ret = __bch2_truncate_page(inode, ++ (offset + len) >> PAGE_SHIFT, ++ offset, offset + len); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ truncate_pagecache_range(&inode->v, offset, offset + len - 1); ++ ++ if (discard_start < discard_end) { ++ s64 i_sectors_delta = 0; ++ ++ ret = bch2_fpunch(c, inode->v.i_ino, ++ discard_start, discard_end, ++ &inode->ei_journal_seq, ++ &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ } ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ ++ return ret; ++} ++ ++static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ++ loff_t offset, loff_t len, ++ bool insert) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct btree_trans trans; ++ struct btree_iter *src, *dst, *del = NULL; ++ loff_t shift, new_size; ++ u64 src_start; ++ int ret; ++ ++ if ((offset | len) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); ++ ++ /* ++ * We need i_mutex to keep the page cache consistent with the extents ++ * btree, and the btree consistent with i_size - we don't need outside ++ * locking for the extents btree itself, because we're using linked ++ * iterators ++ */ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ if (insert) { ++ ret = -EFBIG; ++ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) ++ goto err; ++ ++ ret = -EINVAL; ++ if (offset >= inode->v.i_size) ++ goto err; ++ ++ src_start = U64_MAX; ++ shift = len; ++ } else { ++ ret = -EINVAL; ++ if (offset + len >= inode->v.i_size) ++ goto err; ++ ++ src_start = offset + len; ++ shift = -len; 
++ } ++ ++ new_size = inode->v.i_size + shift; ++ ++ ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); ++ if (ret) ++ goto err; ++ ++ if (insert) { ++ i_size_write(&inode->v, new_size); ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, new_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ } else { ++ s64 i_sectors_delta = 0; ++ ++ ret = bch2_fpunch(c, inode->v.i_ino, ++ offset >> 9, (offset + len) >> 9, ++ &inode->ei_journal_seq, ++ &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ if (ret) ++ goto err; ++ } ++ ++ src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, src_start >> 9), ++ BTREE_ITER_INTENT); ++ BUG_ON(IS_ERR_OR_NULL(src)); ++ ++ dst = bch2_trans_copy_iter(&trans, src); ++ BUG_ON(IS_ERR_OR_NULL(dst)); ++ ++ while (1) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ BKEY_PADDED(k) copy; ++ struct bkey_i delete; ++ struct bkey_s_c k; ++ struct bpos next_pos; ++ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); ++ struct bpos atomic_end; ++ unsigned commit_flags = BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_USE_RESERVE; ++ ++ k = insert ++ ? bch2_btree_iter_peek_prev(src) ++ : bch2_btree_iter_peek(src); ++ if ((ret = bkey_err(k))) ++ goto bkey_err; ++ ++ if (!k.k || k.k->p.inode != inode->v.i_ino) ++ break; ++ ++ BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); ++ ++ if (insert && ++ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) ++ break; ++reassemble: ++ bkey_reassemble(©.k, k); ++ ++ if (insert && ++ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { ++ bch2_cut_front(move_pos, ©.k); ++ bch2_btree_iter_set_pos(src, bkey_start_pos(©.k.k)); ++ } ++ ++ copy.k.k.p.offset += shift >> 9; ++ bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k.k)); ++ ++ ret = bch2_extent_atomic_end(dst, ©.k, &atomic_end); ++ if (ret) ++ goto bkey_err; ++ ++ if (bkey_cmp(atomic_end, copy.k.k.p)) { ++ if (insert) { ++ move_pos = atomic_end; ++ move_pos.offset -= shift >> 9; ++ goto reassemble; ++ } else { ++ bch2_cut_back(atomic_end, ©.k.k); ++ } ++ } ++ ++ bkey_init(&delete.k); ++ delete.k.p = src->pos; ++ bch2_key_resize(&delete.k, copy.k.k.size); ++ ++ next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; ++ ++ /* ++ * If the new and old keys overlap (because we're moving an ++ * extent that's bigger than the amount we're collapsing by), ++ * we need to trim the delete key here so they don't overlap ++ * because overlaps on insertions aren't handled before ++ * triggers are run, so the overwrite will get double counted ++ * by the triggers machinery: ++ */ ++ if (insert && ++ bkey_cmp(bkey_start_pos(©.k.k), delete.k.p) < 0) { ++ bch2_cut_back(bkey_start_pos(©.k.k), &delete.k); ++ } else if (!insert && ++ bkey_cmp(copy.k.k.p, ++ bkey_start_pos(&delete.k)) > 0) { ++ bch2_cut_front(copy.k.k.p, &delete); ++ ++ del = bch2_trans_copy_iter(&trans, src); ++ BUG_ON(IS_ERR_OR_NULL(del)); ++ ++ bch2_btree_iter_set_pos(del, ++ bkey_start_pos(&delete.k)); ++ } ++ ++ bch2_trans_update(&trans, dst, ©.k); ++ bch2_trans_update(&trans, del ?: src, &delete); ++ ++ if (copy.k.k.size == k.k->size) { ++ /* ++ * If we're moving the entire extent, we can skip ++ * running triggers: ++ */ ++ commit_flags |= BTREE_INSERT_NOMARK; ++ } else { ++ /* We might end up splitting compressed extents: */ ++ unsigned nr_ptrs = ++ bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(©.k)); ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ copy.k.k.size, nr_ptrs, ++ BCH_DISK_RESERVATION_NOFAIL); ++ BUG_ON(ret); ++ } ++ ++ ret = bch2_trans_commit(&trans, &disk_res, ++ &inode->ei_journal_seq, ++ commit_flags); ++ bch2_disk_reservation_put(c, &disk_res); ++bkey_err: ++ if (del) ++ bch2_trans_iter_put(&trans, del); ++ del = NULL; ++ ++ if (!ret) ++ bch2_btree_iter_set_pos(src, next_pos); ++ ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ goto err; ++ ++ bch2_trans_cond_resched(&trans); ++ } ++ bch2_trans_unlock(&trans); ++ ++ if (!insert) { ++ i_size_write(&inode->v, new_size); ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, new_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ } ++err: ++ bch2_trans_exit(&trans); ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ return ret; ++} ++ ++static long bchfs_fallocate(struct bch_inode_info *inode, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bpos end_pos; ++ loff_t end = offset + len; ++ loff_t block_start = round_down(offset, block_bytes(c)); ++ loff_t block_end = round_up(end, block_bytes(c)); ++ unsigned sectors; ++ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ++ ret = inode_newsize_ok(&inode->v, end); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode & FALLOC_FL_ZERO_RANGE) { ++ ret = __bch2_truncate_page(inode, ++ offset >> PAGE_SHIFT, ++ offset, end); ++ ++ if (!ret && ++ offset >> PAGE_SHIFT != end >> PAGE_SHIFT) ++ ret = __bch2_truncate_page(inode, ++ end >> PAGE_SHIFT, ++ offset, end); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ truncate_pagecache_range(&inode->v, offset, end - 1); ++ } ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, block_start >> 9), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ end_pos = POS(inode->v.i_ino, block_end >> 9); ++ ++ while (bkey_cmp(iter->pos, end_pos) < 0) { ++ s64 
i_sectors_delta = 0; ++ struct disk_reservation disk_res = { 0 }; ++ struct quota_res quota_res = { 0 }; ++ struct bkey_i_reservation reservation; ++ struct bkey_s_c k; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ if ((ret = bkey_err(k))) ++ goto bkey_err; ++ ++ /* already reserved */ ++ if (k.k->type == KEY_TYPE_reservation && ++ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { ++ bch2_btree_iter_next_slot(iter); ++ continue; ++ } ++ ++ if (bkey_extent_is_data(k.k) && ++ !(mode & FALLOC_FL_ZERO_RANGE)) { ++ bch2_btree_iter_next_slot(iter); ++ continue; ++ } ++ ++ bkey_reservation_init(&reservation.k_i); ++ reservation.k.type = KEY_TYPE_reservation; ++ reservation.k.p = k.k->p; ++ reservation.k.size = k.k->size; ++ ++ bch2_cut_front(iter->pos, &reservation.k_i); ++ bch2_cut_back(end_pos, &reservation.k); ++ ++ sectors = reservation.k.size; ++ reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k); ++ ++ if (!bkey_extent_is_allocation(k.k)) { ++ ret = bch2_quota_reservation_add(c, inode, ++ "a_res, ++ sectors, true); ++ if (unlikely(ret)) ++ goto bkey_err; ++ } ++ ++ if (reservation.v.nr_replicas < replicas || ++ bch2_extent_is_compressed(k)) { ++ ret = bch2_disk_reservation_get(c, &disk_res, sectors, ++ replicas, 0); ++ if (unlikely(ret)) ++ goto bkey_err; ++ ++ reservation.v.nr_replicas = disk_res.nr_replicas; ++ } ++ ++ bch2_trans_begin_updates(&trans); ++ ++ ret = bch2_extent_update(&trans, iter, &reservation.k_i, ++ &disk_res, &inode->ei_journal_seq, ++ 0, &i_sectors_delta); ++ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++bkey_err: ++ bch2_quota_reservation_put(c, inode, "a_res); ++ bch2_disk_reservation_put(c, &disk_res); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ goto err; ++ } ++ ++ /* ++ * Do we need to extend the file? 
++ * ++ * If we zeroed up to the end of the file, we dropped whatever writes ++ * were going to write out the current i_size, so we have to extend ++ * manually even if FL_KEEP_SIZE was set: ++ */ ++ if (end >= inode->v.i_size && ++ (!(mode & FALLOC_FL_KEEP_SIZE) || ++ (mode & FALLOC_FL_ZERO_RANGE))) { ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ ++ do { ++ bch2_trans_begin(&trans); ++ inode_iter = bch2_inode_peek(&trans, &inode_u, ++ inode->v.i_ino, 0); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ } while (ret == -EINTR); ++ ++ bch2_trans_unlock(&trans); ++ ++ if (ret) ++ goto err; ++ ++ /* ++ * Sync existing appends before extending i_size, ++ * as in bch2_extend(): ++ */ ++ ret = filemap_write_and_wait_range(mapping, ++ inode_u.bi_size, S64_MAX); ++ if (ret) ++ goto err; ++ ++ if (mode & FALLOC_FL_KEEP_SIZE) ++ end = inode->v.i_size; ++ else ++ i_size_write(&inode->v, end); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, end, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ } ++err: ++ bch2_trans_exit(&trans); ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ return ret; ++} ++ ++long bch2_fallocate_dispatch(struct file *file, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ long ret; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return -EROFS; ++ ++ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ++ ret = bchfs_fallocate(inode, mode, offset, len); ++ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) ++ ret = bchfs_fpunch(inode, offset, len); ++ else if (mode == FALLOC_FL_INSERT_RANGE) ++ ret = bchfs_fcollapse_finsert(inode, offset, len, true); ++ else if (mode == FALLOC_FL_COLLAPSE_RANGE) ++ ret = bchfs_fcollapse_finsert(inode, offset, len, false); ++ else ++ ret = -EOPNOTSUPP; ++ ++ percpu_ref_put(&c->writes); ++ ++ return ret; ++} ++ ++static void mark_range_unallocated(struct bch_inode_info *inode, ++ loff_t start, loff_t end) ++{ ++ pgoff_t index = start >> PAGE_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SHIFT; ++ struct pagevec pvec; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ if (nr_pages == 0) ++ break; ++ ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ struct bch_page_state *s; ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = 0; j < PAGE_SECTORS; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++} ++ ++loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, ++ struct file *file_dst, loff_t pos_dst, ++ loff_t len, unsigned remap_flags) ++{ ++ struct bch_inode_info *src = file_bch_inode(file_src); ++ struct bch_inode_info *dst = file_bch_inode(file_dst); ++ struct bch_fs *c = src->v.i_sb->s_fs_info; ++ s64 i_sectors_delta = 0; ++ loff_t ret = 0; ++ loff_t aligned_len; ++ ++ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) ++ return -EINVAL; ++ ++ if (remap_flags & REMAP_FILE_DEDUP) ++ return -EOPNOTSUPP; ++ ++ if ((pos_src & (block_bytes(c) - 1)) || ++ (pos_dst & (block_bytes(c) - 1))) ++ return -EINVAL; ++ ++ if (src == dst && ++ abs(pos_src - pos_dst) < len) ++ return -EINVAL; ++ ++ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, 
dst); ++ ++ file_update_time(file_dst); ++ ++ inode_dio_wait(&src->v); ++ inode_dio_wait(&dst->v); ++ ++ ret = generic_remap_file_range_prep(file_src, pos_src, ++ file_dst, pos_dst, ++ &len, remap_flags); ++ if (ret < 0 || len == 0) ++ goto err; ++ ++ aligned_len = round_up(len, block_bytes(c)); ++ ++ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, ++ pos_dst, pos_dst + aligned_len); ++ if (ret) ++ goto err; ++ ++ mark_range_unallocated(src, pos_src, pos_src + aligned_len); ++ ++ ret = bch2_remap_range(c, ++ POS(dst->v.i_ino, pos_dst >> 9), ++ POS(src->v.i_ino, pos_src >> 9), ++ aligned_len >> 9, ++ &dst->ei_journal_seq, ++ pos_dst + len, &i_sectors_delta); ++ if (ret < 0) ++ goto err; ++ ++ ret <<= 9; ++ /* ++ * due to alignment, we might have remapped slightly more than requsted ++ */ ++ ret = min(ret, len); ++ ++ /* XXX get a quota reservation */ ++ i_sectors_acct(c, dst, NULL, i_sectors_delta); ++ ++ spin_lock(&dst->v.i_lock); ++ if (pos_dst + len > dst->v.i_size) ++ i_size_write(&dst->v, pos_dst + len); ++ spin_unlock(&dst->v.i_lock); ++err: ++ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ return ret; ++} ++ ++/* fseek: */ ++ ++static int page_data_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (s) ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state >= SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t bch2_seek_pagecache_data(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ struct page *page; ++ pgoff_t start_index = start_offset >> PAGE_SHIFT; ++ pgoff_t end_index = end_offset >> PAGE_SHIFT; ++ pgoff_t index = start_index; ++ loff_t ret; ++ int offset; ++ ++ while (index <= end_index) { ++ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { ++ lock_page(page); ++ ++ offset = page_data_offset(page, ++ page->index == start_index ++ ? 
start_offset & (PAGE_SIZE - 1) ++ : 0); ++ if (offset >= 0) { ++ ret = clamp(((loff_t) page->index << PAGE_SHIFT) + ++ offset, ++ start_offset, end_offset); ++ unlock_page(page); ++ put_page(page); ++ return ret; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ } else { ++ break; ++ } ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_data(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 isize, next_data = MAX_LFS_FILESIZE; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, offset >> 9), 0, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ break; ++ } else if (bkey_extent_is_data(k.k)) { ++ next_data = max(offset, bkey_start_offset(k.k) << 9); ++ break; ++ } else if (k.k->p.offset >> 9 > isize) ++ break; ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ if (next_data > offset) ++ next_data = bch2_seek_pagecache_data(&inode->v, ++ offset, next_data); ++ ++ if (next_data >= isize) ++ return -ENXIO; ++ ++ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); ++} ++ ++static int __page_hole_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (!s) ++ return 0; ++ ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state < SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) ++{ ++ pgoff_t index = offset >> PAGE_SHIFT; ++ struct page *page; ++ int pg_offset; ++ loff_t ret = -1; ++ ++ page = find_lock_page(mapping, index); ++ if (!page) ++ return offset; ++ ++ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); ++ if (pg_offset >= 0) ++ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; ++ ++ unlock_page(page); ++ ++ return ret; ++} ++ ++static loff_t bch2_seek_pagecache_hole(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ loff_t offset = start_offset, hole; ++ ++ while (offset < end_offset) { ++ hole = page_hole_offset(mapping, offset); ++ if (hole >= 0 && hole <= end_offset) ++ return max(start_offset, hole); ++ ++ offset += PAGE_SIZE; ++ offset &= PAGE_MASK; ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_hole(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 isize, next_hole = MAX_LFS_FILESIZE; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, offset >> 9), ++ BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ offset, MAX_LFS_FILESIZE); ++ break; ++ } else if (!bkey_extent_is_data(k.k)) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ max(offset, bkey_start_offset(k.k) << 9), ++ k.k->p.offset << 9); ++ ++ if (next_hole < k.k->p.offset << 9) ++ break; ++ } else { ++ offset = max(offset, bkey_start_offset(k.k) << 9); ++ } ++ } ++ ++ ret = 
bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ if (next_hole > isize) ++ next_hole = isize; ++ ++ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); ++} ++ ++loff_t bch2_llseek(struct file *file, loff_t offset, int whence) ++{ ++ switch (whence) { ++ case SEEK_SET: ++ case SEEK_CUR: ++ case SEEK_END: ++ return generic_file_llseek(file, offset, whence); ++ case SEEK_DATA: ++ return bch2_seek_data(file, offset); ++ case SEEK_HOLE: ++ return bch2_seek_hole(file, offset); ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_fs_fsio_exit(struct bch_fs *c) ++{ ++ bioset_exit(&c->dio_write_bioset); ++ bioset_exit(&c->dio_read_bioset); ++ bioset_exit(&c->writepage_bioset); ++} ++ ++int bch2_fs_fsio_init(struct bch_fs *c) ++{ ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bioset_init(&c->writepage_bioset, ++ 4, offsetof(struct bch_writepage_io, op.wbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_read_bioset, ++ 4, offsetof(struct dio_read, rbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_write_bioset, ++ 4, offsetof(struct dio_write, op.wbio.bio), ++ BIOSET_NEED_BVECS)) ++ ret = -ENOMEM; ++ ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h +new file mode 100644 +index 000000000000..7063556d289b +--- /dev/null ++++ b/fs/bcachefs/fs-io.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_H ++#define _BCACHEFS_FS_IO_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++#include "buckets.h" ++#include "io_types.h" ++ ++#include ++ ++struct quota_res; ++ ++int __must_check bch2_write_inode_size(struct bch_fs *, ++ struct bch_inode_info *, ++ loff_t, unsigned); ++ ++int bch2_writepage(struct page *, struct writeback_control *); ++int bch2_readpage(struct file *, struct page *); ++ ++int bch2_writepages(struct address_space *, struct writeback_control *); ++int bch2_readpages(struct file *, struct address_space *, ++ struct list_head *, unsigned); ++ ++int bch2_write_begin(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page **, void **); ++int bch2_write_end(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page *, void *); ++ ++ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); ++ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); ++ ++int bch2_fsync(struct file *, loff_t, loff_t, int); ++ ++int bch2_truncate(struct bch_inode_info *, struct iattr *); ++long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); ++ ++loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, ++ loff_t, loff_t, unsigned); ++ ++loff_t bch2_llseek(struct file *, loff_t, int); ++ ++vm_fault_t bch2_page_fault(struct vm_fault *); ++vm_fault_t bch2_page_mkwrite(struct vm_fault *); ++void bch2_invalidatepage(struct page *, unsigned int, unsigned int); ++int bch2_releasepage(struct page *, gfp_t); ++int bch2_migrate_page(struct address_space *, struct page *, ++ struct page *, enum migrate_mode); ++ ++void bch2_fs_fsio_exit(struct bch_fs *); ++int bch2_fs_fsio_init(struct bch_fs *); ++#else ++static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} ++static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } ++#endif ++ ++#endif /* _BCACHEFS_FS_IO_H */ +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +new file mode 100644 +index 000000000000..75fdb2fe861e +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.c +@@ -0,0 +1,308 @@ ++// SPDX-License-Identifier: GPL-2.0 
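/*
 * [editor's note — illustrative, not part of the original patch]
 * fs-ioctl.c implements the generic FS_IOC_* file ioctls for bcachefs,
 * mapping bcachefs inode flags to the standard FS_*_FL / FS_XFLAG_* bits
 * via the tables in fs-ioctl.h. A minimal userspace sketch of how these
 * handlers are reached (the open file descriptor 'fd' is hypothetical):
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	int flags;
 *	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
 *		flags |= FS_NOATIME_FL;
 *		ioctl(fd, FS_IOC_SETFLAGS, &flags);	// handled by bch2_ioc_setflags()
 *	}
 */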
++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-ioctl.h" ++#include "quota.h" ++ ++#include ++#include ++ ++#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) ++ ++struct flags_set { ++ unsigned mask; ++ unsigned flags; ++ ++ unsigned projid; ++}; ++ ++static int bch2_inode_flags_set(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ /* ++ * We're relying on btree locking here for exclusion with other ioctl ++ * calls - use the flags in the btree (@bi), not inode->i_flags: ++ */ ++ struct flags_set *s = p; ++ unsigned newflags = s->flags; ++ unsigned oldflags = bi->bi_flags & s->mask; ++ ++ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && ++ !capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ ++ if (!S_ISREG(bi->bi_mode) && ++ !S_ISDIR(bi->bi_mode) && ++ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) ++ return -EINVAL; ++ ++ bi->bi_flags &= ~s->mask; ++ bi->bi_flags |= newflags; ++ ++ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); ++ return 0; ++} ++ ++static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) ++{ ++ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); ++ ++ return put_user(flags, arg); ++} ++ ++static int bch2_ioc_setflags(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ void __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; ++ unsigned uflags; ++ int ret; ++ ++ if (get_user(uflags, (int __user *) arg)) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_uflags, uflags); ++ if (uflags) ++ return -EOPNOTSUPP; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ++ ret = -EACCES; ++ goto setflags_out; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ++ ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++setflags_out: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct fsxattr fa = { 0 }; ++ ++ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); ++ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; ++ ++ return copy_to_user(arg, &fa, sizeof(fa)); ++} ++ ++static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct flags_set *s = p; ++ ++ if (s->projid != bi->bi_project) { ++ bi->bi_fields_set |= 1U << Inode_opt_project; ++ bi->bi_project = s->projid; ++ } ++ ++ return bch2_inode_flags_set(inode, bi, p); ++} ++ ++static int bch2_ioc_fssetxattr(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; ++ struct fsxattr fa; ++ int ret; ++ ++ if (copy_from_user(&fa, arg, sizeof(fa))) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); ++ if (fa.fsx_xflags) ++ return -EOPNOTSUPP; ++ ++ if (fa.fsx_projid >= U32_MAX) ++ return -EINVAL; ++ ++ s.projid = fa.fsx_projid + 1; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if 
(!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ++ ret = -EACCES; ++ goto err; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_set_projid(c, inode, s.projid); ++ if (ret) ++ goto err_unlock; ++ ++ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ++ ATTR_CTIME); ++err_unlock: ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_inode_info *dir = p; ++ ++ return !bch2_reinherit_attrs(bi, &dir->ei_inode); ++} ++ ++static int bch2_ioc_reinherit_attrs(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *src, ++ const char __user *name) ++{ ++ struct bch_inode_info *dst; ++ struct inode *vinode = NULL; ++ char *kname = NULL; ++ struct qstr qstr; ++ int ret = 0; ++ u64 inum; ++ ++ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); ++ if (!kname) ++ return -ENOMEM; ++ ++ ret = strncpy_from_user(kname, name, BCH_NAME_MAX); ++ if (unlikely(ret < 0)) ++ goto err1; ++ ++ qstr.len = ret; ++ qstr.name = kname; ++ ++ ret = -ENOENT; ++ inum = bch2_dirent_lookup(c, src->v.i_ino, ++ &src->ei_str_hash, ++ &qstr); ++ if (!inum) ++ goto err1; ++ ++ vinode = bch2_vfs_inode_get(c, inum); ++ ret = PTR_ERR_OR_ZERO(vinode); ++ if (ret) ++ goto err1; ++ ++ dst = to_bch_ei(vinode); ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ goto err2; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ if (inode_attr_changing(src, dst, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst, ++ src->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err3; ++ } ++ ++ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); ++err3: ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ /* return true if we did work */ ++ if (ret >= 0) ++ ret = !ret; ++ ++ mnt_drop_write_file(file); ++err2: ++ iput(vinode); ++err1: ++ kfree(kname); ++ ++ return ret; ++} ++ ++long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct super_block *sb = inode->v.i_sb; ++ struct bch_fs *c = sb->s_fs_info; ++ ++ switch (cmd) { ++ case FS_IOC_GETFLAGS: ++ return bch2_ioc_getflags(inode, (int __user *) arg); ++ ++ case FS_IOC_SETFLAGS: ++ return bch2_ioc_setflags(c, file, inode, (int __user *) arg); ++ ++ case FS_IOC_FSGETXATTR: ++ return bch2_ioc_fsgetxattr(inode, (void __user *) arg); ++ case FS_IOC_FSSETXATTR: ++ return bch2_ioc_fssetxattr(c, file, inode, ++ (void __user *) arg); ++ ++ case BCHFS_IOC_REINHERIT_ATTRS: ++ return bch2_ioc_reinherit_attrs(c, file, inode, ++ (void __user *) arg); ++ ++ case FS_IOC_GETVERSION: ++ return -ENOTTY; ++ case FS_IOC_SETVERSION: ++ return -ENOTTY; ++ ++ case FS_IOC_GOINGDOWN: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ down_write(&sb->s_umount); ++ sb->s_flags |= SB_RDONLY; ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only due to ioctl"); ++ up_write(&sb->s_umount); ++ return 0; ++ ++ default: ++ return bch2_fs_ioctl(c, cmd, (void __user *) arg); ++ } ++} ++ ++#ifdef CONFIG_COMPAT ++long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ /* These are just misnamed, they actually get/put from/to user an int */ ++ switch (cmd) { ++ case FS_IOC_GETFLAGS: ++ cmd = FS_IOC_GETFLAGS; ++ break; ++ case FS_IOC32_SETFLAGS: ++ cmd = FS_IOC_SETFLAGS; ++ break; ++ default: 
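		/*
		 * [editor's note] Returning -ENOIOCTLCMD here lets the compat
		 * ioctl path report ENOTTY for commands this handler does not
		 * translate.
		 */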
++ return -ENOIOCTLCMD; ++ } ++ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); ++} ++#endif ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h +new file mode 100644 +index 000000000000..f201980ef2c3 +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.h +@@ -0,0 +1,81 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IOCTL_H ++#define _BCACHEFS_FS_IOCTL_H ++ ++/* Inode flags: */ ++ ++/* bcachefs inode flags -> vfs inode flags: */ ++static const unsigned bch_flags_to_vfs[] = { ++ [__BCH_INODE_SYNC] = S_SYNC, ++ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, ++ [__BCH_INODE_APPEND] = S_APPEND, ++ [__BCH_INODE_NOATIME] = S_NOATIME, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ ++static const unsigned bch_flags_to_uflags[] = { ++ [__BCH_INODE_SYNC] = FS_SYNC_FL, ++ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, ++ [__BCH_INODE_APPEND] = FS_APPEND_FL, ++ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, ++ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ ++static const unsigned bch_flags_to_xflags[] = { ++ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, ++ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, ++ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, ++ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, ++ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, ++ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; ++}; ++ ++#define set_flags(_map, _in, _out) \ ++do { \ ++ unsigned _i; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & (1 << _i)) \ ++ (_out) |= _map[_i]; \ ++ else \ ++ (_out) &= ~_map[_i]; \ ++} while (0) ++ ++#define map_flags(_map, _in) \ ++({ \ ++ unsigned _out = 0; \ ++ \ ++ set_flags(_map, _in, _out); \ ++ _out; \ ++}) ++ ++#define map_flags_rev(_map, _in) \ ++({ \ ++ unsigned _i, _out = 0; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & _map[_i]) { \ ++ (_out) |= 1 << _i; \ ++ (_in) &= ~_map[_i]; \ ++ } \ ++ (_out); \ ++}) ++ ++#define map_defined(_map) \ ++({ \ ++ unsigned _in = ~0; \ ++ \ ++ map_flags_rev(_map, _in); \ ++}) ++ ++/* Set VFS inode flags from bcachefs inode: */ ++static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) ++{ ++ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); ++} ++ ++long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); ++long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); ++ ++#endif /* _BCACHEFS_FS_IOCTL_H */ +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +new file mode 100644 +index 000000000000..eb4e671ae0f0 +--- /dev/null ++++ b/fs/bcachefs/fs.c +@@ -0,0 +1,1614 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "extents.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-io.h" ++#include "fs-ioctl.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "quota.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct kmem_cache *bch2_inode_cache; ++ ++static void bch2_vfs_inode_init(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *); ++ ++static void journal_seq_copy(struct bch_inode_info *dst, ++ u64 journal_seq) ++{ ++ u64 old, v = READ_ONCE(dst->ei_journal_seq); ++ ++ do { 
++ old = v; ++ ++ if (old >= journal_seq) ++ break; ++ } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); ++} ++ ++static void __pagecache_lock_put(struct pagecache_lock *lock, long i) ++{ ++ BUG_ON(atomic_long_read(&lock->v) == 0); ++ ++ if (atomic_long_sub_return_release(i, &lock->v) == 0) ++ wake_up_all(&lock->wait); ++} ++ ++static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) ++{ ++ long v = atomic_long_read(&lock->v), old; ++ ++ do { ++ old = v; ++ ++ if (i > 0 ? v < 0 : v > 0) ++ return false; ++ } while ((v = atomic_long_cmpxchg_acquire(&lock->v, ++ old, old + i)) != old); ++ return true; ++} ++ ++static void __pagecache_lock_get(struct pagecache_lock *lock, long i) ++{ ++ wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, 1); ++} ++ ++void bch2_pagecache_add_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, 1); ++} ++ ++void bch2_pagecache_block_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, -1); ++} ++ ++void bch2_pagecache_block_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, -1); ++} ++ ++void bch2_inode_update_after_write(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ unsigned fields) ++{ ++ set_nlink(&inode->v, bch2_inode_nlink_get(bi)); ++ i_uid_write(&inode->v, bi->bi_uid); ++ i_gid_write(&inode->v, bi->bi_gid); ++ inode->v.i_mode = bi->bi_mode; ++ ++ if (fields & ATTR_ATIME) ++ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); ++ if (fields & ATTR_MTIME) ++ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); ++ if (fields & ATTR_CTIME) ++ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); ++ ++ inode->ei_inode = *bi; ++ ++ bch2_inode_flags_to_vfs(inode); ++} ++ ++int __must_check bch2_write_inode(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ inode_set_fn set, ++ void *p, unsigned fields) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_inode_unpacked inode_u; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter) ?: ++ (set ? set(inode, &inode_u, p) : 0) ?: ++ bch2_inode_write(&trans, iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++ if (ret == -EINTR) ++ goto retry; ++ ++ /* ++ * the btree node lock protects inode->ei_inode, not ei_update_lock; ++ * this is important for inode updates via bchfs_write_index_update ++ */ ++ if (!ret) ++ bch2_inode_update_after_write(c, inode, &inode_u, fields); ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ ++int bch2_fs_quota_transfer(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_qid new_qid, ++ unsigned qtypes, ++ enum quota_acct_mode mode) ++{ ++ unsigned i; ++ int ret; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ for (i = 0; i < QTYP_NR; i++) ++ if (new_qid.q[i] == inode->ei_qid.q[i]) ++ qtypes &= ~(1U << i); ++ ++ if (!qtypes) ++ return 0; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ++ ret = bch2_quota_transfer(c, qtypes, new_qid, ++ inode->ei_qid, ++ inode->v.i_blocks + ++ inode->ei_quota_reserved, ++ mode); ++ if (!ret) ++ for (i = 0; i < QTYP_NR; i++) ++ if (qtypes & (1 << i)) ++ inode->ei_qid.q[i] = new_qid.q[i]; ++ ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) ++{ ++ struct bch_inode_unpacked inode_u; ++ struct bch_inode_info *inode; ++ int ret; ++ ++ inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); ++ if (unlikely(!inode)) ++ return ERR_PTR(-ENOMEM); ++ if (!(inode->v.i_state & I_NEW)) ++ return &inode->v; ++ ++ ret = bch2_inode_find_by_inum(c, inum, &inode_u); ++ if (ret) { ++ iget_failed(&inode->v); ++ return ERR_PTR(ret); ++ } ++ ++ bch2_vfs_inode_init(c, inode, &inode_u); ++ ++ inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); ++ ++ unlock_new_inode(&inode->v); ++ ++ return &inode->v; ++} ++ ++static int inum_test(struct inode *inode, void *p) ++{ ++ unsigned long *ino = p; ++ ++ return *ino == inode->i_ino; ++} ++ ++static struct bch_inode_info * ++__bch2_create(struct user_namespace *mnt_userns, ++ struct bch_inode_info *dir, struct dentry *dentry, ++ umode_t mode, dev_t rdev, bool tmpfile) ++{ ++ struct bch_fs *c = dir->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u; ++ struct bch_inode_info *inode, *old; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *default_acl = NULL, *acl = NULL; ++ u64 journal_seq = 0; ++ int ret; ++ ++ /* ++ * preallocate acls + vfs inode before btree transaction, so that ++ * nothing can fail after the transaction succeeds: ++ */ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); ++ if (ret) ++ return ERR_PTR(ret); ++#endif ++ inode = to_bch_ei(new_inode(c->vfs_sb)); ++ if (unlikely(!inode)) { ++ inode = ERR_PTR(-ENOMEM); ++ goto err; ++ } ++ ++ bch2_inode_init_early(c, &inode_u); ++ ++ if (!tmpfile) ++ mutex_lock(&dir->ei_update_lock); ++ ++ bch2_trans_init(&trans, c, 8, 1024); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, ++ !tmpfile ? 
&dentry->d_name : NULL, ++ from_kuid(mnt_userns, current_fsuid()), ++ from_kgid(mnt_userns, current_fsgid()), ++ mode, rdev, ++ default_acl, acl) ?: ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (unlikely(ret)) ++ goto err_before_quota; ++ ++ ret = bch2_trans_commit(&trans, NULL, &journal_seq, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOUNLOCK); ++ if (unlikely(ret)) { ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++err_before_quota: ++ if (ret == -EINTR) ++ goto retry; ++ goto err_trans; ++ } ++ ++ if (!tmpfile) { ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(dir, journal_seq); ++ mutex_unlock(&dir->ei_update_lock); ++ } ++ ++ bch2_vfs_inode_init(c, inode, &inode_u); ++ journal_seq_copy(inode, journal_seq); ++ ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); ++ ++ /* ++ * we must insert the new inode into the inode cache before calling ++ * bch2_trans_exit() and dropping locks, else we could race with another ++ * thread pulling the inode in and modifying it: ++ */ ++ ++ inode->v.i_state |= I_CREATING; ++ old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, ++ inum_test, NULL, &inode->v.i_ino)); ++ BUG_ON(!old); ++ ++ if (unlikely(old != inode)) { ++ /* ++ * We raced, another process pulled the new inode into cache ++ * before us: ++ */ ++ journal_seq_copy(old, journal_seq); ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ ++ inode = old; ++ } else { ++ /* ++ * we really don't want insert_inode_locked2() to be setting ++ * I_NEW... ++ */ ++ unlock_new_inode(&inode->v); ++ } ++ ++ bch2_trans_exit(&trans); ++err: ++ posix_acl_release(default_acl); ++ posix_acl_release(acl); ++ return inode; ++err_trans: ++ if (!tmpfile) ++ mutex_unlock(&dir->ei_update_lock); ++ ++ bch2_trans_exit(&trans); ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ inode = ERR_PTR(ret); ++ goto err; ++} ++ ++/* methods */ ++ ++static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, ++ unsigned int flags) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct inode *vinode = NULL; ++ u64 inum; ++ ++ inum = bch2_dirent_lookup(c, dir->v.i_ino, ++ &dir->ei_str_hash, ++ &dentry->d_name); ++ ++ if (inum) ++ vinode = bch2_vfs_inode_get(c, inum); ++ ++ return d_splice_alias(vinode, dentry); ++} ++ ++static int bch2_mknod(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, ++ umode_t mode, dev_t rdev) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, false); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_create(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, ++ umode_t mode, bool excl) ++{ ++ return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0); ++} ++ ++static int __bch2_link(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_info *dir, ++ struct dentry *dentry) ++{ ++ struct btree_trans trans; ++ struct bch_inode_unpacked inode_u; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ret = bch2_link_trans(&trans, ++ dir->v.i_ino, ++ inode->v.i_ino, &inode_u, ++ &dentry->d_name) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ 
BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOUNLOCK); ++ } while (ret == -EINTR); ++ ++ if (likely(!ret)) ++ bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); ++ ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ return ret; ++} ++ ++static int bch2_link(struct dentry *old_dentry, struct inode *vdir, ++ struct dentry *dentry) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ return ret; ++ ++ ihold(&inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct bch_inode_unpacked dir_u, inode_u; ++ struct btree_trans trans; ++ int ret; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_unlink_trans(&trans, ++ dir->v.i_ino, &dir_u, ++ &inode_u, &dentry->d_name) ?: ++ bch2_trans_commit(&trans, NULL, ++ &dir->ei_journal_seq, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++ } while (ret == -EINTR); ++ ++ if (likely(!ret)) { ++ BUG_ON(inode_u.bi_inum != inode->v.i_ino); ++ ++ journal_seq_copy(inode, dir->ei_journal_seq); ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(c, inode, &inode_u, ++ ATTR_MTIME); ++ } ++ ++ bch2_trans_exit(&trans); ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ ++ return ret; ++} ++ ++static int bch2_symlink(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, ++ const char *symname) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir), *inode; ++ int ret; ++ ++ inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); ++ if (unlikely(IS_ERR(inode))) ++ return PTR_ERR(inode); ++ ++ inode_lock(&inode->v); ++ ret = page_symlink(&inode->v, symname, strlen(symname) + 1); ++ inode_unlock(&inode->v); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); ++ if (unlikely(ret)) ++ goto err; ++ ++ journal_seq_copy(dir, inode->ei_journal_seq); ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ goto err; ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++err: ++ iput(&inode->v); ++ return ret; ++} ++ ++static int bch2_mkdir(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0); ++} ++ ++static int bch2_rename2(struct user_namespace *mnt_userns, ++ struct inode *src_vdir, struct dentry *src_dentry, ++ struct inode *dst_vdir, struct dentry *dst_dentry, ++ unsigned flags) ++{ ++ struct bch_fs *c = src_vdir->i_sb->s_fs_info; ++ struct bch_inode_info *src_dir = to_bch_ei(src_vdir); ++ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); ++ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); ++ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); ++ struct bch_inode_unpacked dst_dir_u, src_dir_u; ++ struct bch_inode_unpacked src_inode_u, dst_inode_u; ++ struct btree_trans 
trans; ++ enum bch_rename_mode mode = flags & RENAME_EXCHANGE ++ ? BCH_RENAME_EXCHANGE ++ : dst_dentry->d_inode ++ ? BCH_RENAME_OVERWRITE : BCH_RENAME; ++ u64 journal_seq = 0; ++ int ret; ++ ++ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) ++ return -EINVAL; ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ ret = filemap_write_and_wait_range(src_inode->v.i_mapping, ++ 0, LLONG_MAX); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_trans_init(&trans, c, 8, 2048); ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, src_inode, ++ dst_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst_inode, ++ src_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++retry: ++ bch2_trans_begin(&trans); ++ ret = bch2_rename_trans(&trans, ++ src_dir->v.i_ino, &src_dir_u, ++ dst_dir->v.i_ino, &dst_dir_u, ++ &src_inode_u, ++ &dst_inode_u, ++ &src_dentry->d_name, ++ &dst_dentry->d_name, ++ mode) ?: ++ bch2_trans_commit(&trans, NULL, ++ &journal_seq, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOUNLOCK); ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err; ++ ++ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); ++ BUG_ON(dst_inode && ++ dst_inode->v.i_ino != dst_inode_u.bi_inum); ++ ++ bch2_inode_update_after_write(c, src_dir, &src_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(src_dir, journal_seq); ++ ++ if (src_dir != dst_dir) { ++ bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(dst_dir, journal_seq); ++ } ++ ++ bch2_inode_update_after_write(c, src_inode, &src_inode_u, ++ ATTR_CTIME); ++ journal_seq_copy(src_inode, journal_seq); ++ ++ if (dst_inode) { ++ bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ++ ATTR_CTIME); ++ journal_seq_copy(dst_inode, journal_seq); ++ } ++err: ++ bch2_trans_exit(&trans); ++ ++ bch2_fs_quota_transfer(c, src_inode, ++ bch_qid(&src_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ if (dst_inode) ++ bch2_fs_quota_transfer(c, dst_inode, ++ bch_qid(&dst_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ return ret; ++} ++ ++void bch2_setattr_copy(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ unsigned int ia_valid = attr->ia_valid; ++ ++ if (ia_valid & ATTR_UID) ++ bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid); ++ if (ia_valid & ATTR_GID) ++ bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid); ++ ++ if (ia_valid & ATTR_ATIME) ++ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); ++ if (ia_valid & ATTR_MTIME) ++ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); ++ if (ia_valid & ATTR_CTIME) ++ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); ++ ++ if (ia_valid & ATTR_MODE) { ++ umode_t mode = attr->ia_mode; ++ kgid_t gid = ia_valid & ATTR_GID ++ ? 
attr->ia_gid ++ : inode->v.i_gid; ++ ++ if (!in_group_p(gid) && ++ !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID)) ++ mode &= ~S_ISGID; ++ bi->bi_mode = mode; ++ } ++} ++ ++static int bch2_setattr_nonsize(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_qid qid; ++ struct btree_trans trans; ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl = NULL; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ++ qid = inode->ei_qid; ++ ++ if (attr->ia_valid & ATTR_UID) ++ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); ++ ++ if (attr->ia_valid & ATTR_GID) ++ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); ++ ++ ret = bch2_fs_quota_transfer(c, inode, qid, ~0, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ kfree(acl); ++ acl = NULL; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto btree_err; ++ ++ bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); ++ ++ if (attr->ia_valid & ATTR_MODE) { ++ ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); ++ if (ret) ++ goto btree_err; ++ } ++ ++ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++btree_err: ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err_trans; ++ ++ bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); ++ ++ if (acl) ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++err_trans: ++ bch2_trans_exit(&trans); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static int bch2_getattr(struct user_namespace *mnt_userns, ++ const struct path *path, struct kstat *stat, ++ u32 request_mask, unsigned query_flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ stat->dev = inode->v.i_sb->s_dev; ++ stat->ino = inode->v.i_ino; ++ stat->mode = inode->v.i_mode; ++ stat->nlink = inode->v.i_nlink; ++ stat->uid = inode->v.i_uid; ++ stat->gid = inode->v.i_gid; ++ stat->rdev = inode->v.i_rdev; ++ stat->size = i_size_read(&inode->v); ++ stat->atime = inode->v.i_atime; ++ stat->mtime = inode->v.i_mtime; ++ stat->ctime = inode->v.i_ctime; ++ stat->blksize = block_bytes(c); ++ stat->blocks = inode->v.i_blocks; ++ ++ if (request_mask & STATX_BTIME) { ++ stat->result_mask |= STATX_BTIME; ++ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); ++ } ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) ++ stat->attributes |= STATX_ATTR_IMMUTABLE; ++ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) ++ stat->attributes |= STATX_ATTR_APPEND; ++ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) ++ stat->attributes |= STATX_ATTR_NODUMP; ++ ++ return 0; ++} ++ ++static int bch2_setattr(struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct iattr *iattr) ++{ ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = setattr_prepare(mnt_userns, dentry, iattr); ++ if (ret) ++ return ret; ++ ++ return iattr->ia_valid & ATTR_SIZE ++ ? 
bch2_truncate(inode, iattr) ++ : bch2_setattr_nonsize(mnt_userns, inode, iattr); ++} ++ ++static int bch2_tmpfile(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, true); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_mark_tmpfile(dentry, &inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_fill_extent(struct bch_fs *c, ++ struct fiemap_extent_info *info, ++ struct bkey_s_c k, unsigned flags) ++{ ++ if (bkey_extent_is_data(k.k)) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int ret; ++ ++ if (k.k->type == KEY_TYPE_reflink_v) ++ flags |= FIEMAP_EXTENT_SHARED; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int flags2 = 0; ++ u64 offset = p.ptr.offset; ++ ++ if (p.crc.compression_type) ++ flags2 |= FIEMAP_EXTENT_ENCODED; ++ else ++ offset += p.crc.offset; ++ ++ if ((offset & (c->opts.block_size - 1)) || ++ (k.k->size & (c->opts.block_size - 1))) ++ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ++ ++ ret = fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ offset << 9, ++ k.k->size << 9, flags|flags2); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++ } else if (k.k->type == KEY_TYPE_reservation) { ++ return fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ 0, k.k->size << 9, ++ flags| ++ FIEMAP_EXTENT_DELALLOC| ++ FIEMAP_EXTENT_UNWRITTEN); ++ } else { ++ BUG(); ++ } ++} ++ ++static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, ++ u64 start, u64 len) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *ei = to_bch_ei(vinode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ BKEY_PADDED(k) cur, prev; ++ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); ++ unsigned offset_into_extent, sectors; ++ bool have_extent = false; ++ int ret = 0; ++ ++ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); ++ if (ret) ++ return ret; ++ ++ if (start + len < start) ++ return -EINVAL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(ei->v.i_ino, start >> 9), 0); ++retry: ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(iter->pos, end) < 0) { ++ if (!bkey_extent_is_data(k.k) && ++ k.k->type != KEY_TYPE_reservation) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ bkey_reassemble(&cur.k, k); ++ k = bkey_i_to_s_c(&cur.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &cur.k); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ if (offset_into_extent) ++ bch2_cut_front(POS(k.k->p.inode, ++ bkey_start_offset(k.k) + ++ offset_into_extent), ++ &cur.k); ++ bch2_key_resize(&cur.k.k, sectors); ++ cur.k.k.p = iter->pos; ++ cur.k.k.p.offset += cur.k.k.size; ++ ++ if (have_extent) { ++ ret = bch2_fill_extent(c, info, ++ bkey_i_to_s_c(&prev.k), 0); ++ if (ret) ++ break; ++ } ++ ++ bkey_copy(&prev.k, &cur.k); ++ have_extent = true; ++ ++ if (k.k->type == KEY_TYPE_reflink_v) ++ bch2_btree_iter_set_pos(iter, k.k->p); ++ else ++ bch2_btree_iter_next(iter); ++ } ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ if (!ret && have_extent) ++ ret = 
bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), ++ FIEMAP_EXTENT_LAST); ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ return ret < 0 ? ret : 0; ++} ++ ++static const struct vm_operations_struct bch_vm_ops = { ++ .fault = bch2_page_fault, ++ .map_pages = filemap_map_pages, ++ .page_mkwrite = bch2_page_mkwrite, ++}; ++ ++static int bch2_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ file_accessed(file); ++ ++ vma->vm_ops = &bch_vm_ops; ++ return 0; ++} ++ ++/* Directories: */ ++ ++static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) ++{ ++ return generic_file_llseek_size(file, offset, whence, ++ S64_MAX, S64_MAX); ++} ++ ++static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ if (!dir_emit_dots(file, ctx)) ++ return 0; ++ ++ return bch2_readdir(c, inode->v.i_ino, ctx); ++} ++ ++static const struct file_operations bch_file_operations = { ++ .llseek = bch2_llseek, ++ .read_iter = bch2_read_iter, ++ .write_iter = bch2_write_iter, ++ .mmap = bch2_mmap, ++ .open = generic_file_open, ++ .fsync = bch2_fsync, ++ .splice_read = generic_file_splice_read, ++ .splice_write = iter_file_splice_write, ++ .fallocate = bch2_fallocate_dispatch, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++ .remap_file_range = bch2_remap_file_range, ++}; ++ ++static const struct inode_operations bch_file_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .fiemap = bch2_fiemap, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_dir_inode_operations = { ++ .lookup = bch2_lookup, ++ .create = bch2_create, ++ .link = bch2_link, ++ .unlink = bch2_unlink, ++ .symlink = bch2_symlink, ++ .mkdir = bch2_mkdir, ++ .rmdir = bch2_unlink, ++ .mknod = bch2_mknod, ++ .rename = bch2_rename2, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .tmpfile = bch2_tmpfile, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct file_operations bch_dir_file_operations = { ++ .llseek = bch2_dir_llseek, ++ .read = generic_read_dir, ++ .iterate_shared = bch2_vfs_readdir, ++ .fsync = bch2_fsync, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++}; ++ ++static const struct inode_operations bch_symlink_inode_operations = { ++ .get_link = page_get_link, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_special_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct address_space_operations bch_address_space_operations = { ++ .writepage = bch2_writepage, ++ .readpage = bch2_readpage, ++ .writepages = bch2_writepages, ++ .readpages = bch2_readpages, ++ .set_page_dirty = __set_page_dirty_nobuffers, ++ .write_begin = bch2_write_begin, ++ .write_end = bch2_write_end, ++ .invalidatepage = bch2_invalidatepage, ++ .releasepage 
= bch2_releasepage, ++ .direct_IO = noop_direct_IO, ++#ifdef CONFIG_MIGRATION ++ .migratepage = bch2_migrate_page, ++#endif ++ .error_remove_page = generic_error_remove_page, ++}; ++ ++static struct inode *bch2_nfs_get_inode(struct super_block *sb, ++ u64 ino, u32 generation) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct inode *vinode; ++ ++ if (ino < BCACHEFS_ROOT_INO) ++ return ERR_PTR(-ESTALE); ++ ++ vinode = bch2_vfs_inode_get(c, ino); ++ if (IS_ERR(vinode)) ++ return ERR_CAST(vinode); ++ if (generation && vinode->i_generation != generation) { ++ /* we didn't find the right inode.. */ ++ iput(vinode); ++ return ERR_PTR(-ESTALE); ++ } ++ return vinode; ++} ++ ++static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ++ bch2_nfs_get_inode); ++} ++ ++static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ return generic_fh_to_parent(sb, fid, fh_len, fh_type, ++ bch2_nfs_get_inode); ++} ++ ++static const struct export_operations bch_export_ops = { ++ .fh_to_dentry = bch2_fh_to_dentry, ++ .fh_to_parent = bch2_fh_to_parent, ++ //.get_parent = bch2_get_parent, ++}; ++ ++static void bch2_vfs_inode_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi) ++{ ++ bch2_inode_update_after_write(c, inode, bi, ~0); ++ ++ inode->v.i_blocks = bi->bi_sectors; ++ inode->v.i_ino = bi->bi_inum; ++ inode->v.i_rdev = bi->bi_dev; ++ inode->v.i_generation = bi->bi_generation; ++ inode->v.i_size = bi->bi_size; ++ ++ inode->ei_journal_seq = 0; ++ inode->ei_quota_reserved = 0; ++ inode->ei_str_hash = bch2_hash_info_init(c, bi); ++ inode->ei_qid = bch_qid(bi); ++ ++ inode->v.i_mapping->a_ops = &bch_address_space_operations; ++ ++ switch (inode->v.i_mode & S_IFMT) { ++ case S_IFREG: ++ inode->v.i_op = &bch_file_inode_operations; ++ inode->v.i_fop = &bch_file_operations; ++ break; ++ case S_IFDIR: ++ inode->v.i_op = &bch_dir_inode_operations; ++ inode->v.i_fop = &bch_dir_file_operations; ++ break; ++ case S_IFLNK: ++ inode_nohighmem(&inode->v); ++ inode->v.i_op = &bch_symlink_inode_operations; ++ break; ++ default: ++ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); ++ inode->v.i_op = &bch_special_inode_operations; ++ break; ++ } ++} ++ ++static struct inode *bch2_alloc_inode(struct super_block *sb) ++{ ++ struct bch_inode_info *inode; ++ ++ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); ++ if (!inode) ++ return NULL; ++ ++ inode_init_once(&inode->v); ++ mutex_init(&inode->ei_update_lock); ++ pagecache_lock_init(&inode->ei_pagecache_lock); ++ mutex_init(&inode->ei_quota_lock); ++ inode->ei_journal_seq = 0; ++ ++ return &inode->v; ++} ++ ++static void bch2_i_callback(struct rcu_head *head) ++{ ++ struct inode *vinode = container_of(head, struct inode, i_rcu); ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ kmem_cache_free(bch2_inode_cache, inode); ++} ++ ++static void bch2_destroy_inode(struct inode *vinode) ++{ ++ call_rcu(&vinode->i_rcu, bch2_i_callback); ++} ++ ++static int inode_update_times_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); ++ bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); ++ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); ++ ++ return 0; ++} ++ ++static int bch2_vfs_write_inode(struct 
inode *vinode, ++ struct writeback_control *wbc) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static void bch2_evict_inode(struct inode *vinode) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ truncate_inode_pages_final(&inode->v.i_data); ++ ++ clear_inode(&inode->v); ++ ++ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); ++ ++ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), ++ KEY_TYPE_QUOTA_WARN); ++ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++ bch2_inode_rm(c, inode->v.i_ino); ++ } ++} ++ ++static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ struct super_block *sb = dentry->d_sb; ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); ++ unsigned shift = sb->s_blocksize_bits - 9; ++ u64 fsid; ++ ++ buf->f_type = BCACHEFS_STATFS_MAGIC; ++ buf->f_bsize = sb->s_blocksize; ++ buf->f_blocks = usage.capacity >> shift; ++ buf->f_bfree = (usage.capacity - usage.used) >> shift; ++ buf->f_bavail = buf->f_bfree; ++ buf->f_files = usage.nr_inodes; ++ buf->f_ffree = U64_MAX; ++ ++ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ ++ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); ++ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; ++ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; ++ buf->f_namelen = BCH_NAME_MAX; ++ ++ return 0; ++} ++ ++static int bch2_sync_fs(struct super_block *sb, int wait) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (c->opts.journal_flush_disabled) ++ return 0; ++ ++ if (!wait) { ++ bch2_journal_flush_async(&c->journal, NULL); ++ return 0; ++ } ++ ++ return bch2_journal_flush(&c->journal); ++} ++ ++static struct bch_fs *bch2_path_to_fs(const char *path) ++{ ++ struct bch_fs *c; ++ dev_t dev; ++ int ret; ++ ++ ret = lookup_bdev(path, &dev); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ c = bch2_dev_to_fs(dev); ++ return c ?: ERR_PTR(-ENOENT); ++} ++ ++static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, ++ unsigned nr_devs, struct bch_opts opts) ++{ ++ struct bch_fs *c, *c1, *c2; ++ size_t i; ++ ++ if (!nr_devs) ++ return ERR_PTR(-EINVAL); ++ ++ c = bch2_fs_open(devs, nr_devs, opts); ++ ++ if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { ++ /* ++ * Already open? 
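 * [editor's note: this branch handles bch2_fs_open() returning -EBUSY,
 *  i.e. at least one of the block devices is already registered.]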
++ * Look up each block device, make sure they all belong to a ++ * filesystem and they all belong to the _same_ filesystem ++ */ ++ ++ c1 = bch2_path_to_fs(devs[0]); ++ if (IS_ERR(c1)) ++ return c; ++ ++ for (i = 1; i < nr_devs; i++) { ++ c2 = bch2_path_to_fs(devs[i]); ++ if (!IS_ERR(c2)) ++ closure_put(&c2->cl); ++ ++ if (c1 != c2) { ++ closure_put(&c1->cl); ++ return c; ++ } ++ } ++ ++ c = c1; ++ } ++ ++ if (IS_ERR(c)) ++ return c; ++ ++ mutex_lock(&c->state_lock); ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) { ++ mutex_unlock(&c->state_lock); ++ closure_put(&c->cl); ++ pr_err("err mounting %s: incomplete filesystem", dev_name); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ mutex_unlock(&c->state_lock); ++ ++ set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); ++ return c; ++} ++ ++static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, ++ struct bch_opts opts) ++{ ++ char *dev_name = NULL, **devs = NULL, *s; ++ struct bch_fs *c = ERR_PTR(-ENOMEM); ++ size_t i, nr_devs = 0; ++ ++ dev_name = kstrdup(_dev_name, GFP_KERNEL); ++ if (!dev_name) ++ goto err; ++ ++ for (s = dev_name; s; s = strchr(s + 1, ':')) ++ nr_devs++; ++ ++ devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); ++ if (!devs) ++ goto err; ++ ++ for (i = 0, s = dev_name; ++ s; ++ (s = strchr(s, ':')) && (*s++ = '\0')) ++ devs[i++] = s; ++ ++ c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); ++err: ++ kfree(devs); ++ kfree(dev_name); ++ return c; ++} ++ ++static int bch2_remount(struct super_block *sb, int *flags, char *data) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_opts opts = bch2_opts_empty(); ++ int ret; ++ ++ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(&opts, data); ++ if (ret) ++ return ret; ++ ++ if (opts.read_only != c->opts.read_only) { ++ mutex_lock(&c->state_lock); ++ ++ if (opts.read_only) { ++ bch2_fs_read_only(c); ++ ++ sb->s_flags |= SB_RDONLY; ++ } else { ++ ret = bch2_fs_read_write(c); ++ if (ret) { ++ bch_err(c, "error going rw: %i", ret); ++ mutex_unlock(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ sb->s_flags &= ~SB_RDONLY; ++ } ++ ++ c->opts.read_only = opts.read_only; ++ ++ mutex_unlock(&c->state_lock); ++ } ++ ++ if (opts.errors >= 0) ++ c->opts.errors = opts.errors; ++ ++ return ret; ++} ++ ++static int bch2_show_options(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ enum bch_opt_id i; ++ char buf[512]; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->mode & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ bch2_opt_to_text(&PBUF(buf), c, opt, v, ++ OPT_SHOW_MOUNT_STYLE); ++ seq_putc(seq, ','); ++ seq_puts(seq, buf); ++ } ++ ++ return 0; ++ ++} ++ ++static const struct super_operations bch_super_operations = { ++ .alloc_inode = bch2_alloc_inode, ++ .destroy_inode = bch2_destroy_inode, ++ .write_inode = bch2_vfs_write_inode, ++ .evict_inode = bch2_evict_inode, ++ .sync_fs = bch2_sync_fs, ++ .statfs = bch2_statfs, ++ .show_options = bch2_show_options, ++ .remount_fs = bch2_remount, ++#if 0 ++ .put_super = bch2_put_super, ++ .freeze_fs = bch2_freeze, ++ .unfreeze_fs = bch2_unfreeze, ++#endif ++}; ++ ++static int bch2_test_super(struct super_block *s, void *data) ++{ ++ return s->s_fs_info == data; ++} ++ ++static int bch2_set_super(struct super_block *s, void *data) ++{ ++ s->s_fs_info = data; ++ return 0; ++} ++ ++static 
struct dentry *bch2_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, void *data) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ struct super_block *sb; ++ struct inode *vinode; ++ struct bch_opts opts = bch2_opts_empty(); ++ unsigned i; ++ int ret; ++ ++ opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(&opts, data); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ c = bch2_open_as_blockdevs(dev_name, opts); ++ if (IS_ERR(c)) ++ return ERR_CAST(c); ++ ++ sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); ++ if (IS_ERR(sb)) { ++ closure_put(&c->cl); ++ return ERR_CAST(sb); ++ } ++ ++ BUG_ON(sb->s_fs_info != c); ++ ++ if (sb->s_root) { ++ closure_put(&c->cl); ++ ++ if ((flags ^ sb->s_flags) & SB_RDONLY) { ++ ret = -EBUSY; ++ goto err_put_super; ++ } ++ goto out; ++ } ++ ++ sb->s_blocksize = block_bytes(c); ++ sb->s_blocksize_bits = ilog2(block_bytes(c)); ++ sb->s_maxbytes = MAX_LFS_FILESIZE; ++ sb->s_op = &bch_super_operations; ++ sb->s_export_op = &bch_export_ops; ++#ifdef CONFIG_BCACHEFS_QUOTA ++ sb->s_qcop = &bch2_quotactl_operations; ++ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; ++#endif ++ sb->s_xattr = bch2_xattr_handlers; ++ sb->s_magic = BCACHEFS_STATFS_MAGIC; ++ sb->s_time_gran = c->sb.time_precision; ++ c->vfs_sb = sb; ++ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); ++ ++ ret = super_setup_bdi(sb); ++ if (ret) ++ goto err_put_super; ++ ++ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; ++ ++ for_each_online_member(ca, c, i) { ++ struct block_device *bdev = ca->disk_sb.bdev; ++ ++ /* XXX: create an anonymous device for multi device filesystems */ ++ sb->s_bdev = bdev; ++ sb->s_dev = bdev->bd_dev; ++ percpu_ref_put(&ca->io_ref); ++ break; ++ } ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ if (c->opts.acl) ++ sb->s_flags |= SB_POSIXACL; ++#endif ++ ++ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); ++ if (IS_ERR(vinode)) { ++ bch_err(c, "error mounting: error getting root inode %i", ++ (int) PTR_ERR(vinode)); ++ ret = PTR_ERR(vinode); ++ goto err_put_super; ++ } ++ ++ sb->s_root = d_make_root(vinode); ++ if (!sb->s_root) { ++ bch_err(c, "error mounting: error allocating root dentry"); ++ ret = -ENOMEM; ++ goto err_put_super; ++ } ++ ++ sb->s_flags |= SB_ACTIVE; ++out: ++ return dget(sb->s_root); ++ ++err_put_super: ++ deactivate_locked_super(sb); ++ return ERR_PTR(ret); ++} ++ ++static void bch2_kill_sb(struct super_block *sb) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ generic_shutdown_super(sb); ++ ++ if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) ++ bch2_fs_stop(c); ++ else ++ closure_put(&c->cl); ++} ++ ++static struct file_system_type bcache_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "bcachefs", ++ .mount = bch2_mount, ++ .kill_sb = bch2_kill_sb, ++ .fs_flags = FS_REQUIRES_DEV, ++}; ++ ++MODULE_ALIAS_FS("bcachefs"); ++ ++void bch2_vfs_exit(void) ++{ ++ unregister_filesystem(&bcache_fs_type); ++ if (bch2_inode_cache) ++ kmem_cache_destroy(bch2_inode_cache); ++} ++ ++int __init bch2_vfs_init(void) ++{ ++ int ret = -ENOMEM; ++ ++ bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); ++ if (!bch2_inode_cache) ++ goto err; ++ ++ ret = register_filesystem(&bcache_fs_type); ++ if (ret) ++ goto err; ++ ++ return 0; ++err: ++ bch2_vfs_exit(); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +new file mode 100644 +index 000000000000..eda903a45325 +--- /dev/null ++++ b/fs/bcachefs/fs.h +@@ -0,0 +1,174 @@ ++/* SPDX-License-Identifier: 
GPL-2.0 */ ++#ifndef _BCACHEFS_FS_H ++#define _BCACHEFS_FS_H ++ ++#include "inode.h" ++#include "opts.h" ++#include "str_hash.h" ++#include "quota_types.h" ++ ++#include ++#include ++ ++/* ++ * Two-state lock - can be taken for add or block - both states are shared, ++ * like read side of rwsem, but conflict with other state: ++ */ ++struct pagecache_lock { ++ atomic_long_t v; ++ wait_queue_head_t wait; ++}; ++ ++static inline void pagecache_lock_init(struct pagecache_lock *lock) ++{ ++ atomic_long_set(&lock->v, 0); ++ init_waitqueue_head(&lock->wait); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *); ++void bch2_pagecache_add_get(struct pagecache_lock *); ++void bch2_pagecache_block_put(struct pagecache_lock *); ++void bch2_pagecache_block_get(struct pagecache_lock *); ++ ++struct bch_inode_info { ++ struct inode v; ++ ++ struct mutex ei_update_lock; ++ u64 ei_journal_seq; ++ u64 ei_quota_reserved; ++ unsigned long ei_last_dirtied; ++ ++ struct pagecache_lock ei_pagecache_lock; ++ ++ struct mutex ei_quota_lock; ++ struct bch_qid ei_qid; ++ ++ struct bch_hash_info ei_str_hash; ++ ++ /* copy of inode in btree: */ ++ struct bch_inode_unpacked ei_inode; ++}; ++ ++#define to_bch_ei(_inode) \ ++ container_of_or_null(_inode, struct bch_inode_info, v) ++ ++static inline int ptrcmp(void *l, void *r) ++{ ++ return cmp_int(l, r); ++} ++ ++enum bch_inode_lock_op { ++ INODE_LOCK = (1U << 0), ++ INODE_PAGECACHE_BLOCK = (1U << 1), ++ INODE_UPDATE_LOCK = (1U << 2), ++}; ++ ++#define bch2_lock_inodes(_locks, ...) \ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ down_write_nested(&a[i]->v.i_rwsem, i); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_lock_nested(&a[i]->ei_update_lock, i);\ ++ } \ ++} while (0) ++ ++#define bch2_unlock_inodes(_locks, ...) 
\ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ up_write(&a[i]->v.i_rwsem); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_unlock(&a[i]->ei_update_lock); \ ++ } \ ++} while (0) ++ ++static inline struct bch_inode_info *file_bch_inode(struct file *file) ++{ ++ return to_bch_ei(file_inode(file)); ++} ++ ++static inline bool inode_attr_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode, ++ enum inode_opt_id id) ++{ ++ return !(inode->ei_inode.bi_fields_set & (1 << id)) && ++ bch2_inode_opt_get(&dir->ei_inode, id) != ++ bch2_inode_opt_get(&inode->ei_inode, id); ++} ++ ++static inline bool inode_attrs_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode) ++{ ++ unsigned id; ++ ++ for (id = 0; id < Inode_opt_nr; id++) ++ if (inode_attr_changing(dir, inode, id)) ++ return true; ++ ++ return false; ++} ++ ++struct bch_inode_unpacked; ++ ++#ifndef NO_BCACHEFS_FS ++ ++int bch2_fs_quota_transfer(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_qid, ++ unsigned, ++ enum quota_acct_mode); ++ ++static inline int bch2_set_projid(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ u32 projid) ++{ ++ struct bch_qid qid = inode->ei_qid; ++ ++ qid.q[QTYP_PRJ] = projid; ++ ++ return bch2_fs_quota_transfer(c, inode, qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); ++ ++/* returns 0 if we want to do the update, or error is passed up */ ++typedef int (*inode_set_fn)(struct bch_inode_info *, ++ struct bch_inode_unpacked *, void *); ++ ++void bch2_inode_update_after_write(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *, ++ unsigned); ++int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, ++ inode_set_fn, void *, unsigned); ++ ++void bch2_vfs_exit(void); ++int bch2_vfs_init(void); ++ ++#else ++ ++static inline void bch2_vfs_exit(void) {} ++static inline int bch2_vfs_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_FS_H */ +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +new file mode 100644 +index 000000000000..3cced2b99f3f +--- /dev/null ++++ b/fs/bcachefs/fsck.c +@@ -0,0 +1,1436 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "inode.h" ++#include "keylist.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include /* struct qstr */ ++#include ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 sectors = 0; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, ++ POS(inum, 0), 0, k, ret) { ++ if (k.k->p.inode != inum) ++ break; ++ ++ if (bkey_extent_is_allocation(k.k)) ++ sectors += k.k->size; ++ } ++ ++ bch2_trans_iter_free(trans, iter); ++ ++ return ret ?: sectors; ++} ++ ++static int remove_dirent(struct btree_trans *trans, ++ struct bkey_s_c_dirent dirent) ++{ ++ struct bch_fs *c = trans->c; ++ struct qstr name; ++ struct bch_inode_unpacked dir_inode; ++ struct bch_hash_info dir_hash_info; ++ u64 dir_inum = 
dirent.k->p.inode; ++ int ret; ++ char *buf; ++ ++ name.len = bch2_dirent_name_bytes(dirent); ++ buf = kmalloc(name.len + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ ++ memcpy(buf, dirent.v->d_name, name.len); ++ buf[name.len] = '\0'; ++ name.name = buf; ++ ++ /* Unlock so we don't deadlock, after copying name: */ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode); ++ if (ret) { ++ bch_err(c, "remove_dirent: err %i looking up directory inode", ret); ++ goto err; ++ } ++ ++ dir_hash_info = bch2_hash_info_init(c, &dir_inode); ++ ++ ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL); ++ if (ret) ++ bch_err(c, "remove_dirent: err %i deleting dirent", ret); ++err: ++ kfree(buf); ++ return ret; ++} ++ ++static int reattach_inode(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ u64 inum) ++{ ++ struct bch_inode_unpacked inode_u; ++ char name_buf[20]; ++ struct qstr name; ++ int ret; ++ ++ snprintf(name_buf, sizeof(name_buf), "%llu", inum); ++ name = (struct qstr) QSTR(name_buf); ++ ++ ret = bch2_trans_do(c, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_LAZY_RW, ++ bch2_link_trans(&trans, lostfound_inode->bi_inum, ++ inum, &inode_u, &name)); ++ if (ret) ++ bch_err(c, "error %i reattaching inode %llu", ret, inum); ++ ++ return ret; ++} ++ ++struct inode_walker { ++ bool first_this_inode; ++ bool have_inode; ++ u64 cur_inum; ++ struct bch_inode_unpacked inode; ++}; ++ ++static struct inode_walker inode_walker_init(void) ++{ ++ return (struct inode_walker) { ++ .cur_inum = -1, ++ .have_inode = false, ++ }; ++} ++ ++static int walk_inode(struct btree_trans *trans, ++ struct inode_walker *w, u64 inum) ++{ ++ if (inum != w->cur_inum) { ++ int ret = bch2_inode_find_by_inum_trans(trans, inum, ++ &w->inode); ++ ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ w->have_inode = !ret; ++ w->cur_inum = inum; ++ w->first_this_inode = true; ++ } else { ++ w->first_this_inode = false; ++ } ++ ++ return 0; ++} ++ ++struct hash_check { ++ struct bch_hash_info info; ++ ++ /* start of current chain of hash collisions: */ ++ struct btree_iter *chain; ++ ++ /* next offset in current chain of hash collisions: */ ++ u64 chain_end; ++}; ++ ++static void hash_check_init(struct hash_check *h) ++{ ++ h->chain = NULL; ++ h->chain_end = 0; ++} ++ ++static void hash_stop_chain(struct btree_trans *trans, ++ struct hash_check *h) ++{ ++ if (h->chain) ++ bch2_trans_iter_free(trans, h->chain); ++ h->chain = NULL; ++} ++ ++static void hash_check_set_inode(struct btree_trans *trans, ++ struct hash_check *h, ++ const struct bch_inode_unpacked *bi) ++{ ++ h->info = bch2_hash_info_init(trans->c, bi); ++ hash_stop_chain(trans, h); ++} ++ ++static int hash_redo_key(const struct bch_hash_desc desc, ++ struct btree_trans *trans, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k, ++ u64 hashed) ++{ ++ struct bkey_i *tmp; ++ int ret = 0; ++ ++ tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ bkey_reassemble(tmp, k); ++ ++ ret = bch2_btree_delete_at(trans, k_iter, 0); ++ if (ret) ++ goto err; ++ ++ bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, ++ tmp, BCH_HASH_SET_MUST_CREATE); ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++err: ++ kfree(tmp); ++ return ret; ++} ++ ++static int fsck_hash_delete_at(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *info, ++ struct btree_iter *iter) ++{ ++ int ret; ++retry: ++ ret = 
bch2_hash_delete_at(trans, desc, info, iter) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret == -EINTR) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (!ret) ++ goto retry; ++ } ++ ++ return ret; ++} ++ ++static int hash_check_duplicates(struct btree_trans *trans, ++ const struct bch_hash_desc desc, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k2; ++ char buf[200]; ++ int ret = 0; ++ ++ if (!bkey_cmp(h->chain->pos, k_iter->pos)) ++ return 0; ++ ++ iter = bch2_trans_copy_iter(trans, h->chain); ++ BUG_ON(IS_ERR(iter)); ++ ++ for_each_btree_key_continue(iter, 0, k2, ret) { ++ if (bkey_cmp(k2.k->p, k.k->p) >= 0) ++ break; ++ ++ if (fsck_err_on(k2.k->type == desc.key_type && ++ !desc.cmp_bkey(k, k2), c, ++ "duplicate hash table keys:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); ++ if (ret) ++ return ret; ++ ret = 1; ++ break; ++ } ++ } ++fsck_err: ++ bch2_trans_iter_free(trans, iter); ++ return ret; ++} ++ ++static void hash_set_chain_start(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ bool hole = (k.k->type != KEY_TYPE_whiteout && ++ k.k->type != desc.key_type); ++ ++ if (hole || k.k->p.offset > h->chain_end + 1) ++ hash_stop_chain(trans, h); ++ ++ if (!hole) { ++ if (!h->chain) { ++ h->chain = bch2_trans_copy_iter(trans, k_iter); ++ BUG_ON(IS_ERR(h->chain)); ++ } ++ ++ h->chain_end = k.k->p.offset; ++ } ++} ++ ++static bool key_has_correct_hash(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ u64 hash; ++ ++ hash_set_chain_start(trans, desc, h, k_iter, k); ++ ++ if (k.k->type != desc.key_type) ++ return true; ++ ++ hash = desc.hash_bkey(&h->info, k); ++ ++ return hash >= h->chain->pos.offset && ++ hash <= k.k->p.offset; ++} ++ ++static int hash_check_key(struct btree_trans *trans, ++ const struct bch_hash_desc desc, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ char buf[200]; ++ u64 hashed; ++ int ret = 0; ++ ++ hash_set_chain_start(trans, desc, h, k_iter, k); ++ ++ if (k.k->type != desc.key_type) ++ return 0; ++ ++ hashed = desc.hash_bkey(&h->info, k); ++ ++ if (fsck_err_on(hashed < h->chain->pos.offset || ++ hashed > k.k->p.offset, c, ++ "hash table key at wrong offset: btree %u, %llu, " ++ "hashed to %llu chain starts at %llu\n%s", ++ desc.btree_id, k.k->p.offset, ++ hashed, h->chain->pos.offset, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); ++ if (ret) { ++ bch_err(c, "hash_redo_key err %i", ret); ++ return ret; ++ } ++ return 1; ++ } ++ ++ ret = hash_check_duplicates(trans, desc, h, k_iter, k); ++fsck_err: ++ return ret; ++} ++ ++static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, ++ struct btree_iter *iter, struct bkey_s_c *k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i_dirent *d = NULL; ++ int ret = -EINVAL; ++ char buf[200]; ++ unsigned len; ++ u64 hash; ++ ++ if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) ++ return 0; ++ ++ len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); ++ BUG_ON(!len); ++ ++ memcpy(buf, 
bkey_s_c_to_dirent(*k).v->d_name, len); ++ buf[len] = '\0'; ++ ++ d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); ++ if (!d) { ++ bch_err(c, "memory allocation failure"); ++ return -ENOMEM; ++ } ++ ++ bkey_reassemble(&d->k_i, *k); ++ ++ do { ++ --len; ++ if (!len) ++ goto err_redo; ++ ++ d->k.u64s = BKEY_U64s + dirent_val_u64s(len); ++ ++ BUG_ON(bkey_val_bytes(&d->k) < ++ offsetof(struct bch_dirent, d_name) + len); ++ ++ memset(d->v.d_name + len, 0, ++ bkey_val_bytes(&d->k) - ++ offsetof(struct bch_dirent, d_name) - len); ++ ++ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, ++ bkey_i_to_s_c(&d->k_i)); ++ } while (hash < h->chain->pos.offset || ++ hash > k->k->p.offset); ++ ++ if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", ++ buf, strlen(buf), d->v.d_name, len)) { ++ bch2_trans_update(trans, iter, &d->k_i); ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret) ++ goto err; ++ ++ *k = bch2_btree_iter_peek(iter); ++ ++ BUG_ON(k->k->type != KEY_TYPE_dirent); ++ } ++err: ++fsck_err: ++ kfree(d); ++ return ret; ++err_redo: ++ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); ++ ++ if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" ++ "hash table key at wrong offset: btree %u, offset %llu, " ++ "hashed to %llu chain starts at %llu\n%s", ++ buf, strlen(buf), BTREE_ID_DIRENTS, ++ k->k->p.offset, hash, h->chain->pos.offset, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ *k), buf))) { ++ ret = hash_redo_key(bch2_dirent_hash_desc, trans, ++ h, iter, *k, hash); ++ if (ret) ++ bch_err(c, "hash_redo_key err %i", ret); ++ else ++ ret = 1; ++ } ++ ++ goto err; ++} ++ ++static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) ++{ ++ return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), ++ POS(inode_nr + 1, 0), NULL); ++} ++ ++/* ++ * Walk extents: verify that extents have a corresponding S_ISREG inode, and ++ * that i_size an i_sectors are consistent ++ */ ++noinline_for_stack ++static int check_extents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i_sectors; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ bch_verbose(c, "checking extents"); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(BCACHEFS_ROOT_INO, 0), 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "extent type %u for missing inode %llu", ++ k.k->type, k.k->p.inode) || ++ fsck_err_on(w.have_inode && ++ !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, ++ "extent type %u for non regular file, inode %llu mode %o", ++ k.k->type, k.k->p.inode, w.inode.bi_mode)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = bch2_inode_truncate(c, k.k->p.inode, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(w.first_this_inode && ++ w.have_inode && ++ !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && ++ w.inode.bi_sectors != ++ (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), ++ c, "i_sectors wrong: got %llu, should be %llu", ++ w.inode.bi_sectors, i_sectors)) { ++ struct bkey_inode_buf p; ++ ++ w.inode.bi_sectors = i_sectors; ++ ++ bch2_trans_unlock(&trans); ++ ++ bch2_inode_pack(&p, &w.inode); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_INODES, ++ &p.inode.k_i, NULL, NULL, ++ 
BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret) { ++ bch_err(c, "error in fsck: error %i updating inode", ret); ++ goto err; ++ } ++ ++ /* revalidate iterator: */ ++ k = bch2_btree_iter_peek(iter); ++ } ++ ++ if (fsck_err_on(w.have_inode && ++ !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ k.k->type != KEY_TYPE_reservation && ++ k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, ++ "extent type %u offset %llu past end of inode %llu, i_size %llu", ++ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = bch2_inode_truncate(c, k.k->p.inode, ++ w.inode.bi_size); ++ if (ret) ++ goto err; ++ continue; ++ } ++ } ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* ++ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, ++ * validate d_type ++ */ ++noinline_for_stack ++static int check_dirents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct hash_check h; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ unsigned name_len; ++ char buf[200]; ++ int ret = 0; ++ ++ bch_verbose(c, "checking dirents"); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ hash_check_init(&h); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, ++ POS(BCACHEFS_ROOT_INO, 0), 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ struct bkey_s_c_dirent d; ++ struct bch_inode_unpacked target; ++ bool have_target; ++ u64 d_inum; ++ ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "dirent in nonexisting directory:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf)) || ++ fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, ++ "dirent in non directory inode type %u:\n%s", ++ mode_to_type(w.inode.bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (w.first_this_inode && w.have_inode) ++ hash_check_set_inode(&trans, &h, &w.inode); ++ ++ ret = check_dirent_hash(&trans, &h, iter, &k); ++ if (ret > 0) { ++ ret = 0; ++ continue; ++ } ++ if (ret) ++ goto fsck_err; ++ ++ if (ret) ++ goto fsck_err; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ d = bkey_s_c_to_dirent(k); ++ d_inum = le64_to_cpu(d.v->d_inum); ++ ++ name_len = bch2_dirent_name_bytes(d); ++ ++ if (fsck_err_on(!name_len, c, "empty dirent") || ++ fsck_err_on(name_len == 1 && ++ !memcmp(d.v->d_name, ".", 1), c, ++ ". dirent") || ++ fsck_err_on(name_len == 2 && ++ !memcmp(d.v->d_name, "..", 2), c, ++ ".. dirent") || ++ fsck_err_on(name_len == 2 && ++ !memcmp(d.v->d_name, "..", 2), c, ++ ".. 
dirent") || ++ fsck_err_on(memchr(d.v->d_name, '/', name_len), c, ++ "dirent name has invalid chars")) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(d_inum == d.k->p.inode, c, ++ "dirent points to own directory:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); ++ if (ret && ret != -ENOENT) ++ break; ++ ++ have_target = !ret; ++ ret = 0; ++ ++ if (fsck_err_on(!have_target, c, ++ "dirent points to missing inode:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(have_target && ++ d.v->d_type != ++ mode_to_type(target.bi_mode), c, ++ "incorrect d_type: should be %u:\n%s", ++ mode_to_type(target.bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ struct bkey_i_dirent *n; ++ ++ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_type = mode_to_type(target.bi_mode); ++ ++ bch2_trans_update(&trans, iter, &n->k_i); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ kfree(n); ++ if (ret) ++ goto err; ++ ++ } ++ } ++ ++ hash_stop_chain(&trans, &h); ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* ++ * Walk xattrs: verify that they all have a corresponding inode ++ */ ++noinline_for_stack ++static int check_xattrs(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct hash_check h; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch_verbose(c, "checking xattrs"); ++ ++ hash_check_init(&h); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ POS(BCACHEFS_ROOT_INO, 0), 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "xattr for missing inode %llu", ++ k.k->p.inode)) { ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (w.first_this_inode && w.have_inode) ++ hash_check_set_inode(&trans, &h, &w.inode); ++ ++ ret = hash_check_key(&trans, bch2_xattr_hash_desc, ++ &h, iter, k); ++ if (ret) ++ goto fsck_err; ++ } ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* Get root directory, create if it doesn't exist: */ ++static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) ++{ ++ struct bkey_inode_buf packed; ++ int ret; ++ ++ bch_verbose(c, "checking root directory"); ++ ++ ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (fsck_err_on(ret, c, "root directory missing")) ++ goto create_root; ++ ++ if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, ++ "root inode not a directory")) ++ goto create_root; ++ ++ return 0; ++fsck_err: ++ return ret; ++create_root: ++ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, ++ 0, NULL); ++ root_inode->bi_inum = BCACHEFS_ROOT_INO; ++ ++ bch2_inode_pack(&packed, root_inode); ++ ++ return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, ++ NULL, NULL, ++ 
BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++} ++ ++/* Get lost+found, create if it doesn't exist: */ ++static int check_lostfound(struct bch_fs *c, ++ struct bch_inode_unpacked *root_inode, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ struct qstr lostfound = QSTR("lost+found"); ++ struct bch_hash_info root_hash_info = ++ bch2_hash_info_init(c, root_inode); ++ u64 inum; ++ int ret; ++ ++ bch_verbose(c, "checking lost+found"); ++ ++ inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, ++ &lostfound); ++ if (!inum) { ++ bch_notice(c, "creating lost+found"); ++ goto create_lostfound; ++ } ++ ++ ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (fsck_err_on(ret, c, "lost+found missing")) ++ goto create_lostfound; ++ ++ if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, ++ "lost+found inode not a directory")) ++ goto create_lostfound; ++ ++ return 0; ++fsck_err: ++ return ret; ++create_lostfound: ++ bch2_inode_init_early(c, lostfound_inode); ++ ++ ret = bch2_trans_do(c, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_create_trans(&trans, ++ BCACHEFS_ROOT_INO, root_inode, ++ lostfound_inode, &lostfound, ++ 0, 0, S_IFDIR|0755, 0, NULL, NULL)); ++ if (ret) ++ bch_err(c, "error creating lost+found: %i", ret); ++ ++ return ret; ++} ++ ++struct inode_bitmap { ++ unsigned long *bits; ++ size_t size; ++}; ++ ++static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) ++{ ++ return nr < b->size ? test_bit(nr, b->bits) : false; ++} ++ ++static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) ++{ ++ if (nr >= b->size) { ++ size_t new_size = max_t(size_t, max_t(size_t, ++ PAGE_SIZE * 8, ++ b->size * 2), ++ nr + 1); ++ void *n; ++ ++ new_size = roundup_pow_of_two(new_size); ++ n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); ++ if (!n) { ++ return -ENOMEM; ++ } ++ ++ b->bits = n; ++ b->size = new_size; ++ } ++ ++ __set_bit(nr, b->bits); ++ return 0; ++} ++ ++struct pathbuf { ++ size_t nr; ++ size_t size; ++ ++ struct pathbuf_entry { ++ u64 inum; ++ u64 offset; ++ } *entries; ++}; ++ ++static int path_down(struct pathbuf *p, u64 inum) ++{ ++ if (p->nr == p->size) { ++ size_t new_size = max_t(size_t, 256UL, p->size * 2); ++ void *n = krealloc(p->entries, ++ new_size * sizeof(p->entries[0]), ++ GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ p->entries = n; ++ p->size = new_size; ++ }; ++ ++ p->entries[p->nr++] = (struct pathbuf_entry) { ++ .inum = inum, ++ .offset = 0, ++ }; ++ return 0; ++} ++ ++noinline_for_stack ++static int check_directory_structure(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ struct inode_bitmap dirs_done = { NULL, 0 }; ++ struct pathbuf path = { 0, 0, NULL }; ++ struct pathbuf_entry *e; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent dirent; ++ bool had_unreachable; ++ u64 d_inum; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ bch_verbose(c, "checking directory structure"); ++ ++ /* DFS: */ ++restart_dfs: ++ had_unreachable = false; ++ ++ ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); ++ if (ret) { ++ bch_err(c, "memory allocation failure in inode_bitmap_set()"); ++ goto err; ++ } ++ ++ ret = path_down(&path, BCACHEFS_ROOT_INO); ++ if (ret) ++ goto err; ++ ++ while (path.nr) { ++next: ++ e = &path.entries[path.nr - 1]; ++ ++ if (e->offset == U64_MAX) ++ goto up; ++ ++ for_each_btree_key(&trans, iter, 
BTREE_ID_DIRENTS, ++ POS(e->inum, e->offset + 1), 0, k, ret) { ++ if (k.k->p.inode != e->inum) ++ break; ++ ++ e->offset = k.k->p.offset; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ dirent = bkey_s_c_to_dirent(k); ++ ++ if (dirent.v->d_type != DT_DIR) ++ continue; ++ ++ d_inum = le64_to_cpu(dirent.v->d_inum); ++ ++ if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, ++ "directory %llu has multiple hardlinks", ++ d_inum)) { ++ ret = remove_dirent(&trans, dirent); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ ret = inode_bitmap_set(&dirs_done, d_inum); ++ if (ret) { ++ bch_err(c, "memory allocation failure in inode_bitmap_set()"); ++ goto err; ++ } ++ ++ ret = path_down(&path, d_inum); ++ if (ret) { ++ goto err; ++ } ++ ++ ret = bch2_trans_iter_free(&trans, iter); ++ if (ret) { ++ bch_err(c, "btree error %i in fsck", ret); ++ goto err; ++ } ++ goto next; ++ } ++ ret = bch2_trans_iter_free(&trans, iter) ?: ret; ++ if (ret) { ++ bch_err(c, "btree error %i in fsck", ret); ++ goto err; ++ } ++up: ++ path.nr--; ++ } ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) ++ continue; ++ ++ ret = bch2_empty_dir_trans(&trans, k.k->p.inode); ++ if (ret == -EINTR) ++ goto retry; ++ if (!ret) ++ continue; ++ ++ if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, ++ "unreachable directory found (inum %llu)", ++ k.k->p.inode)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = reattach_inode(c, lostfound_inode, k.k->p.inode); ++ if (ret) { ++ goto err; ++ } ++ ++ had_unreachable = true; ++ } ++ } ++ bch2_trans_iter_free(&trans, iter); ++ if (ret) ++ goto err; ++ ++ if (had_unreachable) { ++ bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); ++ kfree(dirs_done.bits); ++ kfree(path.entries); ++ memset(&dirs_done, 0, sizeof(dirs_done)); ++ memset(&path, 0, sizeof(path)); ++ goto restart_dfs; ++ } ++err: ++fsck_err: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ kfree(dirs_done.bits); ++ kfree(path.entries); ++ return ret; ++} ++ ++struct nlink { ++ u32 count; ++ u32 dir_count; ++}; ++ ++typedef GENRADIX(struct nlink) nlink_table; ++ ++static void inc_link(struct bch_fs *c, nlink_table *links, ++ u64 range_start, u64 *range_end, ++ u64 inum, bool dir) ++{ ++ struct nlink *link; ++ ++ if (inum < range_start || inum >= *range_end) ++ return; ++ ++ link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); ++ if (!link) { ++ bch_verbose(c, "allocation failed during fsck - will need another pass"); ++ *range_end = inum; ++ return; ++ } ++ ++ if (dir) ++ link->dir_count++; ++ else ++ link->count++; ++} ++ ++noinline_for_stack ++static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, ++ u64 range_start, u64 *range_end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ u64 d_inum; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { ++ switch (k.k->type) { ++ case KEY_TYPE_dirent: ++ d = bkey_s_c_to_dirent(k); ++ d_inum = le64_to_cpu(d.v->d_inum); ++ ++ if (d.v->d_type == DT_DIR) ++ inc_link(c, links, range_start, range_end, ++ d.k->p.inode, true); ++ ++ inc_link(c, links, range_start, range_end, ++ d_inum, false); ++ 
++ break; ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ bch_err(c, "error in fsck: btree error %i while walking dirents", ret); ++ ++ return ret; ++} ++ ++static int check_inode_nlink(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ struct bch_inode_unpacked *u, ++ struct nlink *link, ++ bool *do_update) ++{ ++ u32 i_nlink = bch2_inode_nlink_get(u); ++ u32 real_i_nlink = ++ link->count * nlink_bias(u->bi_mode) + ++ link->dir_count; ++ int ret = 0; ++ ++ /* ++ * These should have been caught/fixed by earlier passes, we don't ++ * repair them here: ++ */ ++ if (S_ISDIR(u->bi_mode) && link->count > 1) { ++ need_fsck_err(c, "directory %llu with multiple hardlinks: %u", ++ u->bi_inum, link->count); ++ return 0; ++ } ++ ++ if (S_ISDIR(u->bi_mode) && !link->count) { ++ need_fsck_err(c, "unreachable directory found (inum %llu)", ++ u->bi_inum); ++ return 0; ++ } ++ ++ if (!S_ISDIR(u->bi_mode) && link->dir_count) { ++ need_fsck_err(c, "non directory with subdirectories", ++ u->bi_inum); ++ return 0; ++ } ++ ++ if (!link->count && ++ !(u->bi_flags & BCH_INODE_UNLINKED) && ++ (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { ++ if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", ++ u->bi_inum, mode_to_type(u->bi_mode)) == ++ FSCK_ERR_IGNORE) ++ return 0; ++ ++ ret = reattach_inode(c, lostfound_inode, u->bi_inum); ++ if (ret) ++ return ret; ++ ++ link->count = 1; ++ real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink < link->count) { ++ if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", ++ u->bi_inum, i_nlink, link->count, ++ mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink != real_i_nlink && ++ c->sb.clean) { ++ if (fsck_err(c, "filesystem marked clean, " ++ "but inode %llu has wrong i_nlink " ++ "(type %u i_nlink %u, should be %u)", ++ u->bi_inum, mode_to_type(u->bi_mode), ++ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink != real_i_nlink && ++ (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { ++ if (fsck_err(c, "inode %llu has wrong i_nlink " ++ "(type %u i_nlink %u, should be %u)", ++ u->bi_inum, mode_to_type(u->bi_mode), ++ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (real_i_nlink && i_nlink != real_i_nlink) ++ bch_verbose(c, "setting inode %llu nlink from %u to %u", ++ u->bi_inum, i_nlink, real_i_nlink); ++set_i_nlink: ++ if (i_nlink != real_i_nlink) { ++ bch2_inode_nlink_set(u, real_i_nlink); ++ *do_update = true; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int check_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *lostfound_inode, ++ struct btree_iter *iter, ++ struct bkey_s_c_inode inode, ++ struct nlink *link) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ bool do_update = false; ++ int ret = 0; ++ ++ ret = bch2_inode_unpack(inode, &u); ++ ++ bch2_trans_unlock(trans); ++ ++ if (bch2_fs_inconsistent_on(ret, c, ++ "error unpacking inode %llu in fsck", ++ inode.k->p.inode)) ++ return ret; ++ ++ if (link) { ++ ret = check_inode_nlink(c, lostfound_inode, &u, link, ++ &do_update); ++ if (ret) ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_UNLINKED && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", ++ u.bi_inum))) { ++ bch_verbose(c, "deleting inode %llu", u.bi_inum); ++ ++ ret = 
bch2_inode_rm(c, u.bi_inum); ++ if (ret) ++ bch_err(c, "error in fsck: error %i while deleting inode", ret); ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", ++ u.bi_inum))) { ++ bch_verbose(c, "truncating inode %llu", u.bi_inum); ++ ++ /* ++ * XXX: need to truncate partial blocks too here - or ideally ++ * just switch units to bytes and that issue goes away ++ */ ++ ++ ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); ++ if (ret) { ++ bch_err(c, "error in fsck: error %i truncating inode", ret); ++ return ret; ++ } ++ ++ /* ++ * We truncated without our normal sector accounting hook, just ++ * make sure we recalculate it: ++ */ ++ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; ++ ++ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ do_update = true; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", ++ u.bi_inum))) { ++ s64 sectors; ++ ++ bch_verbose(c, "recounting sectors for inode %llu", ++ u.bi_inum); ++ ++ sectors = bch2_count_inode_sectors(trans, u.bi_inum); ++ if (sectors < 0) { ++ bch_err(c, "error in fsck: error %i recounting inode sectors", ++ (int) sectors); ++ return sectors; ++ } ++ ++ u.bi_sectors = sectors; ++ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; ++ do_update = true; ++ } ++ ++ if (do_update) { ++ struct bkey_inode_buf p; ++ ++ bch2_inode_pack(&p, &u); ++ bch2_trans_update(trans, iter, &p.inode.k_i); ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret && ret != -EINTR) ++ bch_err(c, "error in fsck: error %i " ++ "updating inode", ret); ++ } ++fsck_err: ++ return ret; ++} ++ ++noinline_for_stack ++static int bch2_gc_walk_inodes(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ nlink_table *links, ++ u64 range_start, u64 range_end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct nlink *link, zero_links = { 0, 0 }; ++ struct genradix_iter nlinks_iter; ++ int ret = 0, ret2 = 0; ++ u64 nlinks_pos; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, ++ POS(range_start, 0), 0); ++ nlinks_iter = genradix_iter_init(links, 0); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret2 = bkey_err(k))) { ++peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); ++ ++ if (!link && (!k.k || iter->pos.inode >= range_end)) ++ break; ++ ++ nlinks_pos = range_start + nlinks_iter.pos; ++ if (iter->pos.inode > nlinks_pos) { ++ /* Should have been caught by dirents pass: */ ++ need_fsck_err_on(link && link->count, c, ++ "missing inode %llu (nlink %u)", ++ nlinks_pos, link->count); ++ genradix_iter_advance(&nlinks_iter, links); ++ goto peek_nlinks; ++ } ++ ++ if (iter->pos.inode < nlinks_pos || !link) ++ link = &zero_links; ++ ++ if (k.k && k.k->type == KEY_TYPE_inode) { ++ ret = check_inode(&trans, lostfound_inode, iter, ++ bkey_s_c_to_inode(k), link); ++ BUG_ON(ret == -EINTR); ++ if (ret) ++ break; ++ } else { ++ /* Should have been caught by dirents pass: */ ++ need_fsck_err_on(link->count, c, ++ "missing inode %llu (nlink %u)", ++ nlinks_pos, link->count); ++ } ++ ++ if (nlinks_pos == iter->pos.inode) ++ genradix_iter_advance(&nlinks_iter, links); ++ ++ bch2_btree_iter_next(iter); ++ bch2_trans_cond_resched(&trans); ++ } ++fsck_err: ++ bch2_trans_exit(&trans); ++ ++ if (ret2) ++ bch_err(c, "error in 
fsck: btree error %i while walking inodes", ret2); ++ ++ return ret ?: ret2; ++} ++ ++noinline_for_stack ++static int check_inode_nlinks(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ nlink_table links; ++ u64 this_iter_range_start, next_iter_range_start = 0; ++ int ret = 0; ++ ++ bch_verbose(c, "checking inode nlinks"); ++ ++ genradix_init(&links); ++ ++ do { ++ this_iter_range_start = next_iter_range_start; ++ next_iter_range_start = U64_MAX; ++ ++ ret = bch2_gc_walk_dirents(c, &links, ++ this_iter_range_start, ++ &next_iter_range_start); ++ if (ret) ++ break; ++ ++ ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, ++ this_iter_range_start, ++ next_iter_range_start); ++ if (ret) ++ break; ++ ++ genradix_free(&links); ++ } while (next_iter_range_start != U64_MAX); ++ ++ genradix_free(&links); ++ ++ return ret; ++} ++ ++/* ++ * Checks for inconsistencies that shouldn't happen, unless we have a bug. ++ * Doesn't fix them yet, mainly because they haven't yet been observed: ++ */ ++int bch2_fsck_full(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ ++ return check_extents(c) ?: ++ check_dirents(c) ?: ++ check_xattrs(c) ?: ++ check_root(c, &root_inode) ?: ++ check_lostfound(c, &root_inode, &lostfound_inode) ?: ++ check_directory_structure(c, &lostfound_inode) ?: ++ check_inode_nlinks(c, &lostfound_inode); ++} ++ ++int bch2_fsck_inode_nlink(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ ++ return check_root(c, &root_inode) ?: ++ check_lostfound(c, &root_inode, &lostfound_inode) ?: ++ check_inode_nlinks(c, &lostfound_inode); ++} ++ ++int bch2_fsck_walk_inodes_only(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_inode inode; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ inode = bkey_s_c_to_inode(k); ++ ++ if (inode.v->bi_flags & ++ (BCH_INODE_I_SIZE_DIRTY| ++ BCH_INODE_I_SECTORS_DIRTY| ++ BCH_INODE_UNLINKED)) { ++ ret = check_inode(&trans, NULL, iter, inode, NULL); ++ BUG_ON(ret == -EINTR); ++ if (ret) ++ break; ++ } ++ } ++ BUG_ON(ret == -EINTR); ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} +diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h +new file mode 100644 +index 000000000000..9e4af02bde1e +--- /dev/null ++++ b/fs/bcachefs/fsck.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FSCK_H ++#define _BCACHEFS_FSCK_H ++ ++int bch2_fsck_full(struct bch_fs *); ++int bch2_fsck_inode_nlink(struct bch_fs *); ++int bch2_fsck_walk_inodes_only(struct bch_fs *); ++ ++#endif /* _BCACHEFS_FSCK_H */ +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +new file mode 100644 +index 000000000000..c0642ff46ba0 +--- /dev/null ++++ b/fs/bcachefs/inode.c +@@ -0,0 +1,567 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "str_hash.h" ++ ++#include ++ ++#include ++ ++const char * const bch2_inode_opts[] = { ++#define x(name, ...) 
#name, ++ BCH_INODE_OPTS() ++#undef x ++ NULL, ++}; ++ ++static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; ++static const u8 bits_table[8] = { ++ 1 * 8 - 1, ++ 2 * 8 - 2, ++ 3 * 8 - 3, ++ 4 * 8 - 4, ++ 6 * 8 - 5, ++ 8 * 8 - 6, ++ 10 * 8 - 7, ++ 13 * 8 - 8, ++}; ++ ++static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) ++{ ++ __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; ++ unsigned shift, bytes, bits = likely(!hi) ++ ? fls64(lo) ++ : fls64(hi) + 64; ++ ++ for (shift = 1; shift <= 8; shift++) ++ if (bits < bits_table[shift - 1]) ++ goto got_shift; ++ ++ BUG(); ++got_shift: ++ bytes = byte_table[shift - 1]; ++ ++ BUG_ON(out + bytes > end); ++ ++ memcpy(out, (u8 *) in + 16 - bytes, bytes); ++ *out |= (1 << 8) >> shift; ++ ++ return bytes; ++} ++ ++static int inode_decode_field(const u8 *in, const u8 *end, ++ u64 out[2], unsigned *out_bits) ++{ ++ __be64 be[2] = { 0, 0 }; ++ unsigned bytes, shift; ++ u8 *p; ++ ++ if (in >= end) ++ return -1; ++ ++ if (!*in) ++ return -1; ++ ++ /* ++ * position of highest set bit indicates number of bytes: ++ * shift = number of bits to remove in high byte: ++ */ ++ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ ++ bytes = byte_table[shift - 1]; ++ ++ if (in + bytes > end) ++ return -1; ++ ++ p = (u8 *) be + 16 - bytes; ++ memcpy(p, in, bytes); ++ *p ^= (1 << 8) >> shift; ++ ++ out[0] = be64_to_cpu(be[0]); ++ out[1] = be64_to_cpu(be[1]); ++ *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]); ++ ++ return bytes; ++} ++ ++void bch2_inode_pack(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ u8 *out = packed->inode.v.fields; ++ u8 *end = (void *) &packed[1]; ++ u8 *last_nonzero_field = out; ++ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; ++ unsigned bytes; ++ ++ bkey_inode_init(&packed->inode.k_i); ++ packed->inode.k.p.inode = inode->bi_inum; ++ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; ++ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); ++ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ ++#define x(_name, _bits) \ ++ out += inode_encode_field(out, end, 0, inode->_name); \ ++ nr_fields++; \ ++ \ ++ if (inode->_name) { \ ++ last_nonzero_field = out; \ ++ last_nonzero_fieldnr = nr_fields; \ ++ } ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ out = last_nonzero_field; ++ nr_fields = last_nonzero_fieldnr; ++ ++ bytes = out - (u8 *) &packed->inode.v; ++ set_bkey_val_bytes(&packed->inode.k, bytes); ++ memset_u64s_tail(&packed->inode.v, 0, bytes); ++ ++ SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ struct bch_inode_unpacked unpacked; ++ ++ int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), ++ &unpacked); ++ BUG_ON(ret); ++ BUG_ON(unpacked.bi_inum != inode->bi_inum); ++ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); ++ BUG_ON(unpacked.bi_mode != inode->bi_mode); ++ ++#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); ++ BCH_INODE_FIELDS() ++#undef x ++ } ++} ++ ++int bch2_inode_unpack(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) ++{ ++ const u8 *in = inode.v->fields; ++ const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); ++ u64 field[2]; ++ unsigned fieldnr = 0, field_bits; ++ int ret; ++ ++ unpacked->bi_inum = inode.k->p.inode; ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++#define x(_name, _bits) \ ++ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ 
++ memset(&unpacked->_name, 0, \ ++ sizeof(*unpacked) - \ ++ offsetof(struct bch_inode_unpacked, _name)); \ ++ return 0; \ ++ } \ ++ \ ++ ret = inode_decode_field(in, end, field, &field_bits); \ ++ if (ret < 0) \ ++ return ret; \ ++ \ ++ if (field_bits > sizeof(unpacked->_name) * 8) \ ++ return -1; \ ++ \ ++ unpacked->_name = field[1]; \ ++ in += ret; ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? */ ++ ++ return 0; ++} ++ ++struct btree_iter *bch2_inode_peek(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u64 inum, unsigned flags) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0), ++ BTREE_ITER_SLOTS|flags); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); ++ if (ret) ++ goto err; ++ ++ return iter; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ERR_PTR(ret); ++} ++ ++int bch2_inode_write(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode) ++{ ++ struct bkey_inode_buf *inode_p; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return PTR_ERR(inode_p); ++ ++ bch2_inode_pack(inode_p, inode); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i); ++ return 0; ++} ++ ++const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ struct bch_inode_unpacked unpacked; ++ ++ if (k.k->p.offset) ++ return "nonzero offset"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) ++ return "incorrect value size"; ++ ++ if (k.k->p.inode < BLOCKDEV_INODE_MAX) ++ return "fs inode in blockdev range"; ++ ++ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) ++ return "invalid str hash type"; ++ ++ if (bch2_inode_unpack(inode, &unpacked)) ++ return "invalid variable length fields"; ++ ++ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) ++ return "invalid data checksum type"; ++ ++ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) ++ return "invalid data checksum type"; ++ ++ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && ++ unpacked.bi_nlink != 0) ++ return "flagged as unlinked but bi_nlink != 0"; ++ ++ return NULL; ++} ++ ++void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ struct bch_inode_unpacked unpacked; ++ ++ if (bch2_inode_unpack(inode, &unpacked)) { ++ pr_buf(out, "(unpack error)"); ++ return; ++ } ++ ++#define x(_name, _bits) \ ++ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); ++ BCH_INODE_FIELDS() ++#undef x ++} ++ ++const char *bch2_inode_generation_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ if (k.k->p.offset) ++ return "nonzero offset"; ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); ++ ++ pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); ++} ++ ++void bch2_inode_init_early(struct bch_fs *c, ++ struct bch_inode_unpacked *inode_u) ++{ ++ enum bch_str_hash_type str_hash = 
++ bch2_str_hash_opt_to_type(c, c->opts.str_hash); ++ ++ memset(inode_u, 0, sizeof(*inode_u)); ++ ++ /* ick */ ++ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; ++ get_random_bytes(&inode_u->bi_hash_seed, ++ sizeof(inode_u->bi_hash_seed)); ++} ++ ++void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ inode_u->bi_mode = mode; ++ inode_u->bi_uid = uid; ++ inode_u->bi_gid = gid; ++ inode_u->bi_dev = rdev; ++ inode_u->bi_atime = now; ++ inode_u->bi_mtime = now; ++ inode_u->bi_ctime = now; ++ inode_u->bi_otime = now; ++ ++ if (parent && parent->bi_mode & S_ISGID) { ++ inode_u->bi_gid = parent->bi_gid; ++ if (S_ISDIR(mode)) ++ inode_u->bi_mode |= S_ISGID; ++ } ++ ++ if (parent) { ++#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ } ++} ++ ++void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ bch2_inode_init_early(c, inode_u); ++ bch2_inode_init_late(inode_u, bch2_current_time(c), ++ uid, gid, mode, rdev, parent); ++} ++ ++static inline u32 bkey_generation(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ BUG(); ++ case KEY_TYPE_inode_generation: ++ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); ++ default: ++ return 0; ++ } ++} ++ ++int bch2_inode_create(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ u64 min, u64 max, u64 *hint) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_inode_buf *inode_p; ++ struct btree_iter *iter; ++ u64 start; ++ int ret; ++ ++ if (!max) ++ max = ULLONG_MAX; ++ ++ if (c->opts.inodes_32bit) ++ max = min_t(u64, max, U32_MAX); ++ ++ start = READ_ONCE(*hint); ++ ++ if (start >= max || start < min) ++ start = min; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return PTR_ERR(inode_p); ++ ++ iter = bch2_trans_get_iter(trans, ++ BTREE_ID_INODES, POS(start, 0), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++again: ++ while (1) { ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ /* slot used */ ++ if (iter->pos.inode >= max) ++ goto out; ++ ++ bch2_btree_iter_next_slot(iter); ++ break; ++ ++ default: ++ *hint = k.k->p.inode; ++ inode_u->bi_inum = k.k->p.inode; ++ inode_u->bi_generation = bkey_generation(k); ++ ++ bch2_inode_pack(inode_p, inode_u); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i); ++ return 0; ++ } ++ } ++out: ++ if (start != min) { ++ /* Retry from start */ ++ start = min; ++ bch2_btree_iter_set_pos(iter, POS(start, 0)); ++ goto again; ++ } ++ ++ return -ENOSPC; ++} ++ ++int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_inode_generation delete; ++ struct bpos start = POS(inode_nr, 0); ++ struct bpos end = POS(inode_nr + 1, 0); ++ int ret; ++ ++ /* ++ * If this was a directory, there shouldn't be any real dirents left - ++ * but there could be whiteouts (from hash collisions) that we should ++ * delete: ++ * ++ * XXX: the dirent could ideally would delete whiteouts when they're no ++ * longer needed ++ */ ++ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ 
start, end, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_DIRENTS, ++ start, end, NULL); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(inode_nr, 0), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ do { ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ u32 bi_generation = 0; ++ ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, ++ "inode %llu not found when deleting", ++ inode_nr); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: { ++ struct bch_inode_unpacked inode_u; ++ ++ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) ++ bi_generation = inode_u.bi_generation + 1; ++ break; ++ } ++ case KEY_TYPE_inode_generation: { ++ struct bkey_s_c_inode_generation g = ++ bkey_s_c_to_inode_generation(k); ++ bi_generation = le32_to_cpu(g.v->bi_generation); ++ break; ++ } ++ } ++ ++ if (!bi_generation) { ++ bkey_init(&delete.k); ++ delete.k.p.inode = inode_nr; ++ } else { ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p.inode = inode_nr; ++ delete.v.bi_generation = cpu_to_le32(bi_generation); ++ } ++ ++ bch2_trans_update(&trans, iter, &delete.k_i); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL); ++ } while (ret == -EINTR); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, ++ POS(inode_nr, 0), BTREE_ITER_SLOTS); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ ret = k.k->type == KEY_TYPE_inode ++ ? 
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) ++ : -ENOENT; ++ ++ bch2_trans_iter_put(trans, iter); ++ ++ return ret; ++} ++ ++int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ return bch2_trans_do(c, NULL, 0, ++ bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_inode_pack_test(void) ++{ ++ struct bch_inode_unpacked *u, test_inodes[] = { ++ { ++ .bi_atime = U64_MAX, ++ .bi_ctime = U64_MAX, ++ .bi_mtime = U64_MAX, ++ .bi_otime = U64_MAX, ++ .bi_size = U64_MAX, ++ .bi_sectors = U64_MAX, ++ .bi_uid = U32_MAX, ++ .bi_gid = U32_MAX, ++ .bi_nlink = U32_MAX, ++ .bi_generation = U32_MAX, ++ .bi_dev = U32_MAX, ++ }, ++ }; ++ ++ for (u = test_inodes; ++ u < test_inodes + ARRAY_SIZE(test_inodes); ++ u++) { ++ struct bkey_inode_buf p; ++ ++ bch2_inode_pack(&p, u); ++ } ++} ++#endif +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +new file mode 100644 +index 000000000000..bb759a46dc41 +--- /dev/null ++++ b/fs/bcachefs/inode.h +@@ -0,0 +1,177 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_INODE_H ++#define _BCACHEFS_INODE_H ++ ++#include "opts.h" ++ ++extern const char * const bch2_inode_opts[]; ++ ++const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++} ++ ++const char *bch2_inode_generation_invalid(const struct bch_fs *, ++ struct bkey_s_c); ++void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_generation_invalid, \ ++ .val_to_text = bch2_inode_generation_to_text, \ ++} ++ ++struct bch_inode_unpacked { ++ u64 bi_inum; ++ __le64 bi_hash_seed; ++ u32 bi_flags; ++ u16 bi_mode; ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_FIELDS() ++#undef x ++}; ++ ++struct bkey_inode_buf { ++ struct bkey_i_inode inode; ++ ++#define x(_name, _bits) + 8 + _bits / 8 ++ u8 _pad[0 + BCH_INODE_FIELDS()]; ++#undef x ++} __attribute__((packed, aligned(8))); ++ ++void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); ++int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); ++ ++struct btree_iter *bch2_inode_peek(struct btree_trans *, ++ struct bch_inode_unpacked *, u64, unsigned); ++int bch2_inode_write(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *); ++ ++void bch2_inode_init_early(struct bch_fs *, ++ struct bch_inode_unpacked *); ++void bch2_inode_init_late(struct bch_inode_unpacked *, u64, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++ ++int bch2_inode_create(struct btree_trans *, ++ struct bch_inode_unpacked *, ++ u64, u64, u64 *); ++ ++int bch2_inode_rm(struct bch_fs *, u64); ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, ++ struct bch_inode_unpacked *); ++int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); ++ ++static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts ret = { 0 }; ++ ++#define x(_name, _bits) \ ++ if (inode->bi_##_name) \ ++ opt_set(ret, _name, inode->bi_##_name - 1); 
++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Inode_opt_##_name: \ ++ inode->bi_##_name = v; \ ++ break; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Inode_opt_##_name: \ ++ return inode->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct bch_io_opts ++io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); ++ ++ bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); ++ return opts; ++} ++ ++static inline u8 mode_to_type(umode_t mode) ++{ ++ return (mode >> 12) & 15; ++} ++ ++/* i_nlink: */ ++ ++static inline unsigned nlink_bias(umode_t mode) ++{ ++ return S_ISDIR(mode) ? 2 : 1; ++} ++ ++static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) ++{ ++ if (bi->bi_flags & BCH_INODE_UNLINKED) ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ else ++ bi->bi_nlink++; ++} ++ ++static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) ++{ ++ BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); ++ if (bi->bi_nlink) ++ bi->bi_nlink--; ++ else ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++} ++ ++static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) ++{ ++ return bi->bi_flags & BCH_INODE_UNLINKED ++ ? 0 ++ : bi->bi_nlink + nlink_bias(bi->bi_mode); ++} ++ ++static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, ++ unsigned nlink) ++{ ++ if (nlink) { ++ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ } else { ++ bi->bi_nlink = 0; ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_inode_pack_test(void); ++#else ++static inline void bch2_inode_pack_test(void) {} ++#endif ++ ++#endif /* _BCACHEFS_INODE_H */ +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +new file mode 100644 +index 000000000000..836004b128f0 +--- /dev/null ++++ b/fs/bcachefs/io.c +@@ -0,0 +1,2210 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Some low level IO code, and hacks for various block layer limitations ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bset.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "compress.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "rebalance.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++#include ++ ++static bool bch2_target_congested(struct bch_fs *c, u16 target) ++{ ++ const struct bch_devs_mask *devs; ++ unsigned d, nr = 0, total = 0; ++ u64 now = local_clock(), last; ++ s64 congested; ++ struct bch_dev *ca; ++ ++ if (!target) ++ return false; ++ ++ rcu_read_lock(); ++ devs = bch2_target_to_mask(c, target); ++ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ++ ca = rcu_dereference(c->devs[d]); ++ if (!ca) ++ continue; ++ ++ congested = atomic_read(&ca->congested); ++ last = READ_ONCE(ca->congested_last); ++ if (time_after64(now, last)) ++ congested -= (now - last) >> 12; ++ ++ total += max(congested, 0LL); ++ nr++; ++ } ++ rcu_read_unlock(); ++ ++ return bch2_rand_range(nr * CONGESTED_MAX) < total; ++} ++ ++static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, ++ u64 now, int rw) ++{ ++ u64 latency_capable = ++ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; ++ /* ideally we'd be taking into account the device's variance here: */ ++ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); ++ s64 latency_over = io_latency - latency_threshold; ++ ++ if (latency_threshold && latency_over > 0) { ++ /* ++ * bump up congested by approximately latency_over * 4 / ++ * latency_threshold - we don't need much accuracy here so don't ++ * bother with the divide: ++ */ ++ if (atomic_read(&ca->congested) < CONGESTED_MAX) ++ atomic_add(latency_over >> ++ max_t(int, ilog2(latency_threshold) - 2, 0), ++ &ca->congested); ++ ++ ca->congested_last = now; ++ } else if (atomic_read(&ca->congested) > 0) { ++ atomic_dec(&ca->congested); ++ } ++} ++ ++void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) ++{ ++ atomic64_t *latency = &ca->cur_latency[rw]; ++ u64 now = local_clock(); ++ u64 io_latency = time_after64(now, submit_time) ++ ? 
now - submit_time ++ : 0; ++ u64 old, new, v = atomic64_read(latency); ++ ++ do { ++ old = v; ++ ++ /* ++ * If the io latency was reasonably close to the current ++ * latency, skip doing the update and atomic operation - most of ++ * the time: ++ */ ++ if (abs((int) (old - io_latency)) < (old >> 1) && ++ now & ~(~0 << 5)) ++ break; ++ ++ new = ewma_add(old, io_latency, 5); ++ } while ((v = atomic64_cmpxchg(latency, old, new)) != old); ++ ++ bch2_congested_acct(ca, io_latency, now, rw); ++ ++ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); ++} ++ ++/* Allocate, free from mempool: */ ++ ++void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) ++ if (bv->bv_page != ZERO_PAGE(0)) ++ mempool_free(bv->bv_page, &c->bio_bounce_pages); ++ bio->bi_vcnt = 0; ++} ++ ++static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) ++{ ++ struct page *page; ++ ++ if (likely(!*using_mempool)) { ++ page = alloc_page(GFP_NOIO); ++ if (unlikely(!page)) { ++ mutex_lock(&c->bio_bounce_pages_lock); ++ *using_mempool = true; ++ goto pool_alloc; ++ ++ } ++ } else { ++pool_alloc: ++ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); ++ } ++ ++ return page; ++} ++ ++void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, ++ size_t size) ++{ ++ bool using_mempool = false; ++ ++ while (size) { ++ struct page *page = __bio_alloc_page_pool(c, &using_mempool); ++ unsigned len = min(PAGE_SIZE, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, 0)); ++ size -= len; ++ } ++ ++ if (using_mempool) ++ mutex_unlock(&c->bio_bounce_pages_lock); ++} ++ ++/* Extent update path: */ ++ ++static int sum_sector_overwrites(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct bkey_i *new, ++ bool may_allocate, ++ bool *maybe_extending, ++ s64 *delta) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c old; ++ int ret = 0; ++ ++ *maybe_extending = true; ++ *delta = 0; ++ ++ iter = bch2_trans_copy_iter(trans, extent_iter); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { ++ if (!may_allocate && ++ bch2_bkey_nr_ptrs_allocated(old) < ++ bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) { ++ ret = -ENOSPC; ++ break; ++ } ++ ++ *delta += (min(new->k.p.offset, ++ old.k->p.offset) - ++ max(bkey_start_offset(&new->k), ++ bkey_start_offset(old.k))) * ++ (bkey_extent_is_allocation(&new->k) - ++ bkey_extent_is_allocation(old.k)); ++ ++ if (bkey_cmp(old.k->p, new->k.p) >= 0) { ++ /* ++ * Check if there's already data above where we're ++ * going to be writing to - this means we're definitely ++ * not extending the file: ++ * ++ * Note that it's not sufficient to check if there's ++ * data up to the sector offset we're going to be ++ * writing to, because i_size could be up to one block ++ * less: ++ */ ++ if (!bkey_cmp(old.k->p, new->k.p)) ++ old = bch2_btree_iter_next(iter); ++ ++ if (old.k && !bkey_err(old) && ++ old.k->p.inode == extent_iter->pos.inode && ++ bkey_extent_is_data(old.k)) ++ *maybe_extending = false; ++ ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int bch2_extent_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ u64 new_i_size, ++ s64 *i_sectors_delta) ++{ ++ /* this must live until after bch2_trans_commit(): */ ++ struct bkey_inode_buf inode_p; ++ bool extending = 
false; ++ s64 delta = 0; ++ int ret; ++ ++ ret = bch2_extent_trim_atomic(k, iter); ++ if (ret) ++ return ret; ++ ++ ret = sum_sector_overwrites(trans, iter, k, ++ disk_res && disk_res->sectors != 0, ++ &extending, &delta); ++ if (ret) ++ return ret; ++ ++ new_i_size = extending ++ ? min(k->k.p.offset << 9, new_i_size) ++ : 0; ++ ++ if (delta || new_i_size) { ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ ++ inode_iter = bch2_inode_peek(trans, &inode_u, ++ k->k.p.inode, BTREE_ITER_INTENT); ++ if (IS_ERR(inode_iter)) ++ return PTR_ERR(inode_iter); ++ ++ /* ++ * XXX: ++ * writeback can race a bit with truncate, because truncate ++ * first updates the inode then truncates the pagecache. This is ++ * ugly, but lets us preserve the invariant that the in memory ++ * i_size is always >= the on disk i_size. ++ * ++ BUG_ON(new_i_size > inode_u.bi_size && ++ (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); ++ */ ++ BUG_ON(new_i_size > inode_u.bi_size && !extending); ++ ++ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ new_i_size > inode_u.bi_size) ++ inode_u.bi_size = new_i_size; ++ else ++ new_i_size = 0; ++ ++ inode_u.bi_sectors += delta; ++ ++ if (delta || new_i_size) { ++ bch2_inode_pack(&inode_p, &inode_u); ++ bch2_trans_update(trans, inode_iter, ++ &inode_p.inode.k_i); ++ } ++ ++ bch2_trans_iter_put(trans, inode_iter); ++ } ++ ++ bch2_trans_update(trans, iter, k); ++ ++ ret = bch2_trans_commit(trans, disk_res, journal_seq, ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_USE_RESERVE); ++ if (!ret && i_sectors_delta) ++ *i_sectors_delta += delta; ++ ++ return ret; ++} ++ ++int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos end, u64 *journal_seq, ++ s64 *i_sectors_delta) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); ++ struct bkey_s_c k; ++ int ret = 0, ret2 = 0; ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ bkey_cmp(iter->pos, end) < 0) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i delete; ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto btree_err; ++ ++ bkey_init(&delete.k); ++ delete.k.p = iter->pos; ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete.k); ++ ++ bch2_trans_begin_updates(trans); ++ ++ ret = bch2_extent_update(trans, iter, &delete, ++ &disk_res, journal_seq, ++ 0, i_sectors_delta); ++ bch2_disk_reservation_put(c, &disk_res); ++btree_err: ++ if (ret == -EINTR) { ++ ret2 = ret; ++ ret = 0; ++ } ++ if (ret) ++ break; ++ } ++ ++ if (bkey_cmp(iter->pos, end) > 0) { ++ bch2_btree_iter_set_pos(iter, end); ++ ret = bch2_btree_iter_traverse(iter); ++ } ++ ++ return ret ?: ret2; ++} ++ ++int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, ++ u64 *journal_seq, s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inum, start), ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_fpunch_at(&trans, iter, POS(inum, end), ++ journal_seq, i_sectors_delta); ++ bch2_trans_exit(&trans); ++ ++ if (ret == -EINTR) ++ ret = 0; ++ ++ return ret; ++} ++ ++int bch2_write_index_default(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct keylist *keys = &op->insert_keys; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct btree_trans trans; ++ struct 
btree_iter *iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ do { ++ BKEY_PADDED(k) tmp; ++ ++ bkey_copy(&tmp.k, bch2_keylist_front(keys)); ++ ++ bch2_trans_begin_updates(&trans); ++ ++ ret = bch2_extent_update(&trans, iter, &tmp.k, ++ &op->res, op_journal_seq(op), ++ op->new_i_size, &op->i_sectors_delta); ++ if (ret == -EINTR) ++ continue; ++ if (ret) ++ break; ++ ++ if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) ++ bch2_cut_front(iter->pos, bch2_keylist_front(keys)); ++ else ++ bch2_keylist_pop_front(keys); ++ } while (!bch2_keylist_empty(keys)); ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++/* Writes */ ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, ++ enum bch_data_type type, ++ const struct bkey_i *k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); ++ const struct bch_extent_ptr *ptr; ++ struct bch_write_bio *n; ++ struct bch_dev *ca; ++ ++ BUG_ON(c->opts.nochanges); ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || ++ !c->devs[ptr->dev]); ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (to_entry(ptr + 1) < ptrs.end) { ++ n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, ++ &ca->replica_set)); ++ ++ n->bio.bi_end_io = wbio->bio.bi_end_io; ++ n->bio.bi_private = wbio->bio.bi_private; ++ n->parent = wbio; ++ n->split = true; ++ n->bounce = false; ++ n->put_bio = true; ++ n->bio.bi_opf = wbio->bio.bi_opf; ++ bio_inc_remaining(&wbio->bio); ++ } else { ++ n = wbio; ++ n->split = false; ++ } ++ ++ n->c = c; ++ n->dev = ptr->dev; ++ n->have_ioref = bch2_dev_get_ioref(ca, WRITE); ++ n->submit_time = local_clock(); ++ n->bio.bi_iter.bi_sector = ptr->offset; ++ ++ if (!journal_flushes_device(ca)) ++ n->bio.bi_opf |= REQ_FUA; ++ ++ if (likely(n->have_ioref)) { ++ this_cpu_add(ca->io_done->sectors[WRITE][type], ++ bio_sectors(&n->bio)); ++ ++ bio_set_dev(&n->bio, ca->disk_sb.bdev); ++ submit_bio(&n->bio); ++ } else { ++ n->bio.bi_status = BLK_STS_REMOVED; ++ bio_endio(&n->bio); ++ } ++ } ++} ++ ++static void __bch2_write(struct closure *); ++ ++static void bch2_write_done(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) ++ op->error = bch2_journal_error(&c->journal); ++ ++ if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) ++ bch2_disk_reservation_put(c, &op->res); ++ percpu_ref_put(&c->writes); ++ bch2_keylist_free(&op->insert_keys, op->inline_keys); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); ++ ++ if (op->end_io) ++ op->end_io(op); ++ if (cl->parent) ++ closure_return(cl); ++ else ++ closure_debug_destroy(cl); ++} ++ ++/** ++ * bch_write_index - after a write, update index to point to new data ++ */ ++static void __bch2_write_index(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct keylist *keys = &op->insert_keys; ++ struct bch_extent_ptr *ptr; ++ struct bkey_i *src, *dst = keys->keys, *n, *k; ++ unsigned dev; ++ int ret; ++ ++ for (src = keys->keys; src != keys->top; src = n) { ++ n = bkey_next(src); ++ bkey_copy(dst, src); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr, ++ test_bit(ptr->dev, op->failed.d)); ++ ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ dst = bkey_next(dst); ++ } ++ ++ keys->top = dst; 
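/*
 * The loop above compacts the keylist in place, dropping pointers to devices
 * recorded in op->failed, and only errors an extent that has lost every one
 * of its replicas.  A minimal standalone sketch of that idea in plain C —
 * the names here (drop_failed_replicas, replica_dev, failed_mask) are
 * invented for illustration, not bcachefs APIs:
 */

#include <errno.h>
#include <stddef.h>

/* Keep replicas whose device is not in @failed_mask; -EIO if none survive. */
static int drop_failed_replicas(unsigned *replica_dev, size_t *nr,
				unsigned long failed_mask)
{
	size_t src, dst = 0;

	for (src = 0; src < *nr; src++)
		if (!(failed_mask & (1UL << replica_dev[src])))
			replica_dev[dst++] = replica_dev[src];

	*nr = dst;
	return dst ? 0 : -EIO;
}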
++ ++ /* ++ * probably not the ideal place to hook this in, but I don't ++ * particularly want to plumb io_opts all the way through the btree ++ * update stack right now ++ */ ++ for_each_keylist_key(keys, k) ++ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); ++ ++ if (!bch2_keylist_empty(keys)) { ++ u64 sectors_start = keylist_sectors(keys); ++ int ret = op->index_update_fn(op); ++ ++ BUG_ON(ret == -EINTR); ++ BUG_ON(keylist_sectors(keys) && !ret); ++ ++ op->written += sectors_start - keylist_sectors(keys); ++ ++ if (ret) { ++ __bcache_io_error(c, "btree IO error %i", ret); ++ op->error = ret; ++ } ++ } ++out: ++ /* If some a bucket wasn't written, we can't erasure code it: */ ++ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) ++ bch2_open_bucket_write_error(c, &op->open_buckets, dev); ++ ++ bch2_open_buckets_put(c, &op->open_buckets); ++ return; ++err: ++ keys->top = keys->keys; ++ op->error = ret; ++ goto out; ++} ++ ++static void bch2_write_index(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ __bch2_write_index(op); ++ ++ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { ++ bch2_journal_flush_seq_async(&c->journal, ++ *op_journal_seq(op), ++ cl); ++ continue_at(cl, bch2_write_done, index_update_wq(op)); ++ } else { ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ } ++} ++ ++static void bch2_write_endio(struct bio *bio) ++{ ++ struct closure *cl = bio->bi_private; ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; ++ struct bch_fs *c = wbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) ++ set_bit(wbio->dev, op->failed.d); ++ ++ if (wbio->have_ioref) { ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (wbio->bounce) ++ bch2_bio_free_pages_pool(c, bio); ++ ++ if (wbio->put_bio) ++ bio_put(bio); ++ ++ if (parent) ++ bio_endio(&parent->bio); ++ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) ++ closure_put(cl); ++ else ++ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); ++} ++ ++static void init_append_extent(struct bch_write_op *op, ++ struct write_point *wp, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_i_extent *e; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ BUG_ON(crc.compressed_size > wp->sectors_free); ++ wp->sectors_free -= crc.compressed_size; ++ op->pos.offset += crc.uncompressed_size; ++ ++ e = bkey_extent_init(op->insert_keys.top); ++ e->k.p = op->pos; ++ e->k.size = crc.uncompressed_size; ++ e->k.version = version; ++ ++ if (crc.csum_type || ++ crc.compression_type || ++ crc.nonce) ++ bch2_extent_crc_append(&e->k_i, crc); ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ union bch_extent_entry *end = ++ bkey_val_end(bkey_i_to_s(&e->k_i)); ++ ++ end->ptr = ob->ptr; ++ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ end->ptr.cached = !ca->mi.durability || ++ (op->flags & BCH_WRITE_CACHED) != 0; ++ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; ++ ++ e->k.u64s++; ++ ++ BUG_ON(crc.compressed_size > ob->sectors_free); ++ ob->sectors_free -= crc.compressed_size; ++ } ++ ++ bch2_keylist_push(&op->insert_keys); ++} ++ ++static struct bio 
*bch2_write_bio_alloc(struct bch_fs *c, ++ struct write_point *wp, ++ struct bio *src, ++ bool *page_alloc_failed, ++ void *buf) ++{ ++ struct bch_write_bio *wbio; ++ struct bio *bio; ++ unsigned output_available = ++ min(wp->sectors_free << 9, src->bi_iter.bi_size); ++ unsigned pages = DIV_ROUND_UP(output_available + ++ (buf ++ ? ((unsigned long) buf & (PAGE_SIZE - 1)) ++ : 0), PAGE_SIZE); ++ ++ bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); ++ wbio = wbio_init(bio); ++ wbio->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ wbio->bio.bi_opf = src->bi_opf; ++ ++ if (buf) { ++ bch2_bio_map(bio, buf, output_available); ++ return bio; ++ } ++ ++ wbio->bounce = true; ++ ++ /* ++ * We can't use mempool for more than c->sb.encoded_extent_max ++ * worth of pages, but we'd like to allocate more if we can: ++ */ ++ bch2_bio_alloc_pages_pool(c, bio, ++ min_t(unsigned, output_available, ++ c->sb.encoded_extent_max << 9)); ++ ++ if (bio->bi_iter.bi_size < output_available) ++ *page_alloc_failed = ++ bch2_bio_alloc_pages(bio, ++ output_available - ++ bio->bi_iter.bi_size, ++ GFP_NOFS) != 0; ++ ++ return bio; ++} ++ ++static int bch2_write_rechecksum(struct bch_fs *c, ++ struct bch_write_op *op, ++ unsigned new_csum_type) ++{ ++ struct bio *bio = &op->wbio.bio; ++ struct bch_extent_crc_unpacked new_crc; ++ int ret; ++ ++ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ ++ ++ if (bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)) ++ new_csum_type = op->crc.csum_type; ++ ++ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, ++ NULL, &new_crc, ++ op->crc.offset, op->crc.live_size, ++ new_csum_type); ++ if (ret) ++ return ret; ++ ++ bio_advance(bio, op->crc.offset << 9); ++ bio->bi_iter.bi_size = op->crc.live_size << 9; ++ op->crc = new_crc; ++ return 0; ++} ++ ++static int bch2_write_decrypt(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct nonce nonce = extent_nonce(op->version, op->crc); ++ struct bch_csum csum; ++ ++ if (!bch2_csum_type_is_encryption(op->crc.csum_type)) ++ return 0; ++ ++ /* ++ * If we need to decrypt data in the write path, we'll no longer be able ++ * to verify the existing checksum (poly1305 mac, in this case) after ++ * it's decrypted - this is the last point we'll be able to reverify the ++ * checksum: ++ */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return -EIO; ++ ++ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ op->crc.csum_type = 0; ++ op->crc.csum = (struct bch_csum) { 0, 0 }; ++ return 0; ++} ++ ++static enum prep_encoded_ret { ++ PREP_ENCODED_OK, ++ PREP_ENCODED_ERR, ++ PREP_ENCODED_CHECKSUM_ERR, ++ PREP_ENCODED_DO_WRITE, ++} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *bio = &op->wbio.bio; ++ ++ if (!(op->flags & BCH_WRITE_DATA_ENCODED)) ++ return PREP_ENCODED_OK; ++ ++ BUG_ON(bio_sectors(bio) != op->crc.compressed_size); ++ ++ /* Can we just write the entire extent as is? 
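(Yes — as checked just below — when the existing crc covers exactly the live data, the compressed payload still fits in the remaining write point space, and the compression type already matches what this write wants; for uncompressed data with a different checksum type it re-checksums first.)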
*/ ++ if (op->crc.uncompressed_size == op->crc.live_size && ++ op->crc.compressed_size <= wp->sectors_free && ++ op->crc.compression_type == op->compression_type) { ++ if (!op->crc.compression_type && ++ op->csum_type != op->crc.csum_type && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_DO_WRITE; ++ } ++ ++ /* ++ * If the data is compressed and we couldn't write the entire extent as ++ * is, we have to decompress it: ++ */ ++ if (op->crc.compression_type) { ++ struct bch_csum csum; ++ ++ if (bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* Last point we can still verify checksum: */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, ++ extent_nonce(op->version, op->crc), ++ bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) ++ return PREP_ENCODED_ERR; ++ } ++ ++ /* ++ * No longer have compressed data after this point - data might be ++ * encrypted: ++ */ ++ ++ /* ++ * If the data is checksummed and we're only writing a subset, ++ * rechecksum and adjust bio to point to currently live data: ++ */ ++ if ((op->crc.live_size != op->crc.uncompressed_size || ++ op->crc.csum_type != op->csum_type) && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* ++ * If we want to compress the data, it has to be decrypted: ++ */ ++ if ((op->compression_type || ++ bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(op->csum_type)) && ++ bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_OK; ++} ++ ++static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ++ struct bio **_dst) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *src = &op->wbio.bio, *dst = src; ++ struct bvec_iter saved_iter; ++ void *ec_buf; ++ struct bpos ec_pos = op->pos; ++ unsigned total_output = 0, total_input = 0; ++ bool bounce = false; ++ bool page_alloc_failed = false; ++ int ret, more = 0; ++ ++ BUG_ON(!bio_sectors(src)); ++ ++ ec_buf = bch2_writepoint_ec_buf(c, wp); ++ ++ switch (bch2_write_prep_encoded_data(op, wp)) { ++ case PREP_ENCODED_OK: ++ break; ++ case PREP_ENCODED_ERR: ++ ret = -EIO; ++ goto err; ++ case PREP_ENCODED_CHECKSUM_ERR: ++ goto csum_err; ++ case PREP_ENCODED_DO_WRITE: ++ /* XXX look for bug here */ ++ if (ec_buf) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bio_copy_data(dst, src); ++ bounce = true; ++ } ++ init_append_extent(op, wp, op->version, op->crc); ++ goto do_write; ++ } ++ ++ if (ec_buf || ++ op->compression_type || ++ (op->csum_type && ++ !(op->flags & BCH_WRITE_PAGES_STABLE)) || ++ (bch2_csum_type_is_encryption(op->csum_type) && ++ !(op->flags & BCH_WRITE_PAGES_OWNED))) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bounce = true; ++ } ++ ++ saved_iter = dst->bi_iter; ++ ++ do { ++ struct bch_extent_crc_unpacked crc = ++ (struct bch_extent_crc_unpacked) { 0 }; ++ struct bversion version = op->version; ++ size_t dst_len, src_len; ++ ++ if (page_alloc_failed && ++ bio_sectors(dst) < wp->sectors_free && ++ bio_sectors(dst) < c->sb.encoded_extent_max) ++ break; ++ ++ BUG_ON(op->compression_type && ++ (op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_csum_type_is_encryption(op->crc.csum_type)); ++ BUG_ON(op->compression_type && !bounce); ++ ++ crc.compression_type = op->compression_type ++ ? 
bch2_bio_compress(c, dst, &dst_len, src, &src_len, ++ op->compression_type) ++ : 0; ++ if (!crc.compression_type) { ++ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); ++ ++ if (op->csum_type) ++ dst_len = min_t(unsigned, dst_len, ++ c->sb.encoded_extent_max << 9); ++ ++ if (bounce) { ++ swap(dst->bi_iter.bi_size, dst_len); ++ bio_copy_data(dst, src); ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ src_len = dst_len; ++ } ++ ++ BUG_ON(!src_len || !dst_len); ++ ++ if (bch2_csum_type_is_encryption(op->csum_type)) { ++ if (bversion_zero(version)) { ++ version.lo = atomic64_inc_return(&c->key_version) + 1; ++ } else { ++ crc.nonce = op->nonce; ++ op->nonce += src_len >> 9; ++ } ++ } ++ ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ !crc.compression_type && ++ bch2_csum_type_is_encryption(op->crc.csum_type) == ++ bch2_csum_type_is_encryption(op->csum_type)) { ++ /* ++ * Note: when we're using rechecksum(), we need to be ++ * checksumming @src because it has all the data our ++ * existing checksum covers - if we bounced (because we ++ * were trying to compress), @dst will only have the ++ * part of the data the new checksum will cover. ++ * ++ * But normally we want to be checksumming post bounce, ++ * because part of the reason for bouncing is so the ++ * data can't be modified (by userspace) while it's in ++ * flight. ++ */ ++ if (bch2_rechecksum_bio(c, src, version, op->crc, ++ &crc, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->csum_type)) ++ goto csum_err; ++ } else { ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_rechecksum_bio(c, src, version, op->crc, ++ NULL, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->crc.csum_type)) ++ goto csum_err; ++ ++ crc.compressed_size = dst_len >> 9; ++ crc.uncompressed_size = src_len >> 9; ++ crc.live_size = src_len >> 9; ++ ++ swap(dst->bi_iter.bi_size, dst_len); ++ bch2_encrypt_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ crc.csum = bch2_checksum_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ crc.csum_type = op->csum_type; ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ init_append_extent(op, wp, version, crc); ++ ++ if (dst != src) ++ bio_advance(dst, dst_len); ++ bio_advance(src, src_len); ++ total_output += dst_len; ++ total_input += src_len; ++ } while (dst->bi_iter.bi_size && ++ src->bi_iter.bi_size && ++ wp->sectors_free && ++ !bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)); ++ ++ more = src->bi_iter.bi_size != 0; ++ ++ dst->bi_iter = saved_iter; ++ ++ if (dst == src && more) { ++ BUG_ON(total_output != total_input); ++ ++ dst = bio_split(src, total_input >> 9, ++ GFP_NOIO, &c->bio_write); ++ wbio_init(dst)->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ dst->bi_opf = src->bi_opf; ++ } ++ ++ dst->bi_iter.bi_size = total_output; ++do_write: ++ /* might have done a realloc... 
*/ ++ bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); ++ ++ *_dst = dst; ++ return more; ++csum_err: ++ bch_err(c, "error verifying existing checksum while " ++ "rewriting existing data (memory corruption?)"); ++ ret = -EIO; ++err: ++ if (to_wbio(dst)->bounce) ++ bch2_bio_free_pages_pool(c, dst); ++ if (to_wbio(dst)->put_bio) ++ bio_put(dst); ++ ++ return ret; ++} ++ ++static void __bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ struct write_point *wp; ++ struct bio *bio; ++ bool skip_put = true; ++ int ret; ++again: ++ memset(&op->failed, 0, sizeof(op->failed)); ++ ++ do { ++ struct bkey_i *key_to_write; ++ unsigned key_to_write_offset = op->insert_keys.top_p - ++ op->insert_keys.keys_p; ++ ++ /* +1 for possible cache device: */ ++ if (op->open_buckets.nr + op->nr_replicas + 1 > ++ ARRAY_SIZE(op->open_buckets.v)) ++ goto flush_io; ++ ++ if (bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)) ++ goto flush_io; ++ ++ wp = bch2_alloc_sectors_start(c, ++ op->target, ++ op->opts.erasure_code, ++ op->write_point, ++ &op->devs_have, ++ op->nr_replicas, ++ op->nr_replicas_required, ++ op->alloc_reserve, ++ op->flags, ++ (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); ++ EBUG_ON(!wp); ++ ++ if (unlikely(IS_ERR(wp))) { ++ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { ++ ret = PTR_ERR(wp); ++ goto err; ++ } ++ ++ goto flush_io; ++ } ++ ++ bch2_open_bucket_get(c, wp, &op->open_buckets); ++ ret = bch2_write_extent(op, wp, &bio); ++ bch2_alloc_sectors_done(c, wp); ++ ++ if (ret < 0) ++ goto err; ++ ++ if (ret) ++ skip_put = false; ++ ++ bio->bi_end_io = bch2_write_endio; ++ bio->bi_private = &op->cl; ++ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); ++ ++ if (!skip_put) ++ closure_get(bio->bi_private); ++ else ++ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; ++ ++ key_to_write = (void *) (op->insert_keys.keys_p + ++ key_to_write_offset); ++ ++ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, ++ key_to_write); ++ } while (ret); ++ ++ if (!skip_put) ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ return; ++err: ++ op->error = ret; ++ ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ return; ++flush_io: ++ closure_sync(cl); ++ ++ if (!bch2_keylist_empty(&op->insert_keys)) { ++ __bch2_write_index(op); ++ ++ if (op->error) { ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ return; ++ } ++ } ++ ++ goto again; ++} ++ ++/** ++ * bch_write - handle a write to a cache device or flash only volume ++ * ++ * This is the starting point for any data to end up in a cache device; it could ++ * be from a normal write, or a writeback write, or a write to a flash only ++ * volume - it's also used by the moving garbage collector to compact data in ++ * mostly empty buckets. ++ * ++ * It first writes the data to the cache, creating a list of keys to be inserted ++ * (if the data won't fit in a single open bucket, there will be multiple keys); ++ * after the data is written it calls bch_journal, and after the keys have been ++ * added to the next journal write they're inserted into the btree. ++ * ++ * If op->discard is true, instead of inserting the data it invalidates the ++ * region of the cache represented by op->bio and op->inode. 
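Roughly how a caller might drive this path — an illustrative sketch modeled on the
promote path elsewhere in this patch; it omits bio/bvec setup, disk reservation and
error handling, and my_write_endio plus the other caller-provided variables are
hypothetical:

	/* op, io_opts, page, inum, sector, parent_cl: caller-provided */
	bch2_write_op_init(op, c, io_opts);
	op->pos		= POS(inum, sector);
	op->nr_replicas	= c->opts.data_replicas;
	op->write_point	= writepoint_hashed((unsigned long) current);
	op->end_io	= my_write_endio;
	bio_add_page(&op->wbio.bio, page, PAGE_SIZE, 0);

	closure_call(&op->cl, bch2_write, c->wq, &parent_cl);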
++ */ ++void bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bio *bio = &op->wbio.bio; ++ struct bch_fs *c = op->c; ++ ++ BUG_ON(!op->nr_replicas); ++ BUG_ON(!op->write_point.v); ++ BUG_ON(!bkey_cmp(op->pos, POS_MAX)); ++ ++ if (bio_sectors(bio) & (c->opts.block_size - 1)) { ++ __bcache_io_error(c, "misaligned write"); ++ op->error = -EIO; ++ goto err; ++ } ++ ++ op->start_time = local_clock(); ++ ++ bch2_keylist_init(&op->insert_keys, op->inline_keys); ++ wbio_init(bio)->put_bio = false; ++ ++ if (c->opts.nochanges || ++ !percpu_ref_tryget(&c->writes)) { ++ __bcache_io_error(c, "read only"); ++ op->error = -EROFS; ++ goto err; ++ } ++ ++ bch2_increment_clock(c, bio_sectors(bio), WRITE); ++ ++ continue_at_nobarrier(cl, __bch2_write, NULL); ++ return; ++err: ++ if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) ++ bch2_disk_reservation_put(c, &op->res); ++ closure_return(cl); ++} ++ ++/* Cache promotion on read */ ++ ++struct promote_op { ++ struct closure cl; ++ struct rcu_head rcu; ++ u64 start_time; ++ ++ struct rhash_head hash; ++ struct bpos pos; ++ ++ struct migrate_write write; ++ struct bio_vec bi_inline_vecs[0]; /* must be last */ ++}; ++ ++static const struct rhashtable_params bch_promote_params = { ++ .head_offset = offsetof(struct promote_op, hash), ++ .key_offset = offsetof(struct promote_op, pos), ++ .key_len = sizeof(struct bpos), ++}; ++ ++static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, ++ struct bpos pos, ++ struct bch_io_opts opts, ++ unsigned flags) ++{ ++ if (!(flags & BCH_READ_MAY_PROMOTE)) ++ return false; ++ ++ if (!opts.promote_target) ++ return false; ++ ++ if (bch2_bkey_has_target(c, k, opts.promote_target)) ++ return false; ++ ++ if (bch2_target_congested(c, opts.promote_target)) { ++ /* XXX trace this */ ++ return false; ++ } ++ ++ if (rhashtable_lookup_fast(&c->promote_table, &pos, ++ bch_promote_params)) ++ return false; ++ ++ return true; ++} ++ ++static void promote_free(struct bch_fs *c, struct promote_op *op) ++{ ++ int ret; ++ ++ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params); ++ BUG_ON(ret); ++ percpu_ref_put(&c->writes); ++ kfree_rcu(op, rcu); ++} ++ ++static void promote_done(struct closure *cl) ++{ ++ struct promote_op *op = ++ container_of(cl, struct promote_op, cl); ++ struct bch_fs *c = op->write.op.c; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], ++ op->start_time); ++ ++ bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); ++ promote_free(c, op); ++} ++ ++static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) ++{ ++ struct bch_fs *c = rbio->c; ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->write.op.wbio.bio; ++ ++ trace_promote(&rbio->bio); ++ ++ /* we now own pages: */ ++ BUG_ON(!rbio->bounce); ++ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); ++ ++ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, ++ sizeof(struct bio_vec) * rbio->bio.bi_vcnt); ++ swap(bio->bi_vcnt, rbio->bio.bi_vcnt); ++ ++ bch2_migrate_read_done(&op->write, rbio); ++ ++ closure_init(cl, NULL); ++ closure_call(&op->write.op.cl, bch2_write, c->wq, cl); ++ closure_return_with_destructor(cl, promote_done); ++} ++ ++noinline ++static struct promote_op *__promote_alloc(struct bch_fs *c, ++ enum btree_id btree_id, ++ struct bpos pos, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned sectors, ++ struct bch_read_bio **rbio) ++{ ++ struct promote_op *op = NULL; ++ struct bio *bio; ++ unsigned 
pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ int ret; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return NULL; ++ ++ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); ++ if (!op) ++ goto err; ++ ++ op->start_time = local_clock(); ++ op->pos = pos; ++ ++ /* ++ * We don't use the mempool here because extents that aren't ++ * checksummed or compressed can be too big for the mempool: ++ */ ++ *rbio = kzalloc(sizeof(struct bch_read_bio) + ++ sizeof(struct bio_vec) * pages, ++ GFP_NOIO); ++ if (!*rbio) ++ goto err; ++ ++ rbio_init(&(*rbio)->bio, opts); ++ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); ++ ++ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, ++ GFP_NOIO)) ++ goto err; ++ ++ (*rbio)->bounce = true; ++ (*rbio)->split = true; ++ (*rbio)->kmalloc = true; ++ ++ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, ++ bch_promote_params)) ++ goto err; ++ ++ bio = &op->write.op.wbio.bio; ++ bio_init(bio, bio->bi_inline_vecs, pages); ++ ++ ret = bch2_migrate_write_init(c, &op->write, ++ writepoint_hashed((unsigned long) current), ++ opts, ++ DATA_PROMOTE, ++ (struct data_opts) { ++ .target = opts.promote_target ++ }, ++ btree_id, ++ bkey_s_c_null); ++ BUG_ON(ret); ++ ++ return op; ++err: ++ if (*rbio) ++ bio_free_pages(&(*rbio)->bio); ++ kfree(*rbio); ++ *rbio = NULL; ++ kfree(op); ++ percpu_ref_put(&c->writes); ++ return NULL; ++} ++ ++static inline struct promote_op *promote_alloc(struct bch_fs *c, ++ struct bvec_iter iter, ++ struct bkey_s_c k, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned flags, ++ struct bch_read_bio **rbio, ++ bool *bounce, ++ bool *read_full) ++{ ++ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); ++ /* data might have to be decompressed in the write path: */ ++ unsigned sectors = promote_full ++ ? max(pick->crc.compressed_size, pick->crc.live_size) ++ : bvec_iter_sectors(iter); ++ struct bpos pos = promote_full ++ ? bkey_start_pos(k.k) ++ : POS(k.k->p.inode, iter.bi_sector); ++ struct promote_op *promote; ++ ++ if (!should_promote(c, k, pos, opts, flags)) ++ return NULL; ++ ++ promote = __promote_alloc(c, ++ k.k->type == KEY_TYPE_reflink_v ++ ? BTREE_ID_REFLINK ++ : BTREE_ID_EXTENTS, ++ pos, pick, opts, sectors, rbio); ++ if (!promote) ++ return NULL; ++ ++ *bounce = true; ++ *read_full = promote_full; ++ return promote; ++} ++ ++/* Read */ ++ ++#define READ_RETRY_AVOID 1 ++#define READ_RETRY 2 ++#define READ_ERR 3 ++ ++enum rbio_context { ++ RBIO_CONTEXT_NULL, ++ RBIO_CONTEXT_HIGHPRI, ++ RBIO_CONTEXT_UNBOUND, ++}; ++ ++static inline struct bch_read_bio * ++bch2_rbio_parent(struct bch_read_bio *rbio) ++{ ++ return rbio->split ? 
rbio->parent : rbio; ++} ++ ++__always_inline ++static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, ++ enum rbio_context context, ++ struct workqueue_struct *wq) ++{ ++ if (context <= rbio->context) { ++ fn(&rbio->work); ++ } else { ++ rbio->work.func = fn; ++ rbio->context = context; ++ queue_work(wq, &rbio->work); ++ } ++} ++ ++static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) ++{ ++ BUG_ON(rbio->bounce && !rbio->split); ++ ++ if (rbio->promote) ++ promote_free(rbio->c, rbio->promote); ++ rbio->promote = NULL; ++ ++ if (rbio->bounce) ++ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); ++ ++ if (rbio->split) { ++ struct bch_read_bio *parent = rbio->parent; ++ ++ if (rbio->kmalloc) ++ kfree(rbio); ++ else ++ bio_put(&rbio->bio); ++ ++ rbio = parent; ++ } ++ ++ return rbio; ++} ++ ++/* ++ * Only called on a top level bch_read_bio to complete an entire read request, ++ * not a split: ++ */ ++static void bch2_rbio_done(struct bch_read_bio *rbio) ++{ ++ if (rbio->start_time) ++ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], ++ rbio->start_time); ++ bio_endio(&rbio->bio); ++} ++ ++static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, u64 inode, ++ struct bch_io_failures *failed, ++ unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ BKEY_PADDED(k) tmp; ++ struct bkey_s_c k; ++ int ret; ++ ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ rbio->pos, BTREE_ITER_SLOTS); ++retry: ++ rbio->bio.bi_status = 0; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k)) ++ goto err; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ bch2_trans_unlock(&trans); ++ ++ if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k), ++ rbio->pick.ptr, ++ rbio->pos.offset - ++ rbio->pick.crc.offset)) { ++ /* extent we wanted to read no longer exists: */ ++ rbio->hole = true; ++ goto out; ++ } ++ ++ ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); ++ if (ret == READ_RETRY) ++ goto retry; ++ if (ret) ++ goto err; ++out: ++ bch2_rbio_done(rbio); ++ bch2_trans_exit(&trans); ++ return; ++err: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ goto out; ++} ++ ++static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, u64 inode, ++ struct bch_io_failures *failed, unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode, bvec_iter.bi_sector), ++ BTREE_ITER_SLOTS, k, ret) { ++ BKEY_PADDED(k) tmp; ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &tmp.k); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bch2_trans_unlock(&trans); ++ ++ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; ++ swap(bvec_iter.bi_size, bytes); ++ ++ ret = __bch2_read_extent(c, rbio, bvec_iter, k, ++ offset_into_extent, failed, flags); ++ switch (ret) { ++ 
case READ_RETRY: ++ goto retry; ++ case READ_ERR: ++ goto err; ++ }; ++ ++ if (bytes == bvec_iter.bi_size) ++ goto out; ++ ++ swap(bvec_iter.bi_size, bytes); ++ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); ++ } ++ ++ if (ret == -EINTR) ++ goto retry; ++ /* ++ * If we get here, it better have been because there was an error ++ * reading a btree node ++ */ ++ BUG_ON(!ret); ++ __bcache_io_error(c, "btree IO error: %i", ret); ++err: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++out: ++ bch2_trans_exit(&trans); ++ bch2_rbio_done(rbio); ++} ++ ++static void bch2_rbio_retry(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bvec_iter iter = rbio->bvec_iter; ++ unsigned flags = rbio->flags; ++ u64 inode = rbio->pos.inode; ++ struct bch_io_failures failed = { .nr = 0 }; ++ ++ trace_read_retry(&rbio->bio); ++ ++ if (rbio->retry == READ_RETRY_AVOID) ++ bch2_mark_io_failure(&failed, &rbio->pick); ++ ++ rbio->bio.bi_status = 0; ++ ++ rbio = bch2_rbio_free(rbio); ++ ++ flags |= BCH_READ_IN_RETRY; ++ flags &= ~BCH_READ_MAY_PROMOTE; ++ ++ if (flags & BCH_READ_NODECODE) ++ bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); ++ else ++ bch2_read_retry(c, rbio, iter, inode, &failed, flags); ++} ++ ++static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, ++ blk_status_t error) ++{ ++ rbio->retry = retry; ++ ++ if (rbio->flags & BCH_READ_IN_RETRY) ++ return; ++ ++ if (retry == READ_ERR) { ++ rbio = bch2_rbio_free(rbio); ++ ++ rbio->bio.bi_status = error; ++ bch2_rbio_done(rbio); ++ } else { ++ bch2_rbio_punt(rbio, bch2_rbio_retry, ++ RBIO_CONTEXT_UNBOUND, system_unbound_wq); ++ } ++} ++ ++static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) ++{ ++ struct bch_fs *c = rbio->c; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ BKEY_PADDED(k) new; ++ struct bch_extent_crc_unpacked new_crc; ++ u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; ++ int ret; ++ ++ if (rbio->pick.crc.compression_type) ++ return; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ if (IS_ERR_OR_NULL(k.k)) ++ goto out; ++ ++ bkey_reassemble(&new.k, k); ++ k = bkey_i_to_s_c(&new.k); ++ ++ if (bversion_cmp(k.k->version, rbio->version) || ++ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) ++ goto out; ++ ++ /* Extent was merged? 
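(That is: if the key now starts before the region our crc covers, or ends past it, it has been merged with a neighbour since we read it, so the crc we hold can no longer be narrowed onto it — bail out, as the check below does.)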
*/ ++ if (bkey_start_offset(k.k) < data_offset || ++ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) ++ goto out; ++ ++ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, ++ rbio->pick.crc, NULL, &new_crc, ++ bkey_start_offset(k.k) - data_offset, k.k->size, ++ rbio->pick.crc.csum_type)) { ++ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); ++ goto out; ++ } ++ ++ if (!bch2_bkey_narrow_crcs(&new.k, new_crc)) ++ goto out; ++ ++ bch2_trans_update(&trans, iter, &new.k); ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_NOWAIT); ++ if (ret == -EINTR) ++ goto retry; ++out: ++ bch2_trans_exit(&trans); ++} ++ ++/* Inner part that may run in process context */ ++static void __bch2_read_endio(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct bio *src = &rbio->bio; ++ struct bio *dst = &bch2_rbio_parent(rbio)->bio; ++ struct bvec_iter dst_iter = rbio->bvec_iter; ++ struct bch_extent_crc_unpacked crc = rbio->pick.crc; ++ struct nonce nonce = extent_nonce(rbio->version, crc); ++ struct bch_csum csum; ++ ++ /* Reset iterator for checksumming and copying bounced data: */ ++ if (rbio->bounce) { ++ src->bi_iter.bi_size = crc.compressed_size << 9; ++ src->bi_iter.bi_idx = 0; ++ src->bi_iter.bi_bvec_done = 0; ++ } else { ++ src->bi_iter = rbio->bvec_iter; ++ } ++ ++ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); ++ if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) ++ goto csum_err; ++ ++ if (unlikely(rbio->narrow_crcs)) ++ bch2_rbio_narrow_crcs(rbio); ++ ++ if (rbio->flags & BCH_READ_NODECODE) ++ goto nodecode; ++ ++ /* Adjust crc to point to subset of data we want: */ ++ crc.offset += rbio->offset_into_extent; ++ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); ++ ++ if (crc.compression_type != BCH_COMPRESSION_NONE) { ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) ++ goto decompression_err; ++ } else { ++ /* don't need to decrypt the entire bio: */ ++ nonce = nonce_add(nonce, crc.offset << 9); ++ bio_advance(src, crc.offset << 9); ++ ++ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); ++ src->bi_iter.bi_size = dst_iter.bi_size; ++ ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ++ if (rbio->bounce) { ++ struct bvec_iter src_iter = src->bi_iter; ++ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); ++ } ++ } ++ ++ if (rbio->promote) { ++ /* ++ * Re encrypt data we decrypted, so it's consistent with ++ * rbio->crc: ++ */ ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ promote_start(rbio->promote, rbio); ++ rbio->promote = NULL; ++ } ++nodecode: ++ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { ++ rbio = bch2_rbio_free(rbio); ++ bch2_rbio_done(rbio); ++ } ++ return; ++csum_err: ++ /* ++ * Checksum error: if the bio wasn't bounced, we may have been ++ * reading into buffers owned by userspace (that userspace can ++ * scribble over) - retry the read, bouncing it this time: ++ */ ++ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { ++ rbio->flags |= BCH_READ_MUST_BOUNCE; ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); ++ return; ++ } ++ ++ bch2_dev_io_error(ca, ++ "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", ++ rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, ++ 
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, ++ csum.hi, csum.lo, crc.csum_type); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ return; ++decompression_err: ++ __bcache_io_error(c, "decompression error, inode %llu offset %llu", ++ rbio->pos.inode, ++ (u64) rbio->bvec_iter.bi_sector); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ return; ++} ++ ++static void bch2_read_endio(struct bio *bio) ++{ ++ struct bch_read_bio *rbio = ++ container_of(bio, struct bch_read_bio, bio); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct workqueue_struct *wq = NULL; ++ enum rbio_context context = RBIO_CONTEXT_NULL; ++ ++ if (rbio->have_ioref) { ++ bch2_latency_acct(ca, rbio->submit_time, READ); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (!rbio->split) ++ rbio->bio.bi_end_io = rbio->end_io; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); ++ return; ++ } ++ ++ if (rbio->pick.ptr.cached && ++ (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ ptr_stale(ca, &rbio->pick.ptr))) { ++ atomic_long_inc(&c->read_realloc_races); ++ ++ if (rbio->flags & BCH_READ_RETRY_IF_STALE) ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); ++ else ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); ++ return; ++ } ++ ++ if (rbio->narrow_crcs || ++ rbio->pick.crc.compression_type || ++ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) ++ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; ++ else if (rbio->pick.crc.csum_type) ++ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; ++ ++ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); ++} ++ ++int __bch2_read_indirect_extent(struct btree_trans *trans, ++ unsigned *offset_into_extent, ++ struct bkey_i *orig_k) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 reflink_offset; ++ int ret; ++ ++ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + ++ *offset_into_extent; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, ++ POS(0, reflink_offset), ++ BTREE_ITER_SLOTS); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_reflink_v) { ++ __bcache_io_error(trans->c, ++ "pointer to nonexistent indirect extent"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); ++ bkey_reassemble(orig_k, k); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, ++ struct bvec_iter iter, struct bkey_s_c k, ++ unsigned offset_into_extent, ++ struct bch_io_failures *failed, unsigned flags) ++{ ++ struct extent_ptr_decoded pick; ++ struct bch_read_bio *rbio = NULL; ++ struct bch_dev *ca; ++ struct promote_op *promote = NULL; ++ bool bounce = false, read_full = false, narrow_crcs = false; ++ struct bpos pos = bkey_start_pos(k.k); ++ int pick_ret; ++ ++ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); ++ ++ /* hole or reservation - just zero fill: */ ++ if (!pick_ret) ++ goto hole; ++ ++ if (pick_ret < 0) { ++ __bcache_io_error(c, "no device to read from"); ++ goto err; ++ } ++ ++ if (pick_ret > 0) ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ if (flags & BCH_READ_NODECODE) { ++ /* ++ * can happen if we retry, and the extent we were going to read ++ * has been merged in the meantime: ++ */ ++ if 
(pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) ++ goto hole; ++ ++ iter.bi_size = pick.crc.compressed_size << 9; ++ goto noclone; ++ } ++ ++ if (!(flags & BCH_READ_LAST_FRAGMENT) || ++ bio_flagged(&orig->bio, BIO_CHAIN)) ++ flags |= BCH_READ_MUST_CLONE; ++ ++ narrow_crcs = !(flags & BCH_READ_IN_RETRY) && ++ bch2_can_narrow_extent_crcs(k, pick.crc); ++ ++ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) ++ flags |= BCH_READ_MUST_BOUNCE; ++ ++ BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); ++ ++ if (pick.crc.compression_type != BCH_COMPRESSION_NONE || ++ (pick.crc.csum_type != BCH_CSUM_NONE && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ (bch2_csum_type_is_encryption(pick.crc.csum_type) && ++ (flags & BCH_READ_USER_MAPPED)) || ++ (flags & BCH_READ_MUST_BOUNCE)))) { ++ read_full = true; ++ bounce = true; ++ } ++ ++ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, ++ &rbio, &bounce, &read_full); ++ ++ if (!read_full) { ++ EBUG_ON(pick.crc.compression_type); ++ EBUG_ON(pick.crc.csum_type && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ bvec_iter_sectors(iter) != pick.crc.live_size || ++ pick.crc.offset || ++ offset_into_extent)); ++ ++ pos.offset += offset_into_extent; ++ pick.ptr.offset += pick.crc.offset + ++ offset_into_extent; ++ offset_into_extent = 0; ++ pick.crc.compressed_size = bvec_iter_sectors(iter); ++ pick.crc.uncompressed_size = bvec_iter_sectors(iter); ++ pick.crc.offset = 0; ++ pick.crc.live_size = bvec_iter_sectors(iter); ++ offset_into_extent = 0; ++ } ++ ++ if (rbio) { ++ /* ++ * promote already allocated bounce rbio: ++ * promote needs to allocate a bio big enough for uncompressing ++ * data in the write path, but we're not going to use it all ++ * here: ++ */ ++ BUG_ON(rbio->bio.bi_iter.bi_size < ++ pick.crc.compressed_size << 9); ++ rbio->bio.bi_iter.bi_size = ++ pick.crc.compressed_size << 9; ++ } else if (bounce) { ++ unsigned sectors = pick.crc.compressed_size; ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, ++ DIV_ROUND_UP(sectors, PAGE_SECTORS), ++ &c->bio_read_split), ++ orig->opts); ++ ++ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); ++ rbio->bounce = true; ++ rbio->split = true; ++ } else if (flags & BCH_READ_MUST_CLONE) { ++ /* ++ * Have to clone if there were any splits, due to error ++ * reporting issues (if a split errored, and retrying didn't ++ * work, when it reports the error to its parent (us) we don't ++ * know if the error was from our bio, and we should retry, or ++ * from the whole bio, in which case we don't want to retry and ++ * lose the error) ++ */ ++ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, ++ &c->bio_read_split), ++ orig->opts); ++ rbio->bio.bi_iter = iter; ++ rbio->split = true; ++ } else { ++noclone: ++ rbio = orig; ++ rbio->bio.bi_iter = iter; ++ BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); ++ } ++ ++ BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); ++ ++ rbio->c = c; ++ rbio->submit_time = local_clock(); ++ if (rbio->split) ++ rbio->parent = orig; ++ else ++ rbio->end_io = orig->bio.bi_end_io; ++ rbio->bvec_iter = iter; ++ rbio->offset_into_extent= offset_into_extent; ++ rbio->flags = flags; ++ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); ++ rbio->narrow_crcs = narrow_crcs; ++ rbio->hole = 0; ++ rbio->retry = 0; ++ rbio->context = 0; ++ rbio->devs_have = bch2_bkey_devs(k); ++ rbio->pick = pick; ++ rbio->pos = pos; ++ rbio->version = k.k->version; ++ rbio->promote = promote; ++ INIT_WORK(&rbio->work, 
NULL); ++ ++ rbio->bio.bi_opf = orig->bio.bi_opf; ++ rbio->bio.bi_iter.bi_sector = pick.ptr.offset; ++ rbio->bio.bi_end_io = bch2_read_endio; ++ ++ if (rbio->bounce) ++ trace_read_bounce(&rbio->bio); ++ ++ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); ++ ++ percpu_down_read(&c->mark_lock); ++ bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); ++ percpu_up_read(&c->mark_lock); ++ ++ if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { ++ bio_inc_remaining(&orig->bio); ++ trace_read_split(&orig->bio); ++ } ++ ++ if (!rbio->pick.idx) { ++ if (!rbio->have_ioref) { ++ __bcache_io_error(c, "no device to read from"); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], ++ bio_sectors(&rbio->bio)); ++ bio_set_dev(&rbio->bio, ca->disk_sb.bdev); ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ submit_bio(&rbio->bio); ++ else ++ submit_bio_wait(&rbio->bio); ++ } else { ++ /* Attempting reconstruct read: */ ++ if (bch2_ec_read_extent(c, rbio)) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ bio_endio(&rbio->bio); ++ } ++out: ++ if (likely(!(flags & BCH_READ_IN_RETRY))) { ++ return 0; ++ } else { ++ int ret; ++ ++ rbio->context = RBIO_CONTEXT_UNBOUND; ++ bch2_read_endio(&rbio->bio); ++ ++ ret = rbio->retry; ++ rbio = bch2_rbio_free(rbio); ++ ++ if (ret == READ_RETRY_AVOID) { ++ bch2_mark_io_failure(failed, &pick); ++ ret = READ_RETRY; ++ } ++ ++ return ret; ++ } ++ ++err: ++ if (flags & BCH_READ_IN_RETRY) ++ return READ_ERR; ++ ++ orig->bio.bi_status = BLK_STS_IOERR; ++ goto out_read_done; ++ ++hole: ++ /* ++ * won't normally happen in the BCH_READ_NODECODE ++ * (bch2_move_extent()) path, but if we retry and the extent we wanted ++ * to read no longer exists we have to signal that: ++ */ ++ if (flags & BCH_READ_NODECODE) ++ orig->hole = true; ++ ++ zero_fill_bio_iter(&orig->bio, iter); ++out_read_done: ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ bch2_rbio_done(orig); ++ return 0; ++} ++ ++void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ unsigned flags = BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE| ++ BCH_READ_USER_MAPPED; ++ int ret; ++ ++ BUG_ON(rbio->_state); ++ BUG_ON(flags & BCH_READ_NODECODE); ++ BUG_ON(flags & BCH_READ_IN_RETRY); ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inode, rbio->bio.bi_iter.bi_sector), ++ BTREE_ITER_SLOTS); ++ while (1) { ++ BKEY_PADDED(k) tmp; ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bch2_btree_iter_set_pos(iter, ++ POS(inode, rbio->bio.bi_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &tmp.k); ++ if (ret) ++ goto err; ++ ++ /* ++ * With indirect extents, the amount of data to read is the min ++ * of the original extent and the indirect extent: ++ */ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ /* ++ * Unlock the iterator while the btree node's lock is still in ++ 
* cache, before doing the IO: ++ */ ++ bch2_trans_unlock(&trans); ++ ++ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ ++ if (rbio->bio.bi_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ bch2_read_extent(c, rbio, k, offset_into_extent, flags); ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ bio_advance(&rbio->bio, bytes); ++ } ++out: ++ bch2_trans_exit(&trans); ++ return; ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); ++ bch2_rbio_done(rbio); ++ goto out; ++} ++ ++void bch2_fs_io_exit(struct bch_fs *c) ++{ ++ if (c->promote_table.tbl) ++ rhashtable_destroy(&c->promote_table); ++ mempool_exit(&c->bio_bounce_pages); ++ bioset_exit(&c->bio_write); ++ bioset_exit(&c->bio_read_split); ++ bioset_exit(&c->bio_read); ++} ++ ++int bch2_fs_io_init(struct bch_fs *c) ++{ ++ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), ++ BIOSET_NEED_BVECS) || ++ mempool_init_page_pool(&c->bio_bounce_pages, ++ max_t(unsigned, ++ c->opts.btree_node_size, ++ c->sb.encoded_extent_max) / ++ PAGE_SECTORS, 0) || ++ rhashtable_init(&c->promote_table, &bch_promote_params)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +new file mode 100644 +index 000000000000..91aaa58fce4e +--- /dev/null ++++ b/fs/bcachefs/io.h +@@ -0,0 +1,163 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IO_H ++#define _BCACHEFS_IO_H ++ ++#include "checksum.h" ++#include "io_types.h" ++ ++#define to_wbio(_bio) \ ++ container_of((_bio), struct bch_write_bio, bio) ++ ++#define to_rbio(_bio) \ ++ container_of((_bio), struct bch_read_bio, bio) ++ ++void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); ++void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); ++ ++void bch2_latency_acct(struct bch_dev *, u64, int); ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, ++ enum bch_data_type, const struct bkey_i *); ++ ++#define BLK_STS_REMOVED ((__force blk_status_t)128) ++ ++enum bch_write_flags { ++ BCH_WRITE_ALLOC_NOWAIT = (1 << 0), ++ BCH_WRITE_CACHED = (1 << 1), ++ BCH_WRITE_FLUSH = (1 << 2), ++ BCH_WRITE_DATA_ENCODED = (1 << 3), ++ BCH_WRITE_PAGES_STABLE = (1 << 4), ++ BCH_WRITE_PAGES_OWNED = (1 << 5), ++ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), ++ BCH_WRITE_NOPUT_RESERVATION = (1 << 7), ++ ++ /* Internal: */ ++ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), ++ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 9), ++}; ++ ++static inline u64 *op_journal_seq(struct bch_write_op *op) ++{ ++ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) ++ ? op->journal_seq_p : &op->journal_seq; ++} ++ ++static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) ++{ ++ op->journal_seq_p = journal_seq; ++ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; ++} ++ ++static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) ++{ ++ return op->alloc_reserve == RESERVE_MOVINGGC ++ ? 
op->c->copygc_wq ++ : op->c->wq; ++} ++ ++int bch2_extent_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct disk_reservation *, ++ u64 *, u64, s64 *); ++int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64 *, s64 *); ++int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); ++ ++int bch2_write_index_default(struct bch_write_op *); ++ ++static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, ++ struct bch_io_opts opts) ++{ ++ op->c = c; ++ op->end_io = NULL; ++ op->flags = 0; ++ op->written = 0; ++ op->error = 0; ++ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); ++ op->compression_type = bch2_compression_opt_to_type[opts.compression]; ++ op->nr_replicas = 0; ++ op->nr_replicas_required = c->opts.data_replicas_required; ++ op->alloc_reserve = RESERVE_NONE; ++ op->open_buckets.nr = 0; ++ op->devs_have.nr = 0; ++ op->target = 0; ++ op->opts = opts; ++ op->pos = POS_MAX; ++ op->version = ZERO_VERSION; ++ op->write_point = (struct write_point_specifier) { 0 }; ++ op->res = (struct disk_reservation) { 0 }; ++ op->journal_seq = 0; ++ op->new_i_size = U64_MAX; ++ op->i_sectors_delta = 0; ++ op->index_update_fn = bch2_write_index_default; ++} ++ ++void bch2_write(struct closure *); ++ ++static inline struct bch_write_bio *wbio_init(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ ++ memset(wbio, 0, offsetof(struct bch_write_bio, bio)); ++ return wbio; ++} ++ ++struct bch_devs_mask; ++struct cache_promote_op; ++struct extent_ptr_decoded; ++ ++int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, ++ struct bkey_i *); ++ ++static inline int bch2_read_indirect_extent(struct btree_trans *trans, ++ unsigned *offset_into_extent, ++ struct bkey_i *k) ++{ ++ return k->k.type == KEY_TYPE_reflink_p ++ ? 
__bch2_read_indirect_extent(trans, offset_into_extent, k) ++ : 0; ++} ++ ++enum bch_read_flags { ++ BCH_READ_RETRY_IF_STALE = 1 << 0, ++ BCH_READ_MAY_PROMOTE = 1 << 1, ++ BCH_READ_USER_MAPPED = 1 << 2, ++ BCH_READ_NODECODE = 1 << 3, ++ BCH_READ_LAST_FRAGMENT = 1 << 4, ++ ++ /* internal: */ ++ BCH_READ_MUST_BOUNCE = 1 << 5, ++ BCH_READ_MUST_CLONE = 1 << 6, ++ BCH_READ_IN_RETRY = 1 << 7, ++}; ++ ++int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, ++ struct bvec_iter, struct bkey_s_c, unsigned, ++ struct bch_io_failures *, unsigned); ++ ++static inline void bch2_read_extent(struct bch_fs *c, ++ struct bch_read_bio *rbio, ++ struct bkey_s_c k, ++ unsigned offset_into_extent, ++ unsigned flags) ++{ ++ __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, ++ offset_into_extent, NULL, flags); ++} ++ ++void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); ++ ++static inline struct bch_read_bio *rbio_init(struct bio *bio, ++ struct bch_io_opts opts) ++{ ++ struct bch_read_bio *rbio = to_rbio(bio); ++ ++ rbio->_state = 0; ++ rbio->promote = NULL; ++ rbio->opts = opts; ++ return rbio; ++} ++ ++void bch2_fs_io_exit(struct bch_fs *); ++int bch2_fs_io_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_IO_H */ +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +new file mode 100644 +index 000000000000..c37b7d7401e9 +--- /dev/null ++++ b/fs/bcachefs/io_types.h +@@ -0,0 +1,148 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IO_TYPES_H ++#define _BCACHEFS_IO_TYPES_H ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "extents_types.h" ++#include "keylist_types.h" ++#include "opts.h" ++#include "super_types.h" ++ ++#include ++#include ++ ++struct bch_read_bio { ++ struct bch_fs *c; ++ u64 start_time; ++ u64 submit_time; ++ ++ /* ++ * Reads will often have to be split, and if the extent being read from ++ * was checksummed or compressed we'll also have to allocate bounce ++ * buffers and copy the data back into the original bio. ++ * ++ * If we didn't have to split, we have to save and restore the original ++ * bi_end_io - @split below indicates which: ++ */ ++ union { ++ struct bch_read_bio *parent; ++ bio_end_io_t *end_io; ++ }; ++ ++ /* ++ * Saved copy of bio->bi_iter, from submission time - allows us to ++ * resubmit on IO error, and also to copy data back to the original bio ++ * when we're bouncing: ++ */ ++ struct bvec_iter bvec_iter; ++ ++ unsigned offset_into_extent; ++ ++ u16 flags; ++ union { ++ struct { ++ u16 bounce:1, ++ split:1, ++ kmalloc:1, ++ have_ioref:1, ++ narrow_crcs:1, ++ hole:1, ++ retry:2, ++ context:2; ++ }; ++ u16 _state; ++ }; ++ ++ struct bch_devs_list devs_have; ++ ++ struct extent_ptr_decoded pick; ++ /* start pos of data we read (may not be pos of data we want) */ ++ struct bpos pos; ++ struct bversion version; ++ ++ struct promote_op *promote; ++ ++ struct bch_io_opts opts; ++ ++ struct work_struct work; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_bio { ++ struct bch_fs *c; ++ struct bch_write_bio *parent; ++ ++ u64 submit_time; ++ ++ struct bch_devs_list failed; ++ u8 order; ++ u8 dev; ++ ++ unsigned split:1, ++ bounce:1, ++ put_bio:1, ++ have_ioref:1, ++ used_mempool:1; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_op { ++ struct closure cl; ++ struct bch_fs *c; ++ void (*end_io)(struct bch_write_op *); ++ u64 start_time; ++ ++ unsigned written; /* sectors */ ++ u16 flags; ++ s16 error; /* dio write path expects it to hold -ERESTARTSYS... 
*/ ++ ++ unsigned csum_type:4; ++ unsigned compression_type:4; ++ unsigned nr_replicas:4; ++ unsigned nr_replicas_required:4; ++ unsigned alloc_reserve:4; ++ ++ struct bch_devs_list devs_have; ++ u16 target; ++ u16 nonce; ++ struct bch_io_opts opts; ++ ++ struct bpos pos; ++ struct bversion version; ++ ++ /* For BCH_WRITE_DATA_ENCODED: */ ++ struct bch_extent_crc_unpacked crc; ++ ++ struct write_point_specifier write_point; ++ ++ struct disk_reservation res; ++ ++ struct open_buckets open_buckets; ++ ++ /* ++ * If caller wants to flush but hasn't passed us a journal_seq ptr, we ++ * still need to stash the journal_seq somewhere: ++ */ ++ union { ++ u64 *journal_seq_p; ++ u64 journal_seq; ++ }; ++ u64 new_i_size; ++ s64 i_sectors_delta; ++ ++ int (*index_update_fn)(struct bch_write_op *); ++ ++ struct bch_devs_mask failed; ++ ++ struct keylist insert_keys; ++ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; ++ ++ /* Must be last: */ ++ struct bch_write_bio wbio; ++}; ++ ++#endif /* _BCACHEFS_IO_TYPES_H */ +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +new file mode 100644 +index 000000000000..5c3e146e3942 +--- /dev/null ++++ b/fs/bcachefs/journal.c +@@ -0,0 +1,1253 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs journalling code, for btree insertions ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++#include ++ ++static bool __journal_entry_is_open(union journal_res_state state) ++{ ++ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; ++} ++ ++static bool journal_entry_is_open(struct journal *j) ++{ ++ return __journal_entry_is_open(j->reservations); ++} ++ ++static void journal_pin_new_entry(struct journal *j, int count) ++{ ++ struct journal_entry_pin_list *p; ++ ++ /* ++ * The fifo_push() needs to happen at the same time as j->seq is ++ * incremented for journal_last_seq() to be calculated correctly ++ */ ++ atomic64_inc(&j->seq); ++ p = fifo_push_ref(&j->pin); ++ ++ INIT_LIST_HEAD(&p->list); ++ INIT_LIST_HEAD(&p->flushed); ++ atomic_set(&p->count, count); ++ p->devs.nr = 0; ++} ++ ++static void bch2_journal_buf_init(struct journal *j) ++{ ++ struct journal_buf *buf = journal_cur_buf(j); ++ ++ memset(buf->has_inode, 0, sizeof(buf->has_inode)); ++ ++ memset(buf->data, 0, sizeof(*buf->data)); ++ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ++ buf->data->u64s = 0; ++} ++ ++void bch2_journal_halt(struct journal *j) ++{ ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ ++ do { ++ old.v = new.v = v; ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) ++ return; ++ ++ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ journal_wake(j); ++ closure_wake_up(&journal_cur_buf(j)->wait); ++} ++ ++/* journal entry close/open: */ ++ ++void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) ++{ ++ if (!need_write_just_set && ++ test_bit(JOURNAL_NEED_WRITE, &j->flags)) ++ bch2_time_stats_update(j->delay_time, ++ j->need_write_time); ++ ++ clear_bit(JOURNAL_NEED_WRITE, &j->flags); ++ ++ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); ++} ++ ++/* ++ * Returns true if journal entry is now closed: ++ */ ++static bool 
__journal_entry_close(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf = journal_cur_buf(j); ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ bool set_need_write = false; ++ unsigned sectors; ++ ++ lockdep_assert_held(&j->lock); ++ ++ do { ++ old.v = new.v = v; ++ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) ++ return true; ++ ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { ++ /* this entry will never be written: */ ++ closure_wake_up(&buf->wait); ++ return true; ++ } ++ ++ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { ++ set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ j->need_write_time = local_clock(); ++ set_need_write = true; ++ } ++ ++ if (new.prev_buf_unwritten) ++ return false; ++ ++ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; ++ new.idx++; ++ new.prev_buf_unwritten = 1; ++ ++ BUG_ON(journal_state_count(new, new.idx)); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); ++ ++ sectors = vstruct_blocks_plus(buf->data, c->block_bits, ++ buf->u64s_reserved) << c->block_bits; ++ BUG_ON(sectors > buf->sectors); ++ buf->sectors = sectors; ++ ++ bkey_extent_init(&buf->key); ++ ++ /* ++ * We have to set last_seq here, _before_ opening a new journal entry: ++ * ++ * A threads may replace an old pin with a new pin on their current ++ * journal reservation - the expectation being that the journal will ++ * contain either what the old pin protected or what the new pin ++ * protects. ++ * ++ * After the old pin is dropped journal_last_seq() won't include the old ++ * pin, so we can only write the updated last_seq on the entry that ++ * contains whatever the new pin protects. ++ * ++ * Restated, we can _not_ update last_seq for a given entry if there ++ * could be a newer entry open with reservations/pins that have been ++ * taken against it. 
++ * ++ * Hence, we want update/set last_seq on the current journal entry right ++ * before we open a new one: ++ */ ++ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); ++ ++ if (journal_entry_empty(buf->data)) ++ clear_bit(JOURNAL_NOT_EMPTY, &j->flags); ++ else ++ set_bit(JOURNAL_NOT_EMPTY, &j->flags); ++ ++ journal_pin_new_entry(j, 1); ++ ++ bch2_journal_buf_init(j); ++ ++ cancel_delayed_work(&j->write_work); ++ ++ bch2_journal_space_available(j); ++ ++ bch2_journal_buf_put(j, old.idx, set_need_write); ++ return true; ++} ++ ++static bool journal_entry_close(struct journal *j) ++{ ++ bool ret; ++ ++ spin_lock(&j->lock); ++ ret = __journal_entry_close(j); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* ++ * should _only_ called from journal_res_get() - when we actually want a ++ * journal reservation - journal entry is open means journal is dirty: ++ * ++ * returns: ++ * 0: success ++ * -ENOSPC: journal currently full, must invoke reclaim ++ * -EAGAIN: journal blocked, must wait ++ * -EROFS: insufficient rw devices or journal error ++ */ ++static int journal_entry_open(struct journal *j) ++{ ++ struct journal_buf *buf = journal_cur_buf(j); ++ union journal_res_state old, new; ++ int u64s; ++ u64 v; ++ ++ lockdep_assert_held(&j->lock); ++ BUG_ON(journal_entry_is_open(j)); ++ ++ if (j->blocked) ++ return -EAGAIN; ++ ++ if (j->cur_entry_error) ++ return j->cur_entry_error; ++ ++ BUG_ON(!j->cur_entry_sectors); ++ ++ buf->u64s_reserved = j->entry_u64s_reserved; ++ buf->disk_sectors = j->cur_entry_sectors; ++ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); ++ ++ u64s = (int) (buf->sectors << 9) / sizeof(u64) - ++ journal_entry_overhead(j); ++ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); ++ ++ if (u64s <= le32_to_cpu(buf->data->u64s)) ++ return -ENOSPC; ++ ++ /* ++ * Must be set before marking the journal entry as open: ++ */ ++ j->cur_entry_u64s = u64s; ++ ++ v = atomic64_read(&j->reservations.counter); ++ do { ++ old.v = new.v = v; ++ ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) ++ return -EROFS; ++ ++ /* Handle any already added entries */ ++ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); ++ ++ EBUG_ON(journal_state_count(new, new.idx)); ++ journal_state_inc(&new); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ if (j->res_get_blocked_start) ++ bch2_time_stats_update(j->blocked_time, ++ j->res_get_blocked_start); ++ j->res_get_blocked_start = 0; ++ ++ mod_delayed_work(system_freezable_wq, ++ &j->write_work, ++ msecs_to_jiffies(j->write_delay_ms)); ++ journal_wake(j); ++ return 0; ++} ++ ++static bool journal_quiesced(struct journal *j) ++{ ++ union journal_res_state state = READ_ONCE(j->reservations); ++ bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); ++ ++ if (!ret) ++ journal_entry_close(j); ++ return ret; ++} ++ ++static void journal_quiesce(struct journal *j) ++{ ++ wait_event(j->wait, journal_quiesced(j)); ++} ++ ++static void journal_write_work(struct work_struct *work) ++{ ++ struct journal *j = container_of(work, struct journal, write_work.work); ++ ++ journal_entry_close(j); ++} ++ ++/* ++ * Given an inode number, if that inode number has data in the journal that ++ * hasn't yet been flushed, return the journal sequence number that needs to be ++ * flushed: ++ */ ++u64 bch2_inode_journal_seq(struct journal *j, u64 inode) ++{ ++ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); ++ u64 seq = 0; ++ ++ if (!test_bit(h, j->buf[0].has_inode) && 
++ !test_bit(h, j->buf[1].has_inode)) ++ return 0; ++ ++ spin_lock(&j->lock); ++ if (test_bit(h, journal_cur_buf(j)->has_inode)) ++ seq = journal_cur_seq(j); ++ else if (test_bit(h, journal_prev_buf(j)->has_inode)) ++ seq = journal_cur_seq(j) - 1; ++ spin_unlock(&j->lock); ++ ++ return seq; ++} ++ ++static int __journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf; ++ bool can_discard; ++ int ret; ++retry: ++ if (journal_res_get_fast(j, res, flags)) ++ return 0; ++ ++ if (bch2_journal_error(j)) ++ return -EROFS; ++ ++ spin_lock(&j->lock); ++ ++ /* ++ * Recheck after taking the lock, so we don't race with another thread ++ * that just did journal_entry_open() and call journal_entry_close() ++ * unnecessarily ++ */ ++ if (journal_res_get_fast(j, res, flags)) { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ ++ if (!(flags & JOURNAL_RES_GET_RESERVED) && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ /* ++ * Don't want to close current journal entry, just need to ++ * invoke reclaim: ++ */ ++ ret = -ENOSPC; ++ goto unlock; ++ } ++ ++ /* ++ * If we couldn't get a reservation because the current buf filled up, ++ * and we had room for a bigger entry on disk, signal that we want to ++ * realloc the journal bufs: ++ */ ++ buf = journal_cur_buf(j); ++ if (journal_entry_is_open(j) && ++ buf->buf_size >> 9 < buf->disk_sectors && ++ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) ++ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); ++ ++ if (journal_entry_is_open(j) && ++ !__journal_entry_close(j)) { ++ /* ++ * We failed to get a reservation on the current open journal ++ * entry because it's full, and we can't close it because ++ * there's still a previous one in flight: ++ */ ++ trace_journal_entry_full(c); ++ ret = -EAGAIN; ++ } else { ++ ret = journal_entry_open(j); ++ } ++unlock: ++ if ((ret == -EAGAIN || ret == -ENOSPC) && ++ !j->res_get_blocked_start) ++ j->res_get_blocked_start = local_clock() ?: 1; ++ ++ can_discard = j->can_discard; ++ spin_unlock(&j->lock); ++ ++ if (!ret) ++ goto retry; ++ ++ if (ret == -ENOSPC) { ++ BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED)); ++ ++ /* ++ * Journal is full - can't rely on reclaim from work item due to ++ * freezing: ++ */ ++ trace_journal_full(c); ++ ++ if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { ++ if (can_discard) { ++ bch2_journal_do_discards(j); ++ goto retry; ++ } ++ ++ if (mutex_trylock(&j->reclaim_lock)) { ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++ } ++ } ++ ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Essentially the entry function to the journaling code. When bcachefs is doing ++ * a btree insert, it calls this function to get the current journal write. ++ * Journal write is the structure used set up journal writes. The calling ++ * function will then add its keys to the structure, queuing them for the next ++ * write. ++ * ++ * To ensure forward progress, the current task must not be holding any ++ * btree node write locks. 
++ */ ++int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->async_wait, ++ (ret = __journal_res_get(j, res, flags)) != -EAGAIN || ++ (flags & JOURNAL_RES_GET_NONBLOCK)); ++ return ret; ++} ++ ++/* journal_preres: */ ++ ++static bool journal_preres_available(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s) ++{ ++ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s); ++ ++ if (!ret) ++ bch2_journal_reclaim_work(&j->reclaim_work.work); ++ ++ return ret; ++} ++ ++int __bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s) ++{ ++ int ret; ++ ++ closure_wait_event(&j->preres_wait, ++ (ret = bch2_journal_error(j)) || ++ journal_preres_available(j, res, new_u64s)); ++ return ret; ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *j, ++ struct journal_entry_res *res, ++ unsigned new_u64s) ++{ ++ union journal_res_state state; ++ int d = new_u64s - res->u64s; ++ ++ spin_lock(&j->lock); ++ ++ j->entry_u64s_reserved += d; ++ if (d <= 0) ++ goto out; ++ ++ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); ++ smp_mb(); ++ state = READ_ONCE(j->reservations); ++ ++ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && ++ state.cur_entry_offset > j->cur_entry_u64s) { ++ j->cur_entry_u64s += d; ++ /* ++ * Not enough room in current journal entry, have to flush it: ++ */ ++ __journal_entry_close(j); ++ } else { ++ journal_cur_buf(j)->u64s_reserved += d; ++ } ++out: ++ spin_unlock(&j->lock); ++ res->u64s += d; ++} ++ ++/* journal flushing: */ ++ ++u64 bch2_journal_last_unwritten_seq(struct journal *j) ++{ ++ u64 seq; ++ ++ spin_lock(&j->lock); ++ seq = journal_cur_seq(j); ++ if (j->reservations.prev_buf_unwritten) ++ seq--; ++ spin_unlock(&j->lock); ++ ++ return seq; ++} ++ ++/** ++ * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't ++ * open yet, or wait if we cannot ++ * ++ * used by the btree interior update machinery, when it needs to write a new ++ * btree root - every journal entry contains the roots of all the btrees, so it ++ * doesn't need to bother with getting a journal reservation ++ */ ++int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ int ret; ++ ++ spin_lock(&j->lock); ++ ++ /* ++ * Can't try to open more than one sequence number ahead: ++ */ ++ BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); ++ ++ if (journal_cur_seq(j) > seq || ++ journal_entry_is_open(j)) { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ ++ if (journal_cur_seq(j) < seq && ++ !__journal_entry_close(j)) { ++ /* haven't finished writing out the previous one: */ ++ trace_journal_entry_full(c); ++ ret = -EAGAIN; ++ } else { ++ BUG_ON(journal_cur_seq(j) != seq); ++ ++ ret = journal_entry_open(j); ++ } ++ ++ if ((ret == -EAGAIN || ret == -ENOSPC) && ++ !j->res_get_blocked_start) ++ j->res_get_blocked_start = local_clock() ?: 1; ++ ++ if (ret == -EAGAIN || ret == -ENOSPC) ++ closure_wait(&j->async_wait, cl); ++ ++ spin_unlock(&j->lock); ++ ++ if (ret == -ENOSPC) { ++ trace_journal_full(c); ++ bch2_journal_reclaim_work(&j->reclaim_work.work); ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++ ++static int journal_seq_error(struct journal *j, u64 seq) ++{ ++ union journal_res_state state = READ_ONCE(j->reservations); ++ ++ if (seq == journal_cur_seq(j)) ++ return bch2_journal_error(j); ++ ++ 
if (seq + 1 == journal_cur_seq(j) && ++ !state.prev_buf_unwritten && ++ seq > j->seq_ondisk) ++ return -EIO; ++ ++ return 0; ++} ++ ++static inline struct journal_buf * ++journal_seq_to_buf(struct journal *j, u64 seq) ++{ ++ /* seq should be for a journal entry that has been opened: */ ++ BUG_ON(seq > journal_cur_seq(j)); ++ BUG_ON(seq == journal_cur_seq(j) && ++ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); ++ ++ if (seq == journal_cur_seq(j)) ++ return journal_cur_buf(j); ++ if (seq + 1 == journal_cur_seq(j) && ++ j->reservations.prev_buf_unwritten) ++ return journal_prev_buf(j); ++ return NULL; ++} ++ ++/** ++ * bch2_journal_wait_on_seq - wait for a journal entry to be written ++ * ++ * does _not_ cause @seq to be written immediately - if there is no other ++ * activity to cause the relevant journal entry to be filled up or flushed it ++ * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is ++ * configurable). ++ */ ++void bch2_journal_wait_on_seq(struct journal *j, u64 seq, ++ struct closure *parent) ++{ ++ struct journal_buf *buf; ++ ++ spin_lock(&j->lock); ++ ++ if ((buf = journal_seq_to_buf(j, seq))) { ++ if (!closure_wait(&buf->wait, parent)) ++ BUG(); ++ ++ if (seq == journal_cur_seq(j)) { ++ smp_mb(); ++ if (bch2_journal_error(j)) ++ closure_wake_up(&buf->wait); ++ } ++ } ++ ++ spin_unlock(&j->lock); ++} ++ ++/** ++ * bch2_journal_flush_seq_async - wait for a journal entry to be written ++ * ++ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if ++ * necessary ++ */ ++void bch2_journal_flush_seq_async(struct journal *j, u64 seq, ++ struct closure *parent) ++{ ++ struct journal_buf *buf; ++ ++ spin_lock(&j->lock); ++ ++ if (parent && ++ (buf = journal_seq_to_buf(j, seq))) ++ if (!closure_wait(&buf->wait, parent)) ++ BUG(); ++ ++ if (seq == journal_cur_seq(j)) ++ __journal_entry_close(j); ++ spin_unlock(&j->lock); ++} ++ ++static int journal_seq_flushed(struct journal *j, u64 seq) ++{ ++ int ret; ++ ++ spin_lock(&j->lock); ++ ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); ++ ++ if (seq == journal_cur_seq(j)) ++ __journal_entry_close(j); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++int bch2_journal_flush_seq(struct journal *j, u64 seq) ++{ ++ u64 start_time = local_clock(); ++ int ret, ret2; ++ ++ ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); ++ ++ bch2_time_stats_update(j->flush_seq_time, start_time); ++ ++ return ret ?: ret2 < 0 ? 
ret2 : 0; ++} ++ ++/** ++ * bch2_journal_meta_async - force a journal entry to be written ++ */ ++void bch2_journal_meta_async(struct journal *j, struct closure *parent) ++{ ++ struct journal_res res; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ bch2_journal_res_put(j, &res); ++ ++ bch2_journal_flush_seq_async(j, res.seq, parent); ++} ++ ++int bch2_journal_meta(struct journal *j) ++{ ++ struct journal_res res; ++ int ret; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ if (ret) ++ return ret; ++ ++ bch2_journal_res_put(j, &res); ++ ++ return bch2_journal_flush_seq(j, res.seq); ++} ++ ++/* ++ * bch2_journal_flush_async - if there is an open journal entry, or a journal ++ * still being written, write it and wait for the write to complete ++ */ ++void bch2_journal_flush_async(struct journal *j, struct closure *parent) ++{ ++ u64 seq, journal_seq; ++ ++ spin_lock(&j->lock); ++ journal_seq = journal_cur_seq(j); ++ ++ if (journal_entry_is_open(j)) { ++ seq = journal_seq; ++ } else if (journal_seq) { ++ seq = journal_seq - 1; ++ } else { ++ spin_unlock(&j->lock); ++ return; ++ } ++ spin_unlock(&j->lock); ++ ++ bch2_journal_flush_seq_async(j, seq, parent); ++} ++ ++int bch2_journal_flush(struct journal *j) ++{ ++ u64 seq, journal_seq; ++ ++ spin_lock(&j->lock); ++ journal_seq = journal_cur_seq(j); ++ ++ if (journal_entry_is_open(j)) { ++ seq = journal_seq; ++ } else if (journal_seq) { ++ seq = journal_seq - 1; ++ } else { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ spin_unlock(&j->lock); ++ ++ return bch2_journal_flush_seq(j, seq); ++} ++ ++/* block/unlock the journal: */ ++ ++void bch2_journal_unblock(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked--; ++ spin_unlock(&j->lock); ++ ++ journal_wake(j); ++} ++ ++void bch2_journal_block(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked++; ++ spin_unlock(&j->lock); ++ ++ journal_quiesce(j); ++} ++ ++/* allocate journal on a device: */ ++ ++static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ++ bool new_fs, struct closure *cl) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal *journal_buckets; ++ u64 *new_bucket_seq = NULL, *new_buckets = NULL; ++ int ret = 0; ++ ++ /* don't handle reducing nr of buckets yet: */ ++ if (nr <= ja->nr) ++ return 0; ++ ++ ret = -ENOMEM; ++ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ if (!new_buckets || !new_bucket_seq) ++ goto err; ++ ++ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, ++ nr + sizeof(*journal_buckets) / sizeof(u64)); ++ if (!journal_buckets) ++ goto err; ++ ++ /* ++ * We may be called from the device add path, before the new device has ++ * actually been added to the running filesystem: ++ */ ++ if (c) ++ spin_lock(&c->journal.lock); ++ ++ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); ++ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ++ if (c) ++ spin_unlock(&c->journal.lock); ++ ++ while (ja->nr < nr) { ++ struct open_bucket *ob = NULL; ++ unsigned pos; ++ long bucket; ++ ++ if (new_fs) { ++ bucket = bch2_bucket_alloc_new_fs(ca); ++ if (bucket < 0) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ } else { ++ ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, ++ false, cl); ++ if (IS_ERR(ob)) { ++ ret = cl ? 
-EAGAIN : -ENOSPC; ++ goto err; ++ } ++ ++ bucket = sector_to_bucket(ca, ob->ptr.offset); ++ } ++ ++ if (c) { ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->journal.lock); ++ } ++ ++ pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; ++ __array_insert_item(ja->buckets, ja->nr, pos); ++ __array_insert_item(ja->bucket_seq, ja->nr, pos); ++ __array_insert_item(journal_buckets->buckets, ja->nr, pos); ++ ja->nr++; ++ ++ ja->buckets[pos] = bucket; ++ ja->bucket_seq[pos] = 0; ++ journal_buckets->buckets[pos] = cpu_to_le64(bucket); ++ ++ if (pos <= ja->discard_idx) ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ if (pos <= ja->dirty_idx_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ if (pos <= ja->dirty_idx) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ if (pos <= ja->cur_idx) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ++ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), ++ 0); ++ ++ if (c) { ++ spin_unlock(&c->journal.lock); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ if (!new_fs) ++ bch2_open_bucket_put(c, ob); ++ } ++ ++ ret = 0; ++err: ++ kfree(new_bucket_seq); ++ kfree(new_buckets); ++ ++ return ret; ++} ++ ++/* ++ * Allocate more journal space at runtime - not currently making use if it, but ++ * the code works: ++ */ ++int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, ++ unsigned nr) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct closure cl; ++ unsigned current_nr; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ struct disk_reservation disk_res = { 0, 0 }; ++ ++ closure_sync(&cl); ++ ++ mutex_lock(&c->sb_lock); ++ current_nr = ja->nr; ++ ++ /* ++ * note: journal buckets aren't really counted as _sectors_ used yet, so ++ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c ++ * when space used goes up without a reservation - but we do need the ++ * reservation to ensure we'll actually be able to allocate: ++ */ ++ ++ if (bch2_disk_reservation_get(c, &disk_res, ++ bucket_to_sector(ca, nr - ja->nr), 1, 0)) { ++ mutex_unlock(&c->sb_lock); ++ return -ENOSPC; ++ } ++ ++ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (ja->nr != current_nr) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } while (ret == -EAGAIN); ++ ++ return ret; ++} ++ ++int bch2_dev_journal_alloc(struct bch_dev *ca) ++{ ++ unsigned nr; ++ ++ if (dynamic_fault("bcachefs:add:journal_alloc")) ++ return -ENOMEM; ++ ++ /* ++ * clamp journal size to 1024 buckets or 512MB (in sectors), whichever ++ * is smaller: ++ */ ++ nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, ++ BCH_JOURNAL_BUCKETS_MIN, ++ min(1 << 10, ++ (1 << 20) / ca->mi.bucket_size)); ++ ++ return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); ++} ++ ++/* startup/shutdown: */ ++ ++static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) ++{ ++ union journal_res_state state; ++ struct journal_buf *w; ++ bool ret; ++ ++ spin_lock(&j->lock); ++ state = READ_ONCE(j->reservations); ++ w = j->buf + !state.idx; ++ ++ ret = state.prev_buf_unwritten && ++ bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) ++{ ++ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); ++} ++ ++void bch2_fs_journal_stop(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, 
journal); ++ ++ bch2_journal_flush_all_pins(j); ++ ++ wait_event(j->wait, journal_entry_close(j)); ++ ++ /* do we need to write another journal entry? */ ++ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) || ++ c->btree_roots_dirty) ++ bch2_journal_meta(j); ++ ++ journal_quiesce(j); ++ ++ BUG_ON(!bch2_journal_error(j) && ++ test_bit(JOURNAL_NOT_EMPTY, &j->flags)); ++ ++ cancel_delayed_work_sync(&j->write_work); ++ cancel_delayed_work_sync(&j->reclaim_work); ++} ++ ++int bch2_fs_journal_start(struct journal *j, u64 cur_seq, ++ struct list_head *journal_entries) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ struct journal_replay *i; ++ u64 last_seq = cur_seq, nr, seq; ++ ++ if (!list_empty(journal_entries)) ++ last_seq = le64_to_cpu(list_first_entry(journal_entries, ++ struct journal_replay, ++ list)->j.seq); ++ ++ nr = cur_seq - last_seq; ++ ++ if (nr + 1 > j->pin.size) { ++ free_fifo(&j->pin); ++ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); ++ if (!j->pin.data) { ++ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); ++ return -ENOMEM; ++ } ++ } ++ ++ j->replay_journal_seq = last_seq; ++ j->replay_journal_seq_end = cur_seq; ++ j->last_seq_ondisk = last_seq; ++ j->pin.front = last_seq; ++ j->pin.back = cur_seq; ++ atomic64_set(&j->seq, cur_seq - 1); ++ ++ fifo_for_each_entry_ptr(p, &j->pin, seq) { ++ INIT_LIST_HEAD(&p->list); ++ INIT_LIST_HEAD(&p->flushed); ++ atomic_set(&p->count, 1); ++ p->devs.nr = 0; ++ } ++ ++ list_for_each_entry(i, journal_entries, list) { ++ seq = le64_to_cpu(i->j.seq); ++ ++ BUG_ON(seq < last_seq || seq >= cur_seq); ++ ++ journal_seq_pin(j, seq)->devs = i->devs; ++ } ++ ++ spin_lock(&j->lock); ++ ++ set_bit(JOURNAL_STARTED, &j->flags); ++ ++ journal_pin_new_entry(j, 1); ++ bch2_journal_buf_init(j); ++ ++ c->last_bucket_seq_cleanup = journal_cur_seq(j); ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ return 0; ++} ++ ++/* init/exit: */ ++ ++void bch2_dev_journal_exit(struct bch_dev *ca) ++{ ++ kfree(ca->journal.bio); ++ kfree(ca->journal.buckets); ++ kfree(ca->journal.bucket_seq); ++ ++ ca->journal.bio = NULL; ++ ca->journal.buckets = NULL; ++ ca->journal.bucket_seq = NULL; ++} ++ ++int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(sb); ++ unsigned i; ++ ++ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ ++ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->bucket_seq) ++ return -ENOMEM; ++ ++ ca->journal.bio = bio_kmalloc(GFP_KERNEL, ++ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); ++ if (!ca->journal.bio) ++ return -ENOMEM; ++ ++ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->buckets) ++ return -ENOMEM; ++ ++ for (i = 0; i < ja->nr; i++) ++ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ ++ return 0; ++} ++ ++void bch2_fs_journal_exit(struct journal *j) ++{ ++ kvpfree(j->buf[1].data, j->buf[1].buf_size); ++ kvpfree(j->buf[0].data, j->buf[0].buf_size); ++ free_fifo(&j->pin); ++} ++ ++int bch2_fs_journal_init(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ static struct lock_class_key res_key; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ spin_lock_init(&j->lock); ++ spin_lock_init(&j->err_lock); ++ init_waitqueue_head(&j->wait); ++ INIT_DELAYED_WORK(&j->write_work, journal_write_work); ++ 
INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); ++ init_waitqueue_head(&j->pin_flush_wait); ++ mutex_init(&j->reclaim_lock); ++ mutex_init(&j->discard_lock); ++ ++ lockdep_init_map(&j->res_map, "journal res", &res_key, 0); ++ ++ j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->write_delay_ms = 1000; ++ j->reclaim_delay_ms = 100; ++ ++ /* Btree roots: */ ++ j->entry_u64s_reserved += ++ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); ++ ++ atomic64_set(&j->reservations.counter, ++ ((union journal_res_state) ++ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); ++ ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || ++ !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || ++ !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ j->pin.front = j->pin.back = 1; ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++/* debug: */ ++ ++ssize_t bch2_journal_print_debug(struct journal *j, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ union journal_res_state s; ++ struct bch_dev *ca; ++ unsigned iter; ++ ++ rcu_read_lock(); ++ spin_lock(&j->lock); ++ s = READ_ONCE(j->reservations); ++ ++ pr_buf(&out, ++ "active journal entries:\t%llu\n" ++ "seq:\t\t\t%llu\n" ++ "last_seq:\t\t%llu\n" ++ "last_seq_ondisk:\t%llu\n" ++ "prereserved:\t\t%u/%u\n" ++ "current entry sectors:\t%u\n" ++ "current entry:\t\t", ++ fifo_used(&j->pin), ++ journal_cur_seq(j), ++ journal_last_seq(j), ++ j->last_seq_ondisk, ++ j->prereserved.reserved, ++ j->prereserved.remaining, ++ j->cur_entry_sectors); ++ ++ switch (s.cur_entry_offset) { ++ case JOURNAL_ENTRY_ERROR_VAL: ++ pr_buf(&out, "error\n"); ++ break; ++ case JOURNAL_ENTRY_CLOSED_VAL: ++ pr_buf(&out, "closed\n"); ++ break; ++ default: ++ pr_buf(&out, "%u/%u\n", ++ s.cur_entry_offset, ++ j->cur_entry_u64s); ++ break; ++ } ++ ++ pr_buf(&out, ++ "current entry refs:\t%u\n" ++ "prev entry unwritten:\t", ++ journal_state_count(s, s.idx)); ++ ++ if (s.prev_buf_unwritten) ++ pr_buf(&out, "yes, ref %u sectors %u\n", ++ journal_state_count(s, !s.idx), ++ journal_prev_buf(j)->sectors); ++ else ++ pr_buf(&out, "no\n"); ++ ++ pr_buf(&out, ++ "need write:\t\t%i\n" ++ "replay done:\t\t%i\n", ++ test_bit(JOURNAL_NEED_WRITE, &j->flags), ++ test_bit(JOURNAL_REPLAY_DONE, &j->flags)); ++ ++ for_each_member_device_rcu(ca, c, iter, ++ &c->rw_devs[BCH_DATA_JOURNAL]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!ja->nr) ++ continue; ++ ++ pr_buf(&out, ++ "dev %u:\n" ++ "\tnr\t\t%u\n" ++ "\tavailable\t%u:%u\n" ++ "\tdiscard_idx\t\t%u\n" ++ "\tdirty_idx_ondisk\t%u (seq %llu)\n" ++ "\tdirty_idx\t\t%u (seq %llu)\n" ++ "\tcur_idx\t\t%u (seq %llu)\n", ++ iter, ja->nr, ++ bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ++ ja->sectors_free, ++ ja->discard_idx, ++ ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], ++ ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], ++ ja->cur_idx, ja->bucket_seq[ja->cur_idx]); ++ } ++ ++ spin_unlock(&j->lock); ++ rcu_read_unlock(); ++ ++ return out.pos - buf; ++} ++ ++ssize_t bch2_journal_print_pins(struct journal *j, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *pin; ++ u64 i; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(pin_list, &j->pin, i) { ++ pr_buf(&out, "%llu: count %u\n", ++ i, 
atomic_read(&pin_list->count)); ++ ++ list_for_each_entry(pin, &pin_list->list, list) ++ pr_buf(&out, "\t%p %pf\n", ++ pin, pin->flush); ++ ++ if (!list_empty(&pin_list->flushed)) ++ pr_buf(&out, "flushed:\n"); ++ ++ list_for_each_entry(pin, &pin_list->flushed, list) ++ pr_buf(&out, "\t%p %pf\n", ++ pin, pin->flush); ++ } ++ spin_unlock(&j->lock); ++ ++ return out.pos - buf; ++} +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +new file mode 100644 +index 000000000000..f0da2c52581c +--- /dev/null ++++ b/fs/bcachefs/journal.h +@@ -0,0 +1,495 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_H ++#define _BCACHEFS_JOURNAL_H ++ ++/* ++ * THE JOURNAL: ++ * ++ * The primary purpose of the journal is to log updates (insertions) to the ++ * b-tree, to avoid having to do synchronous updates to the b-tree on disk. ++ * ++ * Without the journal, the b-tree is always internally consistent on ++ * disk - and in fact, in the earliest incarnations bcache didn't have a journal ++ * but did handle unclean shutdowns by doing all index updates synchronously ++ * (with coalescing). ++ * ++ * Updates to interior nodes still happen synchronously and without the journal ++ * (for simplicity) - this may change eventually but updates to interior nodes ++ * are rare enough it's not a huge priority. ++ * ++ * This means the journal is relatively separate from the b-tree; it consists of ++ * just a list of keys and journal replay consists of just redoing those ++ * insertions in same order that they appear in the journal. ++ * ++ * PERSISTENCE: ++ * ++ * For synchronous updates (where we're waiting on the index update to hit ++ * disk), the journal entry will be written out immediately (or as soon as ++ * possible, if the write for the previous journal entry was still in flight). ++ * ++ * Synchronous updates are specified by passing a closure (@flush_cl) to ++ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter ++ * down to the journalling code. That closure will will wait on the journal ++ * write to complete (via closure_wait()). ++ * ++ * If the index update wasn't synchronous, the journal entry will be ++ * written out after 10 ms have elapsed, by default (the delay_ms field ++ * in struct journal). ++ * ++ * JOURNAL ENTRIES: ++ * ++ * A journal entry is variable size (struct jset), it's got a fixed length ++ * header and then a variable number of struct jset_entry entries. ++ * ++ * Journal entries are identified by monotonically increasing 64 bit sequence ++ * numbers - jset->seq; other places in the code refer to this sequence number. ++ * ++ * A jset_entry entry contains one or more bkeys (which is what gets inserted ++ * into the b-tree). We need a container to indicate which b-tree the key is ++ * for; also, the roots of the various b-trees are stored in jset_entry entries ++ * (one for each b-tree) - this lets us add new b-tree types without changing ++ * the on disk format. ++ * ++ * We also keep some things in the journal header that are logically part of the ++ * superblock - all the things that are frequently updated. This is for future ++ * bcache on raw flash support; the superblock (which will become another ++ * journal) can't be moved or wear leveled, so it contains just enough ++ * information to find the main journal, and the superblock only has to be ++ * rewritten when we want to move/wear level the main journal. 
++ * ++ * JOURNAL LAYOUT ON DISK: ++ * ++ * The journal is written to a ringbuffer of buckets (which is kept in the ++ * superblock); the individual buckets are not necessarily contiguous on disk ++ * which means that journal entries are not allowed to span buckets, but also ++ * that we can resize the journal at runtime if desired (unimplemented). ++ * ++ * The journal buckets exist in the same pool as all the other buckets that are ++ * managed by the allocator and garbage collection - garbage collection marks ++ * the journal buckets as metadata buckets. ++ * ++ * OPEN/DIRTY JOURNAL ENTRIES: ++ * ++ * Open/dirty journal entries are journal entries that contain b-tree updates ++ * that have not yet been written out to the b-tree on disk. We have to track ++ * which journal entries are dirty, and we also have to avoid wrapping around ++ * the journal and overwriting old but still dirty journal entries with new ++ * journal entries. ++ * ++ * On disk, this is represented with the "last_seq" field of struct jset; ++ * last_seq is the first sequence number that journal replay has to replay. ++ * ++ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in ++ * journal_device->seq) of for each journal bucket, the highest sequence number ++ * any journal entry it contains. Then, by comparing that against last_seq we ++ * can determine whether that journal bucket contains dirty journal entries or ++ * not. ++ * ++ * To track which journal entries are dirty, we maintain a fifo of refcounts ++ * (where each entry corresponds to a specific sequence number) - when a ref ++ * goes to 0, that journal entry is no longer dirty. ++ * ++ * Journalling of index updates is done at the same time as the b-tree itself is ++ * being modified (see btree_insert_key()); when we add the key to the journal ++ * the pending b-tree write takes a ref on the journal entry the key was added ++ * to. If a pending b-tree write would need to take refs on multiple dirty ++ * journal entries, it only keeps the ref on the oldest one (since a newer ++ * journal entry will still be replayed if an older entry was dirty). ++ * ++ * JOURNAL FILLING UP: ++ * ++ * There are two ways the journal could fill up; either we could run out of ++ * space to write to, or we could have too many open journal entries and run out ++ * of room in the fifo of refcounts. Since those refcounts are decremented ++ * without any locking we can't safely resize that fifo, so we handle it the ++ * same way. ++ * ++ * If the journal fills up, we start flushing dirty btree nodes until we can ++ * allocate space for a journal write again - preferentially flushing btree ++ * nodes that are pinning the oldest journal entries first. 
++ */ ++ ++#include ++ ++#include "journal_types.h" ++ ++struct bch_fs; ++ ++static inline void journal_wake(struct journal *j) ++{ ++ wake_up(&j->wait); ++ closure_wake_up(&j->async_wait); ++ closure_wake_up(&j->preres_wait); ++} ++ ++static inline struct journal_buf *journal_cur_buf(struct journal *j) ++{ ++ return j->buf + j->reservations.idx; ++} ++ ++static inline struct journal_buf *journal_prev_buf(struct journal *j) ++{ ++ return j->buf + !j->reservations.idx; ++} ++ ++/* Sequence number of oldest dirty journal entry */ ++ ++static inline u64 journal_last_seq(struct journal *j) ++{ ++ return j->pin.front; ++} ++ ++static inline u64 journal_cur_seq(struct journal *j) ++{ ++ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); ++ ++ return j->pin.back - 1; ++} ++ ++u64 bch2_inode_journal_seq(struct journal *, u64); ++ ++static inline int journal_state_count(union journal_res_state s, int idx) ++{ ++ return idx == 0 ? s.buf0_count : s.buf1_count; ++} ++ ++static inline void journal_state_inc(union journal_res_state *s) ++{ ++ s->buf0_count += s->idx == 0; ++ s->buf1_count += s->idx == 1; ++} ++ ++static inline void bch2_journal_set_has_inode(struct journal *j, ++ struct journal_res *res, ++ u64 inum) ++{ ++ struct journal_buf *buf = &j->buf[res->idx]; ++ unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); ++ ++ /* avoid atomic op if possible */ ++ if (unlikely(!test_bit(bit, buf->has_inode))) ++ set_bit(bit, buf->has_inode); ++} ++ ++/* ++ * Amount of space that will be taken up by some keys in the journal (i.e. ++ * including the jset header) ++ */ ++static inline unsigned jset_u64s(unsigned u64s) ++{ ++ return u64s + sizeof(struct jset_entry) / sizeof(u64); ++} ++ ++static inline int journal_entry_overhead(struct journal *j) ++{ ++ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; ++} ++ ++static inline struct jset_entry * ++bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) ++{ ++ struct jset *jset = buf->data; ++ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); ++ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ ++ le32_add_cpu(&jset->u64s, jset_u64s(u64s)); ++ ++ return entry; ++} ++ ++static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, ++ unsigned type, enum btree_id id, ++ unsigned level, ++ const void *data, unsigned u64s) ++{ ++ struct journal_buf *buf = &j->buf[res->idx]; ++ struct jset_entry *entry = vstruct_idx(buf->data, res->offset); ++ unsigned actual = jset_u64s(u64s); ++ ++ EBUG_ON(!res->ref); ++ EBUG_ON(actual > res->u64s); ++ ++ res->offset += actual; ++ res->u64s -= actual; ++ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ entry->type = type; ++ entry->btree_id = id; ++ entry->level = level; ++ memcpy_u64s(entry->_data, data, u64s); ++} ++ ++static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, ++ enum btree_id id, const struct bkey_i *k) ++{ ++ bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, ++ id, 0, k, k->k.u64s); ++} ++ ++static inline bool journal_entry_empty(struct jset *j) ++{ ++ struct jset_entry *i; ++ ++ if (j->seq != j->last_seq) ++ return false; ++ ++ vstruct_for_each(j, i) ++ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) ++ return false; ++ return true; ++} ++ ++void __bch2_journal_buf_put(struct journal *, bool); ++ ++static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, ++ bool need_write_just_set) ++{ ++ union 
journal_res_state s; ++ ++ s.v = atomic64_sub_return(((union journal_res_state) { ++ .buf0_count = idx == 0, ++ .buf1_count = idx == 1, ++ }).v, &j->reservations.counter); ++ if (!journal_state_count(s, idx)) { ++ EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); ++ __bch2_journal_buf_put(j, need_write_just_set); ++ } ++} ++ ++/* ++ * This function releases the journal write structure so other threads can ++ * then proceed to add their keys as well. ++ */ ++static inline void bch2_journal_res_put(struct journal *j, ++ struct journal_res *res) ++{ ++ if (!res->ref) ++ return; ++ ++ lock_release(&j->res_map, _THIS_IP_); ++ ++ while (res->u64s) ++ bch2_journal_add_entry(j, res, ++ BCH_JSET_ENTRY_btree_keys, ++ 0, 0, NULL, 0); ++ ++ bch2_journal_buf_put(j, res->idx, false); ++ ++ res->ref = 0; ++} ++ ++int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, ++ unsigned); ++ ++#define JOURNAL_RES_GET_NONBLOCK (1 << 0) ++#define JOURNAL_RES_GET_CHECK (1 << 1) ++#define JOURNAL_RES_GET_RESERVED (1 << 2) ++ ++static inline int journal_res_get_fast(struct journal *j, ++ struct journal_res *res, ++ unsigned flags) ++{ ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ ++ do { ++ old.v = new.v = v; ++ ++ /* ++ * Check if there is still room in the current journal ++ * entry: ++ */ ++ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) ++ return 0; ++ ++ EBUG_ON(!journal_state_count(new, new.idx)); ++ ++ if (!(flags & JOURNAL_RES_GET_RESERVED) && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_CHECK) ++ return 1; ++ ++ new.cur_entry_offset += res->u64s; ++ journal_state_inc(&new); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ res->ref = true; ++ res->idx = old.idx; ++ res->offset = old.cur_entry_offset; ++ res->seq = le64_to_cpu(j->buf[old.idx].data->seq); ++ return 1; ++} ++ ++static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned u64s, unsigned flags) ++{ ++ int ret; ++ ++ EBUG_ON(res->ref); ++ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ ++ res->u64s = u64s; ++ ++ if (journal_res_get_fast(j, res, flags)) ++ goto out; ++ ++ ret = bch2_journal_res_get_slowpath(j, res, flags); ++ if (ret) ++ return ret; ++out: ++ if (!(flags & JOURNAL_RES_GET_CHECK)) { ++ lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); ++ EBUG_ON(!res->ref); ++ } ++ return 0; ++} ++ ++/* journal_preres: */ ++ ++static inline bool journal_check_may_get_unreserved(struct journal *j) ++{ ++ union journal_preres_state s = READ_ONCE(j->prereserved); ++ bool ret = s.reserved <= s.remaining && ++ fifo_free(&j->pin) > 8; ++ ++ lockdep_assert_held(&j->lock); ++ ++ if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ if (ret) { ++ set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); ++ journal_wake(j); ++ } else { ++ clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); ++ } ++ } ++ return ret; ++} ++ ++static inline void bch2_journal_preres_put(struct journal *j, ++ struct journal_preres *res) ++{ ++ union journal_preres_state s = { .reserved = res->u64s }; ++ ++ if (!res->u64s) ++ return; ++ ++ s.v = atomic64_sub_return(s.v, &j->prereserved.counter); ++ res->u64s = 0; ++ closure_wake_up(&j->preres_wait); ++ ++ if (s.reserved <= s.remaining && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ spin_lock(&j->lock); ++ journal_check_may_get_unreserved(j); ++ spin_unlock(&j->lock); ++ } ++} ++ ++int 
__bch2_journal_preres_get(struct journal *, ++ struct journal_preres *, unsigned); ++ ++static inline int bch2_journal_preres_get_fast(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s) ++{ ++ int d = new_u64s - res->u64s; ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ ++ do { ++ old.v = new.v = v; ++ ++ new.reserved += d; ++ ++ if (new.reserved > new.remaining) ++ return 0; ++ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++ ++ res->u64s += d; ++ return 1; ++} ++ ++static inline int bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ if (new_u64s <= res->u64s) ++ return 0; ++ ++ if (bch2_journal_preres_get_fast(j, res, new_u64s)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_NONBLOCK) ++ return -EAGAIN; ++ ++ return __bch2_journal_preres_get(j, res, new_u64s); ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *, ++ struct journal_entry_res *, ++ unsigned); ++ ++u64 bch2_journal_last_unwritten_seq(struct journal *); ++int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); ++ ++void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); ++void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); ++void bch2_journal_flush_async(struct journal *, struct closure *); ++void bch2_journal_meta_async(struct journal *, struct closure *); ++ ++int bch2_journal_flush_seq(struct journal *, u64); ++int bch2_journal_flush(struct journal *); ++int bch2_journal_meta(struct journal *); ++ ++void bch2_journal_halt(struct journal *); ++ ++static inline int bch2_journal_error(struct journal *j) ++{ ++ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ++ ? 
-EIO : 0; ++} ++ ++struct bch_dev; ++ ++static inline bool journal_flushes_device(struct bch_dev *ca) ++{ ++ return true; ++} ++ ++static inline void bch2_journal_set_replay_done(struct journal *j) ++{ ++ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ set_bit(JOURNAL_REPLAY_DONE, &j->flags); ++} ++ ++void bch2_journal_unblock(struct journal *); ++void bch2_journal_block(struct journal *); ++ ++ssize_t bch2_journal_print_debug(struct journal *, char *); ++ssize_t bch2_journal_print_pins(struct journal *, char *); ++ ++int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, ++ unsigned nr); ++int bch2_dev_journal_alloc(struct bch_dev *); ++ ++void bch2_dev_journal_stop(struct journal *, struct bch_dev *); ++ ++void bch2_fs_journal_stop(struct journal *); ++int bch2_fs_journal_start(struct journal *, u64, struct list_head *); ++ ++void bch2_dev_journal_exit(struct bch_dev *); ++int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); ++void bch2_fs_journal_exit(struct journal *); ++int bch2_fs_journal_init(struct journal *); ++ ++#endif /* _BCACHEFS_JOURNAL_H */ +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +new file mode 100644 +index 000000000000..387377dadab5 +--- /dev/null ++++ b/fs/bcachefs/journal_io.c +@@ -0,0 +1,1123 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++ ++#include ++ ++struct journal_list { ++ struct closure cl; ++ struct mutex lock; ++ struct list_head *head; ++ int ret; ++}; ++ ++#define JOURNAL_ENTRY_ADD_OK 0 ++#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 ++ ++/* ++ * Given a journal entry we just read, add it to the list of journal entries to ++ * be replayed: ++ */ ++static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, ++ struct journal_list *jlist, struct jset *j) ++{ ++ struct journal_replay *i, *pos; ++ struct list_head *where; ++ size_t bytes = vstruct_bytes(j); ++ __le64 last_seq; ++ int ret; ++ ++ last_seq = !list_empty(jlist->head) ++ ? list_last_entry(jlist->head, struct journal_replay, ++ list)->j.last_seq ++ : 0; ++ ++ /* Is this entry older than the range we need? */ ++ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { ++ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; ++ goto out; ++ } ++ ++ /* Drop entries we don't need anymore */ ++ list_for_each_entry_safe(i, pos, jlist->head, list) { ++ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) ++ break; ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } ++ ++ list_for_each_entry_reverse(i, jlist->head, list) { ++ /* Duplicate? 
*/ ++ if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { ++ fsck_err_on(bytes != vstruct_bytes(&i->j) || ++ memcmp(j, &i->j, bytes), c, ++ "found duplicate but non identical journal entries (seq %llu)", ++ le64_to_cpu(j->seq)); ++ goto found; ++ } ++ ++ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { ++ where = &i->list; ++ goto add; ++ } ++ } ++ ++ where = jlist->head; ++add: ++ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); ++ if (!i) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_add(&i->list, where); ++ i->devs.nr = 0; ++ memcpy(&i->j, j, bytes); ++found: ++ if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) ++ bch2_dev_list_add_dev(&i->devs, ca->dev_idx); ++ else ++ fsck_err_on(1, c, "duplicate journal entries on same device"); ++ ret = JOURNAL_ENTRY_ADD_OK; ++out: ++fsck_err: ++ return ret; ++} ++ ++static struct nonce journal_nonce(const struct jset *jset) ++{ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = ((__le32 *) &jset->seq)[0], ++ [2] = ((__le32 *) &jset->seq)[1], ++ [3] = BCH_NONCE_JOURNAL, ++ }}; ++} ++ ++/* this fills in a range with empty jset_entries: */ ++static void journal_entry_null_range(void *start, void *end) ++{ ++ struct jset_entry *entry; ++ ++ for (entry = start; entry != end; entry = vstruct_next(entry)) ++ memset(entry, 0, sizeof(*entry)); ++} ++ ++#define JOURNAL_ENTRY_REREAD 5 ++#define JOURNAL_ENTRY_NONE 6 ++#define JOURNAL_ENTRY_BAD 7 ++ ++#define journal_entry_err(c, msg, ...) \ ++({ \ ++ switch (write) { \ ++ case READ: \ ++ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write:\n" \ ++ msg, ##__VA_ARGS__); \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++ true; \ ++}) ++ ++#define journal_entry_err_on(cond, c, msg, ...) \ ++ ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) ++ ++static int journal_validate_key(struct bch_fs *c, struct jset *jset, ++ struct jset_entry *entry, ++ struct bkey_i *k, enum btree_node_type key_type, ++ const char *type, int write) ++{ ++ void *next = vstruct_next(entry); ++ const char *invalid; ++ unsigned version = le32_to_cpu(jset->version); ++ int ret = 0; ++ ++ if (journal_entry_err_on(!k->k.u64s, c, ++ "invalid %s in journal: k->u64s 0", type)) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (journal_entry_err_on((void *) bkey_next(k) > ++ (void *) vstruct_next(entry), c, ++ "invalid %s in journal: extends past end of journal entry", ++ type)) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, ++ "invalid %s in journal: bad format %u", ++ type, k->k.format)) { ++ le16_add_cpu(&entry->u64s, -k->k.u64s); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) ++ bch2_bkey_swab(NULL, bkey_to_packed(k)); ++ ++ if (!write && ++ version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(key_type, bkey_to_packed(k), write); ++ ++ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); ++ mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", ++ type, invalid, buf); ++ ++ le16_add_cpu(&entry->u64s, -k->k.u64s); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (write && ++ version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(key_type, bkey_to_packed(k), write); ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_btree_keys(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct bkey_i *k; ++ ++ vstruct_for_each(entry, k) { ++ int ret = journal_validate_key(c, jset, entry, k, ++ __btree_node_type(entry->level, ++ entry->btree_id), ++ "key", write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int journal_entry_validate_btree_root(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct bkey_i *k = entry->start; ++ int ret = 0; ++ ++ if (journal_entry_err_on(!entry->u64s || ++ le16_to_cpu(entry->u64s) != k->k.u64s, c, ++ "invalid btree root journal entry: wrong number of keys")) { ++ void *next = vstruct_next(entry); ++ /* ++ * we don't want to null out this jset_entry, ++ * just the contents, so that later we can tell ++ * we were _supposed_ to have a btree root ++ */ ++ entry->u64s = 0; ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE, ++ "btree root", write); ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_prio_ptrs(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ /* obsolete, don't care: */ ++ return 0; ++} ++ ++static int journal_entry_validate_blacklist(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, ++ 
"invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ } ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_blacklist_v2(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_blacklist_v2 *bl_entry; ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ goto out; ++ } ++ ++ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > ++ le64_to_cpu(bl_entry->end), c, ++ "invalid journal seq blacklist entry: start > end")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ } ++out: ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_usage(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < sizeof(*u), ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_data_usage(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < sizeof(*u) || ++ bytes < sizeof(*u) + u->r.nr_devs, ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++struct jset_entry_ops { ++ int (*validate)(struct bch_fs *, struct jset *, ++ struct jset_entry *, int); ++}; ++ ++static const struct jset_entry_ops bch2_jset_entry_ops[] = { ++#define x(f, nr) \ ++ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ ++ .validate = journal_entry_validate_##f, \ ++ }, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++}; ++ ++static int journal_entry_validate(struct bch_fs *c, struct jset *jset, ++ struct jset_entry *entry, int write) ++{ ++ return entry->type < BCH_JSET_ENTRY_NR ++ ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, ++ entry, write) ++ : 0; ++} ++ ++static int jset_validate_entries(struct bch_fs *c, struct jset *jset, ++ int write) ++{ ++ struct jset_entry *entry; ++ int ret = 0; ++ ++ vstruct_for_each(jset, entry) { ++ if (journal_entry_err_on(vstruct_next(entry) > ++ vstruct_last(jset), c, ++ "journal entry extends past end of jset")) { ++ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); ++ break; ++ } ++ ++ ret = journal_entry_validate(c, jset, entry, write); ++ if (ret) ++ break; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int jset_validate(struct bch_fs *c, ++ struct jset *jset, u64 sector, ++ unsigned bucket_sectors_left, ++ unsigned sectors_read, ++ int write) ++{ ++ size_t bytes = vstruct_bytes(jset); ++ struct bch_csum csum; ++ unsigned version; ++ int ret = 0; ++ ++ if (le64_to_cpu(jset->magic) != jset_magic(c)) ++ return JOURNAL_ENTRY_NONE; ++ ++ version = le32_to_cpu(jset->version); ++ if ((version != BCH_JSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max) { ++ bch_err(c, "unknown journal entry version %u", jset->version); ++ return BCH_FSCK_UNKNOWN_VERSION; ++ } ++ ++ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, ++ "journal entry too big (%zu bytes), sector %lluu", ++ bytes, sector)) { ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ if (bytes > sectors_read << 9) ++ return JOURNAL_ENTRY_REREAD; ++ ++ if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, ++ "journal entry with unknown csum type %llu sector %lluu", ++ JSET_CSUM_TYPE(jset), sector)) ++ return JOURNAL_ENTRY_BAD; ++ ++ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); ++ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, ++ "journal checksum bad, sector %llu", sector)) { ++ /* XXX: retry IO, when we start retrying checksum errors */ ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ ++ if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, ++ "invalid journal entry: last_seq > seq")) ++ jset->last_seq = jset->seq; ++ ++ return 0; ++fsck_err: ++ return ret; ++} ++ ++struct journal_read_buf { ++ void *data; ++ size_t size; ++}; ++ ++static int journal_read_buf_realloc(struct journal_read_buf *b, ++ size_t new_size) ++{ ++ void *n; ++ ++ /* the bios are sized for this many pages, max: */ ++ if (new_size > JOURNAL_ENTRY_SIZE_MAX) ++ return -ENOMEM; ++ ++ new_size = roundup_pow_of_two(new_size); ++ n = kvpmalloc(new_size, GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ kvpfree(b->data, b->size); ++ b->data = n; ++ b->size = new_size; ++ return 0; ++} ++ ++static int journal_read_bucket(struct bch_dev *ca, ++ struct journal_read_buf *buf, ++ struct journal_list *jlist, ++ unsigned bucket) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ struct jset *j = NULL; ++ unsigned sectors, sectors_read = 0; ++ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), ++ end = offset + ca->mi.bucket_size; ++ bool saw_bad = false; ++ int ret = 0; ++ ++ pr_debug("reading %u", bucket); ++ ++ while (offset < end) { ++ if (!sectors_read) { ++ struct bio *bio; ++reread: ++ sectors_read = min_t(unsigned, ++ end - offset, buf->size >> 9); ++ ++ bio = bio_kmalloc(GFP_KERNEL, 
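++				/* enough bio_vecs to map sectors_read worth of the read buffer: */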
++ buf_pages(buf->data, ++ sectors_read << 9)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = offset; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch2_bio_map(bio, buf->data, sectors_read << 9); ++ ++ ret = submit_bio_wait(bio); ++ bio_put(bio); ++ ++ if (bch2_dev_io_err_on(ret, ca, ++ "journal read from sector %llu", ++ offset) || ++ bch2_meta_read_fault("journal")) ++ return -EIO; ++ ++ j = buf->data; ++ } ++ ++ ret = jset_validate(c, j, offset, ++ end - offset, sectors_read, ++ READ); ++ switch (ret) { ++ case BCH_FSCK_OK: ++ break; ++ case JOURNAL_ENTRY_REREAD: ++ if (vstruct_bytes(j) > buf->size) { ++ ret = journal_read_buf_realloc(buf, ++ vstruct_bytes(j)); ++ if (ret) ++ return ret; ++ } ++ goto reread; ++ case JOURNAL_ENTRY_NONE: ++ if (!saw_bad) ++ return 0; ++ sectors = c->opts.block_size; ++ goto next_block; ++ case JOURNAL_ENTRY_BAD: ++ saw_bad = true; ++ sectors = c->opts.block_size; ++ goto next_block; ++ default: ++ return ret; ++ } ++ ++ /* ++ * This happens sometimes if we don't have discards on - ++ * when we've partially overwritten a bucket with new ++ * journal entries. We don't need the rest of the ++ * bucket: ++ */ ++ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) ++ return 0; ++ ++ ja->bucket_seq[bucket] = le64_to_cpu(j->seq); ++ ++ mutex_lock(&jlist->lock); ++ ret = journal_entry_add(c, ca, jlist, j); ++ mutex_unlock(&jlist->lock); ++ ++ switch (ret) { ++ case JOURNAL_ENTRY_ADD_OK: ++ break; ++ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: ++ break; ++ default: ++ return ret; ++ } ++ ++ sectors = vstruct_sectors(j, c->block_bits); ++next_block: ++ pr_debug("next"); ++ offset += sectors; ++ sectors_read -= sectors; ++ j = ((void *) j) + (sectors << 9); ++ } ++ ++ return 0; ++} ++ ++static void bch2_journal_read_device(struct closure *cl) ++{ ++ struct journal_device *ja = ++ container_of(cl, struct journal_device, read); ++ struct bch_dev *ca = container_of(ja, struct bch_dev, journal); ++ struct journal_list *jlist = ++ container_of(cl->parent, struct journal_list, cl); ++ struct journal_read_buf buf = { NULL, 0 }; ++ u64 min_seq = U64_MAX; ++ unsigned i; ++ int ret; ++ ++ if (!ja->nr) ++ goto out; ++ ++ ret = journal_read_buf_realloc(&buf, PAGE_SIZE); ++ if (ret) ++ goto err; ++ ++ pr_debug("%u journal buckets", ja->nr); ++ ++ for (i = 0; i < ja->nr; i++) { ++ ret = journal_read_bucket(ca, &buf, jlist, i); ++ if (ret) ++ goto err; ++ } ++ ++ /* Find the journal bucket with the highest sequence number: */ ++ for (i = 0; i < ja->nr; i++) { ++ if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) ++ ja->cur_idx = i; ++ ++ min_seq = min(ja->bucket_seq[i], min_seq); ++ } ++ ++ /* ++ * If there's duplicate journal entries in multiple buckets (which ++ * definitely isn't supposed to happen, but...) 
- make sure to start ++ * cur_idx at the last of those buckets, so we don't deadlock trying to ++ * allocate ++ */ ++ while (ja->bucket_seq[ja->cur_idx] > min_seq && ++ ja->bucket_seq[ja->cur_idx] > ++ ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ++ ja->sectors_free = 0; ++ ++ /* ++ * Set dirty_idx to indicate the entire journal is full and needs to be ++ * reclaimed - journal reclaim will immediately reclaim whatever isn't ++ * pinned when it first runs: ++ */ ++ ja->discard_idx = ja->dirty_idx_ondisk = ++ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; ++out: ++ kvpfree(buf.data, buf.size); ++ percpu_ref_put(&ca->io_ref); ++ closure_return(cl); ++ return; ++err: ++ mutex_lock(&jlist->lock); ++ jlist->ret = ret; ++ mutex_unlock(&jlist->lock); ++ goto out; ++} ++ ++int bch2_journal_read(struct bch_fs *c, struct list_head *list) ++{ ++ struct journal_list jlist; ++ struct journal_replay *i; ++ struct bch_dev *ca; ++ unsigned iter; ++ size_t keys = 0, entries = 0; ++ bool degraded = false; ++ int ret = 0; ++ ++ closure_init_stack(&jlist.cl); ++ mutex_init(&jlist.lock); ++ jlist.head = list; ++ jlist.ret = 0; ++ ++ for_each_member_device(ca, c, iter) { ++ if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) ++ continue; ++ ++ if ((ca->mi.state == BCH_MEMBER_STATE_RW || ++ ca->mi.state == BCH_MEMBER_STATE_RO) && ++ percpu_ref_tryget(&ca->io_ref)) ++ closure_call(&ca->journal.read, ++ bch2_journal_read_device, ++ system_unbound_wq, ++ &jlist.cl); ++ else ++ degraded = true; ++ } ++ ++ closure_sync(&jlist.cl); ++ ++ if (jlist.ret) ++ return jlist.ret; ++ ++ list_for_each_entry(i, list, list) { ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct bch_replicas_padded replicas; ++ char buf[80]; ++ ++ ret = jset_validate_entries(c, &i->j, READ); ++ if (ret) ++ goto fsck_err; ++ ++ /* ++ * If we're mounting in degraded mode - if we didn't read all ++ * the devices - this is wrong: ++ */ ++ ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); ++ ++ if (!degraded && ++ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, ++ "superblock not marked as containing replicas %s", ++ (bch2_replicas_entry_to_text(&PBUF(buf), ++ &replicas.e), buf)))) { ++ ret = bch2_mark_replicas(c, &replicas.e); ++ if (ret) ++ return ret; ++ } ++ ++ for_each_jset_key(k, _n, entry, &i->j) ++ keys++; ++ entries++; ++ } ++ ++ if (!list_empty(list)) { ++ i = list_last_entry(list, struct journal_replay, list); ++ ++ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", ++ keys, entries, le64_to_cpu(i->j.seq)); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* journal write: */ ++ ++static void __journal_write_alloc(struct journal *j, ++ struct journal_buf *w, ++ struct dev_alloc_list *devs_sorted, ++ unsigned sectors, ++ unsigned *replicas, ++ unsigned replicas_want) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (*replicas >= replicas_want) ++ return; ++ ++ for (i = 0; i < devs_sorted->nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ /* ++ * Check that we can use this device, and aren't already using ++ * it: ++ */ ++ if (!ca->mi.durability || ++ ca->mi.state != BCH_MEMBER_STATE_RW || ++ !ja->nr || ++ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), ++ ca->dev_idx) || ++ 
sectors > ja->sectors_free) ++ continue; ++ ++ bch2_dev_stripe_increment(c, ca, &j->wp.stripe); ++ ++ bch2_bkey_append_ptr(&w->key, ++ (struct bch_extent_ptr) { ++ .offset = bucket_to_sector(ca, ++ ja->buckets[ja->cur_idx]) + ++ ca->mi.bucket_size - ++ ja->sectors_free, ++ .dev = ca->dev_idx, ++ }); ++ ++ ja->sectors_free -= sectors; ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ ++ *replicas += ca->mi.durability; ++ ++ if (*replicas >= replicas_want) ++ break; ++ } ++} ++ ++/** ++ * journal_next_bucket - move on to the next journal bucket if possible ++ */ ++static int journal_write_alloc(struct journal *j, struct journal_buf *w, ++ unsigned sectors) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ struct dev_alloc_list devs_sorted; ++ unsigned i, replicas = 0, replicas_want = ++ READ_ONCE(c->opts.metadata_replicas); ++ ++ rcu_read_lock(); ++ ++ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, ++ &c->rw_devs[BCH_DATA_JOURNAL]); ++ ++ __journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++ ++ if (replicas >= replicas_want) ++ goto done; ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ if (sectors > ja->sectors_free && ++ sectors <= ca->mi.bucket_size && ++ bch2_journal_dev_buckets_available(j, ja, ++ journal_space_discarded)) { ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ /* ++ * ja->bucket_seq[ja->cur_idx] must always have ++ * something sensible: ++ */ ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ } ++ } ++ ++ __journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++done: ++ rcu_read_unlock(); ++ ++ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; ++} ++ ++static void journal_write_compact(struct jset *jset) ++{ ++ struct jset_entry *i, *next, *prev = NULL; ++ ++ /* ++ * Simple compaction, dropping empty jset_entries (from journal ++ * reservations that weren't fully used) and merging jset_entries that ++ * can be. ++ * ++ * If we wanted to be really fancy here, we could sort all the keys in ++ * the jset and drop keys that were overwritten - probably not worth it: ++ */ ++ vstruct_for_each_safe(jset, i, next) { ++ unsigned u64s = le16_to_cpu(i->u64s); ++ ++ /* Empty entry: */ ++ if (!u64s) ++ continue; ++ ++ /* Can we merge with previous entry? */ ++ if (prev && ++ i->btree_id == prev->btree_id && ++ i->level == prev->level && ++ i->type == prev->type && ++ i->type == BCH_JSET_ENTRY_btree_keys && ++ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { ++ memmove_u64s_down(vstruct_next(prev), ++ i->_data, ++ u64s); ++ le16_add_cpu(&prev->u64s, u64s); ++ continue; ++ } ++ ++ /* Couldn't merge, move i into new position (after prev): */ ++ prev = prev ? vstruct_next(prev) : jset->start; ++ if (i != prev) ++ memmove_u64s_down(prev, i, jset_u64s(u64s)); ++ } ++ ++ prev = prev ? 
vstruct_next(prev) : jset->start; ++ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); ++} ++ ++static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) ++{ ++ /* we aren't holding j->lock: */ ++ unsigned new_size = READ_ONCE(j->buf_size_want); ++ void *new_buf; ++ ++ if (buf->buf_size >= new_size) ++ return; ++ ++ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); ++ if (!new_buf) ++ return; ++ ++ memcpy(new_buf, buf->data, buf->buf_size); ++ kvpfree(buf->data, buf->buf_size); ++ buf->data = new_buf; ++ buf->buf_size = new_size; ++} ++ ++static void journal_write_done(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *w = journal_prev_buf(j); ++ struct bch_devs_list devs = ++ bch2_bkey_devs(bkey_i_to_s_c(&w->key)); ++ struct bch_replicas_padded replicas; ++ u64 seq = le64_to_cpu(w->data->seq); ++ u64 last_seq = le64_to_cpu(w->data->last_seq); ++ ++ bch2_time_stats_update(j->write_time, j->write_start_time); ++ ++ if (!devs.nr) { ++ bch_err(c, "unable to write journal to sufficient devices"); ++ goto err; ++ } ++ ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs); ++ ++ if (bch2_mark_replicas(c, &replicas.e)) ++ goto err; ++ ++ spin_lock(&j->lock); ++ if (seq >= j->pin.front) ++ journal_seq_pin(j, seq)->devs = devs; ++ ++ j->seq_ondisk = seq; ++ j->last_seq_ondisk = last_seq; ++ bch2_journal_space_available(j); ++ ++ /* ++ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard ++ * more buckets: ++ * ++ * Must come before signaling write completion, for ++ * bch2_fs_journal_stop(): ++ */ ++ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); ++out: ++ /* also must come before signalling write completion: */ ++ closure_debug_destroy(cl); ++ ++ BUG_ON(!j->reservations.prev_buf_unwritten); ++ atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, ++ &j->reservations.counter); ++ ++ closure_wake_up(&w->wait); ++ journal_wake(j); ++ ++ if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) ++ mod_delayed_work(system_freezable_wq, &j->write_work, 0); ++ spin_unlock(&j->lock); ++ return; ++err: ++ bch2_fatal_error(c); ++ spin_lock(&j->lock); ++ goto out; ++} ++ ++static void journal_write_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ struct journal *j = &ca->fs->journal; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || ++ bch2_meta_write_fault("journal")) { ++ struct journal_buf *w = journal_prev_buf(j); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&j->err_lock, flags); ++ bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); ++ spin_unlock_irqrestore(&j->err_lock, flags); ++ } ++ ++ closure_put(&j->io); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++void bch2_journal_write(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_buf *w = journal_prev_buf(j); ++ struct jset_entry *start, *end; ++ struct jset *jset; ++ struct bio *bio; ++ struct bch_extent_ptr *ptr; ++ bool validate_before_checksum = false; ++ unsigned i, sectors, bytes, u64s; ++ int ret; ++ ++ bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); ++ ++ journal_buf_realloc(j, w); ++ jset = w->data; ++ ++ j->write_start_time = local_clock(); ++ ++ start = vstruct_last(jset); ++ end = bch2_journal_super_entries_add_common(c, start, ++ le64_to_cpu(jset->seq)); ++ u64s = 
(u64 *) end - (u64 *) start; ++ BUG_ON(u64s > j->entry_u64s_reserved); ++ ++ le32_add_cpu(&jset->u64s, u64s); ++ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); ++ ++ journal_write_compact(jset); ++ ++ jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); ++ jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); ++ jset->magic = cpu_to_le64(jset_magic(c)); ++ ++ jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ ? cpu_to_le32(BCH_JSET_VERSION_OLD) ++ : cpu_to_le32(c->sb.version); ++ ++ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); ++ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); ++ ++ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) ++ validate_before_checksum = true; ++ ++ if (le32_to_cpu(jset->version) < ++ bcachefs_metadata_version_bkey_renumber) ++ validate_before_checksum = true; ++ ++ if (validate_before_checksum && ++ jset_validate_entries(c, jset, WRITE)) ++ goto err; ++ ++ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ ++ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), ++ journal_nonce(jset), jset); ++ ++ if (!validate_before_checksum && ++ jset_validate_entries(c, jset, WRITE)) ++ goto err; ++ ++ sectors = vstruct_sectors(jset, c->block_bits); ++ BUG_ON(sectors > w->sectors); ++ ++ bytes = vstruct_bytes(jset); ++ memset((void *) jset + bytes, 0, (sectors << 9) - bytes); ++ ++ spin_lock(&j->lock); ++ ret = journal_write_alloc(j, w, sectors); ++ ++ /* ++ * write is allocated, no longer need to account for it in ++ * bch2_journal_space_available(): ++ */ ++ w->sectors = 0; ++ ++ /* ++ * journal entry has been compacted and allocated, recalculate space ++ * available: ++ */ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ if (ret) { ++ bch_err(c, "Unable to allocate journal write"); ++ bch2_fatal_error(c); ++ continue_at(cl, journal_write_done, system_highpri_wq); ++ return; ++ } ++ ++ /* ++ * XXX: we really should just disable the entire journal in nochanges ++ * mode ++ */ ++ if (c->opts.nochanges) ++ goto no_io; ++ ++ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ /* XXX: fix this */ ++ bch_err(c, "missing device for journal write\n"); ++ continue; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], ++ sectors); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = ptr->offset; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_WRITE, ++ REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); ++ bch2_bio_map(bio, jset, sectors << 9); ++ ++ trace_journal_write(bio); ++ closure_bio_submit(bio, cl); ++ ++ ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); ++ } ++ ++ for_each_rw_member(ca, c, i) ++ if (journal_flushes_device(ca) && ++ !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { ++ percpu_ref_get(&ca->io_ref); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_opf = REQ_OP_FLUSH; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ closure_bio_submit(bio, cl); ++ } ++ ++no_io: ++ bch2_bucket_seq_cleanup(c); ++ ++ continue_at(cl, journal_write_done, system_highpri_wq); ++ return; ++err: ++ bch2_inconsistent_error(c); ++ continue_at(cl, journal_write_done, system_highpri_wq); ++} +diff --git 
a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +new file mode 100644 +index 000000000000..72e575f360af +--- /dev/null ++++ b/fs/bcachefs/journal_io.h +@@ -0,0 +1,42 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_IO_H ++#define _BCACHEFS_JOURNAL_IO_H ++ ++/* ++ * Only used for holding the journal entries we read in btree_journal_read() ++ * during cache_registration ++ */ ++struct journal_replay { ++ struct list_head list; ++ struct bch_devs_list devs; ++ /* must be last: */ ++ struct jset j; ++}; ++ ++static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, ++ struct jset_entry *entry, unsigned type) ++{ ++ while (entry < vstruct_last(jset)) { ++ if (entry->type == type) ++ return entry; ++ ++ entry = vstruct_next(entry); ++ } ++ ++ return NULL; ++} ++ ++#define for_each_jset_entry_type(entry, jset, type) \ ++ for (entry = (jset)->start; \ ++ (entry = __jset_entry_type_next(jset, entry, type)); \ ++ entry = vstruct_next(entry)) ++ ++#define for_each_jset_key(k, _n, entry, jset) \ ++ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ ++ vstruct_for_each_safe(entry, k, _n) ++ ++int bch2_journal_read(struct bch_fs *, struct list_head *); ++ ++void bch2_journal_write(struct closure *); ++ ++#endif /* _BCACHEFS_JOURNAL_IO_H */ +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +new file mode 100644 +index 000000000000..695b2c8ba03b +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.c +@@ -0,0 +1,626 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++#include "super.h" ++ ++/* Free space calculations: */ ++ ++static unsigned journal_space_from(struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ switch (from) { ++ case journal_space_discarded: ++ return ja->discard_idx; ++ case journal_space_clean_ondisk: ++ return ja->dirty_idx_ondisk; ++ case journal_space_clean: ++ return ja->dirty_idx; ++ default: ++ BUG(); ++ } ++} ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *j, ++ struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ unsigned available = (journal_space_from(ja, from) - ++ ja->cur_idx - 1 + ja->nr) % ja->nr; ++ ++ /* ++ * Allocator startup needs some journal space before we can do journal ++ * replay: ++ */ ++ if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) ++ --available; ++ ++ /* ++ * Don't use the last bucket unless writing the new last_seq ++ * will make another bucket available: ++ */ ++ if (available && ja->dirty_idx_ondisk == ja->dirty_idx) ++ --available; ++ ++ return available; ++} ++ ++static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) ++{ ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ ++ do { ++ old.v = new.v = v; ++ new.remaining = u64s_remaining; ++ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++} ++ ++static struct journal_space { ++ unsigned next_entry; ++ unsigned remaining; ++} __journal_space_available(struct journal *j, unsigned nr_devs_want, ++ enum journal_space_from from) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned sectors_next_entry = UINT_MAX; ++ unsigned sectors_total = UINT_MAX; ++ unsigned i, nr_devs = 0; ++ unsigned unwritten_sectors = 
j->reservations.prev_buf_unwritten ++ ? journal_prev_buf(j)->sectors ++ : 0; ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_JOURNAL]) { ++ struct journal_device *ja = &ca->journal; ++ unsigned buckets_this_device, sectors_this_device; ++ ++ if (!ja->nr) ++ continue; ++ ++ buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); ++ sectors_this_device = ja->sectors_free; ++ ++ /* ++ * We that we don't allocate the space for a journal entry ++ * until we write it out - thus, account for it here: ++ */ ++ if (unwritten_sectors >= sectors_this_device) { ++ if (!buckets_this_device) ++ continue; ++ ++ buckets_this_device--; ++ sectors_this_device = ca->mi.bucket_size; ++ } ++ ++ sectors_this_device -= unwritten_sectors; ++ ++ if (sectors_this_device < ca->mi.bucket_size && ++ buckets_this_device) { ++ buckets_this_device--; ++ sectors_this_device = ca->mi.bucket_size; ++ } ++ ++ if (!sectors_this_device) ++ continue; ++ ++ sectors_next_entry = min(sectors_next_entry, ++ sectors_this_device); ++ ++ sectors_total = min(sectors_total, ++ buckets_this_device * ca->mi.bucket_size + ++ sectors_this_device); ++ ++ nr_devs++; ++ } ++ rcu_read_unlock(); ++ ++ if (nr_devs < nr_devs_want) ++ return (struct journal_space) { 0, 0 }; ++ ++ return (struct journal_space) { ++ .next_entry = sectors_next_entry, ++ .remaining = max_t(int, 0, sectors_total - sectors_next_entry), ++ }; ++} ++ ++void bch2_journal_space_available(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_space discarded, clean_ondisk, clean; ++ unsigned overhead, u64s_remaining = 0; ++ unsigned max_entry_size = min(j->buf[0].buf_size >> 9, ++ j->buf[1].buf_size >> 9); ++ unsigned i, nr_online = 0, nr_devs_want; ++ bool can_discard = false; ++ int ret = 0; ++ ++ lockdep_assert_held(&j->lock); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_JOURNAL]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!ja->nr) ++ continue; ++ ++ while (ja->dirty_idx != ja->cur_idx && ++ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ ++ while (ja->dirty_idx_ondisk != ja->dirty_idx && ++ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ ++ if (ja->discard_idx != ja->dirty_idx_ondisk) ++ can_discard = true; ++ ++ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); ++ nr_online++; ++ } ++ rcu_read_unlock(); ++ ++ j->can_discard = can_discard; ++ ++ if (nr_online < c->opts.metadata_replicas_required) { ++ ret = -EROFS; ++ goto out; ++ } ++ ++ if (!fifo_free(&j->pin)) { ++ ret = -ENOSPC; ++ goto out; ++ } ++ ++ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); ++ ++ discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); ++ clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); ++ clean = __journal_space_available(j, nr_devs_want, journal_space_clean); ++ ++ if (!discarded.next_entry) ++ ret = -ENOSPC; ++ ++ overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * ++ journal_entry_overhead(j); ++ u64s_remaining = clean.remaining << 6; ++ u64s_remaining = max_t(int, 0, u64s_remaining - overhead); ++ u64s_remaining /= 4; ++out: ++ j->cur_entry_sectors = !ret ? 
discarded.next_entry : 0; ++ j->cur_entry_error = ret; ++ journal_set_remaining(j, u64s_remaining); ++ journal_check_may_get_unreserved(j); ++ ++ if (!ret) ++ journal_wake(j); ++} ++ ++/* Discards - last part of journal reclaim: */ ++ ++static bool should_discard_bucket(struct journal *j, struct journal_device *ja) ++{ ++ bool ret; ++ ++ spin_lock(&j->lock); ++ ret = ja->discard_idx != ja->dirty_idx_ondisk; ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* ++ * Advance ja->discard_idx as long as it points to buckets that are no longer ++ * dirty, issuing discards if necessary: ++ */ ++void bch2_journal_do_discards(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned iter; ++ ++ mutex_lock(&j->discard_lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ ++ while (should_discard_bucket(j, ja)) { ++ if (ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, ++ ja->buckets[ja->discard_idx]), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++ ++ spin_lock(&j->lock); ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ } ++ } ++ ++ mutex_unlock(&j->discard_lock); ++} ++ ++/* ++ * Journal entry pinning - machinery for holding a reference on a given journal ++ * entry, holding it open to ensure it gets replayed during recovery: ++ */ ++ ++static void bch2_journal_reclaim_fast(struct journal *j) ++{ ++ struct journal_entry_pin_list temp; ++ bool popped = false; ++ ++ lockdep_assert_held(&j->lock); ++ ++ /* ++ * Unpin journal entries whose reference counts reached zero, meaning ++ * all btree nodes got written out ++ */ ++ while (!fifo_empty(&j->pin) && ++ !atomic_read(&fifo_peek_front(&j->pin).count)) { ++ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); ++ BUG_ON(!fifo_pop(&j->pin, temp)); ++ popped = true; ++ } ++ ++ if (popped) ++ bch2_journal_space_available(j); ++} ++ ++void bch2_journal_pin_put(struct journal *j, u64 seq) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ if (atomic_dec_and_test(&pin_list->count)) { ++ spin_lock(&j->lock); ++ bch2_journal_reclaim_fast(j); ++ spin_unlock(&j->lock); ++ } ++} ++ ++static inline void __journal_pin_add(struct journal *j, ++ u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ BUG_ON(journal_pin_active(pin)); ++ BUG_ON(!atomic_read(&pin_list->count)); ++ ++ atomic_inc(&pin_list->count); ++ pin->seq = seq; ++ pin->flush = flush_fn; ++ ++ list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ spin_lock(&j->lock); ++ __journal_pin_add(j, seq, pin, flush_fn); ++ spin_unlock(&j->lock); ++} ++ ++static inline void __journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ struct journal_entry_pin_list *pin_list; ++ ++ if (!journal_pin_active(pin)) ++ return; ++ ++ pin_list = journal_seq_pin(j, pin->seq); ++ pin->seq = 0; ++ list_del_init(&pin->list); ++ ++ /* ++ * Unpinning a journal entry make make journal_next_bucket() succeed, if ++ * writing a new last_seq will now make another bucket available: ++ */ ++ if (atomic_dec_and_test(&pin_list->count) && ++ pin_list == &fifo_peek_front(&j->pin)) ++ bch2_journal_reclaim_fast(j); ++ else if (fifo_used(&j->pin) == 1 && ++ atomic_read(&pin_list->count) == 1) ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ spin_lock(&j->lock); ++ __journal_pin_drop(j, pin); ++ spin_unlock(&j->lock); ++} ++ ++void bch2_journal_pin_update(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ spin_lock(&j->lock); ++ ++ if (pin->seq != seq) { ++ __journal_pin_drop(j, pin); ++ __journal_pin_add(j, seq, pin, flush_fn); ++ } else { ++ struct journal_entry_pin_list *pin_list = ++ journal_seq_pin(j, seq); ++ ++ list_move(&pin->list, &pin_list->list); ++ } ++ ++ spin_unlock(&j->lock); ++} ++ ++void bch2_journal_pin_add_if_older(struct journal *j, ++ struct journal_entry_pin *src_pin, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ spin_lock(&j->lock); ++ ++ if (journal_pin_active(src_pin) && ++ (!journal_pin_active(pin) || ++ src_pin->seq < pin->seq)) { ++ __journal_pin_drop(j, pin); ++ __journal_pin_add(j, src_pin->seq, pin, flush_fn); ++ } ++ ++ spin_unlock(&j->lock); ++} ++ ++void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) ++{ ++ BUG_ON(journal_pin_active(pin)); ++ ++ wait_event(j->pin_flush_wait, j->flush_in_progress != pin); ++} ++ ++/* ++ * Journal reclaim: flush references to open journal entries to reclaim space in ++ * the journal ++ * ++ * May be done by the journal code in the background as needed to free up space ++ * for more journal entries, or as part of doing a clean shutdown, or to migrate ++ * data off of a specific device: ++ */ ++ ++static struct journal_entry_pin * ++journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *ret = NULL; ++ ++ spin_lock(&j->lock); ++ ++ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) ++ if (*seq > max_seq || ++ (ret = list_first_entry_or_null(&pin_list->list, ++ struct journal_entry_pin, list))) ++ break; ++ ++ if (ret) { ++ list_move(&ret->list, &pin_list->flushed); ++ BUG_ON(j->flush_in_progress); ++ j->flush_in_progress = ret; ++ j->last_flushed = jiffies; ++ } ++ ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++static void journal_flush_pins(struct journal *j, u64 seq_to_flush, ++ unsigned min_nr) ++{ ++ struct journal_entry_pin *pin; ++ u64 seq; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ while ((pin = journal_get_next_pin(j, min_nr ++ ? 
U64_MAX : seq_to_flush, &seq))) { ++ if (min_nr) ++ min_nr--; ++ ++ pin->flush(j, pin, seq); ++ ++ BUG_ON(j->flush_in_progress != pin); ++ j->flush_in_progress = NULL; ++ wake_up(&j->pin_flush_wait); ++ } ++} ++ ++/** ++ * bch2_journal_reclaim - free up journal buckets ++ * ++ * Background journal reclaim writes out btree nodes. It should be run ++ * early enough so that we never completely run out of journal buckets. ++ * ++ * High watermarks for triggering background reclaim: ++ * - FIFO has fewer than 512 entries left ++ * - fewer than 25% journal buckets free ++ * ++ * Background reclaim runs until low watermarks are reached: ++ * - FIFO has more than 1024 entries left ++ * - more than 50% journal buckets free ++ * ++ * As long as a reclaim can complete in the time it takes to fill up ++ * 512 journal entries or 25% of all journal buckets, then ++ * journal_next_bucket() should not stall. ++ */ ++void bch2_journal_reclaim(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned iter, min_nr = 0; ++ u64 seq_to_flush = 0; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ bch2_journal_do_discards(j); ++ ++ spin_lock(&j->lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ unsigned nr_buckets, bucket_to_flush; ++ ++ if (!ja->nr) ++ continue; ++ ++ /* Try to keep the journal at most half full: */ ++ nr_buckets = ja->nr / 2; ++ ++ /* And include pre-reservations: */ ++ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, ++ (ca->mi.bucket_size << 6) - ++ journal_entry_overhead(j)); ++ ++ nr_buckets = min(nr_buckets, ja->nr); ++ ++ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; ++ seq_to_flush = max(seq_to_flush, ++ ja->bucket_seq[bucket_to_flush]); ++ } ++ ++ /* Also flush if the pin fifo is more than half full */ ++ seq_to_flush = max_t(s64, seq_to_flush, ++ (s64) journal_cur_seq(j) - ++ (j->pin.size >> 1)); ++ spin_unlock(&j->lock); ++ ++ /* ++ * If it's been longer than j->reclaim_delay_ms since we last flushed, ++ * make sure to flush at least one journal pin: ++ */ ++ if (time_after(jiffies, j->last_flushed + ++ msecs_to_jiffies(j->reclaim_delay_ms))) ++ min_nr = 1; ++ ++ if (j->prereserved.reserved * 2 > j->prereserved.remaining) { ++ seq_to_flush = max(seq_to_flush, journal_last_seq(j)); ++ min_nr = 1; ++ } ++ ++ journal_flush_pins(j, seq_to_flush, min_nr); ++ ++ if (!bch2_journal_error(j)) ++ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, ++ msecs_to_jiffies(j->reclaim_delay_ms)); ++} ++ ++void bch2_journal_reclaim_work(struct work_struct *work) ++{ ++ struct journal *j = container_of(to_delayed_work(work), ++ struct journal, reclaim_work); ++ ++ mutex_lock(&j->reclaim_lock); ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++} ++ ++static int journal_flush_done(struct journal *j, u64 seq_to_flush) ++{ ++ int ret; ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&j->reclaim_lock); ++ ++ journal_flush_pins(j, seq_to_flush, 0); ++ ++ spin_lock(&j->lock); ++ /* ++ * If journal replay hasn't completed, the unreplayed journal entries ++ * hold refs on their corresponding sequence numbers ++ */ ++ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || ++ journal_last_seq(j) > seq_to_flush || ++ (fifo_used(&j->pin) == 1 && ++ atomic_read(&fifo_peek_front(&j->pin).count) == 1); ++ ++ spin_unlock(&j->lock); ++ mutex_unlock(&j->reclaim_lock); ++ ++ return ret; ++} ++ ++void bch2_journal_flush_pins(struct journal *j, u64 
seq_to_flush) ++{ ++ if (!test_bit(JOURNAL_STARTED, &j->flags)) ++ return; ++ ++ closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush)); ++} ++ ++int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ u64 iter, seq = 0; ++ int ret = 0; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(p, &j->pin, iter) ++ if (dev_idx >= 0 ++ ? bch2_dev_list_has_dev(p->devs, dev_idx) ++ : p->devs.nr < c->opts.metadata_replicas) ++ seq = iter; ++ spin_unlock(&j->lock); ++ ++ bch2_journal_flush_pins(j, seq); ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->replicas_gc_lock); ++ bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); ++ ++ seq = 0; ++ ++ spin_lock(&j->lock); ++ while (!ret && seq < j->pin.back) { ++ struct bch_replicas_padded replicas; ++ ++ seq = max(seq, journal_last_seq(j)); ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, ++ journal_seq_pin(j, seq)->devs); ++ seq++; ++ ++ spin_unlock(&j->lock); ++ ret = bch2_mark_replicas(c, &replicas.e); ++ spin_lock(&j->lock); ++ } ++ spin_unlock(&j->lock); ++ ++ ret = bch2_replicas_gc_end(c, ret); ++ mutex_unlock(&c->replicas_gc_lock); ++ ++ return ret; ++} +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +new file mode 100644 +index 000000000000..9bf982a17797 +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_RECLAIM_H ++#define _BCACHEFS_JOURNAL_RECLAIM_H ++ ++#define JOURNAL_PIN (32 * 1024) ++ ++enum journal_space_from { ++ journal_space_discarded, ++ journal_space_clean_ondisk, ++ journal_space_clean, ++}; ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *, ++ struct journal_device *, ++ enum journal_space_from); ++void bch2_journal_space_available(struct journal *); ++ ++static inline bool journal_pin_active(struct journal_entry_pin *pin) ++{ ++ return pin->seq != 0; ++} ++ ++static inline struct journal_entry_pin_list * ++journal_seq_pin(struct journal *j, u64 seq) ++{ ++ EBUG_ON(seq < j->pin.front || seq >= j->pin.back); ++ ++ return &j->pin.data[seq & j->pin.mask]; ++} ++ ++void bch2_journal_pin_put(struct journal *, u64); ++ ++void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, ++ journal_pin_flush_fn); ++void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *, ++ journal_pin_flush_fn); ++void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); ++void bch2_journal_pin_add_if_older(struct journal *, ++ struct journal_entry_pin *, ++ struct journal_entry_pin *, ++ journal_pin_flush_fn); ++void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); ++ ++void bch2_journal_do_discards(struct journal *); ++void bch2_journal_reclaim(struct journal *); ++void bch2_journal_reclaim_work(struct work_struct *); ++ ++void bch2_journal_flush_pins(struct journal *, u64); ++ ++static inline void bch2_journal_flush_all_pins(struct journal *j) ++{ ++ bch2_journal_flush_pins(j, U64_MAX); ++} ++ ++int bch2_journal_flush_device_pins(struct journal *, int); ++ ++#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +new file mode 100644 +index 000000000000..787d9f7638d0 +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -0,0 +1,318 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" 
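The file that starts here maintains the on-disk list of blacklisted journal sequence ranges plus an in-memory lookup table; its central query, bch2_journal_seq_is_blacklisted() further below, asks whether a given sequence number falls inside any recorded [start, end) range. As a simplified editorial sketch only (a plain sorted array and a linear scan, where the real code uses an eytzinger-ordered table and eytzinger0_find_le(); the names here are hypothetical), the check amounts to:

	struct seq_range { u64 start, end; };	/* sorted by start, non-overlapping */

	static bool seq_is_blacklisted(const struct seq_range *r, unsigned nr, u64 seq)
	{
		unsigned i;

		for (i = 0; i < nr; i++) {
			if (seq < r[i].start)
				break;		/* ranges are sorted: no later range can match */
			if (seq < r[i].end)
				return true;	/* start <= seq < end */
		}

		return false;
	}
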
++#include "btree_iter.h" ++#include "eytzinger.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++/* ++ * journal_seq_blacklist machinery: ++ * ++ * To guarantee order of btree updates after a crash, we need to detect when a ++ * btree node entry (bset) is newer than the newest journal entry that was ++ * successfully written, and ignore it - effectively ignoring any btree updates ++ * that didn't make it into the journal. ++ * ++ * If we didn't do this, we might have two btree nodes, a and b, both with ++ * updates that weren't written to the journal yet: if b was updated after a, ++ * but b was flushed and not a - oops; on recovery we'll find that the updates ++ * to b happened, but not the updates to a that happened before it. ++ * ++ * Ignoring bsets that are newer than the newest journal entry is always safe, ++ * because everything they contain will also have been journalled - and must ++ * still be present in the journal on disk until a journal entry has been ++ * written _after_ that bset was written. ++ * ++ * To accomplish this, bsets record the newest journal sequence number they ++ * contain updates for; then, on startup, the btree code queries the journal ++ * code to ask "Is this sequence number newer than the newest journal entry? If ++ * so, ignore it." ++ * ++ * When this happens, we must blacklist that journal sequence number: the ++ * journal must not write any entries with that sequence number, and it must ++ * record that it was blacklisted so that a) on recovery we don't think we have ++ * missing journal entries and b) so that the btree code continues to ignore ++ * that bset, until that btree node is rewritten. ++ */ ++ ++static unsigned ++blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) ++{ ++ return bl ++ ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / ++ sizeof(struct journal_seq_blacklist_entry)) ++ : 0; ++} ++ ++static unsigned sb_blacklist_u64s(unsigned nr) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ ++ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); ++} ++ ++static struct bch_sb_field_journal_seq_blacklist * ++blacklist_entry_try_merge(struct bch_fs *c, ++ struct bch_sb_field_journal_seq_blacklist *bl, ++ unsigned i) ++{ ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ if (le64_to_cpu(bl->start[i].end) >= ++ le64_to_cpu(bl->start[i + 1].start)) { ++ bl->start[i].end = bl->start[i + 1].end; ++ --nr; ++ memmove(&bl->start[i], ++ &bl->start[i + 1], ++ sizeof(bl->start[0]) * (nr - i)); ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr)); ++ BUG_ON(!bl); ++ } ++ ++ return bl; ++} ++ ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ unsigned i, nr; ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ nr = blacklist_nr_entries(bl); ++ ++ if (bl) { ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = ++ bl->start + i; ++ ++ if (start == le64_to_cpu(e->start) && ++ end == le64_to_cpu(e->end)) ++ goto out; ++ ++ if (start <= le64_to_cpu(e->start) && ++ end >= le64_to_cpu(e->end)) { ++ e->start = cpu_to_le64(start); ++ e->end = cpu_to_le64(end); ++ ++ if (i + 1 < nr) ++ bl = blacklist_entry_try_merge(c, ++ bl, i); ++ if (i) ++ bl = blacklist_entry_try_merge(c, ++ bl, i - 1); ++ goto out_write_sb; ++ } ++ } ++ } ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr + 1)); ++ if (!bl) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ bl->start[nr].start = cpu_to_le64(start); ++ bl->start[nr].end = cpu_to_le64(end); ++out_write_sb: ++ c->disk_sb.sb->features[0] |= ++ 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3; ++ ++ ret = bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static int journal_seq_blacklist_table_cmp(const void *_l, ++ const void *_r, size_t size) ++{ ++ const struct journal_seq_blacklist_table_entry *l = _l; ++ const struct journal_seq_blacklist_table_entry *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, ++ bool dirty) ++{ ++ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; ++ struct journal_seq_blacklist_table_entry search = { .start = seq }; ++ int idx; ++ ++ if (!t) ++ return false; ++ ++ idx = eytzinger0_find_le(t->entries, t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ &search); ++ if (idx < 0) ++ return false; ++ ++ BUG_ON(t->entries[idx].start > seq); ++ ++ if (seq >= t->entries[idx].end) ++ return false; ++ ++ if (dirty) ++ t->entries[idx].dirty = true; ++ return true; ++} ++ ++int bch2_blacklist_table_initialize(struct bch_fs *c) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ struct journal_seq_blacklist_table *t; ++ unsigned i, nr = blacklist_nr_entries(bl); ++ ++ BUG_ON(c->journal_seq_blacklist_table); ++ ++ if (!bl) ++ return 0; ++ ++ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, ++ GFP_KERNEL); ++ if (!t) ++ return -ENOMEM; ++ ++ t->nr = nr; ++ ++ for (i = 0; i < nr; i++) { ++ t->entries[i].start = le64_to_cpu(bl->start[i].start); ++ t->entries[i].end = le64_to_cpu(bl->start[i].end); ++ } ++ ++ 
eytzinger0_sort(t->entries, ++ t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ NULL); ++ ++ c->journal_seq_blacklist_table = t; ++ return 0; ++} ++ ++static const char * ++bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ struct journal_seq_blacklist_entry *i; ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ for (i = bl->start; i < bl->start + nr; i++) { ++ if (le64_to_cpu(i->start) >= ++ le64_to_cpu(i->end)) ++ return "entry start >= end"; ++ ++ if (i + 1 < bl->start + nr && ++ le64_to_cpu(i[0].end) > ++ le64_to_cpu(i[1].start)) ++ return "entries out of order"; ++ } ++ ++ return NULL; ++} ++ ++static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ struct journal_seq_blacklist_entry *i; ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ for (i = bl->start; i < bl->start + nr; i++) { ++ if (i != bl->start) ++ pr_buf(out, " "); ++ ++ pr_buf(out, "%llu-%llu", ++ le64_to_cpu(i->start), ++ le64_to_cpu(i->end)); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { ++ .validate = bch2_sb_journal_seq_blacklist_validate, ++ .to_text = bch2_sb_journal_seq_blacklist_to_text ++}; ++ ++void bch2_blacklist_entries_gc(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ journal_seq_blacklist_gc_work); ++ struct journal_seq_blacklist_table *t; ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ struct journal_seq_blacklist_entry *src, *dst; ++ struct btree_trans trans; ++ unsigned i, nr, new_nr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_iter *iter; ++ struct btree *b; ++ ++ for_each_btree_node(&trans, iter, i, POS_MIN, ++ BTREE_ITER_PREFETCH, b) ++ if (test_bit(BCH_FS_STOPPING, &c->flags)) { ++ bch2_trans_exit(&trans); ++ return; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ ret = bch2_trans_exit(&trans); ++ if (ret) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ if (!bl) ++ goto out; ++ ++ nr = blacklist_nr_entries(bl); ++ dst = bl->start; ++ ++ t = c->journal_seq_blacklist_table; ++ BUG_ON(nr != t->nr); ++ ++ for (src = bl->start, i = eytzinger0_first(t->nr); ++ src < bl->start + nr; ++ src++, i = eytzinger0_next(i, nr)) { ++ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); ++ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); ++ ++ if (t->entries[i].dirty) ++ *dst++ = *src; ++ } ++ ++ new_nr = dst - bl->start; ++ ++ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); ++ ++ if (new_nr != nr) { ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ new_nr ? 
sb_blacklist_u64s(new_nr) : 0); ++ BUG_ON(new_nr && !bl); ++ ++ if (!new_nr) ++ c->disk_sb.sb->features[0] &= ++ ~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3); ++ ++ bch2_write_super(c); ++ } ++out: ++ mutex_unlock(&c->sb_lock); ++} +diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h +new file mode 100644 +index 000000000000..03f4b97247fd +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.h +@@ -0,0 +1,13 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); ++int bch2_blacklist_table_initialize(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; ++ ++void bch2_blacklist_entries_gc(struct work_struct *); ++ ++#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +new file mode 100644 +index 000000000000..8eea12a03c06 +--- /dev/null ++++ b/fs/bcachefs/journal_types.h +@@ -0,0 +1,276 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_TYPES_H ++#define _BCACHEFS_JOURNAL_TYPES_H ++ ++#include ++#include ++ ++#include "alloc_types.h" ++#include "super_types.h" ++#include "fifo.h" ++ ++struct journal_res; ++ ++/* ++ * We put two of these in struct journal; we used them for writes to the ++ * journal that are being staged or in flight. ++ */ ++struct journal_buf { ++ struct jset *data; ++ ++ BKEY_PADDED(key); ++ ++ struct closure_waitlist wait; ++ ++ unsigned buf_size; /* size in bytes of @data */ ++ unsigned sectors; /* maximum size for current entry */ ++ unsigned disk_sectors; /* maximum size entry could have been, if ++ buf_size was bigger */ ++ unsigned u64s_reserved; ++ /* bloom filter: */ ++ unsigned long has_inode[1024 / sizeof(unsigned long)]; ++}; ++ ++/* ++ * Something that makes a journal entry dirty - i.e. 
a btree node that has to be ++ * flushed: ++ */ ++ ++struct journal_entry_pin_list { ++ struct list_head list; ++ struct list_head flushed; ++ atomic_t count; ++ struct bch_devs_list devs; ++}; ++ ++struct journal; ++struct journal_entry_pin; ++typedef void (*journal_pin_flush_fn)(struct journal *j, ++ struct journal_entry_pin *, u64); ++ ++struct journal_entry_pin { ++ struct list_head list; ++ journal_pin_flush_fn flush; ++ u64 seq; ++}; ++ ++struct journal_res { ++ bool ref; ++ u8 idx; ++ u16 u64s; ++ u32 offset; ++ u64 seq; ++}; ++ ++/* ++ * For reserving space in the journal prior to getting a reservation on a ++ * particular journal entry: ++ */ ++struct journal_preres { ++ unsigned u64s; ++}; ++ ++union journal_res_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u64 cur_entry_offset:20, ++ idx:1, ++ prev_buf_unwritten:1, ++ buf0_count:21, ++ buf1_count:21; ++ }; ++}; ++ ++union journal_preres_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u32 reserved; ++ u32 remaining; ++ }; ++}; ++ ++/* bytes: */ ++#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ ++#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ ++ ++/* ++ * We stash some journal state as sentinal values in cur_entry_offset: ++ * note - cur_entry_offset is in units of u64s ++ */ ++#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) ++ ++#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) ++#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) ++ ++/* ++ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, ++ * either because something's waiting on the write to complete or because it's ++ * been dirty too long and the timer's expired. ++ */ ++ ++enum { ++ JOURNAL_REPLAY_DONE, ++ JOURNAL_STARTED, ++ JOURNAL_NEED_WRITE, ++ JOURNAL_NOT_EMPTY, ++ JOURNAL_MAY_GET_UNRESERVED, ++}; ++ ++/* Embedded in struct bch_fs */ ++struct journal { ++ /* Fastpath stuff up front: */ ++ ++ unsigned long flags; ++ ++ union journal_res_state reservations; ++ ++ /* Max size of current journal entry */ ++ unsigned cur_entry_u64s; ++ unsigned cur_entry_sectors; ++ ++ /* ++ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if ++ * insufficient devices: ++ */ ++ int cur_entry_error; ++ ++ union journal_preres_state prereserved; ++ ++ /* Reserved space in journal entry to be used just prior to write */ ++ unsigned entry_u64s_reserved; ++ ++ unsigned buf_size_want; ++ ++ /* ++ * Two journal entries -- one is currently open for new entries, the ++ * other is possibly being written out. ++ */ ++ struct journal_buf buf[2]; ++ ++ spinlock_t lock; ++ ++ /* if nonzero, we may not open a new journal entry: */ ++ unsigned blocked; ++ ++ /* Used when waiting because the journal was full */ ++ wait_queue_head_t wait; ++ struct closure_waitlist async_wait; ++ struct closure_waitlist preres_wait; ++ ++ struct closure io; ++ struct delayed_work write_work; ++ ++ /* Sequence number of most recent journal entry (last entry in @pin) */ ++ atomic64_t seq; ++ ++ /* seq, last_seq from the most recent journal entry successfully written */ ++ u64 seq_ondisk; ++ u64 last_seq_ondisk; ++ ++ /* ++ * FIFO of journal entries whose btree updates have not yet been ++ * written out. ++ * ++ * Each entry is a reference count. The position in the FIFO is the ++ * entry's sequence number relative to @seq. ++ * ++ * The journal entry itself holds a reference count, put when the ++ * journal entry is written out. 
Each btree node modified by the journal ++ * entry also holds a reference count, put when the btree node is ++ * written. ++ * ++ * When a reference count reaches zero, the journal entry is no longer ++ * needed. When all journal entries in the oldest journal bucket are no ++ * longer needed, the bucket can be discarded and reused. ++ */ ++ struct { ++ u64 front, back, size, mask; ++ struct journal_entry_pin_list *data; ++ } pin; ++ ++ u64 replay_journal_seq; ++ u64 replay_journal_seq_end; ++ ++ struct write_point wp; ++ spinlock_t err_lock; ++ ++ struct delayed_work reclaim_work; ++ struct mutex reclaim_lock; ++ unsigned long last_flushed; ++ struct journal_entry_pin *flush_in_progress; ++ wait_queue_head_t pin_flush_wait; ++ ++ /* protects advancing ja->discard_idx: */ ++ struct mutex discard_lock; ++ bool can_discard; ++ ++ unsigned write_delay_ms; ++ unsigned reclaim_delay_ms; ++ ++ u64 res_get_blocked_start; ++ u64 need_write_time; ++ u64 write_start_time; ++ ++ struct time_stats *write_time; ++ struct time_stats *delay_time; ++ struct time_stats *blocked_time; ++ struct time_stats *flush_seq_time; ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map res_map; ++#endif ++}; ++ ++/* ++ * Embedded in struct bch_dev. First three fields refer to the array of journal ++ * buckets, in bch_sb. ++ */ ++struct journal_device { ++ /* ++ * For each journal bucket, contains the max sequence number of the ++ * journal writes it contains - so we know when a bucket can be reused. ++ */ ++ u64 *bucket_seq; ++ ++ unsigned sectors_free; ++ ++ /* ++ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: ++ */ ++ unsigned discard_idx; /* Next bucket to discard */ ++ unsigned dirty_idx_ondisk; ++ unsigned dirty_idx; ++ unsigned cur_idx; /* Journal bucket we're currently writing to */ ++ unsigned nr; ++ ++ u64 *buckets; ++ ++ /* Bio for journal reads/writes to this device */ ++ struct bio *bio; ++ ++ /* for bch_journal_read_device */ ++ struct closure read; ++}; ++ ++/* ++ * journal_entry_res - reserve space in every journal entry: ++ */ ++struct journal_entry_res { ++ unsigned u64s; ++}; ++ ++#endif /* _BCACHEFS_JOURNAL_TYPES_H */ +diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c +new file mode 100644 +index 000000000000..5da54ced9cad +--- /dev/null ++++ b/fs/bcachefs/keylist.c +@@ -0,0 +1,67 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "keylist.h" ++ ++int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, ++ size_t nr_inline_u64s, size_t new_u64s) ++{ ++ size_t oldsize = bch_keylist_u64s(l); ++ size_t newsize = oldsize + new_u64s; ++ u64 *old_buf = l->keys_p == inline_u64s ? 
NULL : l->keys_p; ++ u64 *new_keys; ++ ++ newsize = roundup_pow_of_two(newsize); ++ ++ if (newsize <= nr_inline_u64s || ++ (old_buf && roundup_pow_of_two(oldsize) == newsize)) ++ return 0; ++ ++ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); ++ if (!new_keys) ++ return -ENOMEM; ++ ++ if (!old_buf) ++ memcpy_u64s(new_keys, inline_u64s, oldsize); ++ ++ l->keys_p = new_keys; ++ l->top_p = new_keys + oldsize; ++ ++ return 0; ++} ++ ++void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) ++{ ++ struct bkey_i *where; ++ ++ for_each_keylist_key(l, where) ++ if (bkey_cmp(insert->k.p, where->k.p) < 0) ++ break; ++ ++ memmove_u64s_up((u64 *) where + insert->k.u64s, ++ where, ++ ((u64 *) l->top) - ((u64 *) where)); ++ ++ l->top_p += insert->k.u64s; ++ bkey_copy(where, insert); ++} ++ ++void bch2_keylist_pop_front(struct keylist *l) ++{ ++ l->top_p -= bch2_keylist_front(l)->k.u64s; ++ ++ memmove_u64s_down(l->keys, ++ bkey_next(l->keys), ++ bch_keylist_u64s(l)); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *l) ++{ ++ struct bkey_i *k; ++ ++ for_each_keylist_key(l, k) ++ BUG_ON(bkey_next(k) != l->top && ++ bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); ++} ++#endif +diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h +new file mode 100644 +index 000000000000..a7ff86b08abc +--- /dev/null ++++ b/fs/bcachefs/keylist.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_H ++#define _BCACHEFS_KEYLIST_H ++ ++#include "keylist_types.h" ++ ++int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); ++void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); ++void bch2_keylist_pop_front(struct keylist *); ++ ++static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) ++{ ++ l->top_p = l->keys_p = inline_keys; ++} ++ ++static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) ++{ ++ if (l->keys_p != inline_keys) ++ kfree(l->keys_p); ++ bch2_keylist_init(l, inline_keys); ++} ++ ++static inline void bch2_keylist_push(struct keylist *l) ++{ ++ l->top = bkey_next(l->top); ++} ++ ++static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) ++{ ++ bkey_copy(l->top, k); ++ bch2_keylist_push(l); ++} ++ ++static inline bool bch2_keylist_empty(struct keylist *l) ++{ ++ return l->top == l->keys; ++} ++ ++static inline size_t bch_keylist_u64s(struct keylist *l) ++{ ++ return l->top_p - l->keys_p; ++} ++ ++static inline size_t bch2_keylist_bytes(struct keylist *l) ++{ ++ return bch_keylist_u64s(l) * sizeof(u64); ++} ++ ++static inline struct bkey_i *bch2_keylist_front(struct keylist *l) ++{ ++ return l->keys; ++} ++ ++#define for_each_keylist_key(_keylist, _k) \ ++ for (_k = (_keylist)->keys; \ ++ _k != (_keylist)->top; \ ++ _k = bkey_next(_k)) ++ ++static inline u64 keylist_sectors(struct keylist *keys) ++{ ++ struct bkey_i *k; ++ u64 ret = 0; ++ ++ for_each_keylist_key(keys, k) ++ ret += k->k.size; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *); ++#else ++static inline void bch2_verify_keylist_sorted(struct keylist *l) {} ++#endif ++ ++#endif /* _BCACHEFS_KEYLIST_H */ +diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h +new file mode 100644 +index 000000000000..4b3ff7d8a875 +--- /dev/null ++++ b/fs/bcachefs/keylist_types.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_TYPES_H ++#define _BCACHEFS_KEYLIST_TYPES_H ++ 
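The keylist helpers above (bch2_keylist_realloc(), bch2_keylist_add(), bch2_keylist_pop_front(), ...) operate on the two-pointer struct keylist declared just below: keys_p points at the start of the buffer and top_p one past the last queued key, with an inline buffer that is transparently spilled onto the heap. A minimal usage sketch, assuming a hypothetical caller with some struct bkey_i *insert to queue (not part of the patch):

	u64 inline_keys[32];
	struct keylist keys;
	struct bkey_i *k;

	bch2_keylist_init(&keys, inline_keys);

	/* make room for one more key, reallocating onto the heap if needed: */
	if (!bch2_keylist_realloc(&keys, inline_keys,
				  ARRAY_SIZE(inline_keys), insert->k.u64s))
		bch2_keylist_add(&keys, insert);

	while (!bch2_keylist_empty(&keys)) {
		k = bch2_keylist_front(&keys);
		/* ... process k ... */
		bch2_keylist_pop_front(&keys);
	}

	bch2_keylist_free(&keys, inline_keys);
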
++struct keylist { ++ union { ++ struct bkey_i *keys; ++ u64 *keys_p; ++ }; ++ union { ++ struct bkey_i *top; ++ u64 *top_p; ++ }; ++}; ++ ++#endif /* _BCACHEFS_KEYLIST_TYPES_H */ +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +new file mode 100644 +index 000000000000..de8522f754e2 +--- /dev/null ++++ b/fs/bcachefs/migrate.c +@@ -0,0 +1,187 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for moving data off a device. ++ */ ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "extents.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "migrate.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, ++ unsigned dev_idx, int flags, bool metadata) ++{ ++ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; ++ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; ++ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; ++ unsigned nr_good; ++ ++ bch2_bkey_drop_device(k, dev_idx); ++ ++ nr_good = bch2_bkey_durability(c, k.s_c); ++ if ((!nr_good && !(flags & lost)) || ++ (nr_good < replicas && !(flags & degraded))) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, ++ enum btree_id btree_id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ BKEY_PADDED(key) tmp; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k))) { ++ if (!bch2_bkey_has_device(k, dev_idx)) { ++ ret = bch2_mark_bkey_replicas(c, k); ++ if (ret) ++ break; ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ bkey_reassemble(&tmp.key, k); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key), ++ dev_idx, flags, false); ++ if (ret) ++ break; ++ ++ /* ++ * If the new extent no longer has any pointers, bch2_extent_normalize() ++ * will do the appropriate thing with it (turning it into a ++ * KEY_TYPE_error key, or just a discard if it was a cached extent) ++ */ ++ bch2_extent_normalize(c, bkey_i_to_s(&tmp.key)); ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&tmp.key.k)); ++ ++ bch2_trans_update(&trans, iter, &tmp.key); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL); ++ ++ /* ++ * don't want to leave ret == -EINTR, since if we raced and ++ * something else overwrote the key we could spuriously return ++ * -EINTR below: ++ */ ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ BUG_ON(ret == -EINTR); ++ ++ return ret; ++} ++ ++static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: ++ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); ++} ++ ++static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct closure cl; ++ struct btree *b; ++ unsigned id; ++ int ret; ++ ++ /* don't handle this yet: */ ++ if (flags & BCH_FORCE_IF_METADATA_LOST) ++ return -EINVAL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ closure_init_stack(&cl); ++ ++ for (id = 0; id < 
BTREE_ID_NR; id++) { ++ for_each_btree_node(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH, b) { ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ struct bkey_i_btree_ptr *new_key; ++retry: ++ if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), ++ dev_idx)) { ++ /* ++ * we might have found a btree node key we ++ * needed to update, and then tried to update it ++ * but got -EINTR after upgrading the iter, but ++ * then raced and the node is now gone: ++ */ ++ bch2_btree_iter_downgrade(iter); ++ ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); ++ if (ret) ++ goto err; ++ } else { ++ bkey_copy(&tmp.k, &b->key); ++ new_key = bkey_i_to_btree_ptr(&tmp.k); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), ++ dev_idx, flags, true); ++ if (ret) ++ goto err; ++ ++ ret = bch2_btree_node_update_key(c, iter, b, new_key); ++ if (ret == -EINTR) { ++ b = bch2_btree_iter_peek_node(iter); ++ goto retry; ++ } ++ if (ret) ++ goto err; ++ } ++ } ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ /* flush relevant btree updates */ ++ while (1) { ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c) || ++ c->btree_roots_dirty); ++ if (!bch2_btree_interior_updates_nr_pending(c)) ++ break; ++ bch2_journal_meta(&c->journal); ++ } ++ ++ ret = 0; ++err: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ BUG_ON(ret == -EINTR); ++ ++ return ret; ++} ++ ++int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: ++ bch2_dev_metadata_drop(c, dev_idx, flags) ?: ++ bch2_replicas_gc2(c); ++} +diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h +new file mode 100644 +index 000000000000..027efaa0d575 +--- /dev/null ++++ b/fs/bcachefs/migrate.h +@@ -0,0 +1,7 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MIGRATE_H ++#define _BCACHEFS_MIGRATE_H ++ ++int bch2_dev_data_drop(struct bch_fs *, unsigned, int); ++ ++#endif /* _BCACHEFS_MIGRATE_H */ +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +new file mode 100644 +index 000000000000..ab20e981145b +--- /dev/null ++++ b/fs/bcachefs/move.c +@@ -0,0 +1,804 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "inode.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "keylist.h" ++ ++#include ++#include ++ ++#include ++ ++#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 ++ ++struct moving_io { ++ struct list_head list; ++ struct closure cl; ++ bool read_completed; ++ ++ unsigned read_sectors; ++ unsigned write_sectors; ++ ++ struct bch_read_bio rbio; ++ ++ struct migrate_write write; ++ /* Must be last since it is variable size */ ++ struct bio_vec bi_inline_vecs[0]; ++}; ++ ++struct moving_context { ++ /* Closure for waiting on all reads and writes to complete */ ++ struct closure cl; ++ ++ struct bch_move_stats *stats; ++ ++ struct list_head reads; ++ ++ /* in flight sectors: */ ++ atomic_t read_sectors; ++ atomic_t write_sectors; ++ ++ wait_queue_head_t wait; ++}; ++ ++static int bch2_migrate_index_update(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct migrate_write *m = ++ container_of(op, struct migrate_write, op); ++ struct keylist *keys = &op->insert_keys; ++ int ret = 0; ++ 
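	/*
	 * Index update for a completed data-move write: for each key the
	 * write produced (op->insert_keys), look the extent up again and,
	 * if it still matches what was read (same version, still points at
	 * the pointer the data was read from), splice the newly written
	 * pointer(s) in - dropping the old device's pointer in the
	 * DATA_REWRITE case. If the extent changed underneath us, the
	 * sectors are counted as raced and the key is skipped.
	 */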
++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, m->btree_id, ++ bkey_start_pos(&bch2_keylist_front(keys)->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ while (1) { ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ struct bkey_i *insert; ++ struct bkey_i_extent *new = ++ bkey_i_to_extent(bch2_keylist_front(keys)); ++ BKEY_PADDED(k) _new, _insert; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ bool did_work = false; ++ int nr; ++ ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ if (bversion_cmp(k.k->version, new->k.version) || ++ !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) ++ goto nomatch; ++ ++ if (m->data_cmd == DATA_REWRITE && ++ !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) ++ goto nomatch; ++ ++ bkey_reassemble(&_insert.k, k); ++ insert = &_insert.k; ++ ++ bkey_copy(&_new.k, bch2_keylist_front(keys)); ++ new = bkey_i_to_extent(&_new.k); ++ ++ bch2_cut_front(iter->pos, insert); ++ bch2_cut_back(new->k.p, &insert->k); ++ bch2_cut_back(insert->k.p, &new->k); ++ ++ if (m->data_cmd == DATA_REWRITE) ++ bch2_bkey_drop_device(bkey_i_to_s(insert), ++ m->data_opts.rewrite_dev); ++ ++ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { ++ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { ++ /* ++ * raced with another move op? extent already ++ * has a pointer to the device we just wrote ++ * data to ++ */ ++ continue; ++ } ++ ++ bch2_extent_ptr_decoded_append(insert, &p); ++ did_work = true; ++ } ++ ++ if (!did_work) ++ goto nomatch; ++ ++ bch2_bkey_narrow_crcs(insert, ++ (struct bch_extent_crc_unpacked) { 0 }); ++ bch2_extent_normalize(c, bkey_i_to_s(insert)); ++ bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), ++ op->opts.background_target, ++ op->opts.data_replicas); ++ ++ /* ++ * If we're not fully overwriting @k, and it's compressed, we ++ * need a reservation for all the pointers in @insert ++ */ ++ nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) - ++ m->nr_ptrs_reserved; ++ ++ if (insert->k.size < k.k->size && ++ bch2_extent_is_compressed(k) && ++ nr > 0) { ++ ret = bch2_disk_reservation_add(c, &op->res, ++ keylist_sectors(keys) * nr, 0); ++ if (ret) ++ goto out; ++ ++ m->nr_ptrs_reserved += nr; ++ goto next; ++ } ++ ++ bch2_trans_update(&trans, iter, insert); ++ ++ ret = bch2_trans_commit(&trans, &op->res, ++ op_journal_seq(op), ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ m->data_opts.btree_insert_flags); ++ if (!ret) ++ atomic_long_inc(&c->extent_migrate_done); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++next: ++ while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { ++ bch2_keylist_pop_front(keys); ++ if (bch2_keylist_empty(keys)) ++ goto out; ++ } ++ ++ bch2_cut_front(iter->pos, bch2_keylist_front(keys)); ++ continue; ++nomatch: ++ if (m->ctxt) ++ atomic64_add(k.k->p.offset - iter->pos.offset, ++ &m->ctxt->stats->sectors_raced); ++ atomic_long_inc(&c->extent_migrate_raced); ++ trace_move_race(&new->k); ++ bch2_btree_iter_next_slot(iter); ++ goto next; ++ } ++out: ++ bch2_trans_exit(&trans); ++ BUG_ON(ret == -EINTR); ++ return ret; ++} ++ ++void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) ++{ ++ /* write bio must own pages: */ ++ BUG_ON(!m->op.wbio.bio.bi_vcnt); ++ ++ m->ptr = rbio->pick.ptr; ++ m->offset = rbio->pos.offset - rbio->pick.crc.offset; ++ m->op.devs_have = rbio->devs_have; ++ m->op.pos = rbio->pos; ++ m->op.version = rbio->version; ++ m->op.crc = rbio->pick.crc; 
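	/*
	 * The write op is seeded from exactly what the read observed
	 * (position, version, checksum), so bch2_migrate_index_update() can
	 * later detect whether the extent changed while the data was in
	 * flight. m->offset above is the read position minus its offset
	 * within the checksummed region, i.e. where that region starts:
	 * e.g. a read at offset 1024 with pick.crc.offset == 16 gives
	 * m->offset == 1008.
	 */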
++ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; ++ ++ if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { ++ m->op.nonce = m->op.crc.nonce + m->op.crc.offset; ++ m->op.csum_type = m->op.crc.csum_type; ++ } ++ ++ if (m->data_cmd == DATA_REWRITE) ++ bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); ++} ++ ++int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, ++ struct write_point_specifier wp, ++ struct bch_io_opts io_opts, ++ enum data_cmd data_cmd, ++ struct data_opts data_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k) ++{ ++ int ret; ++ ++ m->btree_id = btree_id; ++ m->data_cmd = data_cmd; ++ m->data_opts = data_opts; ++ m->nr_ptrs_reserved = 0; ++ ++ bch2_write_op_init(&m->op, c, io_opts); ++ m->op.compression_type = ++ bch2_compression_opt_to_type[io_opts.background_compression ?: ++ io_opts.compression]; ++ m->op.target = data_opts.target, ++ m->op.write_point = wp; ++ ++ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) ++ m->op.alloc_reserve = RESERVE_MOVINGGC; ++ ++ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| ++ BCH_WRITE_PAGES_STABLE| ++ BCH_WRITE_PAGES_OWNED| ++ BCH_WRITE_DATA_ENCODED; ++ ++ m->op.nr_replicas = 1; ++ m->op.nr_replicas_required = 1; ++ m->op.index_update_fn = bch2_migrate_index_update; ++ ++ switch (data_cmd) { ++ case DATA_ADD_REPLICAS: { ++ /* ++ * DATA_ADD_REPLICAS is used for moving data to a different ++ * device in the background, and due to compression the new copy ++ * might take up more space than the old copy: ++ */ ++#if 0 ++ int nr = (int) io_opts.data_replicas - ++ bch2_bkey_nr_dirty_ptrs(k); ++#endif ++ int nr = (int) io_opts.data_replicas; ++ ++ if (nr > 0) { ++ m->op.nr_replicas = m->nr_ptrs_reserved = nr; ++ ++ ret = bch2_disk_reservation_get(c, &m->op.res, ++ k.k->size, m->op.nr_replicas, 0); ++ if (ret) ++ return ret; ++ } ++ break; ++ } ++ case DATA_REWRITE: { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned compressed_sectors = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ p.crc.compression_type != BCH_COMPRESSION_NONE && ++ bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) ++ compressed_sectors += p.crc.compressed_size; ++ ++ if (compressed_sectors) { ++ ret = bch2_disk_reservation_add(c, &m->op.res, ++ compressed_sectors, ++ BCH_DISK_RESERVATION_NOFAIL); ++ if (ret) ++ return ret; ++ } ++ break; ++ } ++ case DATA_PROMOTE: ++ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; ++ m->op.flags |= BCH_WRITE_CACHED; ++ break; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++static void move_free(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ struct moving_context *ctxt = io->write.ctxt; ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); ++ ++ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) ++ if (bv->bv_page) ++ __free_page(bv->bv_page); ++ ++ wake_up(&ctxt->wait); ++ ++ kfree(io); ++} ++ ++static void move_write_done(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ ++ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); ++ closure_return_with_destructor(cl, move_free); ++} ++ ++static void move_write(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ ++ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { ++ 
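		/* read failed or hit a hole: nothing to write, just tear the
		 * moving_io down: */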
closure_return_with_destructor(cl, move_free); ++ return; ++ } ++ ++ bch2_migrate_read_done(&io->write, &io->rbio); ++ ++ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); ++ closure_call(&io->write.op.cl, bch2_write, NULL, cl); ++ continue_at(cl, move_write_done, NULL); ++} ++ ++static inline struct moving_io *next_pending_write(struct moving_context *ctxt) ++{ ++ struct moving_io *io = ++ list_first_entry_or_null(&ctxt->reads, struct moving_io, list); ++ ++ return io && io->read_completed ? io : NULL; ++} ++ ++static void move_read_endio(struct bio *bio) ++{ ++ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); ++ struct moving_context *ctxt = io->write.ctxt; ++ ++ atomic_sub(io->read_sectors, &ctxt->read_sectors); ++ io->read_completed = true; ++ ++ if (next_pending_write(ctxt)) ++ wake_up(&ctxt->wait); ++ ++ closure_put(&ctxt->cl); ++} ++ ++static void do_pending_writes(struct moving_context *ctxt) ++{ ++ struct moving_io *io; ++ ++ while ((io = next_pending_write(ctxt))) { ++ list_del(&io->list); ++ closure_call(&io->cl, move_write, NULL, &ctxt->cl); ++ } ++} ++ ++#define move_ctxt_wait_event(_ctxt, _cond) \ ++do { \ ++ do_pending_writes(_ctxt); \ ++ \ ++ if (_cond) \ ++ break; \ ++ __wait_event((_ctxt)->wait, \ ++ next_pending_write(_ctxt) || (_cond)); \ ++} while (1) ++ ++static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) ++{ ++ unsigned sectors_pending = atomic_read(&ctxt->write_sectors); ++ ++ move_ctxt_wait_event(ctxt, ++ !atomic_read(&ctxt->write_sectors) || ++ atomic_read(&ctxt->write_sectors) != sectors_pending); ++} ++ ++static int bch2_move_extent(struct bch_fs *c, ++ struct moving_context *ctxt, ++ struct write_point_specifier wp, ++ struct bch_io_opts io_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ enum data_cmd data_cmd, ++ struct data_opts data_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct moving_io *io; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned sectors = k.k->size, pages; ++ int ret = -ENOMEM; ++ ++ move_ctxt_wait_event(ctxt, ++ atomic_read(&ctxt->write_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ move_ctxt_wait_event(ctxt, ++ atomic_read(&ctxt->read_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ /* write path might have to decompress data: */ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); ++ ++ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ io = kzalloc(sizeof(struct moving_io) + ++ sizeof(struct bio_vec) * pages, GFP_KERNEL); ++ if (!io) ++ goto err; ++ ++ io->write.ctxt = ctxt; ++ io->read_sectors = k.k->size; ++ io->write_sectors = k.k->size; ++ ++ bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); ++ bio_set_prio(&io->write.op.wbio.bio, ++ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ ++ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, ++ GFP_KERNEL)) ++ goto err_free; ++ ++ io->rbio.c = c; ++ io->rbio.opts = io_opts; ++ bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); ++ io->rbio.bio.bi_vcnt = pages; ++ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ io->rbio.bio.bi_iter.bi_size = sectors << 9; ++ ++ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); ++ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); ++ io->rbio.bio.bi_end_io = move_read_endio; ++ ++ ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, ++ data_cmd, data_opts, btree_id, k); ++ if (ret) ++ goto err_free_pages; ++ ++ 
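	/*
	 * From here the move is asynchronous: the read completion
	 * (move_read_endio) marks this io complete and wakes the context;
	 * the next move_ctxt_wait_event() caller drains completed reads via
	 * do_pending_writes(), which kicks off move_write() and then the
	 * index update. The read_sectors/write_sectors counters above bound
	 * in-flight IO to SECTORS_IN_FLIGHT_PER_DEVICE (2048 sectors, i.e.
	 * 1MB) in each direction.
	 */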
atomic64_inc(&ctxt->stats->keys_moved); ++ atomic64_add(k.k->size, &ctxt->stats->sectors_moved); ++ ++ trace_move_extent(k.k); ++ ++ atomic_add(io->read_sectors, &ctxt->read_sectors); ++ list_add_tail(&io->list, &ctxt->reads); ++ ++ /* ++ * dropped by move_read_endio() - guards against use after free of ++ * ctxt when doing wakeup ++ */ ++ closure_get(&ctxt->cl); ++ bch2_read_extent(c, &io->rbio, k, 0, ++ BCH_READ_NODECODE| ++ BCH_READ_LAST_FRAGMENT); ++ return 0; ++err_free_pages: ++ bio_free_pages(&io->write.op.wbio.bio); ++err_free: ++ kfree(io); ++err: ++ trace_move_alloc_fail(k.k); ++ return ret; ++} ++ ++static int __bch2_move_data(struct bch_fs *c, ++ struct moving_context *ctxt, ++ struct bch_ratelimit *rate, ++ struct write_point_specifier wp, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ struct bch_move_stats *stats, ++ enum btree_id btree_id) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ BKEY_PADDED(k) tmp; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct data_opts data_opts; ++ enum data_cmd data_cmd; ++ u64 delay, cur_inum = U64_MAX; ++ int ret = 0, ret2; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ stats->data_type = BCH_DATA_USER; ++ stats->btree_id = btree_id; ++ stats->pos = POS_MIN; ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, start, ++ BTREE_ITER_PREFETCH); ++ ++ if (rate) ++ bch2_ratelimit_reset(rate); ++ ++ while (1) { ++ do { ++ delay = rate ? bch2_ratelimit_delay(rate) : 0; ++ ++ if (delay) { ++ bch2_trans_unlock(&trans); ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ ++ if (kthread && (ret = kthread_should_stop())) { ++ __set_current_state(TASK_RUNNING); ++ goto out; ++ } ++ ++ if (delay) ++ schedule_timeout(delay); ++ ++ if (unlikely(freezing(current))) { ++ bch2_trans_unlock(&trans); ++ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); ++ try_to_freeze(); ++ } ++ } while (delay); ++peek: ++ k = bch2_btree_iter_peek(iter); ++ ++ stats->pos = iter->pos; ++ ++ if (!k.k) ++ break; ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (!bkey_extent_is_direct_data(k.k)) ++ goto next_nondata; ++ ++ if (btree_id == BTREE_ID_EXTENTS && ++ cur_inum != k.k->p.inode) { ++ struct bch_inode_unpacked inode; ++ ++ /* don't hold btree locks while looking up inode: */ ++ bch2_trans_unlock(&trans); ++ ++ io_opts = bch2_opts_to_inode_opts(c->opts); ++ if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) ++ bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); ++ cur_inum = k.k->p.inode; ++ goto peek; ++ } ++ ++ switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { ++ case DATA_SKIP: ++ goto next; ++ case DATA_SCRUB: ++ BUG(); ++ case DATA_ADD_REPLICAS: ++ case DATA_REWRITE: ++ case DATA_PROMOTE: ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* unlock before doing IO: */ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ bch2_trans_unlock(&trans); ++ ++ ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, ++ data_cmd, data_opts); ++ if (ret2) { ++ if (ret2 == -ENOMEM) { ++ /* memory allocation failure, wait for some IO to finish */ ++ bch2_move_ctxt_wait_for_io(ctxt); ++ continue; ++ } ++ ++ /* XXX signal failure */ ++ goto next; ++ } ++ ++ if (rate) ++ bch2_ratelimit_increment(rate, k.k->size); ++next: ++ atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k), ++ &stats->sectors_seen); ++next_nondata: ++ bch2_btree_iter_next(iter); ++ 
bch2_trans_cond_resched(&trans); ++ } ++out: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ return ret; ++} ++ ++int bch2_move_data(struct bch_fs *c, ++ struct bch_ratelimit *rate, ++ struct write_point_specifier wp, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ struct bch_move_stats *stats) ++{ ++ struct moving_context ctxt = { .stats = stats }; ++ int ret; ++ ++ closure_init_stack(&ctxt.cl); ++ INIT_LIST_HEAD(&ctxt.reads); ++ init_waitqueue_head(&ctxt.wait); ++ ++ stats->data_type = BCH_DATA_USER; ++ ++ ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, ++ pred, arg, stats, BTREE_ID_EXTENTS) ?: ++ __bch2_move_data(c, &ctxt, rate, wp, start, end, ++ pred, arg, stats, BTREE_ID_REFLINK); ++ ++ move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); ++ closure_sync(&ctxt.cl); ++ ++ EBUG_ON(atomic_read(&ctxt.write_sectors)); ++ ++ trace_move_data(c, ++ atomic64_read(&stats->sectors_moved), ++ atomic64_read(&stats->keys_moved)); ++ ++ return ret; ++} ++ ++static int bch2_move_btree(struct bch_fs *c, ++ move_pred_fn pred, ++ void *arg, ++ struct bch_move_stats *stats) ++{ ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned id; ++ struct data_opts data_opts; ++ enum data_cmd cmd; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ stats->data_type = BCH_DATA_BTREE; ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ stats->btree_id = id; ++ ++ for_each_btree_node(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH, b) { ++ stats->pos = iter->pos; ++ ++ switch ((cmd = pred(c, arg, ++ bkey_i_to_s_c(&b->key), ++ &io_opts, &data_opts))) { ++ case DATA_SKIP: ++ goto next; ++ case DATA_SCRUB: ++ BUG(); ++ case DATA_ADD_REPLICAS: ++ case DATA_REWRITE: ++ break; ++ default: ++ BUG(); ++ } ++ ++ ret = bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, 0) ?: ret; ++next: ++ bch2_trans_cond_resched(&trans); ++ } ++ ++ ret = bch2_trans_iter_free(&trans, iter) ?: ret; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++#if 0 ++static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ return DATA_SCRUB; ++} ++#endif ++ ++static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ unsigned nr_good = bch2_bkey_durability(c, k); ++ unsigned replicas = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ replicas = c->opts.metadata_replicas; ++ break; ++ case KEY_TYPE_extent: ++ replicas = io_opts->data_replicas; ++ break; ++ } ++ ++ if (!nr_good || nr_good >= replicas) ++ return DATA_SKIP; ++ ++ data_opts->target = 0; ++ data_opts->btree_insert_flags = 0; ++ return DATA_ADD_REPLICAS; ++} ++ ++static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ struct bch_ioctl_data *op = arg; ++ ++ if (!bch2_bkey_has_device(k, op->migrate.dev)) ++ return DATA_SKIP; ++ ++ data_opts->target = 0; ++ data_opts->btree_insert_flags = 0; ++ data_opts->rewrite_dev = op->migrate.dev; ++ return DATA_REWRITE; ++} ++ ++int bch2_data_job(struct bch_fs *c, ++ struct bch_move_stats *stats, ++ struct bch_ioctl_data op) ++{ ++ int ret = 0; ++ ++ switch (op.op) { ++ case BCH_DATA_OP_REREPLICATE: ++ stats->data_type = BCH_DATA_JOURNAL; ++ ret = bch2_journal_flush_device_pins(&c->journal, -1); 
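		/*
		 * Rereplicate runs in phases - journal, btree nodes, then
		 * user data - with bch2_replicas_gc2() pruning the replicas
		 * table in between. Each phase is folded into ret with
		 * "x ?: ret", so later phases still run after a failure and
		 * the most recent error is the one reported.
		 */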
++ ++ ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; ++ ++ while (1) { ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c) || ++ c->btree_roots_dirty); ++ if (!bch2_btree_interior_updates_nr_pending(c)) ++ break; ++ bch2_journal_meta(&c->journal); ++ } ++ ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, NULL, ++ writepoint_hashed((unsigned long) current), ++ op.start, ++ op.end, ++ rereplicate_pred, c, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ case BCH_DATA_OP_MIGRATE: ++ if (op.migrate.dev >= c->sb.nr_devices) ++ return -EINVAL; ++ ++ stats->data_type = BCH_DATA_JOURNAL; ++ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ++ ++ ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, NULL, ++ writepoint_hashed((unsigned long) current), ++ op.start, ++ op.end, ++ migrate_pred, &op, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +new file mode 100644 +index 000000000000..0acd1720d4f8 +--- /dev/null ++++ b/fs/bcachefs/move.h +@@ -0,0 +1,64 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_H ++#define _BCACHEFS_MOVE_H ++ ++#include "btree_iter.h" ++#include "buckets.h" ++#include "io_types.h" ++#include "move_types.h" ++ ++struct bch_read_bio; ++struct moving_context; ++ ++enum data_cmd { ++ DATA_SKIP, ++ DATA_SCRUB, ++ DATA_ADD_REPLICAS, ++ DATA_REWRITE, ++ DATA_PROMOTE, ++}; ++ ++struct data_opts { ++ u16 target; ++ unsigned rewrite_dev; ++ int btree_insert_flags; ++}; ++ ++struct migrate_write { ++ enum btree_id btree_id; ++ enum data_cmd data_cmd; ++ struct data_opts data_opts; ++ ++ unsigned nr_ptrs_reserved; ++ ++ struct moving_context *ctxt; ++ ++ /* what we read: */ ++ struct bch_extent_ptr ptr; ++ u64 offset; ++ ++ struct bch_write_op op; ++}; ++ ++void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); ++int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, ++ struct write_point_specifier, ++ struct bch_io_opts, ++ enum data_cmd, struct data_opts, ++ enum btree_id, struct bkey_s_c); ++ ++typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, ++ struct bkey_s_c, ++ struct bch_io_opts *, struct data_opts *); ++ ++int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, ++ struct write_point_specifier, ++ struct bpos, struct bpos, ++ move_pred_fn, void *, ++ struct bch_move_stats *); ++ ++int bch2_data_job(struct bch_fs *, ++ struct bch_move_stats *, ++ struct bch_ioctl_data); ++ ++#endif /* _BCACHEFS_MOVE_H */ +diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h +new file mode 100644 +index 000000000000..6788170d3f95 +--- /dev/null ++++ b/fs/bcachefs/move_types.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_TYPES_H ++#define _BCACHEFS_MOVE_TYPES_H ++ ++struct bch_move_stats { ++ enum bch_data_type data_type; ++ enum btree_id btree_id; ++ struct bpos pos; ++ ++ atomic64_t keys_moved; ++ atomic64_t sectors_moved; ++ atomic64_t sectors_seen; ++ atomic64_t sectors_raced; ++}; ++ ++#endif /* _BCACHEFS_MOVE_TYPES_H */ +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +new file mode 100644 +index 000000000000..710296044194 +--- /dev/null ++++ b/fs/bcachefs/movinggc.c +@@ -0,0 +1,305 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Moving/copying 
garbage collector ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "extents.h" ++#include "eytzinger.h" ++#include "io.h" ++#include "keylist.h" ++#include "move.h" ++#include "movinggc.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * We can't use the entire copygc reserve in one iteration of copygc: we may ++ * need the buckets we're freeing up to go back into the copygc reserve to make ++ * forward progress, but if the copygc reserve is full they'll be available for ++ * any allocation - and it's possible that in a given iteration, we free up most ++ * of the buckets we're going to free before we allocate most of the buckets ++ * we're going to allocate. ++ * ++ * If we only use half of the reserve per iteration, then in steady state we'll ++ * always have room in the reserve for the buckets we're going to need in the ++ * next iteration: ++ */ ++#define COPYGC_BUCKETS_PER_ITER(ca) \ ++ ((ca)->free[RESERVE_MOVINGGC].size / 2) ++ ++/* ++ * Max sectors to move per iteration: Have to take into account internal ++ * fragmentation from the multiple write points for each generation: ++ */ ++#define COPYGC_SECTORS_PER_ITER(ca) \ ++ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) ++ ++static inline int sectors_used_cmp(copygc_heap *heap, ++ struct copygc_heap_entry l, ++ struct copygc_heap_entry r) ++{ ++ return cmp_int(l.sectors, r.sectors); ++} ++ ++static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) ++{ ++ const struct copygc_heap_entry *l = _l; ++ const struct copygc_heap_entry *r = _r; ++ ++ return cmp_int(l->offset, r->offset); ++} ++ ++static bool __copygc_pred(struct bch_dev *ca, ++ struct bkey_s_c k) ++{ ++ copygc_heap *h = &ca->copygc_heap; ++ const struct bch_extent_ptr *ptr = ++ bch2_bkey_has_device(k, ca->dev_idx); ++ ++ if (ptr) { ++ struct copygc_heap_entry search = { .offset = ptr->offset }; ++ ++ ssize_t i = eytzinger0_find_le(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, &search); ++ ++ return (i >= 0 && ++ ptr->offset < h->data[i].offset + ca->mi.bucket_size && ++ ptr->gen == h->data[i].gen); ++ } ++ ++ return false; ++} ++ ++static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ struct bch_dev *ca = arg; ++ ++ if (!__copygc_pred(ca, k)) ++ return DATA_SKIP; ++ ++ data_opts->target = dev_to_target(ca->dev_idx); ++ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; ++ data_opts->rewrite_dev = ca->dev_idx; ++ return DATA_REWRITE; ++} ++ ++static bool have_copygc_reserve(struct bch_dev *ca) ++{ ++ bool ret; ++ ++ spin_lock(&ca->freelist_lock); ++ ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || ++ ca->allocator_state != ALLOCATOR_RUNNING; ++ spin_unlock(&ca->freelist_lock); ++ ++ return ret; ++} ++ ++static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ copygc_heap *h = &ca->copygc_heap; ++ struct copygc_heap_entry e, *i; ++ struct bucket_array *buckets; ++ struct bch_move_stats move_stats; ++ u64 sectors_to_move = 0, sectors_not_moved = 0; ++ u64 buckets_to_move, buckets_not_moved = 0; ++ size_t b; ++ int ret; ++ ++ memset(&move_stats, 0, sizeof(move_stats)); ++ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); ++ ++ /* ++ * Find buckets with 
lowest sector counts, skipping completely ++ * empty buckets, by building a maxheap sorted by sector count, ++ * and repeatedly replacing the maximum element until all ++ * buckets have been visited. ++ */ ++ h->used = 0; ++ ++ /* ++ * We need bucket marks to be up to date - gc can't be recalculating ++ * them: ++ */ ++ down_read(&c->gc_lock); ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ struct copygc_heap_entry e; ++ ++ if (m.owned_by_allocator || ++ m.data_type != BCH_DATA_USER || ++ !bucket_sectors_used(m) || ++ bucket_sectors_used(m) >= ca->mi.bucket_size) ++ continue; ++ ++ e = (struct copygc_heap_entry) { ++ .gen = m.gen, ++ .sectors = bucket_sectors_used(m), ++ .offset = bucket_to_sector(ca, b), ++ }; ++ heap_add_or_replace(h, e, -sectors_used_cmp, NULL); ++ } ++ up_read(&ca->bucket_lock); ++ up_read(&c->gc_lock); ++ ++ for (i = h->data; i < h->data + h->used; i++) ++ sectors_to_move += i->sectors; ++ ++ while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { ++ BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); ++ sectors_to_move -= e.sectors; ++ } ++ ++ buckets_to_move = h->used; ++ ++ if (!buckets_to_move) ++ return; ++ ++ eytzinger0_sort(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, NULL); ++ ++ ret = bch2_move_data(c, &ca->copygc_pd.rate, ++ writepoint_ptr(&ca->copygc_write_point), ++ POS_MIN, POS_MAX, ++ copygc_pred, ca, ++ &move_stats); ++ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ for (i = h->data; i < h->data + h->used; i++) { ++ size_t b = sector_to_bucket(ca, i->offset); ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ ++ if (i->gen == m.gen && bucket_sectors_used(m)) { ++ sectors_not_moved += bucket_sectors_used(m); ++ buckets_not_moved++; ++ } ++ } ++ up_read(&ca->bucket_lock); ++ ++ if (sectors_not_moved && !ret) ++ bch_warn_ratelimited(c, ++ "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", ++ sectors_not_moved, sectors_to_move, ++ buckets_not_moved, buckets_to_move); ++ ++ trace_copygc(ca, ++ atomic64_read(&move_stats.sectors_moved), sectors_not_moved, ++ buckets_to_move, buckets_not_moved); ++} ++ ++static int bch2_copygc_thread(void *arg) ++{ ++ struct bch_dev *ca = arg; ++ struct bch_fs *c = ca->fs; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ struct bch_dev_usage usage; ++ unsigned long last; ++ u64 available, fragmented, reserve, next; ++ ++ set_freezable(); ++ ++ while (!kthread_should_stop()) { ++ if (kthread_wait_freezable(c->copy_gc_enabled)) ++ break; ++ ++ last = atomic_long_read(&clock->now); ++ ++ reserve = ca->copygc_threshold; ++ ++ usage = bch2_dev_usage_read(c, ca); ++ ++ available = __dev_buckets_available(ca, usage) * ++ ca->mi.bucket_size; ++ if (available > reserve) { ++ next = last + available - reserve; ++ bch2_kthread_io_clock_wait(clock, next, ++ MAX_SCHEDULE_TIMEOUT); ++ continue; ++ } ++ ++ /* ++ * don't start copygc until there's more than half the copygc ++ * reserve of fragmented space: ++ */ ++ fragmented = usage.sectors_fragmented; ++ if (fragmented < reserve) { ++ next = last + reserve - fragmented; ++ bch2_kthread_io_clock_wait(clock, next, ++ MAX_SCHEDULE_TIMEOUT); ++ continue; ++ } ++ ++ bch2_copygc(c, ca); ++ } ++ ++ return 0; ++} ++ ++void bch2_copygc_stop(struct bch_dev *ca) ++{ ++ ca->copygc_pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&ca->copygc_pd.rate); ++ ++ if (ca->copygc_thread) { ++ 
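		/*
		 * kthread_stop() waits for the thread to exit; the extra
		 * task_struct reference taken by get_task_struct() in
		 * bch2_copygc_start() keeps it valid until the
		 * put_task_struct() below.
		 */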
kthread_stop(ca->copygc_thread); ++ put_task_struct(ca->copygc_thread); ++ } ++ ca->copygc_thread = NULL; ++} ++ ++int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct task_struct *t; ++ ++ if (ca->copygc_thread) ++ return 0; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ if (bch2_fs_init_fault("copygc_start")) ++ return -ENOMEM; ++ ++ t = kthread_create(bch2_copygc_thread, ca, ++ "bch_copygc[%s]", ca->name); ++ if (IS_ERR(t)) ++ return PTR_ERR(t); ++ ++ get_task_struct(t); ++ ++ ca->copygc_thread = t; ++ wake_up_process(ca->copygc_thread); ++ ++ return 0; ++} ++ ++void bch2_dev_copygc_init(struct bch_dev *ca) ++{ ++ bch2_pd_controller_init(&ca->copygc_pd); ++ ca->copygc_pd.d_term = 0; ++} +diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h +new file mode 100644 +index 000000000000..dcd479632cf1 +--- /dev/null ++++ b/fs/bcachefs/movinggc.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVINGGC_H ++#define _BCACHEFS_MOVINGGC_H ++ ++void bch2_copygc_stop(struct bch_dev *); ++int bch2_copygc_start(struct bch_fs *, struct bch_dev *); ++void bch2_dev_copygc_init(struct bch_dev *); ++ ++#endif /* _BCACHEFS_MOVINGGC_H */ +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +new file mode 100644 +index 000000000000..13a9a2fcd575 +--- /dev/null ++++ b/fs/bcachefs/opts.c +@@ -0,0 +1,441 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++ ++#include "bcachefs.h" ++#include "compress.h" ++#include "disk_groups.h" ++#include "opts.h" ++#include "super-io.h" ++#include "util.h" ++ ++const char * const bch2_error_actions[] = { ++ "continue", ++ "remount-ro", ++ "panic", ++ NULL ++}; ++ ++const char * const bch2_csum_types[] = { ++ "none", ++ "crc32c", ++ "crc64", ++ NULL ++}; ++ ++const char * const bch2_compression_types[] = { ++ "none", ++ "lz4", ++ "gzip", ++ "zstd", ++ NULL ++}; ++ ++const char * const bch2_str_hash_types[] = { ++ "crc32c", ++ "crc64", ++ "siphash", ++ NULL ++}; ++ ++const char * const bch2_data_types[] = { ++ "none", ++ "sb", ++ "journal", ++ "btree", ++ "data", ++ "cached", ++ NULL ++}; ++ ++const char * const bch2_cache_replacement_policies[] = { ++ "lru", ++ "fifo", ++ "random", ++ NULL ++}; ++ ++/* Default is -1; we skip past it for struct cached_dev's cache mode */ ++const char * const bch2_cache_modes[] = { ++ "default", ++ "writethrough", ++ "writeback", ++ "writearound", ++ "none", ++ NULL ++}; ++ ++const char * const bch2_dev_state[] = { ++ "readwrite", ++ "readonly", ++ "failed", ++ "spare", ++ NULL ++}; ++ ++void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) ++{ ++#define x(_name, ...) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ ++ BCH_OPTS() ++#undef x ++} ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opt_defined(*opts, _name); ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opts->_name; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) 
\ ++ case Opt_##_name: \ ++ opt_set(*opts, _name, v); \ ++ break; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Initial options from superblock - here we don't want any options undefined, ++ * any options the superblock doesn't specify are set to 0: ++ */ ++struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ ++#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ ++ if (_sb_opt != NO_SB_OPT) \ ++ opt_set(opts, _name, _sb_opt(sb)); ++ BCH_OPTS() ++#undef x ++ ++ return opts; ++} ++ ++const struct bch_option bch2_opt_table[] = { ++#define OPT_BOOL() .type = BCH_OPT_BOOL ++#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max ++#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max ++#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices ++#define OPT_FN(_fn) .type = BCH_OPT_FN, \ ++ .parse = _fn##_parse, \ ++ .to_text = _fn##_to_text ++ ++#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ ++ [Opt_##_name] = { \ ++ .attr = { \ ++ .name = #_name, \ ++ .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444, \ ++ }, \ ++ .mode = _mode, \ ++ .hint = _hint, \ ++ .help = _help, \ ++ .set_sb = SET_##_sb_opt, \ ++ _type \ ++ }, ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++int bch2_opt_lookup(const char *name) ++{ ++ const struct bch_option *i; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); ++ i++) ++ if (!strcmp(name, i->attr.name)) ++ return i - bch2_opt_table; ++ ++ return -1; ++} ++ ++struct synonym { ++ const char *s1, *s2; ++}; ++ ++static const struct synonym bch_opt_synonyms[] = { ++ { "quota", "usrquota" }, ++}; ++ ++static int bch2_mount_opt_lookup(const char *name) ++{ ++ const struct synonym *i; ++ ++ for (i = bch_opt_synonyms; ++ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); ++ i++) ++ if (!strcmp(name, i->s1)) ++ name = i->s2; ++ ++ return bch2_opt_lookup(name); ++} ++ ++int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, ++ const char *val, u64 *res) ++{ ++ ssize_t ret; ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ ret = kstrtou64(val, 10, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res > 1) ++ return -ERANGE; ++ break; ++ case BCH_OPT_UINT: ++ ret = kstrtou64(val, 10, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res < opt->min || *res >= opt->max) ++ return -ERANGE; ++ break; ++ case BCH_OPT_SECTORS: ++ ret = bch2_strtou64_h(val, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res & 511) ++ return -EINVAL; ++ ++ *res >>= 9; ++ ++ if (*res < opt->min || *res >= opt->max) ++ return -ERANGE; ++ break; ++ case BCH_OPT_STR: ++ ret = match_string(opt->choices, -1, val); ++ if (ret < 0) ++ return ret; ++ ++ *res = ret; ++ break; ++ case BCH_OPT_FN: ++ if (!c) ++ return -EINVAL; ++ ++ return opt->parse(c, val, res); ++ } ++ ++ return 0; ++} ++ ++void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, ++ const struct bch_option *opt, u64 v, ++ unsigned flags) ++{ ++ if (flags & OPT_SHOW_MOUNT_STYLE) { ++ if (opt->type == BCH_OPT_BOOL) { ++ pr_buf(out, "%s%s", ++ v ? 
"" : "no", ++ opt->attr.name); ++ return; ++ } ++ ++ pr_buf(out, "%s=", opt->attr.name); ++ } ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ case BCH_OPT_UINT: ++ pr_buf(out, "%lli", v); ++ break; ++ case BCH_OPT_SECTORS: ++ bch2_hprint(out, v); ++ break; ++ case BCH_OPT_STR: ++ if (flags & OPT_SHOW_FULL_LIST) ++ bch2_string_opt_to_text(out, opt->choices, v); ++ else ++ pr_buf(out, opt->choices[v]); ++ break; ++ case BCH_OPT_FN: ++ opt->to_text(out, c, v); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) ++{ ++ int ret = 0; ++ ++ switch (id) { ++ case Opt_compression: ++ case Opt_background_compression: ++ ret = bch2_check_set_has_compressed_data(c, v); ++ break; ++ case Opt_erasure_code: ++ if (v && ++ !(c->sb.features & (1ULL << BCH_FEATURE_EC))) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->features[0] |= ++ cpu_to_le64(1ULL << BCH_FEATURE_EC); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ break; ++ } ++ ++ return ret; ++} ++ ++int bch2_opts_check_may_set(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ ret = bch2_opt_check_may_set(c, i, ++ bch2_opt_get_by_id(&c->opts, i)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_parse_mount_opts(struct bch_opts *opts, char *options) ++{ ++ char *opt, *name, *val; ++ int ret, id; ++ u64 v; ++ ++ while ((opt = strsep(&options, ",")) != NULL) { ++ name = strsep(&opt, "="); ++ val = opt; ++ ++ if (val) { ++ id = bch2_mount_opt_lookup(name); ++ if (id < 0) ++ goto bad_opt; ++ ++ ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); ++ if (ret < 0) ++ goto bad_val; ++ } else { ++ id = bch2_mount_opt_lookup(name); ++ v = 1; ++ ++ if (id < 0 && ++ !strncmp("no", name, 2)) { ++ id = bch2_mount_opt_lookup(name + 2); ++ v = 0; ++ } ++ ++ if (id < 0) ++ goto bad_opt; ++ ++ if (bch2_opt_table[id].type != BCH_OPT_BOOL) ++ goto no_val; ++ } ++ ++ if (!(bch2_opt_table[id].mode & OPT_MOUNT)) ++ goto bad_opt; ++ ++ if (id == Opt_acl && ++ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) ++ goto bad_opt; ++ ++ if ((id == Opt_usrquota || ++ id == Opt_grpquota) && ++ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) ++ goto bad_opt; ++ ++ bch2_opt_set_by_id(opts, id, v); ++ } ++ ++ return 0; ++bad_opt: ++ pr_err("Bad mount option %s", name); ++ return -1; ++bad_val: ++ pr_err("Invalid value %s for mount option %s", val, name); ++ return -1; ++no_val: ++ pr_err("Mount option %s requires a value", name); ++ return -1; ++} ++ ++/* io opts: */ ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) ++{ ++ struct bch_io_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) ++{ ++ struct bch_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) ++{ ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++} ++ ++bool bch2_opt_is_inode_opt(enum bch_opt_id id) ++{ ++ static const enum bch_opt_id inode_opt_list[] = { ++#define x(_name, _bits) Opt_##_name, ++ BCH_INODE_OPTS() ++#undef x ++ }; ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) ++ if (inode_opt_list[i] == id) ++ return 
true; ++ ++ return false; ++} +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +new file mode 100644 +index 000000000000..bd2058f1fe2b +--- /dev/null ++++ b/fs/bcachefs/opts.h +@@ -0,0 +1,403 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_OPTS_H ++#define _BCACHEFS_OPTS_H ++ ++#include ++#include ++#include ++#include ++#include "bcachefs_format.h" ++ ++extern const char * const bch2_error_actions[]; ++extern const char * const bch2_csum_types[]; ++extern const char * const bch2_compression_types[]; ++extern const char * const bch2_str_hash_types[]; ++extern const char * const bch2_data_types[]; ++extern const char * const bch2_cache_replacement_policies[]; ++extern const char * const bch2_cache_modes[]; ++extern const char * const bch2_dev_state[]; ++ ++/* ++ * Mount options; we also store defaults in the superblock. ++ * ++ * Also exposed via sysfs: if an option is writeable, and it's also stored in ++ * the superblock, changing it via sysfs (currently? might change this) also ++ * updates the superblock. ++ * ++ * We store options as signed integers, where -1 means undefined. This means we ++ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only ++ * apply the options from that struct that are defined. ++ */ ++ ++/* dummy option, for options that aren't stored in the superblock */ ++LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); ++ ++/* When can be set: */ ++enum opt_mode { ++ OPT_FORMAT = (1 << 0), ++ OPT_MOUNT = (1 << 1), ++ OPT_RUNTIME = (1 << 2), ++ OPT_INODE = (1 << 3), ++ OPT_DEVICE = (1 << 4), ++}; ++ ++enum opt_type { ++ BCH_OPT_BOOL, ++ BCH_OPT_UINT, ++ BCH_OPT_SECTORS, ++ BCH_OPT_STR, ++ BCH_OPT_FN, ++}; ++ ++/** ++ * x(name, shortopt, type, in mem type, mode, sb_opt) ++ * ++ * @name - name of mount option, sysfs attribute, and struct bch_opts ++ * member ++ * ++ * @mode - when opt may be set ++ * ++ * @sb_option - name of corresponding superblock option ++ * ++ * @type - one of OPT_BOOL, OPT_UINT, OPT_STR ++ */ ++ ++/* ++ * XXX: add fields for ++ * - default value ++ * - helptext ++ */ ++ ++#define BCH_OPTS() \ ++ x(block_size, u16, \ ++ OPT_FORMAT, \ ++ OPT_SECTORS(1, 128), \ ++ BCH_SB_BLOCK_SIZE, 8, \ ++ "size", NULL) \ ++ x(btree_node_size, u16, \ ++ OPT_FORMAT, \ ++ OPT_SECTORS(1, 128), \ ++ BCH_SB_BTREE_NODE_SIZE, 512, \ ++ "size", "Btree node size, default 256k") \ ++ x(errors, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_error_actions), \ ++ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ ++ NULL, "Action to take on filesystem error") \ ++ x(metadata_replicas, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_WANT, 1, \ ++ "#", "Number of metadata replicas") \ ++ x(data_replicas, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_WANT, 1, \ ++ "#", "Number of data replicas") \ ++ x(metadata_replicas_required, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(data_replicas_required, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(metadata_checksum, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_csum_types), \ ++ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ ++ NULL, NULL) \ ++ x(data_checksum, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_csum_types), \ ++ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ 
++ NULL, NULL) \ ++ x(compression, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_compression_types), \ ++ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE, \ ++ NULL, NULL) \ ++ x(background_compression, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_compression_types), \ ++ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE, \ ++ NULL, NULL) \ ++ x(str_hash, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_str_hash_types), \ ++ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ ++ NULL, "Hash function for directory entries and xattrs")\ ++ x(foreground_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_FOREGROUND_TARGET, 0, \ ++ "(target)", "Device or disk group for foreground writes") \ ++ x(background_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_BACKGROUND_TARGET, 0, \ ++ "(target)", "Device or disk group to move data to in the background")\ ++ x(promote_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_PROMOTE_TARGET, 0, \ ++ "(target)", "Device or disk group to promote data to on read")\ ++ x(erasure_code, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_BOOL(), \ ++ BCH_SB_ERASURE_CODE, false, \ ++ NULL, "Enable erasure coding (DO NOT USE YET)") \ ++ x(inodes_32bit, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_INODE_32BIT, false, \ ++ NULL, "Constrain inode numbers to 32 bits") \ ++ x(gc_reserve_percent, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(5, 21), \ ++ BCH_SB_GC_RESERVE, 8, \ ++ "%", "Percentage of disk space to reserve for copygc")\ ++ x(gc_reserve_bytes, u64, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_SECTORS(0, U64_MAX), \ ++ BCH_SB_GC_RESERVE_BYTES, 0, \ ++ "%", "Amount of disk space to reserve for copygc\n" \ ++ "Takes precedence over gc_reserve_percent if set")\ ++ x(root_reserve_percent, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(0, 100), \ ++ BCH_SB_ROOT_RESERVE, 0, \ ++ "%", "Percentage of disk space to reserve for superuser")\ ++ x(wide_macs, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_128_BIT_MACS, false, \ ++ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ ++ x(acl, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_POSIX_ACL, true, \ ++ NULL, "Enable POSIX acls") \ ++ x(usrquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_USRQUOTA, false, \ ++ NULL, "Enable user quotas") \ ++ x(grpquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_GRPQUOTA, false, \ ++ NULL, "Enable group quotas") \ ++ x(prjquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_PRJQUOTA, false, \ ++ NULL, "Enable project quotas") \ ++ x(degraded, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Allow mounting in degraded mode") \ ++ x(discard, u8, \ ++ OPT_MOUNT|OPT_DEVICE, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Enable discard/TRIM support") \ ++ x(verbose, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Extra debugging information during mount/recovery")\ ++ x(journal_flush_disabled, u8, \ ++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Disable journal flush on sync/fsync\n" \ ++ "If enabled, writes can be lost, but only since the\n"\ ++ "last journal write (default 1 second)") \ ++ x(fsck, u8, \ ++ OPT_MOUNT, \ ++ 
OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Run fsck on mount") \ ++ x(fix_errors, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Fix errors during fsck without asking") \ ++ x(nochanges, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Super read only mode - no writes at all will be issued,\n"\ ++ "even if we have to replay the journal") \ ++ x(norecovery, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't replay the journal") \ ++ x(noexcl, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't open device in exclusive mode") \ ++ x(sb, u64, \ ++ OPT_MOUNT, \ ++ OPT_UINT(0, S64_MAX), \ ++ NO_SB_OPT, BCH_SB_SECTOR, \ ++ "offset", "Sector offset of superblock") \ ++ x(read_only, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(nostart, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don\'t start filesystem, only open devices") \ ++ x(reconstruct_alloc, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Reconstruct alloc btree") \ ++ x(version_upgrade, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Set superblock to latest version,\n" \ ++ "allowing any new features to be used") \ ++ x(project, u8, \ ++ OPT_INODE, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(fs_size, u64, \ ++ OPT_DEVICE, \ ++ OPT_SECTORS(0, S64_MAX), \ ++ NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(bucket, u32, \ ++ OPT_DEVICE, \ ++ OPT_SECTORS(0, S64_MAX), \ ++ NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(durability, u8, \ ++ OPT_DEVICE, \ ++ OPT_UINT(0, BCH_REPLICAS_MAX), \ ++ NO_SB_OPT, 1, \ ++ "n", "Data written to this device will be considered\n"\ ++ "to have already been replicated n times") ++ ++struct bch_opts { ++#define x(_name, _bits, ...) unsigned _name##_defined:1; ++ BCH_OPTS() ++#undef x ++ ++#define x(_name, _bits, ...) _bits _name; ++ BCH_OPTS() ++#undef x ++}; ++ ++static const struct bch_opts bch2_opts_default = { ++#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ++ ._name##_defined = true, \ ++ ._name = _default, \ ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++#define opt_defined(_opts, _name) ((_opts)._name##_defined) ++ ++#define opt_get(_opts, _name) \ ++ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) ++ ++#define opt_set(_opts, _name, _v) \ ++do { \ ++ (_opts)._name##_defined = true; \ ++ (_opts)._name = _v; \ ++} while (0) ++ ++static inline struct bch_opts bch2_opts_empty(void) ++{ ++ return (struct bch_opts) { 0 }; ++} ++ ++void bch2_opts_apply(struct bch_opts *, struct bch_opts); ++ ++enum bch_opt_id { ++#define x(_name, ...) 
Opt_##_name, ++ BCH_OPTS() ++#undef x ++ bch2_opts_nr ++}; ++ ++struct bch_fs; ++struct printbuf; ++ ++struct bch_option { ++ struct attribute attr; ++ void (*set_sb)(struct bch_sb *, u64); ++ enum opt_mode mode; ++ enum opt_type type; ++ ++ union { ++ struct { ++ u64 min, max; ++ }; ++ struct { ++ const char * const *choices; ++ }; ++ struct { ++ int (*parse)(struct bch_fs *, const char *, u64 *); ++ void (*to_text)(struct printbuf *, struct bch_fs *, u64); ++ }; ++ }; ++ ++ const char *hint; ++ const char *help; ++ ++}; ++ ++extern const struct bch_option bch2_opt_table[]; ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); ++u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); ++void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); ++ ++struct bch_opts bch2_opts_from_sb(struct bch_sb *); ++ ++int bch2_opt_lookup(const char *); ++int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); ++ ++#define OPT_SHOW_FULL_LIST (1 << 0) ++#define OPT_SHOW_MOUNT_STYLE (1 << 1) ++ ++void bch2_opt_to_text(struct printbuf *, struct bch_fs *, ++ const struct bch_option *, u64, unsigned); ++ ++int bch2_opt_check_may_set(struct bch_fs *, int, u64); ++int bch2_opts_check_may_set(struct bch_fs *); ++int bch2_parse_mount_opts(struct bch_opts *, char *); ++ ++/* inode opts: */ ++ ++struct bch_io_opts { ++#define x(_name, _bits) unsigned _name##_defined:1; ++ BCH_INODE_OPTS() ++#undef x ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_OPTS() ++#undef x ++}; ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); ++void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); ++bool bch2_opt_is_inode_opt(enum bch_opt_id); ++ ++#endif /* _BCACHEFS_OPTS_H */ +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +new file mode 100644 +index 000000000000..0fa6f33c049b +--- /dev/null ++++ b/fs/bcachefs/quota.c +@@ -0,0 +1,782 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "inode.h" ++#include "quota.h" ++#include "super-io.h" ++ ++static const char *bch2_sb_validate_quota(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ ++ if (vstruct_bytes(&q->field) != sizeof(*q)) ++ return "invalid field quota: wrong size"; ++ ++ return NULL; ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_quota = { ++ .validate = bch2_sb_validate_quota, ++}; ++ ++const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (k.k->p.inode >= QTYP_NR) ++ return "invalid quota type"; ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++static const char * const bch2_quota_counters[] = { ++ "space", ++ "inodes", ++}; ++ ++void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); ++ unsigned i; ++ ++ for (i = 0; i < Q_COUNTERS; i++) ++ pr_buf(out, "%s hardlimit %llu softlimit %llu", ++ bch2_quota_counters[i], ++ le64_to_cpu(dq.v->c[i].hardlimit), ++ le64_to_cpu(dq.v->c[i].softlimit)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++#include ++#include ++#include ++ ++static inline unsigned __next_qtype(unsigned i, unsigned qtypes) ++{ ++ qtypes >>= i; ++ return qtypes ? 
i + __ffs(qtypes) : QTYP_NR; ++} ++ ++#define for_each_set_qtype(_c, _i, _q, _qtypes) \ ++ for (_i = 0; \ ++ (_i = __next_qtype(_i, _qtypes), \ ++ _q = &(_c)->quotas[_i], \ ++ _i < QTYP_NR); \ ++ _i++) ++ ++static bool ignore_hardlimit(struct bch_memquota_type *q) ++{ ++ if (capable(CAP_SYS_RESOURCE)) ++ return true; ++#if 0 ++ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; ++ ++ return capable(CAP_SYS_RESOURCE) && ++ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || ++ !(info->dqi_flags & DQF_ROOT_SQUASH)); ++#endif ++ return false; ++} ++ ++enum quota_msg { ++ SOFTWARN, /* Softlimit reached */ ++ SOFTLONGWARN, /* Grace time expired */ ++ HARDWARN, /* Hardlimit reached */ ++ ++ HARDBELOW, /* Usage got below inode hardlimit */ ++ SOFTBELOW, /* Usage got below inode softlimit */ ++}; ++ ++static int quota_nl[][Q_COUNTERS] = { ++ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, ++ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, ++ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, ++ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, ++ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, ++ ++ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, ++ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, ++ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, ++ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, ++ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, ++}; ++ ++struct quota_msgs { ++ u8 nr; ++ struct { ++ u8 qtype; ++ u8 msg; ++ } m[QTYP_NR * Q_COUNTERS]; ++}; ++ ++static void prepare_msg(unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); ++ ++ msgs->m[msgs->nr].qtype = qtype; ++ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; ++ msgs->nr++; ++} ++ ++static void prepare_warning(struct memquota_counter *qc, ++ unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ if (qc->warning_issued & (1 << msg_type)) ++ return; ++ ++ prepare_msg(qtype, counter, msgs, msg_type); ++} ++ ++static void flush_warnings(struct bch_qid qid, ++ struct super_block *sb, ++ struct quota_msgs *msgs) ++{ ++ unsigned i; ++ ++ for (i = 0; i < msgs->nr; i++) ++ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), ++ sb->s_dev, msgs->m[i].msg); ++} ++ ++static int bch2_quota_check_limit(struct bch_fs *c, ++ unsigned qtype, ++ struct bch_memquota *mq, ++ struct quota_msgs *msgs, ++ enum quota_counters counter, ++ s64 v, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q = &c->quotas[qtype]; ++ struct memquota_counter *qc = &mq->c[counter]; ++ u64 n = qc->v + v; ++ ++ BUG_ON((s64) n < 0); ++ ++ if (mode == KEY_TYPE_QUOTA_NOCHECK) ++ return 0; ++ ++ if (v <= 0) { ++ if (n < qc->hardlimit && ++ (qc->warning_issued & (1 << HARDWARN))) { ++ qc->warning_issued &= ~(1 << HARDWARN); ++ prepare_msg(qtype, counter, msgs, HARDBELOW); ++ } ++ ++ if (n < qc->softlimit && ++ (qc->warning_issued & (1 << SOFTWARN))) { ++ qc->warning_issued &= ~(1 << SOFTWARN); ++ prepare_msg(qtype, counter, msgs, SOFTBELOW); ++ } ++ ++ qc->warning_issued = 0; ++ return 0; ++ } ++ ++ if (qc->hardlimit && ++ qc->hardlimit < n && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, HARDWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer && ++ ktime_get_real_seconds() >= qc->timer && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, 
msgs, SOFTLONGWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer == 0) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); ++ ++ /* XXX is this the right one? */ ++ qc->timer = ktime_get_real_seconds() + ++ q->limits[counter].warnlimit; ++ } ++ ++ return 0; ++} ++ ++int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ unsigned qtypes = enabled_qtypes(c); ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq[QTYP_NR]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); ++ if (!mq[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mq[i]->c[counter].v += v; ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(qid, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static void __bch2_quota_transfer(struct bch_memquota *src_q, ++ struct bch_memquota *dst_q, ++ enum quota_counters counter, s64 v) ++{ ++ BUG_ON(v > src_q->c[counter].v); ++ BUG_ON(v + dst_q->c[counter].v < v); ++ ++ src_q->c[counter].v -= v; ++ dst_q->c[counter].v += v; ++} ++ ++int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q; ++ struct bch_memquota *src_q[3], *dst_q[3]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); ++ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); ++ ++ if (!src_q[i] || !dst_q[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, ++ dst_q[i]->c[Q_SPC].v + space, ++ mode); ++ if (ret) ++ goto err; ++ ++ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, ++ dst_q[i]->c[Q_INO].v + 1, ++ mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); ++ } ++ ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(dst, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq; ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq; ++ unsigned i; ++ ++ BUG_ON(k.k->p.inode >= QTYP_NR); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_quota: ++ dq = bkey_s_c_to_quota(k); ++ q = &c->quotas[k.k->p.inode]; ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); ++ if (!mq) { ++ mutex_unlock(&q->lock); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < Q_COUNTERS; i++) { ++ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); ++ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); ++ } ++ ++ mutex_unlock(&q->lock); ++ } ++ ++ return 0; ++} ++ ++static int bch2_quota_init_type(struct bch_fs *c, 
enum quota_types type) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->p.inode != type) ++ break; ++ ++ ret = __bch2_quota_set(c, k); ++ if (ret) ++ break; ++ } ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++void bch2_fs_quota_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ genradix_free(&c->quotas[i].table); ++} ++ ++void bch2_fs_quota_init(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ mutex_init(&c->quotas[i].lock); ++} ++ ++static void bch2_sb_quota_read(struct bch_fs *c) ++{ ++ struct bch_sb_field_quota *sb_quota; ++ unsigned i, j; ++ ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) ++ return; ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ struct bch_memquota_type *q = &c->quotas[i]; ++ ++ for (j = 0; j < Q_COUNTERS; j++) { ++ q->limits[j].timelimit = ++ le32_to_cpu(sb_quota->q[i].c[j].timelimit); ++ q->limits[j].warnlimit = ++ le32_to_cpu(sb_quota->q[i].c[j].warnlimit); ++ } ++ } ++} ++ ++int bch2_fs_quota_read(struct bch_fs *c) ++{ ++ unsigned i, qtypes = enabled_qtypes(c); ++ struct bch_memquota_type *q; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_inode_unpacked u; ++ struct bkey_s_c k; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ bch2_sb_quota_read(c); ++ mutex_unlock(&c->sb_lock); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ ret = bch2_quota_init_type(c, i); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); ++ if (ret) ++ return ret; ++ ++ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, ++ KEY_TYPE_QUOTA_NOCHECK); ++ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, ++ KEY_TYPE_QUOTA_NOCHECK); ++ } ++ } ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* Enable/disable/delete quotas for an entire filesystem: */ ++ ++static int bch2_quota_enable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ /* Accounting must be enabled at mount time: */ ++ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) ++ return -EINVAL; ++ ++ /* Can't enable enforcement without accounting: */ ++ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) ++ return -EINVAL; ++ ++ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) ++ return -EINVAL; ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_disable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); ++ ++ if 
(uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (uflags & FS_USER_QUOTA) { ++ if (c->opts.usrquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_USR, 0), ++ POS(QTYP_USR + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_GROUP_QUOTA) { ++ if (c->opts.grpquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_GRP, 0), ++ POS(QTYP_GRP + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_PROJ_QUOTA) { ++ if (c->opts.prjquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_PRJ, 0), ++ POS(QTYP_PRJ + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Return quota status information, such as enforcements, quota file inode ++ * numbers etc. ++ */ ++static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ unsigned qtypes = enabled_qtypes(c); ++ unsigned i; ++ ++ memset(state, 0, sizeof(*state)); ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ state->s_state[i].flags |= QCI_SYSFILE; ++ ++ if (!(qtypes & (1 << i))) ++ continue; ++ ++ state->s_state[i].flags |= QCI_ACCT_ENABLED; ++ ++ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; ++ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; ++ ++ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; ++ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Adjust quota timers & warnings ++ */ ++static int bch2_quota_set_info(struct super_block *sb, int type, ++ struct qc_info *info) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_sb_field_quota *sb_quota; ++ struct bch_memquota_type *q; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (type >= QTYP_NR) ++ return -EINVAL; ++ ++ if (!((1 << type) & enabled_qtypes(c))) ++ return -ESRCH; ++ ++ if (info->i_fieldmask & ++ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) ++ return -EINVAL; ++ ++ q = &c->quotas[type]; ++ ++ mutex_lock(&c->sb_lock); ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) { ++ sb_quota = bch2_sb_resize_quota(&c->disk_sb, ++ sizeof(*sb_quota) / sizeof(u64)); ++ if (!sb_quota) ++ return -ENOSPC; ++ } ++ ++ if (info->i_fieldmask & QC_SPC_TIMER) ++ sb_quota->q[type].c[Q_SPC].timelimit = ++ cpu_to_le32(info->i_spc_timelimit); ++ ++ if (info->i_fieldmask & QC_SPC_WARNS) ++ sb_quota->q[type].c[Q_SPC].warnlimit = ++ cpu_to_le32(info->i_spc_warnlimit); ++ ++ if (info->i_fieldmask & QC_INO_TIMER) ++ sb_quota->q[type].c[Q_INO].timelimit = ++ cpu_to_le32(info->i_ino_timelimit); ++ ++ if (info->i_fieldmask & QC_INO_WARNS) ++ sb_quota->q[type].c[Q_INO].warnlimit = ++ cpu_to_le32(info->i_ino_warnlimit); ++ ++ bch2_sb_quota_read(c); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++/* Get/set individual quotas: */ ++ ++static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) ++{ ++ dst->d_space = src->c[Q_SPC].v << 9; ++ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; ++ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; ++ dst->d_spc_timer = 
src->c[Q_SPC].timer; ++ dst->d_spc_warns = src->c[Q_SPC].warns; ++ ++ dst->d_ino_count = src->c[Q_INO].v; ++ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; ++ dst->d_ino_softlimit = src->c[Q_INO].softlimit; ++ dst->d_ino_timer = src->c[Q_INO].timer; ++ dst->d_ino_warns = src->c[Q_INO].warns; ++} ++ ++static int bch2_get_quota(struct super_block *sb, struct kqid kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid.type]; ++ qid_t qid = from_kqid(&init_user_ns, kqid); ++ struct bch_memquota *mq; ++ ++ memset(qdq, 0, sizeof(*qdq)); ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr(&q->table, qid); ++ if (mq) ++ __bch2_quota_get(qdq, mq); ++ mutex_unlock(&q->lock); ++ ++ return 0; ++} ++ ++static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid->type]; ++ qid_t qid = from_kqid(&init_user_ns, *kqid); ++ struct genradix_iter iter; ++ struct bch_memquota *mq; ++ int ret = 0; ++ ++ mutex_lock(&q->lock); ++ ++ genradix_for_each_from(&q->table, iter, mq, qid) ++ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { ++ __bch2_quota_get(qdq, mq); ++ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); ++ goto found; ++ } ++ ++ ret = -ENOENT; ++found: ++ mutex_unlock(&q->lock); ++ return ret; ++} ++ ++static int bch2_set_quota(struct super_block *sb, struct kqid qid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_i_quota new_quota; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ bkey_quota_init(&new_quota.k_i); ++ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_QUOTAS, new_quota.k.p, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ++ ret = bkey_err(k); ++ if (unlikely(ret)) ++ return ret; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_quota: ++ new_quota.v = *bkey_s_c_to_quota(k).v; ++ break; ++ } ++ ++ if (qdq->d_fieldmask & QC_SPC_SOFT) ++ new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); ++ if (qdq->d_fieldmask & QC_SPC_HARD) ++ new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); ++ ++ if (qdq->d_fieldmask & QC_INO_SOFT) ++ new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); ++ if (qdq->d_fieldmask & QC_INO_HARD) ++ new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); ++ ++ bch2_trans_update(&trans, iter, &new_quota.k_i); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return ret; ++ ++ ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); ++ ++ return ret; ++} ++ ++const struct quotactl_ops bch2_quotactl_operations = { ++ .quota_enable = bch2_quota_enable, ++ .quota_disable = bch2_quota_disable, ++ .rm_xquota = bch2_quota_remove, ++ ++ .get_state = bch2_quota_get_state, ++ .set_info = bch2_quota_set_info, ++ ++ .get_dqblk = bch2_get_quota, ++ .get_nextdqblk = bch2_get_next_quota, ++ .set_dqblk = bch2_set_quota, ++}; ++ ++#endif /* CONFIG_BCACHEFS_QUOTA */ +diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h +new file mode 100644 +index 000000000000..51e4f9713ef0 +--- /dev/null ++++ b/fs/bcachefs/quota.h +@@ -0,0 +1,71 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_H 
++#define _BCACHEFS_QUOTA_H ++ ++#include "inode.h" ++#include "quota_types.h" ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_quota; ++ ++const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_quota (struct bkey_ops) { \ ++ .key_invalid = bch2_quota_invalid, \ ++ .val_to_text = bch2_quota_to_text, \ ++} ++ ++static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) ++{ ++ return (struct bch_qid) { ++ .q[QTYP_USR] = u->bi_uid, ++ .q[QTYP_GRP] = u->bi_gid, ++ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, ++ }; ++} ++ ++static inline unsigned enabled_qtypes(struct bch_fs *c) ++{ ++ return ((c->opts.usrquota << QTYP_USR)| ++ (c->opts.grpquota << QTYP_GRP)| ++ (c->opts.prjquota << QTYP_PRJ)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, ++ s64, enum quota_acct_mode); ++ ++int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, ++ struct bch_qid, u64, enum quota_acct_mode); ++ ++void bch2_fs_quota_exit(struct bch_fs *); ++void bch2_fs_quota_init(struct bch_fs *); ++int bch2_fs_quota_read(struct bch_fs *); ++ ++extern const struct quotactl_ops bch2_quotactl_operations; ++ ++#else ++ ++static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline void bch2_fs_quota_exit(struct bch_fs *c) {} ++static inline void bch2_fs_quota_init(struct bch_fs *c) {} ++static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } ++ ++#endif ++ ++#endif /* _BCACHEFS_QUOTA_H */ +diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h +new file mode 100644 +index 000000000000..6a136083d389 +--- /dev/null ++++ b/fs/bcachefs/quota_types.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_TYPES_H ++#define _BCACHEFS_QUOTA_TYPES_H ++ ++#include ++ ++struct bch_qid { ++ u32 q[QTYP_NR]; ++}; ++ ++enum quota_acct_mode { ++ KEY_TYPE_QUOTA_PREALLOC, ++ KEY_TYPE_QUOTA_WARN, ++ KEY_TYPE_QUOTA_NOCHECK, ++}; ++ ++struct memquota_counter { ++ u64 v; ++ u64 hardlimit; ++ u64 softlimit; ++ s64 timer; ++ int warns; ++ int warning_issued; ++}; ++ ++struct bch_memquota { ++ struct memquota_counter c[Q_COUNTERS]; ++}; ++ ++typedef GENRADIX(struct bch_memquota) bch_memquota_table; ++ ++struct quota_limit { ++ u32 timelimit; ++ u32 warnlimit; ++}; ++ ++struct bch_memquota_type { ++ struct quota_limit limits[Q_COUNTERS]; ++ bch_memquota_table table; ++ struct mutex lock; ++}; ++ ++#endif /* _BCACHEFS_QUOTA_TYPES_H */ +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +new file mode 100644 +index 000000000000..84b3fb6eb101 +--- /dev/null ++++ b/fs/bcachefs/rebalance.c +@@ -0,0 +1,332 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "extents.h" ++#include "io.h" ++#include "move.h" ++#include "rebalance.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++ ++static inline bool rebalance_ptr_pred(struct bch_fs *c, ++ struct extent_ptr_decoded p, ++ struct bch_io_opts 
*io_opts) ++{ ++ if (io_opts->background_target && ++ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) && ++ !p.ptr.cached) ++ return true; ++ ++ if (io_opts->background_compression && ++ p.crc.compression_type != ++ bch2_compression_opt_to_type[io_opts->background_compression]) ++ return true; ++ ++ return false; ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ if (!io_opts->background_target && ++ !io_opts->background_compression) ++ return; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (rebalance_ptr_pred(c, p, io_opts)) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ if (atomic64_add_return(p.crc.compressed_size, ++ &ca->rebalance_work) == ++ p.crc.compressed_size) ++ rebalance_wakeup(c); ++ } ++} ++ ++void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) ++{ ++ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == ++ sectors) ++ rebalance_wakeup(c); ++} ++ ++static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned nr_replicas = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ nr_replicas += !p.ptr.cached; ++ ++ if (rebalance_ptr_pred(c, p, io_opts)) ++ goto found; ++ } ++ ++ if (nr_replicas < io_opts->data_replicas) ++ goto found; ++ ++ return DATA_SKIP; ++found: ++ data_opts->target = io_opts->background_target; ++ data_opts->btree_insert_flags = 0; ++ return DATA_ADD_REPLICAS; ++} ++ ++struct rebalance_work { ++ int dev_most_full_idx; ++ unsigned dev_most_full_percent; ++ u64 dev_most_full_work; ++ u64 dev_most_full_capacity; ++ u64 total_work; ++}; ++ ++static void rebalance_work_accumulate(struct rebalance_work *w, ++ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) ++{ ++ unsigned percent_full; ++ u64 work = dev_work + unknown_dev; ++ ++ if (work < dev_work || work < unknown_dev) ++ work = U64_MAX; ++ work = min(work, capacity); ++ ++ percent_full = div64_u64(work * 100, capacity); ++ ++ if (percent_full >= w->dev_most_full_percent) { ++ w->dev_most_full_idx = idx; ++ w->dev_most_full_percent = percent_full; ++ w->dev_most_full_work = work; ++ w->dev_most_full_capacity = capacity; ++ } ++ ++ if (w->total_work + dev_work >= w->total_work && ++ w->total_work + dev_work >= dev_work) ++ w->total_work += dev_work; ++} ++ ++static struct rebalance_work rebalance_work(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct rebalance_work ret = { .dev_most_full_idx = -1 }; ++ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ rebalance_work_accumulate(&ret, ++ atomic64_read(&ca->rebalance_work), ++ unknown_dev, ++ bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket), ++ i); ++ ++ rebalance_work_accumulate(&ret, ++ unknown_dev, 0, c->capacity, -1); ++ ++ return ret; ++} ++ ++static void rebalance_work_reset(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ atomic64_set(&ca->rebalance_work, 0); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, 0); ++} ++ ++static unsigned long curr_cputime(void) ++{ ++ u64 utime, stime; ++ ++ task_cputime_adjusted(current, &utime, 
&stime); ++ return nsecs_to_jiffies(utime + stime); ++} ++ ++static int bch2_rebalance_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ struct rebalance_work w, p; ++ unsigned long start, prev_start; ++ unsigned long prev_run_time, prev_run_cputime; ++ unsigned long cputime, prev_cputime; ++ unsigned long io_start; ++ long throttle; ++ ++ set_freezable(); ++ ++ io_start = atomic_long_read(&clock->now); ++ p = rebalance_work(c); ++ prev_start = jiffies; ++ prev_cputime = curr_cputime(); ++ ++ while (!kthread_wait_freezable(r->enabled)) { ++ start = jiffies; ++ cputime = curr_cputime(); ++ ++ prev_run_time = start - prev_start; ++ prev_run_cputime = cputime - prev_cputime; ++ ++ w = rebalance_work(c); ++ BUG_ON(!w.dev_most_full_capacity); ++ ++ if (!w.total_work) { ++ r->state = REBALANCE_WAITING; ++ kthread_wait_freezable(rebalance_work(c).total_work); ++ continue; ++ } ++ ++ /* ++ * If there isn't much work to do, throttle cpu usage: ++ */ ++ throttle = prev_run_cputime * 100 / ++ max(1U, w.dev_most_full_percent) - ++ prev_run_time; ++ ++ if (w.dev_most_full_percent < 20 && throttle > 0) { ++ r->state = REBALANCE_THROTTLED; ++ r->throttled_until_iotime = io_start + ++ div_u64(w.dev_most_full_capacity * ++ (20 - w.dev_most_full_percent), ++ 50); ++ r->throttled_until_cputime = start + throttle; ++ ++ bch2_kthread_io_clock_wait(clock, ++ r->throttled_until_iotime, ++ throttle); ++ continue; ++ } ++ ++ /* minimum 1 mb/sec: */ ++ r->pd.rate.rate = ++ max_t(u64, 1 << 11, ++ r->pd.rate.rate * ++ max(p.dev_most_full_percent, 1U) / ++ max(w.dev_most_full_percent, 1U)); ++ ++ io_start = atomic_long_read(&clock->now); ++ p = w; ++ prev_start = start; ++ prev_cputime = cputime; ++ ++ r->state = REBALANCE_RUNNING; ++ memset(&r->move_stats, 0, sizeof(r->move_stats)); ++ rebalance_work_reset(c); ++ ++ bch2_move_data(c, ++ /* ratelimiting disabled for now */ ++ NULL, /* &r->pd.rate, */ ++ writepoint_ptr(&c->rebalance_write_point), ++ POS_MIN, POS_MAX, ++ rebalance_pred, NULL, ++ &r->move_stats); ++ } ++ ++ return 0; ++} ++ ++ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct rebalance_work w = rebalance_work(c); ++ char h1[21], h2[21]; ++ ++ bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); ++ bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); ++ pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", ++ w.dev_most_full_idx, h1, h2); ++ ++ bch2_hprint(&PBUF(h1), w.total_work << 9); ++ bch2_hprint(&PBUF(h2), c->capacity << 9); ++ pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); ++ ++ pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); ++ ++ switch (r->state) { ++ case REBALANCE_WAITING: ++ pr_buf(&out, "waiting\n"); ++ break; ++ case REBALANCE_THROTTLED: ++ bch2_hprint(&PBUF(h1), ++ (r->throttled_until_iotime - ++ atomic_long_read(&c->io_clock[WRITE].now)) << 9); ++ pr_buf(&out, "throttled for %lu sec or %s io\n", ++ (r->throttled_until_cputime - jiffies) / HZ, ++ h1); ++ break; ++ case REBALANCE_RUNNING: ++ pr_buf(&out, "running\n"); ++ pr_buf(&out, "pos %llu:%llu\n", ++ r->move_stats.pos.inode, ++ r->move_stats.pos.offset); ++ break; ++ } ++ ++ return out.pos - buf; ++} ++ ++void bch2_rebalance_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ c->rebalance.pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&c->rebalance.pd.rate); ++ ++ p = rcu_dereference_protected(c->rebalance.thread, 1); ++ 
c->rebalance.thread = NULL; ++ ++ if (p) { ++ /* for sychronizing with rebalance_wakeup() */ ++ synchronize_rcu(); ++ ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_rebalance_start(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ rcu_assign_pointer(c->rebalance.thread, p); ++ wake_up_process(p); ++ return 0; ++} ++ ++void bch2_fs_rebalance_init(struct bch_fs *c) ++{ ++ bch2_pd_controller_init(&c->rebalance.pd); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); ++} +diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h +new file mode 100644 +index 000000000000..99e2a1fb6084 +--- /dev/null ++++ b/fs/bcachefs/rebalance.h +@@ -0,0 +1,28 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_H ++#define _BCACHEFS_REBALANCE_H ++ ++#include "rebalance_types.h" ++ ++static inline void rebalance_wakeup(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = rcu_dereference(c->rebalance.thread); ++ if (p) ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_opts *); ++void bch2_rebalance_add_work(struct bch_fs *, u64); ++ ++ssize_t bch2_rebalance_work_show(struct bch_fs *, char *); ++ ++void bch2_rebalance_stop(struct bch_fs *); ++int bch2_rebalance_start(struct bch_fs *); ++void bch2_fs_rebalance_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REBALANCE_H */ +diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h +new file mode 100644 +index 000000000000..192c6be20ced +--- /dev/null ++++ b/fs/bcachefs/rebalance_types.h +@@ -0,0 +1,27 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_TYPES_H ++#define _BCACHEFS_REBALANCE_TYPES_H ++ ++#include "move_types.h" ++ ++enum rebalance_state { ++ REBALANCE_WAITING, ++ REBALANCE_THROTTLED, ++ REBALANCE_RUNNING, ++}; ++ ++struct bch_fs_rebalance { ++ struct task_struct __rcu *thread; ++ struct bch_pd_controller pd; ++ ++ atomic64_t work_unknown_dev; ++ ++ enum rebalance_state state; ++ unsigned long throttled_until_iotime; ++ unsigned long throttled_until_cputime; ++ struct bch_move_stats move_stats; ++ ++ unsigned enabled:1; ++}; ++ ++#endif /* _BCACHEFS_REBALANCE_TYPES_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +new file mode 100644 +index 000000000000..2c441a278044 +--- /dev/null ++++ b/fs/bcachefs/recovery.c +@@ -0,0 +1,1047 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "buckets.h" ++#include "dirent.h" ++#include "ec.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "quota.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++/* iterate over keys read from the journal: */ ++ ++struct journal_iter bch2_journal_iter_init(struct journal_keys *keys, ++ enum btree_id id) ++{ ++ return (struct journal_iter) { ++ .keys = keys, ++ .k = keys->d, ++ .btree_id = id, ++ }; ++} ++ ++struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) ++{ ++ while 
(1) { ++ if (iter->k == iter->keys->d + iter->keys->nr) ++ return bkey_s_c_null; ++ ++ if (iter->k->btree_id == iter->btree_id) ++ return bkey_i_to_s_c(iter->k->k); ++ ++ iter->k++; ++ } ++ ++ return bkey_s_c_null; ++} ++ ++struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter) ++{ ++ if (iter->k == iter->keys->d + iter->keys->nr) ++ return bkey_s_c_null; ++ ++ iter->k++; ++ return bch2_journal_iter_peek(iter); ++} ++ ++/* sort and dedup all keys in the journal: */ ++ ++static void journal_entries_free(struct list_head *list) ++{ ++ ++ while (!list_empty(list)) { ++ struct journal_replay *i = ++ list_first_entry(list, struct journal_replay, list); ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } ++} ++ ++static int journal_sort_key_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ bkey_cmp(l->pos, r->pos) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->journal_offset, r->journal_offset); ++} ++ ++static int journal_sort_seq_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->btree_id, r->btree_id) ?: ++ bkey_cmp(l->pos, r->pos); ++} ++ ++static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i) ++{ ++ while (i + 1 < keys->d + keys->nr && ++ journal_sort_key_cmp(i, i + 1) > 0) { ++ swap(i[0], i[1]); ++ i++; ++ } ++} ++ ++static void journal_keys_free(struct journal_keys *keys) ++{ ++ struct journal_key *i; ++ ++ for_each_journal_key(*keys, i) ++ if (i->allocated) ++ kfree(i->k); ++ kvfree(keys->d); ++ keys->d = NULL; ++ keys->nr = 0; ++} ++ ++static struct journal_keys journal_keys_sort(struct list_head *journal_entries) ++{ ++ struct journal_replay *p; ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct journal_keys keys = { NULL }, keys_deduped = { NULL }; ++ struct journal_key *i; ++ size_t nr_keys = 0; ++ ++ list_for_each_entry(p, journal_entries, list) ++ for_each_jset_key(k, _n, entry, &p->j) ++ nr_keys++; ++ ++ keys.journal_seq_base = keys_deduped.journal_seq_base = ++ le64_to_cpu(list_first_entry(journal_entries, ++ struct journal_replay, ++ list)->j.seq); ++ ++ keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); ++ if (!keys.d) ++ goto err; ++ ++ keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL); ++ if (!keys_deduped.d) ++ goto err; ++ ++ list_for_each_entry(p, journal_entries, list) ++ for_each_jset_key(k, _n, entry, &p->j) ++ keys.d[keys.nr++] = (struct journal_key) { ++ .btree_id = entry->btree_id, ++ .pos = bkey_start_pos(&k->k), ++ .k = k, ++ .journal_seq = le64_to_cpu(p->j.seq) - ++ keys.journal_seq_base, ++ .journal_offset = k->_data - p->j._data, ++ }; ++ ++ sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); ++ ++ i = keys.d; ++ while (i < keys.d + keys.nr) { ++ if (i + 1 < keys.d + keys.nr && ++ i[0].btree_id == i[1].btree_id && ++ !bkey_cmp(i[0].pos, i[1].pos)) { ++ if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { ++ i++; ++ } else { ++ bch2_cut_front(i[1].k->k.p, i[0].k); ++ i[0].pos = i[1].k->k.p; ++ journal_keys_sift(&keys, i); ++ } ++ continue; ++ } ++ ++ if (i + 1 < keys.d + keys.nr && ++ i[0].btree_id == i[1].btree_id && ++ bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) { ++ if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: ++ 
cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { ++ if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { ++ bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k); ++ } else { ++ struct bkey_i *split = ++ kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); ++ ++ if (!split) ++ goto err; ++ ++ bkey_copy(split, i[0].k); ++ bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k); ++ keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { ++ .btree_id = i[0].btree_id, ++ .allocated = true, ++ .pos = bkey_start_pos(&split->k), ++ .k = split, ++ .journal_seq = i[0].journal_seq, ++ .journal_offset = i[0].journal_offset, ++ }; ++ ++ bch2_cut_front(i[1].k->k.p, i[0].k); ++ i[0].pos = i[1].k->k.p; ++ journal_keys_sift(&keys, i); ++ continue; ++ } ++ } else { ++ if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) { ++ i[1] = i[0]; ++ i++; ++ continue; ++ } else { ++ bch2_cut_front(i[0].k->k.p, i[1].k); ++ i[1].pos = i[0].k->k.p; ++ journal_keys_sift(&keys, i + 1); ++ continue; ++ } ++ } ++ } ++ ++ keys_deduped.d[keys_deduped.nr++] = *i++; ++ } ++ ++ kvfree(keys.d); ++ return keys_deduped; ++err: ++ journal_keys_free(&keys_deduped); ++ kvfree(keys.d); ++ return (struct journal_keys) { NULL }; ++} ++ ++/* journal replay: */ ++ ++static void replay_now_at(struct journal *j, u64 seq) ++{ ++ BUG_ON(seq < j->replay_journal_seq); ++ BUG_ON(seq > j->replay_journal_seq_end); ++ ++ while (j->replay_journal_seq < seq) ++ bch2_journal_pin_put(j, j->replay_journal_seq++); ++} ++ ++static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, ++ struct bkey_i *k) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter, *split_iter; ++ /* ++ * We might cause compressed extents to be split, so we need to pass in ++ * a disk_reservation: ++ */ ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i *split; ++ struct bpos atomic_end; ++ /* ++ * Some extents aren't equivalent - w.r.t. 
what the triggers do ++ * - if they're split: ++ */ ++ bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) || ++ k->k.type == KEY_TYPE_reflink_p; ++ bool remark = false; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ ++ do { ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto err; ++ ++ atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); ++ ++ split_iter = bch2_trans_copy_iter(&trans, iter); ++ ret = PTR_ERR_OR_ZERO(split_iter); ++ if (ret) ++ goto err; ++ ++ split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); ++ ret = PTR_ERR_OR_ZERO(split); ++ if (ret) ++ goto err; ++ ++ if (!remark && ++ remark_if_split && ++ bkey_cmp(atomic_end, k->k.p) < 0) { ++ ret = bch2_disk_reservation_add(c, &disk_res, ++ k->k.size * ++ bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)), ++ BCH_DISK_RESERVATION_NOFAIL); ++ BUG_ON(ret); ++ ++ remark = true; ++ } ++ ++ bkey_copy(split, k); ++ bch2_cut_front(split_iter->pos, split); ++ bch2_cut_back(atomic_end, &split->k); ++ ++ bch2_trans_update(&trans, split_iter, split); ++ bch2_btree_iter_set_pos(iter, split->k.p); ++ } while (bkey_cmp(iter->pos, k->k.p) < 0); ++ ++ if (remark) { ++ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), ++ 0, -((s64) k->k.size), ++ BCH_BUCKET_MARK_OVERWRITE) ?: ++ bch2_trans_commit(&trans, &disk_res, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOMARK_OVERWRITES| ++ BTREE_INSERT_NO_CLEAR_REPLICAS); ++ } else { ++ ret = bch2_trans_commit(&trans, &disk_res, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY| ++ BTREE_INSERT_NOMARK); ++ } ++ ++ if (ret) ++ goto err; ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++static int bch2_journal_replay(struct bch_fs *c, ++ struct journal_keys keys) ++{ ++ struct journal *j = &c->journal; ++ struct journal_key *i; ++ int ret; ++ ++ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); ++ ++ for_each_journal_key(keys, i) { ++ replay_now_at(j, keys.journal_seq_base + i->journal_seq); ++ ++ if (i->btree_id == BTREE_ID_ALLOC) ++ ret = bch2_alloc_replay_key(c, i->k); ++ else if (btree_node_type_is_extents(i->btree_id)) ++ ret = bch2_extent_replay_key(c, i->btree_id, i->k); ++ else ++ ret = bch2_btree_insert(c, i->btree_id, i->k, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY| ++ BTREE_INSERT_NOMARK); ++ ++ if (ret) { ++ bch_err(c, "journal replay: error %d while replaying key", ++ ret); ++ return ret; ++ } ++ ++ cond_resched(); ++ } ++ ++ replay_now_at(j, j->replay_journal_seq_end); ++ j->replay_journal_seq = 0; ++ ++ bch2_journal_set_replay_done(j); ++ bch2_journal_flush_all_pins(j); ++ return bch2_journal_error(j); ++} ++ ++static bool journal_empty(struct list_head *journal) ++{ ++ return list_empty(journal) || ++ journal_entry_empty(&list_last_entry(journal, ++ struct journal_replay, list)->j); ++} ++ ++static int ++verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, ++ struct list_head *journal) ++{ ++ struct journal_replay *i = ++ list_last_entry(journal, struct journal_replay, list); ++ u64 start_seq = le64_to_cpu(i->j.last_seq); ++ u64 end_seq = le64_to_cpu(i->j.seq); ++ u64 seq = start_seq; ++ int ret = 0; ++ ++ list_for_each_entry(i, 
journal, list) { ++ fsck_err_on(seq != le64_to_cpu(i->j.seq), c, ++ "journal entries %llu-%llu missing! (replaying %llu-%llu)", ++ seq, le64_to_cpu(i->j.seq) - 1, ++ start_seq, end_seq); ++ ++ seq = le64_to_cpu(i->j.seq); ++ ++ fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, ++ "found blacklisted journal entry %llu", seq); ++ ++ do { ++ seq++; ++ } while (bch2_journal_seq_is_blacklisted(c, seq, false)); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* journal replay early: */ ++ ++static int journal_replay_entry_early(struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ int ret = 0; ++ ++ switch (entry->type) { ++ case BCH_JSET_ENTRY_btree_root: { ++ struct btree_root *r; ++ ++ if (entry->btree_id >= BTREE_ID_NR) { ++ bch_err(c, "filesystem has unknown btree type %u", ++ entry->btree_id); ++ return -EINVAL; ++ } ++ ++ r = &c->btree_roots[entry->btree_id]; ++ ++ if (entry->u64s) { ++ r->level = entry->level; ++ bkey_copy(&r->key, &entry->start[0]); ++ r->error = 0; ++ } else { ++ r->error = -EIO; ++ } ++ r->alive = true; ++ break; ++ } ++ case BCH_JSET_ENTRY_usage: { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ switch (entry->btree_id) { ++ case FS_USAGE_RESERVED: ++ if (entry->level < BCH_REPLICAS_MAX) ++ c->usage_base->persistent_reserved[entry->level] = ++ le64_to_cpu(u->v); ++ break; ++ case FS_USAGE_INODES: ++ c->usage_base->nr_inodes = le64_to_cpu(u->v); ++ break; ++ case FS_USAGE_KEY_VERSION: ++ atomic64_set(&c->key_version, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ ++ break; ++ } ++ case BCH_JSET_ENTRY_data_usage: { ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ret = bch2_replicas_set_usage(c, &u->r, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist: { ++ struct jset_entry_blacklist *bl_entry = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->seq), ++ le64_to_cpu(bl_entry->seq) + 1); ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist_v2: { ++ struct jset_entry_blacklist_v2 *bl_entry = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->start), ++ le64_to_cpu(bl_entry->end) + 1); ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static int journal_replay_early(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct list_head *journal) ++{ ++ struct jset_entry *entry; ++ int ret; ++ ++ if (clean) { ++ c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); ++ c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } else { ++ struct journal_replay *i = ++ list_last_entry(journal, struct journal_replay, list); ++ ++ c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); ++ c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); ++ ++ list_for_each_entry(i, journal, list) ++ vstruct_for_each(&i->j, entry) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ bch2_fs_usage_initialize(c); ++ ++ return 0; ++} ++ ++/* sb clean section: */ ++ ++static struct bkey_i *btree_root_find(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct jset *j, ++ enum btree_id id, unsigned *level) ++{ ++ struct bkey_i *k; ++ struct jset_entry *entry, *start, 
*end; ++ ++ if (clean) { ++ start = clean->start; ++ end = vstruct_end(&clean->field); ++ } else { ++ start = j->start; ++ end = vstruct_last(j); ++ } ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root && ++ entry->btree_id == id) ++ goto found; ++ ++ return NULL; ++found: ++ if (!entry->u64s) ++ return ERR_PTR(-EINVAL); ++ ++ k = entry->start; ++ *level = entry->level; ++ return k; ++} ++ ++static int verify_superblock_clean(struct bch_fs *c, ++ struct bch_sb_field_clean **cleanp, ++ struct jset *j) ++{ ++ unsigned i; ++ struct bch_sb_field_clean *clean = *cleanp; ++ int ret = 0; ++ ++ if (!c->sb.clean || !j) ++ return 0; ++ ++ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, ++ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", ++ le64_to_cpu(clean->journal_seq), ++ le64_to_cpu(j->seq))) { ++ kfree(clean); ++ *cleanp = NULL; ++ return 0; ++ } ++ ++ mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, ++ "superblock read clock doesn't match journal after clean shutdown"); ++ mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, ++ "superblock read clock doesn't match journal after clean shutdown"); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct bkey_i *k1, *k2; ++ unsigned l1 = 0, l2 = 0; ++ ++ k1 = btree_root_find(c, clean, NULL, i, &l1); ++ k2 = btree_root_find(c, NULL, j, i, &l2); ++ ++ if (!k1 && !k2) ++ continue; ++ ++ mustfix_fsck_err_on(!k1 || !k2 || ++ IS_ERR(k1) || ++ IS_ERR(k2) || ++ k1->k.u64s != k2->k.u64s || ++ memcmp(k1, k2, bkey_bytes(k1)) || ++ l1 != l2, c, ++ "superblock btree root doesn't match journal after clean shutdown"); ++ } ++fsck_err: ++ return ret; ++} ++ ++static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *clean, *sb_clean; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); ++ ++ if (fsck_err_on(!sb_clean, c, ++ "superblock marked clean but clean section not present")) { ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ mutex_unlock(&c->sb_lock); ++ return NULL; ++ } ++ ++ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), ++ GFP_KERNEL); ++ if (!clean) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ if (le16_to_cpu(c->disk_sb.sb->version) < ++ bcachefs_metadata_version_bkey_renumber) ++ bch2_sb_clean_renumber(clean, READ); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return clean; ++fsck_err: ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++} ++ ++static int read_btree_roots(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret = 0; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_root *r = &c->btree_roots[i]; ++ ++ if (!r->alive) ++ continue; ++ ++ if (i == BTREE_ID_ALLOC && ++ c->opts.reconstruct_alloc) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ continue; ++ } ++ ++ ++ if (r->error) { ++ __fsck_err(c, i == BTREE_ID_ALLOC ++ ? FSCK_CAN_IGNORE : 0, ++ "invalid btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_ALLOC) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ } ++ ++ ret = bch2_btree_root_read(c, i, &r->key, r->level); ++ if (ret) { ++ __fsck_err(c, i == BTREE_ID_ALLOC ++ ? 
FSCK_CAN_IGNORE : 0, ++ "error reading btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_ALLOC) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ } ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (!c->btree_roots[i].b) ++ bch2_btree_root_alloc(c, i); ++fsck_err: ++ return ret; ++} ++ ++int bch2_fs_recovery(struct bch_fs *c) ++{ ++ const char *err = "cannot allocate memory"; ++ struct bch_sb_field_clean *clean = NULL; ++ u64 journal_seq; ++ LIST_HEAD(journal_entries); ++ struct journal_keys journal_keys = { NULL }; ++ bool wrote = false, write_sb = false; ++ int ret; ++ ++ if (c->sb.clean) ++ clean = read_superblock_clean(c); ++ ret = PTR_ERR_OR_ZERO(clean); ++ if (ret) ++ goto err; ++ ++ if (c->sb.clean) ++ bch_info(c, "recovering from clean shutdown, journal seq %llu", ++ le64_to_cpu(clean->journal_seq)); ++ ++ if (!c->replicas.entries) { ++ bch_info(c, "building replicas info"); ++ set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); ++ } ++ ++ if (!c->sb.clean || c->opts.fsck) { ++ struct jset *j; ++ ++ ret = bch2_journal_read(c, &journal_entries); ++ if (ret) ++ goto err; ++ ++ if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c, ++ "filesystem marked clean but journal not empty")) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ } ++ ++ if (!c->sb.clean && list_empty(&journal_entries)) { ++ bch_err(c, "no journal entries found"); ++ ret = BCH_FSCK_REPAIR_IMPOSSIBLE; ++ goto err; ++ } ++ ++ journal_keys = journal_keys_sort(&journal_entries); ++ if (!journal_keys.d) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ j = &list_last_entry(&journal_entries, ++ struct journal_replay, list)->j; ++ ++ ret = verify_superblock_clean(c, &clean, j); ++ if (ret) ++ goto err; ++ ++ journal_seq = le64_to_cpu(j->seq) + 1; ++ } else { ++ journal_seq = le64_to_cpu(clean->journal_seq) + 1; ++ } ++ ++ ret = journal_replay_early(c, clean, &journal_entries); ++ if (ret) ++ goto err; ++ ++ if (!c->sb.clean) { ++ ret = bch2_journal_seq_blacklist_add(c, ++ journal_seq, ++ journal_seq + 4); ++ if (ret) { ++ bch_err(c, "error creating new journal seq blacklist entry"); ++ goto err; ++ } ++ ++ journal_seq += 4; ++ } ++ ++ ret = bch2_blacklist_table_initialize(c); ++ ++ if (!list_empty(&journal_entries)) { ++ ret = verify_journal_entries_not_blacklisted_or_missing(c, ++ &journal_entries); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_fs_journal_start(&c->journal, journal_seq, ++ &journal_entries); ++ if (ret) ++ goto err; ++ ++ ret = read_btree_roots(c); ++ if (ret) ++ goto err; ++ ++ bch_verbose(c, "starting alloc read"); ++ err = "error reading allocation information"; ++ ret = bch2_alloc_read(c, &journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "alloc read done"); ++ ++ bch_verbose(c, "starting stripes_read"); ++ err = "error reading stripes"; ++ ret = bch2_stripes_read(c, &journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "stripes_read done"); ++ ++ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ++ ++ if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && ++ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { ++ /* ++ * interior btree node updates aren't consistent with the ++ * journal; after an unclean shutdown we have to walk all ++ * pointers to metadata: ++ */ ++ bch_info(c, "starting metadata mark and sweep"); ++ err = "error in mark and sweep"; ++ ret = bch2_gc(c, NULL, true, true); ++ if (ret) ++ goto err; ++ bch_verbose(c, "mark and sweep done"); ++ } ++ ++ 
if (c->opts.fsck || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || ++ test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { ++ bch_info(c, "starting mark and sweep"); ++ err = "error in mark and sweep"; ++ ret = bch2_gc(c, &journal_keys, true, false); ++ if (ret) ++ goto err; ++ bch_verbose(c, "mark and sweep done"); ++ } ++ ++ clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ ++ /* ++ * Skip past versions that might have possibly been used (as nonces), ++ * but hadn't had their pointers written: ++ */ ++ if (c->sb.encryption_type && !c->sb.clean) ++ atomic64_add(1 << 16, &c->key_version); ++ ++ if (c->opts.norecovery) ++ goto out; ++ ++ bch_verbose(c, "starting journal replay"); ++ err = "journal replay failed"; ++ ret = bch2_journal_replay(c, journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "journal replay done"); ++ ++ if (!c->opts.nochanges) { ++ /* ++ * note that even when filesystem was clean there might be work ++ * to do here, if we ran gc (because of fsck) which recalculated ++ * oldest_gen: ++ */ ++ bch_verbose(c, "writing allocation info"); ++ err = "error writing out alloc info"; ++ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: ++ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); ++ if (ret) { ++ bch_err(c, "error writing alloc info"); ++ goto err; ++ } ++ bch_verbose(c, "alloc write done"); ++ ++ set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); ++ } ++ ++ if (!c->sb.clean) { ++ if (!(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { ++ bch_info(c, "checking inode link counts"); ++ err = "error in recovery"; ++ ret = bch2_fsck_inode_nlink(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); ++ ++ } else { ++ bch_verbose(c, "checking for deleted inodes"); ++ err = "error in recovery"; ++ ret = bch2_fsck_walk_inodes_only(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); ++ } ++ } ++ ++ if (c->opts.fsck) { ++ bch_info(c, "starting fsck"); ++ err = "error in fsck"; ++ ret = bch2_fsck_full(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "fsck done"); ++ } ++ ++ if (enabled_qtypes(c)) { ++ bch_verbose(c, "reading quotas"); ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "quotas done"); ++ } ++ ++ mutex_lock(&c->sb_lock); ++ if (c->opts.version_upgrade) { ++ if (c->sb.version < bcachefs_metadata_version_new_versioning) ++ c->disk_sb.sb->version_min = ++ le16_to_cpu(bcachefs_metadata_version_min); ++ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); ++ write_sb = true; ++ } ++ ++ if (!test_bit(BCH_FS_ERROR, &c->flags)) { ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; ++ write_sb = true; ++ } ++ ++ if (c->opts.fsck && ++ !test_bit(BCH_FS_ERROR, &c->flags)) { ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); ++ write_sb = true; ++ } ++ ++ if (write_sb) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (c->journal_seq_blacklist_table && ++ c->journal_seq_blacklist_table->nr > 128) ++ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); ++out: ++ ret = 0; ++err: ++fsck_err: ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ bch2_flush_fsck_errs(c); ++ ++ journal_keys_free(&journal_keys); ++ journal_entries_free(&journal_entries); ++ kfree(clean); ++ if (ret) ++ bch_err(c, "Error in recovery: %s (%i)", err, ret); ++ else ++ bch_verbose(c, "ret %i", ret); ++ return ret; ++} ++ ++int bch2_fs_initialize(struct bch_fs *c) ++{ ++ struct 
bch_inode_unpacked root_inode, lostfound_inode; ++ struct bkey_inode_buf packed_inode; ++ struct qstr lostfound = QSTR("lost+found"); ++ const char *err = "cannot allocate memory"; ++ struct bch_dev *ca; ++ LIST_HEAD(journal); ++ unsigned i; ++ int ret; ++ ++ bch_notice(c, "initializing new filesystem"); ++ ++ mutex_lock(&c->sb_lock); ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, 0); ++ mutex_unlock(&c->sb_lock); ++ ++ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ bch2_btree_root_alloc(c, i); ++ ++ err = "unable to allocate journal buckets"; ++ for_each_online_member(ca, c, i) { ++ ret = bch2_dev_journal_alloc(ca); ++ if (ret) { ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ } ++ ++ /* ++ * journal_res_get() will crash if called before this has ++ * set up the journal.pin FIFO and journal.cur pointer: ++ */ ++ bch2_fs_journal_start(&c->journal, 1, &journal); ++ bch2_journal_set_replay_done(&c->journal); ++ ++ err = "error going read write"; ++ ret = __bch2_fs_read_write(c, true); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init(c, &root_inode, 0, 0, ++ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); ++ root_inode.bi_inum = BCACHEFS_ROOT_INO; ++ bch2_inode_pack(&packed_inode, &root_inode); ++ ++ err = "error creating root directory"; ++ ret = bch2_btree_insert(c, BTREE_ID_INODES, ++ &packed_inode.inode.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init_early(c, &lostfound_inode); ++ ++ err = "error creating lost+found"; ++ ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, ++ bch2_create_trans(&trans, BCACHEFS_ROOT_INO, ++ &root_inode, &lostfound_inode, ++ &lostfound, ++ 0, 0, S_IFDIR|0755, 0, ++ NULL, NULL)); ++ if (ret) ++ goto err; ++ ++ if (enabled_qtypes(c)) { ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ } ++ ++ err = "error writing first journal entry"; ++ ret = bch2_journal_meta(&c->journal); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version = c->disk_sb.sb->version_min = ++ le16_to_cpu(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; ++ ++ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++err: ++ pr_err("Error initializing new filesystem: %s (%i)", err, ret); ++ return ret; ++} +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +new file mode 100644 +index 000000000000..479ea46f8dcb +--- /dev/null ++++ b/fs/bcachefs/recovery.h +@@ -0,0 +1,35 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_RECOVERY_H ++#define _BCACHEFS_RECOVERY_H ++ ++struct journal_keys { ++ struct journal_key { ++ enum btree_id btree_id:8; ++ unsigned allocated:1; ++ struct bpos pos; ++ struct bkey_i *k; ++ u32 journal_seq; ++ u32 journal_offset; ++ } *d; ++ size_t nr; ++ u64 journal_seq_base; ++}; ++ ++#define for_each_journal_key(keys, i) \ ++ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) ++ ++struct journal_iter { ++ struct journal_keys *keys; ++ struct journal_key *k; ++ enum btree_id btree_id; ++}; ++ ++struct journal_iter bch2_journal_iter_init(struct journal_keys *, ++ enum btree_id); ++struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *); ++struct bkey_s_c bch2_journal_iter_next(struct journal_iter *); ++ ++int bch2_fs_recovery(struct bch_fs *); ++int bch2_fs_initialize(struct bch_fs *); ++ ++#endif /* _BCACHEFS_RECOVERY_H */ 
+diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +new file mode 100644 +index 000000000000..6d45ae24479d +--- /dev/null ++++ b/fs/bcachefs/reflink.c +@@ -0,0 +1,304 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "inode.h" ++#include "io.h" ++#include "reflink.h" ++ ++#include ++ ++/* reflink pointers */ ++ ++const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ if (bkey_val_bytes(p.k) != sizeof(*p.v)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); ++} ++ ++enum merge_result bch2_reflink_p_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); ++ struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); ++ ++ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) ++ return BCH_MERGE_NOMERGE; ++ ++ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { ++ bch2_key_resize(l.k, KEY_SIZE_MAX); ++ __bch2_cut_front(l.k->p, _r); ++ return BCH_MERGE_PARTIAL; ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* indirect extents */ ++ ++const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ if (bkey_val_bytes(r.k) < sizeof(*r.v)) ++ return "incorrect value size"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); ++ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++static int bch2_make_extent_indirect(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct bkey_i_extent *e) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *reflink_iter; ++ struct bkey_s_c k; ++ struct bkey_i_reflink_v *r_v; ++ struct bkey_i_reflink_p *r_p; ++ int ret; ++ ++ for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, ++ POS(0, c->reflink_hint), ++ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { ++ if (reflink_iter->pos.inode) { ++ bch2_btree_iter_set_pos(reflink_iter, POS_MIN); ++ continue; ++ } ++ ++ if (bkey_deleted(k.k) && e->k.size <= k.k->size) ++ break; ++ } ++ ++ if (ret) ++ goto err; ++ ++ /* rewind iter to start of hole, if necessary: */ ++ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); ++ ++ r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); ++ ret = PTR_ERR_OR_ZERO(r_v); ++ if (ret) ++ goto err; ++ ++ bkey_reflink_v_init(&r_v->k_i); ++ r_v->k.p = reflink_iter->pos; ++ bch2_key_resize(&r_v->k, e->k.size); ++ r_v->k.version = e->k.version; ++ ++ set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + ++ bkey_val_u64s(&e->k)); ++ r_v->v.refcount = 0; ++ memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); ++ ++ bch2_trans_update(trans, reflink_iter, &r_v->k_i); ++ ++ r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); ++ if (IS_ERR(r_p)) ++ return PTR_ERR(r_p); ++ ++ e->k.type = KEY_TYPE_reflink_p; ++ r_p = bkey_i_to_reflink_p(&e->k_i); ++ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); ++ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ++ ++ 
bch2_trans_update(trans, extent_iter, &r_p->k_i); ++err: ++ if (!IS_ERR(reflink_iter)) { ++ c->reflink_hint = reflink_iter->pos.offset; ++ bch2_trans_iter_put(trans, reflink_iter); ++ } ++ ++ return ret; ++} ++ ++static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) ++{ ++ struct bkey_s_c k = bch2_btree_iter_peek(iter); ++ int ret; ++ ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (bkey_cmp(iter->pos, end) >= 0) ++ return bkey_s_c_null; ++ ++ if (k.k->type == KEY_TYPE_extent || ++ k.k->type == KEY_TYPE_reflink_p) ++ break; ++ } ++ ++ return k; ++} ++ ++s64 bch2_remap_range(struct bch_fs *c, ++ struct bpos dst_start, struct bpos src_start, ++ u64 remap_sectors, u64 *journal_seq, ++ u64 new_i_size, s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter *dst_iter, *src_iter; ++ struct bkey_s_c src_k; ++ BKEY_PADDED(k) new_dst, new_src; ++ struct bpos dst_end = dst_start, src_end = src_start; ++ struct bpos dst_want, src_want; ++ u64 src_done, dst_done; ++ int ret = 0, ret2 = 0; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return -EROFS; ++ ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { ++ mutex_lock(&c->sb_lock); ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { ++ c->disk_sb.sb->features[0] |= ++ cpu_to_le64(1ULL << BCH_FEATURE_REFLINK); ++ ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ dst_end.offset += remap_sectors; ++ src_end.offset += remap_sectors; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); ++ ++ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, ++ BTREE_ITER_INTENT); ++ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, ++ BTREE_ITER_INTENT); ++ ++ while (1) { ++ bch2_trans_begin_updates(&trans); ++ trans.mem_top = 0; ++ ++ if (fatal_signal_pending(current)) { ++ ret = -EINTR; ++ goto err; ++ } ++ ++ src_k = get_next_src(src_iter, src_end); ++ ret = bkey_err(src_k); ++ if (ret) ++ goto btree_err; ++ ++ src_done = bpos_min(src_iter->pos, src_end).offset - ++ src_start.offset; ++ dst_want = POS(dst_start.inode, dst_start.offset + src_done); ++ ++ if (bkey_cmp(dst_iter->pos, dst_want) < 0) { ++ ret = bch2_fpunch_at(&trans, dst_iter, dst_want, ++ journal_seq, i_sectors_delta); ++ if (ret) ++ goto btree_err; ++ continue; ++ } ++ ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); ++ ++ if (!bkey_cmp(dst_iter->pos, dst_end)) ++ break; ++ ++ if (src_k.k->type == KEY_TYPE_extent) { ++ bkey_reassemble(&new_src.k, src_k); ++ src_k = bkey_i_to_s_c(&new_src.k); ++ ++ bch2_cut_front(src_iter->pos, &new_src.k); ++ bch2_cut_back(src_end, &new_src.k.k); ++ ++ ret = bch2_make_extent_indirect(&trans, src_iter, ++ bkey_i_to_extent(&new_src.k)); ++ if (ret) ++ goto btree_err; ++ ++ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); ++ } ++ ++ if (src_k.k->type == KEY_TYPE_reflink_p) { ++ struct bkey_s_c_reflink_p src_p = ++ bkey_s_c_to_reflink_p(src_k); ++ struct bkey_i_reflink_p *dst_p = ++ bkey_reflink_p_init(&new_dst.k); ++ ++ u64 offset = le64_to_cpu(src_p.v->idx) + ++ (src_iter->pos.offset - ++ bkey_start_offset(src_k.k)); ++ ++ dst_p->v.idx = cpu_to_le64(offset); ++ } else { ++ BUG(); ++ } ++ ++ new_dst.k.k.p = dst_iter->pos; ++ bch2_key_resize(&new_dst.k.k, ++ min(src_k.k->p.offset - src_iter->pos.offset, ++ dst_end.offset - dst_iter->pos.offset)); ++ ++ ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, ++ NULL, journal_seq, ++ new_i_size, i_sectors_delta); ++ if (ret) ++ goto btree_err; ++ ++ dst_done = dst_iter->pos.offset - dst_start.offset; ++ src_want = 
POS(src_start.inode, src_start.offset + dst_done); ++ bch2_btree_iter_set_pos(src_iter, src_want); ++btree_err: ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ goto err; ++ } ++ ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); ++err: ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); ++ ++ dst_done = dst_iter->pos.offset - dst_start.offset; ++ new_i_size = min(dst_iter->pos.offset << 9, new_i_size); ++ ++ bch2_trans_begin(&trans); ++ ++ do { ++ struct bch_inode_unpacked inode_u; ++ struct btree_iter *inode_iter; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, ++ dst_start.inode, BTREE_ITER_INTENT); ++ ret2 = PTR_ERR_OR_ZERO(inode_iter); ++ ++ if (!ret2 && ++ inode_u.bi_size < new_i_size) ++ ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, journal_seq, ++ BTREE_INSERT_ATOMIC); ++ } while (ret2 == -EINTR); ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ percpu_ref_put(&c->writes); ++ ++ return dst_done ?: ret ?: ret2; ++} +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +new file mode 100644 +index 000000000000..ac23b855858c +--- /dev/null ++++ b/fs/bcachefs/reflink.h +@@ -0,0 +1,30 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REFLINK_H ++#define _BCACHEFS_REFLINK_H ++ ++const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++enum merge_result bch2_reflink_p_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_p_invalid, \ ++ .val_to_text = bch2_reflink_p_to_text, \ ++ .key_merge = bch2_reflink_p_merge, \ ++} ++ ++const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++ ++#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_v_invalid, \ ++ .val_to_text = bch2_reflink_v_to_text, \ ++} ++ ++s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, ++ u64, u64 *, u64, s64 *); ++ ++#endif /* _BCACHEFS_REFLINK_H */ +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +new file mode 100644 +index 000000000000..cb5ebb87c701 +--- /dev/null ++++ b/fs/bcachefs/replicas.c +@@ -0,0 +1,1076 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, ++ struct bch_replicas_cpu *); ++ ++/* Replicas tracking - in memory: */ ++ ++static inline int u8_cmp(u8 l, u8 r) ++{ ++ return cmp_int(l, r); ++} ++ ++static void verify_replicas_entry(struct bch_replicas_entry *e) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned i; ++ ++ BUG_ON(e->data_type >= BCH_DATA_NR); ++ BUG_ON(!e->nr_devs); ++ BUG_ON(e->nr_required > 1 && ++ e->nr_required >= e->nr_devs); ++ ++ for (i = 0; i + 1 < e->nr_devs; i++) ++ BUG_ON(e->devs[i] >= e->devs[i + 1]); ++#endif ++} ++ ++static void replicas_entry_sort(struct bch_replicas_entry *e) ++{ ++ bubble_sort(e->devs, e->nr_devs, u8_cmp); ++} ++ ++static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) ++{ ++ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); ++} ++ ++void bch2_replicas_entry_to_text(struct printbuf *out, ++ struct bch_replicas_entry *e) ++{ ++ unsigned i; ++ ++ pr_buf(out, "%s: %u/%u [", ++ bch2_data_types[e->data_type], ++ e->nr_required, ++ e->nr_devs); ++ ++ 
for (i = 0; i < e->nr_devs; i++) ++ pr_buf(out, i ? " %u" : "%u", e->devs[i]); ++ pr_buf(out, "]"); ++} ++ ++void bch2_cpu_replicas_to_text(struct printbuf *out, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_cpu_replicas_entry(r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++} ++ ++static void extent_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ r->nr_required = 1; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ continue; ++ ++ if (p.has_ec) ++ r->nr_required = 0; ++ ++ r->devs[r->nr_devs++] = p.ptr.dev; ++ } ++} ++ ++static void stripe_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ const struct bch_extent_ptr *ptr; ++ ++ r->nr_required = s.v->nr_blocks - s.v->nr_redundant; ++ ++ for (ptr = s.v->ptrs; ++ ptr < s.v->ptrs + s.v->nr_blocks; ++ ptr++) ++ r->devs[r->nr_devs++] = ptr->dev; ++} ++ ++void bch2_bkey_to_replicas(struct bch_replicas_entry *e, ++ struct bkey_s_c k) ++{ ++ e->nr_devs = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ e->data_type = BCH_DATA_BTREE; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ e->data_type = BCH_DATA_USER; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_stripe: ++ e->data_type = BCH_DATA_USER; ++ stripe_to_replicas(k, e); ++ break; ++ } ++ ++ replicas_entry_sort(e); ++} ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *e, ++ enum bch_data_type data_type, ++ struct bch_devs_list devs) ++{ ++ unsigned i; ++ ++ BUG_ON(!data_type || ++ data_type == BCH_DATA_SB || ++ data_type >= BCH_DATA_NR); ++ ++ e->data_type = data_type; ++ e->nr_devs = 0; ++ e->nr_required = 1; ++ ++ for (i = 0; i < devs.nr; i++) ++ e->devs[e->nr_devs++] = devs.devs[i]; ++ ++ replicas_entry_sort(e); ++} ++ ++static struct bch_replicas_cpu ++cpu_replicas_add_entry(struct bch_replicas_cpu *old, ++ struct bch_replicas_entry *new_entry) ++{ ++ unsigned i; ++ struct bch_replicas_cpu new = { ++ .nr = old->nr + 1, ++ .entry_size = max_t(unsigned, old->entry_size, ++ replicas_entry_bytes(new_entry)), ++ }; ++ ++ BUG_ON(!new_entry->data_type); ++ verify_replicas_entry(new_entry); ++ ++ new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); ++ if (!new.entries) ++ return new; ++ ++ for (i = 0; i < old->nr; i++) ++ memcpy(cpu_replicas_entry(&new, i), ++ cpu_replicas_entry(old, i), ++ old->entry_size); ++ ++ memcpy(cpu_replicas_entry(&new, old->nr), ++ new_entry, ++ replicas_entry_bytes(new_entry)); ++ ++ bch2_cpu_replicas_sort(&new); ++ return new; ++} ++ ++static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ int idx, entry_size = replicas_entry_bytes(search); ++ ++ if (unlikely(entry_size > r->entry_size)) ++ return -1; ++ ++ verify_replicas_entry(search); ++ ++#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) ++ idx = eytzinger0_find(r->entries, r->nr, r->entry_size, ++ entry_cmp, search); ++#undef entry_cmp ++ ++ return idx < r->nr ? 
idx : -1; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *c, ++ struct bch_replicas_entry *search) ++{ ++ replicas_entry_sort(search); ++ ++ return __replicas_entry_idx(&c->replicas, search); ++} ++ ++static bool __replicas_has_entry(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ return __replicas_entry_idx(r, search) >= 0; ++} ++ ++static bool bch2_replicas_marked_locked(struct bch_fs *c, ++ struct bch_replicas_entry *search, ++ bool check_gc_replicas) ++{ ++ if (!search->nr_devs) ++ return true; ++ ++ verify_replicas_entry(search); ++ ++ return __replicas_has_entry(&c->replicas, search) && ++ (!check_gc_replicas || ++ likely((!c->replicas_gc.entries)) || ++ __replicas_has_entry(&c->replicas_gc, search)); ++} ++ ++bool bch2_replicas_marked(struct bch_fs *c, ++ struct bch_replicas_entry *search, ++ bool check_gc_replicas) ++{ ++ bool marked; ++ ++ percpu_down_read(&c->mark_lock); ++ marked = bch2_replicas_marked_locked(c, search, check_gc_replicas); ++ percpu_up_read(&c->mark_lock); ++ ++ return marked; ++} ++ ++static void __replicas_table_update(struct bch_fs_usage *dst, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage *src, ++ struct bch_replicas_cpu *src_r) ++{ ++ int src_idx, dst_idx; ++ ++ *dst = *src; ++ ++ for (src_idx = 0; src_idx < src_r->nr; src_idx++) { ++ if (!src->replicas[src_idx]) ++ continue; ++ ++ dst_idx = __replicas_entry_idx(dst_r, ++ cpu_replicas_entry(src_r, src_idx)); ++ BUG_ON(dst_idx < 0); ++ ++ dst->replicas[dst_idx] = src->replicas[src_idx]; ++ } ++} ++ ++static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage __percpu *src_p, ++ struct bch_replicas_cpu *src_r) ++{ ++ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; ++ struct bch_fs_usage *dst, *src = (void *) ++ bch2_acc_percpu_u64s((void *) src_p, src_nr); ++ ++ preempt_disable(); ++ dst = this_cpu_ptr(dst_p); ++ preempt_enable(); ++ ++ __replicas_table_update(dst, dst_r, src, src_r); ++} ++ ++/* ++ * Resize filesystem accounting: ++ */ ++static int replicas_table_update(struct bch_fs *c, ++ struct bch_replicas_cpu *new_r) ++{ ++ struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; ++ struct bch_fs_usage *new_scratch = NULL; ++ struct bch_fs_usage __percpu *new_gc = NULL; ++ struct bch_fs_usage *new_base = NULL; ++ unsigned bytes = sizeof(struct bch_fs_usage) + ++ sizeof(u64) * new_r->nr; ++ int ret = -ENOMEM; ++ ++ if (!(new_base = kzalloc(bytes, GFP_NOIO)) || ++ !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), ++ GFP_NOIO)) || ++ !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), ++ GFP_NOIO)) || ++ !(new_scratch = kmalloc(bytes, GFP_NOIO)) || ++ (c->usage_gc && ++ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) ++ goto err; ++ ++ if (c->usage_base) ++ __replicas_table_update(new_base, new_r, ++ c->usage_base, &c->replicas); ++ if (c->usage[0]) ++ __replicas_table_update_pcpu(new_usage[0], new_r, ++ c->usage[0], &c->replicas); ++ if (c->usage[1]) ++ __replicas_table_update_pcpu(new_usage[1], new_r, ++ c->usage[1], &c->replicas); ++ if (c->usage_gc) ++ __replicas_table_update_pcpu(new_gc, new_r, ++ c->usage_gc, &c->replicas); ++ ++ swap(c->usage_base, new_base); ++ swap(c->usage[0], new_usage[0]); ++ swap(c->usage[1], new_usage[1]); ++ swap(c->usage_scratch, new_scratch); ++ swap(c->usage_gc, new_gc); ++ swap(c->replicas, *new_r); ++ ret = 0; ++err: ++ free_percpu(new_gc); ++ kfree(new_scratch); ++ free_percpu(new_usage[1]); ++ 
free_percpu(new_usage[0]); ++ kfree(new_base); ++ return ret; ++} ++ ++static unsigned reserve_journal_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ unsigned journal_res_u64s = 0; ++ ++ /* nr_inodes: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* key_version: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* persistent_reserved: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * ++ BCH_REPLICAS_MAX; ++ ++ for_each_cpu_replicas_entry(r, e) ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + ++ e->nr_devs, sizeof(u64)); ++ return journal_res_u64s; ++} ++ ++noinline ++static int bch2_mark_replicas_slowpath(struct bch_fs *c, ++ struct bch_replicas_entry *new_entry) ++{ ++ struct bch_replicas_cpu new_r, new_gc; ++ int ret = -ENOMEM; ++ ++ verify_replicas_entry(new_entry); ++ ++ memset(&new_r, 0, sizeof(new_r)); ++ memset(&new_gc, 0, sizeof(new_gc)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (c->replicas_gc.entries && ++ !__replicas_has_entry(&c->replicas_gc, new_entry)) { ++ new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); ++ if (!new_gc.entries) ++ goto err; ++ } ++ ++ if (!__replicas_has_entry(&c->replicas, new_entry)) { ++ new_r = cpu_replicas_add_entry(&c->replicas, new_entry); ++ if (!new_r.entries) ++ goto err; ++ ++ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); ++ if (ret) ++ goto err; ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->replicas_journal_res, ++ reserve_journal_replicas(c, &new_r)); ++ } ++ ++ if (!new_r.entries && ++ !new_gc.entries) ++ goto out; ++ ++ /* allocations done, now commit: */ ++ ++ if (new_r.entries) ++ bch2_write_super(c); ++ ++ /* don't update in memory replicas until changes are persistent */ ++ percpu_down_write(&c->mark_lock); ++ if (new_r.entries) ++ ret = replicas_table_update(c, &new_r); ++ if (new_gc.entries) ++ swap(new_gc, c->replicas_gc); ++ percpu_up_write(&c->mark_lock); ++out: ++ ret = 0; ++err: ++ mutex_unlock(&c->sb_lock); ++ ++ kfree(new_r.entries); ++ kfree(new_gc.entries); ++ ++ return ret; ++} ++ ++int bch2_mark_replicas(struct bch_fs *c, ++ struct bch_replicas_entry *r) ++{ ++ return likely(bch2_replicas_marked(c, r, true)) ++ ? 
0 ++ : bch2_mark_replicas_slowpath(c, r); ++} ++ ++bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, ++ struct bkey_s_c k, ++ bool check_gc_replicas) ++{ ++ struct bch_replicas_padded search; ++ struct bch_devs_list cached = bch2_bkey_cached_devs(k); ++ unsigned i; ++ ++ for (i = 0; i < cached.nr; i++) { ++ bch2_replicas_entry_cached(&search.e, cached.devs[i]); ++ ++ if (!bch2_replicas_marked_locked(c, &search.e, ++ check_gc_replicas)) ++ return false; ++ } ++ ++ bch2_bkey_to_replicas(&search.e, k); ++ ++ return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); ++} ++ ++bool bch2_bkey_replicas_marked(struct bch_fs *c, ++ struct bkey_s_c k, ++ bool check_gc_replicas) ++{ ++ bool marked; ++ ++ percpu_down_read(&c->mark_lock); ++ marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas); ++ percpu_up_read(&c->mark_lock); ++ ++ return marked; ++} ++ ++int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bch_replicas_padded search; ++ struct bch_devs_list cached = bch2_bkey_cached_devs(k); ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < cached.nr; i++) { ++ bch2_replicas_entry_cached(&search.e, cached.devs[i]); ++ ++ ret = bch2_mark_replicas(c, &search.e); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_bkey_to_replicas(&search.e, k); ++ ++ return bch2_mark_replicas(c, &search.e); ++} ++ ++int bch2_replicas_gc_end(struct bch_fs *c, int ret) ++{ ++ unsigned i; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ /* ++ * this is kind of crappy; the replicas gc mechanism needs to be ripped ++ * out ++ */ ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct bch_replicas_cpu n; ++ ++ if (!__replicas_has_entry(&c->replicas_gc, e) && ++ (c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i]))) { ++ n = cpu_replicas_add_entry(&c->replicas_gc, e); ++ if (!n.entries) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ swap(n, c->replicas_gc); ++ kfree(n.entries); ++ } ++ } ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &c->replicas_gc); ++err: ++ kfree(c->replicas_gc.entries); ++ c->replicas_gc.entries = NULL; ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) ++{ ++ struct bch_replicas_entry *e; ++ unsigned i = 0; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ BUG_ON(c->replicas_gc.entries); ++ ++ c->replicas_gc.nr = 0; ++ c->replicas_gc.entry_size = 0; ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) { ++ c->replicas_gc.nr++; ++ c->replicas_gc.entry_size = ++ max_t(unsigned, c->replicas_gc.entry_size, ++ replicas_entry_bytes(e)); ++ } ++ ++ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, ++ c->replicas_gc.entry_size, ++ GFP_NOIO); ++ if (!c->replicas_gc.entries) { ++ mutex_unlock(&c->sb_lock); ++ return -ENOMEM; ++ } ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) ++ memcpy(cpu_replicas_entry(&c->replicas_gc, i++), ++ e, c->replicas_gc.entry_size); ++ ++ bch2_cpu_replicas_sort(&c->replicas_gc); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_replicas_gc2(struct bch_fs *c) 
++{ ++ struct bch_replicas_cpu new = { 0 }; ++ unsigned i, nr; ++ int ret = 0; ++ ++ bch2_journal_meta(&c->journal); ++retry: ++ nr = READ_ONCE(c->replicas.nr); ++ new.entry_size = READ_ONCE(c->replicas.entry_size); ++ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); ++ if (!new.entries) ++ return -ENOMEM; ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ if (nr != c->replicas.nr || ++ new.entry_size != c->replicas.entry_size) { ++ percpu_up_write(&c->mark_lock); ++ mutex_unlock(&c->sb_lock); ++ kfree(new.entries); ++ goto retry; ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ if (e->data_type == BCH_DATA_JOURNAL || ++ c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i])) ++ memcpy(cpu_replicas_entry(&new, new.nr++), ++ e, new.entry_size); ++ } ++ ++ bch2_cpu_replicas_sort(&new); ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &new); ++err: ++ kfree(new.entries); ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_set_usage(struct bch_fs *c, ++ struct bch_replicas_entry *r, ++ u64 sectors) ++{ ++ int ret, idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) { ++ struct bch_replicas_cpu n; ++ ++ n = cpu_replicas_add_entry(&c->replicas, r); ++ if (!n.entries) ++ return -ENOMEM; ++ ++ ret = replicas_table_update(c, &n); ++ if (ret) ++ return ret; ++ ++ kfree(n.entries); ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ BUG_ON(ret < 0); ++ } ++ ++ c->usage_base->replicas[idx] = sectors; ++ ++ return 0; ++} ++ ++/* Replicas tracking - superblock: */ ++ ++static int ++__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry *e, *dst; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ dst = cpu_replicas_entry(cpu_r, idx++); ++ memcpy(dst, e, replicas_entry_bytes(e)); ++ replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++static int ++__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry_v0 *e; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ entry_size += sizeof(struct bch_replicas_entry) - ++ sizeof(struct bch_replicas_entry_v0); ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ struct bch_replicas_entry *dst = ++ cpu_replicas_entry(cpu_r, idx++); ++ ++ dst->data_type = e->data_type; ++ dst->nr_devs = e->nr_devs; ++ dst->nr_required = 1; ++ memcpy(dst->devs, e->devs, e->nr_devs); ++ replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) ++{ ++ struct bch_sb_field_replicas *sb_v1; ++ struct bch_sb_field_replicas_v0 
*sb_v0; ++ struct bch_replicas_cpu new_r = { 0, 0, NULL }; ++ int ret = 0; ++ ++ if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); ++ else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); ++ ++ if (ret) ++ return -ENOMEM; ++ ++ bch2_cpu_replicas_sort(&new_r); ++ ++ percpu_down_write(&c->mark_lock); ++ ++ ret = replicas_table_update(c, &new_r); ++ percpu_up_write(&c->mark_lock); ++ ++ kfree(new_r.entries); ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r; ++ struct bch_replicas_entry_v0 *dst; ++ struct bch_replicas_entry *src; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) ++ bytes += replicas_entry_bytes(src) - 1; ++ ++ sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); ++ sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ dst->data_type = src->data_type; ++ dst->nr_devs = src->nr_devs; ++ memcpy(dst->devs, src->devs, src->nr_devs); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas *sb_r; ++ struct bch_replicas_entry *dst, *src; ++ bool need_v1 = false; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) { ++ bytes += replicas_entry_bytes(src); ++ if (src->nr_required != 1) ++ need_v1 = true; ++ } ++ ++ if (!need_v1) ++ return bch2_cpu_replicas_to_sb_replicas_v0(c, r); ++ ++ sb_r = bch2_sb_resize_replicas(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); ++ sb_r = bch2_sb_get_replicas(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ memcpy(dst, src, replicas_entry_bytes(src)); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) ++{ ++ unsigned i; ++ ++ sort_cmp_size(cpu_r->entries, ++ cpu_r->nr, ++ cpu_r->entry_size, ++ memcmp, NULL); ++ ++ for (i = 0; i + 1 < cpu_r->nr; i++) { ++ struct bch_replicas_entry *l = ++ cpu_replicas_entry(cpu_r, i); ++ struct bch_replicas_entry *r = ++ cpu_replicas_entry(cpu_r, i + 1); ++ ++ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); ++ ++ if (!memcmp(l, r, cpu_r->entry_size)) ++ return "duplicate replicas entry"; ++ } ++ ++ return NULL; ++} ++ ++static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_replicas_cpu cpu_r = { .entries = NULL }; ++ struct bch_replicas_entry *e; ++ const char *err; ++ unsigned i; ++ ++ for_each_replicas_entry(sb_r, e) { ++ err = 
"invalid replicas entry: invalid data type"; ++ if (e->data_type >= BCH_DATA_NR) ++ goto err; ++ ++ err = "invalid replicas entry: no devices"; ++ if (!e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: bad nr_required"; ++ if (e->nr_required > 1 && ++ e->nr_required >= e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: invalid device"; ++ for (i = 0; i < e->nr_devs; i++) ++ if (!bch2_dev_exists(sb, mi, e->devs[i])) ++ goto err; ++ } ++ ++ err = "cannot allocate memory"; ++ if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) ++ goto err; ++ ++ err = check_dup_replicas_entries(&cpu_r); ++err: ++ kfree(cpu_r.entries); ++ return err; ++} ++ ++static void bch2_sb_replicas_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas *r = field_to_type(f, replicas); ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_replicas_entry(r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas = { ++ .validate = bch2_sb_validate_replicas, ++ .to_text = bch2_sb_replicas_to_text, ++}; ++ ++static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_replicas_cpu cpu_r = { .entries = NULL }; ++ struct bch_replicas_entry_v0 *e; ++ const char *err; ++ unsigned i; ++ ++ for_each_replicas_entry_v0(sb_r, e) { ++ err = "invalid replicas entry: invalid data type"; ++ if (e->data_type >= BCH_DATA_NR) ++ goto err; ++ ++ err = "invalid replicas entry: no devices"; ++ if (!e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: invalid device"; ++ for (i = 0; i < e->nr_devs; i++) ++ if (!bch2_dev_exists(sb, mi, e->devs[i])) ++ goto err; ++ } ++ ++ err = "cannot allocate memory"; ++ if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) ++ goto err; ++ ++ err = check_dup_replicas_entries(&cpu_r); ++err: ++ kfree(cpu_r.entries); ++ return err; ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { ++ .validate = bch2_sb_validate_replicas_v0, ++}; ++ ++/* Query replicas: */ ++ ++struct replicas_status __bch2_replicas_status(struct bch_fs *c, ++ struct bch_devs_mask online_devs) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_replicas_entry *e; ++ unsigned i, nr_online, nr_offline; ++ struct replicas_status ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ ++ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) ++ ret.replicas[i].redundancy = INT_MAX; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ percpu_down_read(&c->mark_lock); ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) { ++ if (e->data_type >= ARRAY_SIZE(ret.replicas)) ++ panic("e %p data_type %u\n", e, e->data_type); ++ ++ nr_online = nr_offline = 0; ++ ++ for (i = 0; i < e->nr_devs; i++) { ++ BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, ++ e->devs[i])); ++ ++ if (test_bit(e->devs[i], online_devs.d)) ++ nr_online++; ++ else ++ nr_offline++; ++ } ++ ++ ret.replicas[e->data_type].redundancy = ++ min(ret.replicas[e->data_type].redundancy, ++ (int) nr_online - (int) e->nr_required); ++ ++ ret.replicas[e->data_type].nr_offline = ++ max(ret.replicas[e->data_type].nr_offline, ++ nr_offline); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++ ++ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) ++ if (ret.replicas[i].redundancy == INT_MAX) ++ 
ret.replicas[i].redundancy = 0; ++ ++ return ret; ++} ++ ++struct replicas_status bch2_replicas_status(struct bch_fs *c) ++{ ++ return __bch2_replicas_status(c, bch2_online_devs(c)); ++} ++ ++static bool have_enough_devs(struct replicas_status s, ++ enum bch_data_type type, ++ bool force_if_degraded, ++ bool force_if_lost) ++{ ++ return (!s.replicas[type].nr_offline || force_if_degraded) && ++ (s.replicas[type].redundancy >= 0 || force_if_lost); ++} ++ ++bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) ++{ ++ return (have_enough_devs(s, BCH_DATA_JOURNAL, ++ flags & BCH_FORCE_IF_METADATA_DEGRADED, ++ flags & BCH_FORCE_IF_METADATA_LOST) && ++ have_enough_devs(s, BCH_DATA_BTREE, ++ flags & BCH_FORCE_IF_METADATA_DEGRADED, ++ flags & BCH_FORCE_IF_METADATA_LOST) && ++ have_enough_devs(s, BCH_DATA_USER, ++ flags & BCH_FORCE_IF_DATA_DEGRADED, ++ flags & BCH_FORCE_IF_DATA_LOST)); ++} ++ ++int bch2_replicas_online(struct bch_fs *c, bool meta) ++{ ++ struct replicas_status s = bch2_replicas_status(c); ++ ++ return (meta ++ ? min(s.replicas[BCH_DATA_JOURNAL].redundancy, ++ s.replicas[BCH_DATA_BTREE].redundancy) ++ : s.replicas[BCH_DATA_USER].redundancy) + 1; ++} ++ ++unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_replicas_entry *e; ++ unsigned i, ret = 0; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ for (i = 0; i < e->nr_devs; i++) ++ if (e->devs[i] == ca->dev_idx) ++ ret |= 1 << e->data_type; ++ ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++int bch2_fs_replicas_init(struct bch_fs *c) ++{ ++ c->journal.entry_u64s_reserved += ++ reserve_journal_replicas(c, &c->replicas); ++ ++ return replicas_table_update(c, &c->replicas); ++} +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +new file mode 100644 +index 000000000000..0d6e19126021 +--- /dev/null ++++ b/fs/bcachefs/replicas.h +@@ -0,0 +1,98 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REPLICAS_H ++#define _BCACHEFS_REPLICAS_H ++ ++#include "eytzinger.h" ++#include "replicas_types.h" ++ ++void bch2_replicas_entry_to_text(struct printbuf *, ++ struct bch_replicas_entry *); ++void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); ++ ++static inline struct bch_replicas_entry * ++cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) ++{ ++ return (void *) r->entries + r->entry_size * i; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *, ++ enum bch_data_type, ++ struct bch_devs_list); ++bool bch2_replicas_marked(struct bch_fs *, ++ struct bch_replicas_entry *, bool); ++int bch2_mark_replicas(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++bool bch2_bkey_replicas_marked_locked(struct bch_fs *, ++ struct bkey_s_c, bool); ++void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); ++bool bch2_bkey_replicas_marked(struct bch_fs *, ++ struct bkey_s_c, bool); ++int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); ++ ++static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, ++ unsigned dev) ++{ ++ e->data_type = BCH_DATA_CACHED; ++ e->nr_devs = 1; ++ e->nr_required = 1; ++ e->devs[0] = dev; ++} ++ ++struct replicas_status { ++ struct { ++ int redundancy; ++ unsigned nr_offline; ++ } replicas[BCH_DATA_NR]; ++}; ++ ++struct replicas_status __bch2_replicas_status(struct bch_fs *, ++ struct bch_devs_mask); ++struct replicas_status 
bch2_replicas_status(struct bch_fs *); ++bool bch2_have_enough_devs(struct replicas_status, unsigned); ++ ++int bch2_replicas_online(struct bch_fs *, bool); ++unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); ++ ++int bch2_replicas_gc_end(struct bch_fs *, int); ++int bch2_replicas_gc_start(struct bch_fs *, unsigned); ++int bch2_replicas_gc2(struct bch_fs *); ++ ++int bch2_replicas_set_usage(struct bch_fs *, ++ struct bch_replicas_entry *, ++ u64); ++ ++#define for_each_cpu_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ ++ _i = (void *) (_i) + (_r)->entry_size) ++ ++/* iterate over superblock replicas - used by userspace tools: */ ++ ++#define replicas_entry_bytes(_i) \ ++ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) ++ ++#define replicas_entry_next(_i) \ ++ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) ++ ++#define for_each_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++#define for_each_replicas_entry_v0(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; ++ ++int bch2_fs_replicas_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REPLICAS_H */ +diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h +new file mode 100644 +index 000000000000..0535b1d3760e +--- /dev/null ++++ b/fs/bcachefs/replicas_types.h +@@ -0,0 +1,10 @@ ++#ifndef _BCACHEFS_REPLICAS_TYPES_H ++#define _BCACHEFS_REPLICAS_TYPES_H ++ ++struct bch_replicas_cpu { ++ unsigned nr; ++ unsigned entry_size; ++ struct bch_replicas_entry *entries; ++}; ++ ++#endif /* _BCACHEFS_REPLICAS_TYPES_H */ +diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c +new file mode 100644 +index 000000000000..c062edb3fbc2 +--- /dev/null ++++ b/fs/bcachefs/siphash.c +@@ -0,0 +1,173 @@ ++// SPDX-License-Identifier: BSD-3-Clause ++/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ ++ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ */ ++ ++/* ++ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d ++ * are the number of compression rounds and the number of finalization rounds. ++ * A compression round is identical to a finalization round and this round ++ * function is called SipRound. Given a 128-bit key k and a (possibly empty) ++ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). ++ * ++ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, ++ * by Jean-Philippe Aumasson and Daniel J. Bernstein, ++ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa ++ * https://131002.net/siphash/siphash.pdf ++ * https://131002.net/siphash/ ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "siphash.h" ++ ++static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) ++{ ++ while (rounds--) { ++ ctx->v[0] += ctx->v[1]; ++ ctx->v[2] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 13); ++ ctx->v[3] = rol64(ctx->v[3], 16); ++ ++ ctx->v[1] ^= ctx->v[0]; ++ ctx->v[3] ^= ctx->v[2]; ++ ctx->v[0] = rol64(ctx->v[0], 32); ++ ++ ctx->v[2] += ctx->v[1]; ++ ctx->v[0] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 17); ++ ctx->v[3] = rol64(ctx->v[3], 21); ++ ++ ctx->v[1] ^= ctx->v[2]; ++ ctx->v[3] ^= ctx->v[0]; ++ ctx->v[2] = rol64(ctx->v[2], 32); ++ } ++} ++ ++static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) ++{ ++ u64 m = get_unaligned_le64(ptr); ++ ++ ctx->v[3] ^= m; ++ SipHash_Rounds(ctx, rounds); ++ ctx->v[0] ^= m; ++} ++ ++void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) ++{ ++ u64 k0, k1; ++ ++ k0 = le64_to_cpu(key->k0); ++ k1 = le64_to_cpu(key->k1); ++ ++ ctx->v[0] = 0x736f6d6570736575ULL ^ k0; ++ ctx->v[1] = 0x646f72616e646f6dULL ^ k1; ++ ctx->v[2] = 0x6c7967656e657261ULL ^ k0; ++ ctx->v[3] = 0x7465646279746573ULL ^ k1; ++ ++ memset(ctx->buf, 0, sizeof(ctx->buf)); ++ ctx->bytes = 0; ++} ++ ++void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, ++ const void *src, size_t len) ++{ ++ const u8 *ptr = src; ++ size_t left, used; ++ ++ if (len == 0) ++ return; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ ctx->bytes += len; ++ ++ if (used > 0) { ++ left = sizeof(ctx->buf) - used; ++ ++ if (len >= left) { ++ memcpy(&ctx->buf[used], ptr, left); ++ SipHash_CRounds(ctx, ctx->buf, rc); ++ len -= left; ++ ptr += left; ++ } else { ++ memcpy(&ctx->buf[used], ptr, len); ++ return; ++ } ++ } ++ ++ while (len >= sizeof(ctx->buf)) { ++ SipHash_CRounds(ctx, ptr, rc); ++ len -= sizeof(ctx->buf); ++ ptr += sizeof(ctx->buf); ++ } ++ ++ if (len > 0) ++ memcpy(&ctx->buf[used], ptr, len); ++} ++ ++void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ ++ r = SipHash_End(ctx, rc, rf); ++ ++ *((__le64 *) dst) = cpu_to_le64(r); ++} ++ ++u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ size_t left, used; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ left = sizeof(ctx->buf) - used; ++ memset(&ctx->buf[used], 0, left - 1); ++ ctx->buf[7] = ctx->bytes; ++ ++ 
SipHash_CRounds(ctx, ctx->buf, rc); ++ ctx->v[2] ^= 0xff; ++ SipHash_Rounds(ctx, rf); ++ ++ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); ++ memset(ctx, 0, sizeof(*ctx)); ++ return (r); ++} ++ ++u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) ++{ ++ SIPHASH_CTX ctx; ++ ++ SipHash_Init(&ctx, key); ++ SipHash_Update(&ctx, rc, rf, src, len); ++ return SipHash_End(&ctx, rc, rf); ++} +diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h +new file mode 100644 +index 000000000000..3dfaf34a43b2 +--- /dev/null ++++ b/fs/bcachefs/siphash.h +@@ -0,0 +1,87 @@ ++/* SPDX-License-Identifier: BSD-3-Clause */ ++/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ * ++ * $FreeBSD$ ++ */ ++ ++/* ++ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) ++ * optimized for speed on short messages returning a 64bit hash/digest value. 
++ * ++ * The number of rounds is defined during the initialization: ++ * SipHash24_Init() for the fast and resonable strong version ++ * SipHash48_Init() for the strong version (half as fast) ++ * ++ * struct SIPHASH_CTX ctx; ++ * SipHash24_Init(&ctx); ++ * SipHash_SetKey(&ctx, "16bytes long key"); ++ * SipHash_Update(&ctx, pointer_to_string, length_of_string); ++ * SipHash_Final(output, &ctx); ++ */ ++ ++#ifndef _SIPHASH_H_ ++#define _SIPHASH_H_ ++ ++#include ++ ++#define SIPHASH_BLOCK_LENGTH 8 ++#define SIPHASH_KEY_LENGTH 16 ++#define SIPHASH_DIGEST_LENGTH 8 ++ ++typedef struct _SIPHASH_CTX { ++ u64 v[4]; ++ u8 buf[SIPHASH_BLOCK_LENGTH]; ++ u32 bytes; ++} SIPHASH_CTX; ++ ++typedef struct { ++ __le64 k0; ++ __le64 k1; ++} SIPHASH_KEY; ++ ++void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); ++void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); ++u64 SipHash_End(SIPHASH_CTX *, int, int); ++void SipHash_Final(void *, SIPHASH_CTX *, int, int); ++u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); ++ ++#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) ++#define SipHash24_End(_d) SipHash_End((_d), 2, 4) ++#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) ++#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) ++ ++#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) ++#define SipHash48_End(_d) SipHash_End((_d), 4, 8) ++#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) ++#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) ++ ++#endif /* _SIPHASH_H_ */ +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +new file mode 100644 +index 000000000000..7be4a8e50eaa +--- /dev/null ++++ b/fs/bcachefs/str_hash.h +@@ -0,0 +1,331 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_STR_HASH_H ++#define _BCACHEFS_STR_HASH_H ++ ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "checksum.h" ++#include "error.h" ++#include "inode.h" ++#include "siphash.h" ++#include "super.h" ++ ++#include ++#include ++#include ++ ++static inline enum bch_str_hash_type ++bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) ++{ ++ switch (opt) { ++ case BCH_STR_HASH_OPT_CRC32C: ++ return BCH_STR_HASH_CRC32C; ++ case BCH_STR_HASH_OPT_CRC64: ++ return BCH_STR_HASH_CRC64; ++ case BCH_STR_HASH_OPT_SIPHASH: ++ return c->sb.features & (1ULL << BCH_FEATURE_NEW_SIPHASH) ++ ? 
BCH_STR_HASH_SIPHASH ++ : BCH_STR_HASH_SIPHASH_OLD; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_info { ++ u8 type; ++ union { ++ __le64 crc_key; ++ SIPHASH_KEY siphash_key; ++ }; ++}; ++ ++static inline struct bch_hash_info ++bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) ++{ ++ /* XXX ick */ ++ struct bch_hash_info info = { ++ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & ++ ~(~0U << INODE_STR_HASH_BITS), ++ .crc_key = bi->bi_hash_seed, ++ }; ++ ++ if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { ++ SHASH_DESC_ON_STACK(desc, c->sha256); ++ u8 digest[SHA256_DIGEST_SIZE]; ++ ++ desc->tfm = c->sha256; ++ ++ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, ++ sizeof(bi->bi_hash_seed), digest); ++ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); ++ } ++ ++ return info; ++} ++ ++struct bch_str_hash_ctx { ++ union { ++ u32 crc32c; ++ u64 crc64; ++ SIPHASH_CTX siphash; ++ }; ++}; ++ ++static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); ++ break; ++ case BCH_STR_HASH_CRC64: ++ ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); ++ break; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ SipHash24_Init(&ctx->siphash, &info->siphash_key); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info, ++ const void *data, size_t len) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ ctx->crc32c = crc32c(ctx->crc32c, data, len); ++ break; ++ case BCH_STR_HASH_CRC64: ++ ctx->crc64 = crc64_be(ctx->crc64, data, len); ++ break; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ SipHash24_Update(&ctx->siphash, data, len); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ return ctx->crc32c; ++ case BCH_STR_HASH_CRC64: ++ return ctx->crc64 >> 1; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ return SipHash24_End(&ctx->siphash) >> 1; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_desc { ++ enum btree_id btree_id; ++ u8 key_type; ++ ++ u64 (*hash_key)(const struct bch_hash_info *, const void *); ++ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); ++ bool (*cmp_key)(struct bkey_s_c, const void *); ++ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); ++}; ++ ++static __always_inline struct btree_iter * ++bch2_hash_lookup(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, const void *key, ++ unsigned flags) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_key(info, key)), ++ BTREE_ITER_SLOTS|flags, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type == desc.key_type) { ++ if (!desc.cmp_key(k, key)) ++ return iter; ++ } else if (k.k->type == KEY_TYPE_whiteout) { ++ ; ++ } else { ++ /* hole, not found */ ++ break; ++ } ++ } ++ ++ return ERR_PTR(ret ?: -ENOENT); ++} ++ ++static __always_inline struct btree_iter * ++bch2_hash_hole(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, const void *key) 
++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_key(info, key)), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type != desc.key_type) ++ return iter; ++ } ++ ++ return ERR_PTR(ret ?: -ENOSPC); ++} ++ ++static __always_inline ++int bch2_hash_needs_whiteout(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *start) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_copy_iter(trans, start); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ bch2_btree_iter_next_slot(iter); ++ ++ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->type != desc.key_type && ++ k.k->type != KEY_TYPE_whiteout) ++ break; ++ ++ if (k.k->type == desc.key_type && ++ desc.hash_bkey(info, k) <= start->pos.offset) { ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static __always_inline ++int bch2_hash_set(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, struct bkey_i *insert, int flags) ++{ ++ struct btree_iter *iter, *slot = NULL; ++ struct bkey_s_c k; ++ bool found = false; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type == desc.key_type) { ++ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) ++ goto found; ++ ++ /* hash collision: */ ++ continue; ++ } ++ ++ if (!slot && ++ !(flags & BCH_HASH_SET_MUST_REPLACE)) { ++ slot = bch2_trans_copy_iter(trans, iter); ++ if (IS_ERR(slot)) ++ return PTR_ERR(slot); ++ } ++ ++ if (k.k->type != KEY_TYPE_whiteout) ++ goto not_found; ++ } ++ ++ if (!ret) ++ ret = -ENOSPC; ++out: ++ if (!IS_ERR_OR_NULL(slot)) ++ bch2_trans_iter_put(trans, slot); ++ if (!IS_ERR_OR_NULL(iter)) ++ bch2_trans_iter_put(trans, iter); ++ ++ return ret; ++found: ++ found = true; ++not_found: ++ ++ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { ++ ret = -ENOENT; ++ } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ++ ret = -EEXIST; ++ } else { ++ if (!found && slot) ++ swap(iter, slot); ++ ++ insert->k.p = iter->pos; ++ bch2_trans_update(trans, iter, insert); ++ } ++ ++ goto out; ++} ++ ++static __always_inline ++int bch2_hash_delete_at(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *iter) ++{ ++ struct bkey_i *delete; ++ int ret; ++ ++ ret = bch2_hash_needs_whiteout(trans, desc, info, iter); ++ if (ret < 0) ++ return ret; ++ ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ if (IS_ERR(delete)) ++ return PTR_ERR(delete); ++ ++ bkey_init(&delete->k); ++ delete->k.p = iter->pos; ++ delete->k.type = ret ? 
KEY_TYPE_whiteout : KEY_TYPE_deleted; ++ ++ bch2_trans_update(trans, iter, delete); ++ return 0; ++} ++ ++static __always_inline ++int bch2_hash_delete(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, const void *key) ++{ ++ struct btree_iter *iter; ++ ++ iter = bch2_hash_lookup(trans, desc, info, inode, key, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ return bch2_hash_delete_at(trans, desc, info, iter); ++} ++ ++#endif /* _BCACHEFS_STR_HASH_H */ +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +new file mode 100644 +index 000000000000..7e9c1f9c850c +--- /dev/null ++++ b/fs/bcachefs/super-io.c +@@ -0,0 +1,1154 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_seq_blacklist.h" ++#include "replicas.h" ++#include "quota.h" ++#include "super-io.h" ++#include "super.h" ++#include "vstructs.h" ++ ++#include ++#include ++ ++const char * const bch2_sb_fields[] = { ++#define x(name, nr) #name, ++ BCH_SB_FIELDS() ++#undef x ++ NULL ++}; ++ ++static const char *bch2_sb_field_validate(struct bch_sb *, ++ struct bch_sb_field *); ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f; ++ ++ /* XXX: need locking around superblock to access optional fields */ ++ ++ vstruct_for_each(sb, f) ++ if (le32_to_cpu(f->type) == type) ++ return f; ++ return NULL; ++} ++ ++static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, ++ struct bch_sb_field *f, ++ unsigned u64s) ++{ ++ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; ++ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; ++ ++ BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > ++ sb->page_order); ++ ++ if (!f) { ++ f = vstruct_last(sb->sb); ++ memset(f, 0, sizeof(u64) * u64s); ++ f->u64s = cpu_to_le32(u64s); ++ f->type = 0; ++ } else { ++ void *src, *dst; ++ ++ src = vstruct_end(f); ++ ++ if (u64s) { ++ f->u64s = cpu_to_le32(u64s); ++ dst = vstruct_end(f); ++ } else { ++ dst = f; ++ } ++ ++ memmove(dst, src, vstruct_end(sb->sb) - src); ++ ++ if (dst > src) ++ memset(src, 0, dst - src); ++ } ++ ++ sb->sb->u64s = cpu_to_le32(sb_u64s); ++ ++ return u64s ? 
f : NULL; ++} ++ ++void bch2_sb_field_delete(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ++ if (f) ++ __bch2_sb_field_resize(sb, f, 0); ++} ++ ++/* Superblock realloc/free: */ ++ ++void bch2_free_super(struct bch_sb_handle *sb) ++{ ++ if (sb->bio) ++ bio_put(sb->bio); ++ if (!IS_ERR_OR_NULL(sb->bdev)) ++ blkdev_put(sb->bdev, sb->mode); ++ ++ free_pages((unsigned long) sb->sb, sb->page_order); ++ memset(sb, 0, sizeof(*sb)); ++} ++ ++int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) ++{ ++ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); ++ unsigned order = get_order(new_bytes); ++ struct bch_sb *new_sb; ++ struct bio *bio; ++ ++ if (sb->sb && sb->page_order >= order) ++ return 0; ++ ++ if (sb->have_layout) { ++ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; ++ ++ if (new_bytes > max_bytes) { ++ char buf[BDEVNAME_SIZE]; ++ ++ pr_err("%s: superblock too big: want %zu but have %llu", ++ bdevname(sb->bdev, buf), new_bytes, max_bytes); ++ return -ENOSPC; ++ } ++ } ++ ++ if (sb->page_order >= order && sb->sb) ++ return 0; ++ ++ if (dynamic_fault("bcachefs:add:super_realloc")) ++ return -ENOMEM; ++ ++ if (sb->have_bio) { ++ bio = bio_kmalloc(GFP_KERNEL, 1 << order); ++ if (!bio) ++ return -ENOMEM; ++ ++ if (sb->bio) ++ bio_put(sb->bio); ++ sb->bio = bio; ++ } ++ ++ new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); ++ if (!new_sb) ++ return -ENOMEM; ++ ++ if (sb->sb) ++ memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); ++ ++ free_pages((unsigned long) sb->sb, sb->page_order); ++ sb->sb = new_sb; ++ ++ sb->page_order = order; ++ ++ return 0; ++} ++ ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type, ++ unsigned u64s) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; ++ ssize_t d = -old_u64s + u64s; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) ++ return NULL; ++ ++ if (sb->fs_sb) { ++ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ /* XXX: we're not checking that offline device have enough space */ ++ ++ for_each_online_member(ca, c, i) { ++ struct bch_sb_handle *sb = &ca->disk_sb; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { ++ percpu_ref_put(&ca->ref); ++ return NULL; ++ } ++ } ++ } ++ ++ f = bch2_sb_field_get(sb->sb, type); ++ f = __bch2_sb_field_resize(sb, f, u64s); ++ if (f) ++ f->type = cpu_to_le32(type); ++ return f; ++} ++ ++/* Superblock validate: */ ++ ++static inline void __bch2_sb_layout_size_assert(void) ++{ ++ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); ++} ++ ++static const char *validate_sb_layout(struct bch_sb_layout *layout) ++{ ++ u64 offset, prev_offset, max_sectors; ++ unsigned i; ++ ++ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) ++ return "Not a bcachefs superblock layout"; ++ ++ if (layout->layout_type != 0) ++ return "Invalid superblock layout type"; ++ ++ if (!layout->nr_superblocks) ++ return "Invalid superblock layout: no superblocks"; ++ ++ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) ++ return "Invalid superblock layout: too many superblocks"; ++ ++ max_sectors = 1 << layout->sb_max_size_bits; ++ ++ prev_offset = le64_to_cpu(layout->sb_offset[0]); ++ ++ for (i = 1; i < layout->nr_superblocks; i++) { ++ offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset < prev_offset + max_sectors) ++ return "Invalid superblock layout: superblocks overlap"; ++ prev_offset = offset; ++ } ++ ++ return NULL; ++} ++ ++const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) ++{ ++ struct bch_sb *sb = disk_sb->sb; ++ struct bch_sb_field *f; ++ struct bch_sb_field_members *mi; ++ const char *err; ++ u32 version, version_min; ++ u16 block_size; ++ ++ version = le16_to_cpu(sb->version); ++ version_min = version >= bcachefs_metadata_version_new_versioning ++ ? 
le16_to_cpu(sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max || ++ version_min < bcachefs_metadata_version_min) ++ return "Unsupported superblock version"; ++ ++ if (version_min > version) ++ return "Bad minimum version"; ++ ++ if (sb->features[1] || ++ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) ++ return "Filesystem has incompatible features"; ++ ++ block_size = le16_to_cpu(sb->block_size); ++ ++ if (!is_power_of_2(block_size) || ++ block_size > PAGE_SECTORS) ++ return "Bad block size"; ++ ++ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) ++ return "Bad user UUID"; ++ ++ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) ++ return "Bad internal UUID"; ++ ++ if (!sb->nr_devices || ++ sb->nr_devices <= sb->dev_idx || ++ sb->nr_devices > BCH_SB_MEMBERS_MAX) ++ return "Bad number of member devices"; ++ ++ if (!BCH_SB_META_REPLICAS_WANT(sb) || ++ BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of metadata replicas"; ++ ++ if (!BCH_SB_META_REPLICAS_REQ(sb) || ++ BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of metadata replicas"; ++ ++ if (!BCH_SB_DATA_REPLICAS_WANT(sb) || ++ BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of data replicas"; ++ ++ if (!BCH_SB_DATA_REPLICAS_REQ(sb) || ++ BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of data replicas"; ++ ++ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) ++ return "Invalid metadata checksum type"; ++ ++ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) ++ return "Invalid metadata checksum type"; ++ ++ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) ++ return "Invalid compression type"; ++ ++ if (!BCH_SB_BTREE_NODE_SIZE(sb)) ++ return "Btree node size not set"; ++ ++ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) ++ return "Btree node size not a power of two"; ++ ++ if (BCH_SB_GC_RESERVE(sb) < 5) ++ return "gc reserve percentage too small"; ++ ++ if (!sb->time_precision || ++ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) ++ return "invalid time precision"; ++ ++ /* validate layout */ ++ err = validate_sb_layout(&sb->layout); ++ if (err) ++ return err; ++ ++ vstruct_for_each(sb, f) { ++ if (!f->u64s) ++ return "Invalid superblock: invalid optional field"; ++ ++ if (vstruct_next(f) > vstruct_last(sb)) ++ return "Invalid superblock: invalid optional field"; ++ } ++ ++ /* members must be validated first: */ ++ mi = bch2_sb_get_members(sb); ++ if (!mi) ++ return "Invalid superblock: member info area missing"; ++ ++ err = bch2_sb_field_validate(sb, &mi->field); ++ if (err) ++ return err; ++ ++ vstruct_for_each(sb, f) { ++ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) ++ continue; ++ ++ err = bch2_sb_field_validate(sb, f); ++ if (err) ++ return err; ++ } ++ ++ return NULL; ++} ++ ++/* device open: */ ++ ++static void bch2_sb_update(struct bch_fs *c) ++{ ++ struct bch_sb *src = c->disk_sb.sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(src); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ c->sb.uuid = src->uuid; ++ c->sb.user_uuid = src->user_uuid; ++ c->sb.version = le16_to_cpu(src->version); ++ c->sb.nr_devices = src->nr_devices; ++ c->sb.clean = BCH_SB_CLEAN(src); ++ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); ++ c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); ++ c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); ++ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); ++ 
c->sb.time_precision = le32_to_cpu(src->time_precision); ++ c->sb.features = le64_to_cpu(src->features[0]); ++ c->sb.compat = le64_to_cpu(src->compat[0]); ++ ++ for_each_member_device(ca, c, i) ++ ca->mi = bch2_mi_to_cpu(mi->members + i); ++} ++ ++/* doesn't copy member info */ ++static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) ++{ ++ struct bch_sb_field *src_f, *dst_f; ++ struct bch_sb *dst = dst_handle->sb; ++ unsigned i; ++ ++ dst->version = src->version; ++ dst->version_min = src->version_min; ++ dst->seq = src->seq; ++ dst->uuid = src->uuid; ++ dst->user_uuid = src->user_uuid; ++ memcpy(dst->label, src->label, sizeof(dst->label)); ++ ++ dst->block_size = src->block_size; ++ dst->nr_devices = src->nr_devices; ++ ++ dst->time_base_lo = src->time_base_lo; ++ dst->time_base_hi = src->time_base_hi; ++ dst->time_precision = src->time_precision; ++ ++ memcpy(dst->flags, src->flags, sizeof(dst->flags)); ++ memcpy(dst->features, src->features, sizeof(dst->features)); ++ memcpy(dst->compat, src->compat, sizeof(dst->compat)); ++ ++ for (i = 0; i < BCH_SB_FIELD_NR; i++) { ++ if (i == BCH_SB_FIELD_journal) ++ continue; ++ ++ src_f = bch2_sb_field_get(src, i); ++ dst_f = bch2_sb_field_get(dst, i); ++ dst_f = __bch2_sb_field_resize(dst_handle, dst_f, ++ src_f ? le32_to_cpu(src_f->u64s) : 0); ++ ++ if (src_f) ++ memcpy(dst_f, src_f, vstruct_bytes(src_f)); ++ } ++} ++ ++int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) ++{ ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(src); ++ unsigned journal_u64s = journal_buckets ++ ? le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ int ret; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ ret = bch2_sb_realloc(&c->disk_sb, ++ le32_to_cpu(src->u64s) - journal_u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&c->disk_sb, src); ++ ++ ret = bch2_sb_replicas_to_cpu_replicas(c); ++ if (ret) ++ return ret; ++ ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ return ret; ++ ++ bch2_sb_update(c); ++ return 0; ++} ++ ++int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(dst); ++ unsigned journal_u64s = journal_buckets ++ ? 
le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; ++ int ret; ++ ++ ret = bch2_sb_realloc(&ca->disk_sb, u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&ca->disk_sb, src); ++ return 0; ++} ++ ++/* read superblock: */ ++ ++static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) ++{ ++ struct bch_csum csum; ++ size_t bytes; ++reread: ++ bio_reset(sb->bio); ++ bio_set_dev(sb->bio, sb->bdev); ++ sb->bio->bi_iter.bi_sector = offset; ++ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order); ++ ++ if (submit_bio_wait(sb->bio)) ++ return "IO error"; ++ ++ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) ++ return "Not a bcachefs superblock"; ++ ++ if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || ++ le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) ++ return "Unsupported superblock version"; ++ ++ bytes = vstruct_bytes(sb->sb); ++ ++ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) ++ return "Bad superblock: too big"; ++ ++ if (get_order(bytes) > sb->page_order) { ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) ++ return "cannot allocate memory"; ++ goto reread; ++ } ++ ++ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) ++ return "unknown csum type"; ++ ++ /* XXX: verify MACs */ ++ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), ++ null_nonce(), sb->sb); ++ ++ if (bch2_crc_cmp(csum, sb->sb->csum)) ++ return "bad checksum reading superblock"; ++ ++ sb->seq = le64_to_cpu(sb->sb->seq); ++ ++ return NULL; ++} ++ ++int bch2_read_super(const char *path, struct bch_opts *opts, ++ struct bch_sb_handle *sb) ++{ ++ u64 offset = opt_get(*opts, sb); ++ struct bch_sb_layout layout; ++ const char *err; ++ __le64 *i; ++ int ret; ++ ++ pr_verbose_init(*opts, ""); ++ ++ memset(sb, 0, sizeof(*sb)); ++ sb->mode = FMODE_READ; ++ sb->have_bio = true; ++ ++ if (!opt_get(*opts, noexcl)) ++ sb->mode |= FMODE_EXCL; ++ ++ if (!opt_get(*opts, nochanges)) ++ sb->mode |= FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (IS_ERR(sb->bdev) && ++ PTR_ERR(sb->bdev) == -EACCES && ++ opt_get(*opts, read_only)) { ++ sb->mode &= ~FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (!IS_ERR(sb->bdev)) ++ opt_set(*opts, nochanges, true); ++ } ++ ++ if (IS_ERR(sb->bdev)) { ++ ret = PTR_ERR(sb->bdev); ++ goto out; ++ } ++ ++ err = "cannot allocate memory"; ++ ret = bch2_sb_realloc(sb, 0); ++ if (ret) ++ goto err; ++ ++ ret = -EFAULT; ++ err = "dynamic fault"; ++ if (bch2_fs_init_fault("read_super")) ++ goto err; ++ ++ ret = -EINVAL; ++ err = read_one_super(sb, offset); ++ if (!err) ++ goto got_super; ++ ++ if (opt_defined(*opts, sb)) ++ goto err; ++ ++ pr_err("error reading default superblock: %s", err); ++ ++ /* ++ * Error reading primary superblock - read location of backup ++ * superblocks: ++ */ ++ bio_reset(sb->bio); ++ bio_set_dev(sb->bio, sb->bdev); ++ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; ++ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ /* ++ * use sb buffer to read layout, since sb buffer is page aligned but ++ * layout won't be: ++ */ ++ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); ++ ++ err = "IO error"; ++ if (submit_bio_wait(sb->bio)) ++ goto err; ++ ++ memcpy(&layout, sb->sb, sizeof(layout)); ++ err = validate_sb_layout(&layout); ++ if (err) ++ goto err; ++ ++ for (i = layout.sb_offset; ++ i < layout.sb_offset + layout.nr_superblocks; i++) { ++ 
offset = le64_to_cpu(*i); ++ ++ if (offset == opt_get(*opts, sb)) ++ continue; ++ ++ err = read_one_super(sb, offset); ++ if (!err) ++ goto got_super; ++ } ++ ++ ret = -EINVAL; ++ goto err; ++ ++got_super: ++ err = "Superblock block size smaller than device block size"; ++ ret = -EINVAL; ++ if (le16_to_cpu(sb->sb->block_size) << 9 < ++ bdev_logical_block_size(sb->bdev)) ++ goto err; ++ ++ ret = 0; ++ sb->have_layout = true; ++out: ++ pr_verbose_init(*opts, "ret %i", ret); ++ return ret; ++err: ++ bch2_free_super(sb); ++ pr_err("error reading superblock: %s", err); ++ goto out; ++} ++ ++/* write superblock: */ ++ ++static void write_super_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ ++ /* XXX: return errors directly */ ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write")) ++ ca->sb_write_error = 1; ++ ++ closure_put(&ca->fs->sb_write); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++static void read_back_super(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ sb->offset = sb->layout.sb_offset[idx]; ++ ++ SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); ++ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), ++ null_nonce(), sb); ++ ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); ++ bch2_bio_map(bio, sb, ++ roundup((size_t) vstruct_bytes(sb), ++ bdev_logical_block_size(ca->disk_sb.bdev))); ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++int bch2_write_super(struct bch_fs *c) ++{ ++ struct closure *cl = &c->sb_write; ++ struct bch_dev *ca; ++ unsigned i, sb = 0, nr_wrote; ++ const char *err; ++ struct bch_devs_mask sb_written; ++ bool wrote, can_mount_without_written, can_mount_with_written; ++ int ret = 0; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ closure_init_stack(cl); ++ memset(&sb_written, 0, sizeof(sb_written)); ++ ++ le64_add_cpu(&c->disk_sb.sb->seq, 1); ++ ++ if (test_bit(BCH_FS_ERROR, &c->flags)) ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ for_each_online_member(ca, c, i) { ++ err = bch2_sb_validate(&ca->disk_sb); ++ if (err) { ++ bch2_fs_inconsistent(c, "sb invalid before write: %s", err); ++ ret = -1; ++ goto out; ++ } ++ } ++ ++ if (c->opts.nochanges) ++ goto out; ++ ++ for_each_online_member(ca, c, i) { ++ __set_bit(ca->dev_idx, sb_written.d); ++ ca->sb_write_error = 0; ++ } ++ ++ for_each_online_member(ca, c, i) ++ read_back_super(c, ca); ++ closure_sync(cl); ++ ++ for_each_online_member(ca, c, i) { ++ if (!ca->sb_write_error && ++ ca->disk_sb.seq != ++ 
le64_to_cpu(ca->sb_read_scratch->seq)) { ++ bch2_fs_fatal_error(c, ++ "Superblock modified by another process"); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ } ++ ++ do { ++ wrote = false; ++ for_each_online_member(ca, c, i) ++ if (!ca->sb_write_error && ++ sb < ca->disk_sb.sb->layout.nr_superblocks) { ++ write_one_super(c, ca, sb); ++ wrote = true; ++ } ++ closure_sync(cl); ++ sb++; ++ } while (wrote); ++ ++ for_each_online_member(ca, c, i) { ++ if (ca->sb_write_error) ++ __clear_bit(ca->dev_idx, sb_written.d); ++ else ++ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); ++ } ++ ++ nr_wrote = dev_mask_nr(&sb_written); ++ ++ can_mount_with_written = ++ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), ++ BCH_FORCE_IF_DEGRADED); ++ ++ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) ++ sb_written.d[i] = ~sb_written.d[i]; ++ ++ can_mount_without_written = ++ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), ++ BCH_FORCE_IF_DEGRADED); ++ ++ /* ++ * If we would be able to mount _without_ the devices we successfully ++ * wrote superblocks to, we weren't able to write to enough devices: ++ * ++ * Exception: if we can mount without the successes because we haven't ++ * written anything (new filesystem), we continue if we'd be able to ++ * mount with the devices we did successfully write to: ++ */ ++ if (bch2_fs_fatal_err_on(!nr_wrote || ++ (can_mount_without_written && ++ !can_mount_with_written), c, ++ "Unable to write superblock to sufficient devices")) ++ ret = -1; ++out: ++ /* Make new options visible after they're persistent: */ ++ bch2_sb_update(c); ++ return ret; ++} ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static int u64_cmp(const void *_l, const void *_r) ++{ ++ u64 l = *((const u64 *) _l), r = *((const u64 *) _r); ++ ++ return l < r ? -1 : l > r ? 
1 : 0; ++} ++ ++static const char *bch2_sb_validate_journal(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ const char *err; ++ unsigned nr; ++ unsigned i; ++ u64 *b; ++ ++ journal = bch2_sb_get_journal(sb); ++ if (!journal) ++ return NULL; ++ ++ nr = bch2_nr_journal_buckets(journal); ++ if (!nr) ++ return NULL; ++ ++ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ if (!b) ++ return "cannot allocate memory"; ++ ++ for (i = 0; i < nr; i++) ++ b[i] = le64_to_cpu(journal->buckets[i]); ++ ++ sort(b, nr, sizeof(u64), u64_cmp, NULL); ++ ++ err = "journal bucket at sector 0"; ++ if (!b[0]) ++ goto err; ++ ++ err = "journal bucket before first bucket"; ++ if (m && b[0] < le16_to_cpu(m->first_bucket)) ++ goto err; ++ ++ err = "journal bucket past end of device"; ++ if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) ++ goto err; ++ ++ err = "duplicate journal buckets"; ++ for (i = 0; i + 1 < nr; i++) ++ if (b[i] == b[i + 1]) ++ goto err; ++ ++ err = NULL; ++err: ++ kfree(b); ++ return err; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_journal = { ++ .validate = bch2_sb_validate_journal, ++}; ++ ++/* BCH_SB_FIELD_members: */ ++ ++static const char *bch2_sb_validate_members(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_member *m; ++ ++ if ((void *) (mi->members + sb->nr_devices) > ++ vstruct_end(&mi->field)) ++ return "Invalid superblock: bad member info"; ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) { ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) ++ return "Too many buckets"; ++ ++ if (le64_to_cpu(m->nbuckets) - ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) ++ return "Not enough buckets"; ++ ++ if (le16_to_cpu(m->bucket_size) < ++ le16_to_cpu(sb->block_size)) ++ return "bucket size smaller than block size"; ++ ++ if (le16_to_cpu(m->bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(sb)) ++ return "bucket size smaller than btree node size"; ++ } ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_members = { ++ .validate = bch2_sb_validate_members, ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++static const char *bch2_sb_validate_crypt(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) ++ return "invalid field crypt: wrong size"; ++ ++ if (BCH_CRYPT_KDF_TYPE(crypt)) ++ return "invalid field crypt: bad kdf type"; ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { ++ .validate = bch2_sb_validate_crypt, ++}; ++ ++/* BCH_SB_FIELD_clean: */ ++ ++void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) ++{ ++ struct jset_entry *entry; ++ ++ for (entry = clean->start; ++ entry < (struct jset_entry *) vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) ++ bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); ++} ++ ++int bch2_fs_mark_dirty(struct bch_fs *c) ++{ ++ int ret; ++ ++ /* ++ * Unconditionally write superblock, to verify it hasn't changed before ++ * we go rw: ++ */ ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); ++ ret = bch2_write_super(c); ++ 
mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static void ++entry_init_u64s(struct jset_entry *entry, unsigned u64s) ++{ ++ memset(entry, 0, u64s * sizeof(u64)); ++ ++ /* ++ * The u64s field counts from the start of data, ignoring the shared ++ * fields. ++ */ ++ entry->u64s = u64s - 1; ++} ++ ++static void ++entry_init_size(struct jset_entry *entry, size_t size) ++{ ++ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); ++ entry_init_u64s(entry, u64s); ++} ++ ++struct jset_entry * ++bch2_journal_super_entries_add_common(struct bch_fs *c, ++ struct jset_entry *entry, ++ u64 journal_seq) ++{ ++ struct btree_root *r; ++ unsigned i; ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ for (r = c->btree_roots; ++ r < c->btree_roots + BTREE_ID_NR; ++ r++) ++ if (r->alive) { ++ entry_init_u64s(entry, r->key.u64s + 1); ++ entry->btree_id = r - c->btree_roots; ++ entry->level = r->level; ++ entry->type = BCH_JSET_ENTRY_btree_root; ++ bkey_copy(&entry->start[0], &r->key); ++ ++ entry = vstruct_next(entry); ++ } ++ c->btree_roots_dirty = false; ++ ++ mutex_unlock(&c->btree_root_lock); ++ ++ percpu_down_write(&c->mark_lock); ++ ++ if (!journal_seq) { ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ } else { ++ bch2_fs_usage_acc_to_base(c, journal_seq & 1); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_INODES; ++ u->v = cpu_to_le64(c->usage_base->nr_inodes); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_KEY_VERSION; ++ u->v = cpu_to_le64(atomic64_read(&c->key_version)); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_RESERVED; ++ u->entry.level = i; ++ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u) + e->nr_devs); ++ u->entry.type = BCH_JSET_ENTRY_data_usage; ++ u->v = cpu_to_le64(c->usage_base->replicas[i]); ++ memcpy(&u->r, e, replicas_entry_bytes(e)); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return entry; ++} ++ ++void bch2_fs_mark_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *sb_clean; ++ struct jset_entry *entry; ++ unsigned u64s; ++ ++ mutex_lock(&c->sb_lock); ++ if (BCH_SB_CLEAN(c->disk_sb.sb)) ++ goto out; ++ ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); ++ ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; ++ ++ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; ++ ++ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); ++ if (!sb_clean) { ++ bch_err(c, "error resizing superblock while setting filesystem clean"); ++ goto out; ++ } ++ ++ sb_clean->flags = 0; ++ sb_clean->read_clock = 
cpu_to_le16(c->bucket_clock[READ].hand); ++ sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); ++ sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); ++ ++ /* Trying to catch outstanding bug: */ ++ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); ++ ++ entry = sb_clean->start; ++ entry = bch2_journal_super_entries_add_common(c, entry, 0); ++ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); ++ ++ memset(entry, 0, ++ vstruct_end(&sb_clean->field) - (void *) entry); ++ ++ if (le16_to_cpu(c->disk_sb.sb->version) < ++ bcachefs_metadata_version_bkey_renumber) ++ bch2_sb_clean_renumber(sb_clean, WRITE); ++ ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++} ++ ++static const char *bch2_sb_validate_clean(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) ++ return "invalid field crypt: wrong size"; ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_clean = { ++ .validate = bch2_sb_validate_clean, ++}; ++ ++static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { ++#define x(f, nr) \ ++ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, ++ BCH_SB_FIELDS() ++#undef x ++}; ++ ++static const char *bch2_sb_field_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ ++ return type < BCH_SB_FIELD_NR ++ ? bch2_sb_field_ops[type]->validate(sb, f) ++ : NULL; ++} ++ ++void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR ++ ? bch2_sb_field_ops[type] : NULL; ++ ++ if (ops) ++ pr_buf(out, "%s", bch2_sb_fields[type]); ++ else ++ pr_buf(out, "(unknown field %u)", type); ++ ++ pr_buf(out, " (size %llu):", vstruct_bytes(f)); ++ ++ if (ops && ops->to_text) ++ bch2_sb_field_ops[type]->to_text(out, sb, f); ++} +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +new file mode 100644 +index 000000000000..f5450e596c62 +--- /dev/null ++++ b/fs/bcachefs/super-io.h +@@ -0,0 +1,150 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_IO_H ++#define _BCACHEFS_SUPER_IO_H ++ ++#include "extents.h" ++#include "eytzinger.h" ++#include "super_types.h" ++#include "super.h" ++ ++#include ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, ++ enum bch_sb_field_type, unsigned); ++void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); ++ ++#define field_to_type(_f, _name) \ ++ container_of_or_null(_f, struct bch_sb_field_##_name, field) ++ ++#define x(_name, _nr) \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_get_##_name(struct bch_sb *sb) \ ++{ \ ++ return field_to_type(bch2_sb_field_get(sb, \ ++ BCH_SB_FIELD_##_name), _name); \ ++} \ ++ \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ ++{ \ ++ return field_to_type(bch2_sb_field_resize(sb, \ ++ BCH_SB_FIELD_##_name, u64s), _name); \ ++} ++ ++BCH_SB_FIELDS() ++#undef x ++ ++extern const char * const bch2_sb_fields[]; ++ ++struct bch_sb_field_ops { ++ const char * (*validate)(struct bch_sb *, struct bch_sb_field *); ++ void (*to_text)(struct printbuf *, struct bch_sb *, ++ struct bch_sb_field *); ++}; ++ ++static inline bool bch2_sb_test_feature(struct bch_sb 
*sb, ++ enum bch_sb_features f) ++{ ++ unsigned w = f / 64; ++ unsigned b = f % 64; ++ ++ return le64_to_cpu(sb->features[w]) & (1ULL << b); ++} ++ ++static inline void bch2_sb_set_feature(struct bch_sb *sb, ++ enum bch_sb_features f) ++{ ++ if (!bch2_sb_test_feature(sb, f)) { ++ unsigned w = f / 64; ++ unsigned b = f % 64; ++ ++ le64_add_cpu(&sb->features[w], 1ULL << b); ++ } ++} ++ ++static inline __le64 bch2_sb_magic(struct bch_fs *c) ++{ ++ __le64 ret; ++ memcpy(&ret, &c->sb.uuid, sizeof(ret)); ++ return ret; ++} ++ ++static inline __u64 jset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); ++} ++ ++static inline __u64 bset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); ++} ++ ++int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); ++int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); ++ ++void bch2_free_super(struct bch_sb_handle *); ++int bch2_sb_realloc(struct bch_sb_handle *, unsigned); ++ ++const char *bch2_sb_validate(struct bch_sb_handle *); ++ ++int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); ++int bch2_write_super(struct bch_fs *); ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) ++{ ++ return j ++ ? (__le64 *) vstruct_end(&j->field) - j->buckets ++ : 0; ++} ++ ++/* BCH_SB_FIELD_members: */ ++ ++static inline bool bch2_member_exists(struct bch_member *m) ++{ ++ return !bch2_is_zero(m->uuid.b, sizeof(uuid_le)); ++} ++ ++static inline bool bch2_dev_exists(struct bch_sb *sb, ++ struct bch_sb_field_members *mi, ++ unsigned dev) ++{ ++ return dev < sb->nr_devices && ++ bch2_member_exists(&mi->members[dev]); ++} ++ ++static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ++{ ++ return (struct bch_member_cpu) { ++ .nbuckets = le64_to_cpu(mi->nbuckets), ++ .first_bucket = le16_to_cpu(mi->first_bucket), ++ .bucket_size = le16_to_cpu(mi->bucket_size), ++ .group = BCH_MEMBER_GROUP(mi), ++ .state = BCH_MEMBER_STATE(mi), ++ .replacement = BCH_MEMBER_REPLACEMENT(mi), ++ .discard = BCH_MEMBER_DISCARD(mi), ++ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), ++ .durability = BCH_MEMBER_DURABILITY(mi) ++ ? BCH_MEMBER_DURABILITY(mi) - 1 ++ : 1, ++ .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), ++ }; ++} ++ ++/* BCH_SB_FIELD_clean: */ ++ ++struct jset_entry * ++bch2_journal_super_entries_add_common(struct bch_fs *, ++ struct jset_entry *, u64); ++ ++void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); ++ ++int bch2_fs_mark_dirty(struct bch_fs *); ++void bch2_fs_mark_clean(struct bch_fs *); ++ ++void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, ++ struct bch_sb_field *); ++ ++#endif /* _BCACHEFS_SUPER_IO_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +new file mode 100644 +index 000000000000..f0af26bd328f +--- /dev/null ++++ b/fs/bcachefs/super.c +@@ -0,0 +1,1953 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs setup/teardown code, and some metadata io - read a superblock and ++ * figure out what to do with it. ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "chardev.h" ++#include "checksum.h" ++#include "clock.h" ++#include "compress.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "move.h" ++#include "migrate.h" ++#include "movinggc.h" ++#include "quota.h" ++#include "rebalance.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++#include "sysfs.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Kent Overstreet "); ++ ++#define KTYPE(type) \ ++struct kobj_type type ## _ktype = { \ ++ .release = type ## _release, \ ++ .sysfs_ops = &type ## _sysfs_ops, \ ++ .default_attrs = type ## _files \ ++} ++ ++static void bch2_fs_release(struct kobject *); ++static void bch2_dev_release(struct kobject *); ++ ++static void bch2_fs_internal_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_opts_dir_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_time_stats_release(struct kobject *k) ++{ ++} ++ ++static KTYPE(bch2_fs); ++static KTYPE(bch2_fs_internal); ++static KTYPE(bch2_fs_opts_dir); ++static KTYPE(bch2_fs_time_stats); ++static KTYPE(bch2_dev); ++ ++static struct kset *bcachefs_kset; ++static LIST_HEAD(bch_fs_list); ++static DEFINE_MUTEX(bch_fs_list_lock); ++ ++static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); ++ ++static void bch2_dev_free(struct bch_dev *); ++static int bch2_dev_alloc(struct bch_fs *, unsigned); ++static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); ++static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); ++ ++struct bch_fs *bch2_dev_to_fs(dev_t dev) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&bch_fs_list_lock); ++ rcu_read_lock(); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ for_each_member_device_rcu(ca, c, i, NULL) ++ if (ca->disk_sb.bdev->bd_dev == dev) { ++ closure_get(&c->cl); ++ goto found; ++ } ++ c = NULL; ++found: ++ rcu_read_unlock(); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) ++ return c; ++ ++ return NULL; ++} ++ ++struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ mutex_lock(&bch_fs_list_lock); ++ c = __bch2_uuid_to_fs(uuid); ++ if (c) ++ closure_get(&c->cl); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++/* Filesystem RO/RW: */ ++ ++/* ++ * For startup/shutdown of RW stuff, the dependencies are: ++ * ++ * - foreground writes depend on copygc and rebalance (to free up space) ++ * ++ * - copygc and rebalance depend on mark and sweep gc (they actually probably ++ * don't because they either reserve ahead of time or don't block if ++ * allocations fail, but allocations can require mark and sweep gc to run ++ * because of generation number wraparound) ++ * ++ * - all of the above 
depends on the allocator threads ++ * ++ * - allocator depends on the journal (when it rewrites prios and gens) ++ */ ++ ++static void __bch2_fs_read_only(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ bool wrote; ++ unsigned i, clean_passes = 0; ++ int ret; ++ ++ bch2_rebalance_stop(c); ++ ++ for_each_member_device(ca, c, i) ++ bch2_copygc_stop(ca); ++ ++ bch2_gc_thread_stop(c); ++ ++ /* ++ * Flush journal before stopping allocators, because flushing journal ++ * blacklist entries involves allocating new btree nodes: ++ */ ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) ++ goto allocator_not_running; ++ ++ do { ++ wrote = false; ++ ++ ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: ++ bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); ++ ++ if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) ++ bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); ++ ++ if (ret) ++ break; ++ ++ for_each_member_device(ca, c, i) ++ bch2_dev_allocator_quiesce(c, ca); ++ ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ /* ++ * We need to explicitly wait on btree interior updates to complete ++ * before stopping the journal, flushing all journal pins isn't ++ * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree ++ * interior updates have to drop their journal pin before they're ++ * fully complete: ++ */ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ ++ clean_passes = wrote ? 0 : clean_passes + 1; ++ } while (clean_passes < 2); ++allocator_not_running: ++ for_each_member_device(ca, c, i) ++ bch2_dev_allocator_stop(ca); ++ ++ clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); ++ ++ bch2_fs_journal_stop(&c->journal); ++ ++ /* XXX: mark super that alloc info is persistent */ ++ ++ /* ++ * the journal kicks off btree writes via reclaim - wait for in flight ++ * writes after stopping journal: ++ */ ++ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) ++ bch2_btree_flush_all_writes(c); ++ else ++ bch2_btree_verify_flushed(c); ++ ++ /* ++ * After stopping journal: ++ */ ++ for_each_member_device(ca, c, i) ++ bch2_dev_allocator_remove(c, ca); ++} ++ ++static void bch2_writes_disabled(struct percpu_ref *writes) ++{ ++ struct bch_fs *c = container_of(writes, struct bch_fs, writes); ++ ++ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ wake_up(&bch_read_only_wait); ++} ++ ++void bch2_fs_read_only(struct bch_fs *c) ++{ ++ if (!test_bit(BCH_FS_RW, &c->flags)) { ++ cancel_delayed_work_sync(&c->journal.reclaim_work); ++ return; ++ } ++ ++ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ /* ++ * Block new foreground-end write operations from starting - any new ++ * writes will return -EROFS: ++ * ++ * (This is really blocking new _allocations_, writes to previously ++ * allocated space can still happen until stopping the allocator in ++ * bch2_dev_allocator_stop()). 
++ */ ++ percpu_ref_kill(&c->writes); ++ ++ cancel_work_sync(&c->ec_stripe_delete_work); ++ cancel_delayed_work(&c->pd_controllers_update); ++ ++ /* ++ * If we're not doing an emergency shutdown, we want to wait on ++ * outstanding writes to complete so they don't see spurious errors due ++ * to shutting down the allocator: ++ * ++ * If we are doing an emergency shutdown outstanding writes may ++ * hang until we shutdown the allocator so we don't want to wait ++ * on outstanding writes before shutting everything down - but ++ * we do need to wait on them before returning and signalling ++ * that going RO is complete: ++ */ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || ++ test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); ++ ++ __bch2_fs_read_only(c); ++ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ ++ if (!bch2_journal_error(&c->journal) && ++ !test_bit(BCH_FS_ERROR, &c->flags) && ++ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && ++ test_bit(BCH_FS_STARTED, &c->flags) && ++ !c->opts.norecovery) ++ bch2_fs_mark_clean(c); ++ ++ clear_bit(BCH_FS_RW, &c->flags); ++} ++ ++static void bch2_fs_read_only_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, read_only_work); ++ ++ mutex_lock(&c->state_lock); ++ bch2_fs_read_only(c); ++ mutex_unlock(&c->state_lock); ++} ++ ++static void bch2_fs_read_only_async(struct bch_fs *c) ++{ ++ queue_work(system_long_wq, &c->read_only_work); ++} ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *c) ++{ ++ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); ++ ++ bch2_fs_read_only_async(c); ++ bch2_journal_halt(&c->journal); ++ ++ wake_up(&bch_read_only_wait); ++ return ret; ++} ++ ++static int bch2_fs_read_write_late(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ ret = bch2_gc_thread_start(c); ++ if (ret) { ++ bch_err(c, "error starting gc thread"); ++ return ret; ++ } ++ ++ for_each_rw_member(ca, c, i) { ++ ret = bch2_copygc_start(c, ca); ++ if (ret) { ++ bch_err(c, "error starting copygc threads"); ++ percpu_ref_put(&ca->io_ref); ++ return ret; ++ } ++ } ++ ++ ret = bch2_rebalance_start(c); ++ if (ret) { ++ bch_err(c, "error starting rebalance thread"); ++ return ret; ++ } ++ ++ schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); ++ ++ schedule_work(&c->ec_stripe_delete_work); ++ ++ return 0; ++} ++ ++int __bch2_fs_read_write(struct bch_fs *c, bool early) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ if (test_bit(BCH_FS_RW, &c->flags)) ++ return 0; ++ ++ /* ++ * nochanges is used for fsck -n mode - we have to allow going rw ++ * during recovery for that to work: ++ */ ++ if (c->opts.norecovery || ++ (c->opts.nochanges && ++ (!early || c->opts.read_only))) ++ return -EROFS; ++ ++ ret = bch2_fs_mark_dirty(c); ++ if (ret) ++ goto err; ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ if (!test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { ++ ret = bch2_fs_allocator_start(c); ++ if (ret) { ++ bch_err(c, "error initializing allocator"); ++ goto err; ++ } ++ ++ set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); ++ } ++ ++ for_each_rw_member(ca, c, i) { ++ ret = bch2_dev_allocator_start(ca); ++ if (ret) { ++ bch_err(c, "error starting allocator threads"); ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ } ++ ++ set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); ++ ++ if (!early) { 
++ ret = bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ percpu_ref_reinit(&c->writes); ++ set_bit(BCH_FS_RW, &c->flags); ++ ++ queue_delayed_work(c->journal_reclaim_wq, ++ &c->journal.reclaim_work, 0); ++ return 0; ++err: ++ __bch2_fs_read_only(c); ++ return ret; ++} ++ ++int bch2_fs_read_write(struct bch_fs *c) ++{ ++ return __bch2_fs_read_write(c, false); ++} ++ ++int bch2_fs_read_write_early(struct bch_fs *c) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ return __bch2_fs_read_write(c, true); ++} ++ ++/* Filesystem startup/shutdown: */ ++ ++static void bch2_fs_free(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_exit(&c->times[i]); ++ ++ bch2_fs_quota_exit(c); ++ bch2_fs_fsio_exit(c); ++ bch2_fs_ec_exit(c); ++ bch2_fs_encryption_exit(c); ++ bch2_fs_io_exit(c); ++ bch2_fs_btree_iter_exit(c); ++ bch2_fs_btree_cache_exit(c); ++ bch2_fs_journal_exit(&c->journal); ++ bch2_io_clock_exit(&c->io_clock[WRITE]); ++ bch2_io_clock_exit(&c->io_clock[READ]); ++ bch2_fs_compress_exit(c); ++ percpu_free_rwsem(&c->mark_lock); ++ kfree(c->usage_scratch); ++ free_percpu(c->usage[1]); ++ free_percpu(c->usage[0]); ++ kfree(c->usage_base); ++ free_percpu(c->pcpu); ++ mempool_exit(&c->btree_bounce_pool); ++ bioset_exit(&c->btree_bio); ++ mempool_exit(&c->btree_interior_update_pool); ++ mempool_exit(&c->btree_reserve_pool); ++ mempool_exit(&c->fill_iter); ++ percpu_ref_exit(&c->writes); ++ kfree(c->replicas.entries); ++ kfree(c->replicas_gc.entries); ++ kfree(rcu_dereference_protected(c->disk_groups, 1)); ++ kfree(c->journal_seq_blacklist_table); ++ ++ if (c->journal_reclaim_wq) ++ destroy_workqueue(c->journal_reclaim_wq); ++ if (c->copygc_wq) ++ destroy_workqueue(c->copygc_wq); ++ if (c->wq) ++ destroy_workqueue(c->wq); ++ ++ free_pages((unsigned long) c->disk_sb.sb, ++ c->disk_sb.page_order); ++ kvpfree(c, sizeof(*c)); ++ module_put(THIS_MODULE); ++} ++ ++static void bch2_fs_release(struct kobject *kobj) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ bch2_fs_free(c); ++} ++ ++void bch2_fs_stop(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ bch_verbose(c, "shutting down"); ++ ++ set_bit(BCH_FS_STOPPING, &c->flags); ++ ++ cancel_work_sync(&c->journal_seq_blacklist_gc_work); ++ ++ for_each_member_device(ca, c, i) ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); ++ ++ if (c->kobj.state_in_sysfs) ++ kobject_del(&c->kobj); ++ ++ bch2_fs_debug_exit(c); ++ bch2_fs_chardev_exit(c); ++ ++ kobject_put(&c->time_stats); ++ kobject_put(&c->opts_dir); ++ kobject_put(&c->internal); ++ ++ mutex_lock(&bch_fs_list_lock); ++ list_del(&c->list); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ closure_sync(&c->cl); ++ closure_debug_destroy(&c->cl); ++ ++ mutex_lock(&c->state_lock); ++ bch2_fs_read_only(c); ++ mutex_unlock(&c->state_lock); ++ ++ /* btree prefetch might have kicked off reads in the background: */ ++ bch2_btree_flush_all_reads(c); ++ ++ for_each_member_device(ca, c, i) ++ cancel_work_sync(&ca->io_error_work); ++ ++ cancel_work_sync(&c->btree_write_error_work); ++ cancel_delayed_work_sync(&c->pd_controllers_update); ++ cancel_work_sync(&c->read_only_work); ++ ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (c->devs[i]) ++ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); ++ ++ bch_verbose(c, "shutdown complete"); ++ ++ kobject_put(&c->kobj); ++} ++ ++static const char *bch2_fs_online(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ 
const char *err = NULL; ++ unsigned i; ++ int ret; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ if (!list_empty(&c->list)) ++ return NULL; ++ ++ if (__bch2_uuid_to_fs(c->sb.uuid)) ++ return "filesystem UUID already open"; ++ ++ ret = bch2_fs_chardev_init(c); ++ if (ret) ++ return "error creating character device"; ++ ++ bch2_fs_debug_init(c); ++ ++ if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || ++ kobject_add(&c->internal, &c->kobj, "internal") || ++ kobject_add(&c->opts_dir, &c->kobj, "options") || ++ kobject_add(&c->time_stats, &c->kobj, "time_stats") || ++ bch2_opts_create_sysfs_files(&c->opts_dir)) ++ return "error creating sysfs objects"; ++ ++ mutex_lock(&c->state_lock); ++ ++ err = "error creating sysfs objects"; ++ __for_each_member_device(ca, c, i, NULL) ++ if (bch2_dev_sysfs_online(c, ca)) ++ goto err; ++ ++ list_add(&c->list, &bch_fs_list); ++ err = NULL; ++err: ++ mutex_unlock(&c->state_lock); ++ return err; ++} ++ ++static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_fs *c; ++ unsigned i, iter_size; ++ const char *err; ++ ++ pr_verbose_init(opts, ""); ++ ++ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); ++ if (!c) ++ goto out; ++ ++ __module_get(THIS_MODULE); ++ ++ c->minor = -1; ++ c->disk_sb.fs_sb = true; ++ ++ mutex_init(&c->state_lock); ++ mutex_init(&c->sb_lock); ++ mutex_init(&c->replicas_gc_lock); ++ mutex_init(&c->btree_root_lock); ++ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); ++ ++ init_rwsem(&c->gc_lock); ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_init(&c->times[i]); ++ ++ bch2_fs_allocator_background_init(c); ++ bch2_fs_allocator_foreground_init(c); ++ bch2_fs_rebalance_init(c); ++ bch2_fs_quota_init(c); ++ ++ INIT_LIST_HEAD(&c->list); ++ ++ INIT_LIST_HEAD(&c->btree_interior_update_list); ++ mutex_init(&c->btree_reserve_cache_lock); ++ mutex_init(&c->btree_interior_update_lock); ++ ++ mutex_init(&c->usage_scratch_lock); ++ ++ mutex_init(&c->bio_bounce_pages_lock); ++ ++ bio_list_init(&c->btree_write_error_list); ++ spin_lock_init(&c->btree_write_error_lock); ++ INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); ++ ++ INIT_WORK(&c->journal_seq_blacklist_gc_work, ++ bch2_blacklist_entries_gc); ++ ++ INIT_LIST_HEAD(&c->fsck_errors); ++ mutex_init(&c->fsck_error_lock); ++ ++ INIT_LIST_HEAD(&c->ec_new_stripe_list); ++ mutex_init(&c->ec_new_stripe_lock); ++ mutex_init(&c->ec_stripe_create_lock); ++ spin_lock_init(&c->ec_stripes_heap_lock); ++ ++ seqcount_init(&c->gc_pos_lock); ++ ++ seqcount_init(&c->usage_lock); ++ ++ c->copy_gc_enabled = 1; ++ c->rebalance.enabled = 1; ++ c->promote_whole_extents = true; ++ ++ c->journal.write_time = &c->times[BCH_TIME_journal_write]; ++ c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; ++ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; ++ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; ++ ++ bch2_fs_btree_cache_init_early(&c->btree_cache); ++ ++ if (percpu_init_rwsem(&c->mark_lock)) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (bch2_sb_to_fs(c, sb)) { ++ mutex_unlock(&c->sb_lock); ++ goto err; ++ } ++ ++ mutex_unlock(&c->sb_lock); ++ ++ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); ++ ++ c->opts = bch2_opts_default; ++ bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); ++ bch2_opts_apply(&c->opts, opts); ++ ++ c->block_bits = ilog2(c->opts.block_size); ++ c->btree_foreground_merge_threshold = 
BTREE_FOREGROUND_MERGE_THRESHOLD(c); ++ ++ if (bch2_fs_init_fault("fs_alloc")) ++ goto err; ++ ++ iter_size = sizeof(struct btree_node_iter_large) + ++ (btree_blocks(c) + 1) * 2 * ++ sizeof(struct btree_node_iter_set); ++ ++ if (!(c->wq = alloc_workqueue("bcachefs", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->copygc_wq = alloc_workqueue("bcache_copygc", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || ++ percpu_ref_init(&c->writes, bch2_writes_disabled, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, ++ sizeof(struct btree_reserve)) || ++ mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, ++ sizeof(struct btree_update)) || ++ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || ++ bioset_init(&c->btree_bio, 1, ++ max(offsetof(struct btree_read_bio, bio), ++ offsetof(struct btree_write_bio, wbio.bio)), ++ BIOSET_NEED_BVECS) || ++ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || ++ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, ++ btree_bytes(c)) || ++ bch2_io_clock_init(&c->io_clock[READ]) || ++ bch2_io_clock_init(&c->io_clock[WRITE]) || ++ bch2_fs_journal_init(&c->journal) || ++ bch2_fs_replicas_init(c) || ++ bch2_fs_btree_cache_init(c) || ++ bch2_fs_btree_iter_init(c) || ++ bch2_fs_io_init(c) || ++ bch2_fs_encryption_init(c) || ++ bch2_fs_compress_init(c) || ++ bch2_fs_ec_init(c) || ++ bch2_fs_fsio_init(c)) ++ goto err; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (bch2_dev_exists(c->disk_sb.sb, mi, i) && ++ bch2_dev_alloc(c, i)) ++ goto err; ++ ++ /* ++ * Now that all allocations have succeeded, init various refcounty ++ * things that let us shutdown: ++ */ ++ closure_init(&c->cl, NULL); ++ ++ c->kobj.kset = bcachefs_kset; ++ kobject_init(&c->kobj, &bch2_fs_ktype); ++ kobject_init(&c->internal, &bch2_fs_internal_ktype); ++ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); ++ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); ++ ++ mutex_lock(&bch_fs_list_lock); ++ err = bch2_fs_online(c); ++ mutex_unlock(&bch_fs_list_lock); ++ if (err) { ++ bch_err(c, "bch2_fs_online() error: %s", err); ++ goto err; ++ } ++out: ++ pr_verbose_init(opts, "ret %i", c ? 
0 : -ENOMEM); ++ return c; ++err: ++ bch2_fs_free(c); ++ c = NULL; ++ goto out; ++} ++ ++noinline_for_stack ++static void print_mount_opts(struct bch_fs *c) ++{ ++ enum bch_opt_id i; ++ char buf[512]; ++ struct printbuf p = PBUF(buf); ++ bool first = true; ++ ++ strcpy(buf, "(null)"); ++ ++ if (c->opts.read_only) { ++ pr_buf(&p, "ro"); ++ first = false; ++ } ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->mode & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ if (!first) ++ pr_buf(&p, ","); ++ first = false; ++ bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); ++ } ++ ++ bch_info(c, "mounted with opts: %s", buf); ++} ++ ++int bch2_fs_start(struct bch_fs *c) ++{ ++ const char *err = "cannot allocate memory"; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ time64_t now = ktime_get_real_seconds(); ++ unsigned i; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->state_lock); ++ ++ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for_each_online_member(ca, c, i) ++ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ++ ? bch2_fs_recovery(c) ++ : bch2_fs_initialize(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_opts_check_may_set(c); ++ if (ret) ++ goto err; ++ ++ err = "dynamic fault"; ++ ret = -EINVAL; ++ if (bch2_fs_init_fault("fs_start")) ++ goto err; ++ ++ if (c->opts.read_only || c->opts.nochanges) { ++ bch2_fs_read_only(c); ++ } else { ++ err = "error going read write"; ++ ret = !test_bit(BCH_FS_RW, &c->flags) ++ ? 
bch2_fs_read_write(c) ++ : bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ set_bit(BCH_FS_STARTED, &c->flags); ++ print_mount_opts(c); ++ ret = 0; ++out: ++ mutex_unlock(&c->state_lock); ++ return ret; ++err: ++ switch (ret) { ++ case BCH_FSCK_ERRORS_NOT_FIXED: ++ bch_err(c, "filesystem contains errors: please report this to the developers"); ++ pr_cont("mount with -o fix_errors to repair\n"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_REPAIR_UNIMPLEMENTED: ++ bch_err(c, "filesystem contains errors: please report this to the developers"); ++ pr_cont("repair unimplemented: inform the developers so that it can be added\n"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_REPAIR_IMPOSSIBLE: ++ bch_err(c, "filesystem contains errors, but repair impossible"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_UNKNOWN_VERSION: ++ err = "unknown metadata version";; ++ break; ++ case -ENOMEM: ++ err = "cannot allocate memory"; ++ break; ++ case -EIO: ++ err = "IO error"; ++ break; ++ } ++ ++ if (ret >= 0) ++ ret = -EIO; ++ goto out; ++} ++ ++static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) ++{ ++ struct bch_sb_field_members *sb_mi; ++ ++ sb_mi = bch2_sb_get_members(sb); ++ if (!sb_mi) ++ return "Invalid superblock: member info area missing"; ++ ++ if (le16_to_cpu(sb->block_size) != c->opts.block_size) ++ return "mismatched block size"; ++ ++ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) ++ return "new cache bucket size is too small"; ++ ++ return NULL; ++} ++ ++static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) ++{ ++ struct bch_sb *newest = ++ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(newest); ++ ++ if (uuid_le_cmp(fs->uuid, sb->uuid)) ++ return "device not a member of filesystem"; ++ ++ if (!bch2_dev_exists(newest, mi, sb->dev_idx)) ++ return "device has been removed"; ++ ++ if (fs->block_size != sb->block_size) ++ return "mismatched block size"; ++ ++ return NULL; ++} ++ ++/* Device startup/shutdown: */ ++ ++static void bch2_dev_release(struct kobject *kobj) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ ++ kfree(ca); ++} ++ ++static void bch2_dev_free(struct bch_dev *ca) ++{ ++ cancel_work_sync(&ca->io_error_work); ++ ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); ++ ++ if (ca->kobj.state_in_sysfs) ++ kobject_del(&ca->kobj); ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); ++ ++ free_percpu(ca->io_done); ++ bioset_exit(&ca->replica_set); ++ bch2_dev_buckets_free(ca); ++ free_page((unsigned long) ca->sb_read_scratch); ++ ++ bch2_time_stats_exit(&ca->io_latency[WRITE]); ++ bch2_time_stats_exit(&ca->io_latency[READ]); ++ ++ percpu_ref_exit(&ca->io_ref); ++ percpu_ref_exit(&ca->ref); ++ kobject_put(&ca->kobj); ++} ++ ++static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) ++{ ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (percpu_ref_is_zero(&ca->io_ref)) ++ return; ++ ++ __bch2_dev_read_only(c, ca); ++ ++ reinit_completion(&ca->io_ref_completion); ++ percpu_ref_kill(&ca->io_ref); ++ wait_for_completion(&ca->io_ref_completion); ++ ++ if (ca->kobj.state_in_sysfs) { ++ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); ++ sysfs_remove_link(&ca->kobj, "block"); ++ } ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); ++} ++ ++static void 
bch2_dev_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, ref); ++ ++ complete(&ca->ref_completion); ++} ++ ++static void bch2_dev_io_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); ++ ++ complete(&ca->io_ref_completion); ++} ++ ++static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) ++{ ++ int ret; ++ ++ if (!c->kobj.state_in_sysfs) ++ return 0; ++ ++ if (!ca->kobj.state_in_sysfs) { ++ ret = kobject_add(&ca->kobj, &c->kobj, ++ "dev-%u", ca->dev_idx); ++ if (ret) ++ return ret; ++ } ++ ++ if (ca->disk_sb.bdev) { ++ struct kobject *block = bdev_kobj(ca->disk_sb.bdev); ++ ++ ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); ++ if (ret) ++ return ret; ++ ++ ret = sysfs_create_link(&ca->kobj, block, "block"); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ++ struct bch_member *member) ++{ ++ struct bch_dev *ca; ++ ++ ca = kzalloc(sizeof(*ca), GFP_KERNEL); ++ if (!ca) ++ return NULL; ++ ++ kobject_init(&ca->kobj, &bch2_dev_ktype); ++ init_completion(&ca->ref_completion); ++ init_completion(&ca->io_ref_completion); ++ ++ init_rwsem(&ca->bucket_lock); ++ ++ writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); ++ ++ spin_lock_init(&ca->freelist_lock); ++ bch2_dev_copygc_init(ca); ++ ++ INIT_WORK(&ca->io_error_work, bch2_io_error_work); ++ ++ bch2_time_stats_init(&ca->io_latency[READ]); ++ bch2_time_stats_init(&ca->io_latency[WRITE]); ++ ++ ca->mi = bch2_mi_to_cpu(member); ++ ca->uuid = member->uuid; ++ ++ if (opt_defined(c->opts, discard)) ++ ca->mi.discard = opt_get(c->opts, discard); ++ ++ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, ++ 0, GFP_KERNEL) || ++ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || ++ bch2_dev_buckets_alloc(c, ca) || ++ bioset_init(&ca->replica_set, 4, ++ offsetof(struct bch_write_bio, bio), 0) || ++ !(ca->io_done = alloc_percpu(*ca->io_done))) ++ goto err; ++ ++ return ca; ++err: ++ bch2_dev_free(ca); ++ return NULL; ++} ++ ++static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, ++ unsigned dev_idx) ++{ ++ ca->dev_idx = dev_idx; ++ __set_bit(ca->dev_idx, ca->self.d); ++ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ++ ++ ca->fs = c; ++ rcu_assign_pointer(c->devs[ca->dev_idx], ca); ++ ++ if (bch2_dev_sysfs_online(c, ca)) ++ pr_warn("error creating sysfs objects"); ++} ++ ++static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ++{ ++ struct bch_member *member = ++ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; ++ struct bch_dev *ca = NULL; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bch2_fs_init_fault("dev_alloc")) ++ goto err; ++ ++ ca = __bch2_dev_alloc(c, member); ++ if (!ca) ++ goto err; ++ ++ bch2_dev_attach(c, ca, dev_idx); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ++{ ++ unsigned ret; ++ ++ if (bch2_dev_is_online(ca)) { ++ bch_err(ca, "already have device online in slot %u", ++ sb->sb->dev_idx); ++ return -EINVAL; ++ } ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * ca->mi.nbuckets) { ++ bch_err(ca, "cannot online: device too small"); ++ return -EINVAL; ++ } ++ ++ 
BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * ca->mi.nbuckets) { ++ bch_err(ca, "device too small"); ++ return -EINVAL; ++ } ++ ++ ret = bch2_dev_journal_init(ca, sb->sb); ++ if (ret) ++ return ret; ++ ++ /* Commit: */ ++ ca->disk_sb = *sb; ++ if (sb->mode & FMODE_EXCL) ++ ca->disk_sb.bdev->bd_holder = ca; ++ memset(sb, 0, sizeof(*sb)); ++ ++ percpu_ref_reinit(&ca->io_ref); ++ ++ return 0; ++} ++ ++static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (le64_to_cpu(sb->sb->seq) > ++ le64_to_cpu(c->disk_sb.sb->seq)) ++ bch2_sb_to_fs(c, sb->sb); ++ ++ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || ++ !c->devs[sb->sb->dev_idx]); ++ ++ ca = bch_dev_locked(c, sb->sb->dev_idx); ++ ++ ret = __bch2_dev_attach_bdev(ca, sb); ++ if (ret) ++ return ret; ++ ++ if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && ++ !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) { ++ mutex_lock(&c->sb_lock); ++ bch2_mark_dev_superblock(ca->fs, ca, 0); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ bch2_dev_sysfs_online(c, ca); ++ ++ if (c->sb.nr_devices == 1) ++ bdevname(ca->disk_sb.bdev, c->name); ++ bdevname(ca->disk_sb.bdev, ca->name); ++ ++ rebalance_wakeup(c); ++ return 0; ++} ++ ++/* Device management: */ ++ ++/* ++ * Note: this function is also used by the error paths - when a particular ++ * device sees an error, we call it to determine whether we can just set the ++ * device RO, or - if this function returns false - we'll set the whole ++ * filesystem RO: ++ * ++ * XXX: maybe we should be more explicit about whether we're changing state ++ * because we got an error or what have you? ++ */ ++bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_devs_mask new_online_devs; ++ struct replicas_status s; ++ struct bch_dev *ca2; ++ int i, nr_rw = 0, required; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ switch (new_state) { ++ case BCH_MEMBER_STATE_RW: ++ return true; ++ case BCH_MEMBER_STATE_RO: ++ if (ca->mi.state != BCH_MEMBER_STATE_RW) ++ return true; ++ ++ /* do we have enough devices to write to? */ ++ for_each_member_device(ca2, c, i) ++ if (ca2 != ca) ++ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; ++ ++ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ++ ? c->opts.metadata_replicas ++ : c->opts.metadata_replicas_required, ++ !(flags & BCH_FORCE_IF_DATA_DEGRADED) ++ ? c->opts.data_replicas ++ : c->opts.data_replicas_required); ++ ++ return nr_rw >= required; ++ case BCH_MEMBER_STATE_FAILED: ++ case BCH_MEMBER_STATE_SPARE: ++ if (ca->mi.state != BCH_MEMBER_STATE_RW && ++ ca->mi.state != BCH_MEMBER_STATE_RO) ++ return true; ++ ++ /* do we have enough devices to read from? */ ++ new_online_devs = bch2_online_devs(c); ++ __clear_bit(ca->dev_idx, new_online_devs.d); ++ ++ s = __bch2_replicas_status(c, new_online_devs); ++ ++ return bch2_have_enough_devs(s, flags); ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_fs_may_start(struct bch_fs *c) ++{ ++ struct replicas_status s; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned i, flags = c->opts.degraded ++ ? 
BCH_FORCE_IF_DEGRADED ++ : 0; ++ ++ if (!c->opts.degraded) { ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) ++ continue; ++ ++ ca = bch_dev_locked(c, i); ++ ++ if (!bch2_dev_is_online(ca) && ++ (ca->mi.state == BCH_MEMBER_STATE_RW || ++ ca->mi.state == BCH_MEMBER_STATE_RO)) { ++ mutex_unlock(&c->sb_lock); ++ return false; ++ } ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ s = bch2_replicas_status(c); ++ ++ return bch2_have_enough_devs(s, flags); ++} ++ ++static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) ++{ ++ bch2_copygc_stop(ca); ++ ++ /* ++ * The allocator thread itself allocates btree nodes, so stop it first: ++ */ ++ bch2_dev_allocator_stop(ca); ++ bch2_dev_allocator_remove(c, ca); ++ bch2_dev_journal_stop(&c->journal, ca); ++} ++ ++static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); ++ ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ if (bch2_dev_allocator_start(ca)) ++ return "error starting allocator thread"; ++ ++ if (bch2_copygc_start(c, ca)) ++ return "error starting copygc thread"; ++ ++ return NULL; ++} ++ ++int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ int ret = 0; ++ ++ if (ca->mi.state == new_state) ++ return 0; ++ ++ if (!bch2_dev_state_allowed(c, ca, new_state, flags)) ++ return -EINVAL; ++ ++ if (new_state != BCH_MEMBER_STATE_RW) ++ __bch2_dev_read_only(c, ca); ++ ++ bch_notice(ca, "%s", bch2_dev_state[new_state]); ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (new_state == BCH_MEMBER_STATE_RW && ++ __bch2_dev_read_write(c, ca)) ++ ret = -ENOMEM; ++ ++ rebalance_wakeup(c); ++ ++ return ret; ++} ++ ++int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ int ret; ++ ++ mutex_lock(&c->state_lock); ++ ret = __bch2_dev_set_state(c, ca, new_state, flags); ++ mutex_unlock(&c->state_lock); ++ ++ return ret; ++} ++ ++/* Device add/removal: */ ++ ++int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ unsigned dev_idx = ca->dev_idx, data; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->state_lock); ++ ++ percpu_ref_put(&ca->ref); /* XXX */ ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { ++ bch_err(ca, "Cannot remove without losing data"); ++ goto err; ++ } ++ ++ __bch2_dev_read_only(c, ca); ++ ++ /* ++ * XXX: verify that dev_idx is really not in use anymore, anywhere ++ * ++ * flag_data_bad() does not check btree pointers ++ */ ++ ret = bch2_dev_data_drop(c, ca->dev_idx, flags); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i dropping data", ret); ++ goto err; ++ } ++ ++ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i flushing journal", ret); ++ goto err; ++ } ++ ++ data = bch2_dev_has_data(c, ca); ++ if (data) { ++ char data_has_str[100]; ++ ++ bch2_flags_to_text(&PBUF(data_has_str), ++ bch2_data_types, data); ++ bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ++ ret = -EBUSY; ++ goto err; ++ } ++ ++ ret = 
bch2_btree_delete_range(c, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, 0), ++ POS(ca->dev_idx + 1, 0), ++ NULL); ++ if (ret) { ++ bch_err(ca, "Remove failed, error deleting alloc info"); ++ goto err; ++ } ++ ++ /* ++ * must flush all existing journal entries, they might have ++ * (overwritten) keys that point to the device we're removing: ++ */ ++ bch2_journal_flush_all_pins(&c->journal); ++ ret = bch2_journal_error(&c->journal); ++ if (ret) { ++ bch_err(ca, "Remove failed, journal error"); ++ goto err; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ mutex_lock(&c->sb_lock); ++ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); ++ mutex_unlock(&c->sb_lock); ++ ++ percpu_ref_kill(&ca->ref); ++ wait_for_completion(&ca->ref_completion); ++ ++ bch2_dev_free(ca); ++ ++ /* ++ * Free this device's slot in the bch_member array - all pointers to ++ * this device must be gone: ++ */ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); ++ ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ mutex_unlock(&c->state_lock); ++ return 0; ++err: ++ if (ca->mi.state == BCH_MEMBER_STATE_RW && ++ !percpu_ref_is_zero(&ca->io_ref)) ++ __bch2_dev_read_write(c, ca); ++ mutex_unlock(&c->state_lock); ++ return ret; ++} ++ ++static void dev_usage_clear(struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ ++ percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); ++ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); ++ up_read(&ca->bucket_lock); ++} ++ ++/* Add new device to running filesystem: */ ++int bch2_dev_add(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb; ++ const char *err; ++ struct bch_dev *ca = NULL; ++ struct bch_sb_field_members *mi; ++ struct bch_member dev_mi; ++ unsigned dev_idx, nr_devices, u64s; ++ int ret; ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) ++ return ret; ++ ++ err = bch2_sb_validate(&sb); ++ if (err) ++ return -EINVAL; ++ ++ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; ++ ++ err = bch2_dev_may_add(sb.sb, c); ++ if (err) ++ return -EINVAL; ++ ++ ca = __bch2_dev_alloc(c, &dev_mi); ++ if (!ca) { ++ bch2_free_super(&sb); ++ return -ENOMEM; ++ } ++ ++ ret = __bch2_dev_attach_bdev(ca, &sb); ++ if (ret) { ++ bch2_dev_free(ca); ++ return ret; ++ } ++ ++ /* ++ * We want to allocate journal on the new device before adding the new ++ * device to the filesystem because allocating after we attach requires ++ * spinning up the allocator thread, and the allocator thread requires ++ * doing btree writes, which if the existing devices are RO isn't going ++ * to work ++ * ++ * So we have to mark where the superblocks are, but marking allocated ++ * data normally updates the filesystem usage too, so we have to mark, ++ * allocate the journal, reset all the marks, then remark after we ++ * attach... 
++ */ ++ bch2_mark_dev_superblock(ca->fs, ca, 0); ++ ++ err = "journal alloc failed"; ++ ret = bch2_dev_journal_alloc(ca); ++ if (ret) ++ goto err; ++ ++ dev_usage_clear(ca); ++ ++ mutex_lock(&c->state_lock); ++ mutex_lock(&c->sb_lock); ++ ++ err = "insufficient space in new superblock"; ++ ret = bch2_sb_from_fs(c, ca); ++ if (ret) ++ goto err_unlock; ++ ++ mi = bch2_sb_get_members(ca->disk_sb.sb); ++ ++ if (!bch2_sb_resize_members(&ca->disk_sb, ++ le32_to_cpu(mi->field.u64s) + ++ sizeof(dev_mi) / sizeof(u64))) { ++ ret = -ENOSPC; ++ goto err_unlock; ++ } ++ ++ if (dynamic_fault("bcachefs:add:no_slot")) ++ goto no_slot; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) ++ goto have_slot; ++no_slot: ++ err = "no slots available in superblock"; ++ ret = -ENOSPC; ++ goto err_unlock; ++ ++have_slot: ++ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); ++ u64s = (sizeof(struct bch_sb_field_members) + ++ sizeof(struct bch_member) * nr_devices) / sizeof(u64); ++ ++ err = "no space in superblock for member info"; ++ ret = -ENOSPC; ++ ++ mi = bch2_sb_resize_members(&c->disk_sb, u64s); ++ if (!mi) ++ goto err_unlock; ++ ++ /* success: */ ++ ++ mi->members[dev_idx] = dev_mi; ++ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); ++ c->disk_sb.sb->nr_devices = nr_devices; ++ ++ ca->disk_sb.sb->dev_idx = dev_idx; ++ bch2_dev_attach(c, ca, dev_idx); ++ ++ bch2_mark_dev_superblock(c, ca, 0); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_RW) { ++ err = __bch2_dev_read_write(c, ca); ++ if (err) ++ goto err_late; ++ } ++ ++ mutex_unlock(&c->state_lock); ++ return 0; ++ ++err_unlock: ++ mutex_unlock(&c->sb_lock); ++ mutex_unlock(&c->state_lock); ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ bch2_free_super(&sb); ++ bch_err(c, "Unable to add device: %s", err); ++ return ret; ++err_late: ++ bch_err(c, "Error going rw after adding device: %s", err); ++ return -EINVAL; ++} ++ ++/* Hot add existing device to running filesystem: */ ++int bch2_dev_online(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb = { NULL }; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ const char *err; ++ int ret; ++ ++ mutex_lock(&c->state_lock); ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) { ++ mutex_unlock(&c->state_lock); ++ return ret; ++ } ++ ++ dev_idx = sb.sb->dev_idx; ++ ++ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); ++ if (err) ++ goto err; ++ ++ if (bch2_dev_attach_bdev(c, &sb)) { ++ err = "bch2_dev_attach_bdev() error"; ++ goto err; ++ } ++ ++ ca = bch_dev_locked(c, dev_idx); ++ if (ca->mi.state == BCH_MEMBER_STATE_RW) { ++ err = __bch2_dev_read_write(c, ca); ++ if (err) ++ goto err; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ mi->members[ca->dev_idx].last_mount = ++ cpu_to_le64(ktime_get_real_seconds()); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ mutex_unlock(&c->state_lock); ++ return 0; ++err: ++ mutex_unlock(&c->state_lock); ++ bch2_free_super(&sb); ++ bch_err(c, "error bringing %s online: %s", path, err); ++ return -EINVAL; ++} ++ ++int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ mutex_lock(&c->state_lock); ++ ++ if (!bch2_dev_is_online(ca)) { ++ bch_err(ca, "Already offline"); ++ mutex_unlock(&c->state_lock); ++ return 0; ++ } ++ 
++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { ++ bch_err(ca, "Cannot offline required disk"); ++ mutex_unlock(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ mutex_unlock(&c->state_lock); ++ return 0; ++} ++ ++int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bch_member *mi; ++ int ret = 0; ++ ++ mutex_lock(&c->state_lock); ++ ++ if (nbuckets < ca->mi.nbuckets) { ++ bch_err(ca, "Cannot shrink yet"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (bch2_dev_is_online(ca) && ++ get_capacity(ca->disk_sb.bdev->bd_disk) < ++ ca->mi.bucket_size * nbuckets) { ++ bch_err(ca, "New size larger than device"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = bch2_dev_buckets_resize(c, ca, nbuckets); ++ if (ret) { ++ bch_err(ca, "Resize error: %i", ret); ++ goto err; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ mi->nbuckets = cpu_to_le64(nbuckets); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch2_recalc_capacity(c); ++err: ++ mutex_unlock(&c->state_lock); ++ return ret; ++} ++ ++/* return with ref on ca->ref: */ ++struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) ++{ ++ ++ struct bch_dev *ca; ++ dev_t dev; ++ unsigned i; ++ int ret; ++ ++ ret = lookup_bdev(path, &dev); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ for_each_member_device(ca, c, i) ++ if (ca->disk_sb.bdev->bd_dev == dev) ++ goto found; ++ ++ ca = ERR_PTR(-ENOENT); ++found: ++ return ca; ++} ++ ++/* Filesystem open: */ ++ ++struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, ++ struct bch_opts opts) ++{ ++ struct bch_sb_handle *sb = NULL; ++ struct bch_fs *c = NULL; ++ unsigned i, best_sb = 0; ++ const char *err; ++ int ret = -ENOMEM; ++ ++ pr_verbose_init(opts, ""); ++ ++ if (!nr_devices) { ++ c = ERR_PTR(-EINVAL); ++ goto out2; ++ } ++ ++ if (!try_module_get(THIS_MODULE)) { ++ c = ERR_PTR(-ENODEV); ++ goto out2; ++ } ++ ++ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); ++ if (!sb) ++ goto err; ++ ++ for (i = 0; i < nr_devices; i++) { ++ ret = bch2_read_super(devices[i], &opts, &sb[i]); ++ if (ret) ++ goto err; ++ ++ err = bch2_sb_validate(&sb[i]); ++ if (err) ++ goto err_print; ++ } ++ ++ for (i = 1; i < nr_devices; i++) ++ if (le64_to_cpu(sb[i].sb->seq) > ++ le64_to_cpu(sb[best_sb].sb->seq)) ++ best_sb = i; ++ ++ for (i = 0; i < nr_devices; i++) { ++ err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); ++ if (err) ++ goto err_print; ++ } ++ ++ ret = -ENOMEM; ++ c = bch2_fs_alloc(sb[best_sb].sb, opts); ++ if (!c) ++ goto err; ++ ++ err = "bch2_dev_online() error"; ++ mutex_lock(&c->state_lock); ++ for (i = 0; i < nr_devices; i++) ++ if (bch2_dev_attach_bdev(c, &sb[i])) { ++ mutex_unlock(&c->state_lock); ++ goto err_print; ++ } ++ mutex_unlock(&c->state_lock); ++ ++ err = "insufficient devices"; ++ if (!bch2_fs_may_start(c)) ++ goto err_print; ++ ++ if (!c->opts.nostart) { ++ ret = bch2_fs_start(c); ++ if (ret) ++ goto err; ++ } ++out: ++ kfree(sb); ++ module_put(THIS_MODULE); ++out2: ++ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); ++ return c; ++err_print: ++ pr_err("bch_fs_open err opening %s: %s", ++ devices[0], err); ++ ret = -EINVAL; ++err: ++ if (c) ++ bch2_fs_stop(c); ++ for (i = 0; i < nr_devices; i++) ++ bch2_free_super(&sb[i]); ++ c = ERR_PTR(ret); ++ goto out; ++} ++ ++static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, ++ struct bch_opts opts) ++{ ++ const char *err; ++ struct bch_fs *c; ++ bool 
allocated_fs = false; ++ int ret; ++ ++ err = bch2_sb_validate(sb); ++ if (err) ++ return err; ++ ++ mutex_lock(&bch_fs_list_lock); ++ c = __bch2_uuid_to_fs(sb->sb->uuid); ++ if (c) { ++ closure_get(&c->cl); ++ ++ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); ++ if (err) ++ goto err; ++ } else { ++ c = bch2_fs_alloc(sb->sb, opts); ++ err = "cannot allocate memory"; ++ if (!c) ++ goto err; ++ ++ allocated_fs = true; ++ } ++ ++ err = "bch2_dev_online() error"; ++ ++ mutex_lock(&c->sb_lock); ++ if (bch2_dev_attach_bdev(c, sb)) { ++ mutex_unlock(&c->sb_lock); ++ goto err; ++ } ++ mutex_unlock(&c->sb_lock); ++ ++ if (!c->opts.nostart && bch2_fs_may_start(c)) { ++ err = "error starting filesystem"; ++ ret = bch2_fs_start(c); ++ if (ret) ++ goto err; ++ } ++ ++ closure_put(&c->cl); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return NULL; ++err: ++ mutex_unlock(&bch_fs_list_lock); ++ ++ if (allocated_fs) ++ bch2_fs_stop(c); ++ else if (c) ++ closure_put(&c->cl); ++ ++ return err; ++} ++ ++const char *bch2_fs_open_incremental(const char *path) ++{ ++ struct bch_sb_handle sb; ++ struct bch_opts opts = bch2_opts_empty(); ++ const char *err; ++ ++ if (bch2_read_super(path, &opts, &sb)) ++ return "error reading superblock"; ++ ++ err = __bch2_fs_open_incremental(&sb, opts); ++ bch2_free_super(&sb); ++ ++ return err; ++} ++ ++/* Global interfaces/init */ ++ ++static void bcachefs_exit(void) ++{ ++ bch2_debug_exit(); ++ bch2_vfs_exit(); ++ bch2_chardev_exit(); ++ if (bcachefs_kset) ++ kset_unregister(bcachefs_kset); ++} ++ ++static int __init bcachefs_init(void) ++{ ++ bch2_bkey_pack_test(); ++ bch2_inode_pack_test(); ++ ++ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || ++ bch2_chardev_init() || ++ bch2_vfs_init() || ++ bch2_debug_init()) ++ goto err; ++ ++ return 0; ++err: ++ bcachefs_exit(); ++ return -ENOMEM; ++} ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ bool bch2_##name; \ ++ module_param_named(name, bch2_##name, bool, 0644); \ ++ MODULE_PARM_DESC(name, description); ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++module_exit(bcachefs_exit); ++module_init(bcachefs_init); +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +new file mode 100644 +index 000000000000..9204e8fdabdd +--- /dev/null ++++ b/fs/bcachefs/super.h +@@ -0,0 +1,231 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_H ++#define _BCACHEFS_SUPER_H ++ ++#include "extents.h" ++ ++#include "bcachefs_ioctl.h" ++ ++#include ++ ++static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) ++{ ++ return div_u64(s, ca->mi.bucket_size); ++} ++ ++static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) ++{ ++ return ((sector_t) b) * ca->mi.bucket_size; ++} ++ ++static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) ++{ ++ u32 remainder; ++ ++ div_u64_rem(s, ca->mi.bucket_size, &remainder); ++ return remainder; ++} ++ ++static inline bool bch2_dev_is_online(struct bch_dev *ca) ++{ ++ return !percpu_ref_is_zero(&ca->io_ref); ++} ++ ++static inline bool bch2_dev_is_readable(struct bch_dev *ca) ++{ ++ return bch2_dev_is_online(ca) && ++ ca->mi.state != BCH_MEMBER_STATE_FAILED; ++} ++ ++static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) ++{ ++ if (!percpu_ref_tryget(&ca->io_ref)) ++ return false; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_RW || ++ (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) ++ return true; ++ ++ percpu_ref_put(&ca->io_ref); ++ return false; ++} ++ ++static inline unsigned dev_mask_nr(const 
struct bch_devs_mask *devs) ++{ ++ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); ++} ++ ++static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs.nr; i++) ++ if (devs.devs[i] == dev) ++ return true; ++ ++ return false; ++} ++ ++static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs->nr; i++) ++ if (devs->devs[i] == dev) { ++ array_remove_item(devs->devs, devs->nr, i); ++ return; ++ } ++} ++ ++static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ BUG_ON(bch2_dev_list_has_dev(*devs, dev)); ++ BUG_ON(devs->nr >= BCH_REPLICAS_MAX); ++ devs->devs[devs->nr++] = dev; ++} ++ ++static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) ++{ ++ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; ++} ++ ++static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, ++ const struct bch_devs_mask *mask) ++{ ++ struct bch_dev *ca = NULL; ++ ++ while ((*iter = mask ++ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) ++ : *iter) < c->sb.nr_devices && ++ !(ca = rcu_dereference_check(c->devs[*iter], ++ lockdep_is_held(&c->state_lock)))) ++ (*iter)++; ++ ++ return ca; ++} ++ ++#define __for_each_member_device(ca, c, iter, mask) \ ++ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) ++ ++#define for_each_member_device_rcu(ca, c, iter, mask) \ ++ __for_each_member_device(ca, c, iter, mask) ++ ++static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ if ((ca = __bch2_next_dev(c, iter, NULL))) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++/* ++ * If you break early, you must drop your ref on the current device ++ */ ++#define for_each_member_device(ca, c, iter) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_dev(c, &(iter))); \ ++ percpu_ref_put(&ca->ref), (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, ++ unsigned *iter, ++ int state_mask) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ while ((ca = __bch2_next_dev(c, iter, NULL)) && ++ (!((1 << ca->mi.state) & state_mask) || ++ !percpu_ref_tryget(&ca->io_ref))) ++ (*iter)++; ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++#define __for_each_online_member(ca, c, iter, state_mask) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ ++ percpu_ref_put(&ca->io_ref), (iter)++) ++ ++#define for_each_online_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, ~0) ++ ++#define for_each_rw_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) ++ ++#define for_each_readable_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, \ ++ (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) ++ ++/* ++ * If a key exists that references a device, the device won't be going away and ++ * we can omit rcu_read_lock(): ++ */ ++static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_check(c->devs[idx], 1); ++} ++ ++static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_protected(c->devs[idx], ++ lockdep_is_held(&c->sb_lock) || ++ lockdep_is_held(&c->state_lock)); ++} ++ 
++/* XXX kill, move to struct bch_fs */ ++static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) ++{ ++ struct bch_devs_mask devs; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ memset(&devs, 0, sizeof(devs)); ++ for_each_online_member(ca, c, i) ++ __set_bit(ca->dev_idx, devs.d); ++ return devs; ++} ++ ++struct bch_fs *bch2_dev_to_fs(dev_t); ++struct bch_fs *bch2_uuid_to_fs(uuid_le); ++ ++bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++ ++int bch2_dev_fail(struct bch_dev *, int); ++int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_add(struct bch_fs *, const char *); ++int bch2_dev_online(struct bch_fs *, const char *); ++int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); ++struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *); ++void bch2_fs_read_only(struct bch_fs *); ++ ++int __bch2_fs_read_write(struct bch_fs *, bool); ++int bch2_fs_read_write(struct bch_fs *); ++int bch2_fs_read_write_early(struct bch_fs *); ++ ++void bch2_fs_stop(struct bch_fs *); ++ ++int bch2_fs_start(struct bch_fs *); ++struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); ++const char *bch2_fs_open_incremental(const char *path); ++ ++#endif /* _BCACHEFS_SUPER_H */ +diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h +new file mode 100644 +index 000000000000..20406ebd6f5b +--- /dev/null ++++ b/fs/bcachefs/super_types.h +@@ -0,0 +1,51 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_TYPES_H ++#define _BCACHEFS_SUPER_TYPES_H ++ ++struct bch_sb_handle { ++ struct bch_sb *sb; ++ struct block_device *bdev; ++ struct bio *bio; ++ unsigned page_order; ++ fmode_t mode; ++ unsigned have_layout:1; ++ unsigned have_bio:1; ++ unsigned fs_sb:1; ++ u64 seq; ++}; ++ ++struct bch_devs_mask { ++ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; ++}; ++ ++struct bch_devs_list { ++ u8 nr; ++ u8 devs[BCH_REPLICAS_MAX + 1]; ++}; ++ ++struct bch_member_cpu { ++ u64 nbuckets; /* device size */ ++ u16 first_bucket; /* index of first bucket used */ ++ u16 bucket_size; /* sectors */ ++ u16 group; ++ u8 state; ++ u8 replacement; ++ u8 discard; ++ u8 data_allowed; ++ u8 durability; ++ u8 valid; ++}; ++ ++struct bch_disk_group_cpu { ++ bool deleted; ++ u16 parent; ++ struct bch_devs_mask devs; ++}; ++ ++struct bch_disk_groups_cpu { ++ struct rcu_head rcu; ++ unsigned nr; ++ struct bch_disk_group_cpu entries[]; ++}; ++ ++#endif /* _BCACHEFS_SUPER_TYPES_H */ +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +new file mode 100644 +index 000000000000..27646c435e30 +--- /dev/null ++++ b/fs/bcachefs/sysfs.c +@@ -0,0 +1,1068 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcache sysfs interfaces ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "sysfs.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "inode.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "opts.h" ++#include "rebalance.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "tests.h" ++ ++#include ++#include ++#include ++ ++#include "util.h" ++ ++#define SYSFS_OPS(type) \ ++struct sysfs_ops type ## _sysfs_ops = { \ ++ .show = type ## _show, \ ++ .store = type ## _store \ ++} ++ ++#define SHOW(fn) \ ++static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ ++ char *buf) \ ++ ++#define STORE(fn) \ ++static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ ++ const char *buf, size_t size) \ ++ ++#define __sysfs_attribute(_name, _mode) \ ++ static struct attribute sysfs_##_name = \ ++ { .name = #_name, .mode = _mode } ++ ++#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) ++#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) ++#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) ++ ++#define sysfs_printf(file, fmt, ...) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ ++} while (0) ++ ++#define sysfs_print(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return snprint(buf, PAGE_SIZE, var); \ ++} while (0) ++ ++#define sysfs_hprint(file, val) \ ++do { \ ++ if (attr == &sysfs_ ## file) { \ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); \ ++ bch2_hprint(&out, val); \ ++ pr_buf(&out, "\n"); \ ++ return out.pos - buf; \ ++ } \ ++} while (0) ++ ++#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) ++#define var_print(_var) sysfs_print(_var, var(_var)) ++#define var_hprint(_var) sysfs_hprint(_var, var(_var)) ++ ++#define sysfs_strtoul(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe(buf, var) ?: (ssize_t) size; \ ++} while (0) ++ ++#define sysfs_strtoul_clamp(file, var, min, max) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe_clamp(buf, var, min, max) \ ++ ?: (ssize_t) size; \ ++} while (0) ++ ++#define strtoul_or_return(cp) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define strtoul_restrict_or_return(cp, min, max) \ ++({ \ ++ unsigned long __v = 0; \ ++ int _r = strtoul_safe_restrict(cp, __v, min, max); \ ++ if (_r) \ ++ return _r; \ ++ __v; \ ++}) ++ ++#define strtoi_h_or_return(cp) \ ++({ \ ++ u64 _v; \ ++ int _r = strtoi_h(cp, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define sysfs_hatoi(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoi_h(buf, &var) ?: (ssize_t) size; \ ++} while (0) ++ ++write_attribute(trigger_journal_flush); ++write_attribute(trigger_btree_coalesce); ++write_attribute(trigger_gc); ++write_attribute(trigger_alloc_write); ++write_attribute(prune_cache); ++rw_attribute(btree_gc_periodic); ++ ++read_attribute(uuid); ++read_attribute(minor); ++read_attribute(bucket_size); ++read_attribute(block_size); ++read_attribute(btree_node_size); ++read_attribute(first_bucket); ++read_attribute(nbuckets); ++read_attribute(durability); ++read_attribute(iodone); ++ ++read_attribute(io_latency_read); ++read_attribute(io_latency_write); 
++read_attribute(io_latency_stats_read); ++read_attribute(io_latency_stats_write); ++read_attribute(congested); ++ ++read_attribute(bucket_quantiles_last_read); ++read_attribute(bucket_quantiles_last_write); ++read_attribute(bucket_quantiles_fragmentation); ++read_attribute(bucket_quantiles_oldest_gen); ++ ++read_attribute(reserve_stats); ++read_attribute(btree_cache_size); ++read_attribute(compression_stats); ++read_attribute(journal_debug); ++read_attribute(journal_pins); ++read_attribute(btree_updates); ++read_attribute(dirty_btree_nodes); ++ ++read_attribute(internal_uuid); ++ ++read_attribute(has_data); ++read_attribute(alloc_debug); ++write_attribute(wake_allocator); ++ ++read_attribute(read_realloc_races); ++read_attribute(extent_migrate_done); ++read_attribute(extent_migrate_raced); ++ ++rw_attribute(journal_write_delay_ms); ++rw_attribute(journal_reclaim_delay_ms); ++ ++rw_attribute(discard); ++rw_attribute(cache_replacement_policy); ++rw_attribute(label); ++ ++rw_attribute(copy_gc_enabled); ++sysfs_pd_controller_attribute(copy_gc); ++ ++rw_attribute(rebalance_enabled); ++sysfs_pd_controller_attribute(rebalance); ++read_attribute(rebalance_work); ++rw_attribute(promote_whole_extents); ++ ++read_attribute(new_stripes); ++ ++rw_attribute(pd_controllers_update_seconds); ++ ++read_attribute(meta_replicas_have); ++read_attribute(data_replicas_have); ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++write_attribute(perf_test); ++#endif /* CONFIG_BCACHEFS_TESTS */ ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ rw_attribute(name); ++ ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#define x(_name) \ ++ static struct attribute sysfs_time_stat_##_name = \ ++ { .name = #_name, .mode = S_IRUGO }; ++ BCH_TIME_STATS() ++#undef x ++ ++static struct attribute sysfs_state_rw = { ++ .name = "state", ++ .mode = S_IRUGO ++}; ++ ++static size_t bch2_btree_cache_size(struct bch_fs *c) ++{ ++ size_t ret = 0; ++ struct btree *b; ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_for_each_entry(b, &c->btree_cache.live, list) ++ ret += btree_bytes(c); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ return ret; ++} ++ ++static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); ++ ++ if (!fs_usage) ++ return -ENOMEM; ++ ++ bch2_fs_usage_to_text(&out, c, fs_usage); ++ ++ percpu_up_read(&c->mark_lock); ++ ++ kfree(fs_usage); ++ ++ return out.pos - buf; ++} ++ ++static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, ++ nr_compressed_extents = 0, ++ compressed_sectors_compressed = 0, ++ compressed_sectors_uncompressed = 0; ++ int ret; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) ++ if (k.k->type == KEY_TYPE_extent) { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ extent_for_each_ptr_decode(e, p, entry) { ++ if (p.crc.compression_type == BCH_COMPRESSION_NONE) { ++ nr_uncompressed_extents++; ++ uncompressed_sectors += e.k->size; ++ } else { ++ nr_compressed_extents++; ++ compressed_sectors_compressed += ++ p.crc.compressed_size; ++ compressed_sectors_uncompressed += ++ p.crc.uncompressed_size; ++ } ++ ++ /* only looking at the first ptr 
*/ ++ break; ++ } ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ return scnprintf(buf, PAGE_SIZE, ++ "uncompressed data:\n" ++ " nr extents: %llu\n" ++ " size (bytes): %llu\n" ++ "compressed data:\n" ++ " nr extents: %llu\n" ++ " compressed size (bytes): %llu\n" ++ " uncompressed size (bytes): %llu\n", ++ nr_uncompressed_extents, ++ uncompressed_sectors << 9, ++ nr_compressed_extents, ++ compressed_sectors_compressed << 9, ++ compressed_sectors_uncompressed << 9); ++} ++ ++static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) ++{ ++ char *out = buf, *end = buf + PAGE_SIZE; ++ struct ec_stripe_head *h; ++ struct ec_stripe_new *s; ++ ++ mutex_lock(&c->ec_new_stripe_lock); ++ list_for_each_entry(h, &c->ec_new_stripe_list, list) { ++ out += scnprintf(out, end - out, ++ "target %u algo %u redundancy %u:\n", ++ h->target, h->algo, h->redundancy); ++ ++ if (h->s) ++ out += scnprintf(out, end - out, ++ "\tpending: blocks %u allocated %u\n", ++ h->s->blocks.nr, ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->blocks.nr)); ++ ++ mutex_lock(&h->lock); ++ list_for_each_entry(s, &h->stripes, list) ++ out += scnprintf(out, end - out, ++ "\tin flight: blocks %u allocated %u pin %u\n", ++ s->blocks.nr, ++ bitmap_weight(s->blocks_allocated, ++ s->blocks.nr), ++ atomic_read(&s->pin)); ++ mutex_unlock(&h->lock); ++ ++ } ++ mutex_unlock(&c->ec_new_stripe_lock); ++ ++ return out - buf; ++} ++ ++SHOW(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ sysfs_print(minor, c->minor); ++ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); ++ ++ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); ++ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); ++ ++ sysfs_print(block_size, block_bytes(c)); ++ sysfs_print(btree_node_size, btree_bytes(c)); ++ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); ++ ++ sysfs_print(read_realloc_races, ++ atomic_long_read(&c->read_realloc_races)); ++ sysfs_print(extent_migrate_done, ++ atomic_long_read(&c->extent_migrate_done)); ++ sysfs_print(extent_migrate_raced, ++ atomic_long_read(&c->extent_migrate_raced)); ++ ++ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); ++ ++ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); ++ ++ sysfs_print(pd_controllers_update_seconds, ++ c->pd_controllers_update_seconds); ++ ++ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); ++ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ ++ ++ if (attr == &sysfs_rebalance_work) ++ return bch2_rebalance_work_show(c, buf); ++ ++ sysfs_print(promote_whole_extents, c->promote_whole_extents); ++ ++ sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); ++ sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); ++ ++ /* Debugging: */ ++ ++ if (attr == &sysfs_alloc_debug) ++ return show_fs_alloc_debug(c, buf); ++ ++ if (attr == &sysfs_journal_debug) ++ return bch2_journal_print_debug(&c->journal, buf); ++ ++ if (attr == &sysfs_journal_pins) ++ return bch2_journal_print_pins(&c->journal, buf); ++ ++ if (attr == &sysfs_btree_updates) ++ return bch2_btree_updates_print(c, buf); ++ ++ if (attr == &sysfs_dirty_btree_nodes) ++ return bch2_dirty_btree_nodes_print(c, buf); ++ ++ if (attr == &sysfs_compression_stats) ++ return bch2_compression_stats(c, buf); ++ ++ if (attr == &sysfs_new_stripes) ++ return bch2_new_stripes(c, buf); ++ ++#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); ++ BCH_DEBUG_PARAMS() ++#undef 
BCH_DEBUG_PARAM ++ ++ return 0; ++} ++ ++STORE(__bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); ++ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); ++ ++ if (attr == &sysfs_btree_gc_periodic) { ++ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ++ ?: (ssize_t) size; ++ ++ wake_up_process(c->gc_thread); ++ return ret; ++ } ++ ++ if (attr == &sysfs_copy_gc_enabled) { ++ struct bch_dev *ca; ++ unsigned i; ++ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ++ ?: (ssize_t) size; ++ ++ for_each_member_device(ca, c, i) ++ if (ca->copygc_thread) ++ wake_up_process(ca->copygc_thread); ++ return ret; ++ } ++ ++ if (attr == &sysfs_rebalance_enabled) { ++ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) ++ ?: (ssize_t) size; ++ ++ rebalance_wakeup(c); ++ return ret; ++ } ++ ++ sysfs_strtoul(pd_controllers_update_seconds, ++ c->pd_controllers_update_seconds); ++ sysfs_pd_controller_store(rebalance, &c->rebalance.pd); ++ ++ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); ++ ++ /* Debugging: */ ++ ++#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ /* Debugging: */ ++ ++ if (attr == &sysfs_trigger_journal_flush) ++ bch2_journal_meta_async(&c->journal, NULL); ++ ++ if (attr == &sysfs_trigger_btree_coalesce) ++ bch2_coalesce(c); ++ ++ if (attr == &sysfs_trigger_gc) ++ bch2_gc(c, NULL, false, false); ++ ++ if (attr == &sysfs_trigger_alloc_write) { ++ bool wrote; ++ ++ bch2_alloc_write(c, 0, &wrote); ++ } ++ ++ if (attr == &sysfs_prune_cache) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = strtoul_or_return(buf); ++ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); ++ } ++#ifdef CONFIG_BCACHEFS_TESTS ++ if (attr == &sysfs_perf_test) { ++ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; ++ char *test = strsep(&p, " \t\n"); ++ char *nr_str = strsep(&p, " \t\n"); ++ char *threads_str = strsep(&p, " \t\n"); ++ unsigned threads; ++ u64 nr; ++ int ret = -EINVAL; ++ ++ if (threads_str && ++ !(ret = kstrtouint(threads_str, 10, &threads)) && ++ !(ret = bch2_strtoull_h(nr_str, &nr))) ++ bch2_btree_perf_test(c, test, nr, threads); ++ else ++ size = ret; ++ kfree(tmp); ++ } ++#endif ++ return size; ++} ++ ++STORE(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ mutex_lock(&c->state_lock); ++ size = __bch2_fs_store(kobj, attr, buf, size); ++ mutex_unlock(&c->state_lock); ++ ++ return size; ++} ++SYSFS_OPS(bch2_fs); ++ ++struct attribute *bch2_fs_files[] = { ++ &sysfs_minor, ++ &sysfs_block_size, ++ &sysfs_btree_node_size, ++ &sysfs_btree_cache_size, ++ ++ &sysfs_meta_replicas_have, ++ &sysfs_data_replicas_have, ++ ++ &sysfs_journal_write_delay_ms, ++ &sysfs_journal_reclaim_delay_ms, ++ ++ &sysfs_promote_whole_extents, ++ ++ &sysfs_compression_stats, ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ &sysfs_perf_test, ++#endif ++ NULL ++}; ++ ++/* internal dir - just a wrapper */ ++ ++SHOW(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_show(&c->kobj, attr, buf); ++} ++ ++STORE(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_store(&c->kobj, attr, buf, size); ++} ++SYSFS_OPS(bch2_fs_internal); ++ ++struct attribute *bch2_fs_internal_files[] = { ++ 
&sysfs_alloc_debug, ++ &sysfs_journal_debug, ++ &sysfs_journal_pins, ++ &sysfs_btree_updates, ++ &sysfs_dirty_btree_nodes, ++ ++ &sysfs_read_realloc_races, ++ &sysfs_extent_migrate_done, ++ &sysfs_extent_migrate_raced, ++ ++ &sysfs_trigger_journal_flush, ++ &sysfs_trigger_btree_coalesce, ++ &sysfs_trigger_gc, ++ &sysfs_trigger_alloc_write, ++ &sysfs_prune_cache, ++ ++ &sysfs_copy_gc_enabled, ++ ++ &sysfs_rebalance_enabled, ++ &sysfs_rebalance_work, ++ sysfs_pd_controller_files(rebalance), ++ ++ &sysfs_new_stripes, ++ ++ &sysfs_internal_uuid, ++ ++#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ NULL ++}; ++ ++/* options */ ++ ++SHOW(bch2_fs_opts_dir) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int id = opt - bch2_opt_table; ++ u64 v = bch2_opt_get_by_id(&c->opts, id); ++ ++ bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); ++ pr_buf(&out, "\n"); ++ ++ return out.pos - buf; ++} ++ ++STORE(bch2_fs_opts_dir) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int ret, id = opt - bch2_opt_table; ++ char *tmp; ++ u64 v; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ ret = bch2_opt_parse(c, opt, strim(tmp), &v); ++ kfree(tmp); ++ ++ if (ret < 0) ++ return ret; ++ ++ ret = bch2_opt_check_may_set(c, id, v); ++ if (ret < 0) ++ return ret; ++ ++ if (opt->set_sb != SET_NO_SB_OPT) { ++ mutex_lock(&c->sb_lock); ++ opt->set_sb(c->disk_sb.sb, v); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ bch2_opt_set_by_id(&c->opts, id, v); ++ ++ if ((id == Opt_background_target || ++ id == Opt_background_compression) && v) { ++ bch2_rebalance_add_work(c, S64_MAX); ++ rebalance_wakeup(c); ++ } ++ ++ return size; ++} ++SYSFS_OPS(bch2_fs_opts_dir); ++ ++struct attribute *bch2_fs_opts_dir_files[] = { NULL }; ++ ++int bch2_opts_create_sysfs_files(struct kobject *kobj) ++{ ++ const struct bch_option *i; ++ int ret; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + bch2_opts_nr; ++ i++) { ++ if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) ++ continue; ++ ++ ret = sysfs_create_file(kobj, &i->attr); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* time stats */ ++ ++SHOW(bch2_fs_time_stats) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); ++ ++#define x(name) \ ++ if (attr == &sysfs_time_stat_##name) \ ++ return bch2_time_stats_print(&c->times[BCH_TIME_##name],\ ++ buf, PAGE_SIZE); ++ BCH_TIME_STATS() ++#undef x ++ ++ return 0; ++} ++ ++STORE(bch2_fs_time_stats) ++{ ++ return size; ++} ++SYSFS_OPS(bch2_fs_time_stats); ++ ++struct attribute *bch2_fs_time_stats_files[] = { ++#define x(name) \ ++ &sysfs_time_stat_##name, ++ BCH_TIME_STATS() ++#undef x ++ NULL ++}; ++ ++typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, ++ size_t, void *); ++ ++static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ int rw = (private ? 
1 : 0); ++ ++ return bucket_last_io(c, bucket(ca, b), rw); ++} ++ ++static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ struct bucket *g = bucket(ca, b); ++ return bucket_sectors_used(g->mark); ++} ++ ++static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ return bucket_gc_gen(ca, b); ++} ++ ++static int unsigned_cmp(const void *_l, const void *_r) ++{ ++ const unsigned *l = _l; ++ const unsigned *r = _r; ++ ++ return cmp_int(*l, *r); ++} ++ ++static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, ++ char *buf, bucket_map_fn *fn, void *private) ++{ ++ size_t i, n; ++ /* Compute 31 quantiles */ ++ unsigned q[31], *p; ++ ssize_t ret = 0; ++ ++ down_read(&ca->bucket_lock); ++ n = ca->mi.nbuckets; ++ ++ p = vzalloc(n * sizeof(unsigned)); ++ if (!p) { ++ up_read(&ca->bucket_lock); ++ return -ENOMEM; ++ } ++ ++ for (i = ca->mi.first_bucket; i < n; i++) ++ p[i] = fn(c, ca, i, private); ++ ++ sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); ++ up_read(&ca->bucket_lock); ++ ++ while (n && ++ !p[n - 1]) ++ --n; ++ ++ for (i = 0; i < ARRAY_SIZE(q); i++) ++ q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; ++ ++ vfree(p); ++ ++ for (i = 0; i < ARRAY_SIZE(q); i++) ++ ret += scnprintf(buf + ret, PAGE_SIZE - ret, ++ "%u ", q[i]); ++ buf[ret - 1] = '\n'; ++ ++ return ret; ++} ++ ++static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ enum alloc_reserve i; ++ ++ spin_lock(&ca->freelist_lock); ++ ++ pr_buf(&out, "free_inc:\t%zu\t%zu\n", ++ fifo_used(&ca->free_inc), ++ ca->free_inc.size); ++ ++ for (i = 0; i < RESERVE_NR; i++) ++ pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i, ++ fifo_used(&ca->free[i]), ++ ca->free[i].size); ++ ++ spin_unlock(&ca->freelist_lock); ++ ++ return out.pos - buf; ++} ++ ++static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); ++ unsigned i, nr[BCH_DATA_NR]; ++ ++ memset(nr, 0, sizeof(nr)); ++ ++ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) ++ nr[c->open_buckets[i].type]++; ++ ++ return scnprintf(buf, PAGE_SIZE, ++ "free_inc: %zu/%zu\n" ++ "free[RESERVE_BTREE]: %zu/%zu\n" ++ "free[RESERVE_MOVINGGC]: %zu/%zu\n" ++ "free[RESERVE_NONE]: %zu/%zu\n" ++ "buckets:\n" ++ " capacity: %llu\n" ++ " alloc: %llu\n" ++ " sb: %llu\n" ++ " journal: %llu\n" ++ " meta: %llu\n" ++ " user: %llu\n" ++ " cached: %llu\n" ++ " erasure coded: %llu\n" ++ " available: %lli\n" ++ "sectors:\n" ++ " sb: %llu\n" ++ " journal: %llu\n" ++ " meta: %llu\n" ++ " user: %llu\n" ++ " cached: %llu\n" ++ " fragmented: %llu\n" ++ " copygc threshold: %llu\n" ++ "freelist_wait: %s\n" ++ "open buckets: %u/%u (reserved %u)\n" ++ "open_buckets_wait: %s\n" ++ "open_buckets_btree: %u\n" ++ "open_buckets_user: %u\n" ++ "btree reserve cache: %u\n", ++ fifo_used(&ca->free_inc), ca->free_inc.size, ++ fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, ++ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, ++ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ++ ca->mi.nbuckets - ca->mi.first_bucket, ++ stats.buckets_alloc, ++ stats.buckets[BCH_DATA_SB], ++ stats.buckets[BCH_DATA_JOURNAL], ++ stats.buckets[BCH_DATA_BTREE], ++ stats.buckets[BCH_DATA_USER], ++ stats.buckets[BCH_DATA_CACHED], ++ stats.buckets_ec, ++ ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, ++ 
stats.sectors[BCH_DATA_SB], ++ stats.sectors[BCH_DATA_JOURNAL], ++ stats.sectors[BCH_DATA_BTREE], ++ stats.sectors[BCH_DATA_USER], ++ stats.sectors[BCH_DATA_CACHED], ++ stats.sectors_fragmented, ++ ca->copygc_threshold, ++ c->freelist_wait.list.first ? "waiting" : "empty", ++ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, ++ BTREE_NODE_OPEN_BUCKET_RESERVE, ++ c->open_buckets_wait.list.first ? "waiting" : "empty", ++ nr[BCH_DATA_BTREE], ++ nr[BCH_DATA_USER], ++ c->btree_reserve_cache_nr); ++} ++ ++static const char * const bch2_rw[] = { ++ "read", ++ "write", ++ NULL ++}; ++ ++static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ int rw, i; ++ ++ for (rw = 0; rw < 2; rw++) { ++ pr_buf(&out, "%s:\n", bch2_rw[rw]); ++ ++ for (i = 1; i < BCH_DATA_NR; i++) ++ pr_buf(&out, "%-12s:%12llu\n", ++ bch2_data_types[i], ++ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); ++ } ++ ++ return out.pos - buf; ++} ++ ++SHOW(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ sysfs_printf(uuid, "%pU\n", ca->uuid.b); ++ ++ sysfs_print(bucket_size, bucket_bytes(ca)); ++ sysfs_print(block_size, block_bytes(c)); ++ sysfs_print(first_bucket, ca->mi.first_bucket); ++ sysfs_print(nbuckets, ca->mi.nbuckets); ++ sysfs_print(durability, ca->mi.durability); ++ sysfs_print(discard, ca->mi.discard); ++ ++ if (attr == &sysfs_label) { ++ if (ca->mi.group) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(&out, &c->disk_sb, ++ ca->mi.group - 1); ++ mutex_unlock(&c->sb_lock); ++ } else { ++ pr_buf(&out, "none"); ++ } ++ ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_has_data) { ++ bch2_flags_to_text(&out, bch2_data_types, ++ bch2_dev_has_data(c, ca)); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); ++ ++ if (attr == &sysfs_cache_replacement_policy) { ++ bch2_string_opt_to_text(&out, ++ bch2_cache_replacement_policies, ++ ca->mi.replacement); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_state_rw) { ++ bch2_string_opt_to_text(&out, bch2_dev_state, ++ ca->mi.state); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_iodone) ++ return show_dev_iodone(ca, buf); ++ ++ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); ++ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); ++ ++ if (attr == &sysfs_io_latency_stats_read) ++ return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE); ++ if (attr == &sysfs_io_latency_stats_write) ++ return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE); ++ ++ sysfs_printf(congested, "%u%%", ++ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) ++ * 100 / CONGESTED_MAX); ++ ++ if (attr == &sysfs_bucket_quantiles_last_read) ++ return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); ++ if (attr == &sysfs_bucket_quantiles_last_write) ++ return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); ++ if (attr == &sysfs_bucket_quantiles_fragmentation) ++ return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); ++ if (attr == &sysfs_bucket_quantiles_oldest_gen) ++ return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); ++ ++ if (attr == &sysfs_reserve_stats) ++ return show_reserve_stats(ca, buf); ++ if (attr == &sysfs_alloc_debug) ++ return show_dev_alloc_debug(ca, buf); ++ ++ return 0; ++} 
++ ++STORE(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ struct bch_member *mi; ++ ++ sysfs_pd_controller_store(copy_gc, &ca->copygc_pd); ++ ++ if (attr == &sysfs_discard) { ++ bool v = strtoul_or_return(buf); ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if (v != BCH_MEMBER_DISCARD(mi)) { ++ SET_BCH_MEMBER_DISCARD(mi, v); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (attr == &sysfs_cache_replacement_policy) { ++ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); ++ ++ if (v < 0) ++ return v; ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { ++ SET_BCH_MEMBER_REPLACEMENT(mi, v); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (attr == &sysfs_label) { ++ char *tmp; ++ int ret; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ ret = bch2_dev_group_set(c, ca, strim(tmp)); ++ kfree(tmp); ++ if (ret) ++ return ret; ++ } ++ ++ if (attr == &sysfs_wake_allocator) ++ bch2_wake_allocator(ca); ++ ++ return size; ++} ++SYSFS_OPS(bch2_dev); ++ ++struct attribute *bch2_dev_files[] = { ++ &sysfs_uuid, ++ &sysfs_bucket_size, ++ &sysfs_block_size, ++ &sysfs_first_bucket, ++ &sysfs_nbuckets, ++ &sysfs_durability, ++ ++ /* settings: */ ++ &sysfs_discard, ++ &sysfs_cache_replacement_policy, ++ &sysfs_state_rw, ++ &sysfs_label, ++ ++ &sysfs_has_data, ++ &sysfs_iodone, ++ ++ &sysfs_io_latency_read, ++ &sysfs_io_latency_write, ++ &sysfs_io_latency_stats_read, ++ &sysfs_io_latency_stats_write, ++ &sysfs_congested, ++ ++ /* alloc info - other stats: */ ++ &sysfs_bucket_quantiles_last_read, ++ &sysfs_bucket_quantiles_last_write, ++ &sysfs_bucket_quantiles_fragmentation, ++ &sysfs_bucket_quantiles_oldest_gen, ++ ++ &sysfs_reserve_stats, ++ ++ /* debug: */ ++ &sysfs_alloc_debug, ++ &sysfs_wake_allocator, ++ ++ sysfs_pd_controller_files(copy_gc), ++ NULL ++}; ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h +new file mode 100644 +index 000000000000..525fd05d91f7 +--- /dev/null ++++ b/fs/bcachefs/sysfs.h +@@ -0,0 +1,44 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SYSFS_H_ ++#define _BCACHEFS_SYSFS_H_ ++ ++#include ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++struct attribute; ++struct sysfs_ops; ++ ++extern struct attribute *bch2_fs_files[]; ++extern struct attribute *bch2_fs_internal_files[]; ++extern struct attribute *bch2_fs_opts_dir_files[]; ++extern struct attribute *bch2_fs_time_stats_files[]; ++extern struct attribute *bch2_dev_files[]; ++ ++extern struct sysfs_ops bch2_fs_sysfs_ops; ++extern struct sysfs_ops bch2_fs_internal_sysfs_ops; ++extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++extern struct sysfs_ops bch2_dev_sysfs_ops; ++ ++int bch2_opts_create_sysfs_files(struct kobject *); ++ ++#else ++ ++static struct attribute *bch2_fs_files[] = {}; ++static struct attribute *bch2_fs_internal_files[] = {}; ++static struct attribute *bch2_fs_opts_dir_files[] = {}; ++static struct attribute *bch2_fs_time_stats_files[] = {}; ++static struct attribute *bch2_dev_files[] = {}; ++ ++static const struct sysfs_ops bch2_fs_sysfs_ops; ++static const struct sysfs_ops bch2_fs_internal_sysfs_ops; ++static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++static const struct sysfs_ops 
bch2_fs_time_stats_sysfs_ops; ++static const struct sysfs_ops bch2_dev_sysfs_ops; ++ ++static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } ++ ++#endif /* NO_BCACHEFS_SYSFS */ ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +new file mode 100644 +index 000000000000..724f41e6590c +--- /dev/null ++++ b/fs/bcachefs/tests.c +@@ -0,0 +1,678 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_TESTS ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "journal_reclaim.h" ++#include "tests.h" ++ ++#include "linux/kthread.h" ++#include "linux/random.h" ++ ++static void delete_test_keys(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++} ++ ++/* unit tests */ ++ ++static void test_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ BUG_ON(ret); ++ ++ bch2_trans_update(&trans, iter, &k.k_i); ++ ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ BUG_ON(ret); ++ ++ pr_info("deleting once"); ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ pr_info("deleting twice"); ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_delete_written(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ BUG_ON(ret); ++ ++ bch2_trans_update(&trans, iter, &k.k_i); ++ ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ BUG_ON(ret); ++ ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, ++ POS_MIN, 0, k, ret) ++ BUG_ON(k.k->p.offset != i++); ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) ++ BUG_ON(k.k->p.offset != --i); ++ ++ BUG_ON(i); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test extents"); ++ ++ for (i = 0; i < nr; i += 8) { ++ struct bkey_i_cookie k; ++ ++ 
bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 8; ++ k.k.size = 8; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS_MIN, 0, k, ret) { ++ BUG_ON(bkey_start_offset(k.k) != i); ++ i = k.k->p.offset; ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { ++ BUG_ON(k.k->p.offset != i); ++ i = bkey_start_offset(k.k); ++ } ++ ++ BUG_ON(i); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_slots(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i * 2; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, ++ 0, k, ret) { ++ BUG_ON(k.k->p.offset != i); ++ i += 2; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ ++ BUG_ON(i != nr * 2); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, ++ BTREE_ITER_SLOTS, k, ret) { ++ BUG_ON(bkey_deleted(k.k) != (i & 1)); ++ BUG_ON(k.k->p.offset != i++); ++ ++ if (i == nr * 2) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i += 16) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 16; ++ k.k.size = 8; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, ++ 0, k, ret) { ++ BUG_ON(bkey_start_offset(k.k) != i + 8); ++ BUG_ON(k.k->size != 8); ++ i += 16; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS, k, ret) { ++ BUG_ON(bkey_deleted(k.k) != !(i % 16)); ++ ++ BUG_ON(bkey_start_offset(k.k) != i); ++ BUG_ON(k.k->size != 8); ++ i = k.k->p.offset; ++ ++ if (i == nr) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++/* ++ * XXX: we really want to make sure we've got a btree with depth > 0 for these ++ * tests ++ */ ++static void test_peek_end(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_peek_end_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, 
POS_MIN, 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ bch2_trans_exit(&trans); ++} ++ ++/* extent unit tests */ ++ ++u64 test_version; ++ ++static void insert_test_extent(struct bch_fs *c, ++ u64 start, u64 end) ++{ ++ struct bkey_i_cookie k; ++ int ret; ++ ++ //pr_info("inserting %llu-%llu v %llu", start, end, test_version); ++ ++ bkey_cookie_init(&k.k_i); ++ k.k_i.k.p.offset = end; ++ k.k_i.k.size = end - start; ++ k.k_i.k.version.lo = test_version++; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++} ++ ++static void __test_extent_overwrite(struct bch_fs *c, ++ u64 e1_start, u64 e1_end, ++ u64 e2_start, u64 e2_end) ++{ ++ insert_test_extent(c, e1_start, e1_end); ++ insert_test_extent(c, e2_start, e2_end); ++ ++ delete_test_keys(c); ++} ++ ++static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 0, 32); ++ __test_extent_overwrite(c, 8, 64, 0, 32); ++} ++ ++static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 32, 64); ++ __test_extent_overwrite(c, 0, 64, 32, 72); ++} ++ ++static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 32, 40); ++} ++ ++static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 32, 64, 0, 64); ++ __test_extent_overwrite(c, 32, 64, 0, 128); ++ __test_extent_overwrite(c, 32, 64, 32, 64); ++ __test_extent_overwrite(c, 32, 64, 32, 128); ++} ++ ++/* perf tests */ ++ ++static u64 test_rand(void) ++{ ++ u64 v; ++#if 0 ++ v = prandom_u32(); ++#else ++ prandom_bytes(&v, sizeof(v)); ++#endif ++ return v; ++} ++ ++static void rand_insert(struct bch_fs *c, u64 nr) ++{ ++ struct bkey_i_cookie k; ++ int ret; ++ u64 i; ++ ++ for (i = 0; i < nr; i++) { ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = test_rand(); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++} ++ ++static void rand_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, ++ POS(0, test_rand()), 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void rand_mixed(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, ++ POS(0, test_rand()), 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ ++ if (!(i & 3) && k.k) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p = iter->pos; ++ ++ bch2_trans_update(&trans, iter, &k.k_i); ++ ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void rand_delete(struct bch_fs *c, u64 nr) ++{ ++ struct bkey_i k; ++ int ret; ++ u64 i; ++ ++ for (i = 0; i < nr; i++) { ++ bkey_init(&k.k); ++ k.k.p.offset = test_rand(); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++} ++ ++static void seq_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; 
++ struct bkey_s_c k; ++ struct bkey_i_cookie insert; ++ int ret; ++ u64 i = 0; ++ ++ bkey_cookie_init(&insert.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ insert.k.p = iter->pos; ++ ++ bch2_trans_update(&trans, iter, &insert.k_i); ++ ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ BUG_ON(ret); ++ ++ if (++i == nr) ++ break; ++ } ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) ++ ; ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_overwrite(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, ++ BTREE_ITER_INTENT, k, ret) { ++ struct bkey_i_cookie u; ++ ++ bkey_reassemble(&u.k_i, k); ++ ++ bch2_trans_update(&trans, iter, &u.k_i); ++ ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_delete(struct bch_fs *c, u64 nr) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++} ++ ++typedef void (*perf_test_fn)(struct bch_fs *, u64); ++ ++struct test_job { ++ struct bch_fs *c; ++ u64 nr; ++ unsigned nr_threads; ++ perf_test_fn fn; ++ ++ atomic_t ready; ++ wait_queue_head_t ready_wait; ++ ++ atomic_t done; ++ struct completion done_completion; ++ ++ u64 start; ++ u64 finish; ++}; ++ ++static int btree_perf_test_thread(void *data) ++{ ++ struct test_job *j = data; ++ ++ if (atomic_dec_and_test(&j->ready)) { ++ wake_up(&j->ready_wait); ++ j->start = sched_clock(); ++ } else { ++ wait_event(j->ready_wait, !atomic_read(&j->ready)); ++ } ++ ++ j->fn(j->c, j->nr / j->nr_threads); ++ ++ if (atomic_dec_and_test(&j->done)) { ++ j->finish = sched_clock(); ++ complete(&j->done_completion); ++ } ++ ++ return 0; ++} ++ ++void bch2_btree_perf_test(struct bch_fs *c, const char *testname, ++ u64 nr, unsigned nr_threads) ++{ ++ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; ++ char name_buf[20], nr_buf[20], per_sec_buf[20]; ++ unsigned i; ++ u64 time; ++ ++ atomic_set(&j.ready, nr_threads); ++ init_waitqueue_head(&j.ready_wait); ++ ++ atomic_set(&j.done, nr_threads); ++ init_completion(&j.done_completion); ++ ++#define perf_test(_test) \ ++ if (!strcmp(testname, #_test)) j.fn = _test ++ ++ perf_test(rand_insert); ++ perf_test(rand_lookup); ++ perf_test(rand_mixed); ++ perf_test(rand_delete); ++ ++ perf_test(seq_insert); ++ perf_test(seq_lookup); ++ perf_test(seq_overwrite); ++ perf_test(seq_delete); ++ ++ /* a unit test, not a perf test: */ ++ perf_test(test_delete); ++ perf_test(test_delete_written); ++ perf_test(test_iterate); ++ perf_test(test_iterate_extents); ++ perf_test(test_iterate_slots); ++ perf_test(test_iterate_slots_extents); ++ perf_test(test_peek_end); ++ perf_test(test_peek_end_extents); ++ ++ perf_test(test_extent_overwrite_front); ++ perf_test(test_extent_overwrite_back); ++ perf_test(test_extent_overwrite_middle); ++ perf_test(test_extent_overwrite_all); ++ ++ if (!j.fn) { ++ pr_err("unknown test %s", testname); ++ return; ++ } ++ ++ //pr_info("running test %s:", testname); ++ ++ if 
(nr_threads == 1) ++ btree_perf_test_thread(&j); ++ else ++ for (i = 0; i < nr_threads; i++) ++ kthread_run(btree_perf_test_thread, &j, ++ "bcachefs perf test[%u]", i); ++ ++ while (wait_for_completion_interruptible(&j.done_completion)) ++ ; ++ ++ time = j.finish - j.start; ++ ++ scnprintf(name_buf, sizeof(name_buf), "%s:", testname); ++ bch2_hprint(&PBUF(nr_buf), nr); ++ bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); ++ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", ++ name_buf, nr_buf, nr_threads, ++ time / NSEC_PER_SEC, ++ time * nr_threads / nr, ++ per_sec_buf); ++} ++ ++#endif /* CONFIG_BCACHEFS_TESTS */ +diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h +new file mode 100644 +index 000000000000..551d0764225e +--- /dev/null ++++ b/fs/bcachefs/tests.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_TEST_H ++#define _BCACHEFS_TEST_H ++ ++struct bch_fs; ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ ++void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); ++ ++#else ++ ++#endif /* CONFIG_BCACHEFS_TESTS */ ++ ++#endif /* _BCACHEFS_TEST_H */ +diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c +new file mode 100644 +index 000000000000..59e8dfa3d245 +--- /dev/null ++++ b/fs/bcachefs/trace.c +@@ -0,0 +1,12 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_types.h" ++#include "buckets.h" ++#include "btree_types.h" ++#include "keylist.h" ++ ++#include ++#include "keylist.h" ++ ++#define CREATE_TRACE_POINTS ++#include +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +new file mode 100644 +index 000000000000..2cc433ec0e3a +--- /dev/null ++++ b/fs/bcachefs/util.c +@@ -0,0 +1,910 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * random utiility code, for bcache but in theory not specific to bcache ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "eytzinger.h" ++#include "util.h" ++ ++static const char si_units[] = "?kMGTPEZY"; ++ ++static int __bch2_strtoh(const char *cp, u64 *res, ++ u64 t_max, bool t_signed) ++{ ++ bool positive = *cp != '-'; ++ unsigned u; ++ u64 v = 0; ++ ++ if (*cp == '+' || *cp == '-') ++ cp++; ++ ++ if (!isdigit(*cp)) ++ return -EINVAL; ++ ++ do { ++ if (v > U64_MAX / 10) ++ return -ERANGE; ++ v *= 10; ++ if (v > U64_MAX - (*cp - '0')) ++ return -ERANGE; ++ v += *cp - '0'; ++ cp++; ++ } while (isdigit(*cp)); ++ ++ for (u = 1; u < strlen(si_units); u++) ++ if (*cp == si_units[u]) { ++ cp++; ++ goto got_unit; ++ } ++ u = 0; ++got_unit: ++ if (*cp == '\n') ++ cp++; ++ if (*cp) ++ return -EINVAL; ++ ++ if (fls64(v) + u * 10 > 64) ++ return -ERANGE; ++ ++ v <<= u * 10; ++ ++ if (positive) { ++ if (v > t_max) ++ return -ERANGE; ++ } else { ++ if (v && !t_signed) ++ return -ERANGE; ++ ++ if (v > t_max + 1) ++ return -ERANGE; ++ v = -v; ++ } ++ ++ *res = v; ++ return 0; ++} ++ ++#define STRTO_H(name, type) \ ++int bch2_ ## name ## _h(const char *cp, type *res) \ ++{ \ ++ u64 v; \ ++ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ ++ ANYSINT_MAX(type) != ((type) ~0ULL)); \ ++ *res = v; \ ++ return ret; \ ++} ++ ++STRTO_H(strtoint, int) ++STRTO_H(strtouint, unsigned int) ++STRTO_H(strtoll, long long) ++STRTO_H(strtoull, unsigned long long) ++STRTO_H(strtou64, u64) ++ ++void bch2_hprint(struct printbuf *buf, s64 v) ++{ ++ int u, t = 0; ++ ++ for (u = 0; v >= 1024 || v <= -1024; u++) { ++ t = v & ~(~0U << 10); ++ v >>= 10; ++ } ++ ++ pr_buf(buf, "%lli", v); ++ ++ /* ++ * 103 is magic: t is in the range [-1023, 1023] and we want ++ * to turn it into [-9, 9] ++ */ ++ if (u && v < 100 && v > -100) ++ pr_buf(buf, ".%i", t / 103); ++ if (u) ++ pr_buf(buf, "%c", si_units[u]); ++} ++ ++void bch2_string_opt_to_text(struct printbuf *out, ++ const char * const list[], ++ size_t selected) ++{ ++ size_t i; ++ ++ for (i = 0; list[i]; i++) ++ pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); ++} ++ ++void bch2_flags_to_text(struct printbuf *out, ++ const char * const list[], u64 flags) ++{ ++ unsigned bit, nr = 0; ++ bool first = true; ++ ++ if (out->pos != out->end) ++ *out->pos = '\0'; ++ ++ while (list[nr]) ++ nr++; ++ ++ while (flags && (bit = __ffs(flags)) < nr) { ++ if (!first) ++ pr_buf(out, ","); ++ first = false; ++ pr_buf(out, "%s", list[bit]); ++ flags ^= 1 << bit; ++ } ++} ++ ++u64 bch2_read_flag_list(char *opt, const char * const list[]) ++{ ++ u64 ret = 0; ++ char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); ++ ++ if (!d) ++ return -ENOMEM; ++ ++ s = strim(d); ++ ++ while ((p = strsep(&s, ","))) { ++ int flag = match_string(list, -1, p); ++ if (flag < 0) { ++ ret = -1; ++ break; ++ } ++ ++ ret |= 1 << flag; ++ } ++ ++ kfree(d); ++ ++ return ret; ++} ++ ++bool bch2_is_zero(const void *_p, size_t n) ++{ ++ const char *p = _p; ++ size_t i; ++ ++ for (i = 0; i < n; i++) ++ if (p[i]) ++ return false; ++ return true; ++} ++ ++static void bch2_quantiles_update(struct quantiles *q, u64 v) ++{ ++ unsigned i = 0; ++ ++ while (i < ARRAY_SIZE(q->entries)) { ++ struct quantile_entry *e = q->entries + i; ++ ++ if (unlikely(!e->step)) { ++ e->m = v; ++ e->step = max_t(unsigned, v / 2, 1024); ++ } else if (e->m > v) { ++ e->m = e->m >= e->step ++ ? 
e->m - e->step ++ : 0; ++ } else if (e->m < v) { ++ e->m = e->m + e->step > e->m ++ ? e->m + e->step ++ : U32_MAX; ++ } ++ ++ if ((e->m > v ? e->m - v : v - e->m) < e->step) ++ e->step = max_t(unsigned, e->step / 2, 1); ++ ++ if (v >= e->m) ++ break; ++ ++ i = eytzinger0_child(i, v > e->m); ++ } ++} ++ ++/* time stats: */ ++ ++static void bch2_time_stats_update_one(struct time_stats *stats, ++ u64 start, u64 end) ++{ ++ u64 duration, freq; ++ ++ duration = time_after64(end, start) ++ ? end - start : 0; ++ freq = time_after64(end, stats->last_event) ++ ? end - stats->last_event : 0; ++ ++ stats->count++; ++ ++ stats->average_duration = stats->average_duration ++ ? ewma_add(stats->average_duration, duration, 6) ++ : duration; ++ ++ stats->average_frequency = stats->average_frequency ++ ? ewma_add(stats->average_frequency, freq, 6) ++ : freq; ++ ++ stats->max_duration = max(stats->max_duration, duration); ++ ++ stats->last_event = end; ++ ++ bch2_quantiles_update(&stats->quantiles, duration); ++} ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) ++{ ++ unsigned long flags; ++ ++ if (!stats->buffer) { ++ spin_lock_irqsave(&stats->lock, flags); ++ bch2_time_stats_update_one(stats, start, end); ++ ++ if (stats->average_frequency < 32 && ++ stats->count > 1024) ++ stats->buffer = ++ alloc_percpu_gfp(struct time_stat_buffer, ++ GFP_ATOMIC); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ } else { ++ struct time_stat_buffer_entry *i; ++ struct time_stat_buffer *b; ++ ++ preempt_disable(); ++ b = this_cpu_ptr(stats->buffer); ++ ++ BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); ++ b->entries[b->nr++] = (struct time_stat_buffer_entry) { ++ .start = start, ++ .end = end ++ }; ++ ++ if (b->nr == ARRAY_SIZE(b->entries)) { ++ spin_lock_irqsave(&stats->lock, flags); ++ for (i = b->entries; ++ i < b->entries + ARRAY_SIZE(b->entries); ++ i++) ++ bch2_time_stats_update_one(stats, i->start, i->end); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ ++ b->nr = 0; ++ } ++ ++ preempt_enable(); ++ } ++} ++ ++static const struct time_unit { ++ const char *name; ++ u32 nsecs; ++} time_units[] = { ++ { "ns", 1 }, ++ { "us", NSEC_PER_USEC }, ++ { "ms", NSEC_PER_MSEC }, ++ { "sec", NSEC_PER_SEC }, ++}; ++ ++static const struct time_unit *pick_time_units(u64 ns) ++{ ++ const struct time_unit *u; ++ ++ for (u = time_units; ++ u + 1 < time_units + ARRAY_SIZE(time_units) && ++ ns >= u[1].nsecs << 1; ++ u++) ++ ; ++ ++ return u; ++} ++ ++static void pr_time_units(struct printbuf *out, u64 ns) ++{ ++ const struct time_unit *u = pick_time_units(ns); ++ ++ pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); ++} ++ ++size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len) ++{ ++ struct printbuf out = _PBUF(buf, len); ++ const struct time_unit *u; ++ u64 freq = READ_ONCE(stats->average_frequency); ++ u64 q, last_q = 0; ++ int i; ++ ++ pr_buf(&out, "count:\t\t%llu\n", ++ stats->count); ++ pr_buf(&out, "rate:\t\t%llu/sec\n", ++ freq ? 
div64_u64(NSEC_PER_SEC, freq) : 0); ++ ++ pr_buf(&out, "frequency:\t"); ++ pr_time_units(&out, freq); ++ ++ pr_buf(&out, "\navg duration:\t"); ++ pr_time_units(&out, stats->average_duration); ++ ++ pr_buf(&out, "\nmax duration:\t"); ++ pr_time_units(&out, stats->max_duration); ++ ++ i = eytzinger0_first(NR_QUANTILES); ++ u = pick_time_units(stats->quantiles.entries[i].m); ++ ++ pr_buf(&out, "\nquantiles (%s):\t", u->name); ++ eytzinger0_for_each(i, NR_QUANTILES) { ++ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; ++ ++ q = max(stats->quantiles.entries[i].m, last_q); ++ pr_buf(&out, "%llu%s", ++ div_u64(q, u->nsecs), ++ is_last ? "\n" : " "); ++ last_q = q; ++ } ++ ++ return out.pos - buf; ++} ++ ++void bch2_time_stats_exit(struct time_stats *stats) ++{ ++ free_percpu(stats->buffer); ++} ++ ++void bch2_time_stats_init(struct time_stats *stats) ++{ ++ memset(stats, 0, sizeof(*stats)); ++ spin_lock_init(&stats->lock); ++} ++ ++/* ratelimit: */ ++ ++/** ++ * bch2_ratelimit_delay() - return how long to delay until the next time to do ++ * some work ++ * ++ * @d - the struct bch_ratelimit to update ++ * ++ * Returns the amount of time to delay by, in jiffies ++ */ ++u64 bch2_ratelimit_delay(struct bch_ratelimit *d) ++{ ++ u64 now = local_clock(); ++ ++ return time_after64(d->next, now) ++ ? nsecs_to_jiffies(d->next - now) ++ : 0; ++} ++ ++/** ++ * bch2_ratelimit_increment() - increment @d by the amount of work done ++ * ++ * @d - the struct bch_ratelimit to update ++ * @done - the amount of work done, in arbitrary units ++ */ ++void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) ++{ ++ u64 now = local_clock(); ++ ++ d->next += div_u64(done * NSEC_PER_SEC, d->rate); ++ ++ if (time_before64(now + NSEC_PER_SEC, d->next)) ++ d->next = now + NSEC_PER_SEC; ++ ++ if (time_after64(now - NSEC_PER_SEC * 2, d->next)) ++ d->next = now - NSEC_PER_SEC * 2; ++} ++ ++/* pd controller: */ ++ ++/* ++ * Updates pd_controller. Attempts to scale inputed values to units per second. ++ * @target: desired value ++ * @actual: current value ++ * ++ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing ++ * it makes actual go down. 
++ */ ++void bch2_pd_controller_update(struct bch_pd_controller *pd, ++ s64 target, s64 actual, int sign) ++{ ++ s64 proportional, derivative, change; ++ ++ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; ++ ++ if (seconds_since_update == 0) ++ return; ++ ++ pd->last_update = jiffies; ++ ++ proportional = actual - target; ++ proportional *= seconds_since_update; ++ proportional = div_s64(proportional, pd->p_term_inverse); ++ ++ derivative = actual - pd->last_actual; ++ derivative = div_s64(derivative, seconds_since_update); ++ derivative = ewma_add(pd->smoothed_derivative, derivative, ++ (pd->d_term / seconds_since_update) ?: 1); ++ derivative = derivative * pd->d_term; ++ derivative = div_s64(derivative, pd->p_term_inverse); ++ ++ change = proportional + derivative; ++ ++ /* Don't increase rate if not keeping up */ ++ if (change > 0 && ++ pd->backpressure && ++ time_after64(local_clock(), ++ pd->rate.next + NSEC_PER_MSEC)) ++ change = 0; ++ ++ change *= (sign * -1); ++ ++ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, ++ 1, UINT_MAX); ++ ++ pd->last_actual = actual; ++ pd->last_derivative = derivative; ++ pd->last_proportional = proportional; ++ pd->last_change = change; ++ pd->last_target = target; ++} ++ ++void bch2_pd_controller_init(struct bch_pd_controller *pd) ++{ ++ pd->rate.rate = 1024; ++ pd->last_update = jiffies; ++ pd->p_term_inverse = 6000; ++ pd->d_term = 30; ++ pd->d_smooth = pd->d_term; ++ pd->backpressure = 1; ++} ++ ++size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) ++{ ++ /* 2^64 - 1 is 20 digits, plus null byte */ ++ char rate[21]; ++ char actual[21]; ++ char target[21]; ++ char proportional[21]; ++ char derivative[21]; ++ char change[21]; ++ s64 next_io; ++ ++ bch2_hprint(&PBUF(rate), pd->rate.rate); ++ bch2_hprint(&PBUF(actual), pd->last_actual); ++ bch2_hprint(&PBUF(target), pd->last_target); ++ bch2_hprint(&PBUF(proportional), pd->last_proportional); ++ bch2_hprint(&PBUF(derivative), pd->last_derivative); ++ bch2_hprint(&PBUF(change), pd->last_change); ++ ++ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); ++ ++ return sprintf(buf, ++ "rate:\t\t%s/sec\n" ++ "target:\t\t%s\n" ++ "actual:\t\t%s\n" ++ "proportional:\t%s\n" ++ "derivative:\t%s\n" ++ "change:\t\t%s/sec\n" ++ "next io:\t%llims\n", ++ rate, target, actual, proportional, ++ derivative, change, next_io); ++} ++ ++/* misc: */ ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t size) ++{ ++ while (size) { ++ struct page *page = is_vmalloc_addr(base) ++ ? 
vmalloc_to_page(base) ++ : virt_to_page(base); ++ unsigned offset = offset_in_page(base); ++ unsigned len = min_t(size_t, PAGE_SIZE - offset, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, offset)); ++ size -= len; ++ base += len; ++ } ++} ++ ++int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) ++{ ++ while (size) { ++ struct page *page = alloc_page(gfp_mask); ++ unsigned len = min(PAGE_SIZE, size); ++ ++ if (!page) ++ return -ENOMEM; ++ ++ BUG_ON(!bio_add_page(bio, page, len, 0)); ++ size -= len; ++ } ++ ++ return 0; ++} ++ ++size_t bch2_rand_range(size_t max) ++{ ++ size_t rand; ++ ++ if (!max) ++ return 0; ++ ++ do { ++ rand = get_random_long(); ++ rand &= roundup_pow_of_two(max) - 1; ++ } while (rand >= max); ++ ++ return rand; ++} ++ ++void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, dst, iter, dst_iter) { ++ void *dstp = kmap_atomic(bv.bv_page); ++ memcpy(dstp + bv.bv_offset, src, bv.bv_len); ++ kunmap_atomic(dstp); ++ ++ src += bv.bv_len; ++ } ++} ++ ++void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, src, iter, src_iter) { ++ void *srcp = kmap_atomic(bv.bv_page); ++ memcpy(dst, srcp + bv.bv_offset, bv.bv_len); ++ kunmap_atomic(srcp); ++ ++ dst += bv.bv_len; ++ } ++} ++ ++void bch_scnmemcpy(struct printbuf *out, ++ const char *src, size_t len) ++{ ++ size_t n = printbuf_remaining(out); ++ ++ if (n) { ++ n = min(n - 1, len); ++ memcpy(out->pos, src, n); ++ out->pos += n; ++ *out->pos = '\0'; ++ } ++} ++ ++#include "eytzinger.h" ++ ++static int alignment_ok(const void *base, size_t align) ++{ ++ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || ++ ((unsigned long)base & (align - 1)) == 0; ++} ++ ++static void u32_swap(void *a, void *b, size_t size) ++{ ++ u32 t = *(u32 *)a; ++ *(u32 *)a = *(u32 *)b; ++ *(u32 *)b = t; ++} ++ ++static void u64_swap(void *a, void *b, size_t size) ++{ ++ u64 t = *(u64 *)a; ++ *(u64 *)a = *(u64 *)b; ++ *(u64 *)b = t; ++} ++ ++static void generic_swap(void *a, void *b, size_t size) ++{ ++ char t; ++ ++ do { ++ t = *(char *)a; ++ *(char *)a++ = *(char *)b; ++ *(char *)b++ = t; ++ } while (--size > 0); ++} ++ ++static inline int do_cmp(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ size_t l, size_t r) ++{ ++ return cmp_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++static inline void do_swap(void *base, size_t n, size_t size, ++ void (*swap_func)(void *, void *, size_t), ++ size_t l, size_t r) ++{ ++ swap_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++void eytzinger0_sort(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)) ++{ ++ int i, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for (i = n / 2 - 1; i >= 0; --i) { ++ for (r = i; r * 2 + 1 < n; r = c) { ++ c = r * 2 + 1; ++ ++ if (c + 1 < n && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, c); ++ 
} ++ } ++ ++ /* sort */ ++ for (i = n - 1; i > 0; --i) { ++ do_swap(base, n, size, swap_func, 0, i); ++ ++ for (r = 0; r * 2 + 1 < i; r = c) { ++ c = r * 2 + 1; ++ ++ if (c + 1 < i && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, c); ++ } ++ } ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t size)) ++{ ++ /* pre-scale counters for performance */ ++ int i = (num/2 - 1) * size, n = num * size, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for ( ; i >= 0; i -= size) { ++ for (r = i; r * 2 + size < n; r = c) { ++ c = r * 2 + size; ++ if (c < n - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - size; i > 0; i -= size) { ++ swap_func(base, base + i, size); ++ for (r = 0; r * 2 + size < i; r = c) { ++ c = r * 2 + size; ++ if (c < i - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++} ++ ++static void mempool_free_vp(void *element, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ vpfree(element, size); ++} ++ ++static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ return vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) ++{ ++ return size < PAGE_SIZE ++ ? 
mempool_init_kmalloc_pool(pool, min_nr, size) ++ : mempool_init(pool, min_nr, mempool_alloc_vp, ++ mempool_free_vp, (void *) size); ++} ++ ++#if 0 ++void eytzinger1_test(void) ++{ ++ unsigned inorder, eytz, size; ++ ++ pr_info("1 based eytzinger test:"); ++ ++ for (size = 2; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger1_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); ++ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); ++ ++ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); ++ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); ++ ++ inorder = 1; ++ eytzinger1_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger1_last(size) && ++ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++void eytzinger0_test(void) ++{ ++ ++ unsigned inorder, eytz, size; ++ ++ pr_info("0 based eytzinger test:"); ++ ++ for (size = 1; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger0_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); ++ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); ++ ++ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); ++ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); ++ ++ inorder = 0; ++ eytzinger0_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger0_last(size) && ++ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++static inline int cmp_u16(const void *_l, const void *_r, size_t size) ++{ ++ const u16 *l = _l, *r = _r; ++ ++ return (*l > *r) - (*r - *l); ++} ++ ++static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) ++{ ++ int i, c1 = -1, c2 = -1; ++ ssize_t r; ++ ++ r = eytzinger0_find_le(test_array, nr, ++ sizeof(test_array[0]), ++ cmp_u16, &search); ++ if (r >= 0) ++ c1 = test_array[r]; ++ ++ for (i = 0; i < nr; i++) ++ if (test_array[i] <= search && test_array[i] > c2) ++ c2 = test_array[i]; ++ ++ if (c1 != c2) { ++ eytzinger0_for_each(i, nr) ++ pr_info("[%3u] = %12u", i, test_array[i]); ++ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", ++ i, r, c1, c2); ++ } ++} ++ ++void eytzinger0_find_test(void) ++{ ++ unsigned i, nr, allocated = 1 << 12; ++ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); ++ ++ for (nr = 1; nr < allocated; nr++) { ++ pr_info("testing %u elems", nr); ++ ++ get_random_bytes(test_array, nr * sizeof(test_array[0])); ++ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); ++ ++ /* verify array is sorted correctly: */ ++ eytzinger0_for_each(i, nr) ++ BUG_ON(i != eytzinger0_last(nr) && ++ test_array[i] > test_array[eytzinger0_next(i, nr)]); ++ ++ for (i = 0; i < U16_MAX; i += 1 << 12) ++ eytzinger0_find_test_val(test_array, nr, i); ++ ++ for (i = 0; i < nr; i++) { ++ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); ++ eytzinger0_find_test_val(test_array, nr, test_array[i]); ++ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); ++ } ++ } ++ ++ kfree(test_array); ++} ++#endif ++ ++/* ++ * Accumulate percpu counters onto one cpu's copy - only valid when access ++ * 
against any percpu counter is guarded against ++ */ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) ++{ ++ u64 *ret; ++ int cpu; ++ ++ preempt_disable(); ++ ret = this_cpu_ptr(p); ++ preempt_enable(); ++ ++ for_each_possible_cpu(cpu) { ++ u64 *i = per_cpu_ptr(p, cpu); ++ ++ if (i != ret) { ++ acc_u64s(ret, i, nr); ++ memset(i, 0, nr * sizeof(u64)); ++ } ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +new file mode 100644 +index 000000000000..7e96ff7fda5c +--- /dev/null ++++ b/fs/bcachefs/util.h +@@ -0,0 +1,760 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_UTIL_H ++#define _BCACHEFS_UTIL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) ++ ++struct closure; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++#define EBUG_ON(cond) BUG_ON(cond) ++#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) ++#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) ++#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) ++#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) ++#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) ++#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) ++#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) ++#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) ++#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) ++#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) ++ ++#define memcpy(dst, src, len) \ ++({ \ ++ void *_dst = (dst); \ ++ const void *_src = (src); \ ++ size_t _len = (len); \ ++ \ ++ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ ++ (void *) (_dst) + (_len) <= (void *) (_src))); \ ++ memcpy(_dst, _src, _len); \ ++}) ++ ++#else /* DEBUG */ ++ ++#define EBUG_ON(cond) ++#define atomic_dec_bug(v) atomic_dec(v) ++#define atomic_inc_bug(v, i) atomic_inc(v) ++#define atomic_sub_bug(i, v) atomic_sub(i, v) ++#define atomic_add_bug(i, v) atomic_add(i, v) ++#define atomic_long_dec_bug(v) atomic_long_dec(v) ++#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) ++#define atomic64_dec_bug(v) atomic64_dec(v) ++#define atomic64_inc_bug(v, i) atomic64_inc(v) ++#define atomic64_sub_bug(i, v) atomic64_sub(i, v) ++#define atomic64_add_bug(i, v) atomic64_add(i, v) ++ ++#endif ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++#define CPU_BIG_ENDIAN 0 ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++#define CPU_BIG_ENDIAN 1 ++#endif ++ ++/* type hackery */ ++ ++#define type_is_exact(_val, _type) \ ++ __builtin_types_compatible_p(typeof(_val), _type) ++ ++#define type_is(_val, _type) \ ++ (__builtin_types_compatible_p(typeof(_val), _type) || \ ++ __builtin_types_compatible_p(typeof(_val), const _type)) ++ ++/* Userspace doesn't align allocations as nicely as the kernel allocators: */ ++static inline size_t buf_pages(void *p, size_t len) ++{ ++ return DIV_ROUND_UP(len + ++ ((unsigned long) p & (PAGE_SIZE - 1)), ++ PAGE_SIZE); ++} ++ ++static inline void vpfree(void *p, size_t size) ++{ ++ if (is_vmalloc_addr(p)) ++ vfree(p); ++ else ++ free_pages((unsigned long) p, get_order(size)); ++} ++ ++static inline void *vpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, ++ get_order(size)) ?: ++ __vmalloc(size, gfp_mask); ++} ++ ++static inline void kvpfree(void *p, size_t size) ++{ 
++ if (size < PAGE_SIZE) ++ kfree(p); ++ else ++ vpfree(p, size); ++} ++ ++static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return size < PAGE_SIZE ++ ? kmalloc(size, gfp_mask) ++ : vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); ++ ++#define HEAP(type) \ ++struct { \ ++ size_t size, used; \ ++ type *data; \ ++} ++ ++#define DECLARE_HEAP(type, name) HEAP(type) name ++ ++#define init_heap(heap, _size, gfp) \ ++({ \ ++ (heap)->used = 0; \ ++ (heap)->size = (_size); \ ++ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ ++ (gfp)); \ ++}) ++ ++#define free_heap(heap) \ ++do { \ ++ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ ++ (heap)->data = NULL; \ ++} while (0) ++ ++#define heap_set_backpointer(h, i, _fn) \ ++do { \ ++ void (*fn)(typeof(h), size_t) = _fn; \ ++ if (fn) \ ++ fn(h, i); \ ++} while (0) ++ ++#define heap_swap(h, i, j, set_backpointer) \ ++do { \ ++ swap((h)->data[i], (h)->data[j]); \ ++ heap_set_backpointer(h, i, set_backpointer); \ ++ heap_set_backpointer(h, j, set_backpointer); \ ++} while (0) ++ ++#define heap_peek(h) \ ++({ \ ++ EBUG_ON(!(h)->used); \ ++ (h)->data[0]; \ ++}) ++ ++#define heap_full(h) ((h)->used == (h)->size) ++ ++#define heap_sift_down(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _c, _j = i; \ ++ \ ++ for (; _j * 2 + 1 < (h)->used; _j = _c) { \ ++ _c = _j * 2 + 1; \ ++ if (_c + 1 < (h)->used && \ ++ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ ++ _c++; \ ++ \ ++ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ ++ break; \ ++ heap_swap(h, _c, _j, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_sift_up(h, i, cmp, set_backpointer) \ ++do { \ ++ while (i) { \ ++ size_t p = (i - 1) / 2; \ ++ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ ++ break; \ ++ heap_swap(h, i, p, set_backpointer); \ ++ i = p; \ ++ } \ ++} while (0) ++ ++#define __heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ size_t _i = (h)->used++; \ ++ (h)->data[_i] = d; \ ++ heap_set_backpointer(h, _i, set_backpointer); \ ++ \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ _i; \ ++}) ++ ++#define heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = !heap_full(h); \ ++ if (_r) \ ++ __heap_add(h, d, cmp, set_backpointer); \ ++ _r; \ ++}) ++ ++#define heap_add_or_replace(h, new, cmp, set_backpointer) \ ++do { \ ++ if (!heap_add(h, new, cmp, set_backpointer) && \ ++ cmp(h, new, heap_peek(h)) >= 0) { \ ++ (h)->data[0] = new; \ ++ heap_set_backpointer(h, 0, set_backpointer); \ ++ heap_sift_down(h, 0, cmp, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_del(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _i = (i); \ ++ \ ++ BUG_ON(_i >= (h)->used); \ ++ (h)->used--; \ ++ heap_swap(h, _i, (h)->used, set_backpointer); \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ heap_sift_down(h, _i, cmp, set_backpointer); \ ++} while (0) ++ ++#define heap_pop(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = (h)->used; \ ++ if (_r) { \ ++ (d) = (h)->data[0]; \ ++ heap_del(h, 0, cmp, set_backpointer); \ ++ } \ ++ _r; \ ++}) ++ ++#define heap_resort(heap, cmp, set_backpointer) \ ++do { \ ++ ssize_t _i; \ ++ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ ++ heap_sift_down(heap, _i, cmp, set_backpointer); \ ++} while (0) ++ ++#define ANYSINT_MAX(t) \ ++ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) ++ ++struct printbuf { ++ char *pos; ++ char *end; ++}; ++ ++static inline size_t printbuf_remaining(struct printbuf *buf) ++{ ++ return buf->end - buf->pos; ++} 
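#if 0
/*
 * Usage sketch (illustrative only): formatting into a caller-supplied
 * buffer with struct printbuf.  This open-codes what the _PBUF() and
 * pr_buf() helpers defined just below expand to; format_example() is a
 * hypothetical function, not part of bcachefs.
 */
static size_t format_example(char *buf, size_t len, u64 nr)
{
	struct printbuf out = { .pos = buf, .end = buf + len };

	/* scnprintf() never writes past the buffer, so this stays bounded: */
	out.pos += scnprintf(out.pos, printbuf_remaining(&out),
			     "nr:\t%llu\n", nr);

	/* bytes produced -- the same convention the sysfs show functions use */
	return out.pos - buf;
}
#endif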
++ ++#define _PBUF(_buf, _len) \ ++ ((struct printbuf) { \ ++ .pos = _buf, \ ++ .end = _buf + _len, \ ++ }) ++ ++#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) ++ ++#define pr_buf(_out, ...) \ ++do { \ ++ (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ ++ __VA_ARGS__); \ ++} while (0) ++ ++void bch_scnmemcpy(struct printbuf *, const char *, size_t); ++ ++int bch2_strtoint_h(const char *, int *); ++int bch2_strtouint_h(const char *, unsigned int *); ++int bch2_strtoll_h(const char *, long long *); ++int bch2_strtoull_h(const char *, unsigned long long *); ++int bch2_strtou64_h(const char *, u64 *); ++ ++static inline int bch2_strtol_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtoint_h(cp, (int *) res); ++#else ++ return bch2_strtoll_h(cp, (long long *) res); ++#endif ++} ++ ++static inline int bch2_strtoul_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtouint_h(cp, (unsigned int *) res); ++#else ++ return bch2_strtoull_h(cp, (unsigned long long *) res); ++#endif ++} ++ ++#define strtoi_h(cp, res) \ ++ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ ++ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ ++ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ ++ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\ ++ : -EINVAL) ++ ++#define strtoul_safe(cp, var) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = _v; \ ++ _r; \ ++}) ++ ++#define strtoul_safe_clamp(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = clamp_t(typeof(var), _v, min, max); \ ++ _r; \ ++}) ++ ++#define strtoul_safe_restrict(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r && _v >= min && _v <= max) \ ++ var = _v; \ ++ else \ ++ _r = -EINVAL; \ ++ _r; \ ++}) ++ ++#define snprint(buf, size, var) \ ++ snprintf(buf, size, \ ++ type_is(var, int) ? "%i\n" \ ++ : type_is(var, unsigned) ? "%u\n" \ ++ : type_is(var, long) ? "%li\n" \ ++ : type_is(var, unsigned long) ? "%lu\n" \ ++ : type_is(var, s64) ? "%lli\n" \ ++ : type_is(var, u64) ? "%llu\n" \ ++ : type_is(var, char *) ? 
"%s\n" \ ++ : "%i\n", var) ++ ++void bch2_hprint(struct printbuf *, s64); ++ ++bool bch2_is_zero(const void *, size_t); ++ ++void bch2_string_opt_to_text(struct printbuf *, ++ const char * const [], size_t); ++ ++void bch2_flags_to_text(struct printbuf *, const char * const[], u64); ++u64 bch2_read_flag_list(char *, const char * const[]); ++ ++#define NR_QUANTILES 15 ++#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) ++#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) ++#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) ++ ++struct quantiles { ++ struct quantile_entry { ++ u64 m; ++ u64 step; ++ } entries[NR_QUANTILES]; ++}; ++ ++struct time_stat_buffer { ++ unsigned nr; ++ struct time_stat_buffer_entry { ++ u64 start; ++ u64 end; ++ } entries[32]; ++}; ++ ++struct time_stats { ++ spinlock_t lock; ++ u64 count; ++ /* all fields are in nanoseconds */ ++ u64 average_duration; ++ u64 average_frequency; ++ u64 max_duration; ++ u64 last_event; ++ struct quantiles quantiles; ++ ++ struct time_stat_buffer __percpu *buffer; ++}; ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64, u64); ++ ++static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) ++{ ++ __bch2_time_stats_update(stats, start, local_clock()); ++} ++ ++size_t bch2_time_stats_print(struct time_stats *, char *, size_t); ++ ++void bch2_time_stats_exit(struct time_stats *); ++void bch2_time_stats_init(struct time_stats *); ++ ++#define ewma_add(ewma, val, weight) \ ++({ \ ++ typeof(ewma) _ewma = (ewma); \ ++ typeof(weight) _weight = (weight); \ ++ \ ++ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ ++}) ++ ++struct bch_ratelimit { ++ /* Next time we want to do some work, in nanoseconds */ ++ u64 next; ++ ++ /* ++ * Rate at which we want to do work, in units per nanosecond ++ * The units here correspond to the units passed to ++ * bch2_ratelimit_increment() ++ */ ++ unsigned rate; ++}; ++ ++static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) ++{ ++ d->next = local_clock(); ++} ++ ++u64 bch2_ratelimit_delay(struct bch_ratelimit *); ++void bch2_ratelimit_increment(struct bch_ratelimit *, u64); ++ ++struct bch_pd_controller { ++ struct bch_ratelimit rate; ++ unsigned long last_update; ++ ++ s64 last_actual; ++ s64 smoothed_derivative; ++ ++ unsigned p_term_inverse; ++ unsigned d_smooth; ++ unsigned d_term; ++ ++ /* for exporting to sysfs (no effect on behavior) */ ++ s64 last_derivative; ++ s64 last_proportional; ++ s64 last_change; ++ s64 last_target; ++ ++ /* If true, the rate will not increase if bch2_ratelimit_delay() ++ * is not being called often enough. 
*/ ++ bool backpressure; ++}; ++ ++void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); ++void bch2_pd_controller_init(struct bch_pd_controller *); ++size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); ++ ++#define sysfs_pd_controller_attribute(name) \ ++ rw_attribute(name##_rate); \ ++ rw_attribute(name##_rate_bytes); \ ++ rw_attribute(name##_rate_d_term); \ ++ rw_attribute(name##_rate_p_term_inverse); \ ++ read_attribute(name##_rate_debug) ++ ++#define sysfs_pd_controller_files(name) \ ++ &sysfs_##name##_rate, \ ++ &sysfs_##name##_rate_bytes, \ ++ &sysfs_##name##_rate_d_term, \ ++ &sysfs_##name##_rate_p_term_inverse, \ ++ &sysfs_##name##_rate_debug ++ ++#define sysfs_pd_controller_show(name, var) \ ++do { \ ++ sysfs_hprint(name##_rate, (var)->rate.rate); \ ++ sysfs_print(name##_rate_bytes, (var)->rate.rate); \ ++ sysfs_print(name##_rate_d_term, (var)->d_term); \ ++ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ ++ \ ++ if (attr == &sysfs_##name##_rate_debug) \ ++ return bch2_pd_controller_print_debug(var, buf); \ ++} while (0) ++ ++#define sysfs_pd_controller_store(name, var) \ ++do { \ ++ sysfs_strtoul_clamp(name##_rate, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul_clamp(name##_rate_bytes, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ ++ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ ++ (var)->p_term_inverse, 1, INT_MAX); \ ++} while (0) ++ ++#define container_of_or_null(ptr, type, member) \ ++({ \ ++ typeof(ptr) _ptr = ptr; \ ++ _ptr ? container_of(_ptr, type, member) : NULL; \ ++}) ++ ++/* Does linear interpolation between powers of two */ ++static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) ++{ ++ unsigned fract = x & ~(~0 << fract_bits); ++ ++ x >>= fract_bits; ++ x = 1 << x; ++ x += (x * fract) >> fract_bits; ++ ++ return x; ++} ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t); ++int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); ++ ++static inline sector_t bdev_sectors(struct block_device *bdev) ++{ ++ return bdev->bd_inode->i_size >> 9; ++} ++ ++#define closure_bio_submit(bio, cl) \ ++do { \ ++ closure_get(cl); \ ++ submit_bio(bio); \ ++} while (0) ++ ++#define kthread_wait_freezable(cond) \ ++({ \ ++ int _ret = 0; \ ++ while (1) { \ ++ set_current_state(TASK_INTERRUPTIBLE); \ ++ if (kthread_should_stop()) { \ ++ _ret = -1; \ ++ break; \ ++ } \ ++ \ ++ if (cond) \ ++ break; \ ++ \ ++ schedule(); \ ++ try_to_freeze(); \ ++ } \ ++ set_current_state(TASK_RUNNING); \ ++ _ret; \ ++}) ++ ++size_t bch2_rand_range(size_t); ++ ++void memcpy_to_bio(struct bio *, struct bvec_iter, void *); ++void memcpy_from_bio(void *, struct bio *, struct bvec_iter); ++ ++static inline void memcpy_u64s_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++} ++ ++static inline void __memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("rep ; movsq" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++#endif ++} ++ ++static inline void memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(!(dst >= src + u64s * sizeof(u64) || ++ dst + u64s * sizeof(u64) <= src)); ++ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_down(void *dst, 
const void *src, ++ unsigned u64s) ++{ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void memmove_u64s_down(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst > src); ++ ++ __memmove_u64s_down(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up_small(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s; ++ u64 *src = (u64 *) _src + u64s; ++ ++ while (u64s--) ++ *--dst = *--src; ++} ++ ++static inline void memmove_u64s_up_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up_small(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s - 1; ++ u64 *src = (u64 *) _src + u64s - 1; ++ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("std ;\n" ++ "rep ; movsq\n" ++ "cld ;\n" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ while (u64s--) ++ *dst-- = *src--; ++#endif ++} ++ ++static inline void memmove_u64s_up(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++static inline void memmove_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ if (dst < src) ++ __memmove_u64s_down(dst, src, u64s); ++ else ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */ ++static inline void memset_u64s_tail(void *s, int c, unsigned bytes) ++{ ++ unsigned rem = round_up(bytes, sizeof(u64)) - bytes; ++ ++ memset(s + bytes, c, rem); ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++/* just the memmove, doesn't update @_nr */ ++#define __array_insert_item(_array, _nr, _pos) \ ++ memmove(&(_array)[(_pos) + 1], \ ++ &(_array)[(_pos)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))) ++ ++#define array_insert_item(_array, _nr, _pos, _new_item) \ ++do { \ ++ __array_insert_item(_array, _nr, _pos); \ ++ (_nr)++; \ ++ (_array)[(_pos)] = (_new_item); \ ++} while (0) ++ ++#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ ++do { \ ++ (_nr) -= (_nr_to_remove); \ ++ memmove(&(_array)[(_pos)], \ ++ &(_array)[(_pos) + (_nr_to_remove)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))); \ ++} while (0) ++ ++#define array_remove_item(_array, _nr, _pos) \ ++ array_remove_items(_array, _nr, _pos, 1) ++ ++#define bubble_sort(_base, _nr, _cmp) \ ++do { \ ++ ssize_t _i, _end; \ ++ bool _swapped = true; \ ++ \ ++ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ ++ _swapped = false; \ ++ for (_i = 0; _i < _end; _i++) \ ++ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ ++ swap((_base)[_i], (_base)[_i + 1]); \ ++ _swapped = true; \ ++ } \ ++ } \ ++} while (0) ++ ++static inline u64 percpu_u64_get(u64 __percpu *src) ++{ ++ u64 ret = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ ret += *per_cpu_ptr(src, cpu); ++ return ret; ++} ++ ++static inline void percpu_u64_set(u64 __percpu *dst, u64 src) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ *per_cpu_ptr(dst, cpu) = 0; ++ ++ preempt_disable(); ++ *this_cpu_ptr(dst) = src; ++ preempt_enable(); ++} ++ ++static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) ++{ ++ unsigned i; ++ ++ for (i = 0; i < nr; i++) ++ acc[i] += src[i]; ++} ++ ++static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, ++ 
unsigned nr) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ acc_u64s(acc, per_cpu_ptr(src, cpu), nr); ++} ++ ++static inline void percpu_memset(void __percpu *p, int c, size_t bytes) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(p, cpu), c, bytes); ++} ++ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); ++ ++#define cmp_int(l, r) ((l > r) - (l < r)) ++ ++#endif /* _BCACHEFS_UTIL_H */ +diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h +new file mode 100644 +index 000000000000..c099cdc0605f +--- /dev/null ++++ b/fs/bcachefs/vstructs.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _VSTRUCTS_H ++#define _VSTRUCTS_H ++ ++#include "util.h" ++ ++/* ++ * NOTE: we can't differentiate between __le64 and u64 with type_is - this ++ * assumes u64 is little endian: ++ */ ++#define __vstruct_u64s(_s) \ ++({ \ ++ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ ++ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ ++ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \ ++ : ((__force u8) ((_s)->u64s))); \ ++}) ++ ++#define __vstruct_bytes(_type, _u64s) \ ++({ \ ++ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ ++ \ ++ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ ++}) ++ ++#define vstruct_bytes(_s) \ ++ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) ++ ++#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ ++ (round_up(__vstruct_bytes(_type, _u64s), \ ++ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) ++ ++#define vstruct_blocks(_s, _sector_block_bits) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) ++ ++#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ ++ __vstruct_u64s(_s) + (_u64s)) ++ ++#define vstruct_sectors(_s, _sector_block_bits) \ ++ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) ++ ++#define vstruct_next(_s) \ ++ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_last(_s) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_end(_s) \ ++ ((void *) ((_s)->_data + __vstruct_u64s(_s))) ++ ++#define vstruct_for_each(_s, _i) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s); \ ++ _i = vstruct_next(_i)) ++ ++#define vstruct_for_each_safe(_s, _i, _t) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ ++ _i = _t) ++ ++#define vstruct_idx(_s, _idx) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) ++ ++#endif /* _VSTRUCTS_H */ +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +new file mode 100644 +index 000000000000..4cbdf870c8e8 +--- /dev/null ++++ b/fs/bcachefs/xattr.c +@@ -0,0 +1,584 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "fs.h" ++#include "rebalance.h" ++#include "str_hash.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); ++ ++static u64 bch2_xattr_hash(const struct bch_hash_info *info, ++ const struct xattr_search_key *key) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); ++ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); ++ ++ return bch2_str_hash_end(&ctx, info); ++} ++ ++static u64 xattr_hash_key(const 
struct bch_hash_info *info, const void *key) ++{ ++ return bch2_xattr_hash(info, key); ++} ++ ++static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); ++ ++ return bch2_xattr_hash(info, ++ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); ++} ++ ++static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ const struct xattr_search_key *r = _r; ++ ++ return l.v->x_type != r->type || ++ l.v->x_name_len != r->name.len || ++ memcmp(l.v->x_name, r->name.name, r->name.len); ++} ++ ++static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); ++ ++ return l.v->x_type != r.v->x_type || ++ l.v->x_name_len != r.v->x_name_len || ++ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); ++} ++ ++const struct bch_hash_desc bch2_xattr_hash_desc = { ++ .btree_id = BTREE_ID_XATTRS, ++ .key_type = KEY_TYPE_xattr, ++ .hash_key = xattr_hash_key, ++ .hash_bkey = xattr_hash_bkey, ++ .cmp_key = xattr_cmp_key, ++ .cmp_bkey = xattr_cmp_bkey, ++}; ++ ++const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) ++ return "value too small"; ++ ++ if (bkey_val_u64s(k.k) < ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len))) ++ return "value too small"; ++ ++ if (bkey_val_u64s(k.k) > ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len) + 4)) ++ return "value too big"; ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (!handler) ++ return "invalid type"; ++ ++ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) ++ return "xattr name has invalid characters"; ++ ++ return NULL; ++} ++ ++void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (handler && handler->prefix) ++ pr_buf(out, "%s", handler->prefix); ++ else if (handler) ++ pr_buf(out, "(type %u)", xattr.v->x_type); ++ else ++ pr_buf(out, "(unknown type %u)", xattr.v->x_type); ++ ++ bch_scnmemcpy(out, xattr.v->x_name, ++ xattr.v->x_name_len); ++ pr_buf(out, ":"); ++ bch_scnmemcpy(out, xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++} ++ ++int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, ++ const char *name, void *buffer, size_t size, int type) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(type, name, strlen(name)), ++ 0); ++ if (IS_ERR(iter)) { ++ bch2_trans_exit(&trans); ++ BUG_ON(PTR_ERR(iter) == -EINTR); ++ ++ return PTR_ERR(iter) == -ENOENT ? 
-ENODATA : PTR_ERR(iter); ++ } ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ret = le16_to_cpu(xattr.v->x_val_len); ++ if (buffer) { ++ if (ret > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, xattr_val(xattr.v), ret); ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_xattr_set(struct btree_trans *trans, u64 inum, ++ const struct bch_hash_info *hash_info, ++ const char *name, const void *value, size_t size, ++ int type, int flags) ++{ ++ int ret; ++ ++ if (value) { ++ struct bkey_i_xattr *xattr; ++ unsigned namelen = strlen(name); ++ unsigned u64s = BKEY_U64s + ++ xattr_val_u64s(namelen, size); ++ ++ if (u64s > U8_MAX) ++ return -ERANGE; ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = type; ++ xattr->v.x_name_len = namelen; ++ xattr->v.x_val_len = cpu_to_le16(size); ++ memcpy(xattr->v.x_name, name, namelen); ++ memcpy(xattr_val(&xattr->v), value, size); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, ++ inum, &xattr->k_i, ++ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| ++ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(type, name, strlen(name)); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, ++ hash_info, inum, &search); ++ } ++ ++ if (ret == -ENOENT) ++ ret = flags & XATTR_REPLACE ? -ENODATA : 0; ++ ++ return ret; ++} ++ ++struct xattr_buf { ++ char *buf; ++ size_t len; ++ size_t used; ++}; ++ ++static int __bch2_xattr_emit(const char *prefix, ++ const char *name, size_t name_len, ++ struct xattr_buf *buf) ++{ ++ const size_t prefix_len = strlen(prefix); ++ const size_t total_len = prefix_len + name_len + 1; ++ ++ if (buf->buf) { ++ if (buf->used + total_len > buf->len) ++ return -ERANGE; ++ ++ memcpy(buf->buf + buf->used, prefix, prefix_len); ++ memcpy(buf->buf + buf->used + prefix_len, ++ name, name_len); ++ buf->buf[buf->used + prefix_len + name_len] = '\0'; ++ } ++ ++ buf->used += total_len; ++ return 0; ++} ++ ++static int bch2_xattr_emit(struct dentry *dentry, ++ const struct bch_xattr *xattr, ++ struct xattr_buf *buf) ++{ ++ const struct xattr_handler *handler = ++ bch2_xattr_type_to_handler(xattr->x_type); ++ ++ return handler && (!handler->list || handler->list(dentry)) ++ ? __bch2_xattr_emit(handler->prefix ?: handler->name, ++ xattr->x_name, xattr->x_name_len, buf) ++ : 0; ++} ++ ++static int bch2_xattr_list_bcachefs(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct xattr_buf *buf, ++ bool all) ++{ ++ const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; ++ unsigned id; ++ int ret = 0; ++ u64 v; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ v = bch2_inode_opt_get(&inode->ei_inode, id); ++ if (!v) ++ continue; ++ ++ if (!all && ++ !(inode->ei_inode.bi_fields_set & (1 << id))) ++ continue; ++ ++ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], ++ strlen(bch2_inode_opts[id]), buf); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) ++{ ++ struct bch_fs *c = dentry->d_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; ++ u64 inum = dentry->d_inode->i_ino; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, ++ POS(inum, 0), 0, k, ret) { ++ BUG_ON(k.k->p.inode < inum); ++ ++ if (k.k->p.inode > inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_xattr) ++ continue; ++ ++ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); ++ if (ret) ++ break; ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); ++ if (ret) ++ return ret; ++ ++ return buf.used; ++} ++ ++static int bch2_xattr_get_handler(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); ++} ++ ++static int bch2_xattr_set_handler(const struct xattr_handler *handler, ++ struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC, ++ bch2_xattr_set(&trans, inode->v.i_ino, ++ &inode->ei_str_hash, ++ name, value, size, ++ handler->flags, flags)); ++} ++ ++static const struct xattr_handler bch_xattr_user_handler = { ++ .prefix = XATTR_USER_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_USER, ++}; ++ ++static bool bch2_xattr_trusted_list(struct dentry *dentry) ++{ ++ return capable(CAP_SYS_ADMIN); ++} ++ ++static const struct xattr_handler bch_xattr_trusted_handler = { ++ .prefix = XATTR_TRUSTED_PREFIX, ++ .list = bch2_xattr_trusted_list, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, ++}; ++ ++static const struct xattr_handler bch_xattr_security_handler = { ++ .prefix = XATTR_SECURITY_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_SECURITY, ++}; ++ ++#ifndef NO_BCACHEFS_FS ++ ++static int opt_to_inode_opt(int id) ++{ ++ switch (id) { ++#define x(name, ...) 
\ ++ case Opt_##name: return Inode_opt_##name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ return -1; ++ } ++} ++ ++static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size, ++ bool all) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_opts opts = ++ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); ++ const struct bch_option *opt; ++ int id, inode_opt_id; ++ char buf[512]; ++ struct printbuf out = PBUF(buf); ++ unsigned val_len; ++ u64 v; ++ ++ id = bch2_opt_lookup(name); ++ if (id < 0 || !bch2_opt_is_inode_opt(id)) ++ return -EINVAL; ++ ++ inode_opt_id = opt_to_inode_opt(id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + id; ++ ++ if (!bch2_opt_defined_by_id(&opts, id)) ++ return -ENODATA; ++ ++ if (!all && ++ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) ++ return -ENODATA; ++ ++ v = bch2_opt_get_by_id(&opts, id); ++ bch2_opt_to_text(&out, c, opt, v, 0); ++ ++ val_len = out.pos - buf; ++ ++ if (buffer && val_len > size) ++ return -ERANGE; ++ ++ if (buffer) ++ memcpy(buffer, buf, val_len); ++ return val_len; ++} ++ ++static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, false); ++} ++ ++struct inode_opt_set { ++ int id; ++ u64 v; ++ bool defined; ++}; ++ ++static int inode_opt_set_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_opt_set *s = p; ++ ++ if (s->defined) ++ bi->bi_fields_set |= 1U << s->id; ++ else ++ bi->bi_fields_set &= ~(1U << s->id); ++ ++ bch2_inode_opt_set(bi, s->id, s->v); ++ ++ return 0; ++} ++ ++static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ++ struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ const struct bch_option *opt; ++ char *buf; ++ struct inode_opt_set s; ++ int opt_id, inode_opt_id, ret; ++ ++ opt_id = bch2_opt_lookup(name); ++ if (opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + opt_id; ++ ++ inode_opt_id = opt_to_inode_opt(opt_id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ s.id = inode_opt_id; ++ ++ if (value) { ++ u64 v = 0; ++ ++ buf = kmalloc(size + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ memcpy(buf, value, size); ++ buf[size] = '\0'; ++ ++ ret = bch2_opt_parse(c, opt, buf, &v); ++ kfree(buf); ++ ++ if (ret < 0) ++ return ret; ++ ++ ret = bch2_opt_check_may_set(c, opt_id, v); ++ if (ret < 0) ++ return ret; ++ ++ s.v = v + 1; ++ s.defined = true; ++ } else { ++ if (!IS_ROOT(dentry)) { ++ struct bch_inode_info *dir = ++ to_bch_ei(d_inode(dentry->d_parent)); ++ ++ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); ++ } else { ++ s.v = 0; ++ } ++ ++ s.defined = false; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ if (inode_opt_id == Inode_opt_project) { ++ ret = bch2_set_projid(c, inode, s.v); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (value && ++ (opt_id == Opt_background_compression || ++ opt_id == 
Opt_background_target)) ++ bch2_rebalance_add_work(c, inode->v.i_blocks); ++ ++ return ret; ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_handler = { ++ .prefix = "bcachefs.", ++ .get = bch2_xattr_bcachefs_get, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++static int bch2_xattr_bcachefs_get_effective( ++ const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, true); ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { ++ .prefix = "bcachefs_effective.", ++ .get = bch2_xattr_bcachefs_get_effective, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++const struct xattr_handler *bch2_xattr_handlers[] = { ++ &bch_xattr_user_handler, ++ &posix_acl_access_xattr_handler, ++ &posix_acl_default_xattr_handler, ++ &bch_xattr_trusted_handler, ++ &bch_xattr_security_handler, ++#ifndef NO_BCACHEFS_FS ++ &bch_xattr_bcachefs_handler, ++ &bch_xattr_bcachefs_effective_handler, ++#endif ++ NULL ++}; ++ ++static const struct xattr_handler *bch_xattr_handler_map[] = { ++ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = ++ &posix_acl_access_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = ++ &posix_acl_default_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, ++ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, ++}; ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) ++{ ++ return type < ARRAY_SIZE(bch_xattr_handler_map) ++ ? bch_xattr_handler_map[type] ++ : NULL; ++} +diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h +new file mode 100644 +index 000000000000..4151065ab853 +--- /dev/null ++++ b/fs/bcachefs/xattr.h +@@ -0,0 +1,49 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_XATTR_H ++#define _BCACHEFS_XATTR_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_xattr_hash_desc; ++ ++const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_xattr (struct bkey_ops) { \ ++ .key_invalid = bch2_xattr_invalid, \ ++ .val_to_text = bch2_xattr_to_text, \ ++} ++ ++static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + ++ name_len + val_len, sizeof(u64)); ++} ++ ++#define xattr_val(_xattr) \ ++ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) ++ ++struct xattr_search_key { ++ u8 type; ++ struct qstr name; ++}; ++ ++#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ ++ { .type = _type, .name = QSTR_INIT(_name, _len) }) ++ ++struct dentry; ++struct xattr_handler; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, ++ const char *, void *, size_t, int); ++ ++int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, ++ const char *, const void *, size_t, int, int); ++ ++ssize_t bch2_xattr_list(struct dentry *, char *, size_t); ++ ++extern const struct xattr_handler *bch2_xattr_handlers[]; ++ ++#endif /* _BCACHEFS_XATTR_H */ +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +new file mode 100644 +index 000000000000..db828e9c1390 +--- /dev/null ++++ b/include/trace/events/bcachefs.h +@@ -0,0 
+1,647 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM bcachefs ++ ++#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_BCACHE_H ++ ++#include ++ ++DECLARE_EVENT_CLASS(bpos, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = p->inode; ++ __entry->offset = p->offset; ++ ), ++ ++ TP_printk("%llu:%llu", __entry->inode, __entry->offset) ++); ++ ++DECLARE_EVENT_CLASS(bkey, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, size ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = k->p.inode; ++ __entry->offset = k->p.offset; ++ __entry->size = k->size; ++ ), ++ ++ TP_printk("%llu:%llu len %u", __entry->inode, ++ __entry->offset, __entry->size) ++); ++ ++DECLARE_EVENT_CLASS(bch_fs, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU", __entry->uuid) ++); ++ ++DECLARE_EVENT_CLASS(bio, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(sector_t, sector ) ++ __field(unsigned int, nr_sector ) ++ __array(char, rwbs, 6 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0; ++ __entry->sector = bio->bi_iter.bi_sector; ++ __entry->nr_sector = bio->bi_iter.bi_size >> 9; ++ blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ++ ), ++ ++ TP_printk("%d,%d %s %llu + %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, ++ (unsigned long long)__entry->sector, __entry->nr_sector) ++); ++ ++/* io.c: */ ++ ++DEFINE_EVENT(bio, read_split, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_bounce, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_retry, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, promote, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++/* Journal */ ++ ++DEFINE_EVENT(bch_fs, journal_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, journal_entry_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bio, journal_write, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++/* bset.c: */ ++ ++DEFINE_EVENT(bpos, bkey_pack_pos_fail, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p) ++); ++ ++/* Btree */ ++ ++DECLARE_EVENT_CLASS(btree_node, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u8, level ) ++ __field(u8, id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->level = b->level; ++ __entry->id = b->btree_id; ++ __entry->inode = b->key.k.p.inode; ++ __entry->offset = b->key.k.p.offset; ++ ), ++ ++ TP_printk("%pU %u id %u %llu:%llu", ++ __entry->uuid, __entry->level, __entry->id, ++ __entry->inode, __entry->offset) ++); ++ ++DEFINE_EVENT(btree_node, btree_read, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_write, ++ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), ++ TP_ARGS(b, bytes, sectors), ++ ++ TP_STRUCT__entry( ++ __field(enum btree_node_type, type) ++ __field(unsigned, bytes ) ++ __field(unsigned, sectors ) ++ ), ++ ++ 
TP_fast_assign( ++ __entry->type = btree_node_type(b); ++ __entry->bytes = bytes; ++ __entry->sectors = sectors; ++ ), ++ ++ TP_printk("bkey type %u bytes %u sectors %u", ++ __entry->type , __entry->bytes, __entry->sectors) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_alloc, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_free, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_reap, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU", __entry->uuid) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++TRACE_EVENT(btree_reserve_get_fail, ++ TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), ++ TP_ARGS(c, required, cl), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(size_t, required ) ++ __field(struct closure *, cl ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->required = required; ++ __entry->cl = cl; ++ ), ++ ++ TP_printk("%pU required %zu by %p", __entry->uuid, ++ __entry->required, __entry->cl) ++); ++ ++TRACE_EVENT(btree_insert_key, ++ TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), ++ TP_ARGS(c, b, k), ++ ++ TP_STRUCT__entry( ++ __field(u8, id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, size ) ++ ), ++ ++ TP_fast_assign( ++ __entry->id = b->btree_id; ++ __entry->inode = k->k.p.inode; ++ __entry->offset = k->k.p.offset; ++ __entry->size = k->k.size; ++ ), ++ ++ TP_printk("btree %u: %llu:%llu len %u", __entry->id, ++ __entry->inode, __entry->offset, __entry->size) ++); ++ ++DEFINE_EVENT(btree_node, btree_split, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_compact, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_merge, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_set_root, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++/* Garbage collection */ ++ ++DEFINE_EVENT(btree_node, btree_gc_coalesce, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_gc_coalesce_fail, ++ TP_PROTO(struct bch_fs *c, int reason), ++ TP_ARGS(c, reason), ++ ++ TP_STRUCT__entry( ++ __field(u8, reason ) ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->reason = reason; ++ memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU: %u", __entry->uuid, __entry->reason) ++); ++ ++DEFINE_EVENT(btree_node, btree_gc_rewrite_node, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ 
TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(bch_fs, gc_start, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_end, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_coalesce_start, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_coalesce_end, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++/* Allocator */ ++ ++TRACE_EVENT(alloc_batch, ++ TP_PROTO(struct bch_dev *ca, size_t free, size_t total), ++ TP_ARGS(ca, free, total), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(size_t, free ) ++ __field(size_t, total ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, ca->uuid.b, 16); ++ __entry->free = free; ++ __entry->total = total; ++ ), ++ ++ TP_printk("%pU free %zu total %zu", ++ __entry->uuid, __entry->free, __entry->total) ++); ++ ++TRACE_EVENT(invalidate, ++ TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), ++ TP_ARGS(ca, offset, sectors), ++ ++ TP_STRUCT__entry( ++ __field(unsigned, sectors ) ++ __field(dev_t, dev ) ++ __field(__u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->offset = offset, ++ __entry->sectors = sectors; ++ ), ++ ++ TP_printk("invalidated %u sectors at %d,%d sector=%llu", ++ __entry->sectors, MAJOR(__entry->dev), ++ MINOR(__entry->dev), __entry->offset) ++); ++ ++DEFINE_EVENT(bch_fs, rescale_prios, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DECLARE_EVENT_CLASS(bucket_alloc, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16) ++ __field(enum alloc_reserve, reserve ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, ca->uuid.b, 16); ++ __entry->reserve = reserve; ++ ), ++ ++ TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, bucket_alloc, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++/* Moving IO */ ++ ++DEFINE_EVENT(bkey, move_extent, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_alloc_fail, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_race, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++TRACE_EVENT(move_data, ++ TP_PROTO(struct bch_fs *c, u64 sectors_moved, ++ u64 keys_moved), ++ TP_ARGS(c, sectors_moved, keys_moved), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, sectors_moved ) ++ __field(u64, keys_moved ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->sectors_moved = sectors_moved; ++ __entry->keys_moved = keys_moved; ++ ), ++ ++ TP_printk("%pU sectors_moved %llu keys_moved %llu", ++ __entry->uuid, __entry->sectors_moved, __entry->keys_moved) ++); ++ ++TRACE_EVENT(copygc, ++ TP_PROTO(struct bch_dev *ca, ++ u64 sectors_moved, u64 sectors_not_moved, ++ u64 buckets_moved, u64 buckets_not_moved), ++ TP_ARGS(ca, ++ sectors_moved, sectors_not_moved, ++ buckets_moved, buckets_not_moved), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, sectors_moved ) ++ __field(u64, 
sectors_not_moved ) ++ __field(u64, buckets_moved ) ++ __field(u64, buckets_not_moved ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, ca->uuid.b, 16); ++ __entry->sectors_moved = sectors_moved; ++ __entry->sectors_not_moved = sectors_not_moved; ++ __entry->buckets_moved = buckets_moved; ++ __entry->buckets_not_moved = buckets_moved; ++ ), ++ ++ TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", ++ __entry->uuid, ++ __entry->sectors_moved, __entry->sectors_not_moved, ++ __entry->buckets_moved, __entry->buckets_not_moved) ++); ++ ++DECLARE_EVENT_CLASS(transaction_restart, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%pf", (void *) __entry->ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++TRACE_EVENT(trans_restart_iters_realloced, ++ TP_PROTO(unsigned long ip, unsigned nr), ++ TP_ARGS(ip, nr), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ __field(unsigned, nr ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ __entry->nr = nr; ++ ), ++ ++ TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) ++); ++ ++TRACE_EVENT(trans_restart_mem_realloced, ++ TP_PROTO(unsigned long ip, unsigned long bytes), ++ TP_ARGS(ip, bytes), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ __field(unsigned long, bytes ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ __entry->bytes = bytes; ++ ), ++ ++ TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_mark, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_upgrade, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_traverse, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_atomic, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DECLARE_EVENT_CLASS(node_lock_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq), ++ ++ TP_STRUCT__entry( ++ __field(u32, level) ++ __field(u32, iter_seq) ++ __field(u32, node) ++ __field(u32, node_seq) ++ ), ++ ++ TP_fast_assign( ++ __entry->level = level; ++ __entry->iter_seq = iter_seq; ++ __entry->node = node; ++ __entry->node_seq = node_seq; ++ ), ++ ++ TP_printk("level %u iter seq %u node %u node seq %u", ++ __entry->level, __entry->iter_seq, ++ __entry->node, __entry->node_seq) ++); ++ ++DEFINE_EVENT(node_lock_fail, 
node_upgrade_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq) ++); ++ ++DEFINE_EVENT(node_lock_fail, node_relock_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq) ++); ++ ++#endif /* _TRACE_BCACHE_H */ ++ ++/* This part must be outside protection */ ++#include +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 36e9dcb14387..5631696f938f 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -1639,15 +1639,6 @@ config DEBUG_NOTIFIERS + This is a relatively cheap check but if you care about maximum + performance, say N. + +-config DEBUG_CLOSURES +- bool "Debug closures (bcache async widgits)" +- depends on CLOSURES +- select DEBUG_FS +- help +- Keeps all active closures in a linked list and provides a debugfs +- interface to list them, which makes it possible to see asynchronous +- operations that get stuck. +- + config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST +@@ -1677,6 +1668,15 @@ config DEBUG_CREDENTIALS + + source "kernel/rcu/Kconfig.debug" + ++config DEBUG_CLOSURES ++ bool "Debug closures (bcache async widgits)" ++ depends on CLOSURES ++ select DEBUG_FS ++ help ++ Keeps all active closures in a linked list and provides a debugfs ++ interface to list them, which makes it possible to see asynchronous ++ operations that get stuck. ++ + config DEBUG_WQ_FORCE_RR_CPU + bool "Force round-robin CPU selection for unbound work items" + depends on DEBUG_KERNEL +-- +cgit v1.2.3 + + +From a5f9a93d8ad44bfbfe578b01dce2bdd2745ba578 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 12 Mar 2022 20:10:40 -0500 +Subject: lib/generic-radix-tree.c: Add a missing include + +We now need linux/limits.h for SIZE_MAX. 
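For context on the one-line change below: SIZE_MAX is declared in include/linux/limits.h, so any header that uses it has to include that file directly rather than rely on it arriving via another header. A minimal sketch of the kind of saturating size arithmetic that depends on it, with hypothetical names (example_offset_add), not the actual generic-radix-tree code:

    #include <linux/limits.h>	/* SIZE_MAX */
    #include <linux/types.h>

    /* Saturate to SIZE_MAX instead of wrapping when an offset overflows. */
    static inline size_t example_offset_add(size_t base, size_t obj_size)
    {
            size_t next = base + obj_size;

            return next < base ? SIZE_MAX : next;
    }
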
+ +Signed-off-by: Kent Overstreet +--- + include/linux/generic-radix-tree.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h +index 63080822dc84..f6cd0f909d9f 100644 +--- a/include/linux/generic-radix-tree.h ++++ b/include/linux/generic-radix-tree.h +@@ -38,6 +38,7 @@ + + #include + #include ++#include + #include + #include + #include +-- +cgit v1.2.3 + + +From dca6d4d90ebb249fa968484e81d62a0a0f001f00 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 6 Nov 2019 14:29:30 -0500 +Subject: bcachefs: Fix setting of attributes mask in getattr + +Discovered by xfstests generic/553 + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index eb4e671ae0f0..a7212b91ac4c 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -775,10 +775,15 @@ static int bch2_getattr(struct user_namespace *mnt_userns, + + if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; ++ stat->attributes_mask |= STATX_ATTR_IMMUTABLE; ++ + if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) + stat->attributes |= STATX_ATTR_APPEND; ++ stat->attributes_mask |= STATX_ATTR_APPEND; ++ + if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; ++ stat->attributes_mask |= STATX_ATTR_NODUMP; + + return 0; + } +-- +cgit v1.2.3 + + +From 4f68d69dc83604eb2689969c135f89033d1df9b3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 4 Nov 2019 22:22:13 -0500 +Subject: bcachefs: Some reflink fixes + +len might fit into a loff_t when aligned_len does not - make sure we use +a u64 for aligned_len. Also, we weren't always extending the inode +correctly. 
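To make the loff_t vs u64 point concrete, a small standalone sketch in plain C (the round_up() here is a simplified stand-in for the kernel macro, and the values are made up): a length that still fits in a signed loff_t can round up past LLONG_MAX, which is signed overflow, while the same arithmetic is well defined when carried out in an unsigned 64-bit type.

    #include <stdint.h>
    #include <stdio.h>

    /* simplified stand-in for the kernel's round_up() */
    #define round_up(x, y)  ((((x) + (y) - 1) / (y)) * (y))

    int main(void)
    {
            int64_t len = INT64_MAX - 100;          /* fits in a loff_t */

            /* computing this in int64_t would overflow; as u64 it is fine */
            uint64_t aligned_len = round_up((uint64_t) len, 4096);

            printf("len         = %lld\n", (long long) len);
            printf("aligned_len = %llu\n", (unsigned long long) aligned_len);
            return 0;
    }
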
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 13 ++++++------- + fs/bcachefs/reflink.c | 4 +++- + 2 files changed, 9 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index f8e931e01fcc..d312f7773805 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2857,8 +2857,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct bch_inode_info *dst = file_bch_inode(file_dst); + struct bch_fs *c = src->v.i_sb->s_fs_info; + s64 i_sectors_delta = 0; ++ u64 aligned_len; + loff_t ret = 0; +- loff_t aligned_len; + + if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) + return -EINVAL; +@@ -2887,10 +2887,10 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + if (ret < 0 || len == 0) + goto err; + +- aligned_len = round_up(len, block_bytes(c)); ++ aligned_len = round_up((u64) len, block_bytes(c)); + + ret = write_invalidate_inode_pages_range(dst->v.i_mapping, +- pos_dst, pos_dst + aligned_len); ++ pos_dst, pos_dst + len - 1); + if (ret) + goto err; + +@@ -2905,18 +2905,17 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + if (ret < 0) + goto err; + +- ret <<= 9; + /* + * due to alignment, we might have remapped slightly more than requsted + */ +- ret = min(ret, len); ++ ret = min((u64) ret << 9, (u64) len); + + /* XXX get a quota reservation */ + i_sectors_acct(c, dst, NULL, i_sectors_delta); + + spin_lock(&dst->v.i_lock); +- if (pos_dst + len > dst->v.i_size) +- i_size_write(&dst->v, pos_dst + len); ++ if (pos_dst + ret > dst->v.i_size) ++ i_size_write(&dst->v, pos_dst + ret); + spin_unlock(&dst->v.i_lock); + err: + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 6d45ae24479d..6e71c5e8f9a2 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -290,10 +290,12 @@ err: + ret2 = PTR_ERR_OR_ZERO(inode_iter); + + if (!ret2 && +- inode_u.bi_size < new_i_size) ++ inode_u.bi_size < new_i_size) { ++ inode_u.bi_size = new_i_size; + ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, journal_seq, + BTREE_INSERT_ATOMIC); ++ } + } while (ret2 == -EINTR); + + ret = bch2_trans_exit(&trans) ?: ret; +-- +cgit v1.2.3 + + +From 24584360940078fb3d3d21ce559f63adaab86365 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 26 Oct 2019 14:58:36 -0400 +Subject: bcachefs: Don't BUG_ON() sector count overflow + +Return an error instead (still work in progress...) + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 26 ++++++++++++++------------ + 1 file changed, 14 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index c418398266a3..8d223aa2bee5 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1464,7 +1464,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; +- unsigned old; ++ u16 *dst_sectors; + bool overflow; + int ret; + +@@ -1519,22 +1519,24 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + goto out; + } + +- if (!p.ptr.cached) { +- old = u.dirty_sectors; +- overflow = checked_add(u.dirty_sectors, sectors); +- } else { +- old = u.cached_sectors; +- overflow = checked_add(u.cached_sectors, sectors); ++ dst_sectors = !p.ptr.cached ++ ? 
&u.dirty_sectors ++ : &u.cached_sectors; ++ ++ overflow = checked_add(*dst_sectors, sectors); ++ ++ if (overflow) { ++ bch2_fs_inconsistent(c, ++ "bucket sector count overflow: %u + %lli > U16_MAX", ++ *dst_sectors, sectors); ++ /* return an error indicating that we need full fsck */ ++ ret = -EIO; ++ goto out; + } + + u.data_type = u.dirty_sectors || u.cached_sectors + ? data_type : 0; + +- bch2_fs_inconsistent_on(overflow, c, +- "bucket sector count overflow: %u + %lli > U16_MAX", +- old, sectors); +- BUG_ON(overflow); +- + a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); + ret = PTR_ERR_OR_ZERO(a); + if (ret) +-- +cgit v1.2.3 + + +From 1e38028f03f5589cc72064bb1e1713c693279d4b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 6 Nov 2019 15:32:11 -0500 +Subject: bcachefs: Add an option for fsck error ratelimiting + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/error.c | 13 +++++++++---- + fs/bcachefs/error.h | 1 + + fs/bcachefs/opts.h | 11 +++++++++++ + 3 files changed, 21 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +index 304ff92500be..5a5cfee623e2 100644 +--- a/fs/bcachefs/error.c ++++ b/fs/bcachefs/error.c +@@ -64,7 +64,7 @@ void bch2_io_error(struct bch_dev *ca) + enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, + const char *fmt, ...) + { +- struct fsck_err_state *s; ++ struct fsck_err_state *s = NULL; + va_list args; + bool fix = false, print = true, suppressing = false; + char _buf[sizeof(s->buf)], *buf = _buf; +@@ -99,8 +99,13 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, + found: + list_move(&s->list, &c->fsck_errors); + s->nr++; +- suppressing = s->nr == FSCK_ERR_RATELIMIT_NR; +- print = s->nr <= FSCK_ERR_RATELIMIT_NR; ++ if (c->opts.ratelimit_errors && ++ s->nr >= FSCK_ERR_RATELIMIT_NR) { ++ if (s->nr == FSCK_ERR_RATELIMIT_NR) ++ suppressing = true; ++ else ++ print = false; ++ } + buf = s->buf; + print: + va_start(args, fmt); +@@ -156,7 +161,7 @@ void bch2_flush_fsck_errs(struct bch_fs *c) + mutex_lock(&c->fsck_error_lock); + + list_for_each_entry_safe(s, n, &c->fsck_errors, list) { +- if (s->nr > FSCK_ERR_RATELIMIT_NR) ++ if (s->ratelimited) + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); + + list_del(&s->list); +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index 2591e12305b7..7dcb0f6552fc 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -114,6 +114,7 @@ struct fsck_err_state { + struct list_head list; + const char *fmt; + u64 nr; ++ bool ratelimited; + char buf[512]; + }; + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index bd2058f1fe2b..0ec0999a6214 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -68,6 +68,12 @@ enum opt_type { + * - helptext + */ + ++#ifdef __KERNEL__ ++#define RATELIMIT_ERRORS true ++#else ++#define RATELIMIT_ERRORS false ++#endif ++ + #define BCH_OPTS() \ + x(block_size, u16, \ + OPT_FORMAT, \ +@@ -227,6 +233,11 @@ enum opt_type { + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Fix errors during fsck without asking") \ ++ x(ratelimit_errors, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, RATELIMIT_ERRORS, \ ++ NULL, "Ratelimit error messages during fsck") \ + x(nochanges, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ +-- +cgit v1.2.3 + + +From cbb7c7cd055707d7c9f785f88350567cd1bc0c8d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 4 Nov 2019 15:56:04 -0500 +Subject: bcachefs: Avoid calling bch2_btree_iter_relock() in + bch2_btree_iter_traverse() + +--- + fs/bcachefs/btree_iter.c | 
14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5fab505dbea0..a4180124d7d1 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -294,9 +294,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) + __flatten + static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) + { +- return iter->uptodate >= BTREE_ITER_NEED_RELOCK +- ? btree_iter_get_locks(iter, false, trace) +- : true; ++ return btree_iter_get_locks(iter, false, trace); + } + + bool __bch2_btree_iter_upgrade(struct btree_iter *iter, +@@ -1098,7 +1096,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter) + if (unlikely(iter->level >= BTREE_MAX_DEPTH)) + return 0; + +- if (bch2_btree_iter_relock(iter, false)) ++ /* ++ * if we need interior nodes locked, call btree_iter_relock() to make ++ * sure we walk back up enough that we lock them: ++ */ ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK || ++ iter->locks_want > 1) ++ bch2_btree_iter_relock(iter, false); ++ ++ if (iter->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + + /* +-- +cgit v1.2.3 + + +From 9bfcdc0c949c10ad1952c1ef0feb819e74518e66 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Nov 2019 15:00:08 -0500 +Subject: bcachefs: Inline fast path of bch2_increment_clock() + +Shaving more cycles. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/clock.c | 7 +++---- + fs/bcachefs/clock.h | 13 ++++++++++++- + 2 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +index 8ac6990c6971..f18266330687 100644 +--- a/fs/bcachefs/clock.c ++++ b/fs/bcachefs/clock.c +@@ -135,17 +135,16 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, + return ret; + } + +-void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw) ++void __bch2_increment_clock(struct io_clock *clock) + { +- struct io_clock *clock = &c->io_clock[rw]; + struct io_timer *timer; + unsigned long now; ++ unsigned sectors; + + /* Buffer up one megabyte worth of IO in the percpu counter */ + preempt_disable(); + +- if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) < +- IO_CLOCK_PCPU_SECTORS)) { ++ if (this_cpu_read(*clock->pcpu_buf) < IO_CLOCK_PCPU_SECTORS) { + preempt_enable(); + return; + } +diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h +index 5cb043c579d8..bfbbca8a207b 100644 +--- a/fs/bcachefs/clock.h ++++ b/fs/bcachefs/clock.h +@@ -6,7 +6,18 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); + void bch2_io_timer_del(struct io_clock *, struct io_timer *); + void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, + unsigned long); +-void bch2_increment_clock(struct bch_fs *, unsigned, int); ++ ++void __bch2_increment_clock(struct io_clock *); ++ ++static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, ++ int rw) ++{ ++ struct io_clock *clock = &c->io_clock[rw]; ++ ++ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= ++ IO_CLOCK_PCPU_SECTORS)) ++ __bch2_increment_clock(clock); ++} + + void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); + +-- +cgit v1.2.3 + + +From 0982dc2d30bd6504141d3a41904f9480cdfb79e5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Nov 2019 15:03:09 -0500 +Subject: bcachefs: Make __bch2_bkey_cmp_packed() smaller + +We can probably get rid of the version that dispatches based on type +checking too. 
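The clock.h hunk in the bch2_increment_clock() patch above is a classic inline-fast-path / out-of-line-slow-path split around a per-CPU counter. A generic sketch of that pattern with made-up names (example_clock, EXAMPLE_PCPU_BATCH), not the bcachefs types: callers add to a per-CPU buffer inline, and the out-of-line flush only runs once the local buffer crosses a threshold, so the common case stays a percpu add plus one predicted branch.

    #include <linux/compiler.h>
    #include <linux/percpu.h>

    #define EXAMPLE_PCPU_BATCH      (1U << 11)      /* 2048 sectors, ~1MB of IO */

    struct example_clock {
            unsigned __percpu       *pcpu_buf;
            /* global clock state, only touched by the slow path */
    };

    void __example_clock_flush(struct example_clock *clock);       /* out of line */

    static inline void example_clock_add(struct example_clock *clock,
                                         unsigned sectors)
    {
            if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
                         EXAMPLE_PCPU_BATCH))
                    __example_clock_flush(clock);
    }
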
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.c | 26 ++++++++++---------------- + 1 file changed, 10 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +index ed7ca5b0636d..4d0c9129cd4a 100644 +--- a/fs/bcachefs/bkey.c ++++ b/fs/bcachefs/bkey.c +@@ -1058,26 +1058,20 @@ int __bch2_bkey_cmp_packed(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) + { +- int packed = bkey_lr_packed(l, r); ++ struct bkey unpacked; + +- if (likely(packed == BKEY_PACKED_BOTH)) ++ if (likely(bkey_packed(l) && bkey_packed(r))) + return __bch2_bkey_cmp_packed_format_checked(l, r, b); + +- switch (packed) { +- case BKEY_PACKED_NONE: +- return bkey_cmp(((struct bkey *) l)->p, +- ((struct bkey *) r)->p); +- case BKEY_PACKED_LEFT: +- return __bch2_bkey_cmp_left_packed_format_checked(b, +- (struct bkey_packed *) l, +- &((struct bkey *) r)->p); +- case BKEY_PACKED_RIGHT: +- return -__bch2_bkey_cmp_left_packed_format_checked(b, +- (struct bkey_packed *) r, +- &((struct bkey *) l)->p); +- default: +- unreachable(); ++ if (bkey_packed(l)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, l); ++ l = (void*) &unpacked; ++ } else if (bkey_packed(r)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, r); ++ r = (void*) &unpacked; + } ++ ++ return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); + } + + __pure __flatten +-- +cgit v1.2.3 + + +From 11f4adda6ecd301bc0d9ecf862606d479aeaf6d7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Nov 2019 15:14:10 -0500 +Subject: bcachefs: Pipeline binary searches and linear searches + +This makes prefetching for the linear search at the end of the lookup +much more effective, and is a couple percent speedup. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 114 ++++++++++++++++++++++++++++++++--------------------- + 1 file changed, 69 insertions(+), 45 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index ff9465750528..c3164c3013a6 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -1338,6 +1338,25 @@ static int bset_search_tree_slowpath(const struct btree *b, + packed_search, search) < 0; + } + ++static inline void prefetch_four_cachelines(void *p) ++{ ++#ifdef CONFIG_X86_64 ++ asm(".intel_syntax noprefix;" ++ "prefetcht0 [%0 - 127 + 64 * 0];" ++ "prefetcht0 [%0 - 127 + 64 * 1];" ++ "prefetcht0 [%0 - 127 + 64 * 2];" ++ "prefetcht0 [%0 - 127 + 64 * 3];" ++ ".att_syntax prefix;" ++ : ++ : "r" (p + 127)); ++#else ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ prefetch(p + L1_CACHE_BYTES * 3); ++#endif ++} ++ + __flatten + static struct bkey_packed *bset_search_tree(const struct btree *b, + struct bset_tree *t, +@@ -1345,34 +1364,12 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, + const struct bkey_packed *packed_search) + { + struct ro_aux_tree *base = ro_aux_tree_base(b, t); +- struct bkey_float *f = bkey_float_get(base, 1); +- void *p; ++ struct bkey_float *f; + unsigned inorder, n = 1; + +- while (1) { +- if (likely(n << 4 < t->size)) { +- p = bkey_float_get(base, n << 4); +- prefetch(p); +- } else if (n << 3 < t->size) { +- inorder = __eytzinger1_to_inorder(n, t->size, t->extra); +- p = bset_cacheline(b, t, inorder); +-#ifdef CONFIG_X86_64 +- asm(".intel_syntax noprefix;" +- "prefetcht0 [%0 - 127 + 64 * 0];" +- "prefetcht0 [%0 - 127 + 64 * 1];" +- "prefetcht0 [%0 - 127 + 64 * 2];" +- "prefetcht0 [%0 - 127 + 64 * 3];" +- ".att_syntax prefix;" +- : 
+- : "r" (p + 127)); +-#else +- prefetch(p + L1_CACHE_BYTES * 0); +- prefetch(p + L1_CACHE_BYTES * 1); +- prefetch(p + L1_CACHE_BYTES * 2); +- prefetch(p + L1_CACHE_BYTES * 3); +-#endif +- } else if (n >= t->size) +- break; ++ do { ++ if (likely(n << 4 < t->size)) ++ prefetch(bkey_float_get(base, n << 4)); + + f = bkey_float_get(base, n); + +@@ -1403,17 +1400,12 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, + } + } + +-/* +- * Returns the first key greater than or equal to @search +- */ +-__always_inline __flatten +-static struct bkey_packed *bch2_bset_search(struct btree *b, ++static __always_inline __flatten ++struct bkey_packed *__bch2_bset_search(struct btree *b, + struct bset_tree *t, + struct bpos *search, +- struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search) + { +- struct bkey_packed *m; + + /* + * First, we search for a cacheline, then lastly we do a linear search +@@ -1432,11 +1424,9 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: +- m = btree_bkey_first(b, t); +- break; ++ return btree_bkey_first(b, t); + case BSET_RW_AUX_TREE: +- m = bset_search_write_set(b, t, search, lossy_packed_search); +- break; ++ return bset_search_write_set(b, t, search, lossy_packed_search); + case BSET_RO_AUX_TREE: + /* + * Each node in the auxiliary search tree covers a certain range +@@ -1448,10 +1438,20 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, + if (bkey_cmp(*search, t->max_key) > 0) + return btree_bkey_last(b, t); + +- m = bset_search_tree(b, t, search, lossy_packed_search); +- break; ++ return bset_search_tree(b, t, search, lossy_packed_search); ++ default: ++ unreachable(); + } ++} + ++static __always_inline __flatten ++struct bkey_packed *bch2_bset_search_linear(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search, ++ struct bkey_packed *m) ++{ + if (lossy_packed_search) + while (m != btree_bkey_last(b, t) && + bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, +@@ -1474,6 +1474,23 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, + return m; + } + ++/* ++ * Returns the first key greater than or equal to @search ++ */ ++static __always_inline __flatten ++struct bkey_packed *bch2_bset_search(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search) ++{ ++ struct bkey_packed *m = __bch2_bset_search(b, t, search, ++ lossy_packed_search); ++ ++ return bch2_bset_search_linear(b, t, search, ++ packed_search, lossy_packed_search, m); ++} ++ + /* Btree node iterator */ + + static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, +@@ -1564,9 +1581,10 @@ __flatten + void bch2_btree_node_iter_init(struct btree_node_iter *iter, + struct btree *b, struct bpos *search) + { +- struct bset_tree *t; + struct bkey_packed p, *packed_search = NULL; + struct btree_node_iter_set *pos = iter->data; ++ struct bkey_packed *k[MAX_BSETS]; ++ unsigned i; + + EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); + bset_aux_tree_verify(b); +@@ -1585,14 +1603,20 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, + return; + } + +- for_each_bset(b, t) { +- struct bkey_packed *k = bch2_bset_search(b, t, search, +- packed_search, &p); ++ for (i = 0; i < b->nsets; i++) { ++ k[i] = __bch2_bset_search(b, b->set + i, search, &p); ++ 
prefetch_four_cachelines(k[i]); ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ struct bset_tree *t = b->set + i; + struct bkey_packed *end = btree_bkey_last(b, t); + +- if (k != end) ++ k[i] = bch2_bset_search_linear(b, t, search, ++ packed_search, &p, k[i]); ++ if (k[i] != end) + *pos++ = (struct btree_node_iter_set) { +- __btree_node_key_to_offset(b, k), ++ __btree_node_key_to_offset(b, k[i]), + __btree_node_key_to_offset(b, end) + }; + } +-- +cgit v1.2.3 + + +From 395bae8996d718604b0c9d2ccb0894a3154bbd3d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Nov 2019 15:04:13 -0500 +Subject: bcachefs: bch2_read_extent() microoptimizations + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 836004b128f0..e3ef662e2a12 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1270,7 +1270,6 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) + closure_return_with_destructor(cl, promote_done); + } + +-noinline + static struct promote_op *__promote_alloc(struct bch_fs *c, + enum btree_id btree_id, + struct bpos pos, +@@ -1344,7 +1343,8 @@ err: + return NULL; + } + +-static inline struct promote_op *promote_alloc(struct bch_fs *c, ++noinline ++static struct promote_op *promote_alloc(struct bch_fs *c, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, +@@ -1908,7 +1908,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) + flags |= BCH_READ_MUST_BOUNCE; + +- BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); ++ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (pick.crc.compression_type != BCH_COMPRESSION_NONE || + (pick.crc.csum_type != BCH_CSUM_NONE && +@@ -1920,8 +1920,9 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + bounce = true; + } + +- promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, +- &rbio, &bounce, &read_full); ++ if (orig->opts.promote_target) ++ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, ++ &rbio, &bounce, &read_full); + + if (!read_full) { + EBUG_ON(pick.crc.compression_type); +@@ -1949,7 +1950,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + * data in the write path, but we're not going to use it all + * here: + */ +- BUG_ON(rbio->bio.bi_iter.bi_size < ++ EBUG_ON(rbio->bio.bi_iter.bi_size < + pick.crc.compressed_size << 9); + rbio->bio.bi_iter.bi_size = + pick.crc.compressed_size << 9; +@@ -1982,10 +1983,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + noclone: + rbio = orig; + rbio->bio.bi_iter = iter; +- BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); ++ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + } + +- BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); ++ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + + rbio->c = c; + rbio->submit_time = local_clock(); +@@ -2001,6 +2002,7 @@ noclone: + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; ++ /* XXX: only initialize this if needed */ + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->pos = pos; +@@ -2017,11 +2019,11 @@ noclone: + + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + +- percpu_down_read(&c->mark_lock); ++ rcu_read_lock(); + bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); +- percpu_up_read(&c->mark_lock); ++ 
rcu_read_unlock(); + +- if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { ++ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } +-- +cgit v1.2.3 + + +From 415bd2aecd980f675bdccaa3738a438d9255383a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 6 Nov 2019 16:37:29 -0500 +Subject: bcachefs: kill BFLOAT_FAILED_PREV + +The assumption underlying BFLOAT_FAILED_PREV was wrong; the comparison +we're doing in bset_search_tree() doesn't have to tell the pivot apart +from the previous key, it just has to tell if search is definitely +greater than or equal to the pivot. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 59 ++++------------------------------------------- + fs/bcachefs/bset.h | 1 - + fs/bcachefs/btree_cache.c | 2 -- + 3 files changed, 4 insertions(+), 58 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index c3164c3013a6..1992b31af40d 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -295,9 +295,8 @@ static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, + /* Auxiliary search trees */ + + #define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) +-#define BFLOAT_FAILED_PREV (U8_MAX - 1) +-#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2) +-#define BFLOAT_FAILED (U8_MAX - 2) ++#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 1) ++#define BFLOAT_FAILED (U8_MAX - 1) + + #define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) + +@@ -710,14 +709,11 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + { + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); +- struct bkey_packed *p = tree_to_prev_bkey(b, t, j); + struct bkey_packed *l, *r; + unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16; + unsigned mantissa; + int shift, exponent, high_bit; + +- EBUG_ON(bkey_next(p) != m); +- + if (is_power_of_2(j)) { + l = min_key; + +@@ -759,8 +755,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + * the original key. + */ + +- if (!bkey_packed(l) || !bkey_packed(r) || +- !bkey_packed(p) || !bkey_packed(m) || ++ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || + !b->nr_key_bits) { + f->exponent = BFLOAT_FAILED_UNPACKED; + return; +@@ -810,19 +805,6 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + + bfloat_mantissa_set(f, j, mantissa); + +- /* +- * The bfloat must be able to tell its key apart from the previous key - +- * if its key and the previous key don't differ in the required bits, +- * flag as failed - unless the keys are actually equal, in which case +- * we aren't required to return a specific one: +- */ +- if (exponent > 0 && +- bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) && +- bkey_cmp_packed(b, p, m)) { +- f->exponent = BFLOAT_FAILED_PREV; +- return; +- } +- + /* + * f->mantissa must compare >= the original key - for transitivity with + * the comparison in bset_search_tree. 
If we're dropping set bits, +@@ -1817,9 +1799,6 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) + case BFLOAT_FAILED_UNPACKED: + stats->failed_unpacked++; + break; +- case BFLOAT_FAILED_PREV: +- stats->failed_prev++; +- break; + case BFLOAT_FAILED_OVERFLOW: + stats->failed_overflow++; + break; +@@ -1832,9 +1811,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + struct bkey_packed *k) + { + struct bset_tree *t = bch2_bkey_to_bset(b, k); +- struct bkey_packed *l, *r, *p; +- struct bkey uk, up; +- char buf1[200], buf2[200]; ++ struct bkey uk; + unsigned j, inorder; + + if (out->pos != out->end) +@@ -1860,34 +1837,6 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + ilog2(j), + uk.p.inode, uk.p.offset); + break; +- case BFLOAT_FAILED_PREV: +- p = tree_to_prev_bkey(b, t, j); +- l = is_power_of_2(j) +- ? btree_bkey_first(b, t) +- : tree_to_prev_bkey(b, t, j >> ffs(j)); +- r = is_power_of_2(j + 1) +- ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) +- : tree_to_bkey(b, t, j >> (ffz(j) + 1)); +- +- up = bkey_unpack_key(b, p); +- uk = bkey_unpack_key(b, k); +- bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); +- bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); +- +- pr_buf(out, +- " failed prev at depth %u\n" +- "\tkey starts at bit %u but first differing bit at %u\n" +- "\t%llu:%llu\n" +- "\t%llu:%llu\n" +- "\t%s\n" +- "\t%s\n", +- ilog2(j), +- bch2_bkey_greatest_differing_bit(b, l, r), +- bch2_bkey_greatest_differing_bit(b, p, k), +- uk.p.inode, uk.p.offset, +- up.p.inode, up.p.offset, +- buf1, buf2); +- break; + case BFLOAT_FAILED_OVERFLOW: + uk = bkey_unpack_key(b, k); + pr_buf(out, +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 643bd9e8bc4d..737eb1a90279 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -583,7 +583,6 @@ struct bset_stats { + + size_t floats; + size_t failed_unpacked; +- size_t failed_prev; + size_t failed_overflow; + }; + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 416949512057..b56ac1e53ef5 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -910,7 +910,6 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + " nr unpacked keys %u\n" + " floats %zu\n" + " failed unpacked %zu\n" +- " failed prev %zu\n" + " failed overflow %zu\n", + f->key_u64s, + f->bits_per_field[0], +@@ -929,6 +928,5 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + b->nr.unpacked_keys, + stats.floats, + stats.failed_unpacked, +- stats.failed_prev, + stats.failed_overflow); + } +-- +cgit v1.2.3 + + +From af5ce7032d8330d83e68f96e30ad959a71519feb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 23 Oct 2019 14:56:20 -0400 +Subject: bcachefs: Fall back to slowpath on exact comparison + +This is basically equivalent to the original strategy of falling back to +checking against the original key when the original key and previous key +didn't differ in the required bits - except, now we only fall back when +the search key doesn't differ in the required bits, which ends up being +a bit faster. 
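+
+A condensed sketch of the new descent step (illustrative only; helper
+names are the ones used in the code below):
+
+    l = f->mantissa;
+    r = bkey_mantissa(packed_search, f, n);
+
+    if (l != r || !bkey_mantissa_bits_dropped(b, f, n)) {
+        n = n * 2 + (l < r);            /* fast path: mantissas decide */
+    } else {
+        /* mantissas tie and bits were dropped: do the full comparison */
+        cmp = bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
+                                packed_search, search);
+        if (!cmp)
+            return tree_to_bkey(b, t, n);
+        n = n * 2 + (cmp < 0);
+    }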
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 100 +++++++++++++++++++++------------------------- + fs/bcachefs/bset.h | 3 +- + fs/bcachefs/btree_cache.c | 6 +-- + 3 files changed, 48 insertions(+), 61 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 1992b31af40d..6c3f4ea8c259 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -294,9 +294,8 @@ static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, + + /* Auxiliary search trees */ + +-#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) +-#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 1) +-#define BFLOAT_FAILED (U8_MAX - 1) ++#define BFLOAT_FAILED_UNPACKED U8_MAX ++#define BFLOAT_FAILED U8_MAX + + #define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) + +@@ -804,23 +803,6 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + mantissa |= ~(~0U << -exponent); + + bfloat_mantissa_set(f, j, mantissa); +- +- /* +- * f->mantissa must compare >= the original key - for transitivity with +- * the comparison in bset_search_tree. If we're dropping set bits, +- * increment it: +- */ +- if (exponent > (int) bch2_bkey_ffs(b, m)) { +- if (j < BFLOAT_32BIT_NR +- ? f->mantissa32 == U32_MAX +- : f->mantissa16 == U16_MAX) +- f->exponent = BFLOAT_FAILED_OVERFLOW; +- +- if (j < BFLOAT_32BIT_NR) +- f->mantissa32++; +- else +- f->mantissa16++; +- } + } + + /* bytes remaining - only valid for last bset: */ +@@ -1310,16 +1292,6 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, + return rw_aux_to_bkey(b, t, l); + } + +-noinline +-static int bset_search_tree_slowpath(const struct btree *b, +- struct bset_tree *t, struct bpos *search, +- const struct bkey_packed *packed_search, +- unsigned n) +-{ +- return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n), +- packed_search, search) < 0; +-} +- + static inline void prefetch_four_cachelines(void *p) + { + #ifdef CONFIG_X86_64 +@@ -1339,6 +1311,22 @@ static inline void prefetch_four_cachelines(void *p) + #endif + } + ++static inline bool bkey_mantissa_bits_dropped(const struct btree *b, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; ++ ++ return f->exponent > key_bits_start; ++#else ++ unsigned key_bits_end = high_bit_offset + b->nr_key_bits; ++ unsigned mantissa_bits = n < BFLOAT_32BIT_NR ? 
32 : 16; ++ ++ return f->exponent + mantissa_bits < key_bits_end; ++#endif ++} ++ + __flatten + static struct bkey_packed *bset_search_tree(const struct btree *b, + struct bset_tree *t, +@@ -1347,7 +1335,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, + { + struct ro_aux_tree *base = ro_aux_tree_base(b, t); + struct bkey_float *f; +- unsigned inorder, n = 1; ++ struct bkey_packed *k; ++ unsigned inorder, n = 1, l, r; ++ int cmp; + + do { + if (likely(n << 4 < t->size)) +@@ -1355,13 +1345,26 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, + + f = bkey_float_get(base, n); + +- if (packed_search && +- likely(f->exponent < BFLOAT_FAILED)) +- n = n * 2 + (bfloat_mantissa(f, n) < +- bkey_mantissa(packed_search, f, n)); +- else +- n = n * 2 + bset_search_tree_slowpath(b, t, +- search, packed_search, n); ++ if (!unlikely(packed_search)) ++ goto slowpath; ++ if (unlikely(f->exponent >= BFLOAT_FAILED)) ++ goto slowpath; ++ ++ l = bfloat_mantissa(f, n); ++ r = bkey_mantissa(packed_search, f, n); ++ ++ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) ++ goto slowpath; ++ ++ n = n * 2 + (l < r); ++ continue; ++slowpath: ++ k = tree_to_bkey(b, t, n); ++ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); ++ if (!cmp) ++ return k; ++ ++ n = n * 2 + (cmp < 0); + } while (n < t->size); + + inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); +@@ -1795,14 +1798,9 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) + stats->floats += t->size - 1; + + for (j = 1; j < t->size; j++) +- switch (bkey_float(b, t, j)->exponent) { +- case BFLOAT_FAILED_UNPACKED: +- stats->failed_unpacked++; +- break; +- case BFLOAT_FAILED_OVERFLOW: +- stats->failed_overflow++; +- break; +- } ++ stats->failed += ++ bkey_float(b, t, j)->exponent == ++ BFLOAT_FAILED; + } + } + } +@@ -1829,7 +1827,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + return; + + switch (bkey_float(b, t, j)->exponent) { +- case BFLOAT_FAILED_UNPACKED: ++ case BFLOAT_FAILED: + uk = bkey_unpack_key(b, k); + pr_buf(out, + " failed unpacked at depth %u\n" +@@ -1837,13 +1835,5 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + ilog2(j), + uk.p.inode, uk.p.offset); + break; +- case BFLOAT_FAILED_OVERFLOW: +- uk = bkey_unpack_key(b, k); +- pr_buf(out, +- " failed overflow at depth %u\n" +- "\t%llu:%llu\n", +- ilog2(j), +- uk.p.inode, uk.p.offset); +- break; + } + } +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 737eb1a90279..ccc0866d6435 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -582,8 +582,7 @@ struct bset_stats { + } sets[BSET_TREE_NR_TYPES]; + + size_t floats; +- size_t failed_unpacked; +- size_t failed_overflow; ++ size_t failed; + }; + + void bch2_btree_keys_stats(struct btree *, struct bset_stats *); +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index b56ac1e53ef5..5d3acba525c2 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -909,8 +909,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + " nr packed keys %u\n" + " nr unpacked keys %u\n" + " floats %zu\n" +- " failed unpacked %zu\n" +- " failed overflow %zu\n", ++ " failed unpacked %zu\n", + f->key_u64s, + f->bits_per_field[0], + f->bits_per_field[1], +@@ -927,6 +926,5 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + b->nr.packed_keys, + b->nr.unpacked_keys, + stats.floats, +- stats.failed_unpacked, +- stats.failed_overflow); ++ stats.failed); + } 
+-- +cgit v1.2.3 + + +From e28a31c2142c8aacb3d47caeb71729e79efea463 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 6 Nov 2019 19:40:09 -0500 +Subject: bcachefs: Go back to 16 bit mantissa bkey floats + +The previous optimizations means using 32 bit mantissas are now a net +loss - having bkey_float be only 4 bytes is good for prefetching. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 99 +++++++++++++++--------------------------------------- + 1 file changed, 28 insertions(+), 71 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 6c3f4ea8c259..23493309ba63 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -297,33 +297,20 @@ static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, + #define BFLOAT_FAILED_UNPACKED U8_MAX + #define BFLOAT_FAILED U8_MAX + +-#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) +- + struct bkey_float { + u8 exponent; + u8 key_offset; +- union { +- u32 mantissa32; +- struct { +- u16 mantissa16; +- u16 _pad; +- }; +- }; +-} __packed; +- +-#define BFLOAT_32BIT_NR 32U ++ u16 mantissa; ++}; ++#define BKEY_MANTISSA_BITS 16 + + static unsigned bkey_float_byte_offset(unsigned idx) + { +- int d = (idx - BFLOAT_32BIT_NR) << 1; +- +- d &= ~(d >> 31); +- +- return idx * 6 - d; ++ return idx * sizeof(struct bkey_float); + } + + struct ro_aux_tree { +- struct bkey_float _d[0]; ++ struct bkey_float f[0]; + }; + + struct rw_aux_tree { +@@ -378,8 +365,8 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) + return t->aux_data_offset; + case BSET_RO_AUX_TREE: + return t->aux_data_offset + +- DIV_ROUND_UP(bkey_float_byte_offset(t->size) + +- sizeof(u8) * t->size, 8); ++ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + ++ t->size * sizeof(u8), 8); + case BSET_RW_AUX_TREE: + return t->aux_data_offset + + DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); +@@ -418,17 +405,11 @@ static u8 *ro_aux_tree_prev(const struct btree *b, + return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); + } + +-static struct bkey_float *bkey_float_get(struct ro_aux_tree *b, +- unsigned idx) +-{ +- return (void *) b + bkey_float_byte_offset(idx); +-} +- + static struct bkey_float *bkey_float(const struct btree *b, + const struct bset_tree *t, + unsigned idx) + { +- return bkey_float_get(ro_aux_tree_base(b, t), idx); ++ return ro_aux_tree_base(b, t)->f + idx; + } + + static void bset_aux_tree_verify(struct btree *b) +@@ -662,21 +643,6 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, + return idx; + } + +-static inline unsigned bfloat_mantissa(const struct bkey_float *f, +- unsigned idx) +-{ +- return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16; +-} +- +-static inline void bfloat_mantissa_set(struct bkey_float *f, +- unsigned idx, unsigned mantissa) +-{ +- if (idx < BFLOAT_32BIT_NR) +- f->mantissa32 = mantissa; +- else +- f->mantissa16 = mantissa; +-} +- + static inline unsigned bkey_mantissa(const struct bkey_packed *k, + const struct bkey_float *f, + unsigned idx) +@@ -696,9 +662,9 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + v >>= f->exponent & 7; + #else +- v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16); ++ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; + #endif +- return idx < BFLOAT_32BIT_NR ? 
(u32) v : (u16) v; ++ return (u16) v; + } + + static void make_bfloat(struct btree *b, struct bset_tree *t, +@@ -709,7 +675,6 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); + struct bkey_packed *l, *r; +- unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16; + unsigned mantissa; + int shift, exponent, high_bit; + +@@ -771,8 +736,8 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + * of the key: we handle this later: + */ + high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), +- min_t(unsigned, bits, b->nr_key_bits) - 1); +- exponent = high_bit - (bits - 1); ++ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); ++ exponent = high_bit - (BKEY_MANTISSA_BITS - 1); + + /* + * Then we calculate the actual shift value, from the start of the key +@@ -781,12 +746,12 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; + +- EBUG_ON(shift + bits > b->format.key_u64s * 64); ++ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); + #else + shift = high_bit_offset + + b->nr_key_bits - + exponent - +- bits; ++ BKEY_MANTISSA_BITS; + + EBUG_ON(shift < KEY_PACKED_BITS_START); + #endif +@@ -802,7 +767,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + if (exponent < 0) + mantissa |= ~(~0U << -exponent); + +- bfloat_mantissa_set(f, j, mantissa); ++ f->mantissa = mantissa; + } + + /* bytes remaining - only valid for last bset: */ +@@ -815,14 +780,8 @@ static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) + + static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) + { +- unsigned bytes = __bset_tree_capacity(b, t); +- +- if (bytes < 7 * BFLOAT_32BIT_NR) +- return bytes / 7; +- +- bytes -= 7 * BFLOAT_32BIT_NR; +- +- return BFLOAT_32BIT_NR + bytes / 5; ++ return __bset_tree_capacity(b, t) / ++ (sizeof(struct bkey_float) + sizeof(u8)); + } + + static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) +@@ -1321,9 +1280,8 @@ static inline bool bkey_mantissa_bits_dropped(const struct btree *b, + return f->exponent > key_bits_start; + #else + unsigned key_bits_end = high_bit_offset + b->nr_key_bits; +- unsigned mantissa_bits = n < BFLOAT_32BIT_NR ? 32 : 16; + +- return f->exponent + mantissa_bits < key_bits_end; ++ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; + #endif + } + +@@ -1341,16 +1299,16 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, + + do { + if (likely(n << 4 < t->size)) +- prefetch(bkey_float_get(base, n << 4)); ++ prefetch(&base->f[n << 4]); + +- f = bkey_float_get(base, n); ++ f = &base->f[n]; + + if (!unlikely(packed_search)) + goto slowpath; + if (unlikely(f->exponent >= BFLOAT_FAILED)) + goto slowpath; + +- l = bfloat_mantissa(f, n); ++ l = f->mantissa; + r = bkey_mantissa(packed_search, f, n); + + if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) +@@ -1373,16 +1331,15 @@ slowpath: + * n would have been the node we recursed to - the low bit tells us if + * we recursed left or recursed right. 
+ */ +- if (n & 1) { +- return cacheline_to_bkey(b, t, inorder, f->key_offset); +- } else { +- if (--inorder) { +- n = eytzinger1_prev(n >> 1, t->size); +- f = bkey_float_get(base, n); +- return cacheline_to_bkey(b, t, inorder, f->key_offset); +- } else ++ if (likely(!(n & 1))) { ++ --inorder; ++ if (unlikely(!inorder)) + return btree_bkey_first(b, t); ++ ++ f = &base->f[eytzinger1_prev(n >> 1, t->size)]; + } ++ ++ return cacheline_to_bkey(b, t, inorder, f->key_offset); + } + + static __always_inline __flatten +-- +cgit v1.2.3 + + +From f4b82685c63b420af3f0d0979b1f774ab88e8c5e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 8 Nov 2019 15:09:36 -0500 +Subject: bcachefs: Remove some BKEY_PADDED uses + +Prep work for extents with inline data + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 8 ++------ + fs/bcachefs/extents.c | 17 +++++++---------- + 2 files changed, 9 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index e32fad5a91ac..2cac269b386f 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -418,7 +418,7 @@ bch2_sort_repack_merge(struct bch_fs *c, + struct bkey_packed *prev = NULL, *k_packed; + struct bkey_s k; + struct btree_nr_keys nr; +- BKEY_PADDED(k) tmp; ++ struct bkey unpacked; + + memset(&nr, 0, sizeof(nr)); + +@@ -426,11 +426,7 @@ bch2_sort_repack_merge(struct bch_fs *c, + if (filter_whiteouts && bkey_whiteout(k_packed)) + continue; + +- EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) > +- BKEY_EXTENT_VAL_U64s_MAX); +- +- bch2_bkey_unpack(src, &tmp.k, k_packed); +- k = bkey_i_to_s(&tmp.k); ++ k = __bkey_disassemble(src, k_packed, &unpacked); + + if (filter_whiteouts && + bch2_bkey_normalize(c, k)) +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 4cc2a4b13199..576ccdf939ee 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -1218,7 +1218,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, + struct bkey_i whiteout = *insert; + struct bkey_packed *_k; + struct bkey unpacked; +- BKEY_PADDED(k) tmp; + + EBUG_ON(iter->level); + EBUG_ON(!insert->k.size); +@@ -1292,25 +1291,23 @@ next: + bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); + + if (update_btree) { +- bkey_copy(&tmp.k, insert); +- + if (deleting) +- tmp.k.k.type = KEY_TYPE_discard; ++ insert->k.type = KEY_TYPE_discard; + +- EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); ++ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + +- extent_bset_insert(c, iter, &tmp.k); ++ extent_bset_insert(c, iter, insert); + } + + if (update_journal) { +- bkey_copy(&tmp.k, !deleting ? insert : &whiteout); ++ struct bkey_i *k = !deleting ? insert : &whiteout; + + if (deleting) +- tmp.k.k.type = KEY_TYPE_discard; ++ k->k.type = KEY_TYPE_discard; + +- EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); ++ EBUG_ON(bkey_deleted(&k->k) || !k->k.size); + +- bch2_btree_journal_key(trans, iter, &tmp.k); ++ bch2_btree_journal_key(trans, iter, k); + } + + bch2_cut_front(insert->k.p, insert); +-- +cgit v1.2.3 + + +From 78834c99152e85caa422d97aee503f88b6947fbf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 9 Nov 2019 14:58:09 -0500 +Subject: bcachefs: Be slightly less tricky with union usage + +This is to fix a valgrind complaint - the code was correct, but too +tricky for valgrind to know that. 
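+
+Concretely (a condensed sketch of the change below): the caller used to
+write the tag into the destination union and bch2_extent_crc_pack()
+would read it back out to decide which member to fill in:
+
+    crc->type = 1 << BCH_EXTENT_ENTRY_crc32;
+    bch2_extent_crc_pack(crc, new);     /* switches on the type read back from crc */
+
+Now the tag is passed alongside, so the union is only ever written:
+
+    bch2_extent_crc_pack(crc, new, BCH_EXTENT_ENTRY_crc32);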
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 576ccdf939ee..b9c69792f81f 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -1387,16 +1387,18 @@ static unsigned bch2_crc_field_size_max[] = { + }; + + static void bch2_extent_crc_pack(union bch_extent_crc *dst, +- struct bch_extent_crc_unpacked src) ++ struct bch_extent_crc_unpacked src, ++ enum bch_extent_entry_type type) + { + #define set_common_fields(_dst, _src) \ ++ _dst.type = 1 << type; \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + +- switch (extent_entry_type(to_entry(dst))) { ++ switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); +@@ -1423,23 +1425,24 @@ void bch2_extent_crc_append(struct bkey_i *k, + { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; ++ enum bch_extent_entry_type type; + + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) +- crc->type = 1 << BCH_EXTENT_ENTRY_crc32; ++ type = BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) +- crc->type = 1 << BCH_EXTENT_ENTRY_crc64; ++ type = BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) +- crc->type = 1 << BCH_EXTENT_ENTRY_crc128; ++ type = BCH_EXTENT_ENTRY_crc128; + else + BUG(); + +- bch2_extent_crc_pack(crc, new); ++ bch2_extent_crc_pack(crc, new, type); + + k->k.u64s += extent_entry_u64s(ptrs.end); + +@@ -1642,7 +1645,8 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + +- bch2_extent_crc_pack(entry_to_crc(en_l), crc_l); ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, ++ extent_entry_type(en_l)); + } + + bch2_key_resize(l.k, l.k->size + r.k->size); +-- +cgit v1.2.3 + + +From e884e0e172febec68d3a582575b7be22e18b788a Mon Sep 17 00:00:00 2001 +From: Justin Husted +Date: Sat, 9 Nov 2019 19:15:40 -0800 +Subject: bcachefs: Set lost+found mode to 0700 + +For security and conformance with other filesystems, the lost+found +directory should not be world or group accessible. 
+ +Signed-off-by: Justin Husted +--- + fs/bcachefs/fsck.c | 2 +- + fs/bcachefs/recovery.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 3cced2b99f3f..0f2308e53d65 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -797,7 +797,7 @@ create_lostfound: + bch2_create_trans(&trans, + BCACHEFS_ROOT_INO, root_inode, + lostfound_inode, &lostfound, +- 0, 0, S_IFDIR|0755, 0, NULL, NULL)); ++ 0, 0, S_IFDIR|0700, 0, NULL, NULL)); + if (ret) + bch_err(c, "error creating lost+found: %i", ret); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 2c441a278044..d1184bf62cae 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1013,7 +1013,7 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + &root_inode, &lostfound_inode, + &lostfound, +- 0, 0, S_IFDIR|0755, 0, ++ 0, 0, S_IFDIR|0700, 0, + NULL, NULL)); + if (ret) + goto err; +-- +cgit v1.2.3 + + +From 9236845be5a8de16d6fcdf3544b1c497e392fe60 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 11 Nov 2019 13:42:10 -0500 +Subject: bcachefs: Fix erorr path in bch2_write() + +The error path in bch2_write wasn't updated when the end_io callback was +added to bch_write_op. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index e3ef662e2a12..390949b41757 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1174,7 +1174,12 @@ void bch2_write(struct closure *cl) + err: + if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) + bch2_disk_reservation_put(c, &op->res); +- closure_return(cl); ++ if (op->end_io) ++ op->end_io(op); ++ if (cl->parent) ++ closure_return(cl); ++ else ++ closure_debug_destroy(cl); + } + + /* Cache promotion on read */ +-- +cgit v1.2.3 + + +From d5499947a0c5888324dc2acdbb04e51d085b3259 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 13 Nov 2019 19:45:48 -0500 +Subject: bcachefs: Use wbc_to_write_flags() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index d312f7773805..9c11cc486ccf 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1115,6 +1115,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) + * possible, else allocating a new one: + */ + static void bch2_writepage_io_alloc(struct bch_fs *c, ++ struct writeback_control *wbc, + struct bch_writepage_state *w, + struct bch_inode_info *inode, + u64 sector, +@@ -1138,6 +1139,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, + op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->pos = POS(inode->v.i_ino, sector); + op->wbio.bio.bi_iter.bi_sector = sector; ++ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); + } + + static int __bch2_writepage(struct page *page, +@@ -1248,7 +1250,7 @@ do_io: + bch2_writepage_do_io(w); + + if (!w->io) +- bch2_writepage_io_alloc(c, w, inode, sector, ++ bch2_writepage_io_alloc(c, wbc, w, inode, sector, + nr_replicas_this_write); + + atomic_inc(&s->write_count); +@@ -1265,9 +1267,6 @@ do_io: + w->io->op.i_sectors_delta -= dirty_sectors; + w->io->op.new_i_size = i_size; + +- if (wbc->sync_mode == WB_SYNC_ALL) +- w->io->op.wbio.bio.bi_opf |= REQ_SYNC; +- + offset += sectors; + } + +-- +cgit v1.2.3 + + +From 2884fee758c8cad0db9311c9d52af0c68427b56b Mon Sep 17 00:00:00 2001 +From: 
Kent Overstreet +Date: Wed, 13 Nov 2019 19:46:11 -0500 +Subject: bcachefs: Make memcpy_to_bio() param const + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/util.c | 2 +- + fs/bcachefs/util.h | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index 2cc433ec0e3a..e69d03d1109f 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -550,7 +550,7 @@ size_t bch2_rand_range(size_t max) + return rand; + } + +-void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src) ++void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) + { + struct bio_vec bv; + struct bvec_iter iter; +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 7e96ff7fda5c..b2f423e49954 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -546,7 +546,7 @@ do { \ + + size_t bch2_rand_range(size_t); + +-void memcpy_to_bio(struct bio *, struct bvec_iter, void *); ++void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); + void memcpy_from_bio(void *, struct bio *, struct bvec_iter); + + static inline void memcpy_u64s_small(void *dst, const void *src, +-- +cgit v1.2.3 + + +From a7ee021ed9c947806ca41fb95e1f739290cf6ae6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 9 Nov 2019 16:01:15 -0500 +Subject: bcachefs: bkey_on_stack + +This implements code for storing small bkeys on the stack and allocating +out of a mempool if they're too big. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 2 + + fs/bcachefs/bkey_on_stack.h | 35 +++++++++++++++++ + fs/bcachefs/bkey_sort.c | 13 +++++-- + fs/bcachefs/ec.c | 12 ++++-- + fs/bcachefs/extents.c | 18 ++++++--- + fs/bcachefs/fs-io.c | 92 +++++++++++++++++++++++++-------------------- + fs/bcachefs/fs.c | 29 ++++++++------ + fs/bcachefs/io.c | 63 +++++++++++++++++++------------ + fs/bcachefs/migrate.c | 16 +++++--- + fs/bcachefs/move.c | 10 +++-- + fs/bcachefs/reflink.c | 17 ++++++--- + fs/bcachefs/super.c | 2 + + 12 files changed, 205 insertions(+), 104 deletions(-) + create mode 100644 fs/bcachefs/bkey_on_stack.h + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 4c209c6b4aad..b1e1ceb61b73 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -727,6 +727,8 @@ struct bch_fs { + + atomic64_t key_version; + ++ mempool_t large_bkey_pool; ++ + /* REBALANCE */ + struct bch_fs_rebalance rebalance; + +diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h +new file mode 100644 +index 000000000000..d4739038323f +--- /dev/null ++++ b/fs/bcachefs/bkey_on_stack.h +@@ -0,0 +1,35 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_ON_STACK_H ++#define _BCACHEFS_BKEY_ON_STACK_H ++ ++#include "bcachefs.h" ++ ++struct bkey_on_stack { ++ struct bkey_i *k; ++ u64 onstack[12]; ++}; ++ ++static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, ++ struct bch_fs *c, unsigned u64s) ++{ ++ if (s->k == (void *) s->onstack && ++ u64s > ARRAY_SIZE(s->onstack)) { ++ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); ++ memcpy(s->k, s->onstack, sizeof(s->onstack)); ++ } ++} ++ ++static inline void bkey_on_stack_init(struct bkey_on_stack *s) ++{ ++ s->k = (void *) s->onstack; ++} ++ ++static inline void bkey_on_stack_exit(struct bkey_on_stack *s, ++ struct bch_fs *c) ++{ ++ if (s->k != (void *) s->onstack) ++ mempool_free(s->k, &c->large_bkey_pool); ++ s->k = NULL; ++} ++ ++#endif /* _BCACHEFS_BKEY_ON_STACK_H */ +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 
2cac269b386f..f5c0507ad79d 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + #include "bcachefs.h" ++#include "bkey_on_stack.h" + #include "bkey_sort.h" + #include "bset.h" + #include "extents.h" +@@ -292,8 +293,10 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + struct bkey l_unpacked, r_unpacked; + struct bkey_s l, r; + struct btree_nr_keys nr; ++ struct bkey_on_stack split; + + memset(&nr, 0, sizeof(nr)); ++ bkey_on_stack_init(&split); + + heap_resort(iter, extent_sort_cmp, NULL); + +@@ -349,13 +352,13 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + + extent_sort_sift(iter, b, _r - iter->data); + } else if (bkey_cmp(l.k->p, r.k->p) > 0) { +- BKEY_PADDED(k) tmp; ++ bkey_on_stack_realloc(&split, c, l.k->u64s); + + /* + * r wins, but it overlaps in the middle of l - split l: + */ +- bkey_reassemble(&tmp.k, l.s_c); +- bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); ++ bkey_reassemble(split.k, l.s_c); ++ bch2_cut_back(bkey_start_pos(r.k), &split.k->k); + + __bch2_cut_front(r.k->p, l); + extent_save(b, lk, l.k); +@@ -363,7 +366,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + extent_sort_sift(iter, b, 0); + + extent_sort_append(c, f, &nr, dst->start, +- &prev, bkey_i_to_s(&tmp.k)); ++ &prev, bkey_i_to_s(split.k)); + } else { + bch2_cut_back(bkey_start_pos(r.k), l.k); + extent_save(b, lk, l.k); +@@ -373,6 +376,8 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + extent_sort_advance_prev(f, &nr, dst->start, &prev); + + dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); ++ ++ bkey_on_stack_exit(&split, c); + return nr; + } + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 47a11a2d69dd..51e00386f173 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -4,6 +4,7 @@ + + #include "bcachefs.h" + #include "alloc_foreground.h" ++#include "bkey_on_stack.h" + #include "bset.h" + #include "btree_gc.h" + #include "btree_update.h" +@@ -777,9 +778,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + struct bkey_s_c k; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; +- BKEY_PADDED(k) tmp; ++ struct bkey_on_stack sk; + int ret = 0, dev, idx; + ++ bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, +@@ -804,8 +806,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + + dev = s->key.v.ptrs[idx].dev; + +- bkey_reassemble(&tmp.k, k); +- e = bkey_i_to_s_extent(&tmp.k); ++ bkey_on_stack_realloc(&sk, c, k.k->u64s); ++ bkey_reassemble(sk.k, k); ++ e = bkey_i_to_s_extent(sk.k); + + extent_for_each_ptr(e, ptr) + if (ptr->dev != dev) +@@ -816,7 +819,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + + extent_stripe_ptr_add(e, s, ptr, idx); + +- bch2_trans_update(&trans, iter, &tmp.k); ++ bch2_trans_update(&trans, iter, sk.k); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| +@@ -829,6 +832,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + } + + bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); + + return ret; + } +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index b9c69792f81f..12799f7ba1db 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -8,6 +8,7 @@ + + #include "bcachefs.h" + #include "bkey_methods.h" ++#include "bkey_on_stack.h" + #include "btree_gc.h" + #include "btree_update.h" + #include "btree_update_interior.h" +@@ -1133,7 +1134,11 @@ 
extent_squash(struct bch_fs *c, struct btree_iter *iter, + break; + } + case BCH_EXTENT_OVERLAP_MIDDLE: { +- BKEY_PADDED(k) split; ++ struct bkey_on_stack split; ++ ++ bkey_on_stack_init(&split); ++ bkey_on_stack_realloc(&split, c, k.k->u64s); ++ + /* + * The insert key falls 'in the middle' of k + * The insert key splits k in 3: +@@ -1148,18 +1153,19 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, + * modify k _before_ doing the insert (which will move + * what k points to) + */ +- bkey_reassemble(&split.k, k.s_c); +- split.k.k.needs_whiteout |= bkey_written(l->b, _k); ++ bkey_reassemble(split.k, k.s_c); ++ split.k->k.needs_whiteout |= bkey_written(l->b, _k); + +- bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); +- BUG_ON(bkey_deleted(&split.k.k)); ++ bch2_cut_back(bkey_start_pos(&insert->k), &split.k->k); ++ BUG_ON(bkey_deleted(&split.k->k)); + + __bch2_cut_front(insert->k.p, k); + BUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + +- extent_bset_insert(c, iter, &split.k); ++ extent_bset_insert(c, iter, split.k); ++ bkey_on_stack_exit(&split, c); + break; + } + } +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 9c11cc486ccf..a94e016b5355 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -3,6 +3,7 @@ + + #include "bcachefs.h" + #include "alloc_foreground.h" ++#include "bkey_on_stack.h" + #include "btree_update.h" + #include "buckets.h" + #include "clock.h" +@@ -774,6 +775,18 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) + } + } + ++static bool extent_partial_reads_expensive(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (crc.csum_type || crc.compression_type) ++ return true; ++ return false; ++} ++ + static void readpage_bio_extend(struct readpages_iter *iter, + struct bio *bio, + unsigned sectors_this_extent, +@@ -827,15 +840,17 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, + struct readpages_iter *readpages_iter) + { + struct bch_fs *c = trans->c; ++ struct bkey_on_stack sk; + int flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE; + int ret = 0; + + rbio->c = c; + rbio->start_time = local_clock(); ++ ++ bkey_on_stack_init(&sk); + retry: + while (1) { +- BKEY_PADDED(k) tmp; + struct bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; + +@@ -847,15 +862,16 @@ retry: + if (ret) + break; + +- bkey_reassemble(&tmp.k, k); +- k = bkey_i_to_s_c(&tmp.k); ++ bkey_on_stack_realloc(&sk, c, k.k->u64s); ++ bkey_reassemble(sk.k, k); ++ k = bkey_i_to_s_c(sk.k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(trans, +- &offset_into_extent, &tmp.k); ++ &offset_into_extent, sk.k); + if (ret) + break; + +@@ -863,22 +879,9 @@ retry: + + bch2_trans_unlock(trans); + +- if (readpages_iter) { +- bool want_full_extent = false; +- +- if (bkey_extent_is_data(k.k)) { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *i; +- struct extent_ptr_decoded p; +- +- bkey_for_each_ptr_decode(k.k, ptrs, p, i) +- want_full_extent |= ((p.crc.csum_type != 0) | +- (p.crc.compression_type != 0)); +- } +- +- readpage_bio_extend(readpages_iter, &rbio->bio, +- sectors, want_full_extent); +- } ++ if (readpages_iter) ++ readpage_bio_extend(readpages_iter, &rbio->bio, sectors, ++ 
extent_partial_reads_expensive(k)); + + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; + swap(rbio->bio.bi_iter.bi_size, bytes); +@@ -892,7 +895,7 @@ retry: + bch2_read_extent(c, rbio, k, offset_into_extent, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) +- return; ++ break; + + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); +@@ -901,8 +904,12 @@ retry: + if (ret == -EINTR) + goto retry; + +- bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); +- bio_endio(&rbio->bio); ++ if (ret) { ++ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); ++ bio_endio(&rbio->bio); ++ } ++ ++ bkey_on_stack_exit(&sk, c); + } + + int bch2_readpages(struct file *file, struct address_space *mapping, +@@ -2407,6 +2414,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; ++ struct bkey_on_stack copy; + struct btree_trans trans; + struct btree_iter *src, *dst, *del = NULL; + loff_t shift, new_size; +@@ -2416,6 +2424,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + if ((offset | len) & (block_bytes(c) - 1)) + return -EINVAL; + ++ bkey_on_stack_init(©); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); + + /* +@@ -2484,7 +2493,6 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + while (1) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); +- BKEY_PADDED(k) copy; + struct bkey_i delete; + struct bkey_s_c k; + struct bpos next_pos; +@@ -2509,34 +2517,35 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) + break; + reassemble: +- bkey_reassemble(©.k, k); ++ bkey_on_stack_realloc(©, c, k.k->u64s); ++ bkey_reassemble(copy.k, k); + + if (insert && + bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { +- bch2_cut_front(move_pos, ©.k); +- bch2_btree_iter_set_pos(src, bkey_start_pos(©.k.k)); ++ bch2_cut_front(move_pos, copy.k); ++ bch2_btree_iter_set_pos(src, bkey_start_pos(©.k->k)); + } + +- copy.k.k.p.offset += shift >> 9; +- bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k.k)); ++ copy.k->k.p.offset += shift >> 9; ++ bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); + +- ret = bch2_extent_atomic_end(dst, ©.k, &atomic_end); ++ ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); + if (ret) + goto bkey_err; + +- if (bkey_cmp(atomic_end, copy.k.k.p)) { ++ if (bkey_cmp(atomic_end, copy.k->k.p)) { + if (insert) { + move_pos = atomic_end; + move_pos.offset -= shift >> 9; + goto reassemble; + } else { +- bch2_cut_back(atomic_end, ©.k.k); ++ bch2_cut_back(atomic_end, ©.k->k); + } + } + + bkey_init(&delete.k); + delete.k.p = src->pos; +- bch2_key_resize(&delete.k, copy.k.k.size); ++ bch2_key_resize(&delete.k, copy.k->k.size); + + next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; + +@@ -2549,12 +2558,12 @@ reassemble: + * by the triggers machinery: + */ + if (insert && +- bkey_cmp(bkey_start_pos(©.k.k), delete.k.p) < 0) { +- bch2_cut_back(bkey_start_pos(©.k.k), &delete.k); ++ bkey_cmp(bkey_start_pos(©.k->k), delete.k.p) < 0) { ++ bch2_cut_back(bkey_start_pos(©.k->k), &delete.k); + } else if (!insert && +- bkey_cmp(copy.k.k.p, ++ bkey_cmp(copy.k->k.p, + bkey_start_pos(&delete.k)) > 0) { +- bch2_cut_front(copy.k.k.p, &delete); ++ bch2_cut_front(copy.k->k.p, &delete); + + del = bch2_trans_copy_iter(&trans, src); + BUG_ON(IS_ERR_OR_NULL(del)); +@@ -2563,10 +2572,10 @@ reassemble: + bkey_start_pos(&delete.k)); + } + +- bch2_trans_update(&trans, dst, ©.k); ++ bch2_trans_update(&trans, dst, copy.k); + bch2_trans_update(&trans, del ?: src, &delete); + +- if (copy.k.k.size == k.k->size) { ++ if (copy.k->k.size == k.k->size) { + /* + * If we're moving the entire extent, we can skip + * running triggers: +@@ -2575,10 +2584,10 @@ reassemble: + } else { + /* We might end up splitting compressed extents: */ + unsigned nr_ptrs = +- bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(©.k)); ++ bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(copy.k)); + + ret = bch2_disk_reservation_get(c, &disk_res, +- copy.k.k.size, nr_ptrs, ++ copy.k->k.size, nr_ptrs, + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + } +@@ -2613,6 +2622,7 @@ bkey_err: + } + err: + bch2_trans_exit(&trans); ++ bkey_on_stack_exit(©, c); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); + return ret; +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index a7212b91ac4c..fc9022e2508c 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -3,6 +3,7 @@ + + #include "bcachefs.h" + #include "acl.h" ++#include "bkey_on_stack.h" + #include "btree_update.h" + #include "buckets.h" + #include "chardev.h" +@@ -874,7 +875,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- BKEY_PADDED(k) cur, prev; ++ struct bkey_on_stack cur, prev; + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); + unsigned offset_into_extent, sectors; + bool have_extent = false; +@@ -887,6 +888,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + if (start + len < start) + return -EINVAL; + ++ bkey_on_stack_init(&cur); ++ bkey_on_stack_init(&prev); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, +@@ -901,15 +904,17 @@ retry: + continue; + } + +- bkey_reassemble(&cur.k, k); +- k = bkey_i_to_s_c(&cur.k); ++ bkey_on_stack_realloc(&cur, c, k.k->u64s); ++ bkey_on_stack_realloc(&prev, c, k.k->u64s); ++ bkey_reassemble(cur.k, k); ++ k = bkey_i_to_s_c(cur.k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, +- &offset_into_extent, &cur.k); ++ &offset_into_extent, cur.k); + if (ret) + break; + +@@ -919,19 +924,19 @@ retry: + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + + offset_into_extent), +- &cur.k); +- bch2_key_resize(&cur.k.k, sectors); +- cur.k.k.p = iter->pos; +- cur.k.k.p.offset += cur.k.k.size; ++ cur.k); ++ bch2_key_resize(&cur.k->k, sectors); ++ cur.k->k.p = iter->pos; ++ cur.k->k.p.offset += cur.k->k.size; + + if (have_extent) { + ret = bch2_fill_extent(c, info, +- bkey_i_to_s_c(&prev.k), 0); ++ bkey_i_to_s_c(prev.k), 0); + if (ret) + break; + } + +- bkey_copy(&prev.k, &cur.k); ++ 
bkey_copy(prev.k, cur.k); + have_extent = true; + + if (k.k->type == KEY_TYPE_reflink_v) +@@ -944,10 +949,12 @@ retry: + goto retry; + + if (!ret && have_extent) +- ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), ++ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), + FIEMAP_EXTENT_LAST); + + ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&cur, c); ++ bkey_on_stack_exit(&prev, c); + return ret < 0 ? ret : 0; + } + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 390949b41757..7b713bc10d1a 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -8,6 +8,7 @@ + + #include "bcachefs.h" + #include "alloc_foreground.h" ++#include "bkey_on_stack.h" + #include "bset.h" + #include "btree_update.h" + #include "buckets.h" +@@ -384,12 +385,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, + int bch2_write_index_default(struct bch_write_op *op) + { + struct bch_fs *c = op->c; ++ struct bkey_on_stack sk; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; + struct btree_iter *iter; + int ret; + ++ bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, +@@ -397,13 +400,14 @@ int bch2_write_index_default(struct bch_write_op *op) + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + do { +- BKEY_PADDED(k) tmp; ++ k = bch2_keylist_front(keys); + +- bkey_copy(&tmp.k, bch2_keylist_front(keys)); ++ bkey_on_stack_realloc(&sk, c, k->k.u64s); ++ bkey_copy(sk.k, k); + + bch2_trans_begin_updates(&trans); + +- ret = bch2_extent_update(&trans, iter, &tmp.k, ++ ret = bch2_extent_update(&trans, iter, sk.k, + &op->res, op_journal_seq(op), + op->new_i_size, &op->i_sectors_delta); + if (ret == -EINTR) +@@ -411,13 +415,14 @@ int bch2_write_index_default(struct bch_write_op *op) + if (ret) + break; + +- if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) +- bch2_cut_front(iter->pos, bch2_keylist_front(keys)); ++ if (bkey_cmp(iter->pos, k->k.p) < 0) ++ bch2_cut_front(iter->pos, k); + else + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); + + return ret; + } +@@ -1461,13 +1466,14 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio + { + struct btree_trans trans; + struct btree_iter *iter; +- BKEY_PADDED(k) tmp; ++ struct bkey_on_stack sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + ++ bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, +@@ -1479,11 +1485,12 @@ retry: + if (bkey_err(k)) + goto err; + +- bkey_reassemble(&tmp.k, k); +- k = bkey_i_to_s_c(&tmp.k); ++ bkey_on_stack_realloc(&sk, c, k.k->u64s); ++ bkey_reassemble(sk.k, k); ++ k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + +- if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k), ++ if (!bch2_bkey_matches_ptr(c, k, + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset)) { +@@ -1500,6 +1507,7 @@ retry: + out: + bch2_rbio_done(rbio); + bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); + return; + err: + rbio->bio.bi_status = BLK_STS_IOERR; +@@ -1512,12 +1520,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, + { + struct btree_trans trans; + struct btree_iter *iter; ++ struct bkey_on_stack sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= 
BCH_READ_MUST_CLONE; + ++ bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + retry: + bch2_trans_begin(&trans); +@@ -1525,18 +1535,18 @@ retry: + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS, k, ret) { +- BKEY_PADDED(k) tmp; + unsigned bytes, sectors, offset_into_extent; + +- bkey_reassemble(&tmp.k, k); +- k = bkey_i_to_s_c(&tmp.k); ++ bkey_on_stack_realloc(&sk, c, k.k->u64s); ++ bkey_reassemble(sk.k, k); ++ k = bkey_i_to_s_c(sk.k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, +- &offset_into_extent, &tmp.k); ++ &offset_into_extent, sk.k); + if (ret) + break; + +@@ -1575,6 +1585,7 @@ err: + rbio->bio.bi_status = BLK_STS_IOERR; + out: + bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); + bch2_rbio_done(rbio); + } + +@@ -1631,7 +1642,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- BKEY_PADDED(k) new; ++ struct bkey_on_stack new; + struct bch_extent_crc_unpacked new_crc; + u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; + int ret; +@@ -1639,6 +1650,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) + if (rbio->pick.crc.compression_type) + return; + ++ bkey_on_stack_init(&new); + bch2_trans_init(&trans, c, 0, 0); + retry: + bch2_trans_begin(&trans); +@@ -1649,8 +1661,9 @@ retry: + if (IS_ERR_OR_NULL(k.k)) + goto out; + +- bkey_reassemble(&new.k, k); +- k = bkey_i_to_s_c(&new.k); ++ bkey_on_stack_realloc(&new, c, k.k->u64s); ++ bkey_reassemble(new.k, k); ++ k = bkey_i_to_s_c(new.k); + + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) +@@ -1669,10 +1682,10 @@ retry: + goto out; + } + +- if (!bch2_bkey_narrow_crcs(&new.k, new_crc)) ++ if (!bch2_bkey_narrow_crcs(new.k, new_crc)) + goto out; + +- bch2_trans_update(&trans, iter, &new.k); ++ bch2_trans_update(&trans, iter, new.k); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| +@@ -1681,6 +1694,7 @@ retry: + goto retry; + out: + bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&new, c); + } + + /* Inner part that may run in process context */ +@@ -2105,6 +2119,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) + { + struct btree_trans trans; + struct btree_iter *iter; ++ struct bkey_on_stack sk; + struct bkey_s_c k; + unsigned flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| +@@ -2118,6 +2133,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) + rbio->c = c; + rbio->start_time = local_clock(); + ++ bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + retry: + bch2_trans_begin(&trans); +@@ -2126,7 +2142,6 @@ retry: + POS(inode, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_SLOTS); + while (1) { +- BKEY_PADDED(k) tmp; + unsigned bytes, sectors, offset_into_extent; + + bch2_btree_iter_set_pos(iter, +@@ -2137,15 +2152,16 @@ retry: + if (ret) + goto err; + +- bkey_reassemble(&tmp.k, k); +- k = bkey_i_to_s_c(&tmp.k); +- + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + ++ bkey_on_stack_realloc(&sk, c, k.k->u64s); ++ bkey_reassemble(sk.k, k); ++ k = bkey_i_to_s_c(sk.k); ++ + ret = bch2_read_indirect_extent(&trans, +- &offset_into_extent, &tmp.k); ++ &offset_into_extent, sk.k); + if (ret) + goto err; + +@@ -2177,6 
+2193,7 @@ retry: + } + out: + bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); + return; + err: + if (ret == -EINTR) +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index de8522f754e2..4dacbd637d02 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -4,6 +4,7 @@ + */ + + #include "bcachefs.h" ++#include "bkey_on_stack.h" + #include "btree_update.h" + #include "btree_update_interior.h" + #include "buckets.h" +@@ -40,9 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- BKEY_PADDED(key) tmp; ++ struct bkey_on_stack sk; + int ret = 0; + ++ bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, +@@ -58,9 +60,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + continue; + } + +- bkey_reassemble(&tmp.key, k); ++ bkey_on_stack_realloc(&sk, c, k.k->u64s); ++ bkey_reassemble(sk.k, k); + +- ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key), ++ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), + dev_idx, flags, false); + if (ret) + break; +@@ -70,11 +73,11 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or just a discard if it was a cached extent) + */ +- bch2_extent_normalize(c, bkey_i_to_s(&tmp.key)); ++ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + +- bch2_btree_iter_set_pos(iter, bkey_start_pos(&tmp.key.k)); ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + +- bch2_trans_update(&trans, iter, &tmp.key); ++ bch2_trans_update(&trans, iter, sk.k); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| +@@ -92,6 +95,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + } + + ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&sk, c); + + BUG_ON(ret == -EINTR); + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index ab20e981145b..87ff04bdfc73 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -2,6 +2,7 @@ + + #include "bcachefs.h" + #include "alloc_foreground.h" ++#include "bkey_on_stack.h" + #include "btree_gc.h" + #include "btree_update.h" + #include "btree_update_interior.h" +@@ -490,7 +491,7 @@ static int __bch2_move_data(struct bch_fs *c, + { + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); +- BKEY_PADDED(k) tmp; ++ struct bkey_on_stack sk; + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +@@ -499,6 +500,7 @@ static int __bch2_move_data(struct bch_fs *c, + u64 delay, cur_inum = U64_MAX; + int ret = 0, ret2; + ++ bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + stats->data_type = BCH_DATA_USER; +@@ -578,8 +580,9 @@ peek: + } + + /* unlock before doing IO: */ +- bkey_reassemble(&tmp.k, k); +- k = bkey_i_to_s_c(&tmp.k); ++ bkey_on_stack_realloc(&sk, c, k.k->u64s); ++ bkey_reassemble(sk.k, k); ++ k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + + ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, +@@ -606,6 +609,7 @@ next_nondata: + } + out: + ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&sk, c); + + return ret; + } +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 6e71c5e8f9a2..6d21086c3254 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -1,5 +1,6 @@ + // 
SPDX-License-Identifier: GPL-2.0 + #include "bcachefs.h" ++#include "bkey_on_stack.h" + #include "btree_update.h" + #include "extents.h" + #include "inode.h" +@@ -160,7 +161,8 @@ s64 bch2_remap_range(struct bch_fs *c, + struct btree_trans trans; + struct btree_iter *dst_iter, *src_iter; + struct bkey_s_c src_k; +- BKEY_PADDED(k) new_dst, new_src; ++ BKEY_PADDED(k) new_dst; ++ struct bkey_on_stack new_src; + struct bpos dst_end = dst_start, src_end = src_start; + struct bpos dst_want, src_want; + u64 src_done, dst_done; +@@ -183,6 +185,7 @@ s64 bch2_remap_range(struct bch_fs *c, + dst_end.offset += remap_sectors; + src_end.offset += remap_sectors; + ++ bkey_on_stack_init(&new_src); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + + src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, +@@ -222,14 +225,15 @@ s64 bch2_remap_range(struct bch_fs *c, + break; + + if (src_k.k->type == KEY_TYPE_extent) { +- bkey_reassemble(&new_src.k, src_k); +- src_k = bkey_i_to_s_c(&new_src.k); ++ bkey_on_stack_realloc(&new_src, c, src_k.k->u64s); ++ bkey_reassemble(new_src.k, src_k); ++ src_k = bkey_i_to_s_c(new_src.k); + +- bch2_cut_front(src_iter->pos, &new_src.k); +- bch2_cut_back(src_end, &new_src.k.k); ++ bch2_cut_front(src_iter->pos, new_src.k); ++ bch2_cut_back(src_end, &new_src.k->k); + + ret = bch2_make_extent_indirect(&trans, src_iter, +- bkey_i_to_extent(&new_src.k)); ++ bkey_i_to_extent(new_src.k)); + if (ret) + goto btree_err; + +@@ -299,6 +303,7 @@ err: + } while (ret2 == -EINTR); + + ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&new_src, c); + + percpu_ref_put(&c->writes); + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index f0af26bd328f..a0406740ab4e 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -467,6 +467,7 @@ static void bch2_fs_free(struct bch_fs *c) + free_percpu(c->usage[0]); + kfree(c->usage_base); + free_percpu(c->pcpu); ++ mempool_exit(&c->large_bkey_pool); + mempool_exit(&c->btree_bounce_pool); + bioset_exit(&c->btree_bio); + mempool_exit(&c->btree_interior_update_pool); +@@ -718,6 +719,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, + btree_bytes(c)) || ++ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || + bch2_io_clock_init(&c->io_clock[READ]) || + bch2_io_clock_init(&c->io_clock[WRITE]) || + bch2_fs_journal_init(&c->journal) || +-- +cgit v1.2.3 + + +From a8c2a9a44ec763b4001d283a3c653835cd7b51b0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 15 Nov 2019 20:40:15 -0500 +Subject: bcachefs: kill bch2_extent_has_device() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 15 ++++++++------- + fs/bcachefs/extents.h | 3 --- + fs/bcachefs/journal.c | 2 +- + fs/bcachefs/journal_io.c | 2 +- + 4 files changed, 10 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 51e00386f173..e6bca0d4918b 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -777,7 +777,6 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_s_extent e; +- struct bch_extent_ptr *ptr; + struct bkey_on_stack sk; + int ret = 0, dev, idx; + +@@ -791,6 +790,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { ++ struct bch_extent_ptr *ptr, *ec_ptr = NULL; ++ + if 
(extent_has_stripe_ptr(k, s->key.k.p.offset)) { + bch2_btree_iter_next(iter); + continue; +@@ -810,14 +811,14 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + bkey_reassemble(sk.k, k); + e = bkey_i_to_s_extent(sk.k); + +- extent_for_each_ptr(e, ptr) +- if (ptr->dev != dev) ++ extent_for_each_ptr(e, ptr) { ++ if (ptr->dev == dev) ++ ec_ptr = ptr; ++ else + ptr->cached = true; ++ } + +- ptr = (void *) bch2_extent_has_device(e.c, dev); +- BUG_ON(!ptr); +- +- extent_stripe_ptr_add(e, s, ptr, idx); ++ extent_stripe_ptr_add(e, s, ec_ptr, idx); + + bch2_trans_update(&trans, iter, sk.k); + +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index cc7ee9067b50..f334b6f763e3 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -447,9 +447,6 @@ void bch2_insert_fixup_extent(struct btree_trans *, + void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, + unsigned, unsigned); + +-const struct bch_extent_ptr * +-bch2_extent_has_device(struct bkey_s_c_extent, unsigned); +- + unsigned bch2_extent_is_compressed(struct bkey_s_c); + + bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 5c3e146e3942..9f03a479c9a2 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -945,7 +945,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) + w = j->buf + !state.idx; + + ret = state.prev_buf_unwritten && +- bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx); ++ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); + spin_unlock(&j->lock); + + return ret; +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 387377dadab5..7112a25d0600 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1100,7 +1100,7 @@ void bch2_journal_write(struct closure *cl) + + for_each_rw_member(ca, c, i) + if (journal_flushes_device(ca) && +- !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { ++ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; +-- +cgit v1.2.3 + + +From 2b4aed9e1f17b75a0bf8c4184bc3661f632351bd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 9 Nov 2019 23:50:52 -0500 +Subject: bcachefs: bkey noops + +For upcoming inline data extents, we're going to need to be able to +shorten the value of existing bkeys in the btree - and to make that work +we're going to be able to need to pad out the space the value previously +took up with something. + +This patch changes the various code that iterates over bkeys to handle +k->u64s == 0 as meaning "skip the next 8 bytes". 
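+
+A minimal sketch of the idea (not a helper added by this patch): when a
+value shrinks in place, the freed tail is zeroed, and each zeroed u64
+then reads as a key header with u64s == 0, which iteration skips:
+
+    unsigned old_u64s = k->u64s;
+
+    k->u64s = new_u64s;
+    /* pad the gap with 8 byte noops */
+    memset((u64 *) k + new_u64s, 0,
+           (old_u64s - new_u64s) * sizeof(u64));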
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.h | 10 +++++++ + fs/bcachefs/bkey_sort.c | 6 ++++- + fs/bcachefs/bset.c | 40 +++++++++++++++------------- + fs/bcachefs/bset.h | 7 ++++- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/btree_io.c | 53 +++++++++++++++---------------------- + fs/bcachefs/btree_update_interior.c | 34 +++++++++++++----------- + 7 files changed, 83 insertions(+), 69 deletions(-) + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index b26f4934b264..d3c39dc50b7f 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -33,6 +33,16 @@ struct bkey_s { + + #define bkey_next(_k) vstruct_next(_k) + ++static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, ++ struct bkey_packed *end) ++{ ++ k = bkey_next(k); ++ ++ while (k != end && !k->u64s) ++ k = (void *) ((u64 *) k + 1); ++ return k; ++} ++ + #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) + + static inline size_t bkey_val_bytes(const struct bkey *k) +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index f5c0507ad79d..5f9f3d2e6906 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -75,6 +75,10 @@ static void sort_key_next(struct btree_node_iter_large *iter, + { + i->k += __btree_node_offset_to_key(b, i->k)->u64s; + ++ while (i->k != i->end && ++ !__btree_node_offset_to_key(b, i->k)->u64s) ++ i->k++; ++ + if (i->k == i->end) + *i = iter->data[--iter->used]; + } +@@ -119,7 +123,7 @@ static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) + + static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) + { +- iter->data->k = bkey_next(iter->data->k); ++ iter->data->k = bkey_next_skip_noops(iter->data->k, iter->data->end); + + BUG_ON(iter->data->k > iter->data->end); + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 23493309ba63..b3e3a9c0ea13 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -76,7 +76,7 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) + for (_k = i->start, k = bkey_unpack_key(b, _k); + _k < vstruct_last(i); + _k = _n, k = n) { +- _n = bkey_next(_k); ++ _n = bkey_next_skip_noops(_k, vstruct_last(i)); + + bch2_bkey_to_text(&PBUF(buf), &k); + printk(KERN_ERR "block %u key %5u: %s\n", set, +@@ -144,9 +144,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) + struct btree_nr_keys nr = { 0 }; + + for_each_bset(b, t) +- for (k = btree_bkey_first(b, t); +- k != btree_bkey_last(b, t); +- k = bkey_next(k)) ++ bset_tree_for_each_key(b, t, k) + if (!bkey_whiteout(k)) + btree_keys_account_key_add(&nr, t - b->set, k); + +@@ -607,7 +605,7 @@ start: + rw_aux_tree(b, t)[j - 1].offset); + } + +- k = bkey_next(k); ++ k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + BUG_ON(k >= btree_bkey_last(b, t)); + } + } +@@ -798,9 +796,7 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) + rw_aux_tree(b, t)[0].offset = + __btree_node_key_to_offset(b, btree_bkey_first(b, t)); + +- for (k = btree_bkey_first(b, t); +- k != btree_bkey_last(b, t); +- k = bkey_next(k)) { ++ bset_tree_for_each_key(b, t, k) { + if (t->size == bset_rw_tree_capacity(b, t)) + break; + +@@ -833,7 +829,7 @@ retry: + /* First we figure out where the first key in each cacheline is */ + eytzinger1_for_each(j, t->size) { + while (bkey_to_cacheline(b, t, k) < cacheline) +- prev = k, k = bkey_next(k); ++ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + + if (k >= btree_bkey_last(b, t)) { + /* XXX: this path sucks */ +@@ -849,10 +845,10 @@ retry: + 
EBUG_ON(tree_to_bkey(b, t, j) != k); + } + +- while (bkey_next(k) != btree_bkey_last(b, t)) +- k = bkey_next(k); ++ while (k != btree_bkey_last(b, t)) ++ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + +- t->max_key = bkey_unpack_pos(b, k); ++ t->max_key = bkey_unpack_pos(b, prev); + + /* Then we build the tree */ + eytzinger1_for_each(j, t->size) +@@ -978,7 +974,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; + + while ((p = __bkey_prev(b, t, k)) && !ret) { +- for (i = p; i != k; i = bkey_next(i)) ++ for (i = p; i != k; i = bkey_next_skip_noops(i, k)) + if (i->type >= min_key_type) + ret = i; + +@@ -988,9 +984,11 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + if (btree_keys_expensive_checks(b)) { + BUG_ON(ret >= orig_k); + +- for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t); ++ for (i = ret ++ ? bkey_next_skip_noops(ret, orig_k) ++ : btree_bkey_first(b, t); + i != orig_k; +- i = bkey_next(i)) ++ i = bkey_next_skip_noops(i, orig_k)) + BUG_ON(i->type >= min_key_type); + } + +@@ -1025,7 +1023,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, + /* signal to make_bfloat() that they're uninitialized: */ + min_key.u64s = max_key.u64s = 0; + +- if (bkey_next(k) == btree_bkey_last(b, t)) { ++ if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { + t->max_key = bkey_unpack_pos(b, k); + + for (j = 1; j < t->size; j = j * 2 + 1) +@@ -1149,7 +1147,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, + struct bkey_packed *k = start; + + while (1) { +- k = bkey_next(k); ++ k = bkey_next_skip_noops(k, end); + if (k == end) + break; + +@@ -1398,12 +1396,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, + while (m != btree_bkey_last(b, t) && + bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, + m) > 0) +- m = bkey_next(m); ++ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + + if (!packed_search) + while (m != btree_bkey_last(b, t) && + bkey_iter_pos_cmp(b, search, m) > 0) +- m = bkey_next(m); ++ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + + if (btree_keys_expensive_checks(b)) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); +@@ -1637,6 +1635,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, + + EBUG_ON(iter->data->k > iter->data->end); + ++ while (!__btree_node_iter_set_end(iter, 0) && ++ !__bch2_btree_node_iter_peek_all(iter, b)->u64s) ++ iter->data->k++; ++ + if (unlikely(__btree_node_iter_set_end(iter, 0))) { + bch2_btree_node_iter_set_drop(iter, iter->data); + return; +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index ccc0866d6435..25f5f2ce722e 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -284,9 +284,14 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, + return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; + } + +-#define for_each_bset(_b, _t) \ ++#define for_each_bset(_b, _t) \ + for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + ++#define bset_tree_for_each_key(_b, _t, _k) \ ++ for (_k = btree_bkey_first(_b, _t); \ ++ _k != btree_bkey_last(_b, _t); \ ++ _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) ++ + static inline bool bset_has_ro_aux_tree(struct bset_tree *t) + { + return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index c4c2e1a3ee0e..8bbf60b07736 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ 
-922,7 +922,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + k < vstruct_last(s2) && + vstruct_blocks_plus(n1->data, c->block_bits, + u64s + k->u64s) <= blocks; +- k = bkey_next(k)) { ++ k = bkey_next_skip_noops(k, vstruct_last(s2))) { + last = k; + u64s += k->u64s; + } +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 591980d2011f..c345262d804b 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -26,34 +26,33 @@ static void verify_no_dups(struct btree *b, + struct bkey_packed *end) + { + #ifdef CONFIG_BCACHEFS_DEBUG +- struct bkey_packed *k; ++ struct bkey_packed *k, *p; ++ ++ if (start == end) ++ return; + +- for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) { +- struct bkey l = bkey_unpack_key(b, k); +- struct bkey r = bkey_unpack_key(b, bkey_next(k)); ++ for (p = start, k = bkey_next_skip_noops(start, end); ++ k != end; ++ p = k, k = bkey_next_skip_noops(k, end)) { ++ struct bkey l = bkey_unpack_key(b, p); ++ struct bkey r = bkey_unpack_key(b, k); + + BUG_ON(btree_node_is_extents(b) + ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 + : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); +- //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0); ++ //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); + } + #endif + } + +-static void clear_needs_whiteout(struct bset *i) +-{ +- struct bkey_packed *k; +- +- for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) +- k->needs_whiteout = false; +-} +- +-static void set_needs_whiteout(struct bset *i) ++static void set_needs_whiteout(struct bset *i, int v) + { + struct bkey_packed *k; + +- for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) +- k->needs_whiteout = true; ++ for (k = i->start; ++ k != vstruct_last(i); ++ k = bkey_next_skip_noops(k, vstruct_last(i))) ++ k->needs_whiteout = v; + } + + static void btree_bounce_free(struct bch_fs *c, unsigned order, +@@ -168,7 +167,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + out = i->start; + + for (k = start; k != end; k = n) { +- n = bkey_next(k); ++ n = bkey_next_skip_noops(k, end); + + if (bkey_deleted(k) && btree_node_is_extents(b)) + continue; +@@ -261,7 +260,7 @@ static bool bch2_drop_whiteouts(struct btree *b) + out = i->start; + + for (k = start; k != end; k = n) { +- n = bkey_next(k); ++ n = bkey_next_skip_noops(k, end); + + if (!bkey_whiteout(k)) { + bkey_copy(out, k); +@@ -680,14 +679,6 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + struct bkey tmp; + const char *invalid; + +- if (btree_err_on(!k->u64s, +- BTREE_ERR_FIXABLE, c, b, i, +- "KEY_U64s 0: %zu bytes of metadata lost", +- vstruct_end(i) - (void *) k)) { +- i->u64s = cpu_to_le16((u64 *) k - i->_data); +- break; +- } +- + if (btree_err_on(bkey_next(k) > vstruct_last(i), + BTREE_ERR_FIXABLE, c, b, i, + "key extends past end of bset")) { +@@ -756,7 +747,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + + prev_pos = u.k->p; + prev = k; +- k = bkey_next(k); ++ k = bkey_next_skip_noops(k, vstruct_last(i)); + } + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); +@@ -915,12 +906,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + continue; + } + +- k = bkey_next(k); ++ k = bkey_next_skip_noops(k, vstruct_last(i)); + } + + bch2_bset_build_aux_tree(b, b->set, false); + +- set_needs_whiteout(btree_bset_first(b)); ++ set_needs_whiteout(btree_bset_first(b), true); + + btree_node_reset_sib_u64s(b); + out: +@@ -1425,7 +1416,7 @@ void __bch2_btree_node_write(struct bch_fs *c, 
struct btree *b, + : bch2_sort_keys(i->start, &sort_iter, false); + le16_add_cpu(&i->u64s, u64s); + +- clear_needs_whiteout(i); ++ set_needs_whiteout(i, false); + + /* do we have data to write? */ + if (b->written && !i->u64s) +@@ -1579,7 +1570,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) + } + + for_each_bset(b, t) +- set_needs_whiteout(bset(b, t)); ++ set_needs_whiteout(bset(b, t), true); + + bch2_btree_verify(c, b); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 40d801e1094f..4c34b9da9d52 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -79,9 +79,7 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) + bch2_bkey_format_add_pos(s, b->data->min_key); + + for_each_bset(b, t) +- for (k = btree_bkey_first(b, t); +- k != btree_bkey_last(b, t); +- k = bkey_next(k)) ++ bset_tree_for_each_key(b, t, k) + if (!bkey_whiteout(k)) { + uk = bkey_unpack_key(b, k); + bch2_bkey_format_add_key(s, &uk); +@@ -1240,7 +1238,9 @@ static struct btree *__btree_split_node(struct btree_update *as, + */ + k = set1->start; + while (1) { +- if (bkey_next(k) == vstruct_last(set1)) ++ struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); ++ ++ if (n == vstruct_last(set1)) + break; + if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) + break; +@@ -1251,7 +1251,7 @@ static struct btree *__btree_split_node(struct btree_update *as, + nr_unpacked++; + + prev = k; +- k = bkey_next(k); ++ k = n; + } + + BUG_ON(!prev); +@@ -1315,7 +1315,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + { + struct btree_node_iter node_iter; + struct bkey_i *k = bch2_keylist_front(keys); +- struct bkey_packed *p; ++ struct bkey_packed *src, *dst, *n; + struct bset *i; + + BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); +@@ -1340,16 +1340,18 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + * for the pivot: + */ + i = btree_bset_first(b); +- p = i->start; +- while (p != vstruct_last(i)) +- if (bkey_deleted(p)) { +- le16_add_cpu(&i->u64s, -p->u64s); +- set_btree_bset_end(b, b->set); +- memmove_u64s_down(p, bkey_next(p), +- (u64 *) vstruct_last(i) - +- (u64 *) p); +- } else +- p = bkey_next(p); ++ src = dst = i->start; ++ while (src != vstruct_last(i)) { ++ n = bkey_next_skip_noops(src, vstruct_last(i)); ++ if (!bkey_deleted(src)) { ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ src = n; ++ } ++ ++ i->u64s = cpu_to_le16((u64 *) dst - i->_data); ++ set_btree_bset_end(b, b->set); + + BUG_ON(b->nsets != 1 || + b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); +-- +cgit v1.2.3 + + +From f0638c2706f58568a2ee0d804be671b786aa9f2a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 9 Nov 2019 19:02:48 -0500 +Subject: bcachefs: Rework of cut_front & cut_back + +This changes bch2_cut_front and bch2_cut_back so that they're able to +shorten the size of the value, and it also changes the extent update +path to update the accounting in the btree node when this happens. + +When the size of the value is shortened, they zero out the space that's +no longer used, so it's interpreted as noops (as implemented in the last +patch). 
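
The value-shortening itself is simple; what matters is zeroing the freed tail and reporting the size change. The sketch below is not bcachefs code (toy_shrink_val and its arguments are invented), but it models the pattern bch2_cut_front_s() and bch2_cut_back_s() follow here, with the returned delta playing the role consumed by btree_keys_account_val_delta():

    /*
     * Toy model of "shrink the value in place, zero the freed tail":
     * not bcachefs code, just the shape of bch2_cut_{front,back}_s().
     */
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Returns the (non-positive) change in size, in u64s. */
    int toy_shrink_val(uint64_t *val, unsigned old_u64s, unsigned new_u64s)
    {
        assert(new_u64s <= old_u64s);

        /* zero what the value no longer uses: it now reads as noops */
        memset(val + new_u64s, 0,
               (old_u64s - new_u64s) * sizeof(uint64_t));

        /* caller adds this to the node's live-u64s accounting */
        return (int) new_u64s - (int) old_u64s;
    }

Because the freed space is zeroed in place rather than compacted away, the key can shrink without moving anything around it, and the noop handling from the previous patch makes the leftover space harmless to iteration.
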
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 8 ++--- + fs/bcachefs/bset.h | 10 +++++++ + fs/bcachefs/btree_update_leaf.c | 2 +- + fs/bcachefs/extents.c | 65 +++++++++++++++++++++++++++++------------ + fs/bcachefs/extents.h | 10 +++++-- + fs/bcachefs/fs-io.c | 8 ++--- + fs/bcachefs/io.c | 7 ++--- + fs/bcachefs/move.c | 9 +++--- + fs/bcachefs/recovery.c | 6 ++-- + fs/bcachefs/reflink.c | 4 +-- + 10 files changed, 84 insertions(+), 45 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 5f9f3d2e6906..daef8e5c599f 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -350,7 +350,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + if (bkey_cmp(l.k->p, r.k->p) >= 0) { + sort_key_next(iter, b, _r); + } else { +- __bch2_cut_front(l.k->p, r); ++ bch2_cut_front_s(l.k->p, r); + extent_save(b, rk, r.k); + } + +@@ -362,9 +362,9 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + * r wins, but it overlaps in the middle of l - split l: + */ + bkey_reassemble(split.k, l.s_c); +- bch2_cut_back(bkey_start_pos(r.k), &split.k->k); ++ bch2_cut_back(bkey_start_pos(r.k), split.k); + +- __bch2_cut_front(r.k->p, l); ++ bch2_cut_front_s(r.k->p, l); + extent_save(b, lk, l.k); + + extent_sort_sift(iter, b, 0); +@@ -372,7 +372,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + extent_sort_append(c, f, &nr, dst->start, + &prev, bkey_i_to_s(split.k)); + } else { +- bch2_cut_back(bkey_start_pos(r.k), l.k); ++ bch2_cut_back_s(bkey_start_pos(r.k), l); + extent_save(b, lk, l.k); + } + } +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 25f5f2ce722e..2653a74b3b14 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -569,6 +569,16 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n, + n->unpacked_keys += sign; + } + ++static inline void btree_keys_account_val_delta(struct btree *b, ++ struct bkey_packed *k, ++ int delta) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ ++ b->nr.live_u64s += delta; ++ b->nr.bset_u64s[t - b->set] += delta; ++} ++ + #define btree_keys_account_key_add(_nr, _bset_idx, _k) \ + btree_keys_account_key(_nr, _bset_idx, _k, 1) + #define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5f5574ecc176..26bf9b42bbbd 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -883,7 +883,7 @@ retry: + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); +- bch2_cut_back(end, &delete.k); ++ bch2_cut_back(end, &delete); + + ret = bch2_extent_trim_atomic(&delete, iter); + if (ret) +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 12799f7ba1db..8410c2d19031 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -721,12 +721,14 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + + /* Extents */ + +-void __bch2_cut_front(struct bpos where, struct bkey_s k) ++int bch2_cut_front_s(struct bpos where, struct bkey_s k) + { ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; + u64 sub; + + if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) +- return; ++ return 0; + + EBUG_ON(bkey_cmp(where, k.k->p) > 0); + +@@ -734,8 +736,10 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) + + k.k->size -= sub; + +- if (!k.k->size) ++ if (!k.k->size) { + k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } + + switch 
(k.k->type) { + case KEY_TYPE_deleted: +@@ -785,26 +789,42 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) + default: + BUG(); + } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; + } + +-bool bch2_cut_back(struct bpos where, struct bkey *k) ++int bch2_cut_back_s(struct bpos where, struct bkey_s k) + { ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; + u64 len = 0; + +- if (bkey_cmp(where, k->p) >= 0) +- return false; ++ if (bkey_cmp(where, k.k->p) >= 0) ++ return 0; + +- EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); ++ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); + +- len = where.offset - bkey_start_offset(k); ++ len = where.offset - bkey_start_offset(k.k); + +- k->p = where; +- k->size = len; ++ k.k->p = where; ++ k.k->size = len; + +- if (!len) +- k->type = KEY_TYPE_deleted; ++ if (!len) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); + +- return true; ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; + } + + static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) +@@ -943,7 +963,7 @@ int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) + if (ret) + return ret; + +- bch2_cut_back(end, &k->k); ++ bch2_cut_back(end, k); + return 0; + } + +@@ -1086,11 +1106,14 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, + enum bch_extent_overlap overlap) + { + struct btree_iter_level *l = &iter->l[0]; ++ int u64s_delta; + + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + /* insert overlaps with start of k: */ +- __bch2_cut_front(insert->k.p, k); ++ u64s_delta = bch2_cut_front_s(insert->k.p, k); ++ btree_keys_account_val_delta(l->b, _k, u64s_delta); ++ + EBUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); +@@ -1098,7 +1121,9 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, + + case BCH_EXTENT_OVERLAP_BACK: + /* insert overlaps with end of k: */ +- bch2_cut_back(bkey_start_pos(&insert->k), k.k); ++ u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); ++ btree_keys_account_val_delta(l->b, _k, u64s_delta); ++ + EBUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + +@@ -1156,10 +1181,12 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, + bkey_reassemble(split.k, k.s_c); + split.k->k.needs_whiteout |= bkey_written(l->b, _k); + +- bch2_cut_back(bkey_start_pos(&insert->k), &split.k->k); ++ bch2_cut_back(bkey_start_pos(&insert->k), split.k); + BUG_ON(bkey_deleted(&split.k->k)); + +- __bch2_cut_front(insert->k.p, k); ++ u64s_delta = bch2_cut_front_s(insert->k.p, k); ++ btree_keys_account_val_delta(l->b, _k, u64s_delta); ++ + BUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); +@@ -1749,7 +1776,7 @@ enum merge_result bch2_reservation_merge(struct bch_fs *c, + + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); +- __bch2_cut_front(l.k->p, r.s); ++ bch2_cut_front_s(l.k->p, r.s); + return BCH_MERGE_PARTIAL; + } + +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index f334b6f763e3..6e893c37c287 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -534,14 +534,18 @@ do { \ + } \ 
+ } while (0) + +-void __bch2_cut_front(struct bpos, struct bkey_s); ++int bch2_cut_front_s(struct bpos, struct bkey_s); ++int bch2_cut_back_s(struct bpos, struct bkey_s); + + static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) + { +- __bch2_cut_front(where, bkey_i_to_s(k)); ++ bch2_cut_front_s(where, bkey_i_to_s(k)); + } + +-bool bch2_cut_back(struct bpos, struct bkey *); ++static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_back_s(where, bkey_i_to_s(k)); ++} + + /** + * bch_key_resize - adjust size of @k +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index a94e016b5355..0f4e251e0c5f 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2539,7 +2539,7 @@ reassemble: + move_pos.offset -= shift >> 9; + goto reassemble; + } else { +- bch2_cut_back(atomic_end, ©.k->k); ++ bch2_cut_back(atomic_end, copy.k); + } + } + +@@ -2559,7 +2559,7 @@ reassemble: + */ + if (insert && + bkey_cmp(bkey_start_pos(©.k->k), delete.k.p) < 0) { +- bch2_cut_back(bkey_start_pos(©.k->k), &delete.k); ++ bch2_cut_back(bkey_start_pos(©.k->k), &delete); + } else if (!insert && + bkey_cmp(copy.k->k.p, + bkey_start_pos(&delete.k)) > 0) { +@@ -2706,8 +2706,8 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + reservation.k.p = k.k->p; + reservation.k.size = k.k->size; + +- bch2_cut_front(iter->pos, &reservation.k_i); +- bch2_cut_back(end_pos, &reservation.k); ++ bch2_cut_front(iter->pos, &reservation.k_i); ++ bch2_cut_back(end_pos, &reservation.k_i); + + sectors = reservation.k.size; + reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 7b713bc10d1a..4a5355942e02 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -335,7 +335,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); +- bch2_cut_back(end, &delete.k); ++ bch2_cut_back(end, &delete); + + bch2_trans_begin_updates(trans); + +@@ -404,6 +404,7 @@ int bch2_write_index_default(struct bch_write_op *op) + + bkey_on_stack_realloc(&sk, c, k->k.u64s); + bkey_copy(sk.k, k); ++ bch2_cut_front(iter->pos, sk.k); + + bch2_trans_begin_updates(&trans); + +@@ -415,9 +416,7 @@ int bch2_write_index_default(struct bch_write_op *op) + if (ret) + break; + +- if (bkey_cmp(iter->pos, k->k.p) < 0) +- bch2_cut_front(iter->pos, k); +- else ++ if (bkey_cmp(iter->pos, k->k.p) >= 0) + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 87ff04bdfc73..15547e149b3e 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -97,10 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + + bkey_copy(&_new.k, bch2_keylist_front(keys)); + new = bkey_i_to_extent(&_new.k); ++ bch2_cut_front(iter->pos, &new->k_i); + +- bch2_cut_front(iter->pos, insert); +- bch2_cut_back(new->k.p, &insert->k); +- bch2_cut_back(insert->k.p, &new->k); ++ bch2_cut_front(iter->pos, insert); ++ bch2_cut_back(new->k.p, insert); ++ bch2_cut_back(insert->k.p, &new->k_i); + + if (m->data_cmd == DATA_REWRITE) + bch2_bkey_drop_device(bkey_i_to_s(insert), +@@ -169,8 +170,6 @@ next: + if (bch2_keylist_empty(keys)) + goto out; + } +- +- bch2_cut_front(iter->pos, bch2_keylist_front(keys)); + continue; + nomatch: + if (m->ctxt) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index d1184bf62cae..2efe023b2f0d 100644 +--- a/fs/bcachefs/recovery.c ++++ 
b/fs/bcachefs/recovery.c +@@ -177,7 +177,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: + cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { + if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { +- bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k); ++ bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k); + } else { + struct bkey_i *split = + kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); +@@ -186,7 +186,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + goto err; + + bkey_copy(split, i[0].k); +- bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k); ++ bch2_cut_back(bkey_start_pos(&i[1].k->k), split); + keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { + .btree_id = i[0].btree_id, + .allocated = true, +@@ -298,7 +298,7 @@ retry: + + bkey_copy(split, k); + bch2_cut_front(split_iter->pos, split); +- bch2_cut_back(atomic_end, &split->k); ++ bch2_cut_back(atomic_end, split); + + bch2_trans_update(&trans, split_iter, split); + bch2_btree_iter_set_pos(iter, split->k.p); +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 6d21086c3254..4de65bf70362 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -40,7 +40,7 @@ enum merge_result bch2_reflink_p_merge(struct bch_fs *c, + + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); +- __bch2_cut_front(l.k->p, _r); ++ bch2_cut_front_s(l.k->p, _r); + return BCH_MERGE_PARTIAL; + } + +@@ -230,7 +230,7 @@ s64 bch2_remap_range(struct bch_fs *c, + src_k = bkey_i_to_s_c(new_src.k); + + bch2_cut_front(src_iter->pos, new_src.k); +- bch2_cut_back(src_end, &new_src.k->k); ++ bch2_cut_back(src_end, new_src.k); + + ret = bch2_make_extent_indirect(&trans, src_iter, + bkey_i_to_extent(new_src.k)); +-- +cgit v1.2.3 + + +From d9a46f2690c9fdbb13ae59fc92edfd9898df4b3e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 15 Nov 2019 15:52:28 -0500 +Subject: bcachefs: Split out extent_update.c + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/Makefile | 1 + + fs/bcachefs/btree_update_leaf.c | 2 +- + fs/bcachefs/extent_update.c | 532 ++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/extent_update.h | 18 ++ + fs/bcachefs/extents.c | 525 +-------------------------------------- + fs/bcachefs/extents.h | 11 - + fs/bcachefs/fs-io.c | 1 + + fs/bcachefs/io.c | 2 +- + 8 files changed, 555 insertions(+), 537 deletions(-) + create mode 100644 fs/bcachefs/extent_update.c + create mode 100644 fs/bcachefs/extent_update.h + +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index b199da94f311..c7727d05cf49 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -26,6 +26,7 @@ bcachefs-y := \ + ec.o \ + error.o \ + extents.o \ ++ extent_update.o \ + fs.o \ + fs-common.o \ + fs-ioctl.o \ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 26bf9b42bbbd..d37a95299240 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -10,7 +10,7 @@ + #include "buckets.h" + #include "debug.h" + #include "error.h" +-#include "extents.h" ++#include "extent_update.h" + #include "journal.h" + #include "journal_reclaim.h" + #include "keylist.h" +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +new file mode 100644 +index 000000000000..21426e01c395 +--- /dev/null ++++ b/fs/bcachefs/extent_update.c +@@ -0,0 +1,532 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include 
"bkey_on_stack.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "debug.h" ++#include "extents.h" ++#include "extent_update.h" ++ ++/* ++ * This counts the number of iterators to the alloc & ec btrees we'll need ++ * inserting/removing this extent: ++ */ ++static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ unsigned ret = 0; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ret++; ++ } ++ } ++ ++ return ret; ++} ++ ++static int count_iters_for_insert(struct btree_trans *trans, ++ struct bkey_s_c k, ++ unsigned offset, ++ struct bpos *end, ++ unsigned *nr_iters, ++ unsigned max_iters, ++ bool overwrite) ++{ ++ int ret = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ break; ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx = le64_to_cpu(p.v->idx); ++ unsigned sectors = bpos_min(*end, p.k->p).offset - ++ bkey_start_offset(p.k); ++ struct btree_iter *iter; ++ struct bkey_s_c r_k; ++ ++ for_each_btree_key(trans, iter, ++ BTREE_ID_REFLINK, POS(0, idx + offset), ++ BTREE_ITER_SLOTS, r_k, ret) { ++ if (bkey_cmp(bkey_start_pos(r_k.k), ++ POS(0, idx + sectors)) >= 0) ++ break; ++ ++ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); ++ ++ if (*nr_iters >= max_iters) { ++ struct bpos pos = bkey_start_pos(k.k); ++ pos.offset += r_k.k->p.offset - idx; ++ ++ *end = bpos_min(*end, pos); ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) ++ ++int bch2_extent_atomic_end(struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bpos *end) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey_packed *_k; ++ unsigned nr_iters = 0; ++ int ret; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ b = iter->l[0].b; ++ node_iter = iter->l[0].iter; ++ ++ BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); ++ ++ *end = bpos_min(insert->k.p, b->key.k.p); ++ ++ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, ++ &nr_iters, EXTENT_ITERS_MAX / 2, false); ++ if (ret < 0) ++ return ret; ++ ++ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, ++ KEY_TYPE_discard))) { ++ struct bkey unpacked; ++ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ++ unsigned offset = 0; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_start_pos(k.k)) > 0) ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ ++ ret = count_iters_for_insert(trans, k, offset, end, ++ &nr_iters, EXTENT_ITERS_MAX, true); ++ if (ret) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return ret < 0 ? 
ret : 0; ++} ++ ++int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(iter, k, &end); ++ if (ret) ++ return ret; ++ ++ bch2_cut_back(end, k); ++ return 0; ++} ++ ++int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(iter, k, &end); ++ if (ret) ++ return ret; ++ ++ return !bkey_cmp(end, k->k.p); ++} ++ ++enum btree_insert_ret ++bch2_extent_can_insert(struct btree_trans *trans, ++ struct btree_insert_entry *insert, ++ unsigned *u64s) ++{ ++ struct btree_iter_level *l = &insert->iter->l[0]; ++ struct btree_node_iter node_iter = l->iter; ++ enum bch_extent_overlap overlap; ++ struct bkey_packed *_k; ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ int sectors; ++ ++ /* ++ * We avoid creating whiteouts whenever possible when deleting, but ++ * those optimizations mean we may potentially insert two whiteouts ++ * instead of one (when we overlap with the front of one extent and the ++ * back of another): ++ */ ++ if (bkey_whiteout(&insert->k->k)) ++ *u64s += BKEY_U64s; ++ ++ _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, ++ KEY_TYPE_discard); ++ if (!_k) ++ return BTREE_INSERT_OK; ++ ++ k = bkey_disassemble(l->b, _k, &unpacked); ++ ++ overlap = bch2_extent_overlap(&insert->k->k, k.k); ++ ++ /* account for having to split existing extent: */ ++ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) ++ *u64s += _k->u64s; ++ ++ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && ++ (sectors = bch2_extent_is_compressed(k))) { ++ int flags = trans->flags & BTREE_INSERT_NOFAIL ++ ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ ++ switch (bch2_disk_reservation_add(trans->c, ++ trans->disk_res, ++ sectors, flags)) { ++ case 0: ++ break; ++ case -ENOSPC: ++ return BTREE_INSERT_ENOSPC; ++ default: ++ BUG(); ++ } ++ } ++ ++ return BTREE_INSERT_OK; ++} ++ ++static void verify_extent_nonoverlapping(struct bch_fs *c, ++ struct btree *b, ++ struct btree_node_iter *_iter, ++ struct bkey_i *insert) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree_node_iter iter; ++ struct bkey_packed *k; ++ struct bkey uk; ++ ++ if (!expensive_debug_checks(c)) ++ return; ++ ++ iter = *_iter; ++ k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); ++ BUG_ON(k && ++ (uk = bkey_unpack_key(b, k), ++ bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); ++ ++ iter = *_iter; ++ k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); ++#if 0 ++ BUG_ON(k && ++ (uk = bkey_unpack_key(b, k), ++ bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); ++#else ++ if (k && ++ (uk = bkey_unpack_key(b, k), ++ bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_bkey_to_text(&PBUF(buf1), &insert->k); ++ bch2_bkey_to_text(&PBUF(buf2), &uk); ++ ++ bch2_dump_btree_node(b); ++ panic("insert > next :\n" ++ "insert %s\n" ++ "next %s\n", ++ buf1, buf2); ++ } ++#endif ++ ++#endif ++} ++ ++static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_packed *k = ++ bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); ++ ++ BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); ++ ++ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); ++ verify_extent_nonoverlapping(c, l->b, &l->iter, insert); ++ ++ if (debug_check_bkeys(c)) ++ bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); ++ ++ bch2_bset_insert(l->b, 
&l->iter, k, insert, 0); ++ bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); ++} ++ ++static void ++extent_squash(struct bch_fs *c, struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bkey_packed *_k, struct bkey_s k, ++ enum bch_extent_overlap overlap) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ int u64s_delta; ++ ++ switch (overlap) { ++ case BCH_EXTENT_OVERLAP_FRONT: ++ /* insert overlaps with start of k: */ ++ u64s_delta = bch2_cut_front_s(insert->k.p, k); ++ btree_keys_account_val_delta(l->b, _k, u64s_delta); ++ ++ EBUG_ON(bkey_deleted(k.k)); ++ extent_save(l->b, _k, k.k); ++ bch2_btree_iter_fix_key_modified(iter, l->b, _k); ++ break; ++ ++ case BCH_EXTENT_OVERLAP_BACK: ++ /* insert overlaps with end of k: */ ++ u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); ++ btree_keys_account_val_delta(l->b, _k, u64s_delta); ++ ++ EBUG_ON(bkey_deleted(k.k)); ++ extent_save(l->b, _k, k.k); ++ ++ /* ++ * As the auxiliary tree is indexed by the end of the ++ * key and we've just changed the end, update the ++ * auxiliary tree. ++ */ ++ bch2_bset_fix_invalidated_key(l->b, _k); ++ bch2_btree_node_iter_fix(iter, l->b, &l->iter, ++ _k, _k->u64s, _k->u64s); ++ break; ++ ++ case BCH_EXTENT_OVERLAP_ALL: { ++ /* The insert key completely covers k, invalidate k */ ++ if (!bkey_whiteout(k.k)) ++ btree_account_key_drop(l->b, _k); ++ ++ k.k->size = 0; ++ k.k->type = KEY_TYPE_deleted; ++ ++ if (_k >= btree_bset_last(l->b)->start) { ++ unsigned u64s = _k->u64s; ++ ++ bch2_bset_delete(l->b, _k, _k->u64s); ++ bch2_btree_node_iter_fix(iter, l->b, &l->iter, ++ _k, u64s, 0); ++ } else { ++ extent_save(l->b, _k, k.k); ++ bch2_btree_iter_fix_key_modified(iter, l->b, _k); ++ } ++ ++ break; ++ } ++ case BCH_EXTENT_OVERLAP_MIDDLE: { ++ struct bkey_on_stack split; ++ ++ bkey_on_stack_init(&split); ++ bkey_on_stack_realloc(&split, c, k.k->u64s); ++ ++ /* ++ * The insert key falls 'in the middle' of k ++ * The insert key splits k in 3: ++ * - start only in k, preserve ++ * - middle common section, invalidate in k ++ * - end only in k, preserve ++ * ++ * We update the old key to preserve the start, ++ * insert will be the new common section, ++ * we manually insert the end that we are preserving. ++ * ++ * modify k _before_ doing the insert (which will move ++ * what k points to) ++ */ ++ bkey_reassemble(split.k, k.s_c); ++ split.k->k.needs_whiteout |= bkey_written(l->b, _k); ++ ++ bch2_cut_back(bkey_start_pos(&insert->k), split.k); ++ BUG_ON(bkey_deleted(&split.k->k)); ++ ++ u64s_delta = bch2_cut_front_s(insert->k.p, k); ++ btree_keys_account_val_delta(l->b, _k, u64s_delta); ++ ++ BUG_ON(bkey_deleted(k.k)); ++ extent_save(l->b, _k, k.k); ++ bch2_btree_iter_fix_key_modified(iter, l->b, _k); ++ ++ extent_bset_insert(c, iter, split.k); ++ bkey_on_stack_exit(&split, c); ++ break; ++ } ++ } ++} ++ ++/** ++ * bch_extent_insert_fixup - insert a new extent and deal with overlaps ++ * ++ * this may result in not actually doing the insert, or inserting some subset ++ * of the insert key. For cmpxchg operations this is where that logic lives. ++ * ++ * All subsets of @insert that need to be inserted are inserted using ++ * bch2_btree_insert_and_journal(). If @b or @res fills up, this function ++ * returns false, setting @iter->pos for the prefix of @insert that actually got ++ * inserted. ++ * ++ * BSET INVARIANTS: this function is responsible for maintaining all the ++ * invariants for bsets of extents in memory. 
things get really hairy with 0 ++ * size extents ++ * ++ * within one bset: ++ * ++ * bkey_start_pos(bkey_next(k)) >= k ++ * or bkey_start_offset(bkey_next(k)) >= k->offset ++ * ++ * i.e. strict ordering, no overlapping extents. ++ * ++ * multiple bsets (i.e. full btree node): ++ * ++ * ∀ k, j ++ * k.size != 0 ∧ j.size != 0 → ++ * ¬ (k > bkey_start_pos(j) ∧ k < j) ++ * ++ * i.e. no two overlapping keys _of nonzero size_ ++ * ++ * We can't realistically maintain this invariant for zero size keys because of ++ * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j ++ * there may be another 0 size key between them in another bset, and it will ++ * thus overlap with the merged key. ++ * ++ * In addition, the end of iter->pos indicates how much has been processed. ++ * If the end of iter->pos is not the same as the end of insert, then ++ * key insertion needs to continue/be retried. ++ */ ++void bch2_insert_fixup_extent(struct btree_trans *trans, ++ struct btree_insert_entry *insert_entry) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter = insert_entry->iter; ++ struct bkey_i *insert = insert_entry->k; ++ struct btree_iter_level *l = &iter->l[0]; ++ struct btree_node_iter node_iter = l->iter; ++ bool deleting = bkey_whiteout(&insert->k); ++ bool update_journal = !deleting; ++ bool update_btree = !deleting; ++ struct bkey_i whiteout = *insert; ++ struct bkey_packed *_k; ++ struct bkey unpacked; ++ ++ EBUG_ON(iter->level); ++ EBUG_ON(!insert->k.size); ++ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); ++ ++ while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, ++ KEY_TYPE_discard))) { ++ struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); ++ struct bpos cur_end = bpos_min(insert->k.p, k.k->p); ++ enum bch_extent_overlap overlap = ++ bch2_extent_overlap(&insert->k, k.k); ++ ++ if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) ++ break; ++ ++ if (!bkey_whiteout(k.k)) ++ update_journal = true; ++ ++ if (!update_journal) { ++ bch2_cut_front(cur_end, insert); ++ bch2_cut_front(cur_end, &whiteout); ++ bch2_btree_iter_set_pos_same_leaf(iter, cur_end); ++ goto next; ++ } ++ ++ /* ++ * When deleting, if possible just do it by switching the type ++ * of the key we're deleting, instead of creating and inserting ++ * a new whiteout: ++ */ ++ if (deleting && ++ !update_btree && ++ !bkey_cmp(insert->k.p, k.k->p) && ++ !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { ++ if (!bkey_whiteout(k.k)) { ++ btree_account_key_drop(l->b, _k); ++ _k->type = KEY_TYPE_discard; ++ reserve_whiteout(l->b, _k); ++ bch2_btree_iter_fix_key_modified(iter, ++ l->b, _k); ++ } ++ break; ++ } ++ ++ if (k.k->needs_whiteout || bkey_written(l->b, _k)) { ++ insert->k.needs_whiteout = true; ++ update_btree = true; ++ } ++ ++ if (update_btree && ++ overlap == BCH_EXTENT_OVERLAP_ALL && ++ bkey_whiteout(k.k) && ++ k.k->needs_whiteout) { ++ unreserve_whiteout(l->b, _k); ++ _k->needs_whiteout = false; ++ } ++ ++ extent_squash(c, iter, insert, _k, k, overlap); ++ ++ if (!update_btree) ++ bch2_cut_front(cur_end, insert); ++next: ++ node_iter = l->iter; ++ ++ if (overlap == BCH_EXTENT_OVERLAP_FRONT || ++ overlap == BCH_EXTENT_OVERLAP_MIDDLE) ++ break; ++ } ++ ++ l->iter = node_iter; ++ bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); ++ ++ if (update_btree) { ++ if (deleting) ++ insert->k.type = KEY_TYPE_discard; ++ ++ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); ++ ++ extent_bset_insert(c, iter, insert); ++ } ++ ++ if (update_journal) { ++ struct bkey_i 
*k = !deleting ? insert : &whiteout; ++ ++ if (deleting) ++ k->k.type = KEY_TYPE_discard; ++ ++ EBUG_ON(bkey_deleted(&k->k) || !k->k.size); ++ ++ bch2_btree_journal_key(trans, iter, k); ++ } ++ ++ bch2_cut_front(insert->k.p, insert); ++} +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +new file mode 100644 +index 000000000000..89d18e4b6758 +--- /dev/null ++++ b/fs/bcachefs/extent_update.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENT_UPDATE_H ++#define _BCACHEFS_EXTENT_UPDATE_H ++ ++#include "bcachefs.h" ++ ++int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, ++ struct bpos *); ++int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); ++int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); ++ ++enum btree_insert_ret ++bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, ++ unsigned *); ++void bch2_insert_fixup_extent(struct btree_trans *, ++ struct btree_insert_entry *); ++ ++#endif /* _BCACHEFS_EXTENT_UPDATE_H */ +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 8410c2d19031..41dec5cd46d1 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -8,14 +8,11 @@ + + #include "bcachefs.h" + #include "bkey_methods.h" +-#include "bkey_on_stack.h" + #include "btree_gc.h" +-#include "btree_update.h" +-#include "btree_update_interior.h" ++#include "btree_iter.h" + #include "buckets.h" + #include "checksum.h" + #include "debug.h" +-#include "dirent.h" + #include "disk_groups.h" + #include "error.h" + #include "extents.h" +@@ -25,7 +22,6 @@ + #include "super.h" + #include "super-io.h" + #include "util.h" +-#include "xattr.h" + + #include + +@@ -827,525 +823,6 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) + return -val_u64s_delta; + } + +-static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) +-{ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- unsigned ret = 0; +- +- bkey_extent_entry_for_each(ptrs, entry) { +- switch (__extent_entry_type(entry)) { +- case BCH_EXTENT_ENTRY_ptr: +- case BCH_EXTENT_ENTRY_stripe_ptr: +- ret++; +- } +- } +- +- return ret; +-} +- +-static int count_iters_for_insert(struct btree_trans *trans, +- struct bkey_s_c k, +- unsigned offset, +- struct bpos *end, +- unsigned *nr_iters, +- unsigned max_iters, +- bool overwrite) +-{ +- int ret = 0; +- +- switch (k.k->type) { +- case KEY_TYPE_extent: +- case KEY_TYPE_reflink_v: +- *nr_iters += bch2_bkey_nr_alloc_ptrs(k); +- +- if (*nr_iters >= max_iters) { +- *end = bpos_min(*end, k.k->p); +- ret = 1; +- } +- +- break; +- case KEY_TYPE_reflink_p: { +- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); +- u64 idx = le64_to_cpu(p.v->idx); +- unsigned sectors = bpos_min(*end, p.k->p).offset - +- bkey_start_offset(p.k); +- struct btree_iter *iter; +- struct bkey_s_c r_k; +- +- for_each_btree_key(trans, iter, +- BTREE_ID_REFLINK, POS(0, idx + offset), +- BTREE_ITER_SLOTS, r_k, ret) { +- if (bkey_cmp(bkey_start_pos(r_k.k), +- POS(0, idx + sectors)) >= 0) +- break; +- +- *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); +- +- if (*nr_iters >= max_iters) { +- struct bpos pos = bkey_start_pos(k.k); +- pos.offset += r_k.k->p.offset - idx; +- +- *end = bpos_min(*end, pos); +- ret = 1; +- break; +- } +- } +- +- bch2_trans_iter_put(trans, iter); +- break; +- } +- } +- +- return ret; +-} +- +-#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) +- +-int bch2_extent_atomic_end(struct btree_iter *iter, +- struct bkey_i *insert, +- struct bpos 
*end) +-{ +- struct btree_trans *trans = iter->trans; +- struct btree *b; +- struct btree_node_iter node_iter; +- struct bkey_packed *_k; +- unsigned nr_iters = 0; +- int ret; +- +- ret = bch2_btree_iter_traverse(iter); +- if (ret) +- return ret; +- +- b = iter->l[0].b; +- node_iter = iter->l[0].iter; +- +- BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); +- +- *end = bpos_min(insert->k.p, b->key.k.p); +- +- ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, +- &nr_iters, EXTENT_ITERS_MAX / 2, false); +- if (ret < 0) +- return ret; +- +- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, +- KEY_TYPE_discard))) { +- struct bkey unpacked; +- struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); +- unsigned offset = 0; +- +- if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) +- break; +- +- if (bkey_cmp(bkey_start_pos(&insert->k), +- bkey_start_pos(k.k)) > 0) +- offset = bkey_start_offset(&insert->k) - +- bkey_start_offset(k.k); +- +- ret = count_iters_for_insert(trans, k, offset, end, +- &nr_iters, EXTENT_ITERS_MAX, true); +- if (ret) +- break; +- +- bch2_btree_node_iter_advance(&node_iter, b); +- } +- +- return ret < 0 ? ret : 0; +-} +- +-int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +-{ +- struct bpos end; +- int ret; +- +- ret = bch2_extent_atomic_end(iter, k, &end); +- if (ret) +- return ret; +- +- bch2_cut_back(end, k); +- return 0; +-} +- +-int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) +-{ +- struct bpos end; +- int ret; +- +- ret = bch2_extent_atomic_end(iter, k, &end); +- if (ret) +- return ret; +- +- return !bkey_cmp(end, k->k.p); +-} +- +-enum btree_insert_ret +-bch2_extent_can_insert(struct btree_trans *trans, +- struct btree_insert_entry *insert, +- unsigned *u64s) +-{ +- struct btree_iter_level *l = &insert->iter->l[0]; +- struct btree_node_iter node_iter = l->iter; +- enum bch_extent_overlap overlap; +- struct bkey_packed *_k; +- struct bkey unpacked; +- struct bkey_s_c k; +- int sectors; +- +- /* +- * We avoid creating whiteouts whenever possible when deleting, but +- * those optimizations mean we may potentially insert two whiteouts +- * instead of one (when we overlap with the front of one extent and the +- * back of another): +- */ +- if (bkey_whiteout(&insert->k->k)) +- *u64s += BKEY_U64s; +- +- _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, +- KEY_TYPE_discard); +- if (!_k) +- return BTREE_INSERT_OK; +- +- k = bkey_disassemble(l->b, _k, &unpacked); +- +- overlap = bch2_extent_overlap(&insert->k->k, k.k); +- +- /* account for having to split existing extent: */ +- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) +- *u64s += _k->u64s; +- +- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && +- (sectors = bch2_extent_is_compressed(k))) { +- int flags = trans->flags & BTREE_INSERT_NOFAIL +- ? 
BCH_DISK_RESERVATION_NOFAIL : 0; +- +- switch (bch2_disk_reservation_add(trans->c, +- trans->disk_res, +- sectors, flags)) { +- case 0: +- break; +- case -ENOSPC: +- return BTREE_INSERT_ENOSPC; +- default: +- BUG(); +- } +- } +- +- return BTREE_INSERT_OK; +-} +- +-static void verify_extent_nonoverlapping(struct bch_fs *c, +- struct btree *b, +- struct btree_node_iter *_iter, +- struct bkey_i *insert) +-{ +-#ifdef CONFIG_BCACHEFS_DEBUG +- struct btree_node_iter iter; +- struct bkey_packed *k; +- struct bkey uk; +- +- if (!expensive_debug_checks(c)) +- return; +- +- iter = *_iter; +- k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); +- BUG_ON(k && +- (uk = bkey_unpack_key(b, k), +- bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); +- +- iter = *_iter; +- k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); +-#if 0 +- BUG_ON(k && +- (uk = bkey_unpack_key(b, k), +- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +-#else +- if (k && +- (uk = bkey_unpack_key(b, k), +- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { +- char buf1[100]; +- char buf2[100]; +- +- bch2_bkey_to_text(&PBUF(buf1), &insert->k); +- bch2_bkey_to_text(&PBUF(buf2), &uk); +- +- bch2_dump_btree_node(b); +- panic("insert > next :\n" +- "insert %s\n" +- "next %s\n", +- buf1, buf2); +- } +-#endif +- +-#endif +-} +- +-static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, +- struct bkey_i *insert) +-{ +- struct btree_iter_level *l = &iter->l[0]; +- struct bkey_packed *k = +- bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); +- +- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); +- +- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); +- verify_extent_nonoverlapping(c, l->b, &l->iter, insert); +- +- if (debug_check_bkeys(c)) +- bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); +- +- bch2_bset_insert(l->b, &l->iter, k, insert, 0); +- bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); +-} +- +-static void +-extent_squash(struct bch_fs *c, struct btree_iter *iter, +- struct bkey_i *insert, +- struct bkey_packed *_k, struct bkey_s k, +- enum bch_extent_overlap overlap) +-{ +- struct btree_iter_level *l = &iter->l[0]; +- int u64s_delta; +- +- switch (overlap) { +- case BCH_EXTENT_OVERLAP_FRONT: +- /* insert overlaps with start of k: */ +- u64s_delta = bch2_cut_front_s(insert->k.p, k); +- btree_keys_account_val_delta(l->b, _k, u64s_delta); +- +- EBUG_ON(bkey_deleted(k.k)); +- extent_save(l->b, _k, k.k); +- bch2_btree_iter_fix_key_modified(iter, l->b, _k); +- break; +- +- case BCH_EXTENT_OVERLAP_BACK: +- /* insert overlaps with end of k: */ +- u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); +- btree_keys_account_val_delta(l->b, _k, u64s_delta); +- +- EBUG_ON(bkey_deleted(k.k)); +- extent_save(l->b, _k, k.k); +- +- /* +- * As the auxiliary tree is indexed by the end of the +- * key and we've just changed the end, update the +- * auxiliary tree. 
+- */ +- bch2_bset_fix_invalidated_key(l->b, _k); +- bch2_btree_node_iter_fix(iter, l->b, &l->iter, +- _k, _k->u64s, _k->u64s); +- break; +- +- case BCH_EXTENT_OVERLAP_ALL: { +- /* The insert key completely covers k, invalidate k */ +- if (!bkey_whiteout(k.k)) +- btree_account_key_drop(l->b, _k); +- +- k.k->size = 0; +- k.k->type = KEY_TYPE_deleted; +- +- if (_k >= btree_bset_last(l->b)->start) { +- unsigned u64s = _k->u64s; +- +- bch2_bset_delete(l->b, _k, _k->u64s); +- bch2_btree_node_iter_fix(iter, l->b, &l->iter, +- _k, u64s, 0); +- } else { +- extent_save(l->b, _k, k.k); +- bch2_btree_iter_fix_key_modified(iter, l->b, _k); +- } +- +- break; +- } +- case BCH_EXTENT_OVERLAP_MIDDLE: { +- struct bkey_on_stack split; +- +- bkey_on_stack_init(&split); +- bkey_on_stack_realloc(&split, c, k.k->u64s); +- +- /* +- * The insert key falls 'in the middle' of k +- * The insert key splits k in 3: +- * - start only in k, preserve +- * - middle common section, invalidate in k +- * - end only in k, preserve +- * +- * We update the old key to preserve the start, +- * insert will be the new common section, +- * we manually insert the end that we are preserving. +- * +- * modify k _before_ doing the insert (which will move +- * what k points to) +- */ +- bkey_reassemble(split.k, k.s_c); +- split.k->k.needs_whiteout |= bkey_written(l->b, _k); +- +- bch2_cut_back(bkey_start_pos(&insert->k), split.k); +- BUG_ON(bkey_deleted(&split.k->k)); +- +- u64s_delta = bch2_cut_front_s(insert->k.p, k); +- btree_keys_account_val_delta(l->b, _k, u64s_delta); +- +- BUG_ON(bkey_deleted(k.k)); +- extent_save(l->b, _k, k.k); +- bch2_btree_iter_fix_key_modified(iter, l->b, _k); +- +- extent_bset_insert(c, iter, split.k); +- bkey_on_stack_exit(&split, c); +- break; +- } +- } +-} +- +-/** +- * bch_extent_insert_fixup - insert a new extent and deal with overlaps +- * +- * this may result in not actually doing the insert, or inserting some subset +- * of the insert key. For cmpxchg operations this is where that logic lives. +- * +- * All subsets of @insert that need to be inserted are inserted using +- * bch2_btree_insert_and_journal(). If @b or @res fills up, this function +- * returns false, setting @iter->pos for the prefix of @insert that actually got +- * inserted. +- * +- * BSET INVARIANTS: this function is responsible for maintaining all the +- * invariants for bsets of extents in memory. things get really hairy with 0 +- * size extents +- * +- * within one bset: +- * +- * bkey_start_pos(bkey_next(k)) >= k +- * or bkey_start_offset(bkey_next(k)) >= k->offset +- * +- * i.e. strict ordering, no overlapping extents. +- * +- * multiple bsets (i.e. full btree node): +- * +- * ∀ k, j +- * k.size != 0 ∧ j.size != 0 → +- * ¬ (k > bkey_start_pos(j) ∧ k < j) +- * +- * i.e. no two overlapping keys _of nonzero size_ +- * +- * We can't realistically maintain this invariant for zero size keys because of +- * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j +- * there may be another 0 size key between them in another bset, and it will +- * thus overlap with the merged key. +- * +- * In addition, the end of iter->pos indicates how much has been processed. +- * If the end of iter->pos is not the same as the end of insert, then +- * key insertion needs to continue/be retried. 
+- */ +-void bch2_insert_fixup_extent(struct btree_trans *trans, +- struct btree_insert_entry *insert_entry) +-{ +- struct bch_fs *c = trans->c; +- struct btree_iter *iter = insert_entry->iter; +- struct bkey_i *insert = insert_entry->k; +- struct btree_iter_level *l = &iter->l[0]; +- struct btree_node_iter node_iter = l->iter; +- bool deleting = bkey_whiteout(&insert->k); +- bool update_journal = !deleting; +- bool update_btree = !deleting; +- struct bkey_i whiteout = *insert; +- struct bkey_packed *_k; +- struct bkey unpacked; +- +- EBUG_ON(iter->level); +- EBUG_ON(!insert->k.size); +- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); +- +- while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, +- KEY_TYPE_discard))) { +- struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); +- struct bpos cur_end = bpos_min(insert->k.p, k.k->p); +- enum bch_extent_overlap overlap = +- bch2_extent_overlap(&insert->k, k.k); +- +- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) +- break; +- +- if (!bkey_whiteout(k.k)) +- update_journal = true; +- +- if (!update_journal) { +- bch2_cut_front(cur_end, insert); +- bch2_cut_front(cur_end, &whiteout); +- bch2_btree_iter_set_pos_same_leaf(iter, cur_end); +- goto next; +- } +- +- /* +- * When deleting, if possible just do it by switching the type +- * of the key we're deleting, instead of creating and inserting +- * a new whiteout: +- */ +- if (deleting && +- !update_btree && +- !bkey_cmp(insert->k.p, k.k->p) && +- !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { +- if (!bkey_whiteout(k.k)) { +- btree_account_key_drop(l->b, _k); +- _k->type = KEY_TYPE_discard; +- reserve_whiteout(l->b, _k); +- bch2_btree_iter_fix_key_modified(iter, +- l->b, _k); +- } +- break; +- } +- +- if (k.k->needs_whiteout || bkey_written(l->b, _k)) { +- insert->k.needs_whiteout = true; +- update_btree = true; +- } +- +- if (update_btree && +- overlap == BCH_EXTENT_OVERLAP_ALL && +- bkey_whiteout(k.k) && +- k.k->needs_whiteout) { +- unreserve_whiteout(l->b, _k); +- _k->needs_whiteout = false; +- } +- +- extent_squash(c, iter, insert, _k, k, overlap); +- +- if (!update_btree) +- bch2_cut_front(cur_end, insert); +-next: +- node_iter = l->iter; +- +- if (overlap == BCH_EXTENT_OVERLAP_FRONT || +- overlap == BCH_EXTENT_OVERLAP_MIDDLE) +- break; +- } +- +- l->iter = node_iter; +- bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); +- +- if (update_btree) { +- if (deleting) +- insert->k.type = KEY_TYPE_discard; +- +- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); +- +- extent_bset_insert(c, iter, insert); +- } +- +- if (update_journal) { +- struct bkey_i *k = !deleting ? 
insert : &whiteout; +- +- if (deleting) +- k->k.type = KEY_TYPE_discard; +- +- EBUG_ON(bkey_deleted(&k->k) || !k->k.size); +- +- bch2_btree_journal_key(trans, iter, k); +- } +- +- bch2_cut_front(insert->k.p, insert); +-} +- + const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) + { + return bch2_bkey_ptrs_invalid(c, k); +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 6e893c37c287..e360e1989812 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -433,17 +433,6 @@ enum merge_result bch2_reservation_merge(struct bch_fs *, + .key_merge = bch2_reservation_merge, \ + } + +-int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, +- struct bpos *); +-int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); +-int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); +- +-enum btree_insert_ret +-bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, +- unsigned *); +-void bch2_insert_fixup_extent(struct btree_trans *, +- struct btree_insert_entry *); +- + void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, + unsigned, unsigned); + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 0f4e251e0c5f..b413242e3c8d 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -9,6 +9,7 @@ + #include "clock.h" + #include "error.h" + #include "extents.h" ++#include "extent_update.h" + #include "fs.h" + #include "fs-io.h" + #include "fsck.h" +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 4a5355942e02..107443ddea5f 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -19,7 +19,7 @@ + #include "disk_groups.h" + #include "ec.h" + #include "error.h" +-#include "extents.h" ++#include "extent_update.h" + #include "inode.h" + #include "io.h" + #include "journal.h" +-- +cgit v1.2.3 + + +From e722099ddffcefa5a2601cf032590b8dc00437cc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 9 Nov 2019 16:43:16 -0500 +Subject: bcachefs: Inline data extents + +This implements extents that have their data inline, in the value, +instead of the bkey value being pointers to the data - and the read and +write paths are updated to read from these new extent types and write +them out, when the write size is small enough. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 11 +++++- + fs/bcachefs/bkey.h | 1 + + fs/bcachefs/bkey_methods.c | 22 +++++++++-- + fs/bcachefs/extents.c | 25 ++++++++---- + fs/bcachefs/extents.h | 2 + + fs/bcachefs/fs-io.c | 12 ++++++ + fs/bcachefs/io.c | 89 +++++++++++++++++++++++++++++++++++++------ + fs/bcachefs/io.h | 5 ++- + fs/bcachefs/recovery.c | 6 +++ + 9 files changed, 147 insertions(+), 26 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index d619e5caf09b..3d85012a15fd 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -338,7 +338,8 @@ static inline void bkey_init(struct bkey *k) + x(quota, 13) \ + x(stripe, 14) \ + x(reflink_p, 15) \ +- x(reflink_v, 16) ++ x(reflink_v, 16) \ ++ x(inline_data, 17) + + enum bch_bkey_type { + #define x(name, nr) KEY_TYPE_##name = nr, +@@ -911,6 +912,13 @@ struct bch_reflink_v { + __u64 _data[0]; + }; + ++/* Inline data */ ++ ++struct bch_inline_data { ++ struct bch_val v; ++ u8 data[0]; ++}; ++ + /* Optional/variable size superblock sections: */ + + struct bch_sb_field { +@@ -1315,6 +1323,7 @@ enum bch_sb_features { + BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, + BCH_FEATURE_REFLINK = 6, + BCH_FEATURE_NEW_SIPHASH = 7, ++ BCH_FEATURE_INLINE_DATA = 8, + BCH_FEATURE_NR, + }; + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index d3c39dc50b7f..f2d5f3009b21 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -564,6 +564,7 @@ BKEY_VAL_ACCESSORS(quota); + BKEY_VAL_ACCESSORS(stripe); + BKEY_VAL_ACCESSORS(reflink_p); + BKEY_VAL_ACCESSORS(reflink_v); ++BKEY_VAL_ACCESSORS(inline_data); + + /* byte order helpers */ + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index f01405dd502b..5312184c37f7 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -63,6 +63,23 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, + .key_invalid = empty_val_key_invalid, \ + } + ++static const char *key_type_inline_data_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ return NULL; ++} ++ ++static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); ++} ++ ++static const struct bkey_ops bch2_bkey_ops_inline_data = { ++ .key_invalid = key_type_inline_data_invalid, ++ .val_to_text = key_type_inline_data_to_text, ++}; ++ + static const struct bkey_ops bch2_bkey_ops[] = { + #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, + BCH_BKEY_TYPES() +@@ -83,9 +100,8 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + if (k.k->u64s < BKEY_U64s) + return "u64s too small"; + +- if ((btree_node_type_is_extents(type) || +- type == BKEY_TYPE_BTREE) && +- bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) ++ if (type == BKEY_TYPE_BTREE && ++ bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; + + if (btree_node_type_is_extents(type)) { +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 41dec5cd46d1..2f1d4634ea09 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -738,11 +738,6 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) + } + + switch (k.k->type) { +- case KEY_TYPE_deleted: +- case KEY_TYPE_discard: +- case KEY_TYPE_error: +- case KEY_TYPE_cookie: +- break; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); +@@ -780,10 +775,18 @@ int 
bch2_cut_front_s(struct bpos where, struct bkey_s k) + le64_add_cpu(&p.v->idx, sub); + break; + } +- case KEY_TYPE_reservation: ++ case KEY_TYPE_inline_data: { ++ struct bkey_s_inline_data d = bkey_s_to_inline_data(k); ++ ++ sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); ++ ++ memmove(d.v->data, ++ d.v->data + sub, ++ bkey_val_bytes(d.k) - sub); ++ ++ new_val_u64s -= sub >> 3; + break; +- default: +- BUG(); ++ } + } + + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; +@@ -815,6 +818,12 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) + new_val_u64s = 0; + } + ++ switch (k.k->type) { ++ case KEY_TYPE_inline_data: ++ new_val_u64s = min(new_val_u64s, k.k->size << 6); ++ break; ++ } ++ + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); + +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index e360e1989812..35a66d4f4ea2 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -456,6 +456,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k) + static inline bool bkey_extent_is_data(const struct bkey *k) + { + return bkey_extent_is_direct_data(k) || ++ k->type == KEY_TYPE_inline_data || + k->type == KEY_TYPE_reflink_p; + } + +@@ -469,6 +470,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) + case KEY_TYPE_reservation: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: ++ case KEY_TYPE_inline_data: + return true; + default: + return false; +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index b413242e3c8d..6f0c7bea2ccc 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1080,6 +1080,18 @@ static void bch2_writepage_io_done(struct closure *cl) + } + } + ++ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ + /* + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 107443ddea5f..a544ef7de31f 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -523,16 +523,19 @@ static void __bch2_write_index(struct bch_write_op *op) + + for (src = keys->keys; src != keys->top; src = n) { + n = bkey_next(src); +- bkey_copy(dst, src); + +- bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr, +- test_bit(ptr->dev, op->failed.d)); ++ if (bkey_extent_is_direct_data(&src->k)) { ++ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, ++ test_bit(ptr->dev, op->failed.d)); + +- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) { +- ret = -EIO; +- goto err; ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { ++ ret = -EIO; ++ goto err; ++ } + } + ++ if (dst != src) ++ memmove_u64s_down(dst, src, src->u64s); + dst = bkey_next(dst); + } + +@@ -1090,7 +1093,7 @@ again: + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; +- bio_set_op_attrs(bio, REQ_OP_WRITE, 0); ++ bio->bi_opf |= REQ_OP_WRITE; + + if (!skip_put) + closure_get(bio->bi_private); +@@ -1127,6 +1130,47 @@ flush_io: + goto again; + } + ++static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) ++{ ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->wbio.bio; ++ struct bvec_iter iter; ++ struct bkey_i_inline_data *id; ++ unsigned sectors; ++ int ret; ++ ++ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ++ 
ARRAY_SIZE(op->inline_keys), ++ BKEY_U64s + DIV_ROUND_UP(data_len, 8)); ++ if (ret) { ++ op->error = ret; ++ goto err; ++ } ++ ++ sectors = bio_sectors(bio); ++ op->pos.offset += sectors; ++ ++ id = bkey_inline_data_init(op->insert_keys.top); ++ id->k.p = op->pos; ++ id->k.version = op->version; ++ id->k.size = sectors; ++ ++ iter = bio->bi_iter; ++ iter.bi_size = data_len; ++ memcpy_from_bio(id->v.data, bio, iter); ++ ++ while (data_len & 7) ++ id->v.data[data_len++] = '\0'; ++ set_bkey_val_bytes(&id->k, data_len); ++ bch2_keylist_push(&op->insert_keys); ++ ++ op->flags |= BCH_WRITE_WROTE_DATA_INLINE; ++ continue_at_nobarrier(cl, bch2_write_index, NULL); ++ return; ++err: ++ bch2_write_done(&op->cl); ++} ++ + /** + * bch_write - handle a write to a cache device or flash only volume + * +@@ -1148,22 +1192,22 @@ void bch2_write(struct closure *cl) + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bio *bio = &op->wbio.bio; + struct bch_fs *c = op->c; ++ unsigned data_len; + + BUG_ON(!op->nr_replicas); + BUG_ON(!op->write_point.v); + BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + ++ op->start_time = local_clock(); ++ bch2_keylist_init(&op->insert_keys, op->inline_keys); ++ wbio_init(bio)->put_bio = false; ++ + if (bio_sectors(bio) & (c->opts.block_size - 1)) { + __bcache_io_error(c, "misaligned write"); + op->error = -EIO; + goto err; + } + +- op->start_time = local_clock(); +- +- bch2_keylist_init(&op->insert_keys, op->inline_keys); +- wbio_init(bio)->put_bio = false; +- + if (c->opts.nochanges || + !percpu_ref_tryget(&c->writes)) { + __bcache_io_error(c, "read only"); +@@ -1173,6 +1217,14 @@ void bch2_write(struct closure *cl) + + bch2_increment_clock(c, bio_sectors(bio), WRITE); + ++ data_len = min_t(u64, bio->bi_iter.bi_size, ++ op->new_i_size - (op->pos.offset << 9)); ++ ++ if (data_len <= min(block_bytes(c) / 2, 1024U)) { ++ bch2_write_data_inline(op, data_len); ++ return; ++ } ++ + continue_at_nobarrier(cl, __bch2_write, NULL); + return; + err: +@@ -1890,6 +1942,19 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + struct bpos pos = bkey_start_pos(k.k); + int pick_ret; + ++ if (k.k->type == KEY_TYPE_inline_data) { ++ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); ++ unsigned bytes = min_t(unsigned, iter.bi_size, ++ bkey_val_bytes(d.k)); ++ ++ swap(iter.bi_size, bytes); ++ memcpy_to_bio(&orig->bio, iter, d.v->data); ++ swap(iter.bi_size, bytes); ++ bio_advance_iter(&orig->bio, &iter, bytes); ++ zero_fill_bio_iter(&orig->bio, iter); ++ goto out_read_done; ++ } ++ + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 91aaa58fce4e..45c950942d78 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -30,10 +30,11 @@ enum bch_write_flags { + BCH_WRITE_PAGES_OWNED = (1 << 5), + BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), + BCH_WRITE_NOPUT_RESERVATION = (1 << 7), ++ BCH_WRITE_WROTE_DATA_INLINE = (1 << 8), + + /* Internal: */ +- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), +- BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 9), ++ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), ++ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), + }; + + static inline u64 *op_journal_seq(struct bch_write_op *op) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 2efe023b2f0d..9102a1ce1ec4 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -913,6 +913,12 @@ int bch2_fs_recovery(struct bch_fs *c) + write_sb = true; + } + ++ if (!(c->sb.features & 
(1ULL << BCH_FEATURE_INLINE_DATA))) { ++ c->disk_sb.sb->features[0] |= ++ cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA); ++ write_sb = true; ++ } ++ + if (!test_bit(BCH_FS_ERROR, &c->flags)) { + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + write_sb = true; +-- +cgit v1.2.3 + + +From 2d6e0929b89132d60f5e10d5a132b4e92e1e3131 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 16 Nov 2019 16:25:58 -0500 +Subject: bcachefs: Reorganize extents.c + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extent_update.c | 2 +- + fs/bcachefs/extents.c | 1556 +++++++++++++++++++++---------------------- + fs/bcachefs/extents.h | 260 ++++---- + fs/bcachefs/fs-io.c | 8 +- + fs/bcachefs/io.c | 4 +- + fs/bcachefs/move.c | 8 +- + fs/bcachefs/recovery.c | 4 +- + 7 files changed, 898 insertions(+), 944 deletions(-) + +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 21426e01c395..91ceb5d53f92 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -200,7 +200,7 @@ bch2_extent_can_insert(struct btree_trans *trans, + *u64s += _k->u64s; + + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && +- (sectors = bch2_extent_is_compressed(k))) { ++ (sectors = bch2_bkey_sectors_compressed(k))) { + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? BCH_DISK_RESERVATION_NOFAIL : 0; + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 2f1d4634ea09..6bcc178604b0 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -25,81 +25,15 @@ + + #include + +-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) +-{ +- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr; +- unsigned nr_ptrs = 0; +- +- bkey_for_each_ptr(p, ptr) +- nr_ptrs++; +- +- return nr_ptrs; +-} +- +-unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) +-{ +- unsigned nr_ptrs = 0; +- +- switch (k.k->type) { +- case KEY_TYPE_btree_ptr: +- case KEY_TYPE_extent: +- case KEY_TYPE_reflink_v: { +- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr; +- +- bkey_for_each_ptr(p, ptr) +- nr_ptrs += !ptr->cached; +- BUG_ON(!nr_ptrs); +- break; +- } +- case KEY_TYPE_reservation: +- nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; +- break; +- } +- +- return nr_ptrs; +-} +- +-static unsigned bch2_extent_ptr_durability(struct bch_fs *c, +- struct extent_ptr_decoded p) +-{ +- unsigned durability = 0; +- struct bch_dev *ca; +- +- if (p.ptr.cached) +- return 0; +- +- ca = bch_dev_bkey_exists(c, p.ptr.dev); +- +- if (ca->mi.state != BCH_MEMBER_STATE_FAILED) +- durability = max_t(unsigned, durability, ca->mi.durability); +- +- if (p.has_ec) { +- struct stripe *s = +- genradix_ptr(&c->stripes[0], p.ec.idx); +- +- if (WARN_ON(!s)) +- goto out; +- +- durability = max_t(unsigned, durability, s->nr_redundant); +- } +-out: +- return durability; +-} +- +-unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) +-{ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- unsigned durability = 0; +- +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- durability += bch2_extent_ptr_durability(c, p); ++static unsigned bch2_crc_field_size_max[] = { ++ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, ++}; + +- return durability; +-} ++static void bch2_extent_crc_pack(union bch_extent_crc *, ++ struct bch_extent_crc_unpacked, ++ enum bch_extent_entry_type); + + static struct 
bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, + unsigned dev) +@@ -219,172 +153,299 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + return ret; + } + +-void bch2_bkey_append_ptr(struct bkey_i *k, +- struct bch_extent_ptr ptr) +-{ +- EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); +- +- switch (k->k.type) { +- case KEY_TYPE_btree_ptr: +- case KEY_TYPE_extent: +- EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); +- +- ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; +- +- memcpy((void *) &k->v + bkey_val_bytes(&k->k), +- &ptr, +- sizeof(ptr)); +- k->u64s++; +- break; +- default: +- BUG(); +- } +-} ++/* KEY_TYPE_btree_ptr: */ + +-void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) ++const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) + { +- struct bch_extent_ptr *ptr; ++ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) ++ return "value too big"; + +- bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); ++ return bch2_bkey_ptrs_invalid(c, k); + } + +-const struct bch_extent_ptr * +-bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) ++void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; ++ const char *err; ++ char buf[160]; ++ struct bucket_mark mark; ++ struct bch_dev *ca; + +- bkey_for_each_ptr(ptrs, ptr) +- if (ptr->dev == dev) +- return ptr; +- +- return NULL; +-} +- +-bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +-{ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr; ++ bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !bch2_bkey_replicas_marked(c, k, false), c, ++ "btree key bad (replicas not marked in superblock):\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + +- bkey_for_each_ptr(ptrs, ptr) +- if (bch2_dev_in_target(c, ptr->dev, target) && +- (!ptr->cached || +- !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) +- return true; ++ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; + +- return false; +-} ++ bkey_for_each_ptr(ptrs, ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); + +-/* extent specific utility code */ ++ mark = ptr_bucket_mark(ca, ptr); + +-const struct bch_extent_ptr * +-bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) +-{ +- const struct bch_extent_ptr *ptr; ++ err = "stale"; ++ if (gen_after(mark.gen, ptr->gen)) ++ goto err; + +- extent_for_each_ptr(e, ptr) +- if (ptr->dev == dev) +- return ptr; ++ err = "inconsistent"; ++ if (mark.data_type != BCH_DATA_BTREE || ++ mark.dirty_sectors < c->opts.btree_node_size) ++ goto err; ++ } + +- return NULL; ++ return; ++err: ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", ++ err, buf, PTR_BUCKET_NR(ca, ptr), ++ mark.gen, (unsigned) mark.v.counter); + } + +-const struct bch_extent_ptr * +-bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) ++void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) + { +- const struct bch_extent_ptr *ptr; +- +- extent_for_each_ptr(e, ptr) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ bch2_bkey_ptrs_to_text(out, c, k); ++} + +- if (ca->mi.group && +- ca->mi.group - 1 == group) +- return ptr; +- } ++/* KEY_TYPE_extent: */ + +- return NULL; ++const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ return 
bch2_bkey_ptrs_invalid(c, k); + } + +-unsigned bch2_extent_is_compressed(struct bkey_s_c k) ++void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) + { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; +- unsigned ret = 0; ++ char buf[160]; + +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- if (!p.ptr.cached && +- p.crc.compression_type != BCH_COMPRESSION_NONE) +- ret += p.crc.compressed_size; ++ /* ++ * XXX: we should be doing most/all of these checks at startup time, ++ * where we check bch2_bkey_invalid() in btree_node_read_done() ++ * ++ * But note that we can't check for stale pointers or incorrect gc marks ++ * until after journal replay is done (it might be an extent that's ++ * going to get overwritten during replay) ++ */ + +- return ret; +-} ++ if (percpu_down_read_trylock(&c->mark_lock)) { ++ bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, ++ "extent key bad (replicas not marked in superblock):\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); ++ percpu_up_read(&c->mark_lock); ++ } ++ /* ++ * If journal replay hasn't finished, we might be seeing keys ++ * that will be overwritten by the time journal replay is done: ++ */ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ return; + +-bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, +- struct bch_extent_ptr m, u64 offset) +-{ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; ++ extent_for_each_ptr_decode(e, p, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); ++ unsigned stale = gen_after(mark.gen, p.ptr.gen); ++ unsigned disk_sectors = ptr_disk_sectors(p); ++ unsigned mark_sectors = p.ptr.cached ++ ? 
mark.cached_sectors ++ : mark.dirty_sectors; + +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- if (p.ptr.dev == m.dev && +- p.ptr.gen == m.gen && +- (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == +- (s64) m.offset - offset) +- return true; ++ bch2_fs_bug_on(stale && !p.ptr.cached, c, ++ "stale dirty pointer (ptr gen %u bucket %u", ++ p.ptr.gen, mark.gen); + +- return false; ++ bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); ++ ++ bch2_fs_bug_on(!stale && ++ (mark.data_type != BCH_DATA_USER || ++ mark_sectors < disk_sectors), c, ++ "extent pointer not marked: %s:\n" ++ "type %u sectors %u < %u", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), ++ mark.data_type, ++ mark_sectors, disk_sectors); ++ } + } + +-static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, +- union bch_extent_entry *entry) ++void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) + { +- union bch_extent_entry *i = ptrs.start; +- +- if (i == entry) +- return NULL; +- +- while (extent_entry_next(i) != entry) +- i = extent_entry_next(i); +- return i; ++ bch2_bkey_ptrs_to_text(out, c, k); + } + +-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, +- struct bch_extent_ptr *ptr) ++enum merge_result bch2_extent_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) + { +- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); +- union bch_extent_entry *dst, *src, *prev; +- bool drop_crc = true; ++ struct bkey_s_extent l = bkey_s_to_extent(_l); ++ struct bkey_s_extent r = bkey_s_to_extent(_r); ++ union bch_extent_entry *en_l = l.v->start; ++ union bch_extent_entry *en_r = r.v->start; ++ struct bch_extent_crc_unpacked crc_l, crc_r; + +- EBUG_ON(ptr < &ptrs.start->ptr || +- ptr >= &ptrs.end->ptr); +- EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); ++ if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) ++ return BCH_MERGE_NOMERGE; + +- src = extent_entry_next(to_entry(ptr)); +- if (src != ptrs.end && +- !extent_entry_is_crc(src)) +- drop_crc = false; ++ crc_l = bch2_extent_crc_unpack(l.k, NULL); + +- dst = to_entry(ptr); +- while ((prev = extent_entry_prev(ptrs, dst))) { +- if (extent_entry_is_ptr(prev)) ++ extent_for_each_entry(l, en_l) { ++ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ ++ if (extent_entry_type(en_l) != extent_entry_type(en_r)) ++ return BCH_MERGE_NOMERGE; ++ ++ switch (extent_entry_type(en_l)) { ++ case BCH_EXTENT_ENTRY_ptr: { ++ const struct bch_extent_ptr *lp = &en_l->ptr; ++ const struct bch_extent_ptr *rp = &en_r->ptr; ++ struct bch_dev *ca; ++ ++ if (lp->offset + crc_l.compressed_size != rp->offset || ++ lp->dev != rp->dev || ++ lp->gen != rp->gen) ++ return BCH_MERGE_NOMERGE; ++ ++ /* We don't allow extents to straddle buckets: */ ++ ca = bch_dev_bkey_exists(c, lp->dev); ++ ++ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) ++ return BCH_MERGE_NOMERGE; ++ ++ break; ++ } ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || ++ en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) ++ return BCH_MERGE_NOMERGE; + break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ if (crc_l.csum_type != crc_r.csum_type || ++ crc_l.compression_type != crc_r.compression_type || ++ crc_l.nonce != crc_r.nonce) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || ++ crc_r.offset) ++ 
return BCH_MERGE_NOMERGE; ++ ++ if (!bch2_checksum_mergeable(crc_l.csum_type)) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.compression_type) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.csum_type && ++ crc_l.uncompressed_size + ++ crc_r.uncompressed_size > c->sb.encoded_extent_max) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > ++ bch2_crc_field_size_max[extent_entry_type(en_l)]) ++ return BCH_MERGE_NOMERGE; + +- if (extent_entry_is_crc(prev)) { +- if (drop_crc) +- dst = prev; + break; ++ default: ++ return BCH_MERGE_NOMERGE; + } ++ } + +- dst = prev; ++ extent_for_each_entry(l, en_l) { ++ struct bch_extent_crc_unpacked crc_l, crc_r; ++ ++ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ ++ if (!extent_entry_is_crc(en_l)) ++ continue; ++ ++ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, ++ crc_l.csum, ++ crc_r.csum, ++ crc_r.uncompressed_size << 9); ++ ++ crc_l.uncompressed_size += crc_r.uncompressed_size; ++ crc_l.compressed_size += crc_r.compressed_size; ++ ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, ++ extent_entry_type(en_l)); + } + +- memmove_u64s_down(dst, src, +- (u64 *) ptrs.end - (u64 *) src); +- k.k->u64s -= (u64 *) src - (u64 *) dst; ++ bch2_key_resize(l.k, l.k->size + r.k->size); + +- return dst; ++ return BCH_MERGE_MERGE; ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) ++ return "incorrect value size"; ++ ++ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) ++ return "invalid nr_replicas"; ++ ++ return NULL; ++} ++ ++void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ pr_buf(out, "generation %u replicas %u", ++ le32_to_cpu(r.v->generation), ++ r.v->nr_replicas); ++} ++ ++enum merge_result bch2_reservation_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_reservation l = bkey_s_to_reservation(_l); ++ struct bkey_s_reservation r = bkey_s_to_reservation(_r); ++ ++ if (l.v->generation != r.v->generation || ++ l.v->nr_replicas != r.v->nr_replicas) ++ return BCH_MERGE_NOMERGE; ++ ++ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { ++ bch2_key_resize(l.k, KEY_SIZE_MAX); ++ bch2_cut_front_s(l.k->p, r.s); ++ return BCH_MERGE_PARTIAL; ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* Extent checksum entries: */ ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, ++ struct bch_extent_crc_unpacked r) ++{ ++ return (l.csum_type != r.csum_type || ++ l.compression_type != r.compression_type || ++ l.compressed_size != r.compressed_size || ++ l.uncompressed_size != r.uncompressed_size || ++ l.offset != r.offset || ++ l.live_size != r.live_size || ++ l.nonce != r.nonce || ++ bch2_crc_cmp(l.csum, r.csum)); + } + + static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, +@@ -463,509 +524,237 @@ restart_narrow_pointers: + return ret; + } + +-/* returns true if not equal */ +-static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, +- struct bch_extent_crc_unpacked r) ++static void bch2_extent_crc_pack(union 
bch_extent_crc *dst, ++ struct bch_extent_crc_unpacked src, ++ enum bch_extent_entry_type type) + { +- return (l.csum_type != r.csum_type || +- l.compression_type != r.compression_type || +- l.compressed_size != r.compressed_size || +- l.uncompressed_size != r.uncompressed_size || +- l.offset != r.offset || +- l.live_size != r.live_size || +- l.nonce != r.nonce || +- bch2_crc_cmp(l.csum, r.csum)); ++#define set_common_fields(_dst, _src) \ ++ _dst.type = 1 << type; \ ++ _dst.csum_type = _src.csum_type, \ ++ _dst.compression_type = _src.compression_type, \ ++ _dst._compressed_size = _src.compressed_size - 1, \ ++ _dst._uncompressed_size = _src.uncompressed_size - 1, \ ++ _dst.offset = _src.offset ++ ++ switch (type) { ++ case BCH_EXTENT_ENTRY_crc32: ++ set_common_fields(dst->crc32, src); ++ dst->crc32.csum = *((__le32 *) &src.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ set_common_fields(dst->crc64, src); ++ dst->crc64.nonce = src.nonce; ++ dst->crc64.csum_lo = src.csum.lo; ++ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ set_common_fields(dst->crc128, src); ++ dst->crc128.nonce = src.nonce; ++ dst->crc128.csum = src.csum; ++ break; ++ default: ++ BUG(); ++ } ++#undef set_common_fields + } + +-void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) ++void bch2_extent_crc_append(struct bkey_i *k, ++ struct bch_extent_crc_unpacked new) + { +- union bch_extent_entry *entry; +- u64 *d = (u64 *) bkeyp_val(f, k); +- unsigned i; ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ union bch_extent_crc *crc = (void *) ptrs.end; ++ enum bch_extent_entry_type type; + +- for (i = 0; i < bkeyp_val_u64s(f, k); i++) +- d[i] = swab64(d[i]); ++ if (bch_crc_bytes[new.csum_type] <= 4 && ++ new.uncompressed_size - 1 <= CRC32_SIZE_MAX && ++ new.nonce <= CRC32_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc32; ++ else if (bch_crc_bytes[new.csum_type] <= 10 && ++ new.uncompressed_size - 1 <= CRC64_SIZE_MAX && ++ new.nonce <= CRC64_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc64; ++ else if (bch_crc_bytes[new.csum_type] <= 16 && ++ new.uncompressed_size - 1 <= CRC128_SIZE_MAX && ++ new.nonce <= CRC128_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc128; ++ else ++ BUG(); + +- for (entry = (union bch_extent_entry *) d; +- entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); +- entry = extent_entry_next(entry)) { +- switch (extent_entry_type(entry)) { +- case BCH_EXTENT_ENTRY_ptr: +- break; +- case BCH_EXTENT_ENTRY_crc32: +- entry->crc32.csum = swab32(entry->crc32.csum); +- break; +- case BCH_EXTENT_ENTRY_crc64: +- entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); +- entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); +- break; +- case BCH_EXTENT_ENTRY_crc128: +- entry->crc128.csum.hi = (__force __le64) +- swab64((__force u64) entry->crc128.csum.hi); +- entry->crc128.csum.lo = (__force __le64) +- swab64((__force u64) entry->crc128.csum.lo); +- break; +- case BCH_EXTENT_ENTRY_stripe_ptr: +- break; +- } +- } ++ bch2_extent_crc_pack(crc, new, type); ++ ++ k->k.u64s += extent_entry_u64s(ptrs.end); ++ ++ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); + } + +-void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, +- struct bkey_s_c k) +-{ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- struct bch_extent_crc_unpacked crc; +- const struct bch_extent_ptr *ptr; +- const struct bch_extent_stripe_ptr *ec; +- struct bch_dev *ca; +- bool first = true; +- +- 
bkey_extent_entry_for_each(ptrs, entry) { +- if (!first) +- pr_buf(out, " "); +- +- switch (__extent_entry_type(entry)) { +- case BCH_EXTENT_ENTRY_ptr: +- ptr = entry_to_ptr(entry); +- ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] +- ? bch_dev_bkey_exists(c, ptr->dev) +- : NULL; +- +- pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, +- (u64) ptr->offset, ptr->gen, +- ptr->cached ? " cached" : "", +- ca && ptr_stale(ca, ptr) +- ? " stale" : ""); +- break; +- case BCH_EXTENT_ENTRY_crc32: +- case BCH_EXTENT_ENTRY_crc64: +- case BCH_EXTENT_ENTRY_crc128: +- crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); +- +- pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", +- crc.compressed_size, +- crc.uncompressed_size, +- crc.offset, crc.nonce, +- crc.csum_type, +- crc.compression_type); +- break; +- case BCH_EXTENT_ENTRY_stripe_ptr: +- ec = &entry->stripe_ptr; +- +- pr_buf(out, "ec: idx %llu block %u", +- (u64) ec->idx, ec->block); +- break; +- default: +- pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); +- return; +- } ++/* Generic code for keys with pointers: */ + +- first = false; +- } ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) ++{ ++ return bch2_bkey_devs(k).nr; + } + +-static const char *extent_ptr_invalid(const struct bch_fs *c, +- struct bkey_s_c k, +- const struct bch_extent_ptr *ptr, +- unsigned size_ondisk, +- bool metadata) ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) + { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr2; +- struct bch_dev *ca; +- +- if (!bch2_dev_exists2(c, ptr->dev)) +- return "pointer to invalid device"; +- +- ca = bch_dev_bkey_exists(c, ptr->dev); +- if (!ca) +- return "pointer to invalid device"; +- +- bkey_for_each_ptr(ptrs, ptr2) +- if (ptr != ptr2 && ptr->dev == ptr2->dev) +- return "multiple pointers to same device"; +- +- if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) +- return "offset past end of device"; +- +- if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) +- return "offset before first bucket"; +- +- if (bucket_remainder(ca, ptr->offset) + +- size_ondisk > ca->mi.bucket_size) +- return "spans multiple buckets"; +- +- return NULL; ++ return k.k->type == KEY_TYPE_reservation ++ ? 
bkey_s_c_to_reservation(k).v->nr_replicas ++ : bch2_bkey_dirty_devs(k).nr; + } + +-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) + { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- struct bch_extent_crc_unpacked crc; +- unsigned size_ondisk = k.k->size; +- const char *reason; +- unsigned nonce = UINT_MAX; +- +- if (k.k->type == KEY_TYPE_btree_ptr) +- size_ondisk = c->opts.btree_node_size; +- +- bkey_extent_entry_for_each(ptrs, entry) { +- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) +- return "invalid extent entry type"; +- +- if (k.k->type == KEY_TYPE_btree_ptr && +- !extent_entry_is_ptr(entry)) +- return "has non ptr field"; +- +- switch (extent_entry_type(entry)) { +- case BCH_EXTENT_ENTRY_ptr: +- reason = extent_ptr_invalid(c, k, &entry->ptr, +- size_ondisk, false); +- if (reason) +- return reason; +- break; +- case BCH_EXTENT_ENTRY_crc32: +- case BCH_EXTENT_ENTRY_crc64: +- case BCH_EXTENT_ENTRY_crc128: +- crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); +- +- if (crc.offset + crc.live_size > +- crc.uncompressed_size) +- return "checksum offset + key size > uncompressed size"; +- +- size_ondisk = crc.compressed_size; +- +- if (!bch2_checksum_type_valid(c, crc.csum_type)) +- return "invalid checksum type"; ++ unsigned ret = 0; + +- if (crc.compression_type >= BCH_COMPRESSION_NR) +- return "invalid compression type"; ++ if (k.k->type == KEY_TYPE_reservation) { ++ ret = bkey_s_c_to_reservation(k).v->nr_replicas; ++ } else { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; + +- if (bch2_csum_type_is_encryption(crc.csum_type)) { +- if (nonce == UINT_MAX) +- nonce = crc.offset + crc.nonce; +- else if (nonce != crc.offset + crc.nonce) +- return "incorrect nonce"; +- } +- break; +- case BCH_EXTENT_ENTRY_stripe_ptr: +- break; +- } ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ ret += !p.ptr.cached && ++ p.crc.compression_type == BCH_COMPRESSION_NONE; + } + +- return NULL; +-} +- +-/* Btree ptrs */ +- +-const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +-{ +- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) +- return "value too big"; +- +- return bch2_bkey_ptrs_invalid(c, k); ++ return ret; + } + +-void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr; +- const char *err; +- char buf[160]; +- struct bucket_mark mark; +- struct bch_dev *ca; +- +- bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && +- !bch2_bkey_replicas_marked(c, k, false), c, +- "btree key bad (replicas not marked in superblock):\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- +- if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) +- return; +- +- bkey_for_each_ptr(ptrs, ptr) { +- ca = bch_dev_bkey_exists(c, ptr->dev); +- +- mark = ptr_bucket_mark(ca, ptr); +- +- err = "stale"; +- if (gen_after(mark.gen, ptr->gen)) +- goto err; +- +- err = "inconsistent"; +- if (mark.data_type != BCH_DATA_BTREE || +- mark.dirty_sectors < c->opts.btree_node_size) +- goto err; +- } ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned ret = 0; + +- return; +-err: +- bch2_bkey_val_to_text(&PBUF(buf), c, k); +- bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark 
%08x", +- err, buf, PTR_BUCKET_NR(ca, ptr), +- mark.gen, (unsigned) mark.v.counter); +-} ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ p.crc.compression_type != BCH_COMPRESSION_NONE) ++ ret += p.crc.compressed_size; + +-void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, +- struct bkey_s_c k) +-{ +- bch2_bkey_ptrs_to_text(out, c, k); ++ return ret; + } + +-/* Extents */ +- +-int bch2_cut_front_s(struct bpos where, struct bkey_s k) ++bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, ++ unsigned nr_replicas) + { +- unsigned new_val_u64s = bkey_val_u64s(k.k); +- int val_u64s_delta; +- u64 sub; +- +- if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) +- return 0; +- +- EBUG_ON(bkey_cmp(where, k.k->p) > 0); +- +- sub = where.offset - bkey_start_offset(k.k); +- +- k.k->size -= sub; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bpos end = pos; ++ struct bkey_s_c k; ++ bool ret = true; ++ int err; + +- if (!k.k->size) { +- k.k->type = KEY_TYPE_deleted; +- new_val_u64s = 0; +- } ++ end.offset += size; + +- switch (k.k->type) { +- case KEY_TYPE_extent: +- case KEY_TYPE_reflink_v: { +- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); +- union bch_extent_entry *entry; +- bool seen_crc = false; ++ bch2_trans_init(&trans, c, 0, 0); + +- bkey_extent_entry_for_each(ptrs, entry) { +- switch (extent_entry_type(entry)) { +- case BCH_EXTENT_ENTRY_ptr: +- if (!seen_crc) +- entry->ptr.offset += sub; +- break; +- case BCH_EXTENT_ENTRY_crc32: +- entry->crc32.offset += sub; +- break; +- case BCH_EXTENT_ENTRY_crc64: +- entry->crc64.offset += sub; +- break; +- case BCH_EXTENT_ENTRY_crc128: +- entry->crc128.offset += sub; +- break; +- case BCH_EXTENT_ENTRY_stripe_ptr: +- break; +- } ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, ++ BTREE_ITER_SLOTS, k, err) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; + +- if (extent_entry_is_crc(entry)) +- seen_crc = true; ++ if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { ++ ret = false; ++ break; + } +- +- break; +- } +- case KEY_TYPE_reflink_p: { +- struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); +- +- le64_add_cpu(&p.v->idx, sub); +- break; + } +- case KEY_TYPE_inline_data: { +- struct bkey_s_inline_data d = bkey_s_to_inline_data(k); +- +- sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); +- +- memmove(d.v->data, +- d.v->data + sub, +- bkey_val_bytes(d.k) - sub); +- +- new_val_u64s -= sub >> 3; +- break; +- } +- } +- +- val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; +- BUG_ON(val_u64s_delta < 0); ++ bch2_trans_exit(&trans); + +- set_bkey_val_u64s(k.k, new_val_u64s); +- memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); +- return -val_u64s_delta; ++ return ret; + } + +-int bch2_cut_back_s(struct bpos where, struct bkey_s k) ++static unsigned bch2_extent_ptr_durability(struct bch_fs *c, ++ struct extent_ptr_decoded p) + { +- unsigned new_val_u64s = bkey_val_u64s(k.k); +- int val_u64s_delta; +- u64 len = 0; ++ unsigned durability = 0; ++ struct bch_dev *ca; + +- if (bkey_cmp(where, k.k->p) >= 0) ++ if (p.ptr.cached) + return 0; + +- EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); + +- len = where.offset - bkey_start_offset(k.k); ++ if (ca->mi.state != BCH_MEMBER_STATE_FAILED) ++ durability = max_t(unsigned, durability, ca->mi.durability); + +- k.k->p = where; +- k.k->size = len; ++ if (p.has_ec) { ++ struct stripe *s = ++ genradix_ptr(&c->stripes[0], p.ec.idx); + +- if (!len) { +- k.k->type = 
KEY_TYPE_deleted; +- new_val_u64s = 0; +- } ++ if (WARN_ON(!s)) ++ goto out; + +- switch (k.k->type) { +- case KEY_TYPE_inline_data: +- new_val_u64s = min(new_val_u64s, k.k->size << 6); +- break; ++ durability = max_t(unsigned, durability, s->nr_redundant); + } +- +- val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; +- BUG_ON(val_u64s_delta < 0); +- +- set_bkey_val_u64s(k.k, new_val_u64s); +- memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); +- return -val_u64s_delta; +-} +- +-const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) +-{ +- return bch2_bkey_ptrs_invalid(c, k); ++out: ++ return durability; + } + +-void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) ++unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) + { +- struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; +- char buf[160]; +- +- /* +- * XXX: we should be doing most/all of these checks at startup time, +- * where we check bch2_bkey_invalid() in btree_node_read_done() +- * +- * But note that we can't check for stale pointers or incorrect gc marks +- * until after journal replay is done (it might be an extent that's +- * going to get overwritten during replay) +- */ +- +- if (percpu_down_read_trylock(&c->mark_lock)) { +- bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && +- !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, +- "extent key bad (replicas not marked in superblock):\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); +- percpu_up_read(&c->mark_lock); +- } +- /* +- * If journal replay hasn't finished, we might be seeing keys +- * that will be overwritten by the time journal replay is done: +- */ +- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) +- return; +- +- extent_for_each_ptr_decode(e, p, entry) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); +- struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); +- unsigned stale = gen_after(mark.gen, p.ptr.gen); +- unsigned disk_sectors = ptr_disk_sectors(p); +- unsigned mark_sectors = p.ptr.cached +- ? 
mark.cached_sectors +- : mark.dirty_sectors; +- +- bch2_fs_bug_on(stale && !p.ptr.cached, c, +- "stale dirty pointer (ptr gen %u bucket %u", +- p.ptr.gen, mark.gen); ++ unsigned durability = 0; + +- bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ durability += bch2_extent_ptr_durability(c, p); + +- bch2_fs_bug_on(!stale && +- (mark.data_type != BCH_DATA_USER || +- mark_sectors < disk_sectors), c, +- "extent pointer not marked: %s:\n" +- "type %u sectors %u < %u", +- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), +- mark.data_type, +- mark_sectors, disk_sectors); +- } ++ return durability; + } + +-void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, +- struct bkey_s_c k) ++void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, ++ unsigned target, ++ unsigned nr_desired_replicas) + { +- bch2_bkey_ptrs_to_text(out, c, k); +-} ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; + +-static unsigned bch2_crc_field_size_max[] = { +- [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, +- [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, +- [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +-}; ++ if (target && extra > 0) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int n = bch2_extent_ptr_durability(c, p); + +-static void bch2_extent_crc_pack(union bch_extent_crc *dst, +- struct bch_extent_crc_unpacked src, +- enum bch_extent_entry_type type) +-{ +-#define set_common_fields(_dst, _src) \ +- _dst.type = 1 << type; \ +- _dst.csum_type = _src.csum_type, \ +- _dst.compression_type = _src.compression_type, \ +- _dst._compressed_size = _src.compressed_size - 1, \ +- _dst._uncompressed_size = _src.uncompressed_size - 1, \ +- _dst.offset = _src.offset ++ if (n && n <= extra && ++ !bch2_dev_in_target(c, p.ptr.dev, target)) { ++ entry->ptr.cached = true; ++ extra -= n; ++ } ++ } + +- switch (type) { +- case BCH_EXTENT_ENTRY_crc32: +- set_common_fields(dst->crc32, src); +- dst->crc32.csum = *((__le32 *) &src.csum.lo); +- break; +- case BCH_EXTENT_ENTRY_crc64: +- set_common_fields(dst->crc64, src); +- dst->crc64.nonce = src.nonce; +- dst->crc64.csum_lo = src.csum.lo; +- dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); +- break; +- case BCH_EXTENT_ENTRY_crc128: +- set_common_fields(dst->crc128, src); +- dst->crc128.nonce = src.nonce; +- dst->crc128.csum = src.csum; +- break; +- default: +- BUG(); +- } +-#undef set_common_fields +-} ++ if (extra > 0) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int n = bch2_extent_ptr_durability(c, p); + +-void bch2_extent_crc_append(struct bkey_i *k, +- struct bch_extent_crc_unpacked new) +-{ +- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); +- union bch_extent_crc *crc = (void *) ptrs.end; +- enum bch_extent_entry_type type; ++ if (n && n <= extra) { ++ entry->ptr.cached = true; ++ extra -= n; ++ } ++ } ++} + +- if (bch_crc_bytes[new.csum_type] <= 4 && +- new.uncompressed_size - 1 <= CRC32_SIZE_MAX && +- new.nonce <= CRC32_NONCE_MAX) +- type = BCH_EXTENT_ENTRY_crc32; +- else if (bch_crc_bytes[new.csum_type] <= 10 && +- new.uncompressed_size - 1 <= CRC64_SIZE_MAX && +- new.nonce <= CRC64_NONCE_MAX) +- type = BCH_EXTENT_ENTRY_crc64; +- else if (bch_crc_bytes[new.csum_type] <= 16 && +- new.uncompressed_size - 1 <= CRC128_SIZE_MAX && +- new.nonce <= CRC128_NONCE_MAX) +- type = BCH_EXTENT_ENTRY_crc128; +- else +- BUG(); ++void bch2_bkey_append_ptr(struct 
bkey_i *k, ++ struct bch_extent_ptr ptr) ++{ ++ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + +- bch2_extent_crc_pack(crc, new, type); ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_extent: ++ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + +- k->k.u64s += extent_entry_u64s(ptrs.end); ++ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + +- EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); ++ memcpy((void *) &k->v + bkey_val_bytes(&k->k), ++ &ptr, ++ sizeof(ptr)); ++ k->u64s++; ++ break; ++ default: ++ BUG(); ++ } + } + + static inline void __extent_entry_insert(struct bkey_i *k, +@@ -1011,6 +800,107 @@ found: + } + } + ++static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, ++ union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *i = ptrs.start; ++ ++ if (i == entry) ++ return NULL; ++ ++ while (extent_entry_next(i) != entry) ++ i = extent_entry_next(i); ++ return i; ++} ++ ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *dst, *src, *prev; ++ bool drop_crc = true; ++ ++ EBUG_ON(ptr < &ptrs.start->ptr || ++ ptr >= &ptrs.end->ptr); ++ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); ++ ++ src = extent_entry_next(to_entry(ptr)); ++ if (src != ptrs.end && ++ !extent_entry_is_crc(src)) ++ drop_crc = false; ++ ++ dst = to_entry(ptr); ++ while ((prev = extent_entry_prev(ptrs, dst))) { ++ if (extent_entry_is_ptr(prev)) ++ break; ++ ++ if (extent_entry_is_crc(prev)) { ++ if (drop_crc) ++ dst = prev; ++ break; ++ } ++ ++ dst = prev; ++ } ++ ++ memmove_u64s_down(dst, src, ++ (u64 *) ptrs.end - (u64 *) src); ++ k.k->u64s -= (u64 *) src - (u64 *) dst; ++ ++ return dst; ++} ++ ++void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); ++} ++ ++const struct bch_extent_ptr * ++bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (ptr->dev == dev) ++ return ptr; ++ ++ return NULL; ++} ++ ++bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (bch2_dev_in_target(c, ptr->dev, target) && ++ (!ptr->cached || ++ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) ++ return true; ++ ++ return false; ++} ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_extent_ptr m, u64 offset) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (p.ptr.dev == m.dev && ++ p.ptr.gen == m.gen && ++ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == ++ (s64) m.offset - offset) ++ return true; ++ ++ return false; ++} ++ + /* + * bch_extent_normalize - clean up an extent, dropping stale pointers etc. 
+ * +@@ -1028,245 +918,307 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + + /* will only happen if all pointers were cached: */ +- if (!bkey_val_u64s(k.k)) ++ if (!bch2_bkey_nr_ptrs(k.s_c)) + k.k->type = KEY_TYPE_discard; + + return bkey_whiteout(k.k); + } + +-void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, +- unsigned target, +- unsigned nr_desired_replicas) ++void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) + { +- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); +- union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ const struct bch_extent_ptr *ptr; ++ const struct bch_extent_stripe_ptr *ec; ++ struct bch_dev *ca; ++ bool first = true; + +- if (target && extra > 0) +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- int n = bch2_extent_ptr_durability(c, p); ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (!first) ++ pr_buf(out, " "); + +- if (n && n <= extra && +- !bch2_dev_in_target(c, p.ptr.dev, target)) { +- entry->ptr.cached = true; +- extra -= n; +- } +- } ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ ptr = entry_to_ptr(entry); ++ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ? bch_dev_bkey_exists(c, ptr->dev) ++ : NULL; + +- if (extra > 0) +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- int n = bch2_extent_ptr_durability(c, p); ++ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, ++ (u64) ptr->offset, ptr->gen, ++ ptr->cached ? " cached" : "", ++ ca && ptr_stale(ca, ptr) ++ ? 
" stale" : ""); ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + +- if (n && n <= extra) { +- entry->ptr.cached = true; +- extra -= n; +- } ++ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", ++ crc.compressed_size, ++ crc.uncompressed_size, ++ crc.offset, crc.nonce, ++ crc.csum_type, ++ crc.compression_type); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ec = &entry->stripe_ptr; ++ ++ pr_buf(out, "ec: idx %llu block %u", ++ (u64) ec->idx, ec->block); ++ break; ++ default: ++ pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); ++ return; + } ++ ++ first = false; ++ } + } + +-enum merge_result bch2_extent_merge(struct bch_fs *c, +- struct bkey_s _l, struct bkey_s _r) ++static const char *extent_ptr_invalid(const struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ unsigned size_ondisk, ++ bool metadata) + { +- struct bkey_s_extent l = bkey_s_to_extent(_l); +- struct bkey_s_extent r = bkey_s_to_extent(_r); +- union bch_extent_entry *en_l = l.v->start; +- union bch_extent_entry *en_r = r.v->start; +- struct bch_extent_crc_unpacked crc_l, crc_r; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr2; ++ struct bch_dev *ca; + +- if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) +- return BCH_MERGE_NOMERGE; ++ if (!bch2_dev_exists2(c, ptr->dev)) ++ return "pointer to invalid device"; + +- crc_l = bch2_extent_crc_unpack(l.k, NULL); ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!ca) ++ return "pointer to invalid device"; + +- extent_for_each_entry(l, en_l) { +- en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ bkey_for_each_ptr(ptrs, ptr2) ++ if (ptr != ptr2 && ptr->dev == ptr2->dev) ++ return "multiple pointers to same device"; + +- if (extent_entry_type(en_l) != extent_entry_type(en_r)) +- return BCH_MERGE_NOMERGE; ++ if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) ++ return "offset past end of device"; + +- switch (extent_entry_type(en_l)) { +- case BCH_EXTENT_ENTRY_ptr: { +- const struct bch_extent_ptr *lp = &en_l->ptr; +- const struct bch_extent_ptr *rp = &en_r->ptr; +- struct bch_dev *ca; ++ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) ++ return "offset before first bucket"; + +- if (lp->offset + crc_l.compressed_size != rp->offset || +- lp->dev != rp->dev || +- lp->gen != rp->gen) +- return BCH_MERGE_NOMERGE; ++ if (bucket_remainder(ca, ptr->offset) + ++ size_ondisk > ca->mi.bucket_size) ++ return "spans multiple buckets"; + +- /* We don't allow extents to straddle buckets: */ +- ca = bch_dev_bkey_exists(c, lp->dev); ++ return NULL; ++} + +- if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) +- return BCH_MERGE_NOMERGE; ++const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ unsigned size_ondisk = k.k->size; ++ const char *reason; ++ unsigned nonce = UINT_MAX; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr) ++ size_ondisk = c->opts.btree_node_size; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) ++ return "invalid extent entry type"; + +- break; +- } +- case BCH_EXTENT_ENTRY_stripe_ptr: +- if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || +- en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) +- return 
BCH_MERGE_NOMERGE; ++ if (k.k->type == KEY_TYPE_btree_ptr && ++ !extent_entry_is_ptr(entry)) ++ return "has non ptr field"; ++ ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ reason = extent_ptr_invalid(c, k, &entry->ptr, ++ size_ondisk, false); ++ if (reason) ++ return reason; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: +- crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); +- crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); +- +- if (crc_l.csum_type != crc_r.csum_type || +- crc_l.compression_type != crc_r.compression_type || +- crc_l.nonce != crc_r.nonce) +- return BCH_MERGE_NOMERGE; +- +- if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || +- crc_r.offset) +- return BCH_MERGE_NOMERGE; ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + +- if (!bch2_checksum_mergeable(crc_l.csum_type)) +- return BCH_MERGE_NOMERGE; ++ if (crc.offset + crc.live_size > ++ crc.uncompressed_size) ++ return "checksum offset + key size > uncompressed size"; + +- if (crc_l.compression_type) +- return BCH_MERGE_NOMERGE; ++ size_ondisk = crc.compressed_size; + +- if (crc_l.csum_type && +- crc_l.uncompressed_size + +- crc_r.uncompressed_size > c->sb.encoded_extent_max) +- return BCH_MERGE_NOMERGE; ++ if (!bch2_checksum_type_valid(c, crc.csum_type)) ++ return "invalid checksum type"; + +- if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > +- bch2_crc_field_size_max[extent_entry_type(en_l)]) +- return BCH_MERGE_NOMERGE; ++ if (crc.compression_type >= BCH_COMPRESSION_NR) ++ return "invalid compression type"; + ++ if (bch2_csum_type_is_encryption(crc.csum_type)) { ++ if (nonce == UINT_MAX) ++ nonce = crc.offset + crc.nonce; ++ else if (nonce != crc.offset + crc.nonce) ++ return "incorrect nonce"; ++ } ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: + break; +- default: +- return BCH_MERGE_NOMERGE; + } + } + +- extent_for_each_entry(l, en_l) { +- struct bch_extent_crc_unpacked crc_l, crc_r; +- +- en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); +- +- if (!extent_entry_is_crc(en_l)) +- continue; +- +- crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); +- crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ return NULL; ++} + +- crc_l.csum = bch2_checksum_merge(crc_l.csum_type, +- crc_l.csum, +- crc_r.csum, +- crc_r.uncompressed_size << 9); ++void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) ++{ ++ union bch_extent_entry *entry; ++ u64 *d = (u64 *) bkeyp_val(f, k); ++ unsigned i; + +- crc_l.uncompressed_size += crc_r.uncompressed_size; +- crc_l.compressed_size += crc_r.compressed_size; ++ for (i = 0; i < bkeyp_val_u64s(f, k); i++) ++ d[i] = swab64(d[i]); + +- bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, +- extent_entry_type(en_l)); ++ for (entry = (union bch_extent_entry *) d; ++ entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); ++ entry = extent_entry_next(entry)) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.csum = swab32(entry->crc32.csum); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); ++ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.csum.hi = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.hi); ++ entry->crc128.csum.lo = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ 
break; ++ } + } +- +- bch2_key_resize(l.k, l.k->size + r.k->size); +- +- return BCH_MERGE_MERGE; + } + +-bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, +- unsigned nr_replicas) ++/* Generic extent code: */ ++ ++int bch2_cut_front_s(struct bpos where, struct bkey_s k) + { +- struct btree_trans trans; +- struct btree_iter *iter; +- struct bpos end = pos; +- struct bkey_s_c k; +- bool ret = true; +- int err; ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 sub; + +- end.offset += size; ++ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) ++ return 0; + +- bch2_trans_init(&trans, c, 0, 0); ++ EBUG_ON(bkey_cmp(where, k.k->p) > 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, +- BTREE_ITER_SLOTS, k, err) { +- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) +- break; ++ sub = where.offset - bkey_start_offset(k.k); + +- if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) { +- ret = false; +- break; +- } ++ k.k->size -= sub; ++ ++ if (!k.k->size) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; + } +- bch2_trans_exit(&trans); + +- return ret; +-} ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: { ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ bool seen_crc = false; + +-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) +-{ +- unsigned ret = 0; ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ if (!seen_crc) ++ entry->ptr.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } + +- switch (k.k->type) { +- case KEY_TYPE_extent: { +- struct bkey_s_c_extent e = bkey_s_c_to_extent(k); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; ++ if (extent_entry_is_crc(entry)) ++ seen_crc = true; ++ } + +- extent_for_each_ptr_decode(e, p, entry) +- ret += !p.ptr.cached && +- p.crc.compression_type == BCH_COMPRESSION_NONE; + break; + } +- case KEY_TYPE_reservation: +- ret = bkey_s_c_to_reservation(k).v->nr_replicas; ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); ++ ++ le64_add_cpu(&p.v->idx, sub); + break; + } ++ case KEY_TYPE_inline_data: { ++ struct bkey_s_inline_data d = bkey_s_to_inline_data(k); + +- return ret; +-} +- +-/* KEY_TYPE_reservation: */ ++ sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); + +-const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) +-{ +- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ memmove(d.v->data, ++ d.v->data + sub, ++ bkey_val_bytes(d.k) - sub); + +- if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) +- return "incorrect value size"; ++ new_val_u64s -= sub >> 3; ++ break; ++ } ++ } + +- if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) +- return "invalid nr_replicas"; ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); + +- return NULL; ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; + } + +-void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, +- struct bkey_s_c k) ++int bch2_cut_back_s(struct bpos where, struct bkey_s k) + { +- struct bkey_s_c_reservation r = 
bkey_s_c_to_reservation(k); ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 len = 0; + +- pr_buf(out, "generation %u replicas %u", +- le32_to_cpu(r.v->generation), +- r.v->nr_replicas); +-} ++ if (bkey_cmp(where, k.k->p) >= 0) ++ return 0; + +-enum merge_result bch2_reservation_merge(struct bch_fs *c, +- struct bkey_s _l, struct bkey_s _r) +-{ +- struct bkey_s_reservation l = bkey_s_to_reservation(_l); +- struct bkey_s_reservation r = bkey_s_to_reservation(_r); ++ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); + +- if (l.v->generation != r.v->generation || +- l.v->nr_replicas != r.v->nr_replicas) +- return BCH_MERGE_NOMERGE; ++ len = where.offset - bkey_start_offset(k.k); + +- if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { +- bch2_key_resize(l.k, KEY_SIZE_MAX); +- bch2_cut_front_s(l.k->p, r.s); +- return BCH_MERGE_PARTIAL; ++ k.k->p = where; ++ k.k->size = len; ++ ++ if (!len) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; + } + +- bch2_key_resize(l.k, l.k->size + r.k->size); ++ switch (k.k->type) { ++ case KEY_TYPE_inline_data: ++ new_val_u64s = min(new_val_u64s, k.k->size << 6); ++ break; ++ } + +- return BCH_MERGE_MERGE; ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; + } +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 35a66d4f4ea2..1140d01a42ab 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -40,6 +40,9 @@ struct btree_insert_entry; + (union bch_extent_entry *) (_entry)); \ + }) + ++#define extent_entry_next(_entry) \ ++ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) ++ + static inline unsigned + __extent_entry_type(const union bch_extent_entry *e) + { +@@ -185,10 +188,52 @@ struct bkey_ptrs { + union bch_extent_entry *end; + }; + +-/* iterate over bkey ptrs */ ++static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: { ++ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ return (struct bkey_ptrs_c) { ++ e.v->start, ++ extent_entry_last(e) ++ }; ++ } ++ case KEY_TYPE_stripe: { ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&s.v->ptrs[0]), ++ to_entry(&s.v->ptrs[s.v->nr_blocks]), ++ }; ++ } ++ case KEY_TYPE_reflink_v: { ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + +-#define extent_entry_next(_entry) \ +- ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) ++ return (struct bkey_ptrs_c) { ++ r.v->start, ++ bkey_val_end(r), ++ }; ++ } ++ default: ++ return (struct bkey_ptrs_c) { NULL, NULL }; ++ } ++} ++ ++static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ++{ ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); ++ ++ return (struct bkey_ptrs) { ++ (void *) p.start, ++ (void *) p.end ++ }; ++} + + #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ + for ((_entry) = (_start); \ +@@ -281,96 +326,26 @@ out: \ + #define bkey_for_each_crc(_k, _p, _crc, _iter) \ + __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) + +-/* utility code common to all keys with pointers: */ +- +-static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) +-{ +- 
switch (k.k->type) { +- case KEY_TYPE_btree_ptr: { +- struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); +- return (struct bkey_ptrs_c) { +- to_entry(&e.v->start[0]), +- to_entry(extent_entry_last(e)) +- }; +- } +- case KEY_TYPE_extent: { +- struct bkey_s_c_extent e = bkey_s_c_to_extent(k); +- return (struct bkey_ptrs_c) { +- e.v->start, +- extent_entry_last(e) +- }; +- } +- case KEY_TYPE_stripe: { +- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); +- return (struct bkey_ptrs_c) { +- to_entry(&s.v->ptrs[0]), +- to_entry(&s.v->ptrs[s.v->nr_blocks]), +- }; +- } +- case KEY_TYPE_reflink_v: { +- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); +- +- return (struct bkey_ptrs_c) { +- r.v->start, +- bkey_val_end(r), +- }; +- } +- default: +- return (struct bkey_ptrs_c) { NULL, NULL }; +- } +-} +- +-static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) +-{ +- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); ++/* Iterate over pointers in KEY_TYPE_extent: */ + +- return (struct bkey_ptrs) { +- (void *) p.start, +- (void *) p.end +- }; +-} +- +-static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) +-{ +- struct bch_devs_list ret = (struct bch_devs_list) { 0 }; +- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr; +- +- bkey_for_each_ptr(p, ptr) +- ret.devs[ret.nr++] = ptr->dev; +- +- return ret; +-} +- +-static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) +-{ +- struct bch_devs_list ret = (struct bch_devs_list) { 0 }; +- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr; +- +- bkey_for_each_ptr(p, ptr) +- if (!ptr->cached) +- ret.devs[ret.nr++] = ptr->dev; ++#define extent_for_each_entry_from(_e, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, \ ++ extent_entry_last(_e),_entry) + +- return ret; +-} ++#define extent_for_each_entry(_e, _entry) \ ++ extent_for_each_entry_from(_e, _entry, (_e).v->start) + +-static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) +-{ +- struct bch_devs_list ret = (struct bch_devs_list) { 0 }; +- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr; ++#define extent_ptr_next(_e, _ptr) \ ++ __bkey_ptr_next(_ptr, extent_entry_last(_e)) + +- bkey_for_each_ptr(p, ptr) +- if (ptr->cached) +- ret.devs[ret.nr++] = ptr->dev; ++#define extent_for_each_ptr(_e, _ptr) \ ++ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) + +- return ret; +-} ++#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ ++ extent_entry_last(_e), _ptr, _entry) + +-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); +-unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c); +-unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); ++/* utility code common to all keys with pointers: */ + + void bch2_mark_io_failure(struct bch_io_failures *, + struct extent_ptr_decoded *); +@@ -378,22 +353,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + struct bch_io_failures *, + struct extent_ptr_decoded *); + +-void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); +-void bch2_bkey_drop_device(struct bkey_s, unsigned); +-const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); +-bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); +- +-void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, +- struct bkey_s_c); +-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); +- 
+-/* bch_btree_ptr: */ ++/* KEY_TYPE_btree_ptr: */ + + const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +-void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); + + #define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ +@@ -402,12 +367,11 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); + .swab = bch2_ptr_swab, \ + } + +-/* bch_extent: */ ++/* KEY_TYPE_extent: */ + + const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); + void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); + enum merge_result bch2_extent_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); + +@@ -420,7 +384,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *, + .key_merge = bch2_extent_merge, \ + } + +-/* bch_reservation: */ ++/* KEY_TYPE_reservation: */ + + const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +@@ -433,13 +397,15 @@ enum merge_result bch2_reservation_merge(struct bch_fs *, + .key_merge = bch2_reservation_merge, \ + } + +-void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, +- unsigned, unsigned); ++/* Extent checksum entries: */ + +-unsigned bch2_extent_is_compressed(struct bkey_s_c); ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c, ++ struct bch_extent_crc_unpacked); ++bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); ++void bch2_extent_crc_append(struct bkey_i *, ++ struct bch_extent_crc_unpacked); + +-bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, +- struct bch_extent_ptr, u64); ++/* Generic code for keys with pointers: */ + + static inline bool bkey_extent_is_direct_data(const struct bkey *k) + { +@@ -477,34 +443,57 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) + } + } + +-/* Extent entry iteration: */ ++static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; + +-#define extent_for_each_entry_from(_e, _entry, _start) \ +- __bkey_extent_entry_for_each_from(_start, \ +- extent_entry_last(_e),_entry) ++ bkey_for_each_ptr(p, ptr) ++ ret.devs[ret.nr++] = ptr->dev; + +-#define extent_for_each_entry(_e, _entry) \ +- extent_for_each_entry_from(_e, _entry, (_e).v->start) ++ return ret; ++} + +-#define extent_ptr_next(_e, _ptr) \ +- __bkey_ptr_next(_ptr, extent_entry_last(_e)) ++static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; + +-#define extent_for_each_ptr(_e, _ptr) \ +- __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) ++ bkey_for_each_ptr(p, ptr) ++ if (!ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; + +-#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ +- __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ +- extent_entry_last(_e), _ptr, _entry) ++ return ret; ++} + +-void bch2_extent_crc_append(struct bkey_i *, +- struct 
bch_extent_crc_unpacked); +-void bch2_extent_ptr_decoded_append(struct bkey_i *, +- struct extent_ptr_decoded *); ++static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; + +-bool bch2_can_narrow_extent_crcs(struct bkey_s_c, +- struct bch_extent_crc_unpacked); +-bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); ++ bkey_for_each_ptr(p, ptr) ++ if (ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; + ++ return ret; ++} ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); ++bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); ++unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); ++ ++void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, ++ unsigned, unsigned); ++ ++void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); ++void bch2_extent_ptr_decoded_append(struct bkey_i *, ++ struct extent_ptr_decoded *); + union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, + struct bch_extent_ptr *); + +@@ -525,6 +514,22 @@ do { \ + } \ + } while (0) + ++void bch2_bkey_drop_device(struct bkey_s, unsigned); ++const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); ++bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, ++ struct bch_extent_ptr, u64); ++ ++bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); ++void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); ++ ++void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); ++ ++/* Generic extent code: */ ++ + int bch2_cut_front_s(struct bpos, struct bkey_s); + int bch2_cut_back_s(struct bpos, struct bkey_s); + +@@ -568,7 +573,4 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst, + BUG_ON(!bch2_bkey_pack_key(dst, src, f)); + } + +-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); +-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); +- + #endif /* _BCACHEFS_EXTENTS_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 6f0c7bea2ccc..2847706cb740 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -758,7 +758,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) + struct bvec_iter iter; + struct bio_vec bv; + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v +- ? 0 : bch2_bkey_nr_ptrs_allocated(k); ++ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = k.k->type == KEY_TYPE_reservation + ? 
SECTOR_RESERVED + : SECTOR_ALLOCATED; +@@ -2597,7 +2597,7 @@ reassemble: + } else { + /* We might end up splitting compressed extents: */ + unsigned nr_ptrs = +- bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(copy.k)); ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); + + ret = bch2_disk_reservation_get(c, &disk_res, + copy.k->k.size, nr_ptrs, +@@ -2723,7 +2723,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + bch2_cut_back(end_pos, &reservation.k_i); + + sectors = reservation.k.size; +- reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k); ++ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); + + if (!bkey_extent_is_allocation(k.k)) { + ret = bch2_quota_reservation_add(c, inode, +@@ -2734,7 +2734,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + } + + if (reservation.v.nr_replicas < replicas || +- bch2_extent_is_compressed(k)) { ++ bch2_bkey_sectors_compressed(k)) { + ret = bch2_disk_reservation_get(c, &disk_res, sectors, + replicas, 0); + if (unlikely(ret)) +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index a544ef7de31f..ca891b52706f 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -192,8 +192,8 @@ static int sum_sector_overwrites(struct btree_trans *trans, + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + if (!may_allocate && +- bch2_bkey_nr_ptrs_allocated(old) < +- bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) { ++ bch2_bkey_nr_ptrs_fully_allocated(old) < ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { + ret = -ENOSPC; + break; + } +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 15547e149b3e..acdc1730e218 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -135,11 +135,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + * If we're not fully overwriting @k, and it's compressed, we + * need a reservation for all the pointers in @insert + */ +- nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) - ++ nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - + m->nr_ptrs_reserved; + + if (insert->k.size < k.k->size && +- bch2_extent_is_compressed(k) && ++ bch2_bkey_sectors_compressed(k) && + nr > 0) { + ret = bch2_disk_reservation_add(c, &op->res, + keylist_sectors(keys) * nr, 0); +@@ -251,7 +251,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + */ + #if 0 + int nr = (int) io_opts.data_replicas - +- bch2_bkey_nr_dirty_ptrs(k); ++ bch2_bkey_nr_ptrs_allocated(k); + #endif + int nr = (int) io_opts.data_replicas; + +@@ -600,7 +600,7 @@ peek: + if (rate) + bch2_ratelimit_increment(rate, k.k->size); + next: +- atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k), ++ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), + &stats->sectors_seen); + next_nondata: + bch2_btree_iter_next(iter); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 9102a1ce1ec4..d4002b7fc917 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -254,7 +254,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, + * Some extents aren't equivalent - w.r.t. 
what the triggers do + * - if they're split: + */ +- bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) || ++ bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || + k->k.type == KEY_TYPE_reflink_p; + bool remark = false; + int ret; +@@ -289,7 +289,7 @@ retry: + bkey_cmp(atomic_end, k->k.p) < 0) { + ret = bch2_disk_reservation_add(c, &disk_res, + k->k.size * +- bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)), ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + +-- +cgit v1.2.3 + + +From 16947b43efcad1ead0fb9777ae7443bc9201b7b6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 20 Nov 2019 16:16:57 -0500 +Subject: bcachefs: kill ca->freelist_lock + +All uses were supposed to be switched over to c->freelist_lock +--- + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/movinggc.c | 4 ++-- + fs/bcachefs/super.c | 1 - + fs/bcachefs/sysfs.c | 4 ++-- + 4 files changed, 4 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index b1e1ceb61b73..ce6f74ff6581 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -427,7 +427,6 @@ struct bch_dev { + */ + alloc_fifo free[RESERVE_NR]; + alloc_fifo free_inc; +- spinlock_t freelist_lock; + + u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; + unsigned open_buckets_partial_nr; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 710296044194..abdeef20fde9 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -107,10 +107,10 @@ static bool have_copygc_reserve(struct bch_dev *ca) + { + bool ret; + +- spin_lock(&ca->freelist_lock); ++ spin_lock(&ca->fs->freelist_lock); + ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || + ca->allocator_state != ALLOCATOR_RUNNING; +- spin_unlock(&ca->freelist_lock); ++ spin_unlock(&ca->fs->freelist_lock); + + return ret; + } +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index a0406740ab4e..0188ddda5f6c 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1048,7 +1048,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + + writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); + +- spin_lock_init(&ca->freelist_lock); + bch2_dev_copygc_init(ca); + + INIT_WORK(&ca->io_error_work, bch2_io_error_work); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 27646c435e30..e7699afd99fc 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -775,7 +775,7 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) + struct printbuf out = _PBUF(buf, PAGE_SIZE); + enum alloc_reserve i; + +- spin_lock(&ca->freelist_lock); ++ spin_lock(&ca->fs->freelist_lock); + + pr_buf(&out, "free_inc:\t%zu\t%zu\n", + fifo_used(&ca->free_inc), +@@ -786,7 +786,7 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) + fifo_used(&ca->free[i]), + ca->free[i].size); + +- spin_unlock(&ca->freelist_lock); ++ spin_unlock(&ca->fs->freelist_lock); + + return out.pos - buf; + } +-- +cgit v1.2.3 + + +From e2ceb2318daa1d4233c0bd5c07fc8d3960506102 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 20 Nov 2019 14:51:30 -0500 +Subject: bcachefs: bkey_on_stack_reassemble() + +Small helper function. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_on_stack.h | 8 ++++++++ + fs/bcachefs/bkey_sort.c | 3 +-- + fs/bcachefs/ec.c | 3 +-- + fs/bcachefs/extent_update.c | 3 +-- + fs/bcachefs/fs-io.c | 6 ++---- + fs/bcachefs/io.c | 12 ++++-------- + fs/bcachefs/migrate.c | 3 +-- + fs/bcachefs/move.c | 3 +-- + fs/bcachefs/reflink.c | 3 +-- + 9 files changed, 20 insertions(+), 24 deletions(-) + +diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h +index d4739038323f..f607a0cb37ed 100644 +--- a/fs/bcachefs/bkey_on_stack.h ++++ b/fs/bcachefs/bkey_on_stack.h +@@ -19,6 +19,14 @@ static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, + } + } + ++static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, ++ struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bkey_on_stack_realloc(s, c, k.k->u64s); ++ bkey_reassemble(s->k, k); ++} ++ + static inline void bkey_on_stack_init(struct bkey_on_stack *s) + { + s->k = (void *) s->onstack; +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index daef8e5c599f..2e205db5433d 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -356,12 +356,11 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + + extent_sort_sift(iter, b, _r - iter->data); + } else if (bkey_cmp(l.k->p, r.k->p) > 0) { +- bkey_on_stack_realloc(&split, c, l.k->u64s); + + /* + * r wins, but it overlaps in the middle of l - split l: + */ +- bkey_reassemble(split.k, l.s_c); ++ bkey_on_stack_reassemble(&split, c, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), split.k); + + bch2_cut_front_s(r.k->p, l); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index e6bca0d4918b..3781838cda82 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -807,8 +807,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + + dev = s->key.v.ptrs[idx].dev; + +- bkey_on_stack_realloc(&sk, c, k.k->u64s); +- bkey_reassemble(sk.k, k); ++ bkey_on_stack_reassemble(&sk, c, k); + e = bkey_i_to_s_extent(sk.k); + + extent_for_each_ptr(e, ptr) { +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 91ceb5d53f92..742b4d78cb3a 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -347,7 +347,7 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, + struct bkey_on_stack split; + + bkey_on_stack_init(&split); +- bkey_on_stack_realloc(&split, c, k.k->u64s); ++ bkey_on_stack_reassemble(&split, c, k.s_c); + + /* + * The insert key falls 'in the middle' of k +@@ -363,7 +363,6 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, + * modify k _before_ doing the insert (which will move + * what k points to) + */ +- bkey_reassemble(split.k, k.s_c); + split.k->k.needs_whiteout |= bkey_written(l->b, _k); + + bch2_cut_back(bkey_start_pos(&insert->k), split.k); +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 2847706cb740..ed7a26ba2b48 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -863,8 +863,7 @@ retry: + if (ret) + break; + +- bkey_on_stack_realloc(&sk, c, k.k->u64s); +- bkey_reassemble(sk.k, k); ++ bkey_on_stack_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + offset_into_extent = iter->pos.offset - +@@ -2530,8 +2529,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) + break; + reassemble: +- bkey_on_stack_realloc(©, c, k.k->u64s); +- bkey_reassemble(copy.k, k); ++ bkey_on_stack_reassemble(©, c, k); + + if (insert && + bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { 
+diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index ca891b52706f..f483312acd0d 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1536,8 +1536,7 @@ retry: + if (bkey_err(k)) + goto err; + +- bkey_on_stack_realloc(&sk, c, k.k->u64s); +- bkey_reassemble(sk.k, k); ++ bkey_on_stack_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + +@@ -1588,8 +1587,7 @@ retry: + BTREE_ITER_SLOTS, k, ret) { + unsigned bytes, sectors, offset_into_extent; + +- bkey_on_stack_realloc(&sk, c, k.k->u64s); +- bkey_reassemble(sk.k, k); ++ bkey_on_stack_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + offset_into_extent = iter->pos.offset - +@@ -1712,8 +1710,7 @@ retry: + if (IS_ERR_OR_NULL(k.k)) + goto out; + +- bkey_on_stack_realloc(&new, c, k.k->u64s); +- bkey_reassemble(new.k, k); ++ bkey_on_stack_reassemble(&new, c, k); + k = bkey_i_to_s_c(new.k); + + if (bversion_cmp(k.k->version, rbio->version) || +@@ -2220,8 +2217,7 @@ retry: + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + +- bkey_on_stack_realloc(&sk, c, k.k->u64s); +- bkey_reassemble(sk.k, k); ++ bkey_on_stack_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + ret = bch2_read_indirect_extent(&trans, +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 4dacbd637d02..4b59dcd04cce 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -60,8 +60,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + continue; + } + +- bkey_on_stack_realloc(&sk, c, k.k->u64s); +- bkey_reassemble(sk.k, k); ++ bkey_on_stack_reassemble(&sk, c, k); + + ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), + dev_idx, flags, false); +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index acdc1730e218..fad3cc4d587c 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -579,8 +579,7 @@ peek: + } + + /* unlock before doing IO: */ +- bkey_on_stack_realloc(&sk, c, k.k->u64s); +- bkey_reassemble(sk.k, k); ++ bkey_on_stack_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 4de65bf70362..2812fa305c0e 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -225,8 +225,7 @@ s64 bch2_remap_range(struct bch_fs *c, + break; + + if (src_k.k->type == KEY_TYPE_extent) { +- bkey_on_stack_realloc(&new_src, c, src_k.k->u64s); +- bkey_reassemble(new_src.k, src_k); ++ bkey_on_stack_reassemble(&new_src, c, src_k); + src_k = bkey_i_to_s_c(new_src.k); + + bch2_cut_front(src_iter->pos, new_src.k); +-- +cgit v1.2.3 + + +From d25dd681b70841272a88de7192793f47cd36fef1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Nov 2019 17:09:32 -0500 +Subject: bcachefs: Switch to macro for bkey_ops + +Older versions of gcc refuse to compile it the other way + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 5312184c37f7..ed448fad83c5 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -75,10 +75,10 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); + } + +-static const struct bkey_ops bch2_bkey_ops_inline_data = { +- .key_invalid = key_type_inline_data_invalid, +- .val_to_text = key_type_inline_data_to_text, +-}; ++#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ ++ .key_invalid = 
key_type_inline_data_invalid, \ ++ .val_to_text = key_type_inline_data_to_text, \ ++} + + static const struct bkey_ops bch2_bkey_ops[] = { + #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, +-- +cgit v1.2.3 + + +From 52a639d303f90425ca886f2312a48a8d680390a6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 29 Nov 2019 13:47:42 -0500 +Subject: bcachefs: bch2_check_set_feature() + +New helper function for setting incompatible feature bits + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 2 ++ + fs/bcachefs/opts.c | 11 ++--------- + fs/bcachefs/recovery.c | 6 ------ + fs/bcachefs/reflink.c | 11 +---------- + fs/bcachefs/super-io.c | 11 +++++++++++ + fs/bcachefs/super-io.h | 27 +++++++-------------------- + 6 files changed, 23 insertions(+), 45 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index f483312acd0d..a88df9f6ac9b 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1139,6 +1139,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) + unsigned sectors; + int ret; + ++ bch2_check_set_feature(op->c, BCH_FEATURE_INLINE_DATA); ++ + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_U64s + DIV_ROUND_UP(data_len, 8)); +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index 13a9a2fcd575..cbacd2f36799 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -299,15 +299,8 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) + ret = bch2_check_set_has_compressed_data(c, v); + break; + case Opt_erasure_code: +- if (v && +- !(c->sb.features & (1ULL << BCH_FEATURE_EC))) { +- mutex_lock(&c->sb_lock); +- c->disk_sb.sb->features[0] |= +- cpu_to_le64(1ULL << BCH_FEATURE_EC); +- +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); +- } ++ if (v) ++ bch2_check_set_feature(c, BCH_FEATURE_EC); + break; + } + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index d4002b7fc917..e6b51131cff2 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -913,12 +913,6 @@ int bch2_fs_recovery(struct bch_fs *c) + write_sb = true; + } + +- if (!(c->sb.features & (1ULL << BCH_FEATURE_INLINE_DATA))) { +- c->disk_sb.sb->features[0] |= +- cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA); +- write_sb = true; +- } +- + if (!test_bit(BCH_FS_ERROR, &c->flags)) { + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + write_sb = true; +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 2812fa305c0e..53bd0e0ea058 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -171,16 +171,7 @@ s64 bch2_remap_range(struct bch_fs *c, + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; + +- if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { +- mutex_lock(&c->sb_lock); +- if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { +- c->disk_sb.sb->features[0] |= +- cpu_to_le64(1ULL << BCH_FEATURE_REFLINK); +- +- bch2_write_super(c); +- } +- mutex_unlock(&c->sb_lock); +- } ++ bch2_check_set_feature(c, BCH_FEATURE_REFLINK); + + dst_end.offset += remap_sectors; + src_end.offset += remap_sectors; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 7e9c1f9c850c..1d592856dea1 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -792,6 +792,17 @@ out: + return ret; + } + ++void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ mutex_lock(&c->sb_lock); ++ if (!(c->sb.features & (1ULL << feat))) { ++ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); ++ ++ 
bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++} ++ + /* BCH_SB_FIELD_journal: */ + + static int u64_cmp(const void *_l, const void *_r) +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +index f5450e596c62..7a068158efca 100644 +--- a/fs/bcachefs/super-io.h ++++ b/fs/bcachefs/super-io.h +@@ -43,26 +43,6 @@ struct bch_sb_field_ops { + struct bch_sb_field *); + }; + +-static inline bool bch2_sb_test_feature(struct bch_sb *sb, +- enum bch_sb_features f) +-{ +- unsigned w = f / 64; +- unsigned b = f % 64; +- +- return le64_to_cpu(sb->features[w]) & (1ULL << b); +-} +- +-static inline void bch2_sb_set_feature(struct bch_sb *sb, +- enum bch_sb_features f) +-{ +- if (!bch2_sb_test_feature(sb, f)) { +- unsigned w = f / 64; +- unsigned b = f % 64; +- +- le64_add_cpu(&sb->features[w], 1ULL << b); +- } +-} +- + static inline __le64 bch2_sb_magic(struct bch_fs *c) + { + __le64 ret; +@@ -90,6 +70,13 @@ const char *bch2_sb_validate(struct bch_sb_handle *); + + int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); + int bch2_write_super(struct bch_fs *); ++void __bch2_check_set_feature(struct bch_fs *, unsigned); ++ ++static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ if (!(c->sb.features & (1ULL << feat))) ++ __bch2_check_set_feature(c, feat); ++} + + /* BCH_SB_FIELD_journal: */ + +-- +cgit v1.2.3 + + +From 4a4edfd28d36f3766a76a38f2c1a6e4086581ec7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 29 Nov 2019 13:48:09 -0500 +Subject: bcachefs: Put inline data behind a mount option for now + +Inline data extents + reflink is still broken + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 3 ++- + fs/bcachefs/opts.h | 5 +++++ + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index a88df9f6ac9b..17ea38e42ae8 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1222,7 +1222,8 @@ void bch2_write(struct closure *cl) + data_len = min_t(u64, bio->bi_iter.bi_size, + op->new_i_size - (op->pos.offset << 9)); + +- if (data_len <= min(block_bytes(c) / 2, 1024U)) { ++ if (c->opts.inline_data && ++ data_len <= min(block_bytes(c) / 2, 1024U)) { + bch2_write_data_inline(op, data_len); + return; + } +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 0ec0999a6214..1f11f4152a6f 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -181,6 +181,11 @@ enum opt_type { + OPT_BOOL(), \ + BCH_SB_128_BIT_MACS, false, \ + NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ ++ x(inline_data, u8, \ ++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Enable inline data extents") \ + x(acl, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ +-- +cgit v1.2.3 + + +From 4d1186c1051212d3e5ec1ba093a4eb9d4e8b036e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Nov 2019 17:18:23 -0500 +Subject: bcachefs: Fix bch2_verify_insert_pos() + +We were calling __btree_node_key_to_offset() on a key that wasn't in the +btree node. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 14 ++++++-------- + 1 file changed, 6 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index b3e3a9c0ea13..4892c002214e 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -253,10 +253,9 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + bch2_bkey_to_text(&PBUF(buf2), &k2); + + panic("prev > insert:\n" +- "prev key %5u %s\n" +- "insert key %5u %s\n", +- __btree_node_key_to_offset(b, prev), buf1, +- __btree_node_key_to_offset(b, insert), buf2); ++ "prev key %s\n" ++ "insert key %s\n", ++ buf1, buf2); + } + #endif + #if 0 +@@ -275,10 +274,9 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + bch2_bkey_to_text(&PBUF(buf2), &k2); + + panic("insert > next:\n" +- "insert key %5u %s\n" +- "next key %5u %s\n", +- __btree_node_key_to_offset(b, insert), buf1, +- __btree_node_key_to_offset(b, next), buf2); ++ "insert key %s\n" ++ "next key %s\n", ++ buf1, buf2); + } + #endif + } +-- +cgit v1.2.3 + + +From 6744cf580d38f05d369867a6399c595cd3d80bbf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 20 Nov 2019 16:22:49 -0500 +Subject: bcachefs: Always emit new extents on partial overwrite + +This is prep work for snapshots: the algorithm in +bch2_extent_sort_fix_overlapping() will break when we have multiple +overlapping extents in unrelated snapshots - but, we'll be able to make +extents work like regular keys and use bch2_key_sort_fix_overlapping() +for extent btree nodes if we make a couple changes - the main one being +to always emit new extents when we partially overwrite an existing +(written) extent. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extent_update.c | 293 +++++++++++++++++++------------------------- + 1 file changed, 125 insertions(+), 168 deletions(-) + +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 742b4d78cb3a..e021e1623a91 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -171,49 +171,51 @@ bch2_extent_can_insert(struct btree_trans *trans, + { + struct btree_iter_level *l = &insert->iter->l[0]; + struct btree_node_iter node_iter = l->iter; +- enum bch_extent_overlap overlap; + struct bkey_packed *_k; + struct bkey unpacked; +- struct bkey_s_c k; + int sectors; + +- /* +- * We avoid creating whiteouts whenever possible when deleting, but +- * those optimizations mean we may potentially insert two whiteouts +- * instead of one (when we overlap with the front of one extent and the +- * back of another): +- */ +- if (bkey_whiteout(&insert->k->k)) +- *u64s += BKEY_U64s; +- +- _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, +- KEY_TYPE_discard); +- if (!_k) +- return BTREE_INSERT_OK; +- +- k = bkey_disassemble(l->b, _k, &unpacked); +- +- overlap = bch2_extent_overlap(&insert->k->k, k.k); +- +- /* account for having to split existing extent: */ +- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) +- *u64s += _k->u64s; +- +- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && +- (sectors = bch2_bkey_sectors_compressed(k))) { +- int flags = trans->flags & BTREE_INSERT_NOFAIL +- ? 
BCH_DISK_RESERVATION_NOFAIL : 0; +- +- switch (bch2_disk_reservation_add(trans->c, +- trans->disk_res, +- sectors, flags)) { +- case 0: ++ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, ++ KEY_TYPE_discard))) { ++ struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked); ++ enum bch_extent_overlap overlap = ++ bch2_extent_overlap(&insert->k->k, k.k); ++ ++ if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0) + break; +- case -ENOSPC: +- return BTREE_INSERT_ENOSPC; +- default: +- BUG(); ++ ++ overlap = bch2_extent_overlap(&insert->k->k, k.k); ++ ++ if (bkey_written(l->b, _k) && ++ overlap != BCH_EXTENT_OVERLAP_ALL) ++ *u64s += _k->u64s; ++ ++ /* account for having to split existing extent: */ ++ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) ++ *u64s += _k->u64s; ++ ++ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && ++ (sectors = bch2_bkey_sectors_compressed(k))) { ++ int flags = trans->flags & BTREE_INSERT_NOFAIL ++ ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ ++ switch (bch2_disk_reservation_add(trans->c, ++ trans->disk_res, ++ sectors, flags)) { ++ case 0: ++ break; ++ case -ENOSPC: ++ return BTREE_INSERT_ENOSPC; ++ default: ++ BUG(); ++ } + } ++ ++ if (overlap == BCH_EXTENT_OVERLAP_FRONT || ++ overlap == BCH_EXTENT_OVERLAP_MIDDLE) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, l->b); + } + + return BTREE_INSERT_OK; +@@ -285,101 +287,106 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + } + + static void +-extent_squash(struct bch_fs *c, struct btree_iter *iter, +- struct bkey_i *insert, +- struct bkey_packed *_k, struct bkey_s k, +- enum bch_extent_overlap overlap) ++extent_drop(struct bch_fs *c, struct btree_iter *iter, ++ struct bkey_packed *_k, struct bkey_s k) + { + struct btree_iter_level *l = &iter->l[0]; +- int u64s_delta; + +- switch (overlap) { +- case BCH_EXTENT_OVERLAP_FRONT: +- /* insert overlaps with start of k: */ +- u64s_delta = bch2_cut_front_s(insert->k.p, k); +- btree_keys_account_val_delta(l->b, _k, u64s_delta); ++ if (!bkey_whiteout(k.k)) ++ btree_account_key_drop(l->b, _k); + +- EBUG_ON(bkey_deleted(k.k)); +- extent_save(l->b, _k, k.k); +- bch2_btree_iter_fix_key_modified(iter, l->b, _k); +- break; ++ k.k->size = 0; ++ k.k->type = KEY_TYPE_deleted; ++ k.k->needs_whiteout = false; + +- case BCH_EXTENT_OVERLAP_BACK: +- /* insert overlaps with end of k: */ +- u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); +- btree_keys_account_val_delta(l->b, _k, u64s_delta); ++ if (_k >= btree_bset_last(l->b)->start) { ++ unsigned u64s = _k->u64s; + +- EBUG_ON(bkey_deleted(k.k)); ++ bch2_bset_delete(l->b, _k, _k->u64s); ++ bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); ++ } else { + extent_save(l->b, _k, k.k); ++ bch2_btree_iter_fix_key_modified(iter, l->b, _k); ++ } ++} + +- /* +- * As the auxiliary tree is indexed by the end of the +- * key and we've just changed the end, update the +- * auxiliary tree. 
+- */ +- bch2_bset_fix_invalidated_key(l->b, _k); +- bch2_btree_node_iter_fix(iter, l->b, &l->iter, +- _k, _k->u64s, _k->u64s); +- break; +- +- case BCH_EXTENT_OVERLAP_ALL: { +- /* The insert key completely covers k, invalidate k */ +- if (!bkey_whiteout(k.k)) +- btree_account_key_drop(l->b, _k); ++static void ++extent_squash(struct bch_fs *c, struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bkey_packed *_k, struct bkey_s k, ++ enum bch_extent_overlap overlap) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_on_stack tmp, split; + +- k.k->size = 0; +- k.k->type = KEY_TYPE_deleted; ++ bkey_on_stack_init(&tmp); ++ bkey_on_stack_init(&split); + +- if (_k >= btree_bset_last(l->b)->start) { +- unsigned u64s = _k->u64s; ++ switch (overlap) { ++ case BCH_EXTENT_OVERLAP_FRONT: ++ if (bkey_written(l->b, _k)) { ++ bkey_on_stack_reassemble(&tmp, c, k.s_c); ++ bch2_cut_front(insert->k.p, tmp.k); + +- bch2_bset_delete(l->b, _k, _k->u64s); +- bch2_btree_node_iter_fix(iter, l->b, &l->iter, +- _k, u64s, 0); ++ extent_drop(c, iter, _k, k); ++ extent_bset_insert(c, iter, tmp.k); + } else { ++ btree_keys_account_val_delta(l->b, _k, ++ bch2_cut_front_s(insert->k.p, k)); ++ + extent_save(l->b, _k, k.k); ++ /* ++ * No need to call bset_fix_invalidated_key, start of ++ * extent changed but extents are indexed by where they ++ * end ++ */ + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + } +- + break; +- } +- case BCH_EXTENT_OVERLAP_MIDDLE: { +- struct bkey_on_stack split; +- +- bkey_on_stack_init(&split); +- bkey_on_stack_reassemble(&split, c, k.s_c); ++ case BCH_EXTENT_OVERLAP_BACK: ++ if (bkey_written(l->b, _k)) { ++ bkey_on_stack_reassemble(&tmp, c, k.s_c); ++ bch2_cut_back(bkey_start_pos(&insert->k), tmp.k); + +- /* +- * The insert key falls 'in the middle' of k +- * The insert key splits k in 3: +- * - start only in k, preserve +- * - middle common section, invalidate in k +- * - end only in k, preserve +- * +- * We update the old key to preserve the start, +- * insert will be the new common section, +- * we manually insert the end that we are preserving. 
+- * +- * modify k _before_ doing the insert (which will move +- * what k points to) +- */ +- split.k->k.needs_whiteout |= bkey_written(l->b, _k); ++ extent_drop(c, iter, _k, k); ++ extent_bset_insert(c, iter, tmp.k); ++ } else { ++ btree_keys_account_val_delta(l->b, _k, ++ bch2_cut_back_s(bkey_start_pos(&insert->k), k)); ++ extent_save(l->b, _k, k.k); + ++ bch2_bset_fix_invalidated_key(l->b, _k); ++ bch2_btree_node_iter_fix(iter, l->b, &l->iter, ++ _k, _k->u64s, _k->u64s); ++ } ++ break; ++ case BCH_EXTENT_OVERLAP_ALL: ++ extent_drop(c, iter, _k, k); ++ break; ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ bkey_on_stack_reassemble(&split, c, k.s_c); + bch2_cut_back(bkey_start_pos(&insert->k), split.k); +- BUG_ON(bkey_deleted(&split.k->k)); + +- u64s_delta = bch2_cut_front_s(insert->k.p, k); +- btree_keys_account_val_delta(l->b, _k, u64s_delta); ++ if (bkey_written(l->b, _k)) { ++ bkey_on_stack_reassemble(&tmp, c, k.s_c); ++ bch2_cut_front(insert->k.p, tmp.k); + +- BUG_ON(bkey_deleted(k.k)); +- extent_save(l->b, _k, k.k); +- bch2_btree_iter_fix_key_modified(iter, l->b, _k); ++ extent_drop(c, iter, _k, k); ++ extent_bset_insert(c, iter, tmp.k); ++ } else { ++ btree_keys_account_val_delta(l->b, _k, ++ bch2_cut_front_s(insert->k.p, k)); ++ ++ extent_save(l->b, _k, k.k); ++ bch2_btree_iter_fix_key_modified(iter, l->b, _k); ++ } + + extent_bset_insert(c, iter, split.k); +- bkey_on_stack_exit(&split, c); + break; + } +- } ++ ++ bkey_on_stack_exit(&split, c); ++ bkey_on_stack_exit(&tmp, c); + } + + /** +@@ -429,10 +436,7 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, + struct bkey_i *insert = insert_entry->k; + struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter = l->iter; +- bool deleting = bkey_whiteout(&insert->k); +- bool update_journal = !deleting; +- bool update_btree = !deleting; +- struct bkey_i whiteout = *insert; ++ bool do_update = !bkey_whiteout(&insert->k); + struct bkey_packed *_k; + struct bkey unpacked; + +@@ -443,7 +447,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, + while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, + KEY_TYPE_discard))) { + struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); +- struct bpos cur_end = bpos_min(insert->k.p, k.k->p); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k, k.k); + +@@ -451,52 +454,18 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, + break; + + if (!bkey_whiteout(k.k)) +- update_journal = true; ++ do_update = true; ++ ++ if (!do_update) { ++ struct bpos cur_end = bpos_min(insert->k.p, k.k->p); + +- if (!update_journal) { + bch2_cut_front(cur_end, insert); +- bch2_cut_front(cur_end, &whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, cur_end); +- goto next; +- } +- +- /* +- * When deleting, if possible just do it by switching the type +- * of the key we're deleting, instead of creating and inserting +- * a new whiteout: +- */ +- if (deleting && +- !update_btree && +- !bkey_cmp(insert->k.p, k.k->p) && +- !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { +- if (!bkey_whiteout(k.k)) { +- btree_account_key_drop(l->b, _k); +- _k->type = KEY_TYPE_discard; +- reserve_whiteout(l->b, _k); +- bch2_btree_iter_fix_key_modified(iter, +- l->b, _k); +- } +- break; +- } +- +- if (k.k->needs_whiteout || bkey_written(l->b, _k)) { +- insert->k.needs_whiteout = true; +- update_btree = true; +- } +- +- if (update_btree && +- overlap == BCH_EXTENT_OVERLAP_ALL && +- bkey_whiteout(k.k) && +- k.k->needs_whiteout) { +- unreserve_whiteout(l->b, _k); +- 
_k->needs_whiteout = false; ++ } else { ++ insert->k.needs_whiteout |= k.k->needs_whiteout; ++ extent_squash(c, iter, insert, _k, k, overlap); + } + +- extent_squash(c, iter, insert, _k, k, overlap); +- +- if (!update_btree) +- bch2_cut_front(cur_end, insert); +-next: + node_iter = l->iter; + + if (overlap == BCH_EXTENT_OVERLAP_FRONT || +@@ -507,24 +476,12 @@ next: + l->iter = node_iter; + bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); + +- if (update_btree) { +- if (deleting) ++ if (do_update) { ++ if (insert->k.type == KEY_TYPE_deleted) + insert->k.type = KEY_TYPE_discard; + +- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); +- + extent_bset_insert(c, iter, insert); +- } +- +- if (update_journal) { +- struct bkey_i *k = !deleting ? insert : &whiteout; +- +- if (deleting) +- k->k.type = KEY_TYPE_discard; +- +- EBUG_ON(bkey_deleted(&k->k) || !k->k.size); +- +- bch2_btree_journal_key(trans, iter, k); ++ bch2_btree_journal_key(trans, iter, insert); + } + + bch2_cut_front(insert->k.p, insert); +-- +cgit v1.2.3 + + +From 324f7f2e7b17d4bcbca844bac1ebcbc55a64e162 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 29 Nov 2019 14:08:51 -0500 +Subject: bcachefs: Whiteout changes + +More prep work for snapshots: extents will soon be using +KEY_TYPE_deleted for whiteouts, with 0 size. But we wen't be able to +keep these whiteouts with the rest of the extents in the btree node, due +to sorting invariants breaking. + +We can deal with this by immediately moving the new whiteouts to the +unwritten whiteouts area - this just means those whiteouts won't be +sorted, so we need new code to sort them prior to merging them with the +rest of the keys to be written. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 1 - + fs/bcachefs/btree_io.c | 99 ++++++++++++++++++++++++++++++------- + fs/bcachefs/btree_types.h | 1 - + fs/bcachefs/btree_update_interior.h | 29 +++++------ + fs/bcachefs/btree_update_leaf.c | 45 +++++++++-------- + 5 files changed, 119 insertions(+), 56 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 5d3acba525c2..0c737f35f430 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -557,7 +557,6 @@ out: + b->sib_u64s[0] = 0; + b->sib_u64s[1] = 0; + b->whiteout_u64s = 0; +- b->uncompacted_whiteout_u64s = 0; + bch2_btree_keys_init(b, &c->expensive_debug_checks); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c345262d804b..c0938c75c2be 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -80,6 +80,81 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, + return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); + } + ++static void sort_bkey_ptrs(const struct btree *bt, ++ struct bkey_packed **ptrs, unsigned nr) ++{ ++ unsigned n = nr, a = nr / 2, b, c, d; ++ ++ if (!a) ++ return; ++ ++ /* Heap sort: see lib/sort.c: */ ++ while (1) { ++ if (a) ++ a--; ++ else if (--n) ++ swap(ptrs[0], ptrs[n]); ++ else ++ break; ++ ++ for (b = a; c = 2 * b + 1, (d = c + 1) < n;) ++ b = bkey_cmp_packed(bt, ++ ptrs[c], ++ ptrs[d]) >= 0 ? 
c : d; ++ if (d == n) ++ b = c; ++ ++ while (b != a && ++ bkey_cmp_packed(bt, ++ ptrs[a], ++ ptrs[b]) >= 0) ++ b = (b - 1) / 2; ++ c = b; ++ while (b != a) { ++ b = (b - 1) / 2; ++ swap(ptrs[b], ptrs[c]); ++ } ++ } ++} ++ ++static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bkey_packed *new_whiteouts, **whiteout_ptrs, *k; ++ bool used_mempool1 = false, used_mempool2 = false; ++ unsigned order, i, nr = 0; ++ ++ if (!b->whiteout_u64s) ++ return; ++ ++ order = get_order(b->whiteout_u64s * sizeof(u64)); ++ ++ new_whiteouts = btree_bounce_alloc(c, order, &used_mempool1); ++ whiteout_ptrs = btree_bounce_alloc(c, order, &used_mempool2); ++ ++ for (k = unwritten_whiteouts_start(c, b); ++ k != unwritten_whiteouts_end(c, b); ++ k = bkey_next(k)) ++ whiteout_ptrs[nr++] = k; ++ ++ sort_bkey_ptrs(b, whiteout_ptrs, nr); ++ ++ k = new_whiteouts; ++ ++ for (i = 0; i < nr; i++) { ++ bkey_copy(k, whiteout_ptrs[i]); ++ k = bkey_next(k); ++ } ++ ++ verify_no_dups(b, new_whiteouts, ++ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); ++ ++ memcpy_u64s(unwritten_whiteouts_start(c, b), ++ new_whiteouts, b->whiteout_u64s); ++ ++ btree_bounce_free(c, order, used_mempool2, whiteout_ptrs); ++ btree_bounce_free(c, order, used_mempool1, new_whiteouts); ++} ++ + static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, + bool compacting, + enum compact_mode mode) +@@ -117,6 +192,8 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + if (!whiteout_u64s) + return false; + ++ bch2_sort_whiteouts(c, b); ++ + sort_iter_init(&sort_iter, b); + + whiteout_u64s += b->whiteout_u64s; +@@ -172,11 +249,14 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + if (bkey_deleted(k) && btree_node_is_extents(b)) + continue; + ++ BUG_ON(bkey_whiteout(k) && ++ k->needs_whiteout && ++ bkey_written(b, k)); ++ + if (bkey_whiteout(k) && !k->needs_whiteout) + continue; + + if (bkey_whiteout(k)) { +- unreserve_whiteout(b, k); + memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); + set_bkeyp_val_u64s(f, u_pos, 0); + u_pos = bkey_next(u_pos); +@@ -1343,21 +1423,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); + BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); + +- /* +- * We can't block on six_lock_write() here; another thread might be +- * trying to get a journal reservation with read locks held, and getting +- * a journal reservation might be blocked on flushing the journal and +- * doing btree writes: +- */ +- if (lock_type_held == SIX_LOCK_intent && +- six_trylock_write(&b->lock)) { +- __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN); +- six_unlock_write(&b->lock); +- } else { +- __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK); +- } +- +- BUG_ON(b->uncompacted_whiteout_u64s); ++ bch2_sort_whiteouts(c, b); + + sort_iter_init(&sort_iter, b); + +@@ -1545,7 +1611,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) + return false; + + BUG_ON(b->whiteout_u64s); +- BUG_ON(b->uncompacted_whiteout_u64s); + + clear_btree_node_just_written(b); + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index efa68bb578ab..6371156fe88a 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -94,7 +94,6 @@ struct btree { + struct btree_nr_keys nr; + u16 sib_u64s[2]; + u16 whiteout_u64s; +- u16 uncompacted_whiteout_u64s; + u8 page_order; + u8 unpack_fn_len; + +diff --git a/fs/bcachefs/btree_update_interior.h 
b/fs/bcachefs/btree_update_interior.h +index c5a0ab5d7bb8..2d8e0b7f3aaf 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -251,8 +251,7 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, + void *end) + { + ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + +- b->whiteout_u64s + +- b->uncompacted_whiteout_u64s; ++ b->whiteout_u64s; + ssize_t total = c->opts.btree_node_size << 6; + + return total - used; +@@ -302,23 +301,19 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, + return NULL; + } + +-static inline void unreserve_whiteout(struct btree *b, struct bkey_packed *k) ++static inline void push_whiteout(struct bch_fs *c, struct btree *b, ++ struct bkey_packed *k) + { +- if (bkey_written(b, k)) { +- EBUG_ON(b->uncompacted_whiteout_u64s < +- bkeyp_key_u64s(&b->format, k)); +- b->uncompacted_whiteout_u64s -= +- bkeyp_key_u64s(&b->format, k); +- } +-} ++ unsigned u64s = bkeyp_key_u64s(&b->format, k); ++ struct bkey_packed *dst; + +-static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k) +-{ +- if (bkey_written(b, k)) { +- BUG_ON(!k->needs_whiteout); +- b->uncompacted_whiteout_u64s += +- bkeyp_key_u64s(&b->format, k); +- } ++ BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b)); ++ ++ b->whiteout_u64s += bkeyp_key_u64s(&b->format, k); ++ dst = unwritten_whiteouts_start(c, b); ++ memcpy_u64s(dst, k, u64s); ++ dst->u64s = u64s; ++ dst->type = KEY_TYPE_deleted; + } + + /* +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index d37a95299240..46c0a1e7fa20 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -104,38 +104,43 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + return true; + } + +- insert->k.needs_whiteout = k->needs_whiteout; +- + btree_account_key_drop(b, k); + +- if (k >= btree_bset_last(b)->start) { +- clobber_u64s = k->u64s; ++ if (bkey_whiteout(&insert->k)) { ++ unsigned clobber_u64s = k->u64s, new_u64s = k->u64s; ++ ++ k->type = KEY_TYPE_deleted; + +- /* +- * If we're deleting, and the key we're deleting doesn't +- * need a whiteout (it wasn't overwriting a key that had +- * been written to disk) - just delete it: +- */ +- if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { ++ if (k->needs_whiteout) { ++ push_whiteout(iter->trans->c, b, k); ++ k->needs_whiteout = false; ++ } ++ ++ if (k >= btree_bset_last(b)->start) { + bch2_bset_delete(b, k, clobber_u64s); +- bch2_btree_node_iter_fix(iter, b, node_iter, +- k, clobber_u64s, 0); +- return true; ++ new_u64s = 0; + } + ++ bch2_btree_node_iter_fix(iter, b, node_iter, k, ++ clobber_u64s, new_u64s); ++ return true; ++ ++ } ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; + goto overwrite; + } + ++ insert->k.needs_whiteout = k->needs_whiteout; ++ k->needs_whiteout = false; + k->type = KEY_TYPE_deleted; ++ /* ++ * XXX: we should be able to do this without two calls to ++ * bch2_btree_node_iter_fix: ++ */ + bch2_btree_node_iter_fix(iter, b, node_iter, k, + k->u64s, k->u64s); +- +- if (bkey_whiteout(&insert->k)) { +- reserve_whiteout(b, k); +- return true; +- } else { +- k->needs_whiteout = false; +- } + } else { + /* + * Deleting, but the key to delete wasn't found - nothing to do: +-- +cgit v1.2.3 + + +From 1f67f050bcc5d7a348bfdc67da2de00d742ebd11 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 13 Dec 2019 13:08:37 -0500 +Subject: bcachefs: Refactor whiteouts compaction + +The whiteout compaction path - as 
opposed to just dropping whiteouts - +is now only needed for extents, and soon will only be needed for extent +btree nodes in the old format. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 22 --------- + fs/bcachefs/bkey_sort.h | 2 - + fs/bcachefs/btree_io.c | 112 ++++++++++++++++++++++++++++------------------ + fs/bcachefs/btree_io.h | 13 +++--- + fs/bcachefs/btree_types.h | 5 +++ + 5 files changed, 80 insertions(+), 74 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 2e205db5433d..4f614cde3267 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -530,28 +530,6 @@ unsigned bch2_sort_extents(struct bkey_packed *dst, + return (u64 *) out - (u64 *) dst; + } + +-static inline int sort_key_whiteouts_cmp(struct btree *b, +- struct bkey_packed *l, +- struct bkey_packed *r) +-{ +- return bkey_cmp_packed(b, l, r); +-} +- +-unsigned bch2_sort_key_whiteouts(struct bkey_packed *dst, +- struct sort_iter *iter) +-{ +- struct bkey_packed *in, *out = dst; +- +- sort_iter_sort(iter, sort_key_whiteouts_cmp); +- +- while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { +- bkey_copy(out, in); +- out = bkey_next(out); +- } +- +- return (u64 *) out - (u64 *) dst; +-} +- + static inline int sort_extent_whiteouts_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h +index 397009181eae..47a808670341 100644 +--- a/fs/bcachefs/bkey_sort.h ++++ b/fs/bcachefs/bkey_sort.h +@@ -61,8 +61,6 @@ unsigned bch2_sort_keys(struct bkey_packed *, + unsigned bch2_sort_extents(struct bkey_packed *, + struct sort_iter *, bool); + +-unsigned bch2_sort_key_whiteouts(struct bkey_packed *, +- struct sort_iter *); + unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, + struct sort_iter *); + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c0938c75c2be..8b308138d586 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -155,27 +155,26 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) + btree_bounce_free(c, order, used_mempool1, new_whiteouts); + } + +-static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, +- bool compacting, +- enum compact_mode mode) ++static bool should_compact_bset(struct btree *b, struct bset_tree *t, ++ bool compacting, enum compact_mode mode) + { +- unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); +- unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; ++ if (!bset_dead_u64s(b, t)) ++ return false; + +- if (mode == COMPACT_LAZY) { +- if (should_compact_bset_lazy(b, t) || +- (compacting && !bset_written(b, bset(b, t)))) +- return dead_u64s; +- } else { +- if (bset_written(b, bset(b, t))) +- return dead_u64s; ++ switch (mode) { ++ case COMPACT_LAZY: ++ return should_compact_bset_lazy(b, t) || ++ (compacting && !bset_written(b, bset(b, t))); ++ case COMPACT_ALL: ++ return true; ++ default: ++ BUG(); + } +- +- return 0; + } + +-bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, +- enum compact_mode mode) ++static bool bch2_compact_extent_whiteouts(struct bch_fs *c, ++ struct btree *b, ++ enum compact_mode mode) + { + const struct bkey_format *f = &b->format; + struct bset_tree *t; +@@ -185,9 +184,11 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + unsigned order, whiteout_u64s = 0, u64s; + bool used_mempool, compacting = false; + ++ BUG_ON(!btree_node_is_extents(b)); ++ + for_each_bset(b, t) +- whiteout_u64s += 
should_compact_bset(b, t, +- whiteout_u64s != 0, mode); ++ if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) ++ whiteout_u64s += bset_dead_u64s(b, t); + + if (!whiteout_u64s) + return false; +@@ -216,9 +217,12 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + if (t != b->set && !bset_written(b, i)) { + src = container_of(i, struct btree_node_entry, keys); + dst = max(write_block(b), +- (void *) btree_bkey_last(b, t -1)); ++ (void *) btree_bkey_last(b, t - 1)); + } + ++ if (src != dst) ++ compacting = true; ++ + if (!should_compact_bset(b, t, compacting, mode)) { + if (src != dst) { + memmove(dst, src, sizeof(*src) + +@@ -246,7 +250,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + for (k = start; k != end; k = n) { + n = bkey_next_skip_noops(k, end); + +- if (bkey_deleted(k) && btree_node_is_extents(b)) ++ if (bkey_deleted(k)) + continue; + + BUG_ON(bkey_whiteout(k) && +@@ -260,7 +264,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); + set_bkeyp_val_u64s(f, u_pos, 0); + u_pos = bkey_next(u_pos); +- } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { ++ } else { + bkey_copy(out, k); + out = bkey_next(out); + } +@@ -268,11 +272,9 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + + sort_iter_add(&sort_iter, u_start, u_pos); + +- if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { +- i->u64s = cpu_to_le16((u64 *) out - i->_data); +- set_btree_bset_end(b, t); +- bch2_bset_set_no_aux_tree(b, t); +- } ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); + } + + b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; +@@ -280,13 +282,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + BUG_ON((void *) unwritten_whiteouts_start(c, b) < + (void *) btree_bkey_last(b, bset_tree_last(b))); + +- u64s = (btree_node_is_extents(b) +- ? 
bch2_sort_extent_whiteouts +- : bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b), +- &sort_iter); ++ u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), ++ &sort_iter); + + BUG_ON(u64s > b->whiteout_u64s); +- BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b)); + BUG_ON(u_pos != whiteouts && !u64s); + + if (u64s != b->whiteout_u64s) { +@@ -302,8 +301,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + + btree_bounce_free(c, order, used_mempool, whiteouts); + +- if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) +- bch2_btree_build_aux_trees(b); ++ bch2_btree_build_aux_trees(b); + + bch_btree_keys_u64s_remaining(c, b); + bch2_verify_btree_nr_keys(b); +@@ -311,7 +309,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + return true; + } + +-static bool bch2_drop_whiteouts(struct btree *b) ++static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) + { + struct bset_tree *t; + bool ret = false; +@@ -319,21 +317,34 @@ static bool bch2_drop_whiteouts(struct btree *b) + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k, *n, *out, *start, *end; ++ struct btree_node_entry *src = NULL, *dst = NULL; ++ ++ if (t != b->set && !bset_written(b, i)) { ++ src = container_of(i, struct btree_node_entry, keys); ++ dst = max(write_block(b), ++ (void *) btree_bkey_last(b, t - 1)); ++ } ++ ++ if (src != dst) ++ ret = true; + +- if (!should_compact_bset(b, t, true, COMPACT_WRITTEN)) ++ if (!should_compact_bset(b, t, ret, mode)) { ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src) + ++ le16_to_cpu(src->keys.u64s) * ++ sizeof(u64)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } + continue; ++ } + + start = btree_bkey_first(b, t); + end = btree_bkey_last(b, t); + +- if (!bset_written(b, i) && +- t != b->set) { +- struct bset *dst = +- max_t(struct bset *, write_block(b), +- (void *) btree_bkey_last(b, t -1)); +- +- memmove(dst, i, sizeof(struct bset)); +- i = dst; ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src)); ++ i = &dst->keys; + set_btree_bset(b, t, i); + } + +@@ -345,19 +356,32 @@ static bool bch2_drop_whiteouts(struct btree *b) + if (!bkey_whiteout(k)) { + bkey_copy(out, k); + out = bkey_next(out); ++ } else { ++ BUG_ON(k->needs_whiteout); + } + } + + i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); + ret = true; + } + + bch2_verify_btree_nr_keys(b); + ++ bch2_btree_build_aux_trees(b); ++ + return ret; + } + ++bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, ++ enum compact_mode mode) ++{ ++ return !btree_node_is_extents(b) ++ ? 
bch2_drop_whiteouts(b, mode) ++ : bch2_compact_extent_whiteouts(c, b, mode); ++} ++ + static void btree_node_sort(struct bch_fs *c, struct btree *b, + struct btree_iter *iter, + unsigned start_idx, +@@ -1631,7 +1655,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) + btree_node_sort(c, b, NULL, 0, b->nsets, true); + invalidated_iter = true; + } else { +- invalidated_iter = bch2_drop_whiteouts(b); ++ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); + } + + for_each_bset(b, t) +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 955a80cafae3..e90e89eee273 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -54,16 +54,17 @@ static inline bool btree_node_may_write(struct btree *b) + + enum compact_mode { + COMPACT_LAZY, +- COMPACT_WRITTEN, +- COMPACT_WRITTEN_NO_WRITE_LOCK, ++ COMPACT_ALL, + }; + +-bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode); ++bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, ++ enum compact_mode); + +-static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t) ++static inline bool should_compact_bset_lazy(struct btree *b, ++ struct bset_tree *t) + { + unsigned total_u64s = bset_u64s(t); +- unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set]; ++ unsigned dead_u64s = bset_dead_u64s(b, t); + + return dead_u64s > 64 && dead_u64s * 3 > total_u64s; + } +@@ -74,7 +75,7 @@ static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree * + + for_each_bset(b, t) + if (should_compact_bset_lazy(b, t)) +- return __bch2_compact_whiteouts(c, b, COMPACT_LAZY); ++ return bch2_compact_whiteouts(c, b, COMPACT_LAZY); + + return false; + } +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 6371156fe88a..0c0a3f35a62e 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -420,6 +420,11 @@ static inline unsigned bset_u64s(struct bset_tree *t) + sizeof(struct bset) / sizeof(u64); + } + ++static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) ++{ ++ return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; ++} ++ + static inline unsigned bset_byte_offset(struct btree *b, void *i) + { + return i - (void *) b->data; +-- +cgit v1.2.3 + + +From 1c64fc45845a445620955a042d548d8f85a44273 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 14 Dec 2019 15:55:29 -0500 +Subject: bcachefs: Use one buffer for sorting whiteouts + +We're not really supposed to allocate from the same mempool more than +once. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 23 ++++++++++++----------- + 1 file changed, 12 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 8b308138d586..781ad97f910b 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -119,30 +119,32 @@ static void sort_bkey_ptrs(const struct btree *bt, + + static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) + { +- struct bkey_packed *new_whiteouts, **whiteout_ptrs, *k; +- bool used_mempool1 = false, used_mempool2 = false; +- unsigned order, i, nr = 0; ++ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; ++ bool used_mempool = false; ++ unsigned order; + + if (!b->whiteout_u64s) + return; + + order = get_order(b->whiteout_u64s * sizeof(u64)); + +- new_whiteouts = btree_bounce_alloc(c, order, &used_mempool1); +- whiteout_ptrs = btree_bounce_alloc(c, order, &used_mempool2); ++ new_whiteouts = btree_bounce_alloc(c, order, &used_mempool); ++ ++ ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order)); + + for (k = unwritten_whiteouts_start(c, b); + k != unwritten_whiteouts_end(c, b); + k = bkey_next(k)) +- whiteout_ptrs[nr++] = k; ++ *--ptrs = k; + +- sort_bkey_ptrs(b, whiteout_ptrs, nr); ++ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); + + k = new_whiteouts; + +- for (i = 0; i < nr; i++) { +- bkey_copy(k, whiteout_ptrs[i]); ++ while (ptrs != ptrs_end) { ++ bkey_copy(k, *ptrs); + k = bkey_next(k); ++ ptrs++; + } + + verify_no_dups(b, new_whiteouts, +@@ -151,8 +153,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) + memcpy_u64s(unwritten_whiteouts_start(c, b), + new_whiteouts, b->whiteout_u64s); + +- btree_bounce_free(c, order, used_mempool2, whiteout_ptrs); +- btree_bounce_free(c, order, used_mempool1, new_whiteouts); ++ btree_bounce_free(c, order, used_mempool, new_whiteouts); + } + + static bool should_compact_bset(struct btree *b, struct bset_tree *t, +-- +cgit v1.2.3 + + +From 10fa772608e351c3cc7b36f3ce47cac1e437668f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 14 Dec 2019 16:20:33 -0500 +Subject: bcachefs: Kill btree_node_iter_large + +Long overdue cleanup - this converts btree_node_iter_large uses to +sort_iter. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 256 +++++++++++++++--------------------------------- + fs/bcachefs/bkey_sort.h | 26 ++--- + fs/bcachefs/btree_io.c | 22 ++--- + fs/bcachefs/super.c | 4 +- + 4 files changed, 99 insertions(+), 209 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 4f614cde3267..23b51ef57303 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -5,90 +5,15 @@ + #include "bset.h" + #include "extents.h" + +-/* too many iterators, need to clean this up */ +- +-/* btree_node_iter_large: */ +- +-#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r) ++typedef int (*sort_cmp_fn)(struct btree *, ++ struct bkey_packed *, ++ struct bkey_packed *); + +-static inline bool +-bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter) ++static inline bool sort_iter_end(struct sort_iter *iter) + { + return !iter->used; + } + +-static inline struct bkey_packed * +-bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter, +- struct btree *b) +-{ +- return bch2_btree_node_iter_large_end(iter) +- ? 
NULL +- : __btree_node_offset_to_key(b, iter->data->k); +-} +- +-static void +-bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, +- struct btree *b) +-{ +- iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s; +- +- EBUG_ON(!iter->used); +- EBUG_ON(iter->data->k > iter->data->end); +- +- if (iter->data->k == iter->data->end) +- heap_del(iter, 0, btree_node_iter_cmp_heap, NULL); +- else +- heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL); +-} +- +-static inline struct bkey_packed * +-bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter, +- struct btree *b) +-{ +- struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b); +- +- if (ret) +- bch2_btree_node_iter_large_advance(iter, b); +- +- return ret; +-} +- +-void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, +- struct btree *b, +- const struct bkey_packed *k, +- const struct bkey_packed *end) +-{ +- if (k != end) { +- struct btree_node_iter_set n = +- ((struct btree_node_iter_set) { +- __btree_node_key_to_offset(b, k), +- __btree_node_key_to_offset(b, end) +- }); +- +- __heap_add(iter, n, btree_node_iter_cmp_heap, NULL); +- } +-} +- +-static void sort_key_next(struct btree_node_iter_large *iter, +- struct btree *b, +- struct btree_node_iter_set *i) +-{ +- i->k += __btree_node_offset_to_key(b, i->k)->u64s; +- +- while (i->k != i->end && +- !__btree_node_offset_to_key(b, i->k)->u64s) +- i->k++; +- +- if (i->k == i->end) +- *i = iter->data[--iter->used]; +-} +- +-/* regular sort_iters */ +- +-typedef int (*sort_cmp_fn)(struct btree *, +- struct bkey_packed *, +- struct bkey_packed *); +- + static inline void __sort_iter_sift(struct sort_iter *iter, + unsigned from, + sort_cmp_fn cmp) +@@ -118,19 +43,29 @@ static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) + + static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) + { +- return iter->used ? iter->data->k : NULL; ++ return !sort_iter_end(iter) ? iter->data->k : NULL; + } + +-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) ++static inline void __sort_iter_advance(struct sort_iter *iter, ++ unsigned idx, sort_cmp_fn cmp) + { +- iter->data->k = bkey_next_skip_noops(iter->data->k, iter->data->end); ++ struct sort_iter_set *i = iter->data + idx; ++ ++ BUG_ON(idx >= iter->used); ++ ++ i->k = bkey_next_skip_noops(i->k, i->end); + +- BUG_ON(iter->data->k > iter->data->end); ++ BUG_ON(i->k > i->end); + +- if (iter->data->k == iter->data->end) +- array_remove_item(iter->data, iter->used, 0); ++ if (i->k == i->end) ++ array_remove_item(iter->data, iter->used, idx); + else +- sort_iter_sift(iter, cmp); ++ __sort_iter_sift(iter, idx, cmp); ++} ++ ++static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ __sort_iter_advance(iter, 0, cmp); + } + + static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, +@@ -145,70 +80,50 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, + } + + /* +- * Returns true if l > r - unless l == r, in which case returns true if l is +- * older than r. +- * +- * Necessary for btree_sort_fixup() - if there are multiple keys that compare +- * equal in different sets, we have to process them newest to oldest. 
++ * If keys compare equal, compare by pointer order: + */ +-#define key_sort_cmp(h, l, r) \ +-({ \ +- bkey_cmp_packed(b, \ +- __btree_node_offset_to_key(b, (l).k), \ +- __btree_node_offset_to_key(b, (r).k)) \ +- \ +- ?: (l).k - (r).k; \ +-}) +- +-static inline bool should_drop_next_key(struct btree_node_iter_large *iter, +- struct btree *b) ++static inline int key_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) + { +- struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; +- struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); +- +- if (bkey_whiteout(k)) +- return true; +- +- if (iter->used < 2) +- return false; +- +- if (iter->used > 2 && +- key_sort_cmp(iter, r[0], r[1]) >= 0) +- r++; ++ return bkey_cmp_packed(b, l, r) ?: ++ cmp_int((unsigned long) l, (unsigned long) r); ++} + ++static inline bool should_drop_next_key(struct sort_iter *iter) ++{ + /* + * key_sort_cmp() ensures that when keys compare equal the older key +- * comes first; so if l->k compares equal to r->k then l->k is older and +- * should be dropped. ++ * comes first; so if l->k compares equal to r->k then l->k is older ++ * and should be dropped. + */ +- return !bkey_cmp_packed(b, +- __btree_node_offset_to_key(b, l->k), +- __btree_node_offset_to_key(b, r->k)); ++ return iter->used >= 2 && ++ !bkey_cmp_packed(iter->b, ++ iter->data[0].k, ++ iter->data[1].k); + } + +-struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, +- struct btree *b, +- struct btree_node_iter_large *iter) ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) + { + struct bkey_packed *out = dst->start; ++ struct bkey_packed *k; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + +- heap_resort(iter, key_sort_cmp, NULL); +- +- while (!bch2_btree_node_iter_large_end(iter)) { +- if (!should_drop_next_key(iter, b)) { +- struct bkey_packed *k = +- __btree_node_offset_to_key(b, iter->data->k); ++ sort_iter_sort(iter, key_sort_fix_overlapping_cmp); + ++ while ((k = sort_iter_peek(iter))) { ++ if (!bkey_whiteout(k) && ++ !should_drop_next_key(iter)) { + bkey_copy(out, k); + btree_keys_account_key_add(&nr, 0, out); + out = bkey_next(out); + } + +- sort_key_next(iter, b, iter->data); +- heap_sift_down(iter, 0, key_sort_cmp, NULL); ++ sort_iter_advance(iter, key_sort_fix_overlapping_cmp); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); +@@ -221,29 +136,16 @@ struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, + * Necessary for sort_fix_overlapping() - if there are multiple keys that + * compare equal in different sets, we have to process them newest to oldest. 
+ */ +-#define extent_sort_cmp(h, l, r) \ +-({ \ +- struct bkey _ul = bkey_unpack_key(b, \ +- __btree_node_offset_to_key(b, (l).k)); \ +- struct bkey _ur = bkey_unpack_key(b, \ +- __btree_node_offset_to_key(b, (r).k)); \ +- \ +- bkey_cmp(bkey_start_pos(&_ul), \ +- bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ +-}) +- +-static inline void extent_sort_sift(struct btree_node_iter_large *iter, +- struct btree *b, size_t i) ++static inline int extent_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) + { +- heap_sift_down(iter, i, extent_sort_cmp, NULL); +-} ++ struct bkey ul = bkey_unpack_key(b, l); ++ struct bkey ur = bkey_unpack_key(b, r); + +-static inline void extent_sort_next(struct btree_node_iter_large *iter, +- struct btree *b, +- struct btree_node_iter_set *i) +-{ +- sort_key_next(iter, b, i); +- heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL); ++ return bkey_cmp(bkey_start_pos(&ul), ++ bkey_start_pos(&ur)) ?: ++ cmp_int((unsigned long) r, (unsigned long) l); + } + + static void extent_sort_advance_prev(struct bkey_format *f, +@@ -286,14 +188,14 @@ static void extent_sort_append(struct bch_fs *c, + bkey_reassemble((void *) *prev, k.s_c); + } + +-struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, +- struct bset *dst, +- struct btree *b, +- struct btree_node_iter_large *iter) ++struct btree_nr_keys ++bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) + { ++ struct btree *b = iter->b; + struct bkey_format *f = &b->format; +- struct btree_node_iter_set *_l = iter->data, *_r; +- struct bkey_packed *prev = NULL, *lk, *rk; ++ struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; ++ struct bkey_packed *prev = NULL; + struct bkey l_unpacked, r_unpacked; + struct bkey_s l, r; + struct btree_nr_keys nr; +@@ -302,36 +204,32 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + memset(&nr, 0, sizeof(nr)); + bkey_on_stack_init(&split); + +- heap_resort(iter, extent_sort_cmp, NULL); ++ sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); + +- while (!bch2_btree_node_iter_large_end(iter)) { +- lk = __btree_node_offset_to_key(b, _l->k); +- l = __bkey_disassemble(b, lk, &l_unpacked); ++ while (!sort_iter_end(iter)) { ++ l = __bkey_disassemble(b, _l->k, &l_unpacked); + + if (iter->used == 1) { + extent_sort_append(c, f, &nr, dst->start, &prev, l); +- extent_sort_next(iter, b, _l); ++ sort_iter_advance(iter, ++ extent_sort_fix_overlapping_cmp); + continue; + } + +- _r = iter->data + 1; +- if (iter->used > 2 && +- extent_sort_cmp(iter, _r[0], _r[1]) >= 0) +- _r++; +- +- rk = __btree_node_offset_to_key(b, _r->k); +- r = __bkey_disassemble(b, rk, &r_unpacked); ++ r = __bkey_disassemble(b, _r->k, &r_unpacked); + + /* If current key and next key don't overlap, just append */ + if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { + extent_sort_append(c, f, &nr, dst->start, &prev, l); +- extent_sort_next(iter, b, _l); ++ sort_iter_advance(iter, ++ extent_sort_fix_overlapping_cmp); + continue; + } + + /* Skip 0 size keys */ + if (!r.k->size) { +- extent_sort_next(iter, b, _r); ++ __sort_iter_advance(iter, 1, ++ extent_sort_fix_overlapping_cmp); + continue; + } + +@@ -348,13 +246,14 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + if (_l->k > _r->k) { + /* l wins, trim r */ + if (bkey_cmp(l.k->p, r.k->p) >= 0) { +- sort_key_next(iter, b, _r); ++ __sort_iter_advance(iter, 1, ++ extent_sort_fix_overlapping_cmp); + } else { + 
bch2_cut_front_s(l.k->p, r); +- extent_save(b, rk, r.k); ++ extent_save(b, _r->k, r.k); ++ __sort_iter_sift(iter, 1, ++ extent_sort_fix_overlapping_cmp); + } +- +- extent_sort_sift(iter, b, _r - iter->data); + } else if (bkey_cmp(l.k->p, r.k->p) > 0) { + + /* +@@ -364,15 +263,16 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + bch2_cut_back(bkey_start_pos(r.k), split.k); + + bch2_cut_front_s(r.k->p, l); +- extent_save(b, lk, l.k); ++ extent_save(b, _l->k, l.k); + +- extent_sort_sift(iter, b, 0); ++ __sort_iter_sift(iter, 0, ++ extent_sort_fix_overlapping_cmp); + + extent_sort_append(c, f, &nr, dst->start, + &prev, bkey_i_to_s(split.k)); + } else { + bch2_cut_back_s(bkey_start_pos(r.k), l); +- extent_save(b, lk, l.k); ++ extent_save(b, _l->k, l.k); + } + } + +diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h +index 47a808670341..458a051fdac5 100644 +--- a/fs/bcachefs/bkey_sort.h ++++ b/fs/bcachefs/bkey_sort.h +@@ -2,20 +2,10 @@ + #ifndef _BCACHEFS_BKEY_SORT_H + #define _BCACHEFS_BKEY_SORT_H + +-struct btree_node_iter_large { +- u16 used; +- +- struct btree_node_iter_set data[MAX_BSETS]; +-}; +- +-void bch2_btree_node_iter_large_push(struct btree_node_iter_large *, +- struct btree *, +- const struct bkey_packed *, +- const struct bkey_packed *); +- + struct sort_iter { +- struct btree *b; ++ struct btree *b; + unsigned used; ++ unsigned size; + + struct sort_iter_set { + struct bkey_packed *k, *end; +@@ -24,27 +14,27 @@ struct sort_iter { + + static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) + { +- memset(iter, 0, sizeof(*iter)); + iter->b = b; ++ iter->used = 0; ++ iter->size = ARRAY_SIZE(iter->data); + } + + static inline void sort_iter_add(struct sort_iter *iter, + struct bkey_packed *k, + struct bkey_packed *end) + { +- BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); ++ BUG_ON(iter->used >= iter->size); + + if (k != end) + iter->data[iter->used++] = (struct sort_iter_set) { k, end }; + } + + struct btree_nr_keys +-bch2_key_sort_fix_overlapping(struct bset *, struct btree *, +- struct btree_node_iter_large *); ++bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, ++ struct sort_iter *); + struct btree_nr_keys + bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, +- struct btree *, +- struct btree_node_iter_large *); ++ struct sort_iter *); + + struct btree_nr_keys + bch2_sort_repack(struct bset *, struct btree *, +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 781ad97f910b..4b1cd4dd0741 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -863,7 +863,7 @@ fsck_err: + int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) + { + struct btree_node_entry *bne; +- struct btree_node_iter_large *iter; ++ struct sort_iter *iter; + struct btree_node *sorted; + struct bkey_packed *k; + struct bset *i; +@@ -872,7 +872,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + int ret, retry_read = 0, write = READ; + + iter = mempool_alloc(&c->fill_iter, GFP_NOIO); +- iter->used = 0; ++ sort_iter_init(iter, b); ++ iter->size = (btree_blocks(c) + 1) * 2; + + if (bch2_meta_read_fault("btree")) + btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, +@@ -951,13 +952,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + if (blacklisted && !first) + continue; + +- bch2_btree_node_iter_large_push(iter, b, +- i->start, +- vstruct_idx(i, whiteout_u64s)); ++ sort_iter_add(iter, i->start, ++ vstruct_idx(i, 
whiteout_u64s)); + +- bch2_btree_node_iter_large_push(iter, b, +- vstruct_idx(i, whiteout_u64s), +- vstruct_last(i)); ++ sort_iter_add(iter, ++ vstruct_idx(i, whiteout_u64s), ++ vstruct_last(i)); + } + + for (bne = write_block(b); +@@ -972,9 +972,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + + set_btree_bset(b, b->set, &b->data->keys); + +- b->nr = btree_node_is_extents(b) +- ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter) +- : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter); ++ b->nr = (btree_node_is_extents(b) ++ ? bch2_extent_sort_fix_overlapping ++ : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); + + u64s = le16_to_cpu(sorted->keys.u64s); + *sorted = *b->data; +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 0188ddda5f6c..a317e089882d 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -695,9 +695,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + if (bch2_fs_init_fault("fs_alloc")) + goto err; + +- iter_size = sizeof(struct btree_node_iter_large) + ++ iter_size = sizeof(struct sort_iter) + + (btree_blocks(c) + 1) * 2 * +- sizeof(struct btree_node_iter_set); ++ sizeof(struct sort_iter_set); + + if (!(c->wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || +-- +cgit v1.2.3 + + +From 476c07b4e72e7a297b3caceb37624794f66a0eae Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Dec 2019 15:37:47 -0500 +Subject: bcachefs: Fix a null ptr deref in btree_iter_traverse_one() + +When traversing nodes and we've reached the end of the btree, the +current btree node will be NULL. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index a4180124d7d1..918e6fe4a0a1 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1148,7 +1148,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter) + iter->uptodate = BTREE_ITER_NEED_PEEK; + + bch2_btree_trans_verify_locks(iter->trans); +- __bch2_btree_iter_verify(iter, iter->l[iter->level].b); ++ if (btree_iter_node(iter, iter->level)) ++ __bch2_btree_iter_verify(iter, iter->l[iter->level].b); + return 0; + } + +-- +cgit v1.2.3 + + +From fedebc24e11453be825abd9676ed075196bbd9c0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 18 Dec 2019 13:18:33 -0500 +Subject: bcachefs: Fix for an assertion on filesystem error + +Normally the in memory i_size is always greater than or equal to i_size +on disk; this doesn't hold on filesystem error. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index ed7a26ba2b48..311d9517e15c 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2309,6 +2309,15 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) + ret = PTR_ERR_OR_ZERO(iter); + bch2_trans_exit(&trans); + ++ if (ret) ++ goto err; ++ ++ /* ++ * check this before next assertion; on filesystem error our normal ++ * invariants are a bit broken (truncate has to truncate the page cache ++ * before the inode). 
++ */ ++ ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + +-- +cgit v1.2.3 + + +From 8c2a040cffee865452e473af979fb2eb6f065f93 Mon Sep 17 00:00:00 2001 +From: Justin Husted +Date: Mon, 11 Nov 2019 20:14:30 -0800 +Subject: bcachefs: Update directory timestamps during link + +Timestamp updates on the directory during a link operation were cached. +This is inconsistent with other metadata operations such as rename, as +well as being less efficient. + +Signed-off-by: Justin Husted +--- + fs/bcachefs/fs-common.c | 12 ++++++------ + fs/bcachefs/fs-common.h | 1 + + fs/bcachefs/fs.c | 12 +++++++++--- + fs/bcachefs/fsck.c | 4 ++-- + 4 files changed, 18 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index a4497eeb1f1b..96f7bbe0a3ed 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -76,11 +76,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + } + + int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, +- u64 inum, struct bch_inode_unpacked *inode_u, +- const struct qstr *name) ++ u64 inum, struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *inode_u, const struct qstr *name) + { + struct btree_iter *dir_iter, *inode_iter; +- struct bch_inode_unpacked dir_u; + struct bch_hash_info dir_hash; + u64 now = bch2_current_time(trans->c); + +@@ -91,18 +90,19 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + inode_u->bi_ctime = now; + bch2_inode_nlink_inc(inode_u); + +- dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + +- /* XXX: shouldn't we be updating mtime/ctime on the directory? */ ++ dir_u->bi_mtime = dir_u->bi_ctime = now; + +- dir_hash = bch2_hash_info_init(trans->c, &dir_u); ++ dir_hash = bch2_hash_info_init(trans->c, dir_u); + bch2_trans_iter_put(trans, dir_iter); + + return bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(inode_u->bi_mode), + name, inum, BCH_HASH_SET_MUST_CREATE) ?: ++ bch2_inode_write(trans, dir_iter, dir_u) ?: + bch2_inode_write(trans, inode_iter, inode_u); + } + +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +index c1621485a526..2273b7961c9b 100644 +--- a/fs/bcachefs/fs-common.h ++++ b/fs/bcachefs/fs-common.h +@@ -14,6 +14,7 @@ int bch2_create_trans(struct btree_trans *, u64, + + int bch2_link_trans(struct btree_trans *, u64, + u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, + const struct qstr *); + + int bch2_unlink_trans(struct btree_trans *, +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index fc9022e2508c..564c69543ffa 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -395,7 +395,7 @@ static int __bch2_link(struct bch_fs *c, + struct dentry *dentry) + { + struct btree_trans trans; +- struct bch_inode_unpacked inode_u; ++ struct bch_inode_unpacked dir_u, inode_u; + int ret; + + mutex_lock(&inode->ei_update_lock); +@@ -405,7 +405,7 @@ static int __bch2_link(struct bch_fs *c, + bch2_trans_begin(&trans); + ret = bch2_link_trans(&trans, + dir->v.i_ino, +- inode->v.i_ino, &inode_u, ++ inode->v.i_ino, &dir_u, &inode_u, + &dentry->d_name) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, +@@ -413,8 +413,14 @@ static int __bch2_link(struct bch_fs *c, + BTREE_INSERT_NOUNLOCK); + } while (ret == -EINTR); + +- if (likely(!ret)) ++ if (likely(!ret)) { ++ BUG_ON(inode_u.bi_inum != inode->v.i_ino); ++ ++ journal_seq_copy(inode, dir->ei_journal_seq); ++ 
bch2_inode_update_after_write(c, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); ++ } + + bch2_trans_exit(&trans); + mutex_unlock(&inode->ei_update_lock); +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 0f2308e53d65..3ae545b31c7a 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -80,7 +80,7 @@ static int reattach_inode(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + u64 inum) + { +- struct bch_inode_unpacked inode_u; ++ struct bch_inode_unpacked dir_u, inode_u; + char name_buf[20]; + struct qstr name; + int ret; +@@ -92,7 +92,7 @@ static int reattach_inode(struct bch_fs *c, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_LAZY_RW, + bch2_link_trans(&trans, lostfound_inode->bi_inum, +- inum, &inode_u, &name)); ++ inum, &dir_u, &inode_u, &name)); + if (ret) + bch_err(c, "error %i reattaching inode %llu", ret, inum); + +-- +cgit v1.2.3 + + +From 8b5afa880201baab3ced981bccc2c1a8dceb0ab0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Dec 2019 17:53:59 -0500 +Subject: bcachefs: Redo filesystem usage ioctls + +When disk space accounting was changed to be tracked by replicas entry, +the ioctl interface was never update: this patch finally does that. + +Aditionally, the BCH_IOCTL_USAGE ioctl is now broken out into separate +ioctls for filesystem and device usage. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 3 + + fs/bcachefs/bcachefs_ioctl.h | 70 +++++++++++-------- + fs/bcachefs/chardev.c | 152 +++++++++++++++++++++++++----------------- + fs/bcachefs/replicas.h | 3 - + 4 files changed, 136 insertions(+), 92 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 3d85012a15fd..66af4f4cdd53 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1080,6 +1080,9 @@ struct bch_replicas_entry { + __u8 devs[0]; + } __attribute__((packed)); + ++#define replicas_entry_bytes(_i) \ ++ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) ++ + struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry entries[0]; +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +index d668ede5491a..ba8c75706bf1 100644 +--- a/fs/bcachefs/bcachefs_ioctl.h ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -68,7 +68,8 @@ struct bch_ioctl_incremental { + #define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) + #define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) + #define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) +-#define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage) ++#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) ++#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) + #define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) + #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) + #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) +@@ -224,46 +225,59 @@ struct bch_ioctl_data_event { + }; + } __attribute__((packed, aligned(8))); + +-struct bch_ioctl_dev_usage { +- __u8 state; +- __u8 alive; +- __u8 pad[6]; +- __u32 dev; ++struct bch_replicas_usage { ++ __u64 sectors; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); + +- __u32 bucket_size; +- __u64 nr_buckets; +- +- __u64 buckets[BCH_DATA_NR]; +- __u64 sectors[BCH_DATA_NR]; +-}; ++static inline struct bch_replicas_usage * ++replicas_usage_next(struct 
bch_replicas_usage *u) ++{ ++ return (void *) u + replicas_entry_bytes(&u->r) + 8; ++} + ++/* ++ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage ++ * ++ * Returns disk space usage broken out by data type, number of replicas, and ++ * by component device ++ * ++ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries ++ * ++ * On success, @replica_entries_bytes will be changed to indicate the number of ++ * bytes actually used. ++ * ++ * Returns -ERANGE if @replica_entries_bytes was too small ++ */ + struct bch_ioctl_fs_usage { + __u64 capacity; + __u64 used; + __u64 online_reserved; + __u64 persistent_reserved[BCH_REPLICAS_MAX]; +- __u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX]; ++ ++ __u32 replica_entries_bytes; ++ __u32 pad; ++ ++ struct bch_replicas_usage replicas[0]; + }; + + /* +- * BCH_IOCTL_USAGE: query filesystem disk space usage +- * +- * Returns disk space usage broken out by data type, number of replicas, and +- * by component device ++ * BCH_IOCTL_DEV_USAGE: query device disk space usage + * +- * @nr_devices - number of devices userspace allocated space for in @devs +- * +- * On success, @fs and @devs will be filled out appropriately and devs[i].alive +- * will indicate if a device was present in that slot +- * +- * Returns -ERANGE if @nr_devices was too small ++ * Returns disk space usage broken out by data type - both by buckets and ++ * sectors. + */ +-struct bch_ioctl_usage { +- __u16 nr_devices; +- __u16 pad[3]; ++struct bch_ioctl_dev_usage { ++ __u64 dev; ++ __u32 flags; ++ __u8 state; ++ __u8 pad[7]; ++ ++ __u32 bucket_size; ++ __u64 nr_buckets; + +- struct bch_ioctl_fs_usage fs; +- struct bch_ioctl_dev_usage devs[0]; ++ __u64 buckets[BCH_DATA_NR]; ++ __u64 sectors[BCH_DATA_NR]; + }; + + /* +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index 059eca01ccc4..5028d0dcc2d6 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -6,6 +6,7 @@ + #include "buckets.h" + #include "chardev.h" + #include "move.h" ++#include "replicas.h" + #include "super.h" + #include "super-io.h" + +@@ -371,89 +372,116 @@ err: + return ret; + } + +-static long bch2_ioctl_usage(struct bch_fs *c, +- struct bch_ioctl_usage __user *user_arg) ++static long bch2_ioctl_fs_usage(struct bch_fs *c, ++ struct bch_ioctl_fs_usage __user *user_arg) + { +- struct bch_ioctl_usage arg; +- struct bch_dev *ca; +- unsigned i, j; +- int ret; ++ struct bch_ioctl_fs_usage *arg = NULL; ++ struct bch_replicas_usage *dst_e, *dst_end; ++ struct bch_fs_usage *src; ++ u32 replica_entries_bytes; ++ unsigned i; ++ int ret = 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + +- if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) + return -EFAULT; + +- for (i = 0; i < arg.nr_devices; i++) { +- struct bch_ioctl_dev_usage dst = { .alive = 0 }; ++ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); ++ if (!arg) ++ return -ENOMEM; + +- ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)); +- if (ret) +- return ret; ++ src = bch2_fs_usage_read(c); ++ if (!src) { ++ ret = -ENOMEM; ++ goto err; + } + +- { +- struct bch_fs_usage *src; +- struct bch_ioctl_fs_usage dst = { +- .capacity = c->capacity, +- }; ++ arg->capacity = c->capacity; ++ arg->used = bch2_fs_sectors_used(c, src); ++ arg->online_reserved = src->online_reserved; + +- src = bch2_fs_usage_read(c); +- if (!src) +- return -ENOMEM; ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ arg->persistent_reserved[i] = 
src->persistent_reserved[i]; + +- dst.used = bch2_fs_sectors_used(c, src); +- dst.online_reserved = src->online_reserved; ++ dst_e = arg->replicas; ++ dst_end = (void *) arg->replicas + replica_entries_bytes; + +- percpu_up_read(&c->mark_lock); ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *src_e = ++ cpu_replicas_entry(&c->replicas, i); + +- for (i = 0; i < BCH_REPLICAS_MAX; i++) { +- dst.persistent_reserved[i] = +- src->persistent_reserved[i]; +-#if 0 +- for (j = 0; j < BCH_DATA_NR; j++) +- dst.sectors[j][i] = src.replicas[i].data[j]; +-#endif ++ if (replicas_usage_next(dst_e) > dst_end) { ++ ret = -ERANGE; ++ break; + } + +- kfree(src); ++ dst_e->sectors = src->replicas[i]; ++ dst_e->r = *src_e; ++ ++ /* recheck after setting nr_devs: */ ++ if (replicas_usage_next(dst_e) > dst_end) { ++ ret = -ERANGE; ++ break; ++ } + +- ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst)); +- if (ret) +- return ret; ++ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); ++ ++ dst_e = replicas_usage_next(dst_e); + } + +- for_each_member_device(ca, c, i) { +- struct bch_dev_usage src = bch2_dev_usage_read(c, ca); +- struct bch_ioctl_dev_usage dst = { +- .alive = 1, +- .state = ca->mi.state, +- .bucket_size = ca->mi.bucket_size, +- .nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket, +- }; +- +- if (ca->dev_idx >= arg.nr_devices) { +- percpu_ref_put(&ca->ref); +- return -ERANGE; +- } ++ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; + +- if (percpu_ref_tryget(&ca->io_ref)) { +- dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev); +- percpu_ref_put(&ca->io_ref); +- } ++ percpu_up_read(&c->mark_lock); ++ kfree(src); + +- for (j = 0; j < BCH_DATA_NR; j++) { +- dst.buckets[j] = src.buckets[j]; +- dst.sectors[j] = src.sectors[j]; +- } ++ if (!ret) ++ ret = copy_to_user(user_arg, arg, ++ sizeof(*arg) + arg->replica_entries_bytes); ++err: ++ kfree(arg); ++ return ret; ++} ++ ++static long bch2_ioctl_dev_usage(struct bch_fs *c, ++ struct bch_ioctl_dev_usage __user *user_arg) ++{ ++ struct bch_ioctl_dev_usage arg; ++ struct bch_dev_usage src; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; + +- ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)); +- if (ret) +- return ret; ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad[0] || ++ arg.pad[1] || ++ arg.pad[2]) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ src = bch2_dev_usage_read(c, ca); ++ ++ arg.state = ca->mi.state; ++ arg.bucket_size = ca->mi.bucket_size; ++ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ arg.buckets[i] = src.buckets[i]; ++ arg.sectors[i] = src.sectors[i]; + } + +- return 0; ++ percpu_ref_put(&ca->ref); ++ ++ return copy_to_user(user_arg, &arg, sizeof(arg)); + } + + static long bch2_ioctl_read_super(struct bch_fs *c, +@@ -547,8 +575,10 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) + switch (cmd) { + case BCH_IOCTL_QUERY_UUID: + return bch2_ioctl_query_uuid(c, arg); +- case BCH_IOCTL_USAGE: +- return bch2_ioctl_usage(c, arg); ++ case BCH_IOCTL_FS_USAGE: ++ return bch2_ioctl_fs_usage(c, arg); ++ case BCH_IOCTL_DEV_USAGE: ++ return bch2_ioctl_dev_usage(c, arg); + } + + if (!capable(CAP_SYS_ADMIN)) +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +index 0d6e19126021..8527d82841bb 100644 +--- 
a/fs/bcachefs/replicas.h ++++ b/fs/bcachefs/replicas.h +@@ -72,9 +72,6 @@ int bch2_replicas_set_usage(struct bch_fs *, + + /* iterate over superblock replicas - used by userspace tools: */ + +-#define replicas_entry_bytes(_i) \ +- (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) +- + #define replicas_entry_next(_i) \ + ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) + +-- +cgit v1.2.3 + + +From 14e98d9b211146575fd43f93557e3f1525fdb16d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 18 Dec 2019 17:34:36 -0500 +Subject: bcachefs: Fix a memory splat + +In __bch2_sb_field_resize, when a field's old a new size was 0, we were +doing an invalid write just past the end of the superblock. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 1d592856dea1..180d9091a75b 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -51,7 +51,9 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, + BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > + sb->page_order); + +- if (!f) { ++ if (!f && !u64s) { ++ /* nothing to do: */ ++ } else if (!f) { + f = vstruct_last(sb->sb); + memset(f, 0, sizeof(u64) * u64s); + f->u64s = cpu_to_le32(u64s); +-- +cgit v1.2.3 + + +From 90a53510644d51c865a2883ff11fe4dcf1931259 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 18 Dec 2019 17:37:15 -0500 +Subject: bcachefs: Add __GFP_NOWARN to a GFP_NOWAIT allocation + +We have fallbacks for this. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 8d223aa2bee5..5a34dab013f3 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -161,7 +161,7 @@ struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) + struct bch_fs_usage *ret; + unsigned bytes = fs_usage_u64s(c) * sizeof(u64); + +- ret = kzalloc(bytes, GFP_NOWAIT); ++ ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); + if (ret) + return ret; + +-- +cgit v1.2.3 + + +From 3c5cfc1b3b7412550b88455b37a0ee1537167359 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 19 Dec 2019 15:07:51 -0500 +Subject: bcachefs: Make io timers less buggy + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/clock.c | 43 +++++++++++++++++++++++++++++-------------- + fs/bcachefs/clock.h | 6 ++++-- + fs/bcachefs/clock_types.h | 1 + + fs/bcachefs/sysfs.c | 12 ++++++++++++ + 4 files changed, 46 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +index f18266330687..d9de0d1302e2 100644 +--- a/fs/bcachefs/clock.c ++++ b/fs/bcachefs/clock.c +@@ -18,6 +18,14 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) + size_t i; + + spin_lock(&clock->timer_lock); ++ ++ if (time_after_eq((unsigned long) atomic_long_read(&clock->now), ++ timer->expire)) { ++ spin_unlock(&clock->timer_lock); ++ timer->fn(timer); ++ return; ++ } ++ + for (i = 0; i < clock->timers.used; i++) + if (clock->timers.data[i] == timer) + goto out; +@@ -135,26 +143,31 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, + return ret; + } + +-void __bch2_increment_clock(struct io_clock *clock) ++void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) + { + struct io_timer *timer; +- unsigned long now; +- unsigned sectors; ++ unsigned long now = atomic_long_add_return(sectors, 
&clock->now); + +- /* Buffer up one megabyte worth of IO in the percpu counter */ +- preempt_disable(); ++ while ((timer = get_expired_timer(clock, now))) ++ timer->fn(timer); ++} + +- if (this_cpu_read(*clock->pcpu_buf) < IO_CLOCK_PCPU_SECTORS) { +- preempt_enable(); +- return; +- } ++ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ unsigned long now; ++ unsigned i; + +- sectors = this_cpu_xchg(*clock->pcpu_buf, 0); +- preempt_enable(); +- now = atomic_long_add_return(sectors, &clock->now); ++ spin_lock(&clock->timer_lock); ++ now = atomic_long_read(&clock->now); + +- while ((timer = get_expired_timer(clock, now))) +- timer->fn(timer); ++ for (i = 0; i < clock->timers.used; i++) ++ pr_buf(&out, "%pf:\t%li\n", ++ clock->timers.data[i]->fn, ++ clock->timers.data[i]->expire - now); ++ spin_unlock(&clock->timer_lock); ++ ++ return out.pos - buf; + } + + void bch2_io_clock_exit(struct io_clock *clock) +@@ -168,6 +181,8 @@ int bch2_io_clock_init(struct io_clock *clock) + atomic_long_set(&clock->now, 0); + spin_lock_init(&clock->timer_lock); + ++ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); ++ + clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); + if (!clock->pcpu_buf) + return -ENOMEM; +diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h +index bfbbca8a207b..da50afe206cc 100644 +--- a/fs/bcachefs/clock.h ++++ b/fs/bcachefs/clock.h +@@ -7,7 +7,7 @@ void bch2_io_timer_del(struct io_clock *, struct io_timer *); + void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, + unsigned long); + +-void __bch2_increment_clock(struct io_clock *); ++void __bch2_increment_clock(struct io_clock *, unsigned); + + static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, + int rw) +@@ -16,7 +16,7 @@ static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, + + if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= + IO_CLOCK_PCPU_SECTORS)) +- __bch2_increment_clock(clock); ++ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); + } + + void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); +@@ -30,6 +30,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); + __ret; \ + }) + ++ssize_t bch2_io_timers_show(struct io_clock *, char *); ++ + void bch2_io_clock_exit(struct io_clock *); + int bch2_io_clock_init(struct io_clock *); + +diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h +index 2b5e499e12b4..92c740a47565 100644 +--- a/fs/bcachefs/clock_types.h ++++ b/fs/bcachefs/clock_types.h +@@ -28,6 +28,7 @@ typedef HEAP(struct io_timer *) io_timer_heap; + struct io_clock { + atomic_long_t now; + u16 __percpu *pcpu_buf; ++ unsigned max_slop; + + spinlock_t timer_lock; + io_timer_heap timers; +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index e7699afd99fc..95e527844e0a 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -18,6 +18,7 @@ + #include "btree_update_interior.h" + #include "btree_gc.h" + #include "buckets.h" ++#include "clock.h" + #include "disk_groups.h" + #include "ec.h" + #include "inode.h" +@@ -198,6 +199,9 @@ rw_attribute(pd_controllers_update_seconds); + read_attribute(meta_replicas_have); + read_attribute(data_replicas_have); + ++read_attribute(io_timers_read); ++read_attribute(io_timers_write); ++ + #ifdef CONFIG_BCACHEFS_TESTS + write_attribute(perf_test); + #endif /* CONFIG_BCACHEFS_TESTS */ +@@ -404,6 +408,11 @@ SHOW(bch2_fs) + if (attr == &sysfs_new_stripes) + return 
bch2_new_stripes(c, buf); + ++ if (attr == &sysfs_io_timers_read) ++ return bch2_io_timers_show(&c->io_clock[READ], buf); ++ if (attr == &sysfs_io_timers_write) ++ return bch2_io_timers_show(&c->io_clock[WRITE], buf); ++ + #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); + BCH_DEBUG_PARAMS() + #undef BCH_DEBUG_PARAM +@@ -581,6 +590,9 @@ struct attribute *bch2_fs_internal_files[] = { + + &sysfs_new_stripes, + ++ &sysfs_io_timers_read, ++ &sysfs_io_timers_write, ++ + &sysfs_internal_uuid, + + #define BCH_DEBUG_PARAM(name, description) &sysfs_##name, +-- +cgit v1.2.3 + + +From c5c37d16b20d8f375c2ec72bf56f189728758376 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 20 Dec 2019 16:19:46 -0500 +Subject: bcachefs: Redo copygc throttling + +The code that checked the current free space and waited if it was too +big was causing issues - btree node allocations do not increment the +write IO clock (perhaps they should); but more broadly the check +wouldn't run copygc at all until the device was mostly full, at which +point it might have to do a bunch of work. + +This redoes that logic so that copygc starts to run earlier, smoothly +running more and more often as the device becomes closer to full. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/movinggc.c | 52 +++++++++++++++++++++++++++----------------------- + 1 file changed, 28 insertions(+), 24 deletions(-) + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index abdeef20fde9..e9cb2304576f 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -212,14 +212,36 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) + buckets_to_move, buckets_not_moved); + } + ++/* ++ * Copygc runs when the amount of fragmented data is above some arbitrary ++ * threshold: ++ * ++ * The threshold at the limit - when the device is full - is the amount of space ++ * we reserved in bch2_recalc_capacity; we can't have more than that amount of ++ * disk space stranded due to fragmentation and store everything we have ++ * promised to store. ++ * ++ * But we don't want to be running copygc unnecessarily when the device still ++ * has plenty of free space - rather, we want copygc to smoothly run every so ++ * often and continually reduce the amount of fragmented space as the device ++ * fills up. So, we increase the threshold by half the current free space. 
++ */ ++unsigned long bch2_copygc_wait_amount(struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); ++ u64 fragmented_allowed = ca->copygc_threshold + ++ ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); ++ ++ return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented); ++} ++ + static int bch2_copygc_thread(void *arg) + { + struct bch_dev *ca = arg; + struct bch_fs *c = ca->fs; + struct io_clock *clock = &c->io_clock[WRITE]; +- struct bch_dev_usage usage; +- unsigned long last; +- u64 available, fragmented, reserve, next; ++ unsigned long last, wait; + + set_freezable(); + +@@ -228,28 +250,10 @@ static int bch2_copygc_thread(void *arg) + break; + + last = atomic_long_read(&clock->now); ++ wait = bch2_copygc_wait_amount(ca); + +- reserve = ca->copygc_threshold; +- +- usage = bch2_dev_usage_read(c, ca); +- +- available = __dev_buckets_available(ca, usage) * +- ca->mi.bucket_size; +- if (available > reserve) { +- next = last + available - reserve; +- bch2_kthread_io_clock_wait(clock, next, +- MAX_SCHEDULE_TIMEOUT); +- continue; +- } +- +- /* +- * don't start copygc until there's more than half the copygc +- * reserve of fragmented space: +- */ +- fragmented = usage.sectors_fragmented; +- if (fragmented < reserve) { +- next = last + reserve - fragmented; +- bch2_kthread_io_clock_wait(clock, next, ++ if (wait > clock->max_slop) { ++ bch2_kthread_io_clock_wait(clock, last + wait, + MAX_SCHEDULE_TIMEOUT); + continue; + } +-- +cgit v1.2.3 + + +From c4c20d3d216f196e9cccb01376df5887e6c7650d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 20 Dec 2019 16:26:27 -0500 +Subject: bcachefs: Drop a faulty assertion + +This assertion was wrong for interior nodes (and wasn't terribly useful +to begin with) + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 918e6fe4a0a1..6f19304bb913 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -457,11 +457,6 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, + "cur key %s\n", + iter->pos.inode, iter->pos.offset, buf); + } +- +- BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && +- btree_iter_type(iter) == BTREE_ITER_KEYS && +- !bkey_whiteout(&iter->k) && +- bch2_btree_node_iter_end(&l->iter)); + } + + void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) +-- +cgit v1.2.3 + + +From a452ec6a6c9ccc60a993869cef1af73d0aaed26c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 20 Dec 2019 16:35:24 -0500 +Subject: bcachefs: bch2_trans_reset() calls should be at the tops of loops + +It needs to be called when we get -EINTR due to e.g. lock restart - this +fixes a transaction iterators overflow bug. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.h | 5 ----- + fs/bcachefs/btree_update_leaf.c | 2 ++ + fs/bcachefs/fs-io.c | 4 ++-- + fs/bcachefs/io.c | 8 ++++---- + fs/bcachefs/reflink.c | 3 ++- + 5 files changed, 10 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 4c5032222319..d750c4e5f18e 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -299,11 +299,6 @@ static inline void bch2_trans_begin(struct btree_trans *trans) + return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM); + } + +-static inline void bch2_trans_begin_updates(struct btree_trans *trans) +-{ +- return bch2_trans_reset(trans, TRANS_RESET_MEM); +-} +- + void *bch2_trans_kmalloc(struct btree_trans *, size_t); + void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); + int bch2_trans_exit(struct btree_trans *); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 46c0a1e7fa20..94268cfa09c7 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -868,6 +868,8 @@ retry: + bkey_cmp(iter->pos, end) < 0) { + struct bkey_i delete; + ++ bch2_trans_reset(trans, TRANS_RESET_MEM); ++ + bkey_init(&delete.k); + + /* +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 311d9517e15c..156286468731 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2704,6 +2704,8 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + struct bkey_i_reservation reservation; + struct bkey_s_c k; + ++ bch2_trans_reset(&trans, TRANS_RESET_MEM); ++ + k = bch2_btree_iter_peek_slot(iter); + if ((ret = bkey_err(k))) + goto bkey_err; +@@ -2750,8 +2752,6 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + reservation.v.nr_replicas = disk_res.nr_replicas; + } + +- bch2_trans_begin_updates(&trans); +- + ret = bch2_extent_update(&trans, iter, &reservation.k_i, + &disk_res, &inode->ei_journal_seq, + 0, &i_sectors_delta); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 17ea38e42ae8..95bd53c558a8 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -326,6 +326,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + ++ bch2_trans_reset(trans, TRANS_RESET_MEM); ++ + ret = bkey_err(k); + if (ret) + goto btree_err; +@@ -337,8 +339,6 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete); + +- bch2_trans_begin_updates(trans); +- + ret = bch2_extent_update(trans, iter, &delete, + &disk_res, journal_seq, + 0, i_sectors_delta); +@@ -400,14 +400,14 @@ int bch2_write_index_default(struct bch_write_op *op) + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + do { ++ bch2_trans_reset(&trans, TRANS_RESET_MEM); ++ + k = bch2_keylist_front(keys); + + bkey_on_stack_realloc(&sk, c, k->k.u64s); + bkey_copy(sk.k, k); + bch2_cut_front(iter->pos, sk.k); + +- bch2_trans_begin_updates(&trans); +- + ret = bch2_extent_update(&trans, iter, sk.k, + &op->res, op_journal_seq(op), + op->new_i_size, &op->i_sectors_delta); +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 53bd0e0ea058..a65ada691ba1 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -185,7 +185,8 @@ s64 bch2_remap_range(struct bch_fs *c, + BTREE_ITER_INTENT); + + while (1) { +- bch2_trans_begin_updates(&trans); ++ bch2_trans_reset(&trans, TRANS_RESET_MEM); ++ + 
trans.mem_top = 0; + + if (fatal_signal_pending(current)) { +-- +cgit v1.2.3 + + +From 76290641aa79429588ab8dfe66083a3678e4db77 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 22 Dec 2019 23:04:30 -0500 +Subject: bcachefs: Convert all bch2_trans_commit() users to + BTREE_INSERT_ATOMIC + +BTREE_INSERT_ATOMIC should really be the default mode, and there's not +that much code that doesn't need it - so this is prep work for getting +rid of the flag. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update.h | 24 +++++--- + fs/bcachefs/btree_update_leaf.c | 41 ++++++------- + fs/bcachefs/dirent.c | 12 ---- + fs/bcachefs/dirent.h | 2 - + fs/bcachefs/ec.c | 10 +++- + fs/bcachefs/fs-io.c | 6 +- + fs/bcachefs/fsck.c | 127 +++++++++++++++++++++++----------------- + fs/bcachefs/inode.c | 2 +- + fs/bcachefs/recovery.c | 2 +- + fs/bcachefs/xattr.c | 3 +- + 10 files changed, 124 insertions(+), 105 deletions(-) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index ad8cbf3fb778..d72da179f866 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -131,24 +131,34 @@ static inline void bch2_trans_update(struct btree_trans *trans, + }; + } + +-#define bch2_trans_do(_c, _journal_seq, _flags, _do) \ ++#define __bch2_trans_do(_trans, _disk_res, _journal_seq, \ ++ _flags, _reset_flags, _do) \ + ({ \ +- struct btree_trans trans; \ + int _ret; \ + \ +- bch2_trans_init(&trans, (_c), 0, 0); \ +- \ + do { \ +- bch2_trans_begin(&trans); \ ++ bch2_trans_reset(_trans, _reset_flags); \ + \ +- _ret = (_do) ?: bch2_trans_commit(&trans, NULL, \ ++ _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ + (_journal_seq), (_flags)); \ + } while (_ret == -EINTR); \ + \ +- bch2_trans_exit(&trans); \ + _ret; \ + }) + ++#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret, _ret2; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ ++ TRANS_RESET_MEM|TRANS_RESET_ITERS, _do); \ ++ _ret2 = bch2_trans_exit(&trans); \ ++ \ ++ _ret ?: _ret2; \ ++}) ++ + #define trans_for_each_update(_trans, _i) \ + for ((_i) = (_trans)->updates; \ + (_i) < (_trans)->updates + (_trans)->nr_updates; \ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 94268cfa09c7..b964c4212091 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -822,6 +822,20 @@ err: + goto retry; + } + ++static int __bch2_btree_insert(struct btree_trans *trans, ++ enum btree_id id, struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ ++ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ bch2_trans_update(trans, iter, k); ++ return 0; ++} ++ + /** + * bch2_btree_insert - insert keys into the extent btree + * @c: pointer to struct bch_fs +@@ -830,29 +844,12 @@ err: + * @hook: insert callback + */ + int bch2_btree_insert(struct bch_fs *c, enum btree_id id, +- struct bkey_i *k, +- struct disk_reservation *disk_res, +- u64 *journal_seq, int flags) ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, int flags) + { +- struct btree_trans trans; +- struct btree_iter *iter; +- int ret; +- +- bch2_trans_init(&trans, c, 0, 0); +-retry: +- bch2_trans_begin(&trans); +- +- iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), +- BTREE_ITER_INTENT); +- +- bch2_trans_update(&trans, iter, k); +- +- ret = 
bch2_trans_commit(&trans, disk_res, journal_seq, flags); +- if (ret == -EINTR) +- goto retry; +- bch2_trans_exit(&trans); +- +- return ret; ++ return bch2_trans_do(c, disk_res, journal_seq, flags, ++ __bch2_btree_insert(&trans, id, k)); + } + + int bch2_btree_delete_at_range(struct btree_trans *trans, +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 38017699c04a..1bf53c55912d 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -281,18 +281,6 @@ int bch2_dirent_delete_at(struct btree_trans *trans, + hash_info, iter); + } + +-int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, +- const struct bch_hash_info *hash_info, +- const struct qstr *name, +- u64 *journal_seq) +-{ +- return bch2_trans_do(c, journal_seq, +- BTREE_INSERT_ATOMIC| +- BTREE_INSERT_NOFAIL, +- bch2_hash_delete(&trans, bch2_dirent_hash_desc, hash_info, +- dir_inum, name)); +-} +- + struct btree_iter * + __bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, + const struct bch_hash_info *hash_info, +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index e6184dc796d3..34769371dd13 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -36,8 +36,6 @@ int bch2_dirent_create(struct btree_trans *, u64, + int bch2_dirent_delete_at(struct btree_trans *, + const struct bch_hash_info *, + struct btree_iter *); +-int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, +- const struct qstr *, u64 *); + + enum bch_rename_mode { + BCH_RENAME, +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 3781838cda82..3426925edb41 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1235,6 +1235,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, + bch2_trans_update(trans, iter, &new_key->k_i); + + return bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL|flags); + } + +@@ -1259,8 +1260,13 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) + if (!m->dirty) + continue; + +- ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos, +- new_key, flags); ++ do { ++ bch2_trans_reset(&trans, TRANS_RESET_MEM); ++ ++ ret = __bch2_stripe_write_key(&trans, iter, m, ++ giter.pos, new_key, flags); ++ } while (ret == -EINTR); ++ + if (ret) + break; + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 156286468731..5f1c5cd63220 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2519,9 +2519,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + struct bpos next_pos; + struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); + struct bpos atomic_end; +- unsigned commit_flags = BTREE_INSERT_NOFAIL| +- BTREE_INSERT_ATOMIC| +- BTREE_INSERT_USE_RESERVE; ++ unsigned commit_flags = 0; + + k = insert + ? 
bch2_btree_iter_peek_prev(src) +@@ -2614,6 +2612,8 @@ reassemble: + + ret = bch2_trans_commit(&trans, &disk_res, + &inode->ei_journal_seq, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| + commit_flags); + bch2_disk_reservation_put(c, &disk_res); + bkey_err: +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 3ae545b31c7a..a0fdd2ba92f6 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -37,8 +37,8 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) + return ret ?: sectors; + } + +-static int remove_dirent(struct btree_trans *trans, +- struct bkey_s_c_dirent dirent) ++static int __remove_dirent(struct btree_trans *trans, ++ struct bkey_s_c_dirent dirent) + { + struct bch_fs *c = trans->c; + struct qstr name; +@@ -49,31 +49,41 @@ static int remove_dirent(struct btree_trans *trans, + char *buf; + + name.len = bch2_dirent_name_bytes(dirent); +- buf = kmalloc(name.len + 1, GFP_KERNEL); +- if (!buf) +- return -ENOMEM; ++ buf = bch2_trans_kmalloc(trans, name.len + 1); ++ if (IS_ERR(buf)) ++ return PTR_ERR(buf); + + memcpy(buf, dirent.v->d_name, name.len); + buf[name.len] = '\0'; + name.name = buf; + +- /* Unlock so we don't deadlock, after copying name: */ +- bch2_trans_unlock(trans); +- +- ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode); +- if (ret) { ++ ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); ++ if (ret && ret != -EINTR) + bch_err(c, "remove_dirent: err %i looking up directory inode", ret); +- goto err; +- } ++ if (ret) ++ return ret; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + +- ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL); +- if (ret) ++ ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, ++ &dir_hash_info, dir_inum, &name); ++ if (ret && ret != -EINTR) + bch_err(c, "remove_dirent: err %i deleting dirent", ret); +-err: +- kfree(buf); +- return ret; ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++static int remove_dirent(struct btree_trans *trans, ++ struct bkey_s_c_dirent dirent) ++{ ++ return __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ TRANS_RESET_MEM, ++ __remove_dirent(trans, dirent)); + } + + static int reattach_inode(struct bch_fs *c, +@@ -88,7 +98,7 @@ static int reattach_inode(struct bch_fs *c, + snprintf(name_buf, sizeof(name_buf), "%llu", inum); + name = (struct qstr) QSTR(name_buf); + +- ret = bch2_trans_do(c, NULL, ++ ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_LAZY_RW, + bch2_link_trans(&trans, lostfound_inode->bi_inum, +@@ -171,27 +181,27 @@ static int hash_redo_key(const struct bch_hash_desc desc, + struct btree_iter *k_iter, struct bkey_s_c k, + u64 hashed) + { ++ struct bkey_i delete; + struct bkey_i *tmp; +- int ret = 0; + +- tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL); +- if (!tmp) +- return -ENOMEM; ++ bch2_trans_reset(trans, TRANS_RESET_MEM); ++ ++ tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if (IS_ERR(tmp)) ++ return PTR_ERR(tmp); + + bkey_reassemble(tmp, k); + +- ret = bch2_btree_delete_at(trans, k_iter, 0); +- if (ret) +- goto err; ++ bkey_init(&delete.k); ++ delete.k.p = k_iter->pos; ++ bch2_trans_update(trans, k_iter, &delete); + +- bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, +- tmp, BCH_HASH_SET_MUST_CREATE); +- ret = bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); +-err: +- kfree(tmp); +- return ret; ++ return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, ++ tmp, 
BCH_HASH_SET_MUST_CREATE) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); + } + + static int fsck_hash_delete_at(struct btree_trans *trans, +@@ -313,9 +323,11 @@ static int hash_check_key(struct btree_trans *trans, + "hashed to %llu chain starts at %llu\n%s", + desc.btree_id, k.k->p.offset, + hashed, h->chain->pos.offset, +- (bch2_bkey_val_to_text(&PBUF(buf), c, +- k), buf))) { +- ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { ++ do { ++ ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); ++ } while (ret == -EINTR); ++ + if (ret) { + bch_err(c, "hash_redo_key err %i", ret); + return ret; +@@ -376,11 +388,12 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, + + if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", + buf, strlen(buf), d->v.d_name, len)) { +- bch2_trans_update(trans, iter, &d->k_i); +- +- ret = bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ TRANS_RESET_MEM, ++ (bch2_trans_update(trans, iter, &d->k_i), 0)); + if (ret) + goto err; + +@@ -402,8 +415,11 @@ err_redo: + k->k->p.offset, hash, h->chain->pos.offset, + (bch2_bkey_val_to_text(&PBUF(buf), c, + *k), buf))) { +- ret = hash_redo_key(bch2_dirent_hash_desc, trans, +- h, iter, *k, hash); ++ do { ++ ret = hash_redo_key(bch2_dirent_hash_desc, trans, ++ h, iter, *k, hash); ++ } while (ret == -EINTR); ++ + if (ret) + bch_err(c, "hash_redo_key err %i", ret); + else +@@ -646,11 +662,12 @@ retry: + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = mode_to_type(target.bi_mode); + +- bch2_trans_update(&trans, iter, &n->k_i); +- +- ret = bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ TRANS_RESET_MEM, ++ (bch2_trans_update(&trans, iter, &n->k_i), 0)); + kfree(n); + if (ret) + goto err; +@@ -790,7 +807,7 @@ fsck_err: + create_lostfound: + bch2_inode_init_early(c, lostfound_inode); + +- ret = bch2_trans_do(c, NULL, ++ ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +@@ -1261,12 +1278,14 @@ static int check_inode(struct btree_trans *trans, + struct bkey_inode_buf p; + + bch2_inode_pack(&p, &u); +- bch2_trans_update(trans, iter, &p.inode.k_i); + +- ret = bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); +- if (ret && ret != -EINTR) ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_ATOMIC| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ TRANS_RESET_MEM, ++ (bch2_trans_update(trans, iter, &p.inode.k_i), 0)); ++ if (ret) + bch_err(c, "error in fsck: error %i " + "updating inode", ret); + } +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index c0642ff46ba0..439f9dc7f7e0 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -533,7 +533,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, + struct bch_inode_unpacked *inode) + { +- return bch2_trans_do(c, NULL, 0, ++ return bch2_trans_do(c, NULL, NULL, 0, + bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); + } + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 
e6b51131cff2..a3ee2f474952 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1009,7 +1009,7 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_inode_init_early(c, &lostfound_inode); + + err = "error creating lost+found"; +- ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_ATOMIC, + bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + &root_inode, &lostfound_inode, + &lostfound, +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 4cbdf870c8e8..fd56ffdbcfcb 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -328,7 +328,8 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + +- return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC, ++ return bch2_trans_do(c, NULL, &inode->ei_journal_seq, ++ BTREE_INSERT_ATOMIC, + bch2_xattr_set(&trans, inode->v.i_ino, + &inode->ei_str_hash, + name, value, size, +-- +cgit v1.2.3 + + +From 6ee01fb88da21cff483f4622b38e692761c26b2f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 22 Dec 2019 23:39:28 -0500 +Subject: bcachefs: Kill BTREE_INSERT_ATOMIC + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 1 - + fs/bcachefs/alloc_background.c | 2 -- + fs/bcachefs/btree_update.h | 10 +--------- + fs/bcachefs/btree_update_leaf.c | 19 ++----------------- + fs/bcachefs/ec.c | 3 --- + fs/bcachefs/fs-io.c | 1 - + fs/bcachefs/fs.c | 6 ------ + fs/bcachefs/fsck.c | 8 -------- + fs/bcachefs/inode.c | 1 - + fs/bcachefs/io.c | 2 -- + fs/bcachefs/migrate.c | 1 - + fs/bcachefs/move.c | 1 - + fs/bcachefs/recovery.c | 4 +--- + fs/bcachefs/reflink.c | 3 +-- + fs/bcachefs/xattr.c | 3 +-- + 15 files changed, 6 insertions(+), 59 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index 2c59b05da484..20cdc7999244 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -327,7 +327,6 @@ retry: + ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + btree_err: + if (ret == -EINTR) +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 13e1a60fd7c6..61380e16623f 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -316,7 +316,6 @@ retry: + + bch2_trans_update(trans, iter, &a->k_i); + ret = bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOMARK| + flags); +@@ -913,7 +912,6 @@ retry: + */ + ret = bch2_trans_commit(trans, NULL, + invalidating_cached_data ? 
journal_seq : NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index d72da179f866..aa87477b51e1 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -16,7 +16,6 @@ void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, + struct bkey_i *); + + enum { +- __BTREE_INSERT_ATOMIC, + __BTREE_INSERT_NOUNLOCK, + __BTREE_INSERT_NOFAIL, + __BTREE_INSERT_NOCHECK_RW, +@@ -35,12 +34,6 @@ enum { + __BCH_HASH_SET_MUST_REPLACE, + }; + +-/* +- * Don't drop/retake locks before doing btree update, instead return -EINTR if +- * we had to drop locks for any reason +- */ +-#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC) +- + /* + * Don't drop locks _after_ successfully updating btree: + */ +@@ -101,8 +94,7 @@ int __bch2_trans_commit(struct btree_trans *); + * This is main entry point for btree updates. + * + * Return values: +- * -EINTR: locking changed, this function should be called again. Only returned +- * if passed BTREE_INSERT_ATOMIC. ++ * -EINTR: locking changed, this function should be called again. + * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index b964c4212091..e81ac3ed0c41 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -298,8 +298,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); +- EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && +- !(trans->flags & BTREE_INSERT_ATOMIC)); + + BUG_ON(debug_check_bkeys(c) && + !bkey_deleted(&i->k->k) && +@@ -641,8 +639,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, + + /* + * if the split succeeded without dropping locks the insert will +- * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the +- * caller peeked() and is overwriting won't have changed) ++ * still be atomic (what the caller peeked() and is overwriting ++ * won't have changed) + */ + #if 0 + /* +@@ -713,13 +711,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, + return ret2; + } + +- /* +- * BTREE_ITER_ATOMIC means we have to return -EINTR if we +- * dropped locks: +- */ +- if (!(flags & BTREE_INSERT_ATOMIC)) +- return 0; +- + trace_trans_restart_atomic(trans->ip); + } + +@@ -756,9 +747,6 @@ int __bch2_trans_commit(struct btree_trans *trans) + if (!trans->nr_updates) + goto out_noupdates; + +- /* for the sake of sanity: */ +- EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); +- + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&trans->c->gc_lock); + +@@ -794,8 +782,6 @@ out: + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); + out_noupdates: +- EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); +- + trans_for_each_iter_all(trans, iter) + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + +@@ -896,7 +882,6 @@ retry: + + bch2_trans_update(trans, iter, &delete); + ret = bch2_trans_commit(trans, NULL, journal_seq, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); + if (ret) + break; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 3426925edb41..91f5a4a110b4 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -739,7 +739,6 @@ found_slot: + 
bch2_trans_update(&trans, iter, &stripe->k_i); + + ret = bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); + err: + if (ret == -EINTR) +@@ -822,7 +821,6 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + bch2_trans_update(&trans, iter, sk.k); + + ret = bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE); + if (ret == -EINTR) +@@ -1235,7 +1233,6 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, + bch2_trans_update(trans, iter, &new_key->k_i); + + return bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL|flags); + } + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 5f1c5cd63220..a6ebe41c85eb 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2612,7 +2612,6 @@ reassemble: + + ret = bch2_trans_commit(&trans, &disk_res, + &inode->ei_journal_seq, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + commit_flags); + bch2_disk_reservation_put(c, &disk_res); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 564c69543ffa..3d586e6a4e44 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -143,7 +143,6 @@ retry: + bch2_inode_write(&trans, iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) +@@ -279,7 +278,6 @@ retry: + goto err_before_quota; + + ret = bch2_trans_commit(&trans, NULL, &journal_seq, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + if (unlikely(ret)) { + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, +@@ -409,7 +407,6 @@ static int __bch2_link(struct bch_fs *c, + &dentry->d_name) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + } while (ret == -EINTR); + +@@ -466,7 +463,6 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) + &inode_u, &dentry->d_name) ?: + bch2_trans_commit(&trans, NULL, + &dir->ei_journal_seq, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + } while (ret == -EINTR); +@@ -598,7 +594,6 @@ retry: + mode) ?: + bch2_trans_commit(&trans, NULL, + &journal_seq, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + if (ret == -EINTR) + goto retry; +@@ -733,7 +728,6 @@ retry: + ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + btree_err: +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index a0fdd2ba92f6..cd230dc10984 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -79,7 +79,6 @@ static int remove_dirent(struct btree_trans *trans, + struct bkey_s_c_dirent dirent) + { + return __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, +@@ -99,7 +98,6 @@ static int reattach_inode(struct bch_fs *c, + name = (struct qstr) QSTR(name_buf); + + ret = bch2_trans_do(c, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_LAZY_RW, + bch2_link_trans(&trans, lostfound_inode->bi_inum, + inum, &dir_u, &inode_u, &name)); +@@ -199,7 +197,6 @@ static int hash_redo_key(const struct bch_hash_desc desc, + return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, + tmp, BCH_HASH_SET_MUST_CREATE) ?: + bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + } +@@ -213,7 +210,6 @@ static int 
fsck_hash_delete_at(struct btree_trans *trans, + retry: + ret = bch2_hash_delete_at(trans, desc, info, iter) ?: + bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + if (ret == -EINTR) { +@@ -389,7 +385,6 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, + if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", + buf, strlen(buf), d->v.d_name, len)) { + ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, +@@ -663,7 +658,6 @@ retry: + n->v.d_type = mode_to_type(target.bi_mode); + + ret = __bch2_trans_do(&trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, +@@ -808,7 +802,6 @@ create_lostfound: + bch2_inode_init_early(c, lostfound_inode); + + ret = bch2_trans_do(c, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_create_trans(&trans, +@@ -1280,7 +1273,6 @@ static int check_inode(struct btree_trans *trans, + bch2_inode_pack(&p, &u); + + ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 439f9dc7f7e0..227cfb572ff2 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -496,7 +496,6 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) + bch2_trans_update(&trans, iter, &delete.k_i); + + ret = bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); + } while (ret == -EINTR); + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 95bd53c558a8..37caed669156 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -303,7 +303,6 @@ int bch2_extent_update(struct btree_trans *trans, + ret = bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_USE_RESERVE); + if (!ret && i_sectors_delta) + *i_sectors_delta += delta; +@@ -1738,7 +1737,6 @@ retry: + + bch2_trans_update(&trans, iter, new.k); + ret = bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOWAIT); + if (ret == -EINTR) +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 4b59dcd04cce..db86420bd647 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -79,7 +79,6 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + bch2_trans_update(&trans, iter, sk.k); + + ret = bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); + + /* +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index fad3cc4d587c..2803056288c3 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -154,7 +154,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + + ret = bch2_trans_commit(&trans, &op->res, + op_journal_seq(op), +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + m->data_opts.btree_insert_flags); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index a3ee2f474952..44a1dcdb135d 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -309,14 +309,12 @@ retry: + 0, -((s64) k->k.size), + BCH_BUCKET_MARK_OVERWRITE) ?: + bch2_trans_commit(&trans, &disk_res, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOMARK_OVERWRITES| + BTREE_INSERT_NO_CLEAR_REPLICAS); + } else { + 
ret = bch2_trans_commit(&trans, &disk_res, NULL, +- BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| +@@ -1009,7 +1007,7 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_inode_init_early(c, &lostfound_inode); + + err = "error creating lost+found"; +- ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_ATOMIC, ++ ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + &root_inode, &lostfound_inode, + &lostfound, +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index a65ada691ba1..5cad39fe031f 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -288,8 +288,7 @@ err: + inode_u.bi_size < new_i_size) { + inode_u.bi_size = new_i_size; + ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: +- bch2_trans_commit(&trans, NULL, journal_seq, +- BTREE_INSERT_ATOMIC); ++ bch2_trans_commit(&trans, NULL, journal_seq, 0); + } + } while (ret2 == -EINTR); + +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index fd56ffdbcfcb..3f383039765f 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -328,8 +328,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + +- return bch2_trans_do(c, NULL, &inode->ei_journal_seq, +- BTREE_INSERT_ATOMIC, ++ return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, + bch2_xattr_set(&trans, inode->v.i_ino, + &inode->ei_str_hash, + name, value, size, +-- +cgit v1.2.3 + + +From abcf879329640ff31da2283243854d0cfee8a1d3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 24 Dec 2019 18:03:53 -0500 +Subject: bcachefs: Don't reexecute triggers when retrying transaction commit + +This was causing a bug with transaction iterators overflowing; now, if +triggers have to be reexecuted we always return -EINTR and retry from +the start of the transaction. 
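Condensed shape of __bch2_trans_commit() after this change, with flag checks and error handling stripped (only the ordering is the point; this is not the full function):

	trans->journal_u64s		= 0;
	trans->journal_preres_u64s	= 0;

	/* triggers and journal sizing now run exactly once, before the retry label: */
	trans_for_each_update(trans, i) {
		ret = bch2_trans_mark_update(trans, i->iter, i->k);
		if (ret)
			goto out;	/* -EINTR here: the caller restarts the whole transaction */

		trans->journal_u64s += jset_u64s(i->k->k.u64s);
	}
retry:
	ret = do_bch2_trans_commit(trans, &i);	/* only this step is retried in place */

Anything that would previously have forced triggers to rerun now surfaces as -EINTR, which is why the saved/restored nr_updates and mem_top bookkeeping and the BTREE_INSERT_NO_CLEAR_REPLICAS flag can go away.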
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update.h | 3 -- + fs/bcachefs/btree_update_leaf.c | 94 ++++++++++++++++++++--------------------- + fs/bcachefs/recovery.c | 3 +- + 4 files changed, 48 insertions(+), 53 deletions(-) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 0c0a3f35a62e..2a5b70c72a13 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -278,6 +278,7 @@ struct btree_trans { + struct disk_reservation *disk_res; + unsigned flags; + unsigned journal_u64s; ++ unsigned journal_preres_u64s; + struct replicas_delta_list *fs_usage_deltas; + + struct btree_iter iters_onstack[2]; +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index aa87477b51e1..1534e937a95d 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -26,7 +26,6 @@ enum { + __BTREE_INSERT_JOURNAL_RESERVED, + __BTREE_INSERT_NOMARK_OVERWRITES, + __BTREE_INSERT_NOMARK, +- __BTREE_INSERT_NO_CLEAR_REPLICAS, + __BTREE_INSERT_BUCKET_INVALIDATE, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, +@@ -60,8 +59,6 @@ enum { + /* Don't call mark new key at all: */ + #define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) + +-#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) +- + #define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) + + /* Don't block on allocation failure (for new btree nodes: */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e81ac3ed0c41..d0aca7f9b3d3 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -515,44 +515,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + { + struct btree_insert_entry *i; + struct btree_iter *iter; +- unsigned idx, u64s, journal_preres_u64s = 0; ++ unsigned idx; + int ret; + +- /* +- * note: running triggers will append more updates to the list of +- * updates as we're walking it: +- */ +- trans_for_each_update(trans, i) { +- /* we know trans->nounlock won't be set here: */ +- if (unlikely(!(i->iter->locks_want < 1 +- ? 
__bch2_btree_iter_upgrade(i->iter, 1) +- : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { +- trace_trans_restart_upgrade(trans->ip); +- return -EINTR; +- } +- +- if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && +- update_has_trans_triggers(i)) { +- ret = bch2_trans_mark_update(trans, i->iter, i->k); +- if (unlikely(ret)) { +- if (ret == -EINTR) +- trace_trans_restart_mark(trans->ip); +- return ret; +- } +- } +- +- u64s = jset_u64s(i->k->k.u64s); +- if (0) +- journal_preres_u64s += u64s; +- trans->journal_u64s += u64s; +- } ++ trans_for_each_update(trans, i) ++ BUG_ON(!btree_node_intent_locked(i->iter, 0)); + + ret = bch2_journal_preres_get(&trans->c->journal, +- &trans->journal_preres, journal_preres_u64s, ++ &trans->journal_preres, trans->journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, +- journal_preres_u64s); ++ trans->journal_preres_u64s); + if (unlikely(ret)) + return ret; + +@@ -740,8 +714,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + { + struct btree_insert_entry *i = NULL; + struct btree_iter *iter; +- unsigned orig_nr_updates = trans->nr_updates; +- unsigned orig_mem_top = trans->mem_top; ++ unsigned u64s; + int ret = 0; + + if (!trans->nr_updates) +@@ -752,25 +725,50 @@ int __bch2_trans_commit(struct btree_trans *trans) + + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + ++ trans->journal_u64s = 0; ++ trans->journal_preres_u64s = 0; ++ + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!percpu_ref_tryget(&trans->c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); + if (ret) + return ret; + } ++ ++ /* ++ * note: running triggers will append more updates to the list of ++ * updates as we're walking it: ++ */ ++ trans_for_each_update(trans, i) { ++ /* we know trans->nounlock won't be set here: */ ++ if (unlikely(!(i->iter->locks_want < 1 ++ ? 
__bch2_btree_iter_upgrade(i->iter, 1) ++ : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { ++ trace_trans_restart_upgrade(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ ++ if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && ++ update_has_trans_triggers(i)) { ++ ret = bch2_trans_mark_update(trans, i->iter, i->k); ++ if (unlikely(ret)) { ++ if (ret == -EINTR) ++ trace_trans_restart_mark(trans->ip); ++ goto out; ++ } ++ } ++ ++ u64s = jset_u64s(i->k->k.u64s); ++ if (0) ++ trans->journal_preres_u64s += u64s; ++ trans->journal_u64s += u64s; ++ } + retry: + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); +- trans->journal_u64s = 0; + + ret = do_bch2_trans_commit(trans, &i); + +- if (trans->fs_usage_deltas) { +- trans->fs_usage_deltas->used = 0; +- memset(&trans->fs_usage_deltas->memset_start, 0, +- (void *) &trans->fs_usage_deltas->memset_end - +- (void *) &trans->fs_usage_deltas->memset_start); +- } +- + /* make sure we didn't drop or screw up locks: */ + bch2_btree_trans_verify_locks(trans); + +@@ -792,19 +790,19 @@ out_noupdates: + trans->nr_updates = 0; + trans->mem_top = 0; + ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset(&trans->fs_usage_deltas->memset_start, 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } ++ + return ret; + err: + ret = bch2_trans_commit_error(trans, i, ret); +- +- /* can't loop if it was passed in and we changed it: */ +- if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) +- ret = -EINTR; + if (ret) + goto out; + +- /* free updates and memory used by triggers, they'll be reexecuted: */ +- trans->nr_updates = orig_nr_updates; +- trans->mem_top = orig_mem_top; + goto retry; + } + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 44a1dcdb135d..c366050d572c 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -311,8 +311,7 @@ retry: + bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_NOMARK_OVERWRITES| +- BTREE_INSERT_NO_CLEAR_REPLICAS); ++ BTREE_INSERT_NOMARK_OVERWRITES); + } else { + ret = bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_NOFAIL| +-- +cgit v1.2.3 + + +From c2731f0426745ff579ffeab1386138e18e4a4677 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 26 Dec 2019 14:54:43 -0500 +Subject: bcachefs: Don't export __bch2_fs_read_write + +BTREE_INSERT_LAZY_RW was added for this since this code was written; use +it instead. 
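Concretely, the only caller outside super.c was in bch2_fs_initialize(); instead of flipping the filesystem read-write up front, it now passes the flag down, as the recovery.c hunk below shows. BTREE_INSERT_LAZY_RW (judging from its other users in this series, such as fsck) lets the commit path bring the filesystem read-write on demand:

	ret = bch2_btree_insert(c, BTREE_ID_INODES,
				&packed_inode.inode.k_i,
				NULL, NULL, BTREE_INSERT_LAZY_RW);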
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 7 +------ + fs/bcachefs/super.c | 2 +- + fs/bcachefs/super.h | 1 - + 3 files changed, 2 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index c366050d572c..9c90d2bbb7cc 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -986,11 +986,6 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_fs_journal_start(&c->journal, 1, &journal); + bch2_journal_set_replay_done(&c->journal); + +- err = "error going read write"; +- ret = __bch2_fs_read_write(c, true); +- if (ret) +- goto err; +- + bch2_inode_init(c, &root_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; +@@ -999,7 +994,7 @@ int bch2_fs_initialize(struct bch_fs *c) + err = "error creating root directory"; + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &packed_inode.inode.k_i, +- NULL, NULL, 0); ++ NULL, NULL, BTREE_INSERT_LAZY_RW); + if (ret) + goto err; + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index a317e089882d..2b4c39e74125 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -365,7 +365,7 @@ static int bch2_fs_read_write_late(struct bch_fs *c) + return 0; + } + +-int __bch2_fs_read_write(struct bch_fs *c, bool early) ++static int __bch2_fs_read_write(struct bch_fs *c, bool early) + { + struct bch_dev *ca; + unsigned i; +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +index 9204e8fdabdd..543cc5422d9e 100644 +--- a/fs/bcachefs/super.h ++++ b/fs/bcachefs/super.h +@@ -218,7 +218,6 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); + bool bch2_fs_emergency_read_only(struct bch_fs *); + void bch2_fs_read_only(struct bch_fs *); + +-int __bch2_fs_read_write(struct bch_fs *, bool); + int bch2_fs_read_write(struct bch_fs *); + int bch2_fs_read_write_early(struct bch_fs *); + +-- +cgit v1.2.3 + + +From 5a7f3013d70b6064dd626c1550660ce58bc4c425 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 27 Dec 2019 13:44:03 -0500 +Subject: bcachefs: Fix a use after free + +op->end_io may free the op struct + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 37caed669156..86dee7e80daf 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -500,12 +500,13 @@ static void bch2_write_done(struct closure *cl) + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + +- if (op->end_io) ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); + op->end_io(op); +- if (cl->parent) ++ } else { + closure_return(cl); +- else +- closure_debug_destroy(cl); ++ } + } + + /** +@@ -1232,12 +1233,14 @@ void bch2_write(struct closure *cl) + err: + if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) + bch2_disk_reservation_put(c, &op->res); +- if (op->end_io) ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); + op->end_io(op); +- if (cl->parent) ++ } else { + closure_return(cl); +- else +- closure_debug_destroy(cl); ++ } + } + + /* Cache promotion on read */ +-- +cgit v1.2.3 + + +From 7beb02f563727e29fd89ef80fa013a0bcd133f9d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 27 Dec 2019 20:42:06 -0500 +Subject: bcachefs: Add an assertion to track down a heisenbug + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/btree_update_interior.c 
b/fs/bcachefs/btree_update_interior.c +index 4c34b9da9d52..fcd4b809c807 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1657,6 +1657,8 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, + size_t sib_u64s; + int ret = 0; + ++ BUG_ON(!btree_node_locked(iter, level)); ++ + closure_init_stack(&cl); + retry: + BUG_ON(!btree_node_locked(iter, level)); +-- +cgit v1.2.3 + + +From 134c38f46883c5e11a055e7784f9e2da691f6fad Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 28 Dec 2019 20:17:06 -0500 +Subject: bcachefs: Convert some enums to x-macros + +Helps for preventing things from getting out of sync. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 2 +- + fs/bcachefs/bcachefs_format.h | 147 ++++++++++++++++++++---------------- + fs/bcachefs/checksum.h | 4 +- + fs/bcachefs/compress.c | 38 +++++----- + fs/bcachefs/extents.c | 6 +- + fs/bcachefs/fsck.c | 4 +- + fs/bcachefs/io.c | 6 +- + fs/bcachefs/journal_seq_blacklist.c | 4 +- + fs/bcachefs/move.c | 2 +- + fs/bcachefs/opts.c | 20 +++-- + fs/bcachefs/opts.h | 17 +++-- + fs/bcachefs/recovery.c | 8 +- + fs/bcachefs/reflink.c | 2 +- + fs/bcachefs/str_hash.h | 2 +- + fs/bcachefs/sysfs.c | 2 +- + 15 files changed, 145 insertions(+), 119 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index ce6f74ff6581..58e4c494b540 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -716,7 +716,7 @@ struct bch_fs { + struct rhashtable promote_table; + + mempool_t compression_bounce[2]; +- mempool_t compress_workspace[BCH_COMPRESSION_NR]; ++ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; + mempool_t decompress_workspace; + ZSTD_parameters zstd_params; + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 66af4f4cdd53..966da0149861 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -432,47 +432,6 @@ struct bch_csum { + __le64 hi; + } __attribute__((packed, aligned(8))); + +-enum bch_csum_type { +- BCH_CSUM_NONE = 0, +- BCH_CSUM_CRC32C_NONZERO = 1, +- BCH_CSUM_CRC64_NONZERO = 2, +- BCH_CSUM_CHACHA20_POLY1305_80 = 3, +- BCH_CSUM_CHACHA20_POLY1305_128 = 4, +- BCH_CSUM_CRC32C = 5, +- BCH_CSUM_CRC64 = 6, +- BCH_CSUM_NR = 7, +-}; +- +-static const unsigned bch_crc_bytes[] = { +- [BCH_CSUM_NONE] = 0, +- [BCH_CSUM_CRC32C_NONZERO] = 4, +- [BCH_CSUM_CRC32C] = 4, +- [BCH_CSUM_CRC64_NONZERO] = 8, +- [BCH_CSUM_CRC64] = 8, +- [BCH_CSUM_CHACHA20_POLY1305_80] = 10, +- [BCH_CSUM_CHACHA20_POLY1305_128] = 16, +-}; +- +-static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) +-{ +- switch (type) { +- case BCH_CSUM_CHACHA20_POLY1305_80: +- case BCH_CSUM_CHACHA20_POLY1305_128: +- return true; +- default: +- return false; +- } +-} +- +-enum bch_compression_type { +- BCH_COMPRESSION_NONE = 0, +- BCH_COMPRESSION_LZ4_OLD = 1, +- BCH_COMPRESSION_GZIP = 2, +- BCH_COMPRESSION_LZ4 = 3, +- BCH_COMPRESSION_ZSTD = 4, +- BCH_COMPRESSION_NR = 5, +-}; +- + #define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ +@@ -1316,17 +1275,29 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); + + LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + +-/* Features: */ +-enum bch_sb_features { +- BCH_FEATURE_LZ4 = 0, +- BCH_FEATURE_GZIP = 1, +- BCH_FEATURE_ZSTD = 2, +- BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ +- BCH_FEATURE_EC = 4, +- BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, +- BCH_FEATURE_REFLINK = 6, +- BCH_FEATURE_NEW_SIPHASH 
= 7, +- BCH_FEATURE_INLINE_DATA = 8, ++/* ++ * Features: ++ * ++ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist ++ * reflink: gates KEY_TYPE_reflink ++ * inline_data: gates KEY_TYPE_inline_data ++ * new_siphash: gates BCH_STR_HASH_SIPHASH ++ */ ++#define BCH_SB_FEATURES() \ ++ x(lz4, 0) \ ++ x(gzip, 1) \ ++ x(zstd, 2) \ ++ x(atomic_nlink, 3) \ ++ x(ec, 4) \ ++ x(journal_seq_blacklist_v3, 5) \ ++ x(reflink, 6) \ ++ x(new_siphash, 7) \ ++ x(inline_data, 8) ++ ++enum bch_sb_feature { ++#define x(f, n) BCH_FEATURE_##f, ++ BCH_SB_FEATURES() ++#undef x + BCH_FEATURE_NR, + }; + +@@ -1346,13 +1317,6 @@ enum bch_error_actions { + BCH_NR_ERROR_ACTIONS = 3, + }; + +-enum bch_csum_opts { +- BCH_CSUM_OPT_NONE = 0, +- BCH_CSUM_OPT_CRC32C = 1, +- BCH_CSUM_OPT_CRC64 = 2, +- BCH_CSUM_OPT_NR = 3, +-}; +- + enum bch_str_hash_type { + BCH_STR_HASH_CRC32C = 0, + BCH_STR_HASH_CRC64 = 1, +@@ -1368,15 +1332,68 @@ enum bch_str_hash_opts { + BCH_STR_HASH_OPT_NR = 3, + }; + ++enum bch_csum_type { ++ BCH_CSUM_NONE = 0, ++ BCH_CSUM_CRC32C_NONZERO = 1, ++ BCH_CSUM_CRC64_NONZERO = 2, ++ BCH_CSUM_CHACHA20_POLY1305_80 = 3, ++ BCH_CSUM_CHACHA20_POLY1305_128 = 4, ++ BCH_CSUM_CRC32C = 5, ++ BCH_CSUM_CRC64 = 6, ++ BCH_CSUM_NR = 7, ++}; ++ ++static const unsigned bch_crc_bytes[] = { ++ [BCH_CSUM_NONE] = 0, ++ [BCH_CSUM_CRC32C_NONZERO] = 4, ++ [BCH_CSUM_CRC32C] = 4, ++ [BCH_CSUM_CRC64_NONZERO] = 8, ++ [BCH_CSUM_CRC64] = 8, ++ [BCH_CSUM_CHACHA20_POLY1305_80] = 10, ++ [BCH_CSUM_CHACHA20_POLY1305_128] = 16, ++}; ++ ++static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) ++{ ++ switch (type) { ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++enum bch_csum_opts { ++ BCH_CSUM_OPT_NONE = 0, ++ BCH_CSUM_OPT_CRC32C = 1, ++ BCH_CSUM_OPT_CRC64 = 2, ++ BCH_CSUM_OPT_NR = 3, ++}; ++ + #define BCH_COMPRESSION_TYPES() \ +- x(NONE) \ +- x(LZ4) \ +- x(GZIP) \ +- x(ZSTD) ++ x(none, 0) \ ++ x(lz4_old, 1) \ ++ x(gzip, 2) \ ++ x(lz4, 3) \ ++ x(zstd, 4) + +-enum bch_compression_opts { +-#define x(t) BCH_COMPRESSION_OPT_##t, ++enum bch_compression_type { ++#define x(t, n) BCH_COMPRESSION_TYPE_##t, + BCH_COMPRESSION_TYPES() ++#undef x ++ BCH_COMPRESSION_TYPE_NR ++}; ++ ++#define BCH_COMPRESSION_OPTS() \ ++ x(none, 0) \ ++ x(lz4, 1) \ ++ x(gzip, 2) \ ++ x(zstd, 3) ++ ++enum bch_compression_opts { ++#define x(t, n) BCH_COMPRESSION_OPT_##t, ++ BCH_COMPRESSION_OPTS() + #undef x + BCH_COMPRESSION_OPT_NR + }; +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +index b84e81bac8ff..ca9e45906dc8 100644 +--- a/fs/bcachefs/checksum.h ++++ b/fs/bcachefs/checksum.h +@@ -108,8 +108,8 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) + } + + static const unsigned bch2_compression_opt_to_type[] = { +-#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t, +- BCH_COMPRESSION_TYPES() ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_OPTS() + #undef x + }; + +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 8f0f35b13c79..e311a382c9c2 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -158,14 +158,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + src_data = bio_map_or_bounce(c, src, READ); + + switch (crc.compression_type) { +- case BCH_COMPRESSION_LZ4_OLD: +- case BCH_COMPRESSION_LZ4: ++ case BCH_COMPRESSION_TYPE_lz4_old: ++ case BCH_COMPRESSION_TYPE_lz4: + ret = 
LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret != dst_len) + goto err; + break; +- case BCH_COMPRESSION_GZIP: { ++ case BCH_COMPRESSION_TYPE_gzip: { + z_stream strm = { + .next_in = src_data.b, + .avail_in = src_len, +@@ -185,7 +185,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + goto err; + break; + } +- case BCH_COMPRESSION_ZSTD: { ++ case BCH_COMPRESSION_TYPE_zstd: { + ZSTD_DCtx *ctx; + size_t len; + +@@ -290,10 +290,10 @@ static int attempt_compress(struct bch_fs *c, + void *workspace, + void *dst, size_t dst_len, + void *src, size_t src_len, +- unsigned compression_type) ++ enum bch_compression_type compression_type) + { + switch (compression_type) { +- case BCH_COMPRESSION_LZ4: { ++ case BCH_COMPRESSION_TYPE_lz4: { + int len = src_len; + int ret = LZ4_compress_destSize( + src, dst, +@@ -305,7 +305,7 @@ static int attempt_compress(struct bch_fs *c, + + return ret; + } +- case BCH_COMPRESSION_GZIP: { ++ case BCH_COMPRESSION_TYPE_gzip: { + z_stream strm = { + .next_in = src, + .avail_in = src_len, +@@ -326,7 +326,7 @@ static int attempt_compress(struct bch_fs *c, + + return strm.total_out; + } +- case BCH_COMPRESSION_ZSTD: { ++ case BCH_COMPRESSION_TYPE_zstd: { + ZSTD_CCtx *ctx = zstd_init_cctx(workspace, + zstd_cctx_workspace_bound(&c->zstd_params.cParams)); + +@@ -348,14 +348,14 @@ static int attempt_compress(struct bch_fs *c, + static unsigned __bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, +- unsigned compression_type) ++ enum bch_compression_type compression_type) + { + struct bbuf src_data = { NULL }, dst_data = { NULL }; + void *workspace; + unsigned pad; + int ret = 0; + +- BUG_ON(compression_type >= BCH_COMPRESSION_NR); ++ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); + BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + + /* If it's only one block, don't bother trying to compress: */ +@@ -452,8 +452,8 @@ unsigned bch2_bio_compress(struct bch_fs *c, + /* Don't generate a bigger output than input: */ + dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + +- if (compression_type == BCH_COMPRESSION_LZ4_OLD) +- compression_type = BCH_COMPRESSION_LZ4; ++ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) ++ compression_type = BCH_COMPRESSION_TYPE_lz4; + + compression_type = + __bio_compress(c, dst, dst_len, src, src_len, compression_type); +@@ -465,15 +465,15 @@ unsigned bch2_bio_compress(struct bch_fs *c, + + static int __bch2_fs_compress_init(struct bch_fs *, u64); + +-#define BCH_FEATURE_NONE 0 ++#define BCH_FEATURE_none 0 + + static const unsigned bch2_compression_opt_to_feature[] = { +-#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, +- BCH_COMPRESSION_TYPES() ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, ++ BCH_COMPRESSION_OPTS() + #undef x + }; + +-#undef BCH_FEATURE_NONE ++#undef BCH_FEATURE_none + + static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) + { +@@ -537,11 +537,11 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) + size_t compress_workspace; + size_t decompress_workspace; + } compression_types[] = { +- { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 }, +- { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP, ++ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, ++ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, + zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), + zlib_inflate_workspacesize(), }, +- { 
BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD, ++ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, + zstd_cctx_workspace_bound(¶ms.cParams), + zstd_dctx_workspace_bound() }, + }, *i; +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 6bcc178604b0..a70ece750355 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -614,7 +614,7 @@ unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + ret += !p.ptr.cached && +- p.crc.compression_type == BCH_COMPRESSION_NONE; ++ p.crc.compression_type == BCH_COMPRESSION_TYPE_none; + } + + return ret; +@@ -629,7 +629,7 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && +- p.crc.compression_type != BCH_COMPRESSION_NONE) ++ p.crc.compression_type != BCH_COMPRESSION_TYPE_none) + ret += p.crc.compressed_size; + + return ret; +@@ -1054,7 +1054,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) + if (!bch2_checksum_type_valid(c, crc.csum_type)) + return "invalid checksum type"; + +- if (crc.compression_type >= BCH_COMPRESSION_NR) ++ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) + return "invalid compression type"; + + if (bch2_csum_type_is_encryption(crc.csum_type)) { +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index cd230dc10984..e25f064706ad 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1124,7 +1124,7 @@ static int check_inode_nlink(struct bch_fs *c, + + if (!link->count && + !(u->bi_flags & BCH_INODE_UNLINKED) && +- (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { ++ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { + if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", + u->bi_inum, mode_to_type(u->bi_mode)) == + FSCK_ERR_IGNORE) +@@ -1159,7 +1159,7 @@ static int check_inode_nlink(struct bch_fs *c, + } + + if (i_nlink != real_i_nlink && +- (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { ++ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { + if (fsck_err(c, "inode %llu has wrong i_nlink " + "(type %u i_nlink %u, should be %u)", + u->bi_inum, mode_to_type(u->bi_mode), +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 86dee7e80daf..01f61bc81755 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1139,7 +1139,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) + unsigned sectors; + int ret; + +- bch2_check_set_feature(op->c, BCH_FEATURE_INLINE_DATA); ++ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); + + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, + ARRAY_SIZE(op->inline_keys), +@@ -1786,7 +1786,7 @@ static void __bch2_read_endio(struct work_struct *work) + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + +- if (crc.compression_type != BCH_COMPRESSION_NONE) { ++ if (crc.compression_type != BCH_COMPRESSION_TYPE_none) { + bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + goto decompression_err; +@@ -1994,7 +1994,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + +- if (pick.crc.compression_type != BCH_COMPRESSION_NONE || ++ if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none || + (pick.crc.csum_type != BCH_CSUM_NONE && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) 
&& +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index 787d9f7638d0..a21de0088753 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -121,7 +121,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) + bl->start[nr].end = cpu_to_le64(end); + out_write_sb: + c->disk_sb.sb->features[0] |= +- 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3; ++ 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; + + ret = bch2_write_super(c); + out: +@@ -309,7 +309,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) + + if (!new_nr) + c->disk_sb.sb->features[0] &= +- ~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3); ++ ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); + + bch2_write_super(c); + } +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 2803056288c3..67e9fd3f86f5 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -272,7 +272,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && +- p.crc.compression_type != BCH_COMPRESSION_NONE && ++ p.crc.compression_type != BCH_COMPRESSION_TYPE_none && + bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) + compressed_sectors += p.crc.compressed_size; + +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index cbacd2f36799..94d6c044a27d 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -16,18 +16,24 @@ const char * const bch2_error_actions[] = { + NULL + }; + +-const char * const bch2_csum_types[] = { ++const char * const bch2_sb_features[] = { ++#define x(f, n) #f, ++ BCH_SB_FEATURES() ++#undef x ++ NULL ++}; ++ ++const char * const bch2_csum_opts[] = { + "none", + "crc32c", + "crc64", + NULL + }; + +-const char * const bch2_compression_types[] = { +- "none", +- "lz4", +- "gzip", +- "zstd", ++const char * const bch2_compression_opts[] = { ++#define x(t, n) #t, ++ BCH_COMPRESSION_OPTS() ++#undef x + NULL + }; + +@@ -300,7 +306,7 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) + break; + case Opt_erasure_code: + if (v) +- bch2_check_set_feature(c, BCH_FEATURE_EC); ++ bch2_check_set_feature(c, BCH_FEATURE_ec); + break; + } + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 1f11f4152a6f..1c05effa71e6 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -9,8 +9,9 @@ + #include "bcachefs_format.h" + + extern const char * const bch2_error_actions[]; +-extern const char * const bch2_csum_types[]; +-extern const char * const bch2_compression_types[]; ++extern const char * const bch2_sb_features[]; ++extern const char * const bch2_csum_opts[]; ++extern const char * const bch2_compression_opts[]; + extern const char * const bch2_str_hash_types[]; + extern const char * const bch2_data_types[]; + extern const char * const bch2_cache_replacement_policies[]; +@@ -112,23 +113,23 @@ enum opt_type { + "#", NULL) \ + x(metadata_checksum, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +- OPT_STR(bch2_csum_types), \ ++ OPT_STR(bch2_csum_opts), \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ + NULL, NULL) \ + x(data_checksum, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ +- OPT_STR(bch2_csum_types), \ ++ OPT_STR(bch2_csum_opts), \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ + NULL, NULL) \ + x(compression, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ +- OPT_STR(bch2_compression_types), \ +- BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE, \ ++ OPT_STR(bch2_compression_opts), \ ++ 
BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(background_compression, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ +- OPT_STR(bch2_compression_types), \ +- BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(str_hash, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 9c90d2bbb7cc..97b367252e82 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -866,7 +866,7 @@ int bch2_fs_recovery(struct bch_fs *c) + } + + if (!c->sb.clean) { +- if (!(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { ++ if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { + bch_info(c, "checking inode link counts"); + err = "error in recovery"; + ret = bch2_fsck_inode_nlink(c); +@@ -907,6 +907,7 @@ int bch2_fs_recovery(struct bch_fs *c) + c->disk_sb.sb->version_min = + le16_to_cpu(bcachefs_metadata_version_min); + c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; + write_sb = true; + } + +@@ -917,7 +918,7 @@ int bch2_fs_recovery(struct bch_fs *c) + + if (c->opts.fsck && + !test_bit(BCH_FS_ERROR, &c->flags)) { +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); + write_sb = true; + } +@@ -1024,7 +1025,8 @@ int bch2_fs_initialize(struct bch_fs *c) + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version = c->disk_sb.sb->version_min = + le16_to_cpu(bcachefs_metadata_version_current); +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; + + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 5cad39fe031f..2bf003ba3bd8 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -171,7 +171,7 @@ s64 bch2_remap_range(struct bch_fs *c, + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; + +- bch2_check_set_feature(c, BCH_FEATURE_REFLINK); ++ bch2_check_set_feature(c, BCH_FEATURE_reflink); + + dst_end.offset += remap_sectors; + src_end.offset += remap_sectors; +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 7be4a8e50eaa..3870df2d58ce 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -23,7 +23,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) + case BCH_STR_HASH_OPT_CRC64: + return BCH_STR_HASH_CRC64; + case BCH_STR_HASH_OPT_SIPHASH: +- return c->sb.features & (1ULL << BCH_FEATURE_NEW_SIPHASH) ++ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) + ? 
BCH_STR_HASH_SIPHASH + : BCH_STR_HASH_SIPHASH_OLD; + default: +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 95e527844e0a..a8bcba747582 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -276,7 +276,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) + struct extent_ptr_decoded p; + + extent_for_each_ptr_decode(e, p, entry) { +- if (p.crc.compression_type == BCH_COMPRESSION_NONE) { ++ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_none) { + nr_uncompressed_extents++; + uncompressed_sectors += e.k->size; + } else { +-- +cgit v1.2.3 + + +From 653e46420aadd0fd4357f4bf30ee1cea278c117d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Nov 2019 17:26:04 -0500 +Subject: bcachefs: Use KEY_TYPE_deleted whitouts for extents + +Previously, partial overwrites of existing extents were handled +implicitly by the btree code; when reading in a btree node, we'd do a +mergesort of the different bsets and detect and fix partially +overlapping extents during that mergesort. + +That approach won't work with snapshots: this changes extents to work +like regular keys as far as the btree code is concerned, where a 0 size +KEY_TYPE_deleted whiteout will completely overwrite an existing extent. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 8 +- + fs/bcachefs/bkey_sort.c | 232 ++++++++++++++++++------------------ + fs/bcachefs/btree_io.c | 36 ++++-- + fs/bcachefs/btree_types.h | 2 + + fs/bcachefs/btree_update_interior.c | 7 ++ + fs/bcachefs/btree_update_leaf.c | 2 + + fs/bcachefs/extent_update.c | 88 +++++++++++++- + fs/bcachefs/recovery.c | 2 + + 8 files changed, 244 insertions(+), 133 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 966da0149861..f6141fde830b 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1282,6 +1282,7 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + * reflink: gates KEY_TYPE_reflink + * inline_data: gates KEY_TYPE_inline_data + * new_siphash: gates BCH_STR_HASH_SIPHASH ++ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE + */ + #define BCH_SB_FEATURES() \ + x(lz4, 0) \ +@@ -1292,7 +1293,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + x(journal_seq_blacklist_v3, 5) \ + x(reflink, 6) \ + x(new_siphash, 7) \ +- x(inline_data, 8) ++ x(inline_data, 8) \ ++ x(new_extent_overwrite, 9) + + enum bch_sb_feature { + #define x(f, n) BCH_FEATURE_##f, +@@ -1613,7 +1615,9 @@ struct btree_node { + + LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); + LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); +-/* 8-32 unused */ ++LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, ++ struct btree_node, flags, 8, 9); ++/* 9-32 unused */ + LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); + + struct btree_node_entry { +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 23b51ef57303..18f842012f05 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -130,24 +130,6 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + return nr; + } + +-/* +- * If keys compare equal, compare by pointer order: +- * +- * Necessary for sort_fix_overlapping() - if there are multiple keys that +- * compare equal in different sets, we have to process them newest to oldest. 
+- */ +-static inline int extent_sort_fix_overlapping_cmp(struct btree *b, +- struct bkey_packed *l, +- struct bkey_packed *r) +-{ +- struct bkey ul = bkey_unpack_key(b, l); +- struct bkey ur = bkey_unpack_key(b, r); +- +- return bkey_cmp(bkey_start_pos(&ul), +- bkey_start_pos(&ur)) ?: +- cmp_int((unsigned long) r, (unsigned long) l); +-} +- + static void extent_sort_advance_prev(struct bkey_format *f, + struct btree_nr_keys *nr, + struct bkey_packed *start, +@@ -188,102 +170,6 @@ static void extent_sort_append(struct bch_fs *c, + bkey_reassemble((void *) *prev, k.s_c); + } + +-struct btree_nr_keys +-bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, +- struct sort_iter *iter) +-{ +- struct btree *b = iter->b; +- struct bkey_format *f = &b->format; +- struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; +- struct bkey_packed *prev = NULL; +- struct bkey l_unpacked, r_unpacked; +- struct bkey_s l, r; +- struct btree_nr_keys nr; +- struct bkey_on_stack split; +- +- memset(&nr, 0, sizeof(nr)); +- bkey_on_stack_init(&split); +- +- sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); +- +- while (!sort_iter_end(iter)) { +- l = __bkey_disassemble(b, _l->k, &l_unpacked); +- +- if (iter->used == 1) { +- extent_sort_append(c, f, &nr, dst->start, &prev, l); +- sort_iter_advance(iter, +- extent_sort_fix_overlapping_cmp); +- continue; +- } +- +- r = __bkey_disassemble(b, _r->k, &r_unpacked); +- +- /* If current key and next key don't overlap, just append */ +- if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { +- extent_sort_append(c, f, &nr, dst->start, &prev, l); +- sort_iter_advance(iter, +- extent_sort_fix_overlapping_cmp); +- continue; +- } +- +- /* Skip 0 size keys */ +- if (!r.k->size) { +- __sort_iter_advance(iter, 1, +- extent_sort_fix_overlapping_cmp); +- continue; +- } +- +- /* +- * overlap: keep the newer key and trim the older key so they +- * don't overlap. comparing pointers tells us which one is +- * newer, since the bsets are appended one after the other. 
+- */ +- +- /* can't happen because of comparison func */ +- BUG_ON(_l->k < _r->k && +- !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); +- +- if (_l->k > _r->k) { +- /* l wins, trim r */ +- if (bkey_cmp(l.k->p, r.k->p) >= 0) { +- __sort_iter_advance(iter, 1, +- extent_sort_fix_overlapping_cmp); +- } else { +- bch2_cut_front_s(l.k->p, r); +- extent_save(b, _r->k, r.k); +- __sort_iter_sift(iter, 1, +- extent_sort_fix_overlapping_cmp); +- } +- } else if (bkey_cmp(l.k->p, r.k->p) > 0) { +- +- /* +- * r wins, but it overlaps in the middle of l - split l: +- */ +- bkey_on_stack_reassemble(&split, c, l.s_c); +- bch2_cut_back(bkey_start_pos(r.k), split.k); +- +- bch2_cut_front_s(r.k->p, l); +- extent_save(b, _l->k, l.k); +- +- __sort_iter_sift(iter, 0, +- extent_sort_fix_overlapping_cmp); +- +- extent_sort_append(c, f, &nr, dst->start, +- &prev, bkey_i_to_s(split.k)); +- } else { +- bch2_cut_back_s(bkey_start_pos(r.k), l); +- extent_save(b, _l->k, l.k); +- } +- } +- +- extent_sort_advance_prev(f, &nr, dst->start, &prev); +- +- dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); +- +- bkey_on_stack_exit(&split, c); +- return nr; +-} +- + /* Sort + repack in a new format: */ + struct btree_nr_keys + bch2_sort_repack(struct bset *dst, struct btree *src, +@@ -354,7 +240,7 @@ static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *r) + { + return bkey_cmp_packed(b, l, r) ?: +- (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: ++ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; + } + +@@ -399,6 +285,122 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, + return (u64 *) out - (u64 *) dst; + } + ++/* Compat code for btree_node_old_extent_overwrite: */ ++ ++/* ++ * If keys compare equal, compare by pointer order: ++ * ++ * Necessary for sort_fix_overlapping() - if there are multiple keys that ++ * compare equal in different sets, we have to process them newest to oldest. 
++ */ ++static inline int extent_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ struct bkey ul = bkey_unpack_key(b, l); ++ struct bkey ur = bkey_unpack_key(b, r); ++ ++ return bkey_cmp(bkey_start_pos(&ul), ++ bkey_start_pos(&ur)) ?: ++ cmp_int((unsigned long) r, (unsigned long) l); ++} ++ ++struct btree_nr_keys ++bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) ++{ ++ struct btree *b = iter->b; ++ struct bkey_format *f = &b->format; ++ struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; ++ struct bkey_packed *prev = NULL; ++ struct bkey l_unpacked, r_unpacked; ++ struct bkey_s l, r; ++ struct btree_nr_keys nr; ++ struct bkey_on_stack split; ++ ++ memset(&nr, 0, sizeof(nr)); ++ bkey_on_stack_init(&split); ++ ++ sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); ++ ++ while (!sort_iter_end(iter)) { ++ l = __bkey_disassemble(b, _l->k, &l_unpacked); ++ ++ if (iter->used == 1) { ++ extent_sort_append(c, f, &nr, dst->start, &prev, l); ++ sort_iter_advance(iter, ++ extent_sort_fix_overlapping_cmp); ++ continue; ++ } ++ ++ r = __bkey_disassemble(b, _r->k, &r_unpacked); ++ ++ /* If current key and next key don't overlap, just append */ ++ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { ++ extent_sort_append(c, f, &nr, dst->start, &prev, l); ++ sort_iter_advance(iter, ++ extent_sort_fix_overlapping_cmp); ++ continue; ++ } ++ ++ /* Skip 0 size keys */ ++ if (!r.k->size) { ++ __sort_iter_advance(iter, 1, ++ extent_sort_fix_overlapping_cmp); ++ continue; ++ } ++ ++ /* ++ * overlap: keep the newer key and trim the older key so they ++ * don't overlap. comparing pointers tells us which one is ++ * newer, since the bsets are appended one after the other. 
++ */ ++ ++ /* can't happen because of comparison func */ ++ BUG_ON(_l->k < _r->k && ++ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); ++ ++ if (_l->k > _r->k) { ++ /* l wins, trim r */ ++ if (bkey_cmp(l.k->p, r.k->p) >= 0) { ++ __sort_iter_advance(iter, 1, ++ extent_sort_fix_overlapping_cmp); ++ } else { ++ bch2_cut_front_s(l.k->p, r); ++ extent_save(b, _r->k, r.k); ++ __sort_iter_sift(iter, 1, ++ extent_sort_fix_overlapping_cmp); ++ } ++ } else if (bkey_cmp(l.k->p, r.k->p) > 0) { ++ ++ /* ++ * r wins, but it overlaps in the middle of l - split l: ++ */ ++ bkey_on_stack_reassemble(&split, c, l.s_c); ++ bch2_cut_back(bkey_start_pos(r.k), split.k); ++ ++ bch2_cut_front_s(r.k->p, l); ++ extent_save(b, _l->k, l.k); ++ ++ __sort_iter_sift(iter, 0, ++ extent_sort_fix_overlapping_cmp); ++ ++ extent_sort_append(c, f, &nr, dst->start, ++ &prev, bkey_i_to_s(split.k)); ++ } else { ++ bch2_cut_back_s(bkey_start_pos(r.k), l); ++ extent_save(b, _l->k, l.k); ++ } ++ } ++ ++ extent_sort_advance_prev(f, &nr, dst->start, &prev); ++ ++ dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); ++ ++ bkey_on_stack_exit(&split, c); ++ return nr; ++} ++ + static inline int sort_extents_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 4b1cd4dd0741..5f1c3183fa85 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -23,7 +23,8 @@ + + static void verify_no_dups(struct btree *b, + struct bkey_packed *start, +- struct bkey_packed *end) ++ struct bkey_packed *end, ++ bool extents) + { + #ifdef CONFIG_BCACHEFS_DEBUG + struct bkey_packed *k, *p; +@@ -37,7 +38,7 @@ static void verify_no_dups(struct btree *b, + struct bkey l = bkey_unpack_key(b, p); + struct bkey r = bkey_unpack_key(b, k); + +- BUG_ON(btree_node_is_extents(b) ++ BUG_ON(extents + ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 + : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); + //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); +@@ -148,7 +149,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) + } + + verify_no_dups(b, new_whiteouts, +- (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); ++ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), ++ btree_node_old_extent_overwrite(b)); + + memcpy_u64s(unwritten_whiteouts_start(c, b), + new_whiteouts, b->whiteout_u64s); +@@ -298,7 +300,8 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, + + verify_no_dups(b, + unwritten_whiteouts_start(c, b), +- unwritten_whiteouts_end(c, b)); ++ unwritten_whiteouts_end(c, b), ++ true); + + btree_bounce_free(c, order, used_mempool, whiteouts); + +@@ -378,7 +381,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) + bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + enum compact_mode mode) + { +- return !btree_node_is_extents(b) ++ return !btree_node_old_extent_overwrite(b) + ? bch2_drop_whiteouts(b, mode) + : bch2_compact_extent_whiteouts(c, b, mode); + } +@@ -418,10 +421,10 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, + + start_time = local_clock(); + +- if (btree_node_is_extents(b)) ++ if (btree_node_old_extent_overwrite(b)) + filter_whiteouts = bset_written(b, start_bset); + +- u64s = (btree_node_is_extents(b) ++ u64s = (btree_node_old_extent_overwrite(b) + ? 
bch2_sort_extents + : bch2_sort_keys)(out->keys.start, + &sort_iter, +@@ -707,7 +710,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + bool have_retry) + { + struct bkey_packed *k, *prev = NULL; +- struct bpos prev_pos = POS_MIN; ++ struct bpos prev_pos = POS_MIN; ++ struct bpos prev_data = POS_MIN; + bool seen_non_whiteout = false; + unsigned version; + const char *err; +@@ -840,7 +844,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { + *whiteout_u64s = k->_data - i->_data; + seen_non_whiteout = true; +- } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { ++ } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 || ++ bkey_cmp(prev_pos, u.k->p) > 0) { + btree_err(BTREE_ERR_FATAL, c, b, i, + "keys out of order: %llu:%llu > %llu:%llu", + prev_pos.inode, +@@ -850,7 +855,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + /* XXX: repair this */ + } + ++ if (!bkey_deleted(u.k)) ++ prev_data = u.k->p; + prev_pos = u.k->p; ++ + prev = k; + k = bkey_next_skip_noops(k, vstruct_last(i)); + } +@@ -909,6 +917,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + + bset_encrypt(c, i, b->written << 9); + ++ if (btree_node_is_extents(b) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) ++ set_btree_node_old_extent_overwrite(b); ++ + sectors = vstruct_sectors(b->data, c->block_bits); + + btree_node_set_format(b, b->data->format); +@@ -972,7 +984,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + + set_btree_bset(b, b->set, &b->data->keys); + +- b->nr = (btree_node_is_extents(b) ++ b->nr = (btree_node_old_extent_overwrite(b) + ? bch2_extent_sort_fix_overlapping + : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); + +@@ -1487,7 +1499,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + i->journal_seq = cpu_to_le64(seq); + i->u64s = 0; + +- if (!btree_node_is_extents(b)) { ++ if (!btree_node_old_extent_overwrite(b)) { + sort_iter_add(&sort_iter, + unwritten_whiteouts_start(c, b), + unwritten_whiteouts_end(c, b)); +@@ -1502,7 +1514,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + + b->whiteout_u64s = 0; + +- u64s = btree_node_is_extents(b) ++ u64s = btree_node_old_extent_overwrite(b) + ? 
bch2_sort_extents(vstruct_last(i), &sort_iter, false) + : bch2_sort_keys(i->start, &sort_iter, false); + le16_add_cpu(&i->u64s, u64s); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 2a5b70c72a13..7d03226adeba 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -308,6 +308,7 @@ enum btree_flags { + BTREE_NODE_just_written, + BTREE_NODE_dying, + BTREE_NODE_fake, ++ BTREE_NODE_old_extent_overwrite, + }; + + BTREE_FLAG(read_in_flight); +@@ -321,6 +322,7 @@ BTREE_FLAG(write_in_flight); + BTREE_FLAG(just_written); + BTREE_FLAG(dying); + BTREE_FLAG(fake); ++BTREE_FLAG(old_extent_overwrite); + + static inline struct btree_write *btree_current_write(struct btree *b) + { +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index fcd4b809c807..6f98451aefef 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -374,6 +374,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + SET_BTREE_NODE_LEVEL(b->data, level); + b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0]; + ++ if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) ++ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); ++ ++ if (btree_node_is_extents(b) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) ++ set_btree_node_old_extent_overwrite(b); ++ + bch2_btree_build_aux_trees(b); + + btree_node_will_make_reachable(as, b); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index d0aca7f9b3d3..4ef290d96c89 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -267,6 +267,8 @@ static void btree_insert_key_leaf(struct btree_trans *trans, + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + ++ insert->k->k.needs_whiteout = false; ++ + if (!btree_node_is_extents(b)) + bch2_insert_fixup_key(trans, insert); + else +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index e021e1623a91..d2f1414f28e2 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -186,11 +186,26 @@ bch2_extent_can_insert(struct btree_trans *trans, + + overlap = bch2_extent_overlap(&insert->k->k, k.k); + ++ /* ++ * If we're overwriting an existing extent, we may need to emit ++ * a whiteout - unless we're inserting a new extent at the same ++ * position: ++ */ ++ if (k.k->needs_whiteout && ++ (!bkey_whiteout(&insert->k->k) || ++ bkey_cmp(k.k->p, insert->k->k.p))) ++ *u64s += BKEY_U64s; ++ ++ /* ++ * If we're partially overwriting an existing extent which has ++ * been written out to disk, we'll need to emit a new version of ++ * that extent: ++ */ + if (bkey_written(l->b, _k) && + overlap != BCH_EXTENT_OVERLAP_ALL) + *u64s += _k->u64s; + +- /* account for having to split existing extent: */ ++ /* And we may be splitting an existing extent: */ + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) + *u64s += _k->u64s; + +@@ -286,6 +301,23 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); + } + ++static void pack_push_whiteout(struct bch_fs *c, struct btree *b, ++ struct bpos pos) ++{ ++ struct bkey_packed k; ++ ++ if (!bkey_pack_pos(&k, pos, b)) { ++ struct bkey_i tmp; ++ ++ bkey_init(&tmp.k); ++ tmp.k.p = pos; ++ bkey_copy(&k, &tmp); ++ } ++ ++ k.needs_whiteout = true; ++ push_whiteout(c, b, &k); ++} ++ + static void + extent_drop(struct bch_fs *c, struct btree_iter *iter, + struct 
bkey_packed *_k, struct bkey_s k) +@@ -297,7 +329,12 @@ extent_drop(struct bch_fs *c, struct btree_iter *iter, + + k.k->size = 0; + k.k->type = KEY_TYPE_deleted; +- k.k->needs_whiteout = false; ++ ++ if (!btree_node_old_extent_overwrite(l->b) && ++ k.k->needs_whiteout) { ++ pack_push_whiteout(c, l->b, k.k->p); ++ k.k->needs_whiteout = false; ++ } + + if (_k >= btree_bset_last(l->b)->start) { + unsigned u64s = _k->u64s; +@@ -322,12 +359,29 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, + bkey_on_stack_init(&tmp); + bkey_on_stack_init(&split); + ++ if (!btree_node_old_extent_overwrite(l->b)) { ++ if (!bkey_whiteout(&insert->k) && ++ !bkey_cmp(k.k->p, insert->k.p)) { ++ insert->k.needs_whiteout = k.k->needs_whiteout; ++ k.k->needs_whiteout = false; ++ } ++ } else { ++ insert->k.needs_whiteout |= k.k->needs_whiteout; ++ } ++ + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + if (bkey_written(l->b, _k)) { + bkey_on_stack_reassemble(&tmp, c, k.s_c); + bch2_cut_front(insert->k.p, tmp.k); + ++ /* ++ * needs_whiteout was propagated to new version of @k, ++ * @tmp: ++ */ ++ if (!btree_node_old_extent_overwrite(l->b)) ++ k.k->needs_whiteout = false; ++ + extent_drop(c, iter, _k, k); + extent_bset_insert(c, iter, tmp.k); + } else { +@@ -348,9 +402,26 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, + bkey_on_stack_reassemble(&tmp, c, k.s_c); + bch2_cut_back(bkey_start_pos(&insert->k), tmp.k); + ++ /* ++ * @tmp has different position than @k, needs_whiteout ++ * should not be propagated: ++ */ ++ if (!btree_node_old_extent_overwrite(l->b)) ++ tmp.k->k.needs_whiteout = false; ++ + extent_drop(c, iter, _k, k); + extent_bset_insert(c, iter, tmp.k); + } else { ++ /* ++ * position of @k is changing, emit a whiteout if ++ * needs_whiteout is set: ++ */ ++ if (!btree_node_old_extent_overwrite(l->b) && ++ k.k->needs_whiteout) { ++ pack_push_whiteout(c, l->b, k.k->p); ++ k.k->needs_whiteout = false; ++ } ++ + btree_keys_account_val_delta(l->b, _k, + bch2_cut_back_s(bkey_start_pos(&insert->k), k)); + extent_save(l->b, _k, k.k); +@@ -367,10 +438,17 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, + bkey_on_stack_reassemble(&split, c, k.s_c); + bch2_cut_back(bkey_start_pos(&insert->k), split.k); + ++ if (!btree_node_old_extent_overwrite(l->b)) ++ split.k->k.needs_whiteout = false; ++ ++ /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */ + if (bkey_written(l->b, _k)) { + bkey_on_stack_reassemble(&tmp, c, k.s_c); + bch2_cut_front(insert->k.p, tmp.k); + ++ if (!btree_node_old_extent_overwrite(l->b)) ++ k.k->needs_whiteout = false; ++ + extent_drop(c, iter, _k, k); + extent_bset_insert(c, iter, tmp.k); + } else { +@@ -462,7 +540,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, + bch2_cut_front(cur_end, insert); + bch2_btree_iter_set_pos_same_leaf(iter, cur_end); + } else { +- insert->k.needs_whiteout |= k.k->needs_whiteout; + extent_squash(c, iter, insert, _k, k, overlap); + } + +@@ -480,7 +557,10 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, + if (insert->k.type == KEY_TYPE_deleted) + insert->k.type = KEY_TYPE_discard; + +- extent_bset_insert(c, iter, insert); ++ if (!bkey_whiteout(&insert->k) || ++ btree_node_old_extent_overwrite(l->b)) ++ extent_bset_insert(c, iter, insert); ++ + bch2_btree_journal_key(trans, iter, insert); + } + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 97b367252e82..c7367a679b22 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -908,6 +908,7 @@ int bch2_fs_recovery(struct bch_fs 
*c) + le16_to_cpu(bcachefs_metadata_version_min); + c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; + write_sb = true; + } + +@@ -1027,6 +1028,7 @@ int bch2_fs_initialize(struct bch_fs *c) + le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; + + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); +-- +cgit v1.2.3 + + +From 7475a2976e37ee166428f4f7ab5ab5358df4e3c2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Dec 2019 12:43:19 -0500 +Subject: bcachefs: Use bch2_trans_reset in bch2_trans_commit() + +Clean up a bit of duplicated code. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 10 +++++++++- + fs/bcachefs/btree_iter.h | 1 + + fs/bcachefs/btree_update_leaf.c | 18 +----------------- + 3 files changed, 11 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 6f19304bb913..8c42d09fd050 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2104,7 +2104,15 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + if (flags & TRANS_RESET_MEM) + trans->mem_top = 0; + +- bch2_btree_iter_traverse_all(trans); ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset(&trans->fs_usage_deltas->memset_start, 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } ++ ++ if (!(flags & TRANS_RESET_NOTRAVERSE)) ++ bch2_btree_iter_traverse_all(trans); + } + + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index d750c4e5f18e..962380925511 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -291,6 +291,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, + + #define TRANS_RESET_ITERS (1 << 0) + #define TRANS_RESET_MEM (1 << 1) ++#define TRANS_RESET_NOTRAVERSE (1 << 2) + + void bch2_trans_reset(struct btree_trans *, unsigned); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 4ef290d96c89..415848dbedb1 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -715,7 +715,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + int __bch2_trans_commit(struct btree_trans *trans) + { + struct btree_insert_entry *i = NULL; +- struct btree_iter *iter; + unsigned u64s; + int ret = 0; + +@@ -782,22 +781,7 @@ out: + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); + out_noupdates: +- trans_for_each_iter_all(trans, iter) +- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; +- +- if (!ret) { +- bch2_trans_unlink_iters(trans); +- trans->iters_touched = 0; +- } +- trans->nr_updates = 0; +- trans->mem_top = 0; +- +- if (trans->fs_usage_deltas) { +- trans->fs_usage_deltas->used = 0; +- memset(&trans->fs_usage_deltas->memset_start, 0, +- (void *) &trans->fs_usage_deltas->memset_end - +- (void *) &trans->fs_usage_deltas->memset_start); +- } ++ bch2_trans_reset(trans, TRANS_RESET_MEM|TRANS_RESET_NOTRAVERSE); + + return ret; + err: +-- +cgit v1.2.3 + + +From f440c5c6f799cca4e89b9d1b07db61172509c42f 
Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Dec 2019 13:08:26 -0500 +Subject: bcachefs: Make btree_insert_entry more private to update path + +This should be private to btree_update_leaf.c, and we might end up +removing it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 8 ++--- + fs/bcachefs/btree_types.h | 4 +-- + fs/bcachefs/btree_update.h | 2 +- + fs/bcachefs/btree_update_leaf.c | 73 +++++++++++++++++++++-------------------- + fs/bcachefs/buckets.c | 12 +++---- + fs/bcachefs/buckets.h | 4 +-- + fs/bcachefs/extent_update.c | 20 +++++------ + fs/bcachefs/extent_update.h | 7 ++-- + fs/bcachefs/extents.h | 1 - + 9 files changed, 67 insertions(+), 64 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8c42d09fd050..be303db951e2 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1810,8 +1810,8 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, + bch2_trans_unlock(trans); + + iters_bytes = sizeof(struct btree_iter) * new_size; +- updates_bytes = sizeof(struct btree_insert_entry) * (new_size + 4); +- sorted_bytes = sizeof(u8) * (new_size + 4); ++ updates_bytes = sizeof(struct btree_insert_entry) * new_size; ++ sorted_bytes = sizeof(u8) * new_size; + + new_iters = kmalloc(iters_bytes + + updates_bytes + +@@ -2163,6 +2163,6 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) + + return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * nr + +- sizeof(struct btree_insert_entry) * (nr + 4) + +- sizeof(u8) * (nr + 4)); ++ sizeof(struct btree_insert_entry) * nr + ++ sizeof(u8) * nr); + } +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 7d03226adeba..b01c064fe19c 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -282,8 +282,8 @@ struct btree_trans { + struct replicas_delta_list *fs_usage_deltas; + + struct btree_iter iters_onstack[2]; +- struct btree_insert_entry updates_onstack[6]; +- u8 updates_sorted_onstack[6]; ++ struct btree_insert_entry updates_onstack[2]; ++ u8 updates_sorted_onstack[2]; + }; + + #define BTREE_FLAG(flag) \ +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 1534e937a95d..7f61351aed71 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -111,7 +111,7 @@ static inline void bch2_trans_update(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k) + { +- EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); ++ EBUG_ON(trans->nr_updates >= trans->nr_iters); + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 415848dbedb1..3102c4d6fa08 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -239,40 +239,39 @@ void bch2_btree_journal_key(struct btree_trans *trans, + } + + static void bch2_insert_fixup_key(struct btree_trans *trans, +- struct btree_insert_entry *insert) ++ struct btree_iter *iter, ++ struct bkey_i *insert) + { +- struct btree_iter *iter = insert->iter; + struct btree_iter_level *l = &iter->l[0]; + + EBUG_ON(iter->level); +- EBUG_ON(insert->k->k.u64s > ++ EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(trans->c, l->b)); + +- if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, +- insert->k))) +- bch2_btree_journal_key(trans, iter, insert->k); ++ if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert))) ++ bch2_btree_journal_key(trans, iter, insert); + } + + /** + * 
btree_insert_key - insert a key one key into a leaf node + */ + static void btree_insert_key_leaf(struct btree_trans *trans, +- struct btree_insert_entry *insert) ++ struct btree_iter *iter, ++ struct bkey_i *insert) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter = insert->iter; + struct btree *b = iter->l[0].b; + struct bset_tree *t = bset_tree_last(b); + int old_u64s = bset_u64s(t); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + +- insert->k->k.needs_whiteout = false; ++ insert->k.needs_whiteout = false; + + if (!btree_node_is_extents(b)) +- bch2_insert_fixup_key(trans, insert); ++ bch2_insert_fixup_key(trans, iter, insert); + else +- bch2_insert_fixup_extent(trans, insert); ++ bch2_insert_fixup_extent(trans, iter, insert); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; +@@ -286,24 +285,25 @@ static void btree_insert_key_leaf(struct btree_trans *trans, + bch2_maybe_compact_whiteouts(c, b)) + bch2_btree_iter_reinit_node(iter, b); + +- trace_btree_insert_key(c, b, insert->k); ++ trace_btree_insert_key(c, b, insert); + } + + /* Normal update interface: */ + + static inline void btree_insert_entry_checks(struct btree_trans *trans, +- struct btree_insert_entry *i) ++ struct btree_iter *iter, ++ struct bkey_i *insert) + { + struct bch_fs *c = trans->c; + +- BUG_ON(i->iter->level); +- BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); +- EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && +- bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); ++ BUG_ON(iter->level); ++ BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos)); ++ EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0); + + BUG_ON(debug_check_bkeys(c) && +- !bkey_deleted(&i->k->k) && +- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id)); ++ !bkey_deleted(&insert->k) && ++ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); + } + + static noinline int +@@ -344,11 +344,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, + + static enum btree_insert_ret + btree_key_can_insert(struct btree_trans *trans, +- struct btree_insert_entry *insert, ++ struct btree_iter *iter, ++ struct bkey_i *insert, + unsigned *u64s) + { + struct bch_fs *c = trans->c; +- struct btree *b = insert->iter->l[0].b; ++ struct btree *b = iter->l[0].b; + static enum btree_insert_ret ret; + + if (unlikely(btree_node_fake(b))) +@@ -356,7 +357,7 @@ btree_key_can_insert(struct btree_trans *trans, + + ret = !btree_node_is_extents(b) + ? 
BTREE_INSERT_OK +- : bch2_extent_can_insert(trans, insert, u64s); ++ : bch2_extent_can_insert(trans, iter, insert, u64s); + if (ret) + return ret; + +@@ -367,21 +368,22 @@ btree_key_can_insert(struct btree_trans *trans, + } + + static inline void do_btree_insert_one(struct btree_trans *trans, +- struct btree_insert_entry *insert) ++ struct btree_iter *iter, ++ struct bkey_i *insert) + { +- btree_insert_key_leaf(trans, insert); ++ btree_insert_key_leaf(trans, iter, insert); + } + +-static inline bool update_has_trans_triggers(struct btree_insert_entry *i) ++static inline bool iter_has_trans_triggers(struct btree_iter *iter) + { +- return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id); ++ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); + } + +-static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i) ++static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) + { + return (BTREE_NODE_TYPE_HAS_TRIGGERS & + ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & +- (1U << i->iter->btree_id); ++ (1U << iter->btree_id); + } + + static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) +@@ -402,7 +404,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + + trans_for_each_update(trans, i) + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) +- bch2_mark_update(trans, i, NULL, ++ bch2_mark_update(trans, i->iter, i->k, NULL, + mark_flags|BCH_BUCKET_MARK_GC); + } + +@@ -439,7 +441,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + u64s = 0; + + u64s += i->k->k.u64s; +- ret = btree_key_can_insert(trans, i, &u64s); ++ ret = btree_key_can_insert(trans, i->iter, i->k, &u64s); + if (ret) { + *stopped_at = i; + return ret; +@@ -489,8 +491,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + trans_for_each_update(trans, i) + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && +- update_has_nontrans_triggers(i)) +- bch2_mark_update(trans, i, fs_usage, mark_flags); ++ iter_has_nontrans_triggers(i->iter)) ++ bch2_mark_update(trans, i->iter, i->k, ++ fs_usage, mark_flags); + + if (marking) + bch2_trans_fs_usage_apply(trans, fs_usage); +@@ -499,7 +502,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + bch2_trans_mark_gc(trans); + + trans_for_each_update(trans, i) +- do_btree_insert_one(trans, i); ++ do_btree_insert_one(trans, i->iter, i->k); + err: + if (marking) { + bch2_fs_usage_scratch_put(c, fs_usage); +@@ -549,7 +552,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + trans_for_each_update(trans, i) +- btree_insert_entry_checks(trans, i); ++ btree_insert_entry_checks(trans, i->iter, i->k); + bch2_btree_trans_verify_locks(trans); + + /* +@@ -751,7 +754,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + } + + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && +- update_has_trans_triggers(i)) { ++ iter_has_trans_triggers(i->iter)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (unlikely(ret)) { + if (ret == -EINTR) +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 5a34dab013f3..5b1005117b3f 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1300,12 +1300,12 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, + } + + int bch2_mark_update(struct btree_trans *trans, +- struct btree_insert_entry *insert, ++ struct btree_iter *iter, ++ struct bkey_i *insert, + struct bch_fs_usage *fs_usage, + unsigned flags) + { + struct bch_fs *c = trans->c; +- 
struct btree_iter *iter = insert->iter; + struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; +@@ -1314,8 +1314,8 @@ int bch2_mark_update(struct btree_trans *trans, + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + +- bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), +- 0, insert->k->k.size, ++ bch2_mark_key_locked(c, bkey_i_to_s_c(insert), ++ 0, insert->k.size, + fs_usage, trans->journal_res.seq, + BCH_BUCKET_MARK_INSERT|flags); + +@@ -1328,7 +1328,7 @@ int bch2_mark_update(struct btree_trans *trans, + */ + if ((iter->btree_id == BTREE_ID_ALLOC || + iter->btree_id == BTREE_ID_EC) && +- !bkey_deleted(&insert->k->k)) ++ !bkey_deleted(&insert->k)) + return 0; + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, +@@ -1336,7 +1336,7 @@ int bch2_mark_update(struct btree_trans *trans, + struct bkey unpacked; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + +- ret = bch2_mark_overwrite(trans, iter, k, insert->k, ++ ret = bch2_mark_overwrite(trans, iter, k, insert, + fs_usage, flags); + if (ret <= 0) + break; +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index ad6f731b1cea..60a12bb0d8f7 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -276,8 +276,8 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, + int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, struct bkey_i *, + struct bch_fs_usage *, unsigned); +-int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, +- struct bch_fs_usage *, unsigned); ++int bch2_mark_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct bch_fs_usage *, unsigned); + + int bch2_replicas_delta_list_apply(struct bch_fs *, + struct bch_fs_usage *, +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index d2f1414f28e2..846d77dc2530 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -166,10 +166,11 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) + + enum btree_insert_ret + bch2_extent_can_insert(struct btree_trans *trans, +- struct btree_insert_entry *insert, ++ struct btree_iter *iter, ++ struct bkey_i *insert, + unsigned *u64s) + { +- struct btree_iter_level *l = &insert->iter->l[0]; ++ struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter = l->iter; + struct bkey_packed *_k; + struct bkey unpacked; +@@ -179,12 +180,12 @@ bch2_extent_can_insert(struct btree_trans *trans, + KEY_TYPE_discard))) { + struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked); + enum bch_extent_overlap overlap = +- bch2_extent_overlap(&insert->k->k, k.k); ++ bch2_extent_overlap(&insert->k, k.k); + +- if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0) ++ if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + break; + +- overlap = bch2_extent_overlap(&insert->k->k, k.k); ++ overlap = bch2_extent_overlap(&insert->k, k.k); + + /* + * If we're overwriting an existing extent, we may need to emit +@@ -192,8 +193,8 @@ bch2_extent_can_insert(struct btree_trans *trans, + * position: + */ + if (k.k->needs_whiteout && +- (!bkey_whiteout(&insert->k->k) || +- bkey_cmp(k.k->p, insert->k->k.p))) ++ (!bkey_whiteout(&insert->k) || ++ bkey_cmp(k.k->p, insert->k.p))) + *u64s += BKEY_U64s; + + /* +@@ -507,11 +508,10 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, + * key insertion needs to continue/be retried. 
+ */ + void bch2_insert_fixup_extent(struct btree_trans *trans, +- struct btree_insert_entry *insert_entry) ++ struct btree_iter *iter, ++ struct bkey_i *insert) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter = insert_entry->iter; +- struct bkey_i *insert = insert_entry->k; + struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter = l->iter; + bool do_update = !bkey_whiteout(&insert->k); +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +index 89d18e4b6758..e9dc8091ba3f 100644 +--- a/fs/bcachefs/extent_update.h ++++ b/fs/bcachefs/extent_update.h +@@ -10,9 +10,10 @@ int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); + int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); + + enum btree_insert_ret +-bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, +- unsigned *); ++bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, unsigned *); + void bch2_insert_fixup_extent(struct btree_trans *, +- struct btree_insert_entry *); ++ struct btree_iter *, ++ struct bkey_i *); + + #endif /* _BCACHEFS_EXTENT_UPDATE_H */ +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 1140d01a42ab..7c5a41e6d79d 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -8,7 +8,6 @@ + + struct bch_fs; + struct btree_trans; +-struct btree_insert_entry; + + /* extent entries: */ + +-- +cgit v1.2.3 + + +From d030f520d127344580365c1faa45d48f7e53df82 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 31 Dec 2019 16:17:42 -0500 +Subject: bcachefs: Split out btree_trigger_flags + +The trigger flags really belong with individual btree_insert_entries, +not the transaction commit flags - this splits out those flags and +unifies them with the BCH_BUCKET_MARK flags. 
Todo - split out +btree_trigger.c from buckets.c + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 2 +- + fs/bcachefs/alloc_background.c | 22 +++++------ + fs/bcachefs/btree_gc.c | 18 ++++----- + fs/bcachefs/btree_types.h | 27 +++++++++++++ + fs/bcachefs/btree_update.h | 19 ++------- + fs/bcachefs/btree_update_interior.c | 28 +++++++------- + fs/bcachefs/btree_update_leaf.c | 28 +++++--------- + fs/bcachefs/buckets.c | 77 ++++++++++++++++++++----------------- + fs/bcachefs/buckets.h | 13 +------ + fs/bcachefs/dirent.c | 6 +-- + fs/bcachefs/ec.c | 10 ++--- + fs/bcachefs/fs-io.c | 13 +++---- + fs/bcachefs/fsck.c | 8 ++-- + fs/bcachefs/inode.c | 6 +-- + fs/bcachefs/io.c | 6 +-- + fs/bcachefs/migrate.c | 2 +- + fs/bcachefs/move.c | 2 +- + fs/bcachefs/quota.c | 2 +- + fs/bcachefs/recovery.c | 55 ++++++++++++++++---------- + fs/bcachefs/reflink.c | 4 +- + fs/bcachefs/str_hash.h | 4 +- + fs/bcachefs/tests.c | 10 ++--- + 22 files changed, 189 insertions(+), 173 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index 20cdc7999244..250e9304666e 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -381,7 +381,7 @@ int bch2_acl_chmod(struct btree_trans *trans, + } + + new->k.p = iter->pos; +- bch2_trans_update(trans, iter, &new->k_i); ++ bch2_trans_update(trans, iter, &new->k_i, 0); + *new_acl = acl; + acl = NULL; + err: +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 61380e16623f..87dd137fed3f 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -222,8 +222,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + + for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) + bch2_mark_key(c, k, 0, 0, NULL, 0, +- BCH_BUCKET_MARK_ALLOC_READ| +- BCH_BUCKET_MARK_NOATOMIC); ++ BTREE_TRIGGER_ALLOC_READ| ++ BTREE_TRIGGER_NOATOMIC); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) { +@@ -235,8 +235,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + if (j->btree_id == BTREE_ID_ALLOC) + bch2_mark_key(c, bkey_i_to_s_c(j->k), + 0, 0, NULL, 0, +- BCH_BUCKET_MARK_ALLOC_READ| +- BCH_BUCKET_MARK_NOATOMIC); ++ BTREE_TRIGGER_ALLOC_READ| ++ BTREE_TRIGGER_NOATOMIC); + + percpu_down_write(&c->mark_lock); + bch2_dev_usage_from_buckets(c); +@@ -314,11 +314,10 @@ retry: + a->k.p = iter->pos; + bch2_alloc_pack(a, new_u); + +- bch2_trans_update(trans, iter, &a->k_i); ++ bch2_trans_update(trans, iter, &a->k_i, ++ BTREE_TRIGGER_NORUN); + ret = bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_NOMARK| +- flags); ++ BTREE_INSERT_NOFAIL|flags); + err: + if (ret == -EINTR) + goto retry; +@@ -383,8 +382,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) + ret = bch2_alloc_write_key(&trans, iter, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_JOURNAL_REPLAY| +- BTREE_INSERT_NOMARK); ++ BTREE_INSERT_JOURNAL_REPLAY); + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; + } +@@ -901,7 +899,8 @@ retry: + a->k.p = iter->pos; + bch2_alloc_pack(a, u); + +- bch2_trans_update(trans, iter, &a->k_i); ++ bch2_trans_update(trans, iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE); + + /* + * XXX: +@@ -917,7 +916,6 @@ retry: + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| +- BTREE_INSERT_BUCKET_INVALIDATE| + flags); + if (ret == -EINTR) + goto retry; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 8bbf60b07736..05879b66d6af 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -116,8 +116,8 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned flags = +- BCH_BUCKET_MARK_GC| +- (initial ? BCH_BUCKET_MARK_NOATOMIC : 0); ++ BTREE_TRIGGER_GC| ++ (initial ? BTREE_TRIGGER_NOATOMIC : 0); + int ret = 0; + + if (initial) { +@@ -294,8 +294,8 @@ static int mark_journal_key(struct bch_fs *c, enum btree_id id, + BTREE_ITER_SLOTS, k, ret) { + percpu_down_read(&c->mark_lock); + ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, +- BCH_BUCKET_MARK_GC| +- BCH_BUCKET_MARK_NOATOMIC); ++ BTREE_TRIGGER_GC| ++ BTREE_TRIGGER_NOATOMIC); + percpu_up_read(&c->mark_lock); + + if (!ret) +@@ -407,7 +407,7 @@ static void bch2_mark_superblocks(struct bch_fs *c) + gc_pos_set(c, gc_phase(GC_PHASE_SB)); + + for_each_online_member(ca, c, i) +- bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC); ++ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); + mutex_unlock(&c->sb_lock); + } + +@@ -424,7 +424,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) + if (d->index_update_done) + bch2_mark_key(c, bkey_i_to_s_c(&d->key), + 0, 0, NULL, 0, +- BCH_BUCKET_MARK_GC); ++ BTREE_TRIGGER_GC); + + mutex_unlock(&c->btree_interior_update_lock); + } +@@ -445,7 +445,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) + fifo_for_each_entry(i, &ca->free_inc, iter) + bch2_mark_alloc_bucket(c, ca, i, true, + gc_pos_alloc(c, NULL), +- BCH_BUCKET_MARK_GC); ++ BTREE_TRIGGER_GC); + + + +@@ -453,7 +453,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) + fifo_for_each_entry(i, &ca->free[j], iter) + bch2_mark_alloc_bucket(c, ca, i, true, + gc_pos_alloc(c, NULL), +- BCH_BUCKET_MARK_GC); ++ BTREE_TRIGGER_GC); + } + + spin_unlock(&c->freelist_lock); +@@ -467,7 +467,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, + gc_pos_alloc(c, ob), +- BCH_BUCKET_MARK_GC); ++ BTREE_TRIGGER_GC); + } + spin_unlock(&ob->lock); + } +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index b01c064fe19c..274682b7bcff 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -242,6 +242,7 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) + } + + struct btree_insert_entry { ++ unsigned trigger_flags; + struct bkey_i *k; + struct btree_iter *iter; + }; +@@ -481,6 +482,32 @@ static inline bool btree_node_is_extents(struct btree *b) + (1U << BKEY_TYPE_INODES)| \ + (1U << BKEY_TYPE_REFLINK)) + ++enum btree_trigger_flags { ++ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ ++ __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */ ++ ++ __BTREE_TRIGGER_INSERT, ++ __BTREE_TRIGGER_OVERWRITE, ++ __BTREE_TRIGGER_OVERWRITE_SPLIT, ++ ++ __BTREE_TRIGGER_GC, ++ __BTREE_TRIGGER_BUCKET_INVALIDATE, ++ 
__BTREE_TRIGGER_ALLOC_READ, ++ __BTREE_TRIGGER_NOATOMIC, ++}; ++ ++#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) ++#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES) ++ ++#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) ++#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) ++#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) ++ ++#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) ++#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) ++#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) ++#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) ++ + static inline bool btree_node_type_needs_gc(enum btree_node_type type) + { + return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 7f61351aed71..add7217598ed 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -15,7 +15,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, + void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, + struct bkey_i *); + +-enum { ++enum btree_insert_flags { + __BTREE_INSERT_NOUNLOCK, + __BTREE_INSERT_NOFAIL, + __BTREE_INSERT_NOCHECK_RW, +@@ -24,9 +24,6 @@ enum { + __BTREE_INSERT_USE_ALLOC_RESERVE, + __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_JOURNAL_RESERVED, +- __BTREE_INSERT_NOMARK_OVERWRITES, +- __BTREE_INSERT_NOMARK, +- __BTREE_INSERT_BUCKET_INVALIDATE, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, + __BCH_HASH_SET_MUST_CREATE, +@@ -53,14 +50,6 @@ enum { + + #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) + +-/* Don't mark overwrites, just new key: */ +-#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) +- +-/* Don't call mark new key at all: */ +-#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) +- +-#define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) +- + /* Don't block on allocation failure (for new btree nodes: */ + #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) + #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) +@@ -108,15 +97,15 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + } + + static inline void bch2_trans_update(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *k) ++ struct btree_iter *iter, struct bkey_i *k, ++ enum btree_trigger_flags flags) + { + EBUG_ON(trans->nr_updates >= trans->nr_iters); + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + trans->updates[trans->nr_updates++] = (struct btree_insert_entry) { +- .iter = iter, .k = k ++ .trigger_flags = flags, .iter = iter, .k = k + }; + } + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 6f98451aefef..0278145e9058 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -193,8 +193,8 @@ found: + gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) + bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), + 0, 0, NULL, 0, +- BCH_BUCKET_MARK_OVERWRITE| +- BCH_BUCKET_MARK_GC); ++ BTREE_TRIGGER_OVERWRITE| ++ BTREE_TRIGGER_GC); + } + + static void __btree_node_free(struct bch_fs *c, struct btree *b) +@@ -265,13 +265,13 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, + BUG_ON(!pending->index_update_done); + + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), +- 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE); ++ 
0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE); + + if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + 0, 0, NULL, 0, +- BCH_BUCKET_MARK_OVERWRITE| +- BCH_BUCKET_MARK_GC); ++ BTREE_TRIGGER_OVERWRITE| ++ BTREE_TRIGGER_GC); + } + + static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, +@@ -1084,12 +1084,12 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) + + bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), + 0, 0, fs_usage, 0, +- BCH_BUCKET_MARK_INSERT); ++ BTREE_TRIGGER_INSERT); + if (gc_visited(c, gc_pos_btree_root(b->btree_id))) + bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), + 0, 0, NULL, 0, +- BCH_BUCKET_MARK_INSERT| +- BCH_BUCKET_MARK_GC); ++ BTREE_TRIGGER_INSERT| ++ BTREE_TRIGGER_GC); + + if (old && !btree_node_fake(old)) + bch2_btree_node_free_index(as, NULL, +@@ -1182,13 +1182,13 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + + bch2_mark_key_locked(c, bkey_i_to_s_c(insert), + 0, 0, fs_usage, 0, +- BCH_BUCKET_MARK_INSERT); ++ BTREE_TRIGGER_INSERT); + + if (gc_visited(c, gc_pos_btree_node(b))) + bch2_mark_key_locked(c, bkey_i_to_s_c(insert), + 0, 0, NULL, 0, +- BCH_BUCKET_MARK_INSERT| +- BCH_BUCKET_MARK_GC); ++ BTREE_TRIGGER_INSERT| ++ BTREE_TRIGGER_GC); + + while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && + bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) +@@ -2031,12 +2031,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + + bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), + 0, 0, fs_usage, 0, +- BCH_BUCKET_MARK_INSERT); ++ BTREE_TRIGGER_INSERT); + if (gc_visited(c, gc_pos_btree_root(b->btree_id))) + bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), + 0, 0, NULL, 0, +- BCH_BUCKET_MARK_INSERT|| +- BCH_BUCKET_MARK_GC); ++ BTREE_TRIGGER_INSERT|| ++ BTREE_TRIGGER_GC); + + bch2_btree_node_free_index(as, NULL, + bkey_i_to_s_c(&b->key), +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 3102c4d6fa08..31dae75d2428 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -395,17 +395,11 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; +- unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE +- ? BCH_BUCKET_MARK_BUCKET_INVALIDATE +- : 0; +- +- if (unlikely(trans->flags & BTREE_INSERT_NOMARK)) +- return; + + trans_for_each_update(trans, i) + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i->iter, i->k, NULL, +- mark_flags|BCH_BUCKET_MARK_GC); ++ i->trigger_flags|BTREE_TRIGGER_GC); + } + + static inline int +@@ -415,9 +409,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage = NULL; + struct btree_insert_entry *i; +- unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE +- ? 
BCH_BUCKET_MARK_BUCKET_INVALIDATE +- : 0; + unsigned iter, u64s = 0; + bool marking = false; + int ret; +@@ -490,10 +481,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + } + + trans_for_each_update(trans, i) +- if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && +- iter_has_nontrans_triggers(i->iter)) ++ if (iter_has_nontrans_triggers(i->iter)) + bch2_mark_update(trans, i->iter, i->k, +- fs_usage, mark_flags); ++ fs_usage, i->trigger_flags); + + if (marking) + bch2_trans_fs_usage_apply(trans, fs_usage); +@@ -753,9 +743,9 @@ int __bch2_trans_commit(struct btree_trans *trans) + goto out; + } + +- if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && +- iter_has_trans_triggers(i->iter)) { +- ret = bch2_trans_mark_update(trans, i->iter, i->k); ++ if (iter_has_trans_triggers(i->iter)) { ++ ret = bch2_trans_mark_update(trans, i->iter, i->k, ++ i->trigger_flags); + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); +@@ -805,7 +795,7 @@ static int __bch2_btree_insert(struct btree_trans *trans, + if (IS_ERR(iter)) + return PTR_ERR(iter); + +- bch2_trans_update(trans, iter, k); ++ bch2_trans_update(trans, iter, k, 0); + return 0; + } + +@@ -867,7 +857,7 @@ retry: + break; + } + +- bch2_trans_update(trans, iter, &delete); ++ bch2_trans_update(trans, iter, &delete, 0); + ret = bch2_trans_commit(trans, NULL, journal_seq, + BTREE_INSERT_NOFAIL); + if (ret) +@@ -893,7 +883,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, + bkey_init(&k.k); + k.k.p = iter->pos; + +- bch2_trans_update(trans, iter, &k); ++ bch2_trans_update(trans, iter, &k, 0); + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE|flags); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 5b1005117b3f..fd4fe4bf3a0f 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -628,7 +628,7 @@ unwind: + percpu_rwsem_assert_held(&c->mark_lock); \ + \ + for (gc = 0; gc < 2 && !ret; gc++) \ +- if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \ ++ if (!gc == !(flags & BTREE_TRIGGER_GC) || \ + (gc && gc_visited(c, pos))) \ + ret = fn(c, __VA_ARGS__, gc); \ + ret; \ +@@ -710,7 +710,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { +- bool gc = flags & BCH_BUCKET_MARK_GC; ++ bool gc = flags & BTREE_TRIGGER_GC; + struct bkey_alloc_unpacked u; + struct bch_dev *ca; + struct bucket *g; +@@ -719,8 +719,8 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + /* + * alloc btree is read in by bch2_alloc_read, not gc: + */ +- if ((flags & BCH_BUCKET_MARK_GC) && +- !(flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE)) ++ if ((flags & BTREE_TRIGGER_GC) && ++ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) + return 0; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); +@@ -743,7 +743,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + } + })); + +- if (!(flags & BCH_BUCKET_MARK_ALLOC_READ)) ++ if (!(flags & BTREE_TRIGGER_ALLOC_READ)) + bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); + + g->io_time[READ] = u.read_time; +@@ -756,7 +756,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + * not: + */ + +- if ((flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE) && ++ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old.cached_sectors) { + update_cached_sectors(c, fs_usage, ca->dev_idx, + -old.cached_sectors); +@@ -842,13 +842,13 @@ static s64 __ptr_disk_sectors_delta(unsigned old_size, + { + BUG_ON(!n || !d); + +- if (flags & 
BCH_BUCKET_MARK_OVERWRITE_SPLIT) { ++ if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { + BUG_ON(offset + -delta > old_size); + + return -disk_sectors_scaled(n, d, old_size) + + disk_sectors_scaled(n, d, offset) + + disk_sectors_scaled(n, d, old_size - offset + delta); +- } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { ++ } else if (flags & BTREE_TRIGGER_OVERWRITE) { + BUG_ON(offset + -delta > old_size); + + return -disk_sectors_scaled(n, d, old_size) + +@@ -874,8 +874,8 @@ static void bucket_set_stripe(struct bch_fs *c, + u64 journal_seq, + unsigned flags) + { +- bool enabled = !(flags & BCH_BUCKET_MARK_OVERWRITE); +- bool gc = flags & BCH_BUCKET_MARK_GC; ++ bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE); ++ bool gc = flags & BTREE_TRIGGER_GC; + unsigned i; + + for (i = 0; i < v->nr_blocks; i++) { +@@ -922,7 +922,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { +- bool gc = flags & BCH_BUCKET_MARK_GC; ++ bool gc = flags & BTREE_TRIGGER_GC; + struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); +@@ -970,7 +970,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, + new.data_type = data_type; + } + +- if (flags & BCH_BUCKET_MARK_NOATOMIC) { ++ if (flags & BTREE_TRIGGER_NOATOMIC) { + g->_mark = new; + break; + } +@@ -1008,7 +1008,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, + unsigned *nr_data, + unsigned *nr_parity) + { +- bool gc = flags & BCH_BUCKET_MARK_GC; ++ bool gc = flags & BTREE_TRIGGER_GC; + struct stripe *m; + unsigned old, new; + int blocks_nonempty_delta; +@@ -1121,7 +1121,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { +- bool gc = flags & BCH_BUCKET_MARK_GC; ++ bool gc = flags & BTREE_TRIGGER_GC; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + size_t idx = s.k->p.offset; + struct stripe *m = genradix_ptr(&c->stripes[gc], idx); +@@ -1129,14 +1129,14 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, + + spin_lock(&c->ec_stripes_heap_lock); + +- if (!m || ((flags & BCH_BUCKET_MARK_OVERWRITE) && !m->alive)) { ++ if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) { + spin_unlock(&c->ec_stripes_heap_lock); + bch_err_ratelimited(c, "error marking nonexistent stripe %zu", + idx); + return -1; + } + +- if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) { ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { + m->sectors = le16_to_cpu(s.v->sectors); + m->algorithm = s.v->algorithm; + m->nr_blocks = s.v->nr_blocks; +@@ -1152,7 +1152,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, + #endif + + /* gc recalculates these fields: */ +- if (!(flags & BCH_BUCKET_MARK_GC)) { ++ if (!(flags & BTREE_TRIGGER_GC)) { + for (i = 0; i < s.v->nr_blocks; i++) { + m->block_sectors[i] = + stripe_blockcount_get(s.v, i); +@@ -1185,16 +1185,16 @@ int bch2_mark_key_locked(struct bch_fs *c, + + preempt_disable(); + +- if (!fs_usage || (flags & BCH_BUCKET_MARK_GC)) ++ if (!fs_usage || (flags & BTREE_TRIGGER_GC)) + fs_usage = fs_usage_ptr(c, journal_seq, +- flags & BCH_BUCKET_MARK_GC); ++ flags & BTREE_TRIGGER_GC); + + switch (k.k->type) { + case KEY_TYPE_alloc: + ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_btree_ptr: +- sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) ++ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size; + +@@ -1210,7 +1210,7 @@ int bch2_mark_key_locked(struct bch_fs *c, + ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_inode: +- if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) + fs_usage->nr_inodes++; + else + fs_usage->nr_inodes--; +@@ -1260,7 +1260,7 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, + unsigned offset = 0; + s64 sectors = 0; + +- flags |= BCH_BUCKET_MARK_OVERWRITE; ++ flags |= BTREE_TRIGGER_OVERWRITE; + + if (btree_node_is_extents(b) + ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 +@@ -1288,7 +1288,7 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = -((s64) new->k.size); +- flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; ++ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; + break; + } + +@@ -1311,15 +1311,18 @@ int bch2_mark_update(struct btree_trans *trans, + struct bkey_packed *_k; + int ret = 0; + ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + + bch2_mark_key_locked(c, bkey_i_to_s_c(insert), + 0, insert->k.size, + fs_usage, trans->journal_res.seq, +- BCH_BUCKET_MARK_INSERT|flags); ++ BTREE_TRIGGER_INSERT|flags); + +- if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) ++ if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) + return 0; + + /* +@@ -1450,7 +1453,7 @@ static void *trans_update_key(struct btree_trans *trans, + return new_k; + } + +- bch2_trans_update(trans, iter, new_k); ++ bch2_trans_update(trans, iter, new_k, 0); + return new_k; + } + +@@ -1689,7 +1692,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + goto err; + } + +- if ((flags & BCH_BUCKET_MARK_OVERWRITE) && ++ if ((flags & BTREE_TRIGGER_OVERWRITE) && + (bkey_start_offset(k.k) < idx || + k.k->p.offset > idx + sectors)) + goto out; +@@ -1706,7 +1709,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + r_v = bkey_i_to_reflink_v(new_k); + + le64_add_cpu(&r_v->v.refcount, +- !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 1 : -1); ++ !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); + + if (!r_v->v.refcount) { + r_v->k.type = KEY_TYPE_deleted; +@@ -1750,7 +1753,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: +- sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) ++ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size; + +@@ -1763,7 +1766,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + case KEY_TYPE_inode: + d = replicas_deltas_realloc(trans, 0); + +- if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) + d->nr_inodes++; + else + d->nr_inodes--; +@@ -1791,22 +1794,26 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + + int bch2_trans_mark_update(struct btree_trans *trans, + struct btree_iter *iter, +- struct bkey_i *insert) ++ struct bkey_i *insert, ++ unsigned flags) + { + struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; + int ret; + ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + + ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), +- 0, insert->k.size, BCH_BUCKET_MARK_INSERT); ++ 0, insert->k.size, BTREE_TRIGGER_INSERT); + if (ret) + return ret; + +- if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) ++ if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) + return 0; + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, +@@ -1815,7 +1822,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, + struct bkey_s_c k; + unsigned offset = 0; + s64 sectors = 0; +- unsigned flags = BCH_BUCKET_MARK_OVERWRITE; ++ unsigned flags = BTREE_TRIGGER_OVERWRITE; + + k = bkey_disassemble(b, _k, &unpacked); + +@@ -1845,7 +1852,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); + sectors = -((s64) insert->k.size); +- flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; ++ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; + break; + } + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 60a12bb0d8f7..4717a1a6f568 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -258,14 +258,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + +-#define BCH_BUCKET_MARK_INSERT (1 << 0) +-#define BCH_BUCKET_MARK_OVERWRITE (1 << 1) +-#define BCH_BUCKET_MARK_OVERWRITE_SPLIT (1 << 2) +-#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 3) +-#define BCH_BUCKET_MARK_GC (1 << 4) +-#define BCH_BUCKET_MARK_ALLOC_READ (1 << 5) +-#define BCH_BUCKET_MARK_NOATOMIC (1 << 6) +- + int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64, + struct bch_fs_usage *, u64, unsigned); + int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, +@@ -284,9 +276,8 @@ int bch2_replicas_delta_list_apply(struct bch_fs *, + struct replicas_delta_list *); + int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + unsigned, s64, unsigned); +-int bch2_trans_mark_update(struct btree_trans *, +- struct btree_iter *iter, +- struct bkey_i *insert); ++int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, ++ struct bkey_i *insert, unsigned); + void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); + + /* disk reservations: */ +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 1bf53c55912d..4b4aeaf81d21 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -246,7 +246,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + */ + new_dst->k.p = src_iter->pos; + bch2_trans_update(trans, src_iter, +- &new_dst->k_i); ++ &new_dst->k_i, 0); + return 0; + } else { + /* If we're overwriting, we can't insert new_dst 
+@@ -268,8 +268,8 @@ int bch2_dirent_rename(struct btree_trans *trans, + } + } + +- bch2_trans_update(trans, src_iter, &new_src->k_i); +- bch2_trans_update(trans, dst_iter, &new_dst->k_i); ++ bch2_trans_update(trans, src_iter, &new_src->k_i, 0); ++ bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); + return 0; + } + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 91f5a4a110b4..1648dd3dac6f 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -736,7 +736,7 @@ found_slot: + + stripe->k.p = iter->pos; + +- bch2_trans_update(&trans, iter, &stripe->k_i); ++ bch2_trans_update(&trans, iter, &stripe->k_i, 0); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +@@ -818,7 +818,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + + extent_stripe_ptr_add(e, s, ec_ptr, idx); + +- bch2_trans_update(&trans, iter, sk.k); ++ bch2_trans_update(&trans, iter, sk.k, 0); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| +@@ -1230,7 +1230,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, + + spin_unlock(&c->ec_stripes_heap_lock); + +- bch2_trans_update(trans, iter, &new_key->k_i); ++ bch2_trans_update(trans, iter, &new_key->k_i, 0); + + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags); +@@ -1316,8 +1316,8 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) + + bch2_mark_key(c, btree ? btree_k : journal_k, + 0, 0, NULL, 0, +- BCH_BUCKET_MARK_ALLOC_READ| +- BCH_BUCKET_MARK_NOATOMIC); ++ BTREE_TRIGGER_ALLOC_READ| ++ BTREE_TRIGGER_NOATOMIC); + + if (btree) + btree_k = bch2_btree_iter_next(btree_iter); +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index a6ebe41c85eb..b111ce3829a8 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2519,7 +2519,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + struct bpos next_pos; + struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); + struct bpos atomic_end; +- unsigned commit_flags = 0; ++ unsigned trigger_flags = 0; + + k = insert + ? 
bch2_btree_iter_peek_prev(src) +@@ -2590,15 +2590,12 @@ reassemble: + bkey_start_pos(&delete.k)); + } + +- bch2_trans_update(&trans, dst, copy.k); +- bch2_trans_update(&trans, del ?: src, &delete); +- + if (copy.k->k.size == k.k->size) { + /* + * If we're moving the entire extent, we can skip + * running triggers: + */ +- commit_flags |= BTREE_INSERT_NOMARK; ++ trigger_flags |= BTREE_TRIGGER_NORUN; + } else { + /* We might end up splitting compressed extents: */ + unsigned nr_ptrs = +@@ -2610,10 +2607,12 @@ reassemble: + BUG_ON(ret); + } + ++ bch2_trans_update(&trans, dst, copy.k, trigger_flags); ++ bch2_trans_update(&trans, del ?: src, &delete, trigger_flags); ++ + ret = bch2_trans_commit(&trans, &disk_res, + &inode->ei_journal_seq, +- BTREE_INSERT_NOFAIL| +- commit_flags); ++ BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(c, &disk_res); + bkey_err: + if (del) +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index e25f064706ad..9ef532d875e8 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -192,7 +192,7 @@ static int hash_redo_key(const struct bch_hash_desc desc, + + bkey_init(&delete.k); + delete.k.p = k_iter->pos; +- bch2_trans_update(trans, k_iter, &delete); ++ bch2_trans_update(trans, k_iter, &delete, 0); + + return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, + tmp, BCH_HASH_SET_MUST_CREATE) ?: +@@ -388,7 +388,7 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, +- (bch2_trans_update(trans, iter, &d->k_i), 0)); ++ (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); + if (ret) + goto err; + +@@ -661,7 +661,7 @@ retry: + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, +- (bch2_trans_update(&trans, iter, &n->k_i), 0)); ++ (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); + kfree(n); + if (ret) + goto err; +@@ -1276,7 +1276,7 @@ static int check_inode(struct btree_trans *trans, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, +- (bch2_trans_update(trans, iter, &p.inode.k_i), 0)); ++ (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); + if (ret) + bch_err(c, "error in fsck: error %i " + "updating inode", ret); +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 227cfb572ff2..e811b98d0f03 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -223,7 +223,7 @@ int bch2_inode_write(struct btree_trans *trans, + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); +- bch2_trans_update(trans, iter, &inode_p->inode.k_i); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return 0; + } + +@@ -411,7 +411,7 @@ again: + inode_u->bi_generation = bkey_generation(k); + + bch2_inode_pack(inode_p, inode_u); +- bch2_trans_update(trans, iter, &inode_p->inode.k_i); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return 0; + } + } +@@ -493,7 +493,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) + delete.v.bi_generation = cpu_to_le32(bi_generation); + } + +- bch2_trans_update(&trans, iter, &delete.k_i); ++ bch2_trans_update(&trans, iter, &delete.k_i, 0); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 01f61bc81755..e406d6d8d916 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -292,13 +292,13 @@ int bch2_extent_update(struct btree_trans *trans, + if (delta || new_i_size) { + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, inode_iter, +- &inode_p.inode.k_i); ++ &inode_p.inode.k_i, 0); + 
} + + bch2_trans_iter_put(trans, inode_iter); + } + +- bch2_trans_update(trans, iter, k); ++ bch2_trans_update(trans, iter, k, 0); + + ret = bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| +@@ -1738,7 +1738,7 @@ retry: + if (!bch2_bkey_narrow_crcs(new.k, new_crc)) + goto out; + +- bch2_trans_update(&trans, iter, new.k); ++ bch2_trans_update(&trans, iter, new.k, 0); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOWAIT); +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index db86420bd647..0e3f63c1d65c 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -76,7 +76,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + +- bch2_trans_update(&trans, iter, sk.k); ++ bch2_trans_update(&trans, iter, sk.k, 0); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 67e9fd3f86f5..257e00ae6fa7 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -150,7 +150,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + goto next; + } + +- bch2_trans_update(&trans, iter, insert); ++ bch2_trans_update(&trans, iter, insert, 0); + + ret = bch2_trans_commit(&trans, &op->res, + op_journal_seq(op), +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 0fa6f33c049b..e7787c5063ce 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -752,7 +752,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, + if (qdq->d_fieldmask & QC_INO_HARD) + new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); + +- bch2_trans_update(&trans, iter, &new_quota.k_i); ++ bch2_trans_update(&trans, iter, &new_quota.k_i, 0); + + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index c7367a679b22..8ecd4abc8eeb 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -300,28 +300,24 @@ retry: + bch2_cut_front(split_iter->pos, split); + bch2_cut_back(atomic_end, split); + +- bch2_trans_update(&trans, split_iter, split); ++ bch2_trans_update(&trans, split_iter, split, !remark ++ ? 
BTREE_TRIGGER_NORUN ++ : BTREE_TRIGGER_NOOVERWRITES); + bch2_btree_iter_set_pos(iter, split->k.p); + } while (bkey_cmp(iter->pos, k->k.p) < 0); + + if (remark) { + ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), + 0, -((s64) k->k.size), +- BCH_BUCKET_MARK_OVERWRITE) ?: +- bch2_trans_commit(&trans, &disk_res, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_NOMARK_OVERWRITES); +- } else { +- ret = bch2_trans_commit(&trans, &disk_res, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_JOURNAL_REPLAY| +- BTREE_INSERT_NOMARK); ++ BTREE_TRIGGER_OVERWRITE); ++ if (ret) ++ goto err; + } + +- if (ret) +- goto err; ++ ret = bch2_trans_commit(&trans, &disk_res, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY); + err: + if (ret == -EINTR) + goto retry; +@@ -331,6 +327,30 @@ err: + return bch2_trans_exit(&trans) ?: ret; + } + ++static int __bch2_journal_replay_key(struct btree_trans *trans, ++ enum btree_id id, struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ ++ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ return 0; ++} ++ ++static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, ++ struct bkey_i *k) ++{ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY, ++ __bch2_journal_replay_key(&trans, id, k)); ++} ++ + static int bch2_journal_replay(struct bch_fs *c, + struct journal_keys keys) + { +@@ -348,12 +368,7 @@ static int bch2_journal_replay(struct bch_fs *c, + else if (btree_node_type_is_extents(i->btree_id)) + ret = bch2_extent_replay_key(c, i->btree_id, i->k); + else +- ret = bch2_btree_insert(c, i->btree_id, i->k, +- NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_JOURNAL_REPLAY| +- BTREE_INSERT_NOMARK); ++ ret = bch2_journal_replay_key(c, i->btree_id, i->k); + + if (ret) { + bch_err(c, "journal replay: error %d while replaying key", +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 2bf003ba3bd8..3b8c74ca3725 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -115,7 +115,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + r_v->v.refcount = 0; + memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); + +- bch2_trans_update(trans, reflink_iter, &r_v->k_i); ++ bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); + + r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); + if (IS_ERR(r_p)) +@@ -126,7 +126,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + +- bch2_trans_update(trans, extent_iter, &r_p->k_i); ++ bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); + err: + if (!IS_ERR(reflink_iter)) { + c->reflink_hint = reflink_iter->pos.offset; +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 3870df2d58ce..35f4232d0755 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -281,7 +281,7 @@ not_found: + swap(iter, slot); + + insert->k.p = iter->pos; +- bch2_trans_update(trans, iter, insert); ++ bch2_trans_update(trans, iter, insert, 0); + } + + goto out; +@@ -308,7 +308,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, + delete->k.p = iter->pos; + delete->k.type = ret ? 
KEY_TYPE_whiteout : KEY_TYPE_deleted; + +- bch2_trans_update(trans, iter, delete); ++ bch2_trans_update(trans, iter, delete, 0); + return 0; + } + +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 724f41e6590c..8f9b0cca17da 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -43,7 +43,7 @@ static void test_delete(struct bch_fs *c, u64 nr) + ret = bch2_btree_iter_traverse(iter); + BUG_ON(ret); + +- bch2_trans_update(&trans, iter, &k.k_i); ++ bch2_trans_update(&trans, iter, &k.k_i, 0); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + BUG_ON(ret); + +@@ -75,7 +75,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr) + ret = bch2_btree_iter_traverse(iter); + BUG_ON(ret); + +- bch2_trans_update(&trans, iter, &k.k_i); ++ bch2_trans_update(&trans, iter, &k.k_i, 0); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + BUG_ON(ret); + +@@ -465,7 +465,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) + bkey_cookie_init(&k.k_i); + k.k.p = iter->pos; + +- bch2_trans_update(&trans, iter, &k.k_i); ++ bch2_trans_update(&trans, iter, &k.k_i, 0); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + BUG_ON(ret); + } +@@ -509,7 +509,7 @@ static void seq_insert(struct bch_fs *c, u64 nr) + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + insert.k.p = iter->pos; + +- bch2_trans_update(&trans, iter, &insert.k_i); ++ bch2_trans_update(&trans, iter, &insert.k_i, 0); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + BUG_ON(ret); + +@@ -548,7 +548,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) + + bkey_reassemble(&u.k_i, k); + +- bch2_trans_update(&trans, iter, &u.k_i); ++ bch2_trans_update(&trans, iter, &u.k_i, 0); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + BUG_ON(ret); + } +-- +cgit v1.2.3 + + +From 923887e34675eba66f5f04997a32fce7f0335e3e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 31 Dec 2019 19:37:10 -0500 +Subject: bcachefs: Sort & deduplicate updates in bch2_trans_update() + +Previously, when doing multiple update in the same transaction commit +that overwrote each other, we relied on doing the updates in the same +order as the bch2_trans_update() calls in order to get the correct +result. But that wasn't correct for triggers; bch2_trans_mark_update() +when marking overwrites would do the wrong thing because it hadn't seen +the update that was being overwritten. 
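The overlap handling this commit adds to bch2_trans_update() can be pictured with a small standalone model (this is not bcachefs code: plain integer ranges stand in for extents, a fixed array stands in for trans->updates, and btree IDs and capacity checks are ignored). A new update is inserted at its sorted position; the preceding pending update is trimmed if the new one overwrites its tail, fully covered pending updates are dropped, and a partially covered following update is trimmed at the front, so the triggers that run later see an ordered, non-overlapping list.

#include <stdio.h>
#include <string.h>

struct toy_update {
        int start;      /* inclusive */
        int end;        /* exclusive */
        char tag;       /* which caller queued this update */
};

#define MAX_UPDATES 8

static struct toy_update updates[MAX_UPDATES];
static int nr_updates;

static void toy_trans_update(int start, int end, char tag)
{
        struct toy_update n = { start, end, tag };
        int i = 0;

        /* find the sorted insert position */
        while (i < nr_updates && updates[i].start < start)
                i++;

        /* previous entry overlaps the front of the new one: trim its tail */
        if (i > 0 && updates[i - 1].end > start)
                updates[i - 1].end = start;

        /* following entries fully covered by the new one: drop them */
        while (i < nr_updates && updates[i].end <= end) {
                memmove(&updates[i], &updates[i + 1],
                        (nr_updates - i - 1) * sizeof(updates[0]));
                nr_updates--;
        }

        /* following entry partially covered: trim its front */
        if (i < nr_updates && updates[i].start < end)
                updates[i].start = end;

        /* insert the new update at its sorted position */
        memmove(&updates[i + 1], &updates[i],
                (nr_updates - i) * sizeof(updates[0]));
        updates[i] = n;
        nr_updates++;
}

int main(void)
{
        int i;

        toy_trans_update(0, 10, 'a');
        toy_trans_update(5, 15, 'b');   /* trims 'a' to [0,5) */
        toy_trans_update(4, 20, 'c');   /* trims 'a' to [0,4), drops 'b' */

        for (i = 0; i < nr_updates; i++)
                printf("[%d,%d) %c\n", updates[i].start,
                       updates[i].end, updates[i].tag);
        return 0;
}

Built with any C compiler and run, this prints "[0,4) a" and "[4,20) c": wherever two queued updates overlap, the later one wins, which is the property the trigger machinery relies on when it walks the list.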
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 22 +++-- + fs/bcachefs/btree_types.h | 12 +-- + fs/bcachefs/btree_update.h | 15 +--- + fs/bcachefs/btree_update_leaf.c | 176 ++++++++++++++++++++++++++-------------- + fs/bcachefs/buckets.c | 58 ++++--------- + fs/bcachefs/fs-io.c | 40 ++------- + 6 files changed, 159 insertions(+), 164 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index be303db951e2..f4ce99a5a19a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1793,10 +1793,9 @@ int bch2_trans_iter_free(struct btree_trans *trans, + static int bch2_trans_realloc_iters(struct btree_trans *trans, + unsigned new_size) + { +- void *new_iters, *new_updates, *new_sorted; ++ void *new_iters, *new_updates; + size_t iters_bytes; + size_t updates_bytes; +- size_t sorted_bytes; + + new_size = roundup_pow_of_two(new_size); + +@@ -1811,11 +1810,8 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, + + iters_bytes = sizeof(struct btree_iter) * new_size; + updates_bytes = sizeof(struct btree_insert_entry) * new_size; +- sorted_bytes = sizeof(u8) * new_size; + +- new_iters = kmalloc(iters_bytes + +- updates_bytes + +- sorted_bytes, GFP_NOFS); ++ new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS); + if (new_iters) + goto success; + +@@ -1825,7 +1821,6 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, + trans->used_mempool = true; + success: + new_updates = new_iters + iters_bytes; +- new_sorted = new_updates + updates_bytes; + + memcpy(new_iters, trans->iters, + sizeof(struct btree_iter) * trans->nr_iters); +@@ -1842,7 +1837,6 @@ success: + + trans->iters = new_iters; + trans->updates = new_updates; +- trans->updates_sorted = new_sorted; + trans->size = new_size; + + if (trans->iters_live) { +@@ -1891,6 +1885,7 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) + got_slot: + BUG_ON(trans->iters_linked & (1ULL << idx)); + trans->iters_linked |= 1ULL << idx; ++ trans->iters[idx].flags = 0; + return &trans->iters[idx]; + } + +@@ -1906,6 +1901,9 @@ static inline void btree_iter_copy(struct btree_iter *dst, + if (btree_node_locked(dst, i)) + six_lock_increment(&dst->l[i].b->lock, + __btree_lock_want(dst, i)); ++ ++ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; + } + + static inline struct bpos bpos_diff(struct bpos l, struct bpos r) +@@ -1956,7 +1954,6 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + iter = best; + } + +- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + +@@ -1968,6 +1965,7 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + BUG_ON(iter->btree_id != btree_id); + BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); + BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); ++ BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); + BUG_ON(trans->iters_live & (1ULL << iter->idx)); + + trans->iters_live |= 1ULL << iter->idx; +@@ -2030,7 +2028,6 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, + * it's cheap to copy it again: + */ + trans->iters_touched &= ~(1ULL << iter->idx); +- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + + return iter; + } +@@ -2090,7 +2087,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + struct btree_iter *iter; + + 
trans_for_each_iter(trans, iter) +- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| ++ BTREE_ITER_SET_POS_AFTER_COMMIT); + + bch2_trans_unlink_iters(trans); + +@@ -2099,6 +2097,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + + trans->iters_touched &= trans->iters_live; + ++ trans->need_reset = 0; + trans->nr_updates = 0; + + if (flags & TRANS_RESET_MEM) +@@ -2126,7 +2125,6 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + trans->size = ARRAY_SIZE(trans->iters_onstack); + trans->iters = trans->iters_onstack; + trans->updates = trans->updates_onstack; +- trans->updates_sorted = trans->updates_sorted_onstack; + trans->fs_usage_deltas = NULL; + + if (expected_nr_iters > trans->size) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 274682b7bcff..86e52468c1aa 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -194,6 +194,7 @@ enum btree_iter_type { + */ + #define BTREE_ITER_IS_EXTENTS (1 << 6) + #define BTREE_ITER_ERROR (1 << 7) ++#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) + + enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, +@@ -210,12 +211,13 @@ enum btree_iter_uptodate { + * @nodes_intent_locked - bitmask indicating which locks are intent locks + */ + struct btree_iter { +- u8 idx; +- + struct btree_trans *trans; + struct bpos pos; ++ struct bpos pos_after_commit; ++ ++ u16 flags; ++ u8 idx; + +- u8 flags; + enum btree_iter_uptodate uptodate:4; + enum btree_id btree_id:4; + unsigned level:4, +@@ -243,6 +245,7 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) + + struct btree_insert_entry { + unsigned trigger_flags; ++ unsigned trans_triggers_run:1; + struct bkey_i *k; + struct btree_iter *iter; + }; +@@ -263,6 +266,7 @@ struct btree_trans { + unsigned used_mempool:1; + unsigned error:1; + unsigned nounlock:1; ++ unsigned need_reset:1; + + unsigned mem_top; + unsigned mem_bytes; +@@ -270,7 +274,6 @@ struct btree_trans { + + struct btree_iter *iters; + struct btree_insert_entry *updates; +- u8 *updates_sorted; + + /* update path: */ + struct journal_res journal_res; +@@ -284,7 +287,6 @@ struct btree_trans { + + struct btree_iter iters_onstack[2]; + struct btree_insert_entry updates_onstack[2]; +- u8 updates_sorted_onstack[2]; + }; + + #define BTREE_FLAG(flag) \ +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index add7217598ed..2c34bae64281 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -72,6 +72,8 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, + int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, + struct btree *, struct bkey_i_btree_ptr *); + ++int bch2_trans_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_trigger_flags); + int __bch2_trans_commit(struct btree_trans *); + + /** +@@ -96,19 +98,6 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + return __bch2_trans_commit(trans); + } + +-static inline void bch2_trans_update(struct btree_trans *trans, +- struct btree_iter *iter, struct bkey_i *k, +- enum btree_trigger_flags flags) +-{ +- EBUG_ON(trans->nr_updates >= trans->nr_iters); +- +- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; +- +- trans->updates[trans->nr_updates++] = (struct btree_insert_entry) { +- .trigger_flags = flags, .iter = iter, .k = k +- }; +-} +- + #define __bch2_trans_do(_trans, _disk_res, _journal_seq, \ + _flags, _reset_flags, _do) \ + ({ \ +diff 
--git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 31dae75d2428..325baf2b8920 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -21,18 +21,12 @@ + #include + + static inline bool same_leaf_as_prev(struct btree_trans *trans, +- unsigned idx) ++ struct btree_insert_entry *i) + { +- return idx && +- trans->updates[trans->updates_sorted[idx]].iter->l[0].b == +- trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b; ++ return i != trans->updates && ++ i[0].iter->l[0].b == i[-1].iter->l[0].b; + } + +-#define trans_for_each_update_sorted(_trans, _i, _iter) \ +- for (_iter = 0; \ +- _iter < _trans->nr_updates && \ +- (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \ +- _iter++) + + inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +@@ -51,28 +45,6 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, + bch2_btree_init_next(c, b, iter); + } + +-static inline void btree_trans_sort_updates(struct btree_trans *trans) +-{ +- struct btree_insert_entry *l, *r; +- unsigned nr = 0, pos; +- +- trans_for_each_update(trans, l) { +- for (pos = 0; pos < nr; pos++) { +- r = trans->updates + trans->updates_sorted[pos]; +- +- if (btree_iter_cmp(l->iter, r->iter) <= 0) +- break; +- } +- +- memmove(&trans->updates_sorted[pos + 1], +- &trans->updates_sorted[pos], +- (nr - pos) * sizeof(trans->updates_sorted[0])); +- +- trans->updates_sorted[pos] = l - trans->updates; +- nr++; +- } +-} +- + /* Inserting into a given leaf node (last stage of insert): */ + + /* Handle overwrites and do insert, for non extents: */ +@@ -409,7 +381,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage = NULL; + struct btree_insert_entry *i; +- unsigned iter, u64s = 0; ++ unsigned u64s = 0; + bool marking = false; + int ret; + +@@ -426,9 +398,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + prefetch(&trans->c->journal.flags); + +- trans_for_each_update_sorted(trans, i, iter) { ++ trans_for_each_update(trans, i) { + /* Multiple inserts might go to same leaf: */ +- if (!same_leaf_as_prev(trans, iter)) ++ if (!same_leaf_as_prev(trans, i)) + u64s = 0; + + u64s += i->k->k.u64s; +@@ -510,7 +482,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + { + struct btree_insert_entry *i; + struct btree_iter *iter; +- unsigned idx; + int ret; + + trans_for_each_update(trans, i) +@@ -545,21 +516,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + btree_insert_entry_checks(trans, i->iter, i->k); + bch2_btree_trans_verify_locks(trans); + +- /* +- * No more updates can be added - sort updates so we can take write +- * locks in the correct order: +- */ +- btree_trans_sort_updates(trans); +- +- trans_for_each_update_sorted(trans, i, idx) +- if (!same_leaf_as_prev(trans, idx)) ++ trans_for_each_update(trans, i) ++ if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_lock_for_insert(trans->c, + i->iter->l[0].b, i->iter); + + ret = bch2_trans_commit_write_locked(trans, stopped_at); + +- trans_for_each_update_sorted(trans, i, idx) +- if (!same_leaf_as_prev(trans, idx)) ++ trans_for_each_update(trans, i) ++ if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, + i->iter); + +@@ -575,8 +540,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + if (trans->flags & BTREE_INSERT_NOUNLOCK) + trans->nounlock = true; + +- 
trans_for_each_update_sorted(trans, i, idx) +- if (!same_leaf_as_prev(trans, idx)) ++ trans_for_each_update(trans, i) ++ if (!same_leaf_as_prev(trans, i)) + bch2_foreground_maybe_merge(trans->c, i->iter, + 0, trans->flags); + +@@ -708,9 +673,13 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + int __bch2_trans_commit(struct btree_trans *trans) + { + struct btree_insert_entry *i = NULL; ++ struct btree_iter *iter; ++ bool trans_trigger_run; + unsigned u64s; + int ret = 0; + ++ BUG_ON(trans->need_reset); ++ + if (!trans->nr_updates) + goto out_noupdates; + +@@ -730,9 +699,29 @@ int __bch2_trans_commit(struct btree_trans *trans) + } + + /* +- * note: running triggers will append more updates to the list of +- * updates as we're walking it: ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: + */ ++ do { ++ trans_trigger_run = false; ++ ++ trans_for_each_update(trans, i) { ++ if (iter_has_trans_triggers(i->iter) && ++ !i->trans_triggers_run) { ++ i->trans_triggers_run = true; ++ trans_trigger_run = true; ++ ++ ret = bch2_trans_mark_update(trans, i->iter, i->k, ++ i->trigger_flags); ++ if (unlikely(ret)) { ++ if (ret == -EINTR) ++ trace_trans_restart_mark(trans->ip); ++ goto out; ++ } ++ } ++ } ++ } while (trans_trigger_run); ++ + trans_for_each_update(trans, i) { + /* we know trans->nounlock won't be set here: */ + if (unlikely(!(i->iter->locks_want < 1 +@@ -743,16 +732,6 @@ int __bch2_trans_commit(struct btree_trans *trans) + goto out; + } + +- if (iter_has_trans_triggers(i->iter)) { +- ret = bch2_trans_mark_update(trans, i->iter, i->k, +- i->trigger_flags); +- if (unlikely(ret)) { +- if (ret == -EINTR) +- trace_trans_restart_mark(trans->ip); +- goto out; +- } +- } +- + u64s = jset_u64s(i->k->k.u64s); + if (0) + trans->journal_preres_u64s += u64s; +@@ -768,6 +747,15 @@ retry: + + if (ret) + goto err; ++ ++ trans_for_each_iter(trans, iter) ++ if ((trans->iters_live & (1ULL << iter->idx)) && ++ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { ++ if (trans->flags & BTREE_INSERT_NOUNLOCK) ++ bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); ++ else ++ bch2_btree_iter_set_pos(iter, iter->pos_after_commit); ++ } + out: + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + +@@ -785,6 +773,76 @@ err: + goto retry; + } + ++int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_trigger_flags flags) ++{ ++ struct btree_insert_entry *i, n = (struct btree_insert_entry) { ++ .trigger_flags = flags, .iter = iter, .k = k ++ }; ++ ++ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k))); ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) { ++ iter->pos_after_commit = k->k.p; ++ iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; ++ } ++ ++ /* ++ * Pending updates are kept sorted: first, find position of new update: ++ */ ++ trans_for_each_update(trans, i) ++ if (btree_iter_cmp(iter, i->iter) <= 0) ++ break; ++ ++ /* ++ * Now delete/trim any updates the new update overwrites: ++ */ ++ if (i > trans->updates && ++ i[-1].iter->btree_id == iter->btree_id && ++ bkey_cmp(iter->pos, i[-1].k->k.p) < 0) ++ bch2_cut_back(n.iter->pos, i[-1].k); ++ ++ while (i < trans->updates + trans->nr_updates && ++ iter->btree_id == i->iter->btree_id && ++ bkey_cmp(n.k->k.p, i->k->k.p) >= 0) ++ array_remove_item(trans->updates, trans->nr_updates, ++ i - trans->updates); ++ ++ if (i < trans->updates + trans->nr_updates && ++ iter->btree_id == 
i->iter->btree_id && ++ bkey_cmp(n.k->k.p, i->iter->pos) > 0) { ++ /* ++ * When we have an extent that overwrites the start of another ++ * update, trimming that extent will mean the iterator's ++ * position has to change since the iterator position has to ++ * match the extent's start pos - but we don't want to change ++ * the iterator pos if some other code is using it, so we may ++ * need to clone it: ++ */ ++ if (trans->iters_live & (1ULL << i->iter->idx)) { ++ i->iter = bch2_trans_copy_iter(trans, i->iter); ++ if (IS_ERR(i->iter)) { ++ trans->need_reset = true; ++ return PTR_ERR(i->iter); ++ } ++ ++ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ bch2_trans_iter_put(trans, i->iter); ++ } ++ ++ bch2_cut_front(n.k->k.p, i->k); ++ bch2_btree_iter_set_pos(i->iter, n.k->k.p); ++ } ++ ++ EBUG_ON(trans->nr_updates >= trans->nr_iters); ++ ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, n); ++ return 0; ++} ++ + static int __bch2_btree_insert(struct btree_trans *trans, + enum btree_id id, struct bkey_i *k) + { +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index fd4fe4bf3a0f..731b93255876 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1433,30 +1433,6 @@ static int trans_get_key(struct btree_trans *trans, + return ret; + } + +-static void *trans_update_key(struct btree_trans *trans, +- struct btree_iter *iter, +- unsigned u64s) +-{ +- struct btree_insert_entry *i; +- struct bkey_i *new_k; +- +- new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); +- if (IS_ERR(new_k)) +- return new_k; +- +- bkey_init(&new_k->k); +- new_k->k.p = iter->pos; +- +- trans_for_each_update(trans, i) +- if (i->iter == iter) { +- i->k = new_k; +- return new_k; +- } +- +- bch2_trans_update(trans, iter, new_k, 0); +- return new_k; +-} +- + static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) +@@ -1540,7 +1516,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + u.data_type = u.dirty_sectors || u.cached_sectors + ? 
data_type : 0; + +- a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); ++ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; +@@ -1548,6 +1524,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + bkey_alloc_init(&a->k_i); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); ++ bch2_trans_update(trans, iter, &a->k_i, 0); + out: + bch2_trans_iter_put(trans, iter); + return ret; +@@ -1562,9 +1539,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_iter *iter; +- struct bkey_i *new_k; + struct bkey_s_c k; +- struct bkey_s_stripe s; ++ struct bkey_i_stripe *s; + int ret = 0; + + ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); +@@ -1579,21 +1555,21 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + goto out; + } + +- new_k = trans_update_key(trans, iter, k.k->u64s); +- ret = PTR_ERR_OR_ZERO(new_k); ++ s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(s); + if (ret) + goto out; + +- bkey_reassemble(new_k, k); +- s = bkey_i_to_s_stripe(new_k); ++ bkey_reassemble(&s->k_i, k); + +- stripe_blockcount_set(s.v, p.block, +- stripe_blockcount_get(s.v, p.block) + ++ stripe_blockcount_set(&s->v, p.block, ++ stripe_blockcount_get(&s->v, p.block) + + sectors); + +- *nr_data = s.v->nr_blocks - s.v->nr_redundant; +- *nr_parity = s.v->nr_redundant; +- bch2_bkey_to_replicas(&r->e, s.s_c); ++ *nr_data = s->v.nr_blocks - s->v.nr_redundant; ++ *nr_parity = s->v.nr_redundant; ++ bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); ++ bch2_trans_update(trans, iter, &s->k_i, 0); + out: + bch2_trans_iter_put(trans, iter); + return ret; +@@ -1674,7 +1650,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_iter *iter; +- struct bkey_i *new_k; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + s64 ret; +@@ -1700,13 +1675,12 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + +- new_k = trans_update_key(trans, iter, k.k->u64s); +- ret = PTR_ERR_OR_ZERO(new_k); ++ r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(r_v); + if (ret) + goto err; + +- bkey_reassemble(new_k, k); +- r_v = bkey_i_to_reflink_v(new_k); ++ bkey_reassemble(&r_v->k_i, k); + + le64_add_cpu(&r_v->v.refcount, + !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); +@@ -1715,6 +1689,8 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + r_v->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&r_v->k, 0); + } ++ ++ bch2_trans_update(trans, iter, &r_v->k_i, 0); + out: + ret = k.k->p.offset - idx; + err: +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index b111ce3829a8..92f42c2fee33 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2437,7 +2437,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + struct address_space *mapping = inode->v.i_mapping; + struct bkey_on_stack copy; + struct btree_trans trans; +- struct btree_iter *src, *dst, *del = NULL; ++ struct btree_iter *src, *dst; + loff_t shift, new_size; + u64 src_start; + int ret; +@@ -2567,29 +2567,6 @@ reassemble: + + next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; + +- /* +- * If the new and old keys overlap (because we're moving an +- * extent that's bigger than the amount we're collapsing by), +- * we need to trim the delete key here so they don't overlap +- * because overlaps on insertions aren't handled before +- * triggers are run, so the overwrite will get double counted +- * by the triggers machinery: +- */ +- if (insert && +- bkey_cmp(bkey_start_pos(©.k->k), delete.k.p) < 0) { +- bch2_cut_back(bkey_start_pos(©.k->k), &delete); +- } else if (!insert && +- bkey_cmp(copy.k->k.p, +- bkey_start_pos(&delete.k)) > 0) { +- bch2_cut_front(copy.k->k.p, &delete); +- +- del = bch2_trans_copy_iter(&trans, src); +- BUG_ON(IS_ERR_OR_NULL(del)); +- +- bch2_btree_iter_set_pos(del, +- bkey_start_pos(&delete.k)); +- } +- + if (copy.k->k.size == k.k->size) { + /* + * If we're moving the entire extent, we can skip +@@ -2607,18 +2584,13 @@ reassemble: + BUG_ON(ret); + } + +- bch2_trans_update(&trans, dst, copy.k, trigger_flags); +- bch2_trans_update(&trans, del ?: src, &delete, trigger_flags); +- +- ret = bch2_trans_commit(&trans, &disk_res, +- &inode->ei_journal_seq, +- BTREE_INSERT_NOFAIL); ++ ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: ++ bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: ++ bch2_trans_commit(&trans, &disk_res, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(c, &disk_res); + bkey_err: +- if (del) +- bch2_trans_iter_put(&trans, del); +- del = NULL; +- + if (!ret) + bch2_btree_iter_set_pos(src, next_pos); + +-- +cgit v1.2.3 + + +From 9a9280fb558e77f8ba13fd58f14e9cf1a6dbc02e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 3 Jan 2020 16:03:10 -0500 +Subject: bcachefs: Make sure bch2_read_extent obeys BCH_READ_MUST_CLONE + +This fixes the bch2_read_retry_nodecode() path, we were resubmitting a +bio without properly reinitializing it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index e406d6d8d916..4c7dd0994a28 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1979,7 +1979,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + goto hole; + + iter.bi_size = pick.crc.compressed_size << 9; +- goto noclone; ++ goto get_bio; + } + + if (!(flags & BCH_READ_LAST_FRAGMENT) || +@@ -2026,7 +2026,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + pick.crc.live_size = bvec_iter_sectors(iter); + offset_into_extent = 0; + } +- ++get_bio: + if (rbio) { + /* + * promote already allocated bounce rbio: +@@ -2064,7 +2064,6 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + rbio->bio.bi_iter = iter; + rbio->split = true; + } else { +-noclone: + rbio = orig; + rbio->bio.bi_iter = iter; + EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); +-- +cgit v1.2.3 + + +From 2ea40d5490198b3c5fea5a126899230f5ede1f23 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 3 Jan 2020 18:04:19 -0500 +Subject: bcachefs: Fix an iterator error path + +On transaction restart (-EINTR), we need to traverse all iterators. 
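The rule stated here (an -EINTR restarts traversal of the whole iterator set rather than being retried on the one iterator that failed) is a retry-all pattern that can be sketched in isolation; traverse_one(), NR_ITERS and the fail_once flag below are invented for the illustration and are not part of the patch.

#include <errno.h>
#include <stdio.h>

#define NR_ITERS 3

static int fail_once = 1;

/* stand-in for btree_iter_traverse_one(): fails transiently the first time
 * element 1 is visited, succeeds everywhere else */
static int traverse_one(int idx)
{
        if (idx == 1 && fail_once) {
                fail_once = 0;
                return -EINTR;
        }
        printf("traversed iter %d\n", idx);
        return 0;
}

static int traverse_all(void)
{
        int i, ret;
retry_all:
        for (i = 0; i < NR_ITERS; i++) {
                ret = traverse_one(i);
                if (ret == -EINTR)
                        goto retry_all; /* transient failure: restart the whole pass */
                if (ret)
                        return ret;     /* hard error: give up */
        }
        return 0;
}

int main(void)
{
        return traverse_all();
}

Run as-is, the first pass stops at element 1 and the second pass traverses all three elements, mirroring how retry_all re-traverses every iterator after a transaction restart.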
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f4ce99a5a19a..988c550c85eb 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1027,10 +1027,7 @@ retry_all: + for (i = 0; i < nr_sorted; i++) { + iter = &trans->iters[sorted[i]]; + +- do { +- ret = btree_iter_traverse_one(iter); +- } while (ret == -EINTR); +- ++ ret = btree_iter_traverse_one(iter); + if (ret) + goto retry_all; + } +-- +cgit v1.2.3 + + +From d5b1c7fdc611ff0695c32859a16040a9d3eb5c20 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 3 Jan 2020 18:57:32 -0500 +Subject: bcachefs: Don't print anything when device doesn't have a label + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/sysfs.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index a8bcba747582..602def1ee95a 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -916,8 +916,6 @@ SHOW(bch2_dev) + bch2_disk_path_to_text(&out, &c->disk_sb, + ca->mi.group - 1); + mutex_unlock(&c->sb_lock); +- } else { +- pr_buf(&out, "none"); + } + + pr_buf(&out, "\n"); +-- +cgit v1.2.3 + + +From 9beff0bc26dda8db95d1892e9d9b47d632c62c43 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 3 Jan 2020 22:38:14 -0500 +Subject: bcachefs: Hacky fixes for device removal + +The device remove test was sporadically failing, because we hadn't +finished dropping btree sector counts for the device when +bch2_replicas_gc2() was called - mainly due to in flight journal writes. +We don't yet have a good mechanism for flushing the counts that +correspend to open journal entries yet. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/migrate.c | 58 +++++++++++++++++++++------------------------------ + fs/bcachefs/super.c | 43 +++++++++++++++++++++++--------------- + 2 files changed, 50 insertions(+), 51 deletions(-) + +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 0e3f63c1d65c..1ef62a189e33 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -53,9 +53,6 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { + if (!bch2_bkey_has_device(k, dev_idx)) { +- ret = bch2_mark_bkey_replicas(c, k); +- if (ret) +- break; + bch2_btree_iter_next(iter); + continue; + } +@@ -129,34 +126,27 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + struct bkey_i_btree_ptr *new_key; + retry: + if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), +- dev_idx)) { +- /* +- * we might have found a btree node key we +- * needed to update, and then tried to update it +- * but got -EINTR after upgrading the iter, but +- * then raced and the node is now gone: +- */ +- bch2_btree_iter_downgrade(iter); +- +- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); +- if (ret) +- goto err; +- } else { +- bkey_copy(&tmp.k, &b->key); +- new_key = bkey_i_to_btree_ptr(&tmp.k); +- +- ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), +- dev_idx, flags, true); +- if (ret) +- goto err; +- +- ret = bch2_btree_node_update_key(c, iter, b, new_key); +- if (ret == -EINTR) { +- b = bch2_btree_iter_peek_node(iter); +- goto retry; +- } +- if (ret) +- goto err; ++ dev_idx)) ++ continue; ++ ++ bkey_copy(&tmp.k, &b->key); ++ new_key = bkey_i_to_btree_ptr(&tmp.k); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), ++ dev_idx, flags, 
true); ++ if (ret) { ++ bch_err(c, "Cannot drop device without losing data"); ++ goto err; ++ } ++ ++ ret = bch2_btree_node_update_key(c, iter, b, new_key); ++ if (ret == -EINTR) { ++ b = bch2_btree_iter_peek_node(iter); ++ goto retry; ++ } ++ if (ret) { ++ bch_err(c, "Error updating btree node key: %i", ret); ++ goto err; + } + } + bch2_trans_iter_free(&trans, iter); +@@ -167,9 +157,10 @@ retry: + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c) || + c->btree_roots_dirty); ++ if (c->btree_roots_dirty) ++ bch2_journal_meta(&c->journal); + if (!bch2_btree_interior_updates_nr_pending(c)) + break; +- bch2_journal_meta(&c->journal); + } + + ret = 0; +@@ -184,6 +175,5 @@ err: + int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) + { + return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: +- bch2_dev_metadata_drop(c, dev_idx, flags) ?: +- bch2_replicas_gc2(c); ++ bch2_dev_metadata_drop(c, dev_idx, flags); + } +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 2b4c39e74125..b46b7d78173e 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1372,7 +1372,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + + mutex_lock(&c->state_lock); + +- percpu_ref_put(&ca->ref); /* XXX */ ++ /* ++ * We consume a reference to ca->ref, regardless of whether we succeed ++ * or fail: ++ */ ++ percpu_ref_put(&ca->ref); + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { + bch_err(ca, "Cannot remove without losing data"); +@@ -1381,11 +1385,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + + __bch2_dev_read_only(c, ca); + +- /* +- * XXX: verify that dev_idx is really not in use anymore, anywhere +- * +- * flag_data_bad() does not check btree pointers +- */ + ret = bch2_dev_data_drop(c, ca->dev_idx, flags); + if (ret) { + bch_err(ca, "Remove failed: error %i dropping data", ret); +@@ -1398,17 +1397,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + goto err; + } + +- data = bch2_dev_has_data(c, ca); +- if (data) { +- char data_has_str[100]; +- +- bch2_flags_to_text(&PBUF(data_has_str), +- bch2_data_types, data); +- bch_err(ca, "Remove failed, still has data (%s)", data_has_str); +- ret = -EBUSY; +- goto err; +- } +- + ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC, + POS(ca->dev_idx, 0), + POS(ca->dev_idx + 1, 0), +@@ -1423,12 +1411,33 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + * (overwritten) keys that point to the device we're removing: + */ + bch2_journal_flush_all_pins(&c->journal); ++ /* ++ * hack to ensure bch2_replicas_gc2() clears out entries to this device ++ */ ++ bch2_journal_meta(&c->journal); + ret = bch2_journal_error(&c->journal); + if (ret) { + bch_err(ca, "Remove failed, journal error"); + goto err; + } + ++ ret = bch2_replicas_gc2(c); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i from replicas gc", ret); ++ goto err; ++ } ++ ++ data = bch2_dev_has_data(c, ca); ++ if (data) { ++ char data_has_str[100]; ++ ++ bch2_flags_to_text(&PBUF(data_has_str), ++ bch2_data_types, data); ++ bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ++ ret = -EBUSY; ++ goto err; ++ } ++ + __bch2_dev_offline(c, ca); + + mutex_lock(&c->sb_lock); +-- +cgit v1.2.3 + + +From f02d566a1d5b0925ecd57ce7e5a082dd4ac320a1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Jan 2020 16:09:52 -0500 +Subject: bcachefs: Kill bch2_fs_bug() + +These have all been converted to fsck/inconsistent 
errors + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 2 +- + fs/bcachefs/error.h | 20 ----------- + fs/bcachefs/extents.c | 87 ++++++++++++++++++++++------------------------ + 3 files changed, 43 insertions(+), 66 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index ed448fad83c5..320e17d108d2 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -156,7 +156,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); +- bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid); ++ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); + return; + } + +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index 7dcb0f6552fc..de319794ccd1 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -16,26 +16,6 @@ struct work_struct; + + /* Error messages: */ + +-/* +- * Very fatal logic/inconsistency errors: these indicate that we've majorly +- * screwed up at runtime, i.e. it's not likely that it was just caused by the +- * data on disk being inconsistent. These BUG(): +- * +- * XXX: audit and convert to inconsistent() checks +- */ +- +-#define bch2_fs_bug(c, ...) \ +-do { \ +- bch_err(c, __VA_ARGS__); \ +- BUG(); \ +-} while (0) +- +-#define bch2_fs_bug_on(cond, c, ...) \ +-do { \ +- if (cond) \ +- bch2_fs_bug(c, __VA_ARGS__); \ +-} while (0) +- + /* + * Inconsistency errors: The on disk data is inconsistent. If these occur during + * initial recovery, they don't indicate a bug in the running code - we walk all +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index a70ece750355..c4b0b9e15a8f 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -172,14 +172,17 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) + struct bucket_mark mark; + struct bch_dev *ca; + +- bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && +- !bch2_bkey_replicas_marked(c, k, false), c, +- "btree key bad (replicas not marked in superblock):\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + ++ if (!percpu_down_read_trylock(&c->mark_lock)) ++ return; ++ ++ bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !bch2_bkey_replicas_marked(c, k, false), c, ++ "btree key bad (replicas not marked in superblock):\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ + bkey_for_each_ptr(ptrs, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + +@@ -194,13 +197,15 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) + mark.dirty_sectors < c->opts.btree_node_size) + goto err; + } +- ++out: ++ percpu_up_read(&c->mark_lock); + return; + err: +- bch2_bkey_val_to_text(&PBUF(buf), c, k); +- bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", +- err, buf, PTR_BUCKET_NR(ca, ptr), +- mark.gen, (unsigned) mark.v.counter); ++ bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", ++ err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ PTR_BUCKET_NR(ca, ptr), ++ mark.gen, (unsigned) mark.v.counter); ++ goto out; + } + + void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, +@@ -223,29 +228,18 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) + struct extent_ptr_decoded p; + char buf[160]; + +- /* +- * XXX: we should be doing most/all of these checks at startup time, +- * where we check bch2_bkey_invalid() in 
btree_node_read_done() +- * +- * But note that we can't check for stale pointers or incorrect gc marks +- * until after journal replay is done (it might be an extent that's +- * going to get overwritten during replay) +- */ +- +- if (percpu_down_read_trylock(&c->mark_lock)) { +- bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && +- !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, +- "extent key bad (replicas not marked in superblock):\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); +- percpu_up_read(&c->mark_lock); +- } +- /* +- * If journal replay hasn't finished, we might be seeing keys +- * that will be overwritten by the time journal replay is done: +- */ +- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + ++ if (!percpu_down_read_trylock(&c->mark_lock)) ++ return; ++ ++ bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, ++ "extent key bad (replicas not marked in superblock):\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); ++ + extent_for_each_ptr_decode(e, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); +@@ -255,21 +249,24 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) + ? mark.cached_sectors + : mark.dirty_sectors; + +- bch2_fs_bug_on(stale && !p.ptr.cached, c, +- "stale dirty pointer (ptr gen %u bucket %u", +- p.ptr.gen, mark.gen); +- +- bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); +- +- bch2_fs_bug_on(!stale && +- (mark.data_type != BCH_DATA_USER || +- mark_sectors < disk_sectors), c, +- "extent pointer not marked: %s:\n" +- "type %u sectors %u < %u", +- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), +- mark.data_type, +- mark_sectors, disk_sectors); ++ bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, ++ "stale dirty pointer (ptr gen %u bucket %u", ++ p.ptr.gen, mark.gen); ++ ++ bch2_fs_inconsistent_on(stale > 96, c, ++ "key too stale: %i", stale); ++ ++ bch2_fs_inconsistent_on(!stale && ++ (mark.data_type != BCH_DATA_USER || ++ mark_sectors < disk_sectors), c, ++ "extent pointer not marked: %s:\n" ++ "type %u sectors %u < %u", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), ++ mark.data_type, ++ mark_sectors, disk_sectors); + } ++ ++ percpu_up_read(&c->mark_lock); + } + + void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, +-- +cgit v1.2.3 + + +From 5110a12c40d42131e33a949bd7a477076d9616c6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Jan 2020 16:33:32 -0500 +Subject: bcachefs: Fix extent_to_replicas() + +This needs to match bch2_mark_extent()/bch2_trans_mark_extent() in +buckets.c + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/replicas.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index cb5ebb87c701..366888b1b36d 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -84,10 +84,10 @@ static void extent_to_replicas(struct bkey_s_c k, + if (p.ptr.cached) + continue; + +- if (p.has_ec) ++ if (!p.has_ec) ++ r->devs[r->nr_devs++] = p.ptr.dev; ++ else + r->nr_required = 0; +- +- r->devs[r->nr_devs++] = p.ptr.dev; + } + } + +-- +cgit v1.2.3 + + +From 0166a75bbbc1b6f3e5d97459b8e58bdf5cf723fd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Jan 2020 16:46:23 -0500 
+Subject: bcachefs: Ensure iterators are valid before calling trans_mark_key() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 325baf2b8920..d05ed96c4280 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -706,6 +706,15 @@ int __bch2_trans_commit(struct btree_trans *trans) + trans_trigger_run = false; + + trans_for_each_update(trans, i) { ++ /* we know trans->nounlock won't be set here: */ ++ if (unlikely(!(i->iter->locks_want < 1 ++ ? __bch2_btree_iter_upgrade(i->iter, 1) ++ : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { ++ trace_trans_restart_upgrade(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ + if (iter_has_trans_triggers(i->iter) && + !i->trans_triggers_run) { + i->trans_triggers_run = true; +@@ -723,15 +732,6 @@ int __bch2_trans_commit(struct btree_trans *trans) + } while (trans_trigger_run); + + trans_for_each_update(trans, i) { +- /* we know trans->nounlock won't be set here: */ +- if (unlikely(!(i->iter->locks_want < 1 +- ? __bch2_btree_iter_upgrade(i->iter, 1) +- : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { +- trace_trans_restart_upgrade(trans->ip); +- ret = -EINTR; +- goto out; +- } +- + u64s = jset_u64s(i->k->k.u64s); + if (0) + trans->journal_preres_u64s += u64s; +-- +cgit v1.2.3 + + +From d955baedca2fbd5488f463ca8f8ce78f43095dd7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Jan 2020 19:04:47 -0500 +Subject: bcachefs: Don't call trans_iter_put() on error pointer + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 4b4aeaf81d21..623b6c3eda95 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -331,7 +331,9 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) + break; + } + } +- bch2_trans_iter_put(trans, iter); ++ ++ if (!IS_ERR(iter)) ++ bch2_trans_iter_put(trans, iter); + + return ret; + } +-- +cgit v1.2.3 + + +From d186676b1484b14acea70aa704ef2709ae96e0dc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 5 Jan 2020 18:20:23 -0500 +Subject: bcachefs: Don't lose needs_whiteout in overwrite path + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index d05ed96c4280..dfbe5dcd2b77 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -99,13 +99,14 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + + } + ++ insert->k.needs_whiteout = k->needs_whiteout; ++ k->needs_whiteout = false; ++ + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + goto overwrite; + } + +- insert->k.needs_whiteout = k->needs_whiteout; +- k->needs_whiteout = false; + k->type = KEY_TYPE_deleted; + /* + * XXX: we should be able to do this without two calls to +-- +cgit v1.2.3 + + +From 352ab18a61e1e4e6cdd6100f008d03c39a245ebe Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 6 Jan 2020 22:25:09 -0500 +Subject: bcachefs: Rework iter->pos handling + +- Rework some of the helper comparison functions for consistency + +- Currently trying to refactor all the logic that's different for +extents in the btree iterator code. 
The main difference is that for non +extents we search for a key greater than or equal to the search key, +while for extents we search for a key strictly greater than the search +key (iter->pos). + +So that logic is now handled by btree_iter_search_key(), which computes +the real search key based on iter->pos and whether or not we're +searching for a key >= or > iter->pos. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 10 +-- + fs/bcachefs/bset.h | 30 ++++---- + fs/bcachefs/btree_iter.c | 136 ++++++++++++++---------------------- + fs/bcachefs/btree_update_interior.c | 2 +- + 4 files changed, 72 insertions(+), 106 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 4892c002214e..f268def79927 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -1392,21 +1392,21 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, + { + if (lossy_packed_search) + while (m != btree_bkey_last(b, t) && +- bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, +- m) > 0) ++ bkey_iter_cmp_p_or_unp(b, m, ++ lossy_packed_search, search) < 0) + m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + + if (!packed_search) + while (m != btree_bkey_last(b, t) && +- bkey_iter_pos_cmp(b, search, m) > 0) ++ bkey_iter_pos_cmp(b, m, search) < 0) + m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + + if (btree_keys_expensive_checks(b)) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); + + BUG_ON(prev && +- bkey_iter_cmp_p_or_unp(b, search, packed_search, +- prev) <= 0); ++ bkey_iter_cmp_p_or_unp(b, prev, ++ packed_search, search) >= 0); + } + + return m; +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 2653a74b3b14..8f4856eeecc0 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -360,7 +360,7 @@ void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); + static inline int bkey_cmp_p_or_unp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r_packed, +- struct bpos *r) ++ const struct bpos *r) + { + EBUG_ON(r_packed && !bkey_packed(r_packed)); + +@@ -449,7 +449,7 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) + * XXX: only need to compare pointers for keys that are both within a + * btree_node_iterator - we need to break ties for prev() to work correctly + */ +-static inline int bkey_iter_cmp(struct btree *b, ++static inline int bkey_iter_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) + { +@@ -458,7 +458,7 @@ static inline int bkey_iter_cmp(struct btree *b, + ?: cmp_int(l, r); + } + +-static inline int btree_node_iter_cmp(struct btree *b, ++static inline int btree_node_iter_cmp(const struct btree *b, + struct btree_node_iter_set l, + struct btree_node_iter_set r) + { +@@ -467,22 +467,22 @@ static inline int btree_node_iter_cmp(struct btree *b, + __btree_node_offset_to_key(b, r.k)); + } + +-/* These assume l (the search key) is not a deleted key: */ +-static inline int bkey_iter_pos_cmp(struct btree *b, +- struct bpos *l, +- const struct bkey_packed *r) ++/* These assume r (the search key) is not a deleted key: */ ++static inline int bkey_iter_pos_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) + { +- return -bkey_cmp_left_packed(b, r, l) +- ?: (int) bkey_deleted(r); ++ return bkey_cmp_left_packed(b, l, r) ++ ?: -((int) bkey_deleted(l)); + } + +-static inline int bkey_iter_cmp_p_or_unp(struct btree *b, +- struct bpos *l, +- const struct bkey_packed *l_packed, +- const struct 
bkey_packed *r) ++static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) + { +- return -bkey_cmp_p_or_unp(b, r, l_packed, l) +- ?: (int) bkey_deleted(r); ++ return bkey_cmp_p_or_unp(b, l, r_packed, r) ++ ?: -((int) bkey_deleted(l)); + } + + static inline struct bkey_packed * +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 988c550c85eb..5c940ca8d8a7 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -11,10 +11,6 @@ + #include + #include + +-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, +- struct btree_iter_level *, +- struct bkey *); +- + #define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) + #define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) + #define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) +@@ -29,37 +25,14 @@ static inline bool is_btree_node(struct btree_iter *iter, unsigned l) + (unsigned long) iter->l[l].b >= 128; + } + +-/* Returns < 0 if @k is before iter pos, > 0 if @k is after */ +-static inline int __btree_iter_pos_cmp(struct btree_iter *iter, +- const struct btree *b, +- const struct bkey_packed *k, +- bool interior_node) ++static inline struct bpos btree_iter_search_key(struct btree_iter *iter) + { +- int cmp = bkey_cmp_left_packed(b, k, &iter->pos); +- +- if (cmp) +- return cmp; +- if (bkey_deleted(k)) +- return -1; +- +- /* +- * Normally, for extents we want the first key strictly greater than +- * the iterator position - with the exception that for interior nodes, +- * we don't want to advance past the last key if the iterator position +- * is POS_MAX: +- */ +- if (iter->flags & BTREE_ITER_IS_EXTENTS && +- (!interior_node || +- bkey_cmp_left_packed_byval(b, k, POS_MAX))) +- return -1; +- return 1; +-} ++ struct bpos pos = iter->pos; + +-static inline int btree_iter_pos_cmp(struct btree_iter *iter, +- const struct btree *b, +- const struct bkey_packed *k) +-{ +- return __btree_iter_pos_cmp(iter, b, k, b->level != 0); ++ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ bkey_cmp(pos, POS_MAX)) ++ pos = bkey_successor(pos); ++ return pos; + } + + /* Btree node locking: */ +@@ -415,6 +388,7 @@ void bch2_trans_unlock(struct btree_trans *trans) + static void __bch2_btree_iter_verify(struct btree_iter *iter, + struct btree *b) + { ++ struct bpos pos = btree_iter_search_key(iter); + struct btree_iter_level *l = &iter->l[b->level]; + struct btree_node_iter tmp = l->iter; + struct bkey_packed *k; +@@ -437,17 +411,17 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, + k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS + ? 
bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) + : bch2_btree_node_iter_prev_all(&tmp, b); +- if (k && btree_iter_pos_cmp(iter, b, k) > 0) { ++ if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) { + char buf[100]; + struct bkey uk = bkey_unpack_key(b, k); + + bch2_bkey_to_text(&PBUF(buf), &uk); +- panic("prev key should be before iter pos:\n%s\n%llu:%llu\n", ++ panic("iterator should be before prev key:\n%s\n%llu:%llu\n", + buf, iter->pos.inode, iter->pos.offset); + } + + k = bch2_btree_node_iter_peek_all(&l->iter, b); +- if (k && btree_iter_pos_cmp(iter, b, k) < 0) { ++ if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) { + char buf[100]; + struct bkey uk = bkey_unpack_key(b, k); + +@@ -495,15 +469,19 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, + } + + static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, +- struct btree *b, +- struct bkey_packed *where) ++ struct btree *b, ++ struct bkey_packed *where) + { +- struct btree_node_iter *node_iter = &iter->l[0].iter; ++ struct btree_iter_level *l = &iter->l[b->level]; ++ struct bpos pos = btree_iter_search_key(iter); + +- if (where == bch2_btree_node_iter_peek_all(node_iter, b)) { +- bkey_disassemble(b, where, &iter->k); +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +- } ++ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) ++ return; ++ ++ if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } + + void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, +@@ -535,6 +513,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, + bool iter_current_key_modified = + orig_iter_pos >= offset && + orig_iter_pos <= offset + clobber_u64s; ++ struct bpos iter_pos = btree_iter_search_key(iter); + + btree_node_iter_for_each(node_iter, set) + if (set->end == old_end) +@@ -542,7 +521,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, + + /* didn't find the bset in the iterator - might have to readd it: */ + if (new_u64s && +- btree_iter_pos_cmp(iter, b, where) > 0) { ++ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { + bch2_btree_node_iter_push(node_iter, b, where, end); + goto fixup_done; + } else { +@@ -557,7 +536,7 @@ found: + return; + + if (new_u64s && +- btree_iter_pos_cmp(iter, b, where) > 0) { ++ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { + set->k = offset; + } else if (set->k < offset + clobber_u64s) { + set->k = offset + new_u64s; +@@ -702,11 +681,12 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, + struct btree_iter_level *l, + int max_advance) + { ++ struct bpos pos = btree_iter_search_key(iter); + struct bkey_packed *k; + int nr_advanced = 0; + + while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && +- btree_iter_pos_cmp(iter, l->b, k) < 0) { ++ bkey_iter_pos_cmp(l->b, k, &pos) < 0) { + if (max_advance > 0 && nr_advanced >= max_advance) + return false; + +@@ -765,13 +745,7 @@ static inline bool btree_iter_pos_before_node(struct btree_iter *iter, + static inline bool btree_iter_pos_after_node(struct btree_iter *iter, + struct btree *b) + { +- int cmp = bkey_cmp(b->key.k.p, iter->pos); +- +- if (!cmp && +- (iter->flags & BTREE_ITER_IS_EXTENTS) && +- bkey_cmp(b->key.k.p, POS_MAX)) +- cmp = -1; +- return cmp < 0; ++ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; + } + + static inline bool btree_iter_pos_in_node(struct btree_iter *iter, +@@ -785,16 +759,10 @@ static inline bool 
btree_iter_pos_in_node(struct btree_iter *iter, + static inline void __btree_iter_init(struct btree_iter *iter, + unsigned level) + { ++ struct bpos pos = btree_iter_search_key(iter); + struct btree_iter_level *l = &iter->l[level]; + +- bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos); +- +- if (iter->flags & BTREE_ITER_IS_EXTENTS) +- btree_iter_advance_to_pos(iter, l, -1); +- +- /* Skip to first non whiteout: */ +- if (level) +- bch2_btree_node_iter_peek(&l->iter, l->b); ++ bch2_btree_node_iter_init(&l->iter, l->b, &pos); + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } +@@ -1564,9 +1532,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) + int ret; + + recheck: +- while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && +- bkey_cmp(k.k->p, iter->pos) <= 0) +- bch2_btree_node_iter_advance(&l->iter, l->b); ++ btree_iter_advance_to_pos(iter, l, -1); + + /* + * iterator is now at the correct position for inserting at iter->pos, +@@ -1575,9 +1541,27 @@ recheck: + */ + + node_iter = l->iter; +- if (k.k && bkey_whiteout(k.k)) +- k = __btree_iter_unpack(iter, l, &iter->k, +- bch2_btree_node_iter_peek(&node_iter, l->b)); ++ k = __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_peek(&node_iter, l->b)); ++ ++ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { ++ /* ++ * If there wasn't actually a hole, want the iterator to be ++ * pointed at the key we found: ++ * ++ * XXX: actually, we shouldn't be changing the iterator here: ++ * the iterator needs to be correct for inserting at iter->pos, ++ * and there may be whiteouts between iter->pos and what this ++ * iterator points at: ++ */ ++ l->iter = node_iter; ++ ++ EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ __bch2_btree_iter_verify(iter, l->b); ++ return k; ++ } + + /* + * If we got to the end of the node, check if we need to traverse to the +@@ -1592,24 +1576,6 @@ recheck: + goto recheck; + } + +- if (k.k && +- !bkey_whiteout(k.k) && +- bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { +- /* +- * if we skipped forward to find the first non whiteout and +- * there _wasn't_ actually a hole, we want the iterator to be +- * pointed at the key we found: +- */ +- l->iter = node_iter; +- +- EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); +- EBUG_ON(bkey_deleted(k.k)); +- iter->uptodate = BTREE_ITER_UPTODATE; +- +- __bch2_btree_iter_verify(iter, l->b); +- return k; +- } +- + /* hole */ + + /* holes can't span inode numbers: */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 0278145e9058..8264dd05921b 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1191,7 +1191,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + BTREE_TRIGGER_GC); + + while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && +- bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) ++ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) + bch2_btree_node_iter_advance(node_iter, b); + + /* +-- +cgit v1.2.3 + + +From 581745016dbffd9b63ef8dd2fb0c823b28100272 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 6 Jan 2020 23:43:04 -0500 +Subject: bcachefs: Refactor bch2_btree_bset_insert_key() + +The main thing going on is to separate out the different cases deletion, +overwriting, and inserting a new key. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 79 ++++++++++++++++++++--------------------- + 1 file changed, 39 insertions(+), 40 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index dfbe5dcd2b77..76d28f23f927 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -64,64 +64,63 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + bkey_cmp(insert->k.p, b->data->max_key) > 0); + + k = bch2_btree_node_iter_peek_all(node_iter, b); +- if (k && !bkey_cmp_packed(b, k, &insert->k)) { +- BUG_ON(bkey_whiteout(k)); ++ if (k && bkey_cmp_packed(b, k, &insert->k)) ++ k = NULL; + +- if (!bkey_written(b, k) && +- bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) && +- !bkey_whiteout(&insert->k)) { +- k->type = insert->k.type; +- memcpy_u64s(bkeyp_val(f, k), &insert->v, +- bkey_val_u64s(&insert->k)); +- return true; +- } ++ /* @k is the key being overwritten/deleted, if any: */ + +- btree_account_key_drop(b, k); ++ EBUG_ON(k && bkey_whiteout(k)); + +- if (bkey_whiteout(&insert->k)) { +- unsigned clobber_u64s = k->u64s, new_u64s = k->u64s; ++ if (bkey_whiteout(&insert->k)) { ++ /* Deleting: */ + +- k->type = KEY_TYPE_deleted; ++ /* Not found? Nothing to do: */ ++ if (!k) ++ return false; + +- if (k->needs_whiteout) { +- push_whiteout(iter->trans->c, b, k); +- k->needs_whiteout = false; +- } ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; + +- if (k >= btree_bset_last(b)->start) { +- bch2_bset_delete(b, k, clobber_u64s); +- new_u64s = 0; +- } ++ if (k->needs_whiteout) { ++ push_whiteout(iter->trans->c, b, k); ++ k->needs_whiteout = false; ++ } + ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ ++ bch2_bset_delete(b, k, clobber_u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, k, +- clobber_u64s, new_u64s); +- return true; ++ clobber_u64s, 0); ++ } else { ++ bch2_btree_iter_fix_key_modified(iter, b, k); ++ } + ++ return true; ++ } ++ ++ if (k) { ++ /* Overwriting: */ ++ if (!bkey_written(b, k) && ++ bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { ++ k->type = insert->k.type; ++ memcpy_u64s(bkeyp_val(f, k), &insert->v, ++ bkey_val_u64s(&insert->k)); ++ return true; + } + ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ + insert->k.needs_whiteout = k->needs_whiteout; + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + goto overwrite; ++ } else { ++ bch2_btree_iter_fix_key_modified(iter, b, k); + } +- +- k->type = KEY_TYPE_deleted; +- /* +- * XXX: we should be able to do this without two calls to +- * bch2_btree_node_iter_fix: +- */ +- bch2_btree_node_iter_fix(iter, b, node_iter, k, +- k->u64s, k->u64s); +- } else { +- /* +- * Deleting, but the key to delete wasn't found - nothing to do: +- */ +- if (bkey_whiteout(&insert->k)) +- return false; +- +- insert->k.needs_whiteout = false; + } + + k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); +-- +cgit v1.2.3 + + +From 9cb79f98b933a15345a4cc2863499f281c454867 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 9 Jan 2020 20:43:58 -0500 +Subject: bcachefs: Add some comments for btree iterator flags + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 86e52468c1aa..b7af88e05837 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -184,9 +184,25 
@@ enum btree_iter_type { + + #define BTREE_ITER_TYPE ((1 << 2) - 1) + ++/* ++ * Iterate over all possible positions, synthesizing deleted keys for holes: ++ */ + #define BTREE_ITER_SLOTS (1 << 2) ++/* ++ * Indicates that intent locks should be taken on leaf nodes, because we expect ++ * to be doing updates: ++ */ + #define BTREE_ITER_INTENT (1 << 3) ++/* ++ * Causes the btree iterator code to prefetch additional btree nodes from disk: ++ */ + #define BTREE_ITER_PREFETCH (1 << 4) ++/* ++ * Indicates that this iterator should not be reused until transaction commit, ++ * either because a pending update references it or because the update depends ++ * on that particular key being locked (e.g. by the str_hash code, for hash ++ * table consistency) ++ */ + #define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) + /* + * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for +-- +cgit v1.2.3 + + +From bb7b3bd0546b69e5d12f8bea3b25a70ae9f43dff Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 15 Jan 2020 15:11:22 -0500 +Subject: bcachefs: Change btree split threshold to be in u64s + +This fixes a bug with very small btree nodes where splitting would end +up with one of the new nodes empty. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.h | 2 +- + fs/bcachefs/btree_update_interior.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index c5873c58439c..83358d6a4df8 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -75,7 +75,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) + return c->opts.btree_node_size >> c->block_bits; + } + +-#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4) ++#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 3 / 4) + + #define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) + #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 8264dd05921b..da13a20d9a95 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1385,7 +1385,7 @@ static void btree_split(struct btree_update *as, struct btree *b, + if (keys) + btree_split_insert_keys(as, n1, iter, keys); + +- if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) { ++ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { + trace_btree_split(c, b); + + n2 = __btree_split_node(as, n1, iter); +-- +cgit v1.2.3 + + +From b8d02fd44fc34252caa6e53952350184d74e73c7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 15 Jan 2020 22:53:49 -0500 +Subject: bcachefs: Fix bch2_sort_keys() to not modify src keys + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 18 +++++++----------- + fs/bcachefs/bset.h | 6 ------ + fs/bcachefs/btree_iter.c | 6 ------ + 3 files changed, 7 insertions(+), 23 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 18f842012f05..1c8e5a80e32a 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -254,23 +254,18 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, + sort_iter_sort(iter, sort_keys_cmp); + + while ((in = sort_iter_next(iter, sort_keys_cmp))) { ++ bool needs_whiteout = false; ++ + if (bkey_whiteout(in) && + (filter_whiteouts || !in->needs_whiteout)) + continue; + +- if (bkey_whiteout(in) && +- (next = sort_iter_peek(iter)) && +- !bkey_cmp_packed(iter->b, in, next)) { ++ while ((next = sort_iter_peek(iter)) 
&& ++ !bkey_cmp_packed(iter->b, in, next)) { + BUG_ON(in->needs_whiteout && + next->needs_whiteout); +- /* +- * XXX racy, called with read lock from write path +- * +- * leads to spurious BUG_ON() in bkey_unpack_key() in +- * debug mode +- */ +- next->needs_whiteout |= in->needs_whiteout; +- continue; ++ needs_whiteout |= in->needs_whiteout; ++ in = sort_iter_next(iter, sort_keys_cmp); + } + + if (bkey_whiteout(in)) { +@@ -279,6 +274,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, + } else { + bkey_copy(out, in); + } ++ out->needs_whiteout |= needs_whiteout; + out = bkey_next(out); + } + +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 8f4856eeecc0..7338ccbc8cbd 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -199,12 +199,6 @@ __bkey_unpack_key_format_checked(const struct btree *b, + if (btree_keys_expensive_checks(b)) { + struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + +- /* +- * hack around a harmless race when compacting whiteouts +- * for a write: +- */ +- dst2.needs_whiteout = dst->needs_whiteout; +- + BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); + } + } +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5c940ca8d8a7..ea0555b806f0 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1339,12 +1339,6 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) + if (debug_check_iterators(iter->trans->c)) { + struct bkey k = bkey_unpack_key(l->b, _k); + +- /* +- * this flag is internal to the btree code, +- * we don't care if it doesn't match - if it's now set +- * it just means the key has been written out to disk: +- */ +- k.needs_whiteout = iter->k.needs_whiteout; + BUG_ON(memcmp(&k, &iter->k, sizeof(k))); + } + +-- +cgit v1.2.3 + + +From fef53b0f8d6c058bbac17ea7dd021902f4528497 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 16 Jan 2020 16:14:56 -0500 +Subject: bcachefs: Don't modify existing key in place in sort_repack_merge() + +This fixes a nasty memory corruption with other threads that are still +reading the btree node being compacted. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 1c8e5a80e32a..7cbb57042af1 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -210,28 +210,38 @@ bch2_sort_repack_merge(struct bch_fs *c, + bool filter_whiteouts) + { + struct bkey_packed *prev = NULL, *k_packed; +- struct bkey_s k; ++ struct bkey_on_stack k; + struct btree_nr_keys nr; +- struct bkey unpacked; + + memset(&nr, 0, sizeof(nr)); ++ bkey_on_stack_init(&k); + + while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { + if (filter_whiteouts && bkey_whiteout(k_packed)) + continue; + +- k = __bkey_disassemble(src, k_packed, &unpacked); ++ /* ++ * NOTE: ++ * bch2_bkey_normalize may modify the key we pass it (dropping ++ * stale pointers) and we don't have a write lock on the src ++ * node; we have to make a copy of the entire key before calling ++ * normalize ++ */ ++ bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); ++ bch2_bkey_unpack(src, k.k, k_packed); + + if (filter_whiteouts && +- bch2_bkey_normalize(c, k)) ++ bch2_bkey_normalize(c, bkey_i_to_s(k.k))) + continue; + +- extent_sort_append(c, out_f, &nr, vstruct_last(dst), &prev, k); ++ extent_sort_append(c, out_f, &nr, vstruct_last(dst), ++ &prev, bkey_i_to_s(k.k)); + } + + extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev); + + dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); ++ bkey_on_stack_exit(&k, c); + return nr; + } + +-- +cgit v1.2.3 + + +From 13e42b7c688cafca0b97d900128753d993046943 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 16 Jan 2020 16:20:53 -0500 +Subject: bcachefs: Add a cond_resched() to rebalance loop + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/rebalance.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index 84b3fb6eb101..612385e9d4e4 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -183,6 +183,8 @@ static int bch2_rebalance_thread(void *arg) + prev_cputime = curr_cputime(); + + while (!kthread_wait_freezable(r->enabled)) { ++ cond_resched(); ++ + start = jiffies; + cputime = curr_cputime(); + +-- +cgit v1.2.3 + + +From bbc137491905a90e4cc10f5c0b2182fc327298b7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 9 Jan 2020 18:30:05 -0500 +Subject: bcachefs: Improve tracepoints slightly in commit path + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 16 ++++++++++++---- + 1 file changed, 12 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 76d28f23f927..afd2086edeff 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -706,10 +706,18 @@ int __bch2_trans_commit(struct btree_trans *trans) + trans_trigger_run = false; + + trans_for_each_update(trans, i) { +- /* we know trans->nounlock won't be set here: */ +- if (unlikely(!(i->iter->locks_want < 1 +- ? 
__bch2_btree_iter_upgrade(i->iter, 1) +- : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { ++ if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK)) { ++ trace_trans_restart_traverse(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ ++ /* ++ * We're not using bch2_btree_iter_upgrade here because ++ * we know trans->nounlock can't be set: ++ */ ++ if (unlikely(i->iter->locks_want < 1 && ++ !__bch2_btree_iter_upgrade(i->iter, 1))) { + trace_trans_restart_upgrade(trans->ip); + ret = -EINTR; + goto out; +-- +cgit v1.2.3 + + +From c5af017922c0b47cbe38122349874ff095912cad Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 20 Jan 2020 19:42:38 -0500 +Subject: bcachefs: Refactor rebalance_pred function + +Before, the logic for if we should move an extent was duplicated +somewhat, in both rebalance_add_key() and rebalance_pred(); this +centralizes that in __rebalance_pred() + +This is prep work for a patch that enables marking data as +incompressible. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/rebalance.c | 93 +++++++++++++++++++++++-------------------------- + 1 file changed, 44 insertions(+), 49 deletions(-) + +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index 612385e9d4e4..d5883f89fb2d 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -17,50 +17,51 @@ + #include + #include + +-static inline bool rebalance_ptr_pred(struct bch_fs *c, +- struct extent_ptr_decoded p, +- struct bch_io_opts *io_opts) ++/* ++ * Check if an extent should be moved: ++ * returns -1 if it should not be moved, or ++ * device of pointer that should be moved, if known, or INT_MAX if unknown ++ */ ++static int __bch2_rebalance_pred(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts) + { +- if (io_opts->background_target && +- !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) && +- !p.ptr.cached) +- return true; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ if (io_opts->background_compression) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ p.crc.compression_type != ++ bch2_compression_opt_to_type[io_opts->background_compression]) ++ return p.ptr.dev; + +- if (io_opts->background_compression && +- p.crc.compression_type != +- bch2_compression_opt_to_type[io_opts->background_compression]) +- return true; ++ if (io_opts->background_target) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) ++ return p.ptr.dev; + +- return false; ++ return -1; + } + + void bch2_rebalance_add_key(struct bch_fs *c, + struct bkey_s_c k, + struct bch_io_opts *io_opts) + { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; ++ atomic64_t *counter; ++ int dev; + +- if (!io_opts->background_target && +- !io_opts->background_compression) ++ dev = __bch2_rebalance_pred(c, k, io_opts); ++ if (dev < 0) + return; + +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- if (rebalance_ptr_pred(c, p, io_opts)) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ counter = dev < INT_MAX ++ ? 
&bch_dev_bkey_exists(c, dev)->rebalance_work ++ : &c->rebalance.work_unknown_dev; + +- if (atomic64_add_return(p.crc.compressed_size, +- &ca->rebalance_work) == +- p.crc.compressed_size) +- rebalance_wakeup(c); +- } +-} +- +-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) +-{ +- if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == +- sectors) ++ if (atomic64_add_return(k.k->size, counter) == k.k->size) + rebalance_wakeup(c); + } + +@@ -69,26 +70,20 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) + { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- unsigned nr_replicas = 0; +- +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- nr_replicas += !p.ptr.cached; +- +- if (rebalance_ptr_pred(c, p, io_opts)) +- goto found; ++ if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { ++ data_opts->target = io_opts->background_target; ++ data_opts->btree_insert_flags = 0; ++ return DATA_ADD_REPLICAS; ++ } else { ++ return DATA_SKIP; + } ++} + +- if (nr_replicas < io_opts->data_replicas) +- goto found; +- +- return DATA_SKIP; +-found: +- data_opts->target = io_opts->background_target; +- data_opts->btree_insert_flags = 0; +- return DATA_ADD_REPLICAS; ++void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) ++{ ++ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == ++ sectors) ++ rebalance_wakeup(c); + } + + struct rebalance_work { +-- +cgit v1.2.3 + + +From aab82eae8e11a671a0f09bdc3ed6d1be7ead5f8a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 23 Feb 2018 16:26:10 -0500 +Subject: bcachefs: Track incompressible data + +This fixes the background_compression option: wihout some way of marking +data as incompressible, rebalance will keep rewriting incompressible +data over and over. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 14 ++++++++------ + fs/bcachefs/checksum.c | 3 ++- + fs/bcachefs/checksum.h | 7 +++++-- + fs/bcachefs/compress.c | 2 +- + fs/bcachefs/extents.c | 26 ++++++++++++++++++-------- + fs/bcachefs/extents.h | 7 +++++++ + fs/bcachefs/io.c | 42 +++++++++++++++++++++++++----------------- + fs/bcachefs/io_types.h | 1 + + fs/bcachefs/move.c | 19 ++++++++++++------- + fs/bcachefs/rebalance.c | 3 ++- + fs/bcachefs/sysfs.c | 2 +- + 11 files changed, 82 insertions(+), 44 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index f6141fde830b..3b5e70a727b7 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1294,7 +1294,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + x(reflink, 6) \ + x(new_siphash, 7) \ + x(inline_data, 8) \ +- x(new_extent_overwrite, 9) ++ x(new_extent_overwrite, 9) \ ++ x(incompressible, 10) + + enum bch_sb_feature { + #define x(f, n) BCH_FEATURE_##f, +@@ -1374,11 +1375,12 @@ enum bch_csum_opts { + }; + + #define BCH_COMPRESSION_TYPES() \ +- x(none, 0) \ +- x(lz4_old, 1) \ +- x(gzip, 2) \ +- x(lz4, 3) \ +- x(zstd, 4) ++ x(none, 0) \ ++ x(lz4_old, 1) \ ++ x(gzip, 2) \ ++ x(lz4, 3) \ ++ x(zstd, 4) \ ++ x(incompressible, 5) + + enum bch_compression_type { + #define x(t, n) BCH_COMPRESSION_TYPE_##t, +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +index 2e1dfdc68e15..3d88719ba86c 100644 +--- a/fs/bcachefs/checksum.c ++++ b/fs/bcachefs/checksum.c +@@ -326,7 +326,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, + + BUG_ON(len_a + len_b > bio_sectors(bio)); + BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); +- BUG_ON(crc_old.compression_type); ++ BUG_ON(crc_is_compressed(crc_old)); + BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)); + +@@ -355,6 +355,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, + if (i->crc) + *i->crc = (struct bch_extent_crc_unpacked) { + .csum_type = i->csum_type, ++ .compression_type = crc_old.compression_type, + .compressed_size = i->len, + .uncompressed_size = i->len, + .offset = 0, +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +index ca9e45906dc8..24dee8039d57 100644 +--- a/fs/bcachefs/checksum.h ++++ b/fs/bcachefs/checksum.h +@@ -155,13 +155,16 @@ static inline struct nonce null_nonce(void) + static inline struct nonce extent_nonce(struct bversion version, + struct bch_extent_crc_unpacked crc) + { +- unsigned size = crc.compression_type ? crc.uncompressed_size : 0; ++ unsigned compression_type = crc_is_compressed(crc) ++ ? crc.compression_type ++ : 0; ++ unsigned size = compression_type ? 
crc.uncompressed_size : 0; + struct nonce nonce = (struct nonce) {{ + [0] = cpu_to_le32(size << 22), + [1] = cpu_to_le32(version.lo), + [2] = cpu_to_le32(version.lo >> 32), + [3] = cpu_to_le32(version.hi| +- (crc.compression_type << 24))^BCH_NONCE_EXTENT, ++ (compression_type << 24))^BCH_NONCE_EXTENT, + }}; + + return nonce_add(nonce, crc.nonce << 9); +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index e311a382c9c2..81c69c1554f4 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -434,7 +434,7 @@ out: + bio_unmap_or_unbounce(c, dst_data); + return compression_type; + err: +- compression_type = 0; ++ compression_type = BCH_COMPRESSION_TYPE_incompressible; + goto out; + } + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index c4b0b9e15a8f..a19b91f9beb4 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -337,7 +337,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, + if (!bch2_checksum_mergeable(crc_l.csum_type)) + return BCH_MERGE_NOMERGE; + +- if (crc_l.compression_type) ++ if (crc_is_compressed(crc_l)) + return BCH_MERGE_NOMERGE; + + if (crc_l.csum_type && +@@ -448,7 +448,7 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, + struct bch_extent_crc_unpacked n) + { +- return !u.compression_type && ++ return !crc_is_compressed(u) && + u.csum_type && + u.uncompressed_size > u.live_size && + bch2_csum_type_is_encryption(u.csum_type) == +@@ -492,7 +492,7 @@ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) + /* Find a checksum entry that covers only live data: */ + if (!n.csum_type) { + bkey_for_each_crc(&k->k, ptrs, u, i) +- if (!u.compression_type && ++ if (!crc_is_compressed(u) && + u.csum_type && + u.live_size == u.uncompressed_size) { + n = u; +@@ -501,7 +501,7 @@ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) + return false; + } + found: +- BUG_ON(n.compression_type); ++ BUG_ON(crc_is_compressed(n)); + BUG_ON(n.offset); + BUG_ON(n.live_size != k->k.size); + +@@ -610,8 +610,7 @@ unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- ret += !p.ptr.cached && +- p.crc.compression_type == BCH_COMPRESSION_TYPE_none; ++ ret += !p.ptr.cached && !crc_is_compressed(p.crc); + } + + return ret; +@@ -625,13 +624,24 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) + unsigned ret = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- if (!p.ptr.cached && +- p.crc.compression_type != BCH_COMPRESSION_TYPE_none) ++ if (!p.ptr.cached && crc_is_compressed(p.crc)) + ret += p.crc.compressed_size; + + return ret; + } + ++bool bch2_bkey_is_incompressible(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, entry) ++ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) ++ return true; ++ return false; ++} ++ + bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + unsigned nr_replicas) + { +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 7c5a41e6d79d..0d8554172263 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -175,6 +175,12 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) + #undef common_fields + } + ++static inline bool 
crc_is_compressed(struct bch_extent_crc_unpacked crc) ++{ ++ return (crc.compression_type != BCH_COMPRESSION_TYPE_none && ++ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); ++} ++ + /* bkey_ptrs: generically over any key type that has ptrs */ + + struct bkey_ptrs_c { +@@ -483,6 +489,7 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) + unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); + unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); + unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); ++bool bch2_bkey_is_incompressible(struct bkey_s_c); + unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); + bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); + unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 4c7dd0994a28..49bd29ccc543 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -546,9 +546,14 @@ static void __bch2_write_index(struct bch_write_op *op) + * particularly want to plumb io_opts all the way through the btree + * update stack right now + */ +- for_each_keylist_key(keys, k) ++ for_each_keylist_key(keys, k) { + bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); + ++ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) ++ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); ++ ++ } ++ + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); + int ret = op->index_update_fn(op); +@@ -784,8 +789,9 @@ static enum prep_encoded_ret { + /* Can we just write the entire extent as is? */ + if (op->crc.uncompressed_size == op->crc.live_size && + op->crc.compressed_size <= wp->sectors_free && +- op->crc.compression_type == op->compression_type) { +- if (!op->crc.compression_type && ++ (op->crc.compression_type == op->compression_type || ++ op->incompressible)) { ++ if (!crc_is_compressed(op->crc) && + op->csum_type != op->crc.csum_type && + bch2_write_rechecksum(c, op, op->csum_type)) + return PREP_ENCODED_CHECKSUM_ERR; +@@ -797,7 +803,7 @@ static enum prep_encoded_ret { + * If the data is compressed and we couldn't write the entire extent as + * is, we have to decompress it: + */ +- if (op->crc.compression_type) { ++ if (crc_is_compressed(op->crc)) { + struct bch_csum csum; + + if (bch2_write_decrypt(op)) +@@ -908,11 +914,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_type && !bounce); + +- crc.compression_type = op->compression_type +- ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, +- op->compression_type) ++ crc.compression_type = op->incompressible ++ ? BCH_COMPRESSION_TYPE_incompressible ++ : op->compression_type ++ ? 
bch2_bio_compress(c, dst, &dst_len, src, &src_len, ++ op->compression_type) + : 0; +- if (!crc.compression_type) { ++ if (!crc_is_compressed(crc)) { + dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); + +@@ -941,7 +949,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + } + + if ((op->flags & BCH_WRITE_DATA_ENCODED) && +- !crc.compression_type && ++ !crc_is_compressed(crc) && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { + /* +@@ -1338,6 +1346,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) + + static struct promote_op *__promote_alloc(struct bch_fs *c, + enum btree_id btree_id, ++ struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, +@@ -1394,8 +1403,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, + (struct data_opts) { + .target = opts.promote_target + }, +- btree_id, +- bkey_s_c_null); ++ btree_id, k); + BUG_ON(ret); + + return op; +@@ -1437,7 +1445,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_REFLINK + : BTREE_ID_EXTENTS, +- pos, pick, opts, sectors, rbio); ++ k, pos, pick, opts, sectors, rbio); + if (!promote) + return NULL; + +@@ -1701,7 +1709,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) + u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; + int ret; + +- if (rbio->pick.crc.compression_type) ++ if (crc_is_compressed(rbio->pick.crc)) + return; + + bkey_on_stack_init(&new); +@@ -1786,7 +1794,7 @@ static void __bch2_read_endio(struct work_struct *work) + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + +- if (crc.compression_type != BCH_COMPRESSION_TYPE_none) { ++ if (crc_is_compressed(crc)) { + bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + goto decompression_err; +@@ -1883,7 +1891,7 @@ static void bch2_read_endio(struct bio *bio) + } + + if (rbio->narrow_crcs || +- rbio->pick.crc.compression_type || ++ crc_is_compressed(rbio->pick.crc) || + bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) + context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; + else if (rbio->pick.crc.csum_type) +@@ -1994,7 +2002,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + +- if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none || ++ if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_NONE && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && +@@ -2009,7 +2017,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + &rbio, &bounce, &read_full); + + if (!read_full) { +- EBUG_ON(pick.crc.compression_type); ++ EBUG_ON(crc_is_compressed(pick.crc)); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +index c37b7d7401e9..7f7b69b32e80 100644 +--- a/fs/bcachefs/io_types.h ++++ b/fs/bcachefs/io_types.h +@@ -105,6 +105,7 @@ struct bch_write_op { + unsigned nr_replicas:4; + unsigned nr_replicas_required:4; + unsigned alloc_reserve:4; ++ unsigned incompressible:1; + + struct bch_devs_list devs_have; + u16 target; +diff 
--git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 257e00ae6fa7..ecc74ebe0579 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -215,6 +215,9 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + enum btree_id btree_id, + struct bkey_s_c k) + { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; + int ret; + + m->btree_id = btree_id; +@@ -223,9 +226,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + m->nr_ptrs_reserved = 0; + + bch2_write_op_init(&m->op, c, io_opts); +- m->op.compression_type = +- bch2_compression_opt_to_type[io_opts.background_compression ?: +- io_opts.compression]; ++ ++ if (!bch2_bkey_is_incompressible(k)) ++ m->op.compression_type = ++ bch2_compression_opt_to_type[io_opts.background_compression ?: ++ io_opts.compression]; ++ else ++ m->op.incompressible = true; ++ + m->op.target = data_opts.target, + m->op.write_point = wp; + +@@ -265,14 +273,11 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + break; + } + case DATA_REWRITE: { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; + unsigned compressed_sectors = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && +- p.crc.compression_type != BCH_COMPRESSION_TYPE_none && ++ crc_is_compressed(p.crc) && + bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) + compressed_sectors += p.crc.compressed_size; + +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index d5883f89fb2d..ab1934325948 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -30,7 +30,8 @@ static int __bch2_rebalance_pred(struct bch_fs *c, + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + +- if (io_opts->background_compression) ++ if (io_opts->background_compression && ++ !bch2_bkey_is_incompressible(k)) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 602def1ee95a..d78ffcc0e8a4 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -276,7 +276,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) + struct extent_ptr_decoded p; + + extent_for_each_ptr_decode(e, p, entry) { +- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_none) { ++ if (!crc_is_compressed(p.crc)) { + nr_uncompressed_extents++; + uncompressed_sectors += e.k->size; + } else { +-- +cgit v1.2.3 + + +From 42e2072951f48bfb86d6ffcbd140cf77c516e476 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Jan 2020 17:47:07 -0500 +Subject: bcachefs: Fix an in iterator leak + +This should fix a transaction iterator overflow bug during fsck. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/str_hash.h | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 35f4232d0755..cf6ecd963a7b 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -319,13 +319,16 @@ int bch2_hash_delete(struct btree_trans *trans, + u64 inode, const void *key) + { + struct btree_iter *iter; ++ int ret; + + iter = bch2_hash_lookup(trans, desc, info, inode, key, + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + +- return bch2_hash_delete_at(trans, desc, info, iter); ++ ret = bch2_hash_delete_at(trans, desc, info, iter); ++ bch2_trans_iter_put(trans, iter); ++ return ret; + } + + #endif /* _BCACHEFS_STR_HASH_H */ +-- +cgit v1.2.3 + + +From 10f4ab5d557cde9a2186da40e74324991ae8b11a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 29 Jan 2020 13:05:04 -0500 +Subject: bcachefs: Fix an uninitialized field in bch_write_op + +Regression from "bcachefs: Track incompressible data" + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.h | 1 + + fs/bcachefs/io_types.h | 2 +- + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 45c950942d78..37f7fa6102fc 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -78,6 +78,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->alloc_reserve = RESERVE_NONE; ++ op->incompressible = 0; + op->open_buckets.nr = 0; + op->devs_have.nr = 0; + op->target = 0; +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +index 7f7b69b32e80..684e4c9a5d98 100644 +--- a/fs/bcachefs/io_types.h ++++ b/fs/bcachefs/io_types.h +@@ -104,7 +104,7 @@ struct bch_write_op { + unsigned compression_type:4; + unsigned nr_replicas:4; + unsigned nr_replicas_required:4; +- unsigned alloc_reserve:4; ++ unsigned alloc_reserve:3; + unsigned incompressible:1; + + struct bch_devs_list devs_have; +-- +cgit v1.2.3 + + +From ca340ae7b48639884287be2ed92c2e22463dbed3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 30 Jan 2020 20:26:08 -0500 +Subject: bcachefs: Improve an insert path optimization + +The insert path had an optimization to short circuit lookup +table/iterator fixups when overwriting an existing key with the same +size value - but it was incorrect when other key fields +(size/version) were changing. This is important for the upcoming rework +to have extent updates use the same insert path as regular keys. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 3 ++- + fs/bcachefs/btree_update_leaf.c | 38 +++++++++++++------------------------- + 2 files changed, 15 insertions(+), 26 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index f268def79927..9dd59343f3a3 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -1201,7 +1201,8 @@ void bch2_bset_insert(struct btree *b, + memcpy_u64s(bkeyp_val(f, where), &insert->v, + bkeyp_val_u64s(f, src)); + +- bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); ++ if (src->u64s != clobber_u64s) ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); + + bch2_verify_btree_nr_keys(b); + } +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index afd2086edeff..15e1c29d53e9 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -53,9 +53,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + struct btree_node_iter *node_iter, + struct bkey_i *insert) + { +- const struct bkey_format *f = &b->format; + struct bkey_packed *k; +- unsigned clobber_u64s; ++ unsigned clobber_u64s = 0, new_u64s = 0; + + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); +@@ -68,30 +67,25 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + k = NULL; + + /* @k is the key being overwritten/deleted, if any: */ +- + EBUG_ON(k && bkey_whiteout(k)); + ++ /* Deleting, but not found? nothing to do: */ ++ if (bkey_whiteout(&insert->k) && !k) ++ return false; ++ + if (bkey_whiteout(&insert->k)) { + /* Deleting: */ +- +- /* Not found? Nothing to do: */ +- if (!k) +- return false; +- + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + +- if (k->needs_whiteout) { ++ if (k->needs_whiteout) + push_whiteout(iter->trans->c, b, k); +- k->needs_whiteout = false; +- } ++ k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; +- + bch2_bset_delete(b, k, clobber_u64s); +- bch2_btree_node_iter_fix(iter, b, node_iter, k, +- clobber_u64s, 0); ++ goto fix_iter; + } else { + bch2_btree_iter_fix_key_modified(iter, b, k); + } +@@ -101,14 +95,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + + if (k) { + /* Overwriting: */ +- if (!bkey_written(b, k) && +- bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { +- k->type = insert->k.type; +- memcpy_u64s(bkeyp_val(f, k), &insert->v, +- bkey_val_u64s(&insert->k)); +- return true; +- } +- + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + +@@ -124,11 +110,13 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + } + + k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); +- clobber_u64s = 0; + overwrite: + bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); +- bch2_btree_node_iter_fix(iter, b, node_iter, k, +- clobber_u64s, k->u64s); ++ new_u64s = k->u64s; ++fix_iter: ++ if (clobber_u64s != new_u64s) ++ bch2_btree_node_iter_fix(iter, b, node_iter, k, ++ clobber_u64s, new_u64s); + return true; + } + +-- +cgit v1.2.3 + + +From 03d0c7d5afb367fce0b04759e0254bfc6b06b118 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 18 Feb 2020 14:27:10 -0500 +Subject: bcachefs: Make sure we're releasing btree iterators + +This wasn't originally required, but this is the model we're moving +towards. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 14 ++++-- + fs/bcachefs/fs-common.c | 126 ++++++++++++++++++++++++++++++----------------- + fs/bcachefs/inode.c | 60 +++++++++------------- + fs/bcachefs/reflink.c | 5 +- + fs/bcachefs/str_hash.h | 6 +-- + 5 files changed, 120 insertions(+), 91 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ea0555b806f0..64afa032031a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1729,7 +1729,12 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, + int bch2_trans_iter_put(struct btree_trans *trans, + struct btree_iter *iter) + { +- int ret = btree_iter_err(iter); ++ int ret; ++ ++ if (IS_ERR_OR_NULL(iter)) ++ return 0; ++ ++ ret = btree_iter_err(iter); + + if (!(trans->iters_touched & (1ULL << iter->idx)) && + !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) +@@ -1742,6 +1747,9 @@ int bch2_trans_iter_put(struct btree_trans *trans, + int bch2_trans_iter_free(struct btree_trans *trans, + struct btree_iter *iter) + { ++ if (IS_ERR_OR_NULL(iter)) ++ return 0; ++ + trans->iters_touched &= ~(1ULL << iter->idx); + + return bch2_trans_iter_put(trans, iter); +@@ -1981,8 +1989,8 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, + + trans->iters_live |= 1ULL << iter->idx; + /* +- * Don't mark it as touched, we don't need to preserve this iter since +- * it's cheap to copy it again: ++ * We don't need to preserve this iter since it's cheap to copy it ++ * again - this will cause trans_iter_put() to free it right away: + */ + trans->iters_touched &= ~(1ULL << iter->idx); + +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 96f7bbe0a3ed..878419d40992 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -19,14 +19,15 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + struct posix_acl *acl) + { + struct bch_fs *c = trans->c; +- struct btree_iter *dir_iter; ++ struct btree_iter *dir_iter = NULL; + struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + u64 now = bch2_current_time(trans->c); + int ret; + + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); +- if (IS_ERR(dir_iter)) +- return PTR_ERR(dir_iter); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; + + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + +@@ -37,20 +38,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint); + if (ret) +- return ret; ++ goto err; + + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inode, &hash, + default_acl, ACL_TYPE_DEFAULT); + if (ret) +- return ret; ++ goto err; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inode, &hash, + acl, ACL_TYPE_ACCESS); + if (ret) +- return ret; ++ goto err; + } + + if (name) { +@@ -62,48 +63,55 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + + ret = bch2_inode_write(trans, dir_iter, dir_u); + if (ret) +- return ret; ++ goto err; + + ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(new_inode->bi_mode), + name, new_inode->bi_inum, + BCH_HASH_SET_MUST_CREATE); + if (ret) +- return ret; ++ goto err; + } +- +- return 0; ++err: ++ bch2_trans_iter_put(trans, dir_iter); ++ return ret; + } + + int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + u64 inum, struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, const struct qstr *name) + { +- struct btree_iter *dir_iter, 
*inode_iter; ++ struct btree_iter *dir_iter = NULL, *inode_iter = NULL; + struct bch_hash_info dir_hash; + u64 now = bch2_current_time(trans->c); ++ int ret; + + inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); +- if (IS_ERR(inode_iter)) +- return PTR_ERR(inode_iter); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto err; + + inode_u->bi_ctime = now; + bch2_inode_nlink_inc(inode_u); + + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); +- if (IS_ERR(dir_iter)) +- return PTR_ERR(dir_iter); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; + + dir_u->bi_mtime = dir_u->bi_ctime = now; + + dir_hash = bch2_hash_info_init(trans->c, dir_u); +- bch2_trans_iter_put(trans, dir_iter); + +- return bch2_dirent_create(trans, dir_inum, &dir_hash, ++ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(inode_u->bi_mode), + name, inum, BCH_HASH_SET_MUST_CREATE) ?: + bch2_inode_write(trans, dir_iter, dir_u) ?: + bch2_inode_write(trans, inode_iter, inode_u); ++err: ++ bch2_trans_iter_put(trans, dir_iter); ++ bch2_trans_iter_put(trans, inode_iter); ++ return ret; + } + + int bch2_unlink_trans(struct btree_trans *trans, +@@ -111,39 +119,49 @@ int bch2_unlink_trans(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + const struct qstr *name) + { +- struct btree_iter *dir_iter, *dirent_iter, *inode_iter; ++ struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, ++ *inode_iter = NULL; + struct bch_hash_info dir_hash; + u64 inum, now = bch2_current_time(trans->c); + struct bkey_s_c k; ++ int ret; + + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); +- if (IS_ERR(dir_iter)) +- return PTR_ERR(dir_iter); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; + + dir_hash = bch2_hash_info_init(trans->c, dir_u); + + dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, + name, BTREE_ITER_INTENT); +- if (IS_ERR(dirent_iter)) +- return PTR_ERR(dirent_iter); ++ ret = PTR_ERR_OR_ZERO(dirent_iter); ++ if (ret) ++ goto err; + + k = bch2_btree_iter_peek_slot(dirent_iter); + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + + inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); +- if (IS_ERR(inode_iter)) +- return PTR_ERR(inode_iter); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto err; + + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); + bch2_inode_nlink_dec(inode_u); + +- return (S_ISDIR(inode_u->bi_mode) ++ ret = (S_ISDIR(inode_u->bi_mode) + ? 
bch2_empty_dir_trans(trans, inum) + : 0) ?: + bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: + bch2_inode_write(trans, dir_iter, dir_u) ?: + bch2_inode_write(trans, inode_iter, inode_u); ++err: ++ bch2_trans_iter_put(trans, inode_iter); ++ bch2_trans_iter_put(trans, dirent_iter); ++ bch2_trans_iter_put(trans, dir_iter); ++ return ret; + } + + bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, +@@ -179,24 +197,26 @@ int bch2_rename_trans(struct btree_trans *trans, + const struct qstr *dst_name, + enum bch_rename_mode mode) + { +- struct btree_iter *src_dir_iter, *dst_dir_iter = NULL; +- struct btree_iter *src_inode_iter, *dst_inode_iter = NULL; ++ struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; ++ struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; + struct bch_hash_info src_hash, dst_hash; + u64 src_inode, dst_inode, now = bch2_current_time(trans->c); + int ret; + + src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, + BTREE_ITER_INTENT); +- if (IS_ERR(src_dir_iter)) +- return PTR_ERR(src_dir_iter); ++ ret = PTR_ERR_OR_ZERO(src_dir_iter); ++ if (ret) ++ goto err; + + src_hash = bch2_hash_info_init(trans->c, src_dir_u); + + if (dst_dir != src_dir) { + dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, + BTREE_ITER_INTENT); +- if (IS_ERR(dst_dir_iter)) +- return PTR_ERR(dst_dir_iter); ++ ret = PTR_ERR_OR_ZERO(dst_dir_iter); ++ if (ret) ++ goto err; + + dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); + } else { +@@ -211,38 +231,48 @@ int bch2_rename_trans(struct btree_trans *trans, + dst_name, &dst_inode, + mode); + if (ret) +- return ret; ++ goto err; + + src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, + BTREE_ITER_INTENT); +- if (IS_ERR(src_inode_iter)) +- return PTR_ERR(src_inode_iter); ++ ret = PTR_ERR_OR_ZERO(src_inode_iter); ++ if (ret) ++ goto err; + + if (dst_inode) { + dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, + BTREE_ITER_INTENT); +- if (IS_ERR(dst_inode_iter)) +- return PTR_ERR(dst_inode_iter); ++ ret = PTR_ERR_OR_ZERO(dst_inode_iter); ++ if (ret) ++ goto err; + } + + if (mode == BCH_RENAME_OVERWRITE) { + if (S_ISDIR(src_inode_u->bi_mode) != +- S_ISDIR(dst_inode_u->bi_mode)) +- return -ENOTDIR; ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -ENOTDIR; ++ goto err; ++ } + + if (S_ISDIR(dst_inode_u->bi_mode) && +- bch2_empty_dir_trans(trans, dst_inode)) +- return -ENOTEMPTY; ++ bch2_empty_dir_trans(trans, dst_inode)) { ++ ret = -ENOTEMPTY; ++ goto err; ++ } + } + + if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && +- S_ISDIR(src_inode_u->bi_mode)) +- return -EXDEV; ++ S_ISDIR(src_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } + + if (mode == BCH_RENAME_EXCHANGE && + bch2_reinherit_attrs(dst_inode_u, src_dir_u) && +- S_ISDIR(dst_inode_u->bi_mode)) +- return -EXDEV; ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } + + if (S_ISDIR(src_inode_u->bi_mode)) { + src_dir_u->bi_nlink--; +@@ -270,7 +300,7 @@ int bch2_rename_trans(struct btree_trans *trans, + if (dst_inode) + dst_inode_u->bi_ctime = now; + +- return bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: ++ ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: + (src_dir != dst_dir + ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) + : 0 ) ?: +@@ -278,4 +308,10 @@ int bch2_rename_trans(struct btree_trans *trans, + (dst_inode + ? 
bch2_inode_write(trans, dst_inode_iter, dst_inode_u) + : 0 ); ++err: ++ bch2_trans_iter_put(trans, dst_inode_iter); ++ bch2_trans_iter_put(trans, src_inode_iter); ++ bch2_trans_iter_put(trans, dst_dir_iter); ++ bch2_trans_iter_put(trans, src_dir_iter); ++ return ret; + } +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index e811b98d0f03..26171ff754a6 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -362,16 +362,16 @@ int bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + u64 min, u64 max, u64 *hint) + { +- struct bch_fs *c = trans->c; + struct bkey_inode_buf *inode_p; +- struct btree_iter *iter; ++ struct btree_iter *iter = NULL; ++ struct bkey_s_c k; + u64 start; + int ret; + + if (!max) + max = ULLONG_MAX; + +- if (c->opts.inodes_32bit) ++ if (trans->c->opts.inodes_32bit) + max = min_t(u64, max, U32_MAX); + + start = READ_ONCE(*hint); +@@ -382,48 +382,37 @@ int bch2_inode_create(struct btree_trans *trans, + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); +- +- iter = bch2_trans_get_iter(trans, +- BTREE_ID_INODES, POS(start, 0), +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); + again: +- while (1) { +- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); +- +- ret = bkey_err(k); +- if (ret) +- return ret; ++ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(start, 0), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (iter->pos.inode > max) ++ break; + +- switch (k.k->type) { +- case KEY_TYPE_inode: +- /* slot used */ +- if (iter->pos.inode >= max) +- goto out; ++ if (k.k->type != KEY_TYPE_inode) ++ goto found_slot; ++ } + +- bch2_btree_iter_next_slot(iter); +- break; ++ bch2_trans_iter_put(trans, iter); + +- default: +- *hint = k.k->p.inode; +- inode_u->bi_inum = k.k->p.inode; +- inode_u->bi_generation = bkey_generation(k); ++ if (ret) ++ return ret; + +- bch2_inode_pack(inode_p, inode_u); +- bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); +- return 0; +- } +- } +-out: + if (start != min) { + /* Retry from start */ + start = min; +- bch2_btree_iter_set_pos(iter, POS(start, 0)); + goto again; + } + + return -ENOSPC; ++found_slot: ++ *hint = k.k->p.inode; ++ inode_u->bi_inum = k.k->p.inode; ++ inode_u->bi_generation = bkey_generation(k); ++ ++ bch2_inode_pack(inode_p, inode_u); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++ bch2_trans_iter_put(trans, iter); ++ return 0; + } + + int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) +@@ -518,14 +507,13 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) +- return ret; ++ goto err; + + ret = k.k->type == KEY_TYPE_inode + ? 
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + : -ENOENT; +- ++err: + bch2_trans_iter_put(trans, iter); +- + return ret; + } + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 3b8c74ca3725..d78a3d5f7246 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -128,10 +128,9 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + + bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); + err: +- if (!IS_ERR(reflink_iter)) { ++ if (!IS_ERR(reflink_iter)) + c->reflink_hint = reflink_iter->pos.offset; +- bch2_trans_iter_put(trans, reflink_iter); +- } ++ bch2_trans_iter_put(trans, reflink_iter); + + return ret; + } +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index cf6ecd963a7b..0710d0bbe36d 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -262,10 +262,8 @@ int bch2_hash_set(struct btree_trans *trans, + if (!ret) + ret = -ENOSPC; + out: +- if (!IS_ERR_OR_NULL(slot)) +- bch2_trans_iter_put(trans, slot); +- if (!IS_ERR_OR_NULL(iter)) +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_put(trans, slot); ++ bch2_trans_iter_put(trans, iter); + + return ret; + found: +-- +cgit v1.2.3 + + +From 8cc82a2c866f9e14a1dc0d9ebb05aebfcba991ef Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 27 Dec 2019 20:51:35 -0500 +Subject: bcachefs: btree_and_journal_iter + +Introduce a new iterator that iterates over keys in the btree with keys +from the journal overlaid on top. This factors out what the erasure +coding init code was doing manually. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 20 ++++--- + fs/bcachefs/ec.c | 37 +++---------- + fs/bcachefs/recovery.c | 116 +++++++++++++++++++++++++++++++++++------ + fs/bcachefs/recovery.h | 26 +++++++-- + 4 files changed, 139 insertions(+), 60 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 87dd137fed3f..0da8de167ff1 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -211,33 +211,31 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; +- struct journal_key *j; + unsigned i; +- int ret; ++ int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) ++ bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, ++ BTREE_ID_ALLOC, POS_MIN); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_mark_key(c, k, 0, 0, NULL, 0, + BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); + ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); + return ret; + } + +- for_each_journal_key(*journal_keys, j) +- if (j->btree_id == BTREE_ID_ALLOC) +- bch2_mark_key(c, bkey_i_to_s_c(j->k), +- 0, 0, NULL, 0, +- BTREE_TRIGGER_ALLOC_READ| +- BTREE_TRIGGER_NOATOMIC); +- + percpu_down_write(&c->mark_lock); + bch2_dev_usage_from_buckets(c); + percpu_up_write(&c->mark_lock); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 1648dd3dac6f..63e1ca668b3e 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1280,9 +1280,8 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) + int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) 
+ { + struct btree_trans trans; +- struct btree_iter *btree_iter; +- struct journal_iter journal_iter; +- struct bkey_s_c btree_k, journal_k; ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; + int ret; + + ret = bch2_fs_ec_start(c); +@@ -1291,38 +1290,16 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) + + bch2_trans_init(&trans, c, 0, 0); + +- btree_iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0); +- journal_iter = bch2_journal_iter_init(journal_keys, BTREE_ID_EC); ++ bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, ++ BTREE_ID_EC, POS_MIN); + +- btree_k = bch2_btree_iter_peek(btree_iter); +- journal_k = bch2_journal_iter_peek(&journal_iter); + +- while (1) { +- bool btree; +- +- if (btree_k.k && journal_k.k) { +- int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); +- +- if (!cmp) +- btree_k = bch2_btree_iter_next(btree_iter); +- btree = cmp < 0; +- } else if (btree_k.k) { +- btree = true; +- } else if (journal_k.k) { +- btree = false; +- } else { +- break; +- } +- +- bch2_mark_key(c, btree ? btree_k : journal_k, +- 0, 0, NULL, 0, ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ bch2_mark_key(c, k, 0, 0, NULL, 0, + BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); + +- if (btree) +- btree_k = bch2_btree_iter_next(btree_iter); +- else +- journal_k = bch2_journal_iter_next(&journal_iter); ++ bch2_btree_and_journal_iter_advance(&iter); + } + + ret = bch2_trans_exit(&trans) ?: ret; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 8ecd4abc8eeb..29e6f9f00bad 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -27,26 +27,15 @@ + + /* iterate over keys read from the journal: */ + +-struct journal_iter bch2_journal_iter_init(struct journal_keys *keys, +- enum btree_id id) +-{ +- return (struct journal_iter) { +- .keys = keys, +- .k = keys->d, +- .btree_id = id, +- }; +-} +- + struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) + { +- while (1) { +- if (iter->k == iter->keys->d + iter->keys->nr) +- return bkey_s_c_null; +- ++ while (iter->k) { + if (iter->k->btree_id == iter->btree_id) + return bkey_i_to_s_c(iter->k->k); + + iter->k++; ++ if (iter->k == iter->keys->d + iter->keys->nr) ++ iter->k = NULL; + } + + return bkey_s_c_null; +@@ -54,13 +43,110 @@ struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) + + struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter) + { +- if (iter->k == iter->keys->d + iter->keys->nr) ++ if (!iter->k) + return bkey_s_c_null; + + iter->k++; ++ if (iter->k == iter->keys->d + iter->keys->nr) ++ iter->k = NULL; ++ + return bch2_journal_iter_peek(iter); + } + ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) ++{ ++ switch (iter->last) { ++ case none: ++ break; ++ case btree: ++ bch2_btree_iter_next(iter->btree); ++ break; ++ case journal: ++ bch2_journal_iter_next(&iter->journal); ++ break; ++ } ++ ++ iter->last = none; ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) ++{ ++ struct bkey_s_c ret; ++ ++ while (1) { ++ struct bkey_s_c btree_k = bch2_btree_iter_peek(iter->btree); ++ struct bkey_s_c journal_k = bch2_journal_iter_peek(&iter->journal); ++ ++ if (btree_k.k && journal_k.k) { ++ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); ++ ++ if (!cmp) ++ bch2_btree_iter_next(iter->btree); ++ ++ iter->last = cmp < 0 ? 
btree : journal; ++ } else if (btree_k.k) { ++ iter->last = btree; ++ } else if (journal_k.k) { ++ iter->last = journal; ++ } else { ++ iter->last = none; ++ return bkey_s_c_null; ++ } ++ ++ ret = iter->last == journal ? journal_k : btree_k; ++ if (!bkey_deleted(ret.k)) ++ break; ++ ++ bch2_btree_and_journal_iter_advance(iter); ++ } ++ ++ return ret; ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) ++{ ++ bch2_btree_and_journal_iter_advance(iter); ++ ++ return bch2_btree_and_journal_iter_peek(iter); ++} ++ ++struct journal_key *journal_key_search(struct journal_keys *journal_keys, ++ enum btree_id id, struct bpos pos) ++{ ++ size_t l = 0, r = journal_keys->nr, m; ++ ++ while (l < r) { ++ m = l + ((r - l) >> 1); ++ if ((cmp_int(id, journal_keys->d[m].btree_id) ?: ++ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ BUG_ON(l < journal_keys->nr && ++ (cmp_int(id, journal_keys->d[l].btree_id) ?: ++ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); ++ ++ BUG_ON(l && ++ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: ++ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); ++ ++ return l < journal_keys->nr ? journal_keys->d + l : NULL; ++} ++ ++void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, ++ struct btree_trans *trans, ++ struct journal_keys *journal_keys, ++ enum btree_id id, struct bpos pos) ++{ ++ iter->journal.keys = journal_keys; ++ iter->journal.k = journal_key_search(journal_keys, id, pos); ++ iter->journal.btree_id = id; ++ ++ iter->btree = bch2_trans_get_iter(trans, id, pos, 0); ++} ++ + /* sort and dedup all keys in the journal: */ + + static void journal_entries_free(struct list_head *list) +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index 479ea46f8dcb..ccd84a8fe60d 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -24,10 +24,28 @@ struct journal_iter { + enum btree_id btree_id; + }; + +-struct journal_iter bch2_journal_iter_init(struct journal_keys *, +- enum btree_id); +-struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *); +-struct bkey_s_c bch2_journal_iter_next(struct journal_iter *); ++struct btree_and_journal_iter { ++ enum btree_id btree_id; ++ ++ struct btree_iter *btree; ++ struct journal_iter journal; ++ ++ enum last_key_returned { ++ none, ++ btree, ++ journal, ++ } last; ++}; ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); ++struct journal_key *journal_key_search(struct journal_keys *, ++ enum btree_id, struct bpos); ++void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, ++ struct btree_trans *, ++ struct journal_keys *, ++ enum btree_id, struct bpos); + + int bch2_fs_recovery(struct bch_fs *); + int bch2_fs_initialize(struct bch_fs *); +-- +cgit v1.2.3 + + +From ba1d22c81b3844762abe4ed0798b04ab386ab63a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 31 Jan 2020 13:23:18 -0500 +Subject: bcachefs: __bch2_btree_iter_set_pos() + +This one takes an additional argument for whether we're searching for >= +or > the search key. 
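+
+A hedged usage sketch (the real call site added below is bch2_trans_get_iter(),
+which passes btree_node_type_is_extents() for the new argument):
+
+    /* position iter at the first key >= new_pos: */
+    __bch2_btree_iter_set_pos(iter, new_pos, false);
+
+    /* position iter at the first key strictly greater than new_pos: */
+    __bch2_btree_iter_set_pos(iter, new_pos, true);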
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 26 +++++++++++++++++++++++++- + fs/bcachefs/btree_iter.h | 1 + + 2 files changed, 26 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 64afa032031a..f0323d691e8f 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1271,6 +1271,29 @@ static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) + return l; + } + ++void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, ++ bool strictly_greater) ++{ ++ struct bpos old = btree_iter_search_key(iter); ++ unsigned l; ++ int cmp; ++ ++ iter->flags &= ~BTREE_ITER_IS_EXTENTS; ++ iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0; ++ iter->pos = new_pos; ++ ++ cmp = bkey_cmp(btree_iter_search_key(iter), old); ++ if (!cmp) ++ return; ++ ++ l = btree_iter_pos_changed(iter, cmp); ++ ++ if (l != iter->level) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ else ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ + void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + { + int cmp = bkey_cmp(new_pos, iter->pos); +@@ -1947,7 +1970,8 @@ struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans, + __btree_trans_get_iter(trans, btree_id, pos, flags); + + if (!IS_ERR(iter)) +- bch2_btree_iter_set_pos(iter, pos); ++ __bch2_btree_iter_set_pos(iter, pos, ++ btree_node_type_is_extents(btree_id)); + return iter; + } + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 962380925511..336901f9780b 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -166,6 +166,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); + + void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); ++void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); + void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + + static inline struct bpos btree_type_successor(enum btree_id id, +-- +cgit v1.2.3 + + +From b97caa050884f358a53d8eed23ffbfff25e29af3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 31 Jan 2020 13:26:05 -0500 +Subject: bcachefs: Make BTREE_ITER_IS_EXTENTS private to iter code + +Prep work for changing the core btree update path to handle extents like +regular keys; we need to reduce the scope of what BTREE_ITER_IS_EXTENTS +means + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 ++-- + fs/bcachefs/btree_update_leaf.c | 4 ++-- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f0323d691e8f..431efb924532 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -408,7 +408,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, + * For extents, the iterator may have skipped past deleted keys (but not + * whiteouts) + */ +- k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS ++ k = b->level || btree_node_type_is_extents(iter->btree_id) + ? 
bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) + : bch2_btree_node_iter_prev_all(&tmp, b); + if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) { +@@ -563,7 +563,7 @@ fixup_done: + if (!bch2_btree_node_iter_end(node_iter) && + iter_current_key_modified && + (b->level || +- (iter->flags & BTREE_ITER_IS_EXTENTS))) { ++ btree_node_type_is_extents(iter->btree_id))) { + struct bset_tree *t; + struct bkey_packed *k, *k2, *p; + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 15e1c29d53e9..a036c7dd1fc1 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -780,7 +780,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + +- if (iter->flags & BTREE_ITER_IS_EXTENTS) { ++ if (btree_node_type_is_extents(iter->btree_id)) { + iter->pos_after_commit = k->k.p; + iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; + } +@@ -898,7 +898,7 @@ retry: + */ + delete.k.p = iter->pos; + +- if (iter->flags & BTREE_ITER_IS_EXTENTS) { ++ if (btree_node_type_is_extents(iter->btree_id)) { + unsigned max_sectors = + KEY_SIZE_MAX & (~0 << trans->c->block_bits); + +-- +cgit v1.2.3 + + +From 8cc17a4332954181c27c13d9387691bdf6791b0d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 6 Feb 2020 20:15:15 -0500 +Subject: bcachefs: Fix bch2_ptr_swab for indirect extents + +bch2_ptr_swab was never updated when the code for generic keys with +pointers was added - it assumed the entire val was only used for +pointers. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 9 +++------ + fs/bcachefs/bkey_methods.h | 4 ++-- + fs/bcachefs/btree_io.c | 17 ++++++++++------- + fs/bcachefs/ec.h | 1 + + fs/bcachefs/extents.c | 16 +++++++++------- + fs/bcachefs/extents.h | 2 +- + fs/bcachefs/journal_io.c | 6 ++++-- + fs/bcachefs/reflink.h | 1 + + 8 files changed, 31 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 320e17d108d2..c064cf468a9b 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -202,15 +202,12 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, + bch2_val_to_text(out, c, k); + } + +-void bch2_bkey_swab(const struct bkey_format *f, +- struct bkey_packed *k) ++void bch2_bkey_swab_val(struct bkey_s k) + { +- const struct bkey_ops *ops = &bch2_bkey_ops[k->type]; +- +- bch2_bkey_swab_key(f, k); ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + + if (ops->swab) +- ops->swab(f, k); ++ ops->swab(k); + } + + bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +index 8568b65c1ed2..d36468b75223 100644 +--- a/fs/bcachefs/bkey_methods.h ++++ b/fs/bcachefs/bkey_methods.h +@@ -29,7 +29,7 @@ struct bkey_ops { + void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +- void (*swab)(const struct bkey_format *, struct bkey_packed *); ++ void (*swab)(struct bkey_s); + bool (*key_normalize)(struct bch_fs *, struct bkey_s); + enum merge_result (*key_merge)(struct bch_fs *, + struct bkey_s, struct bkey_s); +@@ -51,7 +51,7 @@ void bch2_val_to_text(struct printbuf *, struct bch_fs *, + void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +-void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *); ++void bch2_bkey_swab_val(struct bkey_s); + + bool 
bch2_bkey_normalize(struct bch_fs *, struct bkey_s); + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 5f1c3183fa85..422e54774b27 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -784,7 +784,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + + for (k = i->start; + k != vstruct_last(i);) { +- struct bkey_s_c u; ++ struct bkey_s u; + struct bkey tmp; + const char *invalid; + +@@ -805,21 +805,24 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + } + + if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) +- bch2_bkey_swab(&b->format, k); ++ bch2_bkey_swab_key(&b->format, k); + + if (!write && + version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(btree_node_type(b), k, write); + +- u = bkey_disassemble(b, k, &tmp); ++ u = __bkey_disassemble(b, k, &tmp); + +- invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?: +- bch2_bkey_in_btree_node(b, u) ?: +- (write ? bch2_bkey_val_invalid(c, u) : NULL); ++ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_val(u); ++ ++ invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, u.s_c) ?: ++ (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); + if (invalid) { + char buf[160]; + +- bch2_bkey_val_to_text(&PBUF(buf), c, u); ++ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, b, i, + "invalid bkey:\n%s\n%s", invalid, buf); + +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 8d9fbfd19f66..cf67abd48490 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -12,6 +12,7 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + #define bch2_bkey_ops_stripe (struct bkey_ops) { \ + .key_invalid = bch2_stripe_invalid, \ + .val_to_text = bch2_stripe_to_text, \ ++ .swab = bch2_ptr_swab, \ + } + + static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index a19b91f9beb4..c9d474920b47 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -1079,17 +1079,19 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) + return NULL; + } + +-void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) ++void bch2_ptr_swab(struct bkey_s k) + { ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; +- u64 *d = (u64 *) bkeyp_val(f, k); +- unsigned i; ++ u64 *d; + +- for (i = 0; i < bkeyp_val_u64s(f, k); i++) +- d[i] = swab64(d[i]); ++ for (d = (u64 *) ptrs.start; ++ d != (u64 *) ptrs.end; ++ d++) ++ *d = swab64(*d); + +- for (entry = (union bch_extent_entry *) d; +- entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); ++ for (entry = ptrs.start; ++ entry < ptrs.end; + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 0d8554172263..6e8119a8ad30 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -532,7 +532,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); + +-void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); ++void bch2_ptr_swab(struct bkey_s); + + /* Generic extent code: */ + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 7112a25d0600..db722a8ae4ea 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -171,8 +171,10 @@ static int 
journal_validate_key(struct bch_fs *c, struct jset *jset, + return 0; + } + +- if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) +- bch2_bkey_swab(NULL, bkey_to_packed(k)); ++ if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) { ++ bch2_bkey_swab_key(NULL, bkey_to_packed(k)); ++ bch2_bkey_swab_val(bkey_i_to_s(k)); ++ } + + if (!write && + version < bcachefs_metadata_version_bkey_renumber) +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index ac23b855858c..5445c1cf0797 100644 +--- a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -22,6 +22,7 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ ++ .swab = bch2_ptr_swab, \ + } + + s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, +-- +cgit v1.2.3 + + +From 2ac41ce21010d85b7b13c1be111cbed4a39b6511 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 18 Feb 2020 20:02:41 -0500 +Subject: bcachefs: Check for bad key version number + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 6 +++++- + fs/bcachefs/io.c | 2 +- + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 05879b66d6af..3705c41f5151 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -124,7 +124,11 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + BUG_ON(journal_seq_verify(c) && + k.k->version.lo > journal_cur_seq(&c->journal)); + +- if (k.k->version.lo > atomic64_read(&c->key_version)) ++ /* XXX change to fsck check */ ++ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, ++ "key version number higher than recorded: %llu > %llu", ++ k.k->version.lo, ++ atomic64_read(&c->key_version))) + atomic64_set(&c->key_version, k.k->version.lo); + + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 49bd29ccc543..795be72b2364 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -941,7 +941,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + + if (bch2_csum_type_is_encryption(op->csum_type)) { + if (bversion_zero(version)) { +- version.lo = atomic64_inc_return(&c->key_version) + 1; ++ version.lo = atomic64_inc_return(&c->key_version); + } else { + crc.nonce = op->nonce; + op->nonce += src_len >> 9; +-- +cgit v1.2.3 + + +From fccdfbb8b169dcb7cff30dd38fea39f679a050dd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 18 Feb 2020 19:29:33 -0500 +Subject: bcachefs: Fix traversing to interior nodes + +NULL is used to mean "reach end of traversal" - we were only +initializing the leaf node in the iterator to the right sentinal value. 
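+
+In other words (a sketch of the intent, not code from the patch): every cached
+level of the iterator should start out at the "not yet traversed" sentinel
+rather than NULL, since NULL is read as "traversal finished":
+
+    for (i = 0; i < ARRAY_SIZE(iter->l); i++)
+        iter->l[i].b = BTREE_ITER_NO_NODE_INIT; /* every level still needs traversal */
+    /* previously only iter->l[iter->level].b got the sentinel; interior
+     * levels stayed NULL, which traversal treats as end-of-traversal */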
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 431efb924532..c365a2aff446 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1732,8 +1732,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, + iter->nodes_locked = 0; + iter->nodes_intent_locked = 0; + for (i = 0; i < ARRAY_SIZE(iter->l); i++) +- iter->l[i].b = NULL; +- iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; ++ iter->l[i].b = BTREE_ITER_NO_NODE_INIT; + + prefetch(c->btree_roots[btree_id].b); + } +-- +cgit v1.2.3 + + +From 46eb4b9942c9034919eb3011e9a107fbed2b35aa Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 18 Feb 2020 17:15:32 -0500 +Subject: bcachefs: introduce b->hash_val + +This is partly prep work for introducing bch_btree_ptr_v2, but it'll +also be a bit of a performance boost by moving the full key out of the +hot part of struct btree. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 24 ++++++++++++++---------- + fs/bcachefs/btree_cache.h | 13 ++++++++++--- + fs/bcachefs/btree_io.c | 9 ++------- + fs/bcachefs/btree_types.h | 7 ++++--- + fs/bcachefs/btree_update.h | 2 +- + fs/bcachefs/btree_update_interior.c | 35 +++++++++++++++++++---------------- + fs/bcachefs/migrate.c | 6 ++---- + 7 files changed, 52 insertions(+), 44 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 0c737f35f430..2c9c3c18defe 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -62,13 +62,13 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const struct btree *b = obj; + const u64 *v = arg->key; + +- return PTR_HASH(&b->key) == *v ? 0 : 1; ++ return b->hash_val == *v ? 0 : 1; + } + + static const struct rhashtable_params bch_btree_cache_params = { + .head_offset = offsetof(struct btree, hash), +- .key_offset = offsetof(struct btree, key.v), +- .key_len = sizeof(struct bch_extent_ptr), ++ .key_offset = offsetof(struct btree, hash_val), ++ .key_len = sizeof(u64), + .obj_cmpfn = bch2_btree_cache_cmp_fn, + }; + +@@ -114,11 +114,14 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) + rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + + /* Cause future lookups for this node to fail: */ +- PTR_HASH(&b->key) = 0; ++ b->hash_val = 0; + } + + int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) + { ++ BUG_ON(b->hash_val); ++ b->hash_val = btree_ptr_hash_val(&b->key); ++ + return rhashtable_lookup_insert_fast(&bc->table, &b->hash, + bch_btree_cache_params); + } +@@ -144,8 +147,9 @@ __flatten + static inline struct btree *btree_cache_find(struct btree_cache *bc, + const struct bkey_i *k) + { +- return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k), +- bch_btree_cache_params); ++ u64 v = btree_ptr_hash_val(k); ++ ++ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); + } + + /* +@@ -199,7 +203,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) + btree_node_wait_on_io(b); + } + out: +- if (PTR_HASH(&b->key) && !ret) ++ if (b->hash_val && !ret) + trace_btree_node_reap(c, b); + return ret; + out_unlock: +@@ -607,7 +611,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + /* raced with another fill: */ + + /* mark as unhashed... 
*/ +- PTR_HASH(&b->key) = 0; ++ b->hash_val = 0; + + mutex_lock(&bc->lock); + list_add(&b->list, &bc->freeable); +@@ -710,7 +714,7 @@ retry: + * free it: + * + * To guard against this, btree nodes are evicted from the cache +- * when they're freed - and PTR_HASH() is zeroed out, which we ++ * when they're freed - and b->hash_val is zeroed out, which we + * check for after we lock the node. + * + * Then, bch2_btree_node_relock() on the parent will fail - because +@@ -723,7 +727,7 @@ retry: + if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) + return ERR_PTR(-EINTR); + +- if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->level != level || + race_fault())) { + six_unlock_type(&b->lock, lock_type); +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 83358d6a4df8..b284d8933a3e 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -35,13 +35,20 @@ void bch2_fs_btree_cache_exit(struct bch_fs *); + int bch2_fs_btree_cache_init(struct bch_fs *); + void bch2_fs_btree_cache_init_early(struct btree_cache *); + +-#define PTR_HASH(_k) *((u64 *) &bkey_i_to_btree_ptr_c(_k)->v) ++static inline u64 btree_ptr_hash_val(const struct bkey_i *k) ++{ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); ++ default: ++ return 0; ++ } ++} + + /* is btree node in hash table? */ + static inline bool btree_node_hashed(struct btree *b) + { +- return b->key.k.type == KEY_TYPE_btree_ptr && +- PTR_HASH(&b->key); ++ return b->hash_val != 0; + } + + #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 422e54774b27..9d08418c49e6 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1255,8 +1255,6 @@ static void bch2_btree_node_write_error(struct bch_fs *c, + { + struct btree *b = wbio->wbio.bio.bi_private; + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; +- struct bkey_i_btree_ptr *new_key; +- struct bkey_s_btree_ptr bp; + struct bch_extent_ptr *ptr; + struct btree_trans trans; + struct btree_iter *iter; +@@ -1282,16 +1280,13 @@ retry: + + bkey_copy(&tmp.k, &b->key); + +- new_key = bkey_i_to_btree_ptr(&tmp.k); +- bp = btree_ptr_i_to_s(new_key); +- + bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, + bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); + +- if (!bch2_bkey_nr_ptrs(bp.s_c)) ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) + goto err; + +- ret = bch2_btree_node_update_key(c, iter, b, new_key); ++ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); + if (ret == -EINTR) + goto retry; + if (ret) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index b7af88e05837..20757d0c3e53 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -64,9 +64,7 @@ struct btree_alloc { + struct btree { + /* Hottest entries first */ + struct rhash_head hash; +- +- /* Key/pointer for this btree node */ +- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ u64 hash_val; + + struct six_lock lock; + +@@ -133,6 +131,9 @@ struct btree { + #ifdef CONFIG_BCACHEFS_DEBUG + bool *expensive_debug_checks; + #endif ++ ++ /* Key/pointer for this btree node */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + }; + + struct btree_cache { +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 2c34bae64281..be4fe818eac8 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -70,7 +70,7 @@ int 
bch2_btree_delete_range(struct bch_fs *, enum btree_id, + int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, + __le64, unsigned); + int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, +- struct btree *, struct bkey_i_btree_ptr *); ++ struct btree *, struct bkey_i *); + + int bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_trigger_flags); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index da13a20d9a95..4641897b6024 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1944,7 +1944,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + struct btree_update *as, + struct btree_iter *iter, + struct btree *b, struct btree *new_hash, +- struct bkey_i_btree_ptr *new_key) ++ struct bkey_i *new_key) + { + struct btree *parent; + int ret; +@@ -1989,20 +1989,20 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + */ + ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, + c->opts.btree_node_size * +- bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)), ++ bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)), + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + + parent = btree_node_parent(iter, b); + if (parent) { + if (new_hash) { +- bkey_copy(&new_hash->key, &new_key->k_i); ++ bkey_copy(&new_hash->key, new_key); + ret = bch2_btree_node_hash_insert(&c->btree_cache, + new_hash, b->level, b->btree_id); + BUG_ON(ret); + } + +- bch2_keylist_add(&as->parent_keys, &new_key->k_i); ++ bch2_keylist_add(&as->parent_keys, new_key); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); + + if (new_hash) { +@@ -2011,12 +2011,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + + bch2_btree_node_hash_remove(&c->btree_cache, b); + +- bkey_copy(&b->key, &new_key->k_i); ++ bkey_copy(&b->key, new_key); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + } else { +- bkey_copy(&b->key, &new_key->k_i); ++ bkey_copy(&b->key, new_key); + } + } else { + struct bch_fs_usage *fs_usage; +@@ -2029,11 +2029,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); + +- bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), ++ bch2_mark_key_locked(c, bkey_i_to_s_c(new_key), + 0, 0, fs_usage, 0, + BTREE_TRIGGER_INSERT); + if (gc_visited(c, gc_pos_btree_root(b->btree_id))) +- bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), ++ bch2_mark_key_locked(c, bkey_i_to_s_c(new_key), + 0, 0, NULL, 0, + BTREE_TRIGGER_INSERT|| + BTREE_TRIGGER_GC); +@@ -2047,16 +2047,16 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + percpu_up_read(&c->mark_lock); + mutex_unlock(&c->btree_interior_update_lock); + +- if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { ++ if (btree_ptr_hash_val(new_key) != b->hash_val) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + +- bkey_copy(&b->key, &new_key->k_i); ++ bkey_copy(&b->key, new_key); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + } else { +- bkey_copy(&b->key, &new_key->k_i); ++ bkey_copy(&b->key, new_key); + } + + btree_update_updated_root(as); +@@ -2068,7 +2068,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + + int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, + struct btree *b, +- struct 
bkey_i_btree_ptr *new_key) ++ struct bkey_i *new_key) + { + struct btree *parent = btree_node_parent(iter, b); + struct btree_update *as = NULL; +@@ -2091,8 +2091,11 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, + } + } + +- /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */ +- if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { ++ /* ++ * check btree_ptr_hash_val() after @b is locked by ++ * btree_iter_traverse(): ++ */ ++ if (btree_ptr_hash_val(new_key) != b->hash_val) { + /* bch2_btree_reserve_get will unlock */ + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + if (ret) { +@@ -2134,7 +2137,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, + goto err; + } + +- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i)); ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); + if (ret) + goto err_free_update; + +@@ -2193,7 +2196,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) + + bkey_btree_ptr_init(&b->key); + b->key.k.p = POS_MAX; +- PTR_HASH(&b->key) = U64_MAX - id; ++ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; + + bch2_bset_init_first(b, &b->data->keys); + bch2_btree_build_aux_trees(b); +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 1ef62a189e33..e26fa1608f39 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -123,23 +123,21 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + for_each_btree_node(&trans, iter, id, POS_MIN, + BTREE_ITER_PREFETCH, b) { + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; +- struct bkey_i_btree_ptr *new_key; + retry: + if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), + dev_idx)) + continue; + + bkey_copy(&tmp.k, &b->key); +- new_key = bkey_i_to_btree_ptr(&tmp.k); + +- ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), ++ ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), + dev_idx, flags, true); + if (ret) { + bch_err(c, "Cannot drop device without losing data"); + goto err; + } + +- ret = bch2_btree_node_update_key(c, iter, b, new_key); ++ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); + if (ret == -EINTR) { + b = bch2_btree_iter_peek_node(iter); + goto retry; +-- +cgit v1.2.3 + + +From f7aae5e027e71749b7197205274c1dd2e1522ae5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 7 Feb 2020 13:38:02 -0500 +Subject: bcachefs: btree_ptr_v2 + +Add a new btree ptr type which contains the sequence number (random 64 +bit cookie, actually) for that btree node - this lets us verify that +when we read in a btree node it really is the btree node we wanted. 
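+
+A minimal sketch of the read-side check this enables (the actual version added
+to btree_io.c below reports the mismatch through btree_err_on() with
+BTREE_ERR_MUST_RETRY; the error handling here is illustrative only):
+
+    if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+        struct bch_btree_ptr_v2 *bp =
+            &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+        /* the node read off disk must carry the seq cookie the pointer expects */
+        if (b->data->keys.seq != bp->seq)
+            return -EIO; /* wrong btree node - illustrative error code */
+    }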
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 22 +++++++++++-- + fs/bcachefs/bkey.h | 1 + + fs/bcachefs/btree_cache.h | 2 ++ + fs/bcachefs/btree_io.c | 30 ++++++++++++++++-- + fs/bcachefs/btree_update_interior.c | 63 ++++++++++++++++++++++++++----------- + fs/bcachefs/buckets.c | 2 ++ + fs/bcachefs/buckets.h | 3 +- + fs/bcachefs/extents.c | 3 ++ + fs/bcachefs/extents.h | 15 +++++++++ + fs/bcachefs/recovery.c | 1 + + fs/bcachefs/replicas.c | 1 + + 11 files changed, 117 insertions(+), 26 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 3b5e70a727b7..bb251fcb4bb0 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -339,7 +339,8 @@ static inline void bkey_init(struct bkey *k) + x(stripe, 14) \ + x(reflink_p, 15) \ + x(reflink_v, 16) \ +- x(inline_data, 17) ++ x(inline_data, 17) \ ++ x(btree_ptr_v2, 18) + + enum bch_bkey_type { + #define x(name, nr) KEY_TYPE_##name = nr, +@@ -595,6 +596,19 @@ struct bch_btree_ptr { + __u64 _data[0]; + } __attribute__((packed, aligned(8))); + ++struct bch_btree_ptr_v2 { ++ struct bch_val v; ++ ++ __u64 mem_ptr; ++ __le64 seq; ++ __le16 sectors_written; ++ /* In case we ever decide to do variable size btree nodes: */ ++ __le16 sectors; ++ struct bpos min_key; ++ struct bch_extent_ptr start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ + struct bch_extent { + struct bch_val v; + +@@ -626,7 +640,8 @@ struct bch_reservation { + + /* Btree pointers don't carry around checksums: */ + #define BKEY_BTREE_PTR_VAL_U64s_MAX \ +- ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX) ++ ((sizeof(struct bch_btree_ptr_v2) + \ ++ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) + #define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + +@@ -1295,7 +1310,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + x(new_siphash, 7) \ + x(inline_data, 8) \ + x(new_extent_overwrite, 9) \ +- x(incompressible, 10) ++ x(incompressible, 10) \ ++ x(btree_ptr_v2, 11) + + enum bch_sb_feature { + #define x(f, n) BCH_FEATURE_##f, +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index f2d5f3009b21..9106bea9ac06 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -565,6 +565,7 @@ BKEY_VAL_ACCESSORS(stripe); + BKEY_VAL_ACCESSORS(reflink_p); + BKEY_VAL_ACCESSORS(reflink_v); + BKEY_VAL_ACCESSORS(inline_data); ++BKEY_VAL_ACCESSORS(btree_ptr_v2); + + /* byte order helpers */ + +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index b284d8933a3e..d27acd87e4b8 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -40,6 +40,8 @@ static inline u64 btree_ptr_hash_val(const struct bkey_i *k) + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); ++ case KEY_TYPE_btree_ptr_v2: ++ return bkey_i_to_btree_ptr_v2_c(k)->v.seq; + default: + return 0; + } +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 9d08418c49e6..84fbceea5027 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -735,6 +735,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + bch2_bpos_swab(&b->data->max_key); + } + ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "incorrect min_key"); ++ } ++ + 
btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, b, i, + "incorrect max key"); +@@ -898,6 +907,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + BTREE_ERR_MUST_RETRY, c, b, NULL, + "bad btree header"); + ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ btree_err_on(b->data->keys.seq != bp->seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "got wrong btree node"); ++ } ++ + while (b->written < c->opts.btree_node_size) { + unsigned sectors, whiteout_u64s = 0; + struct nonce nonce; +@@ -1005,15 +1023,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + i = &b->data->keys; + for (k = i->start; k != vstruct_last(i);) { + struct bkey tmp; +- struct bkey_s_c u = bkey_disassemble(b, k, &tmp); +- const char *invalid = bch2_bkey_val_invalid(c, u); ++ struct bkey_s u = __bkey_disassemble(b, k, &tmp); ++ const char *invalid = bch2_bkey_val_invalid(c, u.s_c); + + if (invalid || + (inject_invalid_keys(c) && + !bversion_cmp(u.k->version, MAX_VERSION))) { + char buf[160]; + +- bch2_bkey_val_to_text(&PBUF(buf), c, u); ++ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, b, i, + "invalid bkey %s: %s", buf, invalid); + +@@ -1026,6 +1044,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + continue; + } + ++ if (u.k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); ++ ++ bp.v->mem_ptr = 0; ++ } ++ + k = bkey_next_skip_noops(k, vstruct_last(i)); + } + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 4641897b6024..8411ab57a318 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -332,7 +332,11 @@ retry: + goto retry; + } + +- bkey_btree_ptr_init(&tmp.k); ++ if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) ++ bkey_btree_ptr_v2_init(&tmp.k); ++ else ++ bkey_btree_ptr_init(&tmp.k); ++ + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); + + bch2_open_bucket_get(c, wp, &ob); +@@ -354,14 +358,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + { + struct bch_fs *c = as->c; + struct btree *b; ++ int ret; + + BUG_ON(level >= BTREE_MAX_DEPTH); + BUG_ON(!as->reserve->nr); + + b = as->reserve->b[--as->reserve->nr]; + +- BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id)); +- + set_btree_node_accessed(b); + set_btree_node_dirty(b); + set_btree_node_need_write(b); +@@ -372,7 +375,16 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + b->data->flags = 0; + SET_BTREE_NODE_ID(b->data, as->btree_id); + SET_BTREE_NODE_LEVEL(b->data, level); +- b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0]; ++ b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); ++ ++ bp->v.mem_ptr = 0; ++ bp->v.seq = b->data->keys.seq; ++ bp->v.sectors_written = 0; ++ bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); ++ } + + if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) + SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); +@@ -385,10 +397,26 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + + btree_node_will_make_reachable(as, b); + ++ ret = 
bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); ++ BUG_ON(ret); ++ + trace_btree_node_alloc(c, b); + return b; + } + ++static void btree_set_min(struct btree *b, struct bpos pos) ++{ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; ++ b->data->min_key = pos; ++} ++ ++static void btree_set_max(struct btree *b, struct bpos pos) ++{ ++ b->key.k.p = pos; ++ b->data->max_key = pos; ++} ++ + struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree *b, + struct bkey_format format) +@@ -397,11 +425,12 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, + + n = bch2_btree_node_alloc(as, b->level); + +- n->data->min_key = b->data->min_key; +- n->data->max_key = b->data->max_key; +- n->data->format = format; + SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); + ++ btree_set_min(n, b->data->min_key); ++ btree_set_max(n, b->data->max_key); ++ ++ n->data->format = format; + btree_node_set_format(n, format); + + bch2_btree_sort_into(as->c, n, b); +@@ -431,10 +460,9 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) + { + struct btree *b = bch2_btree_node_alloc(as, level); + +- b->data->min_key = POS_MIN; +- b->data->max_key = POS_MAX; ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, POS_MAX); + b->data->format = bch2_btree_calc_format(b); +- b->key.k.p = POS_MAX; + + btree_node_set_format(b, b->data->format); + bch2_btree_build_aux_trees(b); +@@ -1263,10 +1291,8 @@ static struct btree *__btree_split_node(struct btree_update *as, + + BUG_ON(!prev); + +- n1->key.k.p = bkey_unpack_pos(n1, prev); +- n1->data->max_key = n1->key.k.p; +- n2->data->min_key = +- btree_type_successor(n1->btree_id, n1->key.k.p); ++ btree_set_max(n1, bkey_unpack_pos(n1, prev)); ++ btree_set_min(n2, btree_type_successor(n1->btree_id, n1->key.k.p)); + + set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); + set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); +@@ -1749,10 +1775,9 @@ retry: + + n = bch2_btree_node_alloc(as, b->level); + +- n->data->min_key = prev->data->min_key; +- n->data->max_key = next->data->max_key; ++ btree_set_min(n, prev->data->min_key); ++ btree_set_max(n, next->data->max_key); + n->data->format = new_f; +- n->key.k.p = next->key.k.p; + + btree_node_set_format(n, new_f); + +@@ -2202,8 +2227,8 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) + bch2_btree_build_aux_trees(b); + + b->data->flags = 0; +- b->data->min_key = POS_MIN; +- b->data->max_key = POS_MAX; ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, POS_MAX); + b->data->format = bch2_btree_calc_format(b); + btree_node_set_format(b, b->data->format); + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 731b93255876..b9bc524f373b 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1194,6 +1194,7 @@ int bch2_mark_key_locked(struct bch_fs *c, + ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: + sectors = !(flags & BTREE_TRIGGER_OVERWRITE) + ? c->opts.btree_node_size + : -c->opts.btree_node_size; +@@ -1729,6 +1730,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: + sectors = !(flags & BTREE_TRIGGER_OVERWRITE) + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size; +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 4717a1a6f568..c1cc63af9feb 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -97,7 +97,8 @@ static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, + static inline enum bch_data_type ptr_data_type(const struct bkey *k, + const struct bch_extent_ptr *ptr) + { +- if (k->type == KEY_TYPE_btree_ptr) ++ if (k->type == KEY_TYPE_btree_ptr || ++ k->type == KEY_TYPE_btree_ptr_v2) + return BCH_DATA_BTREE; + + return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER; +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index c9d474920b47..10feb856e314 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -749,6 +749,7 @@ void bch2_bkey_append_ptr(struct bkey_i *k, + + switch (k->k.type) { + case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + +@@ -1031,6 +1032,8 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) + + if (k.k->type == KEY_TYPE_btree_ptr) + size_ondisk = c->opts.btree_node_size; ++ if (k.k->type == KEY_TYPE_btree_ptr_v2) ++ size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); + + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 6e8119a8ad30..70b7d70269dc 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -225,6 +225,13 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) + bkey_val_end(r), + }; + } ++ case KEY_TYPE_btree_ptr_v2: { ++ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } + default: + return (struct bkey_ptrs_c) { NULL, NULL }; + } +@@ -372,6 +379,13 @@ void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + .swab = bch2_ptr_swab, \ + } + ++#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_invalid, \ ++ .key_debugcheck = bch2_btree_ptr_debugcheck, \ ++ .val_to_text = bch2_btree_ptr_to_text, \ ++ .swab = bch2_ptr_swab, \ ++} ++ + /* KEY_TYPE_extent: */ + + const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); +@@ -416,6 +430,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k) + { + switch (k->type) { + case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return true; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 29e6f9f00bad..c9d12f7c180e 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1010,6 +1010,7 @@ int bch2_fs_recovery(struct bch_fs *c) + c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_ptr_v2; + write_sb = true; + } + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 366888b1b36d..be4908575f72 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -112,6 +112,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: + e->data_type = BCH_DATA_BTREE; + extent_to_replicas(k, e); + break; 
+-- +cgit v1.2.3 + + +From 7c7c80f3835146d9240ff7f78ccd07004739c564 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 8 Feb 2020 16:39:37 -0500 +Subject: bcachefs: Seralize btree_update operations at + btree_update_nodes_written() + +Prep work for journalling updates to interior nodes - enforcing ordering +will greatly simplify those changes. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/btree_update_interior.c | 32 ++++++++++++++++++++++++++------ + fs/bcachefs/btree_update_interior.h | 1 + + fs/bcachefs/super.c | 1 + + 4 files changed, 29 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 58e4c494b540..cce3d12f5283 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -610,6 +610,7 @@ struct bch_fs { + + mempool_t btree_interior_update_pool; + struct list_head btree_interior_update_list; ++ struct list_head btree_interior_updates_unwritten; + struct mutex btree_interior_update_lock; + struct closure_waitlist btree_interior_update_wait; + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 8411ab57a318..e94619a5ac97 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -666,9 +666,15 @@ static void btree_update_nodes_written(struct closure *cl) + * to child nodes that weren't written yet: now, the child nodes have + * been written so we can write out the update to the interior node. + */ +-retry: + mutex_lock(&c->btree_interior_update_lock); + as->nodes_written = true; ++retry: ++ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, ++ struct btree_update, unwritten_list); ++ if (!as || !as->nodes_written) { ++ mutex_unlock(&c->btree_interior_update_lock); ++ return; ++ } + + switch (as->mode) { + case BTREE_INTERIOR_NO_UPDATE: +@@ -681,11 +687,12 @@ retry: + mutex_unlock(&c->btree_interior_update_lock); + btree_node_lock_type(c, b, SIX_LOCK_read); + six_unlock_read(&b->lock); ++ mutex_lock(&c->btree_interior_update_lock); + goto retry; + } + + BUG_ON(!btree_node_dirty(b)); +- closure_wait(&btree_current_write(b)->wait, cl); ++ closure_wait(&btree_current_write(b)->wait, &as->cl); + + list_del(&as->write_blocked_list); + +@@ -694,6 +701,8 @@ retry: + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); ++ ++ list_del(&as->unwritten_list); + mutex_unlock(&c->btree_interior_update_lock); + + /* +@@ -702,6 +711,7 @@ retry: + */ + bch2_btree_node_write_cond(c, b, true); + six_unlock_read(&b->lock); ++ continue_at(&as->cl, btree_update_nodes_reachable, system_wq); + break; + + case BTREE_INTERIOR_UPDATING_AS: +@@ -716,8 +726,12 @@ retry: + /* + * and then we have to wait on that btree_update to finish: + */ +- closure_wait(&as->parent_as->wait, cl); ++ closure_wait(&as->parent_as->wait, &as->cl); ++ ++ list_del(&as->unwritten_list); + mutex_unlock(&c->btree_interior_update_lock); ++ ++ continue_at(&as->cl, btree_update_nodes_reachable, system_wq); + break; + + case BTREE_INTERIOR_UPDATING_ROOT: +@@ -728,6 +742,7 @@ retry: + mutex_unlock(&c->btree_interior_update_lock); + btree_node_lock_type(c, b, SIX_LOCK_read); + six_unlock_read(&b->lock); ++ mutex_lock(&c->btree_interior_update_lock); + goto retry; + } + +@@ -744,6 +759,8 @@ retry: + * can reuse the old nodes it'll have to do a journal commit: + */ + six_unlock_read(&b->lock); ++ ++ list_del(&as->unwritten_list); + mutex_unlock(&c->btree_interior_update_lock); + + /* +@@ -762,11 +779,12 @@ retry: + + 
as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal); + +- btree_update_wait_on_journal(cl); +- return; ++ btree_update_wait_on_journal(&as->cl); ++ break; + } + +- continue_at(cl, btree_update_nodes_reachable, system_wq); ++ mutex_lock(&c->btree_interior_update_lock); ++ goto retry; + } + + /* +@@ -778,6 +796,7 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) + struct bch_fs *c = as->c; + + mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(!btree_node_dirty(b)); +@@ -858,6 +877,7 @@ static void btree_update_updated_root(struct btree_update *as) + struct btree_root *r = &c->btree_roots[as->btree_id]; + + mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 2d8e0b7f3aaf..657b3d310e89 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -55,6 +55,7 @@ struct btree_update { + struct bch_fs *c; + + struct list_head list; ++ struct list_head unwritten_list; + + /* What kind of update are we doing? */ + enum { +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index b46b7d78173e..73f123bee264 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -634,6 +634,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + INIT_LIST_HEAD(&c->list); + + INIT_LIST_HEAD(&c->btree_interior_update_list); ++ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); + mutex_init(&c->btree_reserve_cache_lock); + mutex_init(&c->btree_interior_update_lock); + +-- +cgit v1.2.3 + + +From b97f4aa0cd6420991ccd0a77c4bbdfcd4b1efa90 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 26 Feb 2020 15:39:46 -0500 +Subject: bcachefs: Kill TRANS_RESET_MEM|TRANS_RESET_ITERS + +All iterators should be released now with bch2_trans_iter_put(), so +TRANS_RESET_ITERS shouldn't be needed anymore, and TRANS_RESET_MEM is +always used. + +Also convert more code to __bch2_trans_do(). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 15 +++++---- + fs/bcachefs/btree_iter.h | 6 ++-- + fs/bcachefs/btree_update.h | 15 +++++---- + fs/bcachefs/btree_update_leaf.c | 15 +++++---- + fs/bcachefs/dirent.c | 45 +++++++++++++++---------- + fs/bcachefs/ec.c | 19 +++++------ + fs/bcachefs/fs-io.c | 2 +- + fs/bcachefs/fsck.c | 29 +++++----------- + fs/bcachefs/io.c | 4 +-- + fs/bcachefs/reflink.c | 2 +- + fs/bcachefs/str_hash.h | 4 +++ + fs/bcachefs/tests.c | 73 +++++++++++++++++++++++++++++++---------- + 12 files changed, 136 insertions(+), 93 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c365a2aff446..6b0f45ac46ab 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1756,6 +1756,8 @@ int bch2_trans_iter_put(struct btree_trans *trans, + if (IS_ERR_OR_NULL(iter)) + return 0; + ++ BUG_ON(trans->iters + iter->idx != iter); ++ + ret = btree_iter_err(iter); + + if (!(trans->iters_touched & (1ULL << iter->idx)) && +@@ -2080,16 +2082,11 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + + bch2_trans_unlink_iters(trans); + +- if (flags & TRANS_RESET_ITERS) +- trans->iters_live = 0; +- + trans->iters_touched &= trans->iters_live; + + trans->need_reset = 0; + trans->nr_updates = 0; +- +- if (flags & TRANS_RESET_MEM) +- trans->mem_top = 0; ++ trans->mem_top = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; +@@ -2108,6 +2105,12 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + { + memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); + ++ /* ++ * reallocating iterators currently completely breaks ++ * bch2_trans_iter_put(): ++ */ ++ expected_nr_iters = BTREE_ITER_MAX; ++ + trans->c = c; + trans->ip = _RET_IP_; + trans->size = ARRAY_SIZE(trans->iters_onstack); +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 336901f9780b..a95d0f13c65d 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -290,15 +290,13 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); + +-#define TRANS_RESET_ITERS (1 << 0) +-#define TRANS_RESET_MEM (1 << 1) +-#define TRANS_RESET_NOTRAVERSE (1 << 2) ++#define TRANS_RESET_NOTRAVERSE (1 << 0) + + void bch2_trans_reset(struct btree_trans *, unsigned); + + static inline void bch2_trans_begin(struct btree_trans *trans) + { +- return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM); ++ return bch2_trans_reset(trans, 0); + } + + void *bch2_trans_kmalloc(struct btree_trans *, size_t); +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index be4fe818eac8..d1cd839ac08f 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -59,6 +59,7 @@ enum btree_insert_flags { + + int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); + ++int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); + int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, + struct disk_reservation *, u64 *, int flags); + +@@ -98,17 +99,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + return __bch2_trans_commit(trans); + } + +-#define __bch2_trans_do(_trans, _disk_res, _journal_seq, \ +- _flags, _reset_flags, _do) \ ++#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + ({ \ + int _ret; \ + \ +- do { \ +- bch2_trans_reset(_trans, _reset_flags); \ +- \ ++ while (1) { \ + _ret = (_do) ?: 
bch2_trans_commit(_trans, (_disk_res), \ + (_journal_seq), (_flags)); \ +- } while (_ret == -EINTR); \ ++ if (_ret != -EINTR) \ ++ break; \ ++ bch2_trans_reset(_trans, 0); \ ++ } \ + \ + _ret; \ + }) +@@ -120,7 +121,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + \ + bch2_trans_init(&trans, (_c), 0, 0); \ + _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ +- TRANS_RESET_MEM|TRANS_RESET_ITERS, _do); \ ++ _do); \ + _ret2 = bch2_trans_exit(&trans); \ + \ + _ret ?: _ret2; \ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index a036c7dd1fc1..1d94fb94a570 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -758,7 +758,7 @@ out: + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); + out_noupdates: +- bch2_trans_reset(trans, TRANS_RESET_MEM|TRANS_RESET_NOTRAVERSE); ++ bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); + + return ret; + err: +@@ -839,18 +839,21 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + return 0; + } + +-static int __bch2_btree_insert(struct btree_trans *trans, +- enum btree_id id, struct bkey_i *k) ++int __bch2_btree_insert(struct btree_trans *trans, ++ enum btree_id id, struct bkey_i *k) + { + struct btree_iter *iter; ++ int ret; + + iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + +- bch2_trans_update(trans, iter, k, 0); +- return 0; ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, k, 0); ++ bch2_trans_iter_put(trans, iter); ++ return ret; + } + + /** +@@ -882,7 +885,7 @@ retry: + bkey_cmp(iter->pos, end) < 0) { + struct bkey_i delete; + +- bch2_trans_reset(trans, TRANS_RESET_MEM); ++ bch2_trans_begin(trans); + + bkey_init(&delete.k); + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 623b6c3eda95..ae5c9fd8d9f7 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -169,12 +169,12 @@ int bch2_dirent_rename(struct btree_trans *trans, + const struct qstr *dst_name, u64 *dst_inum, + enum bch_rename_mode mode) + { +- struct btree_iter *src_iter, *dst_iter; ++ struct btree_iter *src_iter = NULL, *dst_iter = NULL; + struct bkey_s_c old_src, old_dst; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = + POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); +- int ret; ++ int ret = 0; + + *src_inum = *dst_inum = 0; + +@@ -191,8 +191,10 @@ int bch2_dirent_rename(struct btree_trans *trans, + : bch2_hash_lookup(trans, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_INTENT); +- if (IS_ERR(dst_iter)) +- return PTR_ERR(dst_iter); ++ ret = PTR_ERR_OR_ZERO(dst_iter); ++ if (ret) ++ goto out; ++ + old_dst = bch2_btree_iter_peek_slot(dst_iter); + + if (mode != BCH_RENAME) +@@ -202,15 +204,18 @@ int bch2_dirent_rename(struct btree_trans *trans, + src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, + BTREE_ITER_INTENT); +- if (IS_ERR(src_iter)) +- return PTR_ERR(src_iter); ++ ret = PTR_ERR_OR_ZERO(src_iter); ++ if (ret) ++ goto out; ++ + old_src = bch2_btree_iter_peek_slot(src_iter); + *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); + + /* Create new dst key: */ + new_dst = dirent_create_key(trans, 0, dst_name, 0); +- if (IS_ERR(new_dst)) +- return PTR_ERR(new_dst); ++ ret = PTR_ERR_OR_ZERO(new_dst); ++ if (ret) ++ goto out; + + dirent_copy_target(new_dst, 
bkey_s_c_to_dirent(old_src)); + new_dst->k.p = dst_iter->pos; +@@ -218,15 +223,18 @@ int bch2_dirent_rename(struct btree_trans *trans, + /* Create new src key: */ + if (mode == BCH_RENAME_EXCHANGE) { + new_src = dirent_create_key(trans, 0, src_name, 0); +- if (IS_ERR(new_src)) +- return PTR_ERR(new_src); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; + + dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); + new_src->k.p = src_iter->pos; + } else { + new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); +- if (IS_ERR(new_src)) +- return PTR_ERR(new_src); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ + bkey_init(&new_src->k); + new_src->k.p = src_iter->pos; + +@@ -247,7 +255,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + new_dst->k.p = src_iter->pos; + bch2_trans_update(trans, src_iter, + &new_dst->k_i, 0); +- return 0; ++ goto out; + } else { + /* If we're overwriting, we can't insert new_dst + * at a different slot because it has to +@@ -261,7 +269,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, + src_hash, src_iter); + if (ret < 0) +- return ret; ++ goto out; + + if (ret) + new_src->k.type = KEY_TYPE_whiteout; +@@ -270,7 +278,10 @@ int bch2_dirent_rename(struct btree_trans *trans, + + bch2_trans_update(trans, src_iter, &new_src->k_i, 0); + bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); +- return 0; ++out: ++ bch2_trans_iter_put(trans, src_iter); ++ bch2_trans_iter_put(trans, dst_iter); ++ return ret; + } + + int bch2_dirent_delete_at(struct btree_trans *trans, +@@ -331,9 +342,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) + break; + } + } +- +- if (!IS_ERR(iter)) +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_put(trans, iter); + + return ret; + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 63e1ca668b3e..d87e0093c7ee 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -741,6 +741,8 @@ found_slot: + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + err: ++ bch2_trans_iter_put(&trans, iter); ++ + if (ret == -EINTR) + goto retry; + +@@ -1201,8 +1203,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, + struct btree_iter *iter, + struct stripe *m, + size_t idx, +- struct bkey_i_stripe *new_key, +- unsigned flags) ++ struct bkey_i_stripe *new_key) + { + struct bch_fs *c = trans->c; + struct bkey_s_c k; +@@ -1231,9 +1232,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, + spin_unlock(&c->ec_stripes_heap_lock); + + bch2_trans_update(trans, iter, &new_key->k_i, 0); +- +- return bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL|flags); ++ return 0; + } + + int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) +@@ -1257,12 +1256,10 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) + if (!m->dirty) + continue; + +- do { +- bch2_trans_reset(&trans, TRANS_RESET_MEM); +- +- ret = __bch2_stripe_write_key(&trans, iter, m, +- giter.pos, new_key, flags); +- } while (ret == -EINTR); ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|flags, ++ __bch2_stripe_write_key(&trans, iter, m, ++ giter.pos, new_key)); + + if (ret) + break; +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 92f42c2fee33..e2b293a6ff91 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2674,7 +2674,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + struct bkey_i_reservation reservation; + 
struct bkey_s_c k; + +- bch2_trans_reset(&trans, TRANS_RESET_MEM); ++ bch2_trans_begin(&trans); + + k = bch2_btree_iter_peek_slot(iter); + if ((ret = bkey_err(k))) +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 9ef532d875e8..eca723121a2c 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -81,7 +81,6 @@ static int remove_dirent(struct btree_trans *trans, + return __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- TRANS_RESET_MEM, + __remove_dirent(trans, dirent)); + } + +@@ -182,8 +181,6 @@ static int hash_redo_key(const struct bch_hash_desc desc, + struct bkey_i delete; + struct bkey_i *tmp; + +- bch2_trans_reset(trans, TRANS_RESET_MEM); +- + tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); +@@ -194,11 +191,8 @@ static int hash_redo_key(const struct bch_hash_desc desc, + delete.k.p = k_iter->pos; + bch2_trans_update(trans, k_iter, &delete, 0); + +- return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, +- tmp, BCH_HASH_SET_MUST_CREATE) ?: +- bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); ++ return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, ++ tmp, BCH_HASH_SET_MUST_CREATE); + } + + static int fsck_hash_delete_at(struct btree_trans *trans, +@@ -320,10 +314,9 @@ static int hash_check_key(struct btree_trans *trans, + desc.btree_id, k.k->p.offset, + hashed, h->chain->pos.offset, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { +- do { +- ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); +- } while (ret == -EINTR); +- ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ hash_redo_key(desc, trans, h, k_iter, k, hashed)); + if (ret) { + bch_err(c, "hash_redo_key err %i", ret); + return ret; +@@ -387,7 +380,6 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- TRANS_RESET_MEM, + (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); + if (ret) + goto err; +@@ -410,11 +402,10 @@ err_redo: + k->k->p.offset, hash, h->chain->pos.offset, + (bch2_bkey_val_to_text(&PBUF(buf), c, + *k), buf))) { +- do { +- ret = hash_redo_key(bch2_dirent_hash_desc, trans, +- h, iter, *k, hash); +- } while (ret == -EINTR); +- ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ hash_redo_key(bch2_dirent_hash_desc, trans, ++ h, iter, *k, hash)); + if (ret) + bch_err(c, "hash_redo_key err %i", ret); + else +@@ -660,7 +651,6 @@ retry: + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- TRANS_RESET_MEM, + (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); + kfree(n); + if (ret) +@@ -1275,7 +1265,6 @@ static int check_inode(struct btree_trans *trans, + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- TRANS_RESET_MEM, + (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); + if (ret) + bch_err(c, "error in fsck: error %i " +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 795be72b2364..df419ad01cb0 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -325,7 +325,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + +- bch2_trans_reset(trans, TRANS_RESET_MEM); ++ bch2_trans_begin(trans); + + ret = bkey_err(k); + if (ret) +@@ -399,7 +399,7 @@ int bch2_write_index_default(struct bch_write_op 
*op) + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + do { +- bch2_trans_reset(&trans, TRANS_RESET_MEM); ++ bch2_trans_begin(&trans); + + k = bch2_keylist_front(keys); + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index d78a3d5f7246..2f223be74926 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -184,7 +184,7 @@ s64 bch2_remap_range(struct bch_fs *c, + BTREE_ITER_INTENT); + + while (1) { +- bch2_trans_reset(&trans, TRANS_RESET_MEM); ++ bch2_trans_begin(&trans); + + trans.mem_top = 0; + +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 0710d0bbe36d..9c9549d0a8f6 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -163,6 +163,7 @@ bch2_hash_lookup(struct btree_trans *trans, + break; + } + } ++ bch2_trans_iter_put(trans, iter); + + return ERR_PTR(ret ?: -ENOENT); + } +@@ -187,6 +188,9 @@ bch2_hash_hole(struct btree_trans *trans, + return iter; + } + ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ bch2_trans_iter_put(trans, iter); ++ + return ERR_PTR(ret ?: -ENOSPC); + } + +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 8f9b0cca17da..9bc2c4a03c88 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -43,8 +43,8 @@ static void test_delete(struct bch_fs *c, u64 nr) + ret = bch2_btree_iter_traverse(iter); + BUG_ON(ret); + +- bch2_trans_update(&trans, iter, &k.k_i, 0); +- ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); + BUG_ON(ret); + + pr_info("deleting once"); +@@ -75,8 +75,8 @@ static void test_delete_written(struct bch_fs *c, u64 nr) + ret = bch2_btree_iter_traverse(iter); + BUG_ON(ret); + +- bch2_trans_update(&trans, iter, &k.k_i, 0); +- ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); + BUG_ON(ret); + + bch2_journal_flush_all_pins(&c->journal); +@@ -409,18 +409,24 @@ static u64 test_rand(void) + + static void rand_insert(struct bch_fs *c, u64 nr) + { ++ struct btree_trans trans; + struct bkey_i_cookie k; + int ret; + u64 i; + ++ bch2_trans_init(&trans, c, 0, 0); ++ + for (i = 0; i < nr; i++) { + bkey_cookie_init(&k.k_i); + k.k.p.offset = test_rand(); + +- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, +- NULL, NULL, 0); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_DIRENTS, &k.k_i)); ++ + BUG_ON(ret); + } ++ ++ bch2_trans_exit(&trans); + } + + static void rand_lookup(struct bch_fs *c, u64 nr) +@@ -465,8 +471,9 @@ static void rand_mixed(struct bch_fs *c, u64 nr) + bkey_cookie_init(&k.k_i); + k.k.p = iter->pos; + +- bch2_trans_update(&trans, iter, &k.k_i, 0); +- ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ + BUG_ON(ret); + } + +@@ -476,20 +483,50 @@ static void rand_mixed(struct bch_fs *c, u64 nr) + bch2_trans_exit(&trans); + } + ++static int __do_delete(struct btree_trans *trans, struct bpos pos) ++{ ++ struct btree_iter *iter; ++ struct bkey_i delete; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_DIRENTS, pos, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bkey_init(&delete.k); ++ delete.k.p = k.k->p; ++ ++ bch2_trans_update(trans, iter, &delete, 0); ++err: ++ bch2_trans_iter_put(trans, iter); ++ 
return ret; ++} ++ + static void rand_delete(struct bch_fs *c, u64 nr) + { +- struct bkey_i k; ++ struct btree_trans trans; + int ret; + u64 i; + ++ bch2_trans_init(&trans, c, 0, 0); ++ + for (i = 0; i < nr; i++) { +- bkey_init(&k.k); +- k.k.p.offset = test_rand(); ++ struct bpos pos = POS(0, test_rand()); + +- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k, +- NULL, NULL, 0); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __do_delete(&trans, pos)); + BUG_ON(ret); + } ++ ++ bch2_trans_exit(&trans); + } + + static void seq_insert(struct bch_fs *c, u64 nr) +@@ -509,8 +546,9 @@ static void seq_insert(struct bch_fs *c, u64 nr) + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + insert.k.p = iter->pos; + +- bch2_trans_update(&trans, iter, &insert.k_i, 0); +- ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &insert.k_i, 0)); ++ + BUG_ON(ret); + + if (++i == nr) +@@ -548,8 +586,9 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) + + bkey_reassemble(&u.k_i, k); + +- bch2_trans_update(&trans, iter, &u.k_i, 0); +- ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &u.k_i, 0)); ++ + BUG_ON(ret); + } + bch2_trans_exit(&trans); +-- +cgit v1.2.3 + + +From 1d9023e085abf07527df77d5f305729500462d1d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 26 Feb 2020 15:58:36 -0500 +Subject: bcachefs: Issue discards when needed to allocate journal write + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index db722a8ae4ea..0974805c8923 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1043,9 +1043,16 @@ void bch2_journal_write(struct closure *cl) + bytes = vstruct_bytes(jset); + memset((void *) jset + bytes, 0, (sectors << 9) - bytes); + ++retry_alloc: + spin_lock(&j->lock); + ret = journal_write_alloc(j, w, sectors); + ++ if (ret && j->can_discard) { ++ spin_unlock(&j->lock); ++ bch2_journal_do_discards(j); ++ goto retry_alloc; ++ } ++ + /* + * write is allocated, no longer need to account for it in + * bch2_journal_space_available(): +-- +cgit v1.2.3 + + +From 461ad135ef646160fa3bfd5d6685a7d7f9063d28 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 26 Feb 2020 17:11:00 -0500 +Subject: bcachefs: Fix incorrect initialization of + btree_node_old_extent_overwrite() + +b->level and b->btree_id weren't set when the code was checking +btree_node_is_extents() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index e94619a5ac97..48189417f492 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -370,6 +370,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + set_btree_node_need_write(b); + + bch2_bset_init_first(b, &b->data->keys); ++ b->level = level; ++ b->btree_id = as->btree_id; ++ + memset(&b->nr, 0, sizeof(b->nr)); + b->data->magic = cpu_to_le64(bset_magic(c)); + b->data->flags = 0; +-- +cgit v1.2.3 + + +From 52d2b0c57274bece7f4db4a567a1fd27e7dcd7bd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 24 Feb 2020 15:25:00 -0500 +Subject: bcachefs: Use btree_ptr_v2.mem_ptr to avoid hash table lookup + +Nice performance optimization + 
+Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 28 +++++++++++----------------- + fs/bcachefs/btree_cache.h | 7 +++++++ + fs/bcachefs/btree_io.c | 1 + + fs/bcachefs/btree_iter.c | 25 +++++++++++++++++++++++++ + 4 files changed, 44 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 2c9c3c18defe..ae3803138399 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -595,12 +595,13 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + ++ BUG_ON(level + 1 >= BTREE_MAX_DEPTH); + /* + * Parent node must be locked, else we could read in a btree node that's + * been freed: + */ +- BUG_ON(!btree_node_locked(iter, level + 1)); +- BUG_ON(level >= BTREE_MAX_DEPTH); ++ if (!bch2_btree_node_relock(iter, level + 1)) ++ return ERR_PTR(-EINTR); + + b = bch2_btree_node_mem_alloc(c); + if (IS_ERR(b)) +@@ -623,13 +624,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + } + + /* +- * If the btree node wasn't cached, we can't drop our lock on +- * the parent until after it's added to the cache - because +- * otherwise we could race with a btree_split() freeing the node +- * we're trying to lock. ++ * Unlock before doing IO: + * +- * But the deadlock described below doesn't exist in this case, +- * so it's safe to not drop the parent lock until here: ++ * XXX: ideally should be dropping all btree node locks here + */ + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); +@@ -666,16 +663,11 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, + struct btree *b; + struct bset_tree *t; + +- /* +- * XXX: locking optimization +- * +- * we can make the locking looser here - caller can drop lock on parent +- * node before locking child node (and potentially blocking): we just +- * have to have bch2_btree_node_fill() call relock on the parent and +- * return -EINTR if that fails +- */ +- EBUG_ON(!btree_node_locked(iter, level + 1)); + EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; + retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { +@@ -693,6 +685,7 @@ retry: + if (IS_ERR(b)) + return b; + } else { ++lock_node: + /* + * There's a potential deadlock with splits and insertions into + * interior nodes we have to avoid: +@@ -739,6 +732,7 @@ retry: + } + } + ++ /* XXX: waiting on IO with btree locks held: */ + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index d27acd87e4b8..bc24d92678d3 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -47,6 +47,13 @@ static inline u64 btree_ptr_hash_val(const struct bkey_i *k) + } + } + ++static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) ++{ ++ return k->k.type == KEY_TYPE_btree_ptr_v2 ++ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr ++ : NULL; ++} ++ + /* is btree node in hash table? 
*/ + static inline bool btree_node_hashed(struct btree *b) + { +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 84fbceea5027..5c3779a47c51 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1647,6 +1647,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + + b->written += sectors_to_write; + ++ /* XXX: submitting IO with btree locks held: */ + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); + return; + err: +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 6b0f45ac46ab..321fe306cc04 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -912,6 +912,27 @@ static void btree_iter_prefetch(struct btree_iter *iter) + btree_node_unlock(iter, iter->level); + } + ++static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, ++ unsigned plevel, struct btree *b) ++{ ++ struct btree_iter_level *l = &iter->l[plevel]; ++ bool locked = btree_node_locked(iter, plevel); ++ struct bkey_packed *k; ++ struct bch_btree_ptr_v2 *bp; ++ ++ if (!bch2_btree_node_relock(iter, plevel)) ++ return; ++ ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); ++ ++ bp = (void *) bkeyp_val(&l->b->format, k); ++ bp->mem_ptr = (unsigned long)b; ++ ++ if (!locked) ++ btree_node_unlock(iter, plevel); ++} ++ + static __always_inline int btree_iter_down(struct btree_iter *iter) + { + struct bch_fs *c = iter->trans->c; +@@ -933,6 +954,10 @@ static __always_inline int btree_iter_down(struct btree_iter *iter) + mark_btree_node_locked(iter, level, lock_type); + btree_iter_node_set(iter, b); + ++ if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && ++ unlikely(b != btree_node_mem_ptr(&tmp.k))) ++ btree_node_mem_ptr_set(iter, level + 1, b); ++ + if (iter->flags & BTREE_ITER_PREFETCH) + btree_iter_prefetch(iter); + +-- +cgit v1.2.3 + + +From 2266cca0b6b6c819895fa17dace5b8611193bea9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 26 Feb 2020 17:25:13 -0500 +Subject: bcachefs: fix setting btree_node_accessed() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index ae3803138399..e9df7e82a766 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -747,7 +747,7 @@ lock_node: + } + + /* avoid atomic set bit if it's not needed: */ +- if (btree_node_accessed(b)) ++ if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { +-- +cgit v1.2.3 + + +From 7b031ffb47eee8c984c07353f61aa01a12592afd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 26 Feb 2020 17:34:27 -0500 +Subject: bcachefs: BCH_SB_FEATURES_ALL + +BCH_FEATURE_btree_ptr_v2 wasn't getting set on new filesystems, oops + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 5 +++++ + fs/bcachefs/recovery.c | 7 ++----- + 2 files changed, 7 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index bb251fcb4bb0..42dd1022477c 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1313,6 +1313,11 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + x(incompressible, 10) \ + x(btree_ptr_v2, 11) + ++#define BCH_SB_FEATURES_ALL \ ++ ((1ULL << BCH_FEATURE_new_siphash)| \ ++ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ ++ (1ULL << BCH_FEATURE_btree_ptr_v2)) ++ + enum bch_sb_feature { 
+ #define x(f, n) BCH_FEATURE_##f, + BCH_SB_FEATURES() +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index c9d12f7c180e..1871485c079d 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1008,9 +1008,7 @@ int bch2_fs_recovery(struct bch_fs *c) + c->disk_sb.sb->version_min = + le16_to_cpu(bcachefs_metadata_version_min); + c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_ptr_v2; ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + write_sb = true; + } + +@@ -1129,8 +1127,7 @@ int bch2_fs_initialize(struct bch_fs *c) + c->disk_sb.sb->version = c->disk_sb.sb->version_min = + le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); +-- +cgit v1.2.3 + + +From c58db7f22f7e1b23e8c35ba3bf5557aaa49eddc5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 26 Feb 2020 20:39:06 -0500 +Subject: bcachefs: Improve an error message + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 27 +++++++++++++++------------ + 1 file changed, 15 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 5c3779a47c51..0370c7821aa1 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -709,15 +709,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + unsigned *whiteout_u64s, int write, + bool have_retry) + { +- struct bkey_packed *k, *prev = NULL; +- struct bpos prev_pos = POS_MIN; ++ struct bkey_packed *k; ++ struct bkey prev = KEY(0, 0, 0); + struct bpos prev_data = POS_MIN; + bool seen_non_whiteout = false; + unsigned version; + const char *err; + int ret = 0; + +- if (i == &b->data->keys) { ++ if (!b->written) { + /* These indicate that we read the wrong btree node: */ + btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id, + BTREE_ERR_MUST_RETRY, c, b, i, +@@ -853,25 +853,28 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + + if (!seen_non_whiteout && + (!bkey_whiteout(k) || +- (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { ++ (bkey_cmp(prev.p, bkey_start_pos(u.k)) > 0))) { + *whiteout_u64s = k->_data - i->_data; + seen_non_whiteout = true; + } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 || +- bkey_cmp(prev_pos, u.k->p) > 0) { ++ bkey_cmp(prev.p, u.k->p) > 0) { ++ char buf1[80]; ++ char buf2[80]; ++ ++ bch2_bkey_to_text(&PBUF(buf1), &prev); ++ bch2_bkey_to_text(&PBUF(buf2), u.k); ++ ++ bch2_dump_bset(b, i, 0); + btree_err(BTREE_ERR_FATAL, c, b, i, +- "keys out of order: %llu:%llu > %llu:%llu", +- prev_pos.inode, +- prev_pos.offset, +- u.k->p.inode, +- bkey_start_offset(u.k)); ++ "keys out of order: %s > %s", ++ buf1, buf2); + /* XXX: repair this */ + } + + if (!bkey_deleted(u.k)) + prev_data = u.k->p; +- prev_pos = u.k->p; ++ prev = *u.k; + +- prev = k; + k = bkey_next_skip_noops(k, vstruct_last(i)); + } + +-- +cgit v1.2.3 + + +From 68e9a14d5dbcaf6483323f809e78308decfc31c1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 26 Feb 2020 22:29:52 -0500 +Subject: bcachefs: Fix error message on 
bucket sector count overflow + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 10 ++++------ + 1 file changed, 4 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index b9bc524f373b..61831a498e16 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1444,8 +1444,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; +- u16 *dst_sectors; +- bool overflow; ++ u16 *dst_sectors, orig_sectors; + int ret; + + ret = trans_get_key(trans, BTREE_ID_ALLOC, +@@ -1502,13 +1501,12 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + dst_sectors = !p.ptr.cached + ? &u.dirty_sectors + : &u.cached_sectors; ++ orig_sectors = *dst_sectors; + +- overflow = checked_add(*dst_sectors, sectors); +- +- if (overflow) { ++ if (checked_add(*dst_sectors, sectors)) { + bch2_fs_inconsistent(c, + "bucket sector count overflow: %u + %lli > U16_MAX", +- *dst_sectors, sectors); ++ orig_sectors, sectors); + /* return an error indicating that we need full fsck */ + ret = -EIO; + goto out; +-- +cgit v1.2.3 + + +From 0454aa5ad2f4eb8f0013adcc63471dd0f67b44c6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 27 Feb 2020 15:03:53 -0500 +Subject: bcachefs: Dont't del sysfs dir until after we go RO + +This will help for debugging hangs during unmount + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 73f123bee264..fae43f3c338c 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -510,6 +510,10 @@ void bch2_fs_stop(struct bch_fs *c) + + cancel_work_sync(&c->journal_seq_blacklist_gc_work); + ++ mutex_lock(&c->state_lock); ++ bch2_fs_read_only(c); ++ mutex_unlock(&c->state_lock); ++ + for_each_member_device(ca, c, i) + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) +@@ -532,10 +536,6 @@ void bch2_fs_stop(struct bch_fs *c) + closure_sync(&c->cl); + closure_debug_destroy(&c->cl); + +- mutex_lock(&c->state_lock); +- bch2_fs_read_only(c); +- mutex_unlock(&c->state_lock); +- + /* btree prefetch might have kicked off reads in the background: */ + bch2_btree_flush_all_reads(c); + +-- +cgit v1.2.3 + + +From 0eee841fff17919b112c8be0a2ba2e2a72ec9022 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 27 Feb 2020 15:03:44 -0500 +Subject: bcachefs: Journal pin cleanups + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 12 +++--- + fs/bcachefs/btree_update_leaf.c | 17 +++----- + fs/bcachefs/journal_reclaim.c | 86 +++++++++++++------------------------ + fs/bcachefs/journal_reclaim.h | 26 +++++++---- + 4 files changed, 59 insertions(+), 82 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 48189417f492..9180d2da3749 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -867,8 +867,8 @@ static void btree_update_reparent(struct btree_update *as, + * just transfer the journal pin to the new interior update so + * btree_update_nodes_written() can drop it. 
+ */ +- bch2_journal_pin_add_if_older(&c->journal, &child->journal, +- &as->journal, interior_update_flush); ++ bch2_journal_pin_copy(&c->journal, &as->journal, ++ &child->journal, interior_update_flush); + bch2_journal_pin_drop(&c->journal, &child->journal); + + as->journal_seq = max(as->journal_seq, child->journal_seq); +@@ -1049,13 +1049,13 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + * oldest pin of any of the nodes we're freeing. We'll release the pin + * when the new nodes are persistent and reachable on disk: + */ +- bch2_journal_pin_add_if_older(&c->journal, &w->journal, +- &as->journal, interior_update_flush); ++ bch2_journal_pin_copy(&c->journal, &as->journal, ++ &w->journal, interior_update_flush); + bch2_journal_pin_drop(&c->journal, &w->journal); + + w = btree_prev_write(b); +- bch2_journal_pin_add_if_older(&c->journal, &w->journal, +- &as->journal, interior_update_flush); ++ bch2_journal_pin_copy(&c->journal, &as->journal, ++ &w->journal, interior_update_flush); + bch2_journal_pin_drop(&c->journal, &w->journal); + + mutex_unlock(&c->btree_interior_update_lock); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 1d94fb94a570..59a1175c5411 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -172,6 +172,9 @@ void bch2_btree_journal_key(struct btree_trans *trans, + struct journal *j = &c->journal; + struct btree *b = iter->l[0].b; + struct btree_write *w = btree_current_write(b); ++ u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) ++ ? trans->journal_res.seq ++ : j->replay_journal_seq; + + EBUG_ON(iter->level || b->level); + EBUG_ON(trans->journal_res.ref != +@@ -183,16 +186,10 @@ void bch2_btree_journal_key(struct btree_trans *trans, + cpu_to_le64(trans->journal_res.seq); + } + +- if (unlikely(!journal_pin_active(&w->journal))) { +- u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) +- ? trans->journal_res.seq +- : j->replay_journal_seq; +- +- bch2_journal_pin_add(j, seq, &w->journal, +- btree_node_write_idx(b) == 0 +- ? btree_node_flush0 +- : btree_node_flush1); +- } ++ bch2_journal_pin_add(j, seq, &w->journal, ++ btree_node_write_idx(b) == 0 ++ ? btree_node_flush0 ++ : btree_node_flush1); + + if (unlikely(!btree_node_dirty(b))) + set_btree_node_dirty(b); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 695b2c8ba03b..db3afd908474 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -290,38 +290,6 @@ void bch2_journal_pin_put(struct journal *j, u64 seq) + } + } + +-static inline void __journal_pin_add(struct journal *j, +- u64 seq, +- struct journal_entry_pin *pin, +- journal_pin_flush_fn flush_fn) +-{ +- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); +- +- BUG_ON(journal_pin_active(pin)); +- BUG_ON(!atomic_read(&pin_list->count)); +- +- atomic_inc(&pin_list->count); +- pin->seq = seq; +- pin->flush = flush_fn; +- +- list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); +- +- /* +- * If the journal is currently full, we might want to call flush_fn +- * immediately: +- */ +- journal_wake(j); +-} +- +-void bch2_journal_pin_add(struct journal *j, u64 seq, +- struct journal_entry_pin *pin, +- journal_pin_flush_fn flush_fn) +-{ +- spin_lock(&j->lock); +- __journal_pin_add(j, seq, pin, flush_fn); +- spin_unlock(&j->lock); +-} +- + static inline void __journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) + { +@@ -354,42 +322,46 @@ void bch2_journal_pin_drop(struct journal *j, + spin_unlock(&j->lock); + } + +-void bch2_journal_pin_update(struct journal *j, u64 seq, +- struct journal_entry_pin *pin, +- journal_pin_flush_fn flush_fn) ++void __bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) + { ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ + spin_lock(&j->lock); + +- if (pin->seq != seq) { +- __journal_pin_drop(j, pin); +- __journal_pin_add(j, seq, pin, flush_fn); +- } else { +- struct journal_entry_pin_list *pin_list = +- journal_seq_pin(j, seq); ++ __journal_pin_drop(j, pin); ++ ++ BUG_ON(!atomic_read(&pin_list->count)); + +- list_move(&pin->list, &pin_list->list); +- } ++ atomic_inc(&pin_list->count); ++ pin->seq = seq; ++ pin->flush = flush_fn; ++ ++ list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); + + spin_unlock(&j->lock); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); + } + +-void bch2_journal_pin_add_if_older(struct journal *j, +- struct journal_entry_pin *src_pin, +- struct journal_entry_pin *pin, +- journal_pin_flush_fn flush_fn) ++void bch2_journal_pin_copy(struct journal *j, ++ struct journal_entry_pin *dst, ++ struct journal_entry_pin *src, ++ journal_pin_flush_fn flush_fn) + { +- spin_lock(&j->lock); +- +- if (journal_pin_active(src_pin) && +- (!journal_pin_active(pin) || +- src_pin->seq < pin->seq)) { +- __journal_pin_drop(j, pin); +- __journal_pin_add(j, src_pin->seq, pin, flush_fn); +- } +- +- spin_unlock(&j->lock); ++ if (journal_pin_active(src) && ++ (!journal_pin_active(dst) || src->seq < dst->seq)) ++ __bch2_journal_pin_add(j, src->seq, dst, flush_fn); + } + ++/** ++ * bch2_journal_pin_flush: ensure journal pin callback is no longer running ++ */ + void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) + { + BUG_ON(journal_pin_active(pin)); +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index 9bf982a17797..883a0a5680af 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -29,16 +29,24 @@ journal_seq_pin(struct journal *j, u64 seq) + } + + void bch2_journal_pin_put(struct journal *, u64); +- +-void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, +- journal_pin_flush_fn); +-void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *, +- journal_pin_flush_fn); + void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); +-void bch2_journal_pin_add_if_older(struct journal *, +- struct journal_entry_pin *, +- struct journal_entry_pin *, +- journal_pin_flush_fn); ++ ++void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++static inline void bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (unlikely(!journal_pin_active(pin))) ++ 
__bch2_journal_pin_add(j, seq, pin, flush_fn); ++} ++ ++void bch2_journal_pin_copy(struct journal *, ++ struct journal_entry_pin *, ++ struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ + void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); + + void bch2_journal_do_discards(struct journal *); +-- +cgit v1.2.3 + + +From d6869e0aee6f871c416267b92711912a657482cd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 2 Mar 2020 13:38:19 -0500 +Subject: bcachefs: Some btree iterator improvements + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 83 ++++++++++++++++++++++-------------------------- + fs/bcachefs/tests.c | 46 ++++++++++++++++----------- + 2 files changed, 65 insertions(+), 64 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 321fe306cc04..024a931b3e60 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -35,6 +35,26 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) + return pos; + } + ++static inline bool btree_iter_pos_before_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return bkey_cmp(iter->pos, b->data->min_key) < 0; ++} ++ ++static inline bool btree_iter_pos_after_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; ++} ++ ++static inline bool btree_iter_pos_in_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return iter->btree_id == b->btree_id && ++ !btree_iter_pos_before_node(iter, b) && ++ !btree_iter_pos_after_node(iter, b); ++} ++ + /* Btree node locking: */ + + void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) +@@ -399,6 +419,8 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, + if (iter->uptodate > BTREE_ITER_NEED_PEEK) + return; + ++ BUG_ON(!btree_iter_pos_in_node(iter, b)); ++ + bch2_btree_node_iter_verify(&l->iter, b); + + /* +@@ -736,26 +758,6 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) + btree_node_unlock(iter, b->level + 1); + } + +-static inline bool btree_iter_pos_before_node(struct btree_iter *iter, +- struct btree *b) +-{ +- return bkey_cmp(iter->pos, b->data->min_key) < 0; +-} +- +-static inline bool btree_iter_pos_after_node(struct btree_iter *iter, +- struct btree *b) +-{ +- return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; +-} +- +-static inline bool btree_iter_pos_in_node(struct btree_iter *iter, +- struct btree *b) +-{ +- return iter->btree_id == b->btree_id && +- !btree_iter_pos_before_node(iter, b) && +- !btree_iter_pos_after_node(iter, b); +-} +- + static inline void __btree_iter_init(struct btree_iter *iter, + unsigned level) + { +@@ -1373,6 +1375,10 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + return true; + } + ++/** ++ * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key ++ * it currently points to ++ */ + static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) + { + struct btree_iter_level *l = &iter->l[0]; +@@ -1409,7 +1415,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + +- if (iter->uptodate == BTREE_ITER_UPTODATE) ++ if (iter->uptodate == BTREE_ITER_UPTODATE && ++ !bkey_deleted(&iter->k)) + return btree_iter_peek_uptodate(iter); + + while (1) { +@@ -1503,7 +1510,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + + bch2_btree_iter_checks(iter, 
BTREE_ITER_KEYS); + +- if (iter->uptodate == BTREE_ITER_UPTODATE) ++ if (iter->uptodate == BTREE_ITER_UPTODATE && ++ !bkey_deleted(&iter->k)) + return btree_iter_peek_uptodate(iter); + + while (1) { +@@ -1655,33 +1663,15 @@ __bch2_btree_iter_peek_slot(struct btree_iter *iter) + { + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; +- int ret; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return __bch2_btree_iter_peek_slot_extents(iter); + +-recheck: +- while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && +- bkey_deleted(k.k) && +- bkey_cmp(k.k->p, iter->pos) == 0) +- bch2_btree_node_iter_advance(&l->iter, l->b); ++ k = __btree_iter_peek_all(iter, l, &iter->k); + +- /* +- * If we got to the end of the node, check if we need to traverse to the +- * next node: +- */ +- if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- ret = bch2_btree_iter_traverse(iter); +- if (unlikely(ret)) +- return bkey_s_c_err(ret); ++ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); + +- goto recheck; +- } +- +- if (!k.k || +- bkey_deleted(k.k) || +- bkey_cmp(iter->pos, k.k->p)) { ++ if (!k.k || bkey_cmp(iter->pos, k.k->p)) { + /* hole */ + bkey_init(&iter->k); + iter->k.p = iter->pos; +@@ -1713,8 +1703,12 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + { + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + ++ /* XXX directly setting iter->pos is wrong */ + iter->pos = btree_type_successor(iter->btree_id, iter->k.p); + ++ if (unlikely(btree_iter_pos_after_node(iter, iter->l[0].b))) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + /* + * XXX: when we just need to relock we should be able to avoid +@@ -1726,8 +1720,7 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + return bch2_btree_iter_peek_slot(iter); + } + +- if (!bkey_deleted(&iter->k)) +- bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b); ++ btree_iter_advance_to_pos(iter, &iter->l[0], -1); + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 9bc2c4a03c88..4dcace650416 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -18,7 +18,7 @@ static void delete_test_keys(struct bch_fs *c) + NULL); + BUG_ON(ret); + +- ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, ++ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, + POS(0, 0), POS(0, U64_MAX), + NULL); + BUG_ON(ret); +@@ -37,7 +37,7 @@ static void test_delete(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(iter); +@@ -69,7 +69,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(iter); +@@ -107,7 +107,7 @@ static void test_iterate(struct bch_fs *c, u64 nr) + bkey_cookie_init(&k.k_i); + k.k.p.offset = i; + +- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, ++ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, + NULL, NULL, 0); + BUG_ON(ret); + } +@@ -116,9 +116,13 @@ static void test_iterate(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, 
BTREE_ID_DIRENTS, +- POS_MIN, 0, k, ret) ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, ++ POS_MIN, 0, k, ret) { ++ if (k.k->p.inode) ++ break; ++ + BUG_ON(k.k->p.offset != i++); ++ } + + BUG_ON(i != nr); + +@@ -202,7 +206,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) + bkey_cookie_init(&k.k_i); + k.k.p.offset = i * 2; + +- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, ++ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, + NULL, NULL, 0); + BUG_ON(ret); + } +@@ -211,8 +215,11 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + 0, k, ret) { ++ if (k.k->p.inode) ++ break; ++ + BUG_ON(k.k->p.offset != i); + i += 2; + } +@@ -224,11 +231,12 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + BTREE_ITER_SLOTS, k, ret) { ++ BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); +- BUG_ON(k.k->p.offset != i++); + ++ i++; + if (i == nr * 2) + break; + } +@@ -307,7 +315,7 @@ static void test_peek_end(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); + + k = bch2_btree_iter_peek(iter); + BUG_ON(k.k); +@@ -421,7 +429,7 @@ static void rand_insert(struct bch_fs *c, u64 nr) + k.k.p.offset = test_rand(); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- __bch2_btree_insert(&trans, BTREE_ID_DIRENTS, &k.k_i)); ++ __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); + + BUG_ON(ret); + } +@@ -439,7 +447,7 @@ static void rand_lookup(struct bch_fs *c, u64 nr) + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i++) { +- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, + POS(0, test_rand()), 0); + + k = bch2_btree_iter_peek(iter); +@@ -460,7 +468,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i++) { +- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, + POS(0, test_rand()), 0); + + k = bch2_btree_iter_peek(iter); +@@ -490,7 +498,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) + struct bkey_s_c k; + int ret = 0; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_DIRENTS, pos, ++ iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) +@@ -542,7 +550,7 @@ static void seq_insert(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + insert.k.p = iter->pos; + +@@ -566,7 +574,7 @@ static void seq_lookup(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) + ; + bch2_trans_exit(&trans); + } +@@ -580,7 +588,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, 
POS_MIN, + BTREE_ITER_INTENT, k, ret) { + struct bkey_i_cookie u; + +@@ -598,7 +606,7 @@ static void seq_delete(struct bch_fs *c, u64 nr) + { + int ret; + +- ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, ++ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, + POS(0, 0), POS(0, U64_MAX), + NULL); + BUG_ON(ret); +-- +cgit v1.2.3 + + +From cedfd3d1041bd3239f5332b1b0aa9e2321a5a080 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 2 Mar 2020 17:08:19 -0500 +Subject: bcachefs: Fix extent_sort_fix_overlapping() + +Recently the extent update path started emmiting 0 size whiteouts on +extent overwrite, as part of transitioning to moving extent handling +out of the core btree code. + +Unfortunately, this broke the old code path that handles overlapping +extents when reading in btree nodes - it relies on sorting incomming +extents by start position, but the 0 size whiteouts broke that ordering. +Skipping over them before the main algorithm sees them fixes this. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 39 +++++++++++++++++++++++++++++++-------- + 1 file changed, 31 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 7cbb57042af1..68965a0f973a 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -311,6 +311,25 @@ static inline int extent_sort_fix_overlapping_cmp(struct btree *b, + cmp_int((unsigned long) r, (unsigned long) l); + } + ++/* ++ * The algorithm in extent_sort_fix_overlapping() relies on keys in the same ++ * bset being ordered by start offset - but 0 size whiteouts (which are always ++ * KEY_TYPE_deleted) break this ordering, so we need to skip over them: ++ */ ++static void extent_iter_advance(struct sort_iter *iter, unsigned idx) ++{ ++ struct sort_iter_set *i = iter->data + idx; ++ ++ do { ++ i->k = bkey_next_skip_noops(i->k, i->end); ++ } while (i->k != i->end && bkey_deleted(i->k)); ++ ++ if (i->k == i->end) ++ array_remove_item(iter->data, iter->used, idx); ++ else ++ __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); ++} ++ + struct btree_nr_keys + bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct sort_iter *iter) +@@ -323,19 +342,26 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct bkey_s l, r; + struct btree_nr_keys nr; + struct bkey_on_stack split; ++ unsigned i; + + memset(&nr, 0, sizeof(nr)); + bkey_on_stack_init(&split); + + sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); ++ for (i = 0; i < iter->used;) { ++ if (bkey_deleted(iter->data[i].k)) ++ __sort_iter_advance(iter, i, ++ extent_sort_fix_overlapping_cmp); ++ else ++ i++; ++ } + + while (!sort_iter_end(iter)) { + l = __bkey_disassemble(b, _l->k, &l_unpacked); + + if (iter->used == 1) { + extent_sort_append(c, f, &nr, dst->start, &prev, l); +- sort_iter_advance(iter, +- extent_sort_fix_overlapping_cmp); ++ extent_iter_advance(iter, 0); + continue; + } + +@@ -344,15 +370,13 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + /* If current key and next key don't overlap, just append */ + if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { + extent_sort_append(c, f, &nr, dst->start, &prev, l); +- sort_iter_advance(iter, +- extent_sort_fix_overlapping_cmp); ++ extent_iter_advance(iter, 0); + continue; + } + + /* Skip 0 size keys */ + if (!r.k->size) { +- __sort_iter_advance(iter, 1, +- extent_sort_fix_overlapping_cmp); ++ extent_iter_advance(iter, 1); + continue; + } + +@@ -369,8 +393,7 @@ 
bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + if (_l->k > _r->k) { + /* l wins, trim r */ + if (bkey_cmp(l.k->p, r.k->p) >= 0) { +- __sort_iter_advance(iter, 1, +- extent_sort_fix_overlapping_cmp); ++ extent_iter_advance(iter, 1); + } else { + bch2_cut_front_s(l.k->p, r); + extent_save(b, _r->k, r.k); +-- +cgit v1.2.3 + + +From 1ef4202c65c24dc142f893e3ff2777c23e82022d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 5 Mar 2020 17:06:15 -0500 +Subject: bcachefs: Fix off by one error in bch2_extent_crc_append() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 10feb856e314..cb88dd15a86c 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -345,7 +345,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, + crc_r.uncompressed_size > c->sb.encoded_extent_max) + return BCH_MERGE_NOMERGE; + +- if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > ++ if (crc_l.uncompressed_size + crc_r.uncompressed_size > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return BCH_MERGE_NOMERGE; + +@@ -563,15 +563,15 @@ void bch2_extent_crc_append(struct bkey_i *k, + enum bch_extent_entry_type type; + + if (bch_crc_bytes[new.csum_type] <= 4 && +- new.uncompressed_size - 1 <= CRC32_SIZE_MAX && ++ new.uncompressed_size <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && +- new.uncompressed_size - 1 <= CRC64_SIZE_MAX && ++ new.uncompressed_size <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && +- new.uncompressed_size - 1 <= CRC128_SIZE_MAX && ++ new.uncompressed_size <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc128; + else +-- +cgit v1.2.3 + + +From e48a5bd6a18ec92adf7c77e58d9ecbabcd63447e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 7 Mar 2020 13:30:55 -0500 +Subject: bcachefs: Fix another iterator leak + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 1871485c079d..3b9c20cf389a 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -424,6 +424,7 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + return PTR_ERR(iter); + + bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(trans, iter); + return 0; + } + +-- +cgit v1.2.3 + + +From 46fb1cb463ae8458c7b2b82c47f67ceb4a4dffc9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 7 Mar 2020 17:20:39 -0500 +Subject: bcachefs: Fix bch2_dump_bset() + +It's used in the write path when the bset isn't in the btree node +buffer. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 9dd59343f3a3..e7af1acb5cc1 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -79,8 +79,8 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) + _n = bkey_next_skip_noops(_k, vstruct_last(i)); + + bch2_bkey_to_text(&PBUF(buf), &k); +- printk(KERN_ERR "block %u key %5u: %s\n", set, +- __btree_node_key_to_offset(b, _k), buf); ++ printk(KERN_ERR "block %u key %5zu: %s\n", set, ++ _k->_data - i->_data, buf); + + if (_n == vstruct_last(i)) + continue; +-- +cgit v1.2.3 + + +From 7707a6412aee453ac494140cf349e04bf2c1b8f2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 9 Mar 2020 14:19:58 -0400 +Subject: bcachefs: Don't log errors that are expected during shutdown + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 3 ++- + fs/bcachefs/io.h | 5 +++-- + fs/bcachefs/move.c | 3 ++- + 3 files changed, 7 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index df419ad01cb0..cc70b1f388fd 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1220,7 +1220,8 @@ void bch2_write(struct closure *cl) + + if (c->opts.nochanges || + !percpu_ref_tryget(&c->writes)) { +- __bcache_io_error(c, "read only"); ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ __bcache_io_error(c, "read only"); + op->error = -EROFS; + goto err; + } +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 37f7fa6102fc..e45dcf9635ae 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -31,10 +31,11 @@ enum bch_write_flags { + BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), + BCH_WRITE_NOPUT_RESERVATION = (1 << 7), + BCH_WRITE_WROTE_DATA_INLINE = (1 << 8), ++ BCH_WRITE_FROM_INTERNAL = (1 << 9), + + /* Internal: */ +- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), +- BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), ++ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), ++ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), + }; + + static inline u64 *op_journal_seq(struct bch_write_op *op) +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index ecc74ebe0579..4afda95f4017 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -243,7 +243,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| + BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| +- BCH_WRITE_DATA_ENCODED; ++ BCH_WRITE_DATA_ENCODED| ++ BCH_WRITE_FROM_INTERNAL; + + m->op.nr_replicas = 1; + m->op.nr_replicas_required = 1; +-- +cgit v1.2.3 + + +From 2f72f5a7d7be3719e0e7aeb523f87ad2b31469e2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 9 Mar 2020 16:15:54 -0400 +Subject: bcachefs: Traverse iterator in journal replay + +This fixes a bug where we end up spinning in journal replay - in theory +this shouldn't be necessary though, transaction reset should be +re-traversing all iterators. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 3b9c20cf389a..712a6b1fd968 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -417,15 +417,17 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + enum btree_id id, struct bkey_i *k) + { + struct btree_iter *iter; ++ int ret; + + iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + +- bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(trans, iter); +- return 0; ++ return ret; + } + + static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, +-- +cgit v1.2.3 + + +From 0c92103857164ce8475292fab3fa609dc8fc3a7d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 5 Mar 2020 18:43:31 -0500 +Subject: bcachefs: Skip 0 size deleted extents in journal replay + +These are created by the new extent update path, but not used yet by the +recovery code and they break the existing recovery code, so we can just +skip them. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 712a6b1fd968..bd0edda7abf9 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -230,7 +230,11 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + goto err; + + list_for_each_entry(p, journal_entries, list) +- for_each_jset_key(k, _n, entry, &p->j) ++ for_each_jset_key(k, _n, entry, &p->j) { ++ if (bkey_deleted(&k->k) && ++ btree_node_type_is_extents(entry->btree_id)) ++ continue; ++ + keys.d[keys.nr++] = (struct journal_key) { + .btree_id = entry->btree_id, + .pos = bkey_start_pos(&k->k), +@@ -239,8 +243,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + keys.journal_seq_base, + .journal_offset = k->_data - p->j._data, + }; ++ } + +- sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); ++ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); + + i = keys.d; + while (i < keys.d + keys.nr) { +-- +cgit v1.2.3 + + +From 1a2c9a5d4a96eafd6dea53b7844d830cf7510e92 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 18 Feb 2020 16:17:55 -0500 +Subject: bcachefs: Iterator debug code improvements + +More aggressively checking iterator invariants, and fixing the resulting +bugs. Also greatly simplifying iter_next() and iter_next_slot() - they +were hyper optimized before, but the optimizations were getting too +brittle. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 6 +- + fs/bcachefs/btree_iter.c | 216 ++++++++++++++++++------------------ + fs/bcachefs/btree_iter.h | 10 +- + fs/bcachefs/btree_types.h | 3 +- + fs/bcachefs/btree_update_interior.c | 4 +- + 5 files changed, 120 insertions(+), 119 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index e7af1acb5cc1..a18b00a5ec90 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -1677,7 +1677,8 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, + struct bset_tree *t; + unsigned end = 0; + +- bch2_btree_node_iter_verify(iter, b); ++ if (btree_keys_expensive_checks(b)) ++ bch2_btree_node_iter_verify(iter, b); + + for_each_bset(b, t) { + k = bch2_bkey_prev_all(b, t, +@@ -1712,7 +1713,8 @@ found: + iter->data[0].k = __btree_node_key_to_offset(b, prev); + iter->data[0].end = end; + +- bch2_btree_node_iter_verify(iter, b); ++ if (btree_keys_expensive_checks(b)) ++ bch2_btree_node_iter_verify(iter, b); + return prev; + } + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 024a931b3e60..d2fa6517d77e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -405,23 +405,43 @@ void bch2_trans_unlock(struct btree_trans *trans) + + #ifdef CONFIG_BCACHEFS_DEBUG + +-static void __bch2_btree_iter_verify(struct btree_iter *iter, +- struct btree *b) ++static void bch2_btree_iter_verify_level(struct btree_iter *iter, ++ unsigned level) + { + struct bpos pos = btree_iter_search_key(iter); +- struct btree_iter_level *l = &iter->l[b->level]; ++ struct btree_iter_level *l = &iter->l[level]; + struct btree_node_iter tmp = l->iter; +- struct bkey_packed *k; ++ bool locked = btree_node_locked(iter, level); ++ struct bkey_packed *p, *k; ++ char buf1[100], buf2[100]; ++ const char *msg; + + if (!debug_check_iterators(iter->trans->c)) + return; + +- if (iter->uptodate > BTREE_ITER_NEED_PEEK) ++ BUG_ON(iter->level < iter->min_depth); ++ ++ if (!btree_iter_node(iter, level)) ++ return; ++ ++ if (!bch2_btree_node_relock(iter, level)) + return; + +- BUG_ON(!btree_iter_pos_in_node(iter, b)); ++ /* ++ * Ideally this invariant would always be true, and hopefully in the ++ * future it will be, but for now set_pos_same_leaf() breaks it: ++ */ ++ BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && ++ !btree_iter_pos_in_node(iter, l->b)); ++ ++ /* ++ * node iterators don't use leaf node iterator: ++ */ ++ if (btree_iter_type(iter) == BTREE_ITER_NODES && ++ level <= iter->min_depth) ++ goto unlock; + +- bch2_btree_node_iter_verify(&l->iter, b); ++ bch2_btree_node_iter_verify(&l->iter, l->b); + + /* + * For interior nodes, the iterator will have skipped past +@@ -430,46 +450,72 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, + * For extents, the iterator may have skipped past deleted keys (but not + * whiteouts) + */ +- k = b->level || btree_node_type_is_extents(iter->btree_id) +- ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) +- : bch2_btree_node_iter_prev_all(&tmp, b); +- if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) { +- char buf[100]; +- struct bkey uk = bkey_unpack_key(b, k); ++ p = level || btree_node_type_is_extents(iter->btree_id) ++ ? 
bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) ++ : bch2_btree_node_iter_prev_all(&tmp, l->b); ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + +- bch2_bkey_to_text(&PBUF(buf), &uk); +- panic("iterator should be before prev key:\n%s\n%llu:%llu\n", +- buf, iter->pos.inode, iter->pos.offset); ++ if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { ++ msg = "before"; ++ goto err; + } + +- k = bch2_btree_node_iter_peek_all(&l->iter, b); +- if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) { +- char buf[100]; +- struct bkey uk = bkey_unpack_key(b, k); ++ if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { ++ msg = "after"; ++ goto err; ++ } ++unlock: ++ if (!locked) ++ btree_node_unlock(iter, level); ++ return; ++err: ++ strcpy(buf1, "(none)"); ++ strcpy(buf2, "(none)"); ++ ++ if (p) { ++ struct bkey uk = bkey_unpack_key(l->b, p); ++ bch2_bkey_to_text(&PBUF(buf1), &uk); ++ } + +- bch2_bkey_to_text(&PBUF(buf), &uk); +- panic("iter should be after current key:\n" +- "iter pos %llu:%llu\n" +- "cur key %s\n", +- iter->pos.inode, iter->pos.offset, buf); ++ if (k) { ++ struct bkey uk = bkey_unpack_key(l->b, k); ++ bch2_bkey_to_text(&PBUF(buf2), &uk); + } ++ ++ panic("iterator should be %s key at level %u:\n" ++ "iter pos %s %llu:%llu\n" ++ "prev key %s\n" ++ "cur key %s\n", ++ msg, level, ++ iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>", ++ iter->pos.inode, iter->pos.offset, ++ buf1, buf2); + } + +-void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) ++static void bch2_btree_iter_verify(struct btree_iter *iter) + { +- struct btree_iter *linked; ++ unsigned i; + +- if (!debug_check_iterators(iter->trans->c)) ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ bch2_btree_iter_verify_level(iter, i); ++} ++ ++void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) ++{ ++ struct btree_iter *iter; ++ ++ if (!debug_check_iterators(trans->c)) + return; + +- trans_for_each_iter_with_node(iter->trans, b, linked) +- __bch2_btree_iter_verify(linked, b); ++ trans_for_each_iter_with_node(trans, b, iter) ++ bch2_btree_iter_verify_level(iter, b->level); + } + + #else + +-static inline void __bch2_btree_iter_verify(struct btree_iter *iter, +- struct btree *b) {} ++static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned) {} + + #endif + +@@ -514,7 +560,7 @@ void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + + trans_for_each_iter_with_node(iter->trans, b, linked) { + __bch2_btree_iter_fix_key_modified(linked, b, where); +- __bch2_btree_iter_verify(linked, b); ++ bch2_btree_iter_verify_level(linked, b->level); + } + } + +@@ -641,14 +687,16 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, + if (node_iter != &iter->l[b->level].iter) { + __bch2_btree_node_iter_fix(iter, b, node_iter, t, + where, clobber_u64s, new_u64s); +- bch2_btree_node_iter_verify(node_iter, b); ++ ++ if (debug_check_iterators(iter->trans->c)) ++ bch2_btree_node_iter_verify(node_iter, b); + } + + trans_for_each_iter_with_node(iter->trans, b, linked) { + __bch2_btree_node_iter_fix(linked, b, + &linked->l[b->level].iter, t, + where, clobber_u64s, new_u64s); +- __bch2_btree_iter_verify(linked, b); ++ bch2_btree_iter_verify_level(linked, b->level); + } + } + +@@ -1134,9 +1182,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter) + + iter->uptodate = BTREE_ITER_NEED_PEEK; + +- bch2_btree_trans_verify_locks(iter->trans); +- if (btree_iter_node(iter, iter->level)) +- __bch2_btree_iter_verify(iter, 
iter->l[iter->level].b); ++ bch2_btree_iter_verify(iter); + return 0; + } + +@@ -1156,12 +1202,10 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, + enum btree_iter_type type) + { + EBUG_ON(iter->btree_id >= BTREE_ID_NR); +- EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != +- (btree_node_type_is_extents(iter->btree_id) && +- type != BTREE_ITER_NODES)); + EBUG_ON(btree_iter_type(iter) != type); + +- bch2_btree_trans_verify_locks(iter->trans); ++ bch2_btree_iter_verify_locks(iter); ++ bch2_btree_iter_verify_level(iter, iter->level); + } + + /* Iterate across nodes (leaf and interior nodes) */ +@@ -1189,10 +1233,12 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + iter->pos = b->key.k.p; + iter->uptodate = BTREE_ITER_UPTODATE; + ++ bch2_btree_iter_verify(iter); ++ + return b; + } + +-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) ++struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + { + struct btree *b; + int ret; +@@ -1238,7 +1284,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) + iter->pos = iter->btree_id == BTREE_ID_INODES + ? btree_type_successor(iter->btree_id, iter->pos) + : bkey_successor(iter->pos); +- iter->level = depth; ++ iter->level = iter->min_depth; + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + ret = bch2_btree_iter_traverse(iter); +@@ -1251,6 +1297,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) + iter->pos = b->key.k.p; + iter->uptodate = BTREE_ITER_UPTODATE; + ++ bch2_btree_iter_verify(iter); ++ + return b; + } + +@@ -1441,6 +1489,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + iter->pos = bkey_start_pos(k.k); + + iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify_level(iter, 0); + return k; + } + +@@ -1450,52 +1500,16 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + { +- struct btree_iter_level *l = &iter->l[0]; +- struct bkey_packed *p; +- struct bkey_s_c k; ++ struct bpos next = iter->k.p; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + +- if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { +- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) +- return bkey_s_c_null; +- +- /* +- * XXX: when we just need to relock we should be able to avoid +- * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK +- * for that to work +- */ +- iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ if (bkey_cmp(next, POS_MAX)) ++ next = btree_type_successor(iter->btree_id, next); + +- bch2_btree_iter_set_pos(iter, +- btree_type_successor(iter->btree_id, iter->k.p)); ++ bch2_btree_iter_set_pos(iter, next); + +- return bch2_btree_iter_peek(iter); +- } +- +- if (unlikely(bkey_deleted(&iter->k))) { +- /* +- * we're currently pointed at a hole, because previously we were +- * iterating over slots: +- */ +- return bch2_btree_iter_peek(iter); +- } +- +- do { +- bch2_btree_node_iter_advance(&l->iter, l->b); +- p = bch2_btree_node_iter_peek_all(&l->iter, l->b); +- } while (likely(p) && bkey_whiteout(p)); +- +- if (unlikely(!p)) +- return btree_iter_set_pos_to_next_leaf(iter) +- ? 
bch2_btree_iter_peek(iter) +- : bkey_s_c_null; +- +- k = __btree_iter_unpack(iter, l, &iter->k, p); +- +- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0); +- iter->pos = bkey_start_pos(k.k); +- return k; ++ return bch2_btree_iter_peek(iter); + } + + /** +@@ -1609,7 +1623,7 @@ recheck: + EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); + iter->uptodate = BTREE_ITER_UPTODATE; + +- __bch2_btree_iter_verify(iter, l->b); ++ bch2_btree_iter_verify_level(iter, 0); + return k; + } + +@@ -1654,7 +1668,7 @@ recheck: + iter->k = n; + iter->uptodate = BTREE_ITER_UPTODATE; + +- __bch2_btree_iter_verify(iter, l->b); ++ bch2_btree_iter_verify_level(iter, 0); + return (struct bkey_s_c) { &iter->k, NULL }; + } + +@@ -1679,7 +1693,7 @@ __bch2_btree_iter_peek_slot(struct btree_iter *iter) + } + + iter->uptodate = BTREE_ITER_UPTODATE; +- __bch2_btree_iter_verify(iter, l->b); ++ bch2_btree_iter_verify_level(iter, 0); + return k; + } + +@@ -1703,28 +1717,10 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + { + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + +- /* XXX directly setting iter->pos is wrong */ +- iter->pos = btree_type_successor(iter->btree_id, iter->k.p); +- +- if (unlikely(btree_iter_pos_after_node(iter, iter->l[0].b))) +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- +- if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { +- /* +- * XXX: when we just need to relock we should be able to avoid +- * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK +- * for that to work +- */ +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- +- return bch2_btree_iter_peek_slot(iter); +- } +- +- btree_iter_advance_to_pos(iter, &iter->l[0], -1); ++ bch2_btree_iter_set_pos(iter, ++ btree_type_successor(iter->btree_id, iter->k.p)); + +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +- +- return __bch2_btree_iter_peek_slot(iter); ++ return bch2_btree_iter_peek_slot(iter); + } + + static inline void bch2_btree_iter_init(struct btree_trans *trans, +@@ -1746,6 +1742,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + iter->btree_id = btree_id; + iter->level = 0; ++ iter->min_depth = 0; + iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; + iter->nodes_locked = 0; + iter->nodes_intent_locked = 0; +@@ -2011,6 +2008,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + + iter->locks_want = locks_want; + iter->level = depth; ++ iter->min_depth = depth; + + for (i = 0; i < ARRAY_SIZE(iter->l); i++) + iter->l[i].b = NULL; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index a95d0f13c65d..2ca7845143ad 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -96,11 +96,11 @@ __trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, + (_iter)->idx + 1)) + + #ifdef CONFIG_BCACHEFS_DEBUG +-void bch2_btree_iter_verify(struct btree_iter *, struct btree *); ++void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); + void bch2_btree_trans_verify_locks(struct btree_trans *); + #else +-static inline void bch2_btree_iter_verify(struct btree_iter *iter, +- struct btree *b) {} ++static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, ++ struct btree *b) {} + static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} + #endif + +@@ -154,7 +154,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) + int bch2_btree_iter_traverse_all(struct btree_trans *); + + struct btree *bch2_btree_iter_peek_node(struct btree_iter *); +-struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); ++struct btree *bch2_btree_iter_next_node(struct btree_iter *); + + struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +@@ -231,7 +231,7 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) + _start, _locks_want, _depth, _flags), \ + _b = bch2_btree_iter_peek_node(_iter); \ + (_b); \ +- (_b) = bch2_btree_iter_next_node(_iter, _depth)) ++ (_b) = bch2_btree_iter_next_node(_iter)) + + #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _flags, _b) \ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 20757d0c3e53..435e61b5cb27 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -235,9 +235,10 @@ struct btree_iter { + u16 flags; + u8 idx; + +- enum btree_iter_uptodate uptodate:4; + enum btree_id btree_id:4; ++ enum btree_iter_uptodate uptodate:4; + unsigned level:4, ++ min_depth:4, + locks_want:4, + nodes_locked:4, + nodes_intent_locked:4; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 9180d2da3749..65c83fc3145e 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1557,7 +1557,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + trans_for_each_iter_with_node(iter->trans, b, linked) + bch2_btree_node_iter_peek(&linked->l[b->level].iter, b); + +- bch2_btree_iter_verify(iter, b); ++ bch2_btree_trans_verify_iters(iter->trans, b); + } + + /** +@@ -1827,7 +1827,7 @@ retry: + + bch2_btree_iter_node_replace(iter, n); + +- bch2_btree_iter_verify(iter, n); ++ bch2_btree_trans_verify_iters(trans, n); + + bch2_btree_node_free_inmem(c, b, iter); + bch2_btree_node_free_inmem(c, m, iter); +-- +cgit v1.2.3 + + +From da632fc80fc81e2c958f0892bcbe9a6596278163 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 13 Mar 2020 21:41:22 -0400 +Subject: bcachefs: Simplify bch2_btree_iter_peek_slot() + +--- + fs/bcachefs/btree_iter.c | 76 +++++++++++++++--------------------------------- + 1 file changed, 24 insertions(+), 52 deletions(-) + +diff --git 
a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index d2fa6517d77e..040cb7ed9ec9 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1595,8 +1595,17 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) + struct bkey n; + int ret; + +-recheck: +- btree_iter_advance_to_pos(iter, l, -1); ++ /* keys & holes can't span inode numbers: */ ++ if (iter->pos.offset == KEY_OFFSET_MAX) { ++ if (iter->pos.inode == KEY_INODE_MAX) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ } + + /* + * iterator is now at the correct position for inserting at iter->pos, +@@ -1610,47 +1619,17 @@ recheck: + + if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { + /* +- * If there wasn't actually a hole, want the iterator to be +- * pointed at the key we found: +- * +- * XXX: actually, we shouldn't be changing the iterator here: +- * the iterator needs to be correct for inserting at iter->pos, +- * and there may be whiteouts between iter->pos and what this +- * iterator points at: ++ * We're not setting iter->uptodate because the node iterator ++ * doesn't necessarily point at the key we're returning: + */ +- l->iter = node_iter; + + EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); +- iter->uptodate = BTREE_ITER_UPTODATE; +- + bch2_btree_iter_verify_level(iter, 0); + return k; + } + +- /* +- * If we got to the end of the node, check if we need to traverse to the +- * next node: +- */ +- if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- ret = bch2_btree_iter_traverse(iter); +- if (unlikely(ret)) +- return bkey_s_c_err(ret); +- +- goto recheck; +- } +- + /* hole */ + +- /* holes can't span inode numbers: */ +- if (iter->pos.offset == KEY_OFFSET_MAX) { +- if (iter->pos.inode == KEY_INODE_MAX) +- return bkey_s_c_null; +- +- iter->pos = bkey_successor(iter->pos); +- goto recheck; +- } +- + if (!k.k) + k.k = &l->b->key.k; + +@@ -1672,11 +1651,20 @@ recheck: + return (struct bkey_s_c) { &iter->k, NULL }; + } + +-static inline struct bkey_s_c +-__bch2_btree_iter_peek_slot(struct btree_iter *iter) ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + { + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; ++ int ret; ++ ++ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return btree_iter_peek_uptodate(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return __bch2_btree_iter_peek_slot_extents(iter); +@@ -1697,22 +1685,6 @@ __bch2_btree_iter_peek_slot(struct btree_iter *iter) + return k; + } + +-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +-{ +- int ret; +- +- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); +- +- if (iter->uptodate == BTREE_ITER_UPTODATE) +- return btree_iter_peek_uptodate(iter); +- +- ret = bch2_btree_iter_traverse(iter); +- if (unlikely(ret)) +- return bkey_s_c_err(ret); +- +- return __bch2_btree_iter_peek_slot(iter); +-} +- + struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + { + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); +-- +cgit v1.2.3 + + +From 046548d35586d1bf3319fc3a97fd6b4c5a27db33 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 18 Feb 2020 16:17:55 -0500 +Subject: bcachefs: More btree iter invariants + +Ensure that 
iter->pos always lies between the start and end of iter->k +(the last key returned). Also, bch2_btree_iter_set_pos() now invalidates +the key that peek() or next() returned. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 142 ++++++++++++++++++----------------------------- + fs/bcachefs/buckets.c | 8 ++- + 2 files changed, 59 insertions(+), 91 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 040cb7ed9ec9..ecdaa61f4f75 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -659,19 +659,8 @@ fixup_done: + + if (!b->level && + node_iter == &iter->l[0].iter && +- iter_current_key_modified) { +- struct bkey_packed *k = +- bch2_btree_node_iter_peek_all(node_iter, b); +- +- if (likely(k)) { +- bkey_disassemble(b, k, &iter->k); +- } else { +- /* XXX: for extents, calculate size of hole? */ +- iter->k.type = KEY_TYPE_deleted; +- } +- ++ iter_current_key_modified) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +- } + } + + void bch2_btree_node_iter_fix(struct btree_iter *iter, +@@ -1204,6 +1193,10 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, + EBUG_ON(iter->btree_id >= BTREE_ID_NR); + EBUG_ON(btree_iter_type(iter) != type); + ++ BUG_ON(type == BTREE_ITER_KEYS && ++ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || ++ bkey_cmp(iter->pos, iter->k.p) > 0)); ++ + bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_verify_level(iter, iter->level); + } +@@ -1313,7 +1306,8 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ + EBUG_ON(!btree_node_locked(iter, 0)); + EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); + +- iter->pos = new_pos; ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + + btree_iter_advance_to_pos(iter, l, -1); +@@ -1323,9 +1317,14 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + } + +-static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) ++static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) + { +- unsigned l = btree_iter_up_until_good_node(iter, cmp); ++ unsigned l = iter->level; ++ ++ if (!cmp) ++ goto out; ++ ++ l = btree_iter_up_until_good_node(iter, cmp); + + if (btree_iter_node(iter, l)) { + /* +@@ -1342,85 +1341,71 @@ static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) + if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, l); + } +- +- return l; ++out: ++ if (l != iter->level) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ else ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } + + void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, + bool strictly_greater) + { + struct bpos old = btree_iter_search_key(iter); +- unsigned l; + int cmp; + + iter->flags &= ~BTREE_ITER_IS_EXTENTS; + iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; +- iter->pos = new_pos; + +- cmp = bkey_cmp(btree_iter_search_key(iter), old); +- if (!cmp) +- return; ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; + +- l = btree_iter_pos_changed(iter, cmp); ++ cmp = bkey_cmp(btree_iter_search_key(iter), old); + +- if (l != iter->level) +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- else +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++ btree_iter_pos_changed(iter, cmp); + } + + void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + { + int cmp = bkey_cmp(new_pos, iter->pos); +- unsigned l; +- +- if (!cmp) +- return; + +- iter->pos = new_pos; +- +- l = btree_iter_pos_changed(iter, cmp); ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; + +- if (l != iter->level) +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- else +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++ btree_iter_pos_changed(iter, cmp); + } + + static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) + { + struct btree_iter_level *l = &iter->l[0]; ++ bool ret; + +- iter->pos = l->b->key.k.p; +- iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = l->b->key.k.p; + +- if (!bkey_cmp(iter->pos, POS_MAX)) { +- bkey_init(&iter->k); +- iter->k.p = POS_MAX; +- return false; +- } ++ ret = bkey_cmp(iter->pos, POS_MAX) != 0; ++ if (ret) ++ iter->k.p = iter->pos = btree_type_successor(iter->btree_id, iter->pos); + +- iter->pos = btree_type_successor(iter->btree_id, iter->pos); + btree_iter_pos_changed(iter, 1); +- return true; ++ return ret; + } + + static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + { + struct btree_iter_level *l = &iter->l[0]; ++ bool ret; + +- iter->pos = l->b->data->min_key; ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = l->b->data->min_key; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + +- if (!bkey_cmp(iter->pos, POS_MIN)) { +- bkey_init(&iter->k); +- iter->k.p = POS_MIN; +- return false; +- } ++ ret = bkey_cmp(iter->pos, POS_MIN) != 0; ++ if (ret) ++ iter->k.p = iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); + +- iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); + btree_iter_pos_changed(iter, -1); +- return true; ++ return ret; + } + + /** +@@ -1500,14 +1485,11 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + { +- struct bpos next = iter->k.p; +- +- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); +- +- if (bkey_cmp(next, POS_MAX)) +- next = btree_type_successor(iter->btree_id, next); ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; + +- bch2_btree_iter_set_pos(iter, next); ++ bch2_btree_iter_set_pos(iter, ++ btree_type_successor(iter->btree_id, iter->k.p)); + + return bch2_btree_iter_peek(iter); + } +@@ -1518,6 +1500,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + { ++ struct bpos pos = iter->pos; + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + int ret; +@@ -1534,8 +1517,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + return bkey_s_c_err(ret); + + k = __btree_iter_peek(iter, l); +- if (!k.k || +- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) + k = __btree_iter_prev(iter, l); + + if (likely(k.k)) +@@ -1545,7 +1527,7 @@ struct bkey_s_c 
bch2_btree_iter_peek_prev(struct btree_iter *iter) + return bkey_s_c_null; + } + +- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); ++ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); + iter->pos = bkey_start_pos(k.k); + iter->uptodate = BTREE_ITER_UPTODATE; + return k; +@@ -1557,33 +1539,16 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) + { +- struct btree_iter_level *l = &iter->l[0]; +- struct bkey_s_c k; ++ struct bpos pos = bkey_start_pos(&iter->k); + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + +- if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { +- /* +- * XXX: when we just need to relock we should be able to avoid +- * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK +- * for that to work +- */ +- iter->pos = btree_type_predecessor(iter->btree_id, +- iter->pos); +- iter->uptodate = BTREE_ITER_NEED_TRAVERSE; +- +- return bch2_btree_iter_peek_prev(iter); +- } ++ if (unlikely(!bkey_cmp(pos, POS_MIN))) ++ return bkey_s_c_null; + +- k = __btree_iter_prev(iter, l); +- if (unlikely(!k.k)) +- return btree_iter_set_pos_to_prev_leaf(iter) +- ? bch2_btree_iter_peek(iter) +- : bkey_s_c_null; ++ bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); + +- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0); +- iter->pos = bkey_start_pos(k.k); +- return k; ++ return bch2_btree_iter_peek_prev(iter); + } + + static inline struct bkey_s_c +@@ -1687,7 +1652,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + + struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + { +- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 61831a498e16..bca077e1bdea 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1671,8 +1671,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + k.k->p.offset > idx + sectors)) + goto out; + +- bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); +- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ sectors = k.k->p.offset - idx; + + r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(r_v); +@@ -1689,9 +1688,12 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + set_bkey_val_u64s(&r_v->k, 0); + } + ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); ++ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ + bch2_trans_update(trans, iter, &r_v->k_i, 0); + out: +- ret = k.k->p.offset - idx; ++ ret = sectors; + err: + bch2_trans_iter_put(trans, iter); + return ret; +-- +cgit v1.2.3 + + +From 06f530c68d880f40f4e324108075b9356ccd2a41 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 15 Mar 2020 16:15:08 -0400 +Subject: bcachefs: Fix build when CONFIG_BCACHEFS_DEBUG=n + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ecdaa61f4f75..9094f9bfb299 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -261,7 +261,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + /* Btree iterator locking: */ + + #ifdef CONFIG_BCACHEFS_DEBUG +-void bch2_btree_iter_verify_locks(struct btree_iter *iter) ++static void bch2_btree_iter_verify_locks(struct btree_iter *iter) 
+ { + unsigned l; + +@@ -282,6 +282,8 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) + trans_for_each_iter(trans, iter) + bch2_btree_iter_verify_locks(iter); + } ++#else ++static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} + #endif + + __flatten +@@ -515,7 +517,8 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) + + #else + +-static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned) {} ++static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} ++static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} + + #endif + +-- +cgit v1.2.3 + + +From c5031802571727982c30dd50db22ffc66790c909 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 5 Mar 2020 18:44:59 -0500 +Subject: bcachefs: btree_iter_peek_with_updates() + +Introduce a new iterator method that provides a consistent view of the +btree plus uncommitted updates. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/btree_iter.h | 7 ++-- + 2 files changed, 87 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 9094f9bfb299..5896f015ea09 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -5,6 +5,7 @@ + #include "btree_cache.h" + #include "btree_iter.h" + #include "btree_locking.h" ++#include "btree_update.h" + #include "debug.h" + #include "extents.h" + +@@ -1497,6 +1498,88 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + return bch2_btree_iter_peek(iter); + } + ++static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct btree_trans *trans = iter->trans; ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: ++ bkey_cmp(pos, i->k->k.p)) <= 0) ++ break; ++ ++ return i < trans->updates + trans->nr_updates && ++ iter->btree_id == i->iter->btree_id ++ ? 
bkey_i_to_s_c(i->k) ++ : bkey_s_c_null; ++} ++ ++static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k = __btree_iter_peek(iter, l); ++ struct bkey_s_c u = __btree_trans_updates_peek(iter); ++ ++ if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) ++ return k; ++ if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { ++ iter->k = *u.k; ++ return u; ++ } ++ return bkey_s_c_null; ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __bch2_btree_iter_peek_with_updates(iter); ++ ++ if (k.k && bkey_deleted(k.k)) { ++ bch2_btree_iter_set_pos(iter, ++ btree_type_successor(iter->btree_id, iter->k.p)); ++ continue; ++ } ++ ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_next_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ /* ++ * iter->pos should always be equal to the key we just ++ * returned - except extents can straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || ++ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter->pos = bkey_start_pos(k.k); ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ return k; ++} ++ ++struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, ++ btree_type_successor(iter->btree_id, iter->k.p)); ++ ++ return bch2_btree_iter_peek_with_updates(iter); ++} ++ + /** + * bch2_btree_iter_peek_prev: returns first key less than or equal to + * iterator's current position +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 2ca7845143ad..6f51ef35db75 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -159,6 +159,9 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + ++struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); ++ + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + +@@ -199,9 +202,7 @@ static inline int __btree_iter_cmp(enum btree_id id, + struct bpos pos, + const struct btree_iter *r) + { +- if (id != r->btree_id) +- return id < r->btree_id ? -1 : 1; +- return bkey_cmp(pos, r->pos); ++ return cmp_int(id, r->btree_id) ?: bkey_cmp(pos, r->pos); + } + + static inline int btree_iter_cmp(const struct btree_iter *l, +-- +cgit v1.2.3 + + +From 9dfb86f5bf0f6cc33fdcc7da4d4ad985155810d0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Dec 2019 14:37:25 -0500 +Subject: bcachefs: Move extent overwrite handling out of core btree code + +Ever since the btree code was first written, handling of overwriting +existing extents - including partially overwriting and splittin existing +extents - was handled as part of the core btree insert path. The modern +transaction and iterator infrastructure didn't exist then, so that was +the only way for it to be done. + +This patch moves that outside of the core btree code to a pass that runs +at transaction commit time. 
+ +This is a significant simplification to the btree code and overall +reduction in code size, but more importantly it gets us much closer to +the core btree code being completely independent of extents and is +important prep work for snapshots. + +This introduces a new feature bit; the old and new extent update models +are incompatible when the filesystem needs journal replay. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 6 +- + fs/bcachefs/btree_gc.c | 57 ++--- + fs/bcachefs/btree_io.c | 17 +- + fs/bcachefs/btree_iter.c | 25 ++- + fs/bcachefs/btree_types.h | 3 + + fs/bcachefs/btree_update.h | 5 + + fs/bcachefs/btree_update_interior.h | 23 +- + fs/bcachefs/btree_update_leaf.c | 228 ++++++++++++++++---- + fs/bcachefs/buckets.c | 13 +- + fs/bcachefs/buckets.h | 2 +- + fs/bcachefs/extent_update.c | 410 +++--------------------------------- + fs/bcachefs/extent_update.h | 5 +- + fs/bcachefs/fsck.c | 56 +++++ + fs/bcachefs/recovery.c | 154 +++++--------- + fs/bcachefs/recovery.h | 2 - + 15 files changed, 404 insertions(+), 602 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 42dd1022477c..798f5c9ea164 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1311,12 +1311,14 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + x(inline_data, 8) \ + x(new_extent_overwrite, 9) \ + x(incompressible, 10) \ +- x(btree_ptr_v2, 11) ++ x(btree_ptr_v2, 11) \ ++ x(extents_above_btree_updates, 12) + + #define BCH_SB_FEATURES_ALL \ + ((1ULL << BCH_FEATURE_new_siphash)| \ + (1ULL << BCH_FEATURE_new_extent_overwrite)| \ +- (1ULL << BCH_FEATURE_btree_ptr_v2)) ++ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ ++ (1ULL << BCH_FEATURE_extents_above_btree_updates)) + + enum bch_sb_feature { + #define x(f, n) BCH_FEATURE_##f, +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 3705c41f5151..c5a0c0ed22a0 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -184,8 +184,16 @@ fsck_err: + return ret; + } + +-static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, +- u8 *max_stale, bool initial) ++static bool pos_in_journal_keys(struct journal_keys *journal_keys, ++ enum btree_id id, struct bpos pos) ++{ ++ struct journal_key *k = journal_key_search(journal_keys, id, pos); ++ ++ return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos); ++} ++ ++static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, ++ struct journal_keys *journal_keys, bool initial) + { + struct btree_node_iter iter; + struct bkey unpacked; +@@ -199,6 +207,10 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, + + for_each_btree_node_key_unpack(b, k, &iter, + &unpacked) { ++ if (!b->level && journal_keys && ++ pos_in_journal_keys(journal_keys, b->btree_id, k.k->p)) ++ continue; ++ + bch2_bkey_debugcheck(c, b, k); + + ret = bch2_gc_mark_key(c, k, max_stale, initial); +@@ -210,6 +222,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, + } + + static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, ++ struct journal_keys *journal_keys, + bool initial, bool metadata_only) + { + struct btree_trans trans; +@@ -237,7 +250,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + + gc_pos_set(c, gc_pos_btree_node(b)); + +- ret = btree_gc_mark_node(c, b, &max_stale, initial); ++ ret = btree_gc_mark_node(c, b, &max_stale, ++ journal_keys, initial); + if (ret) + break; + +@@ -279,36 +293,6 @@ static inline int 
btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) + (int) btree_id_to_gc_phase(r); + } + +-static int mark_journal_key(struct bch_fs *c, enum btree_id id, +- struct bkey_i *insert) +-{ +- struct btree_trans trans; +- struct btree_iter *iter; +- struct bkey_s_c k; +- u8 max_stale; +- int ret = 0; +- +- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true); +- if (ret) +- return ret; +- +- bch2_trans_init(&trans, c, 0, 0); +- +- for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), +- BTREE_ITER_SLOTS, k, ret) { +- percpu_down_read(&c->mark_lock); +- ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, +- BTREE_TRIGGER_GC| +- BTREE_TRIGGER_NOATOMIC); +- percpu_up_read(&c->mark_lock); +- +- if (!ret) +- break; +- } +- +- return bch2_trans_exit(&trans) ?: ret; +-} +- + static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, + bool initial, bool metadata_only) + { +@@ -323,18 +307,21 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, + enum btree_id id = ids[i]; + enum btree_node_type type = __btree_node_type(0, id); + +- int ret = bch2_gc_btree(c, id, initial, metadata_only); ++ int ret = bch2_gc_btree(c, id, journal_keys, ++ initial, metadata_only); + if (ret) + return ret; + + if (journal_keys && !metadata_only && + btree_node_type_needs_gc(type)) { + struct journal_key *j; ++ u8 max_stale; + int ret; + + for_each_journal_key(*journal_keys, j) + if (j->btree_id == id) { +- ret = mark_journal_key(c, id, j->k); ++ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k), ++ &max_stale, initial); + if (ret) + return ret; + } +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 0370c7821aa1..3f7c10420042 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -709,9 +709,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + unsigned *whiteout_u64s, int write, + bool have_retry) + { +- struct bkey_packed *k; +- struct bkey prev = KEY(0, 0, 0); +- struct bpos prev_data = POS_MIN; ++ struct bkey_packed *k, *prev = NULL; + bool seen_non_whiteout = false; + unsigned version; + const char *err; +@@ -853,15 +851,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + + if (!seen_non_whiteout && + (!bkey_whiteout(k) || +- (bkey_cmp(prev.p, bkey_start_pos(u.k)) > 0))) { ++ (prev && bkey_iter_cmp(b, prev, k) > 0))) { + *whiteout_u64s = k->_data - i->_data; + seen_non_whiteout = true; +- } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 || +- bkey_cmp(prev.p, u.k->p) > 0) { ++ } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { + char buf1[80]; + char buf2[80]; ++ struct bkey up = bkey_unpack_key(b, prev); + +- bch2_bkey_to_text(&PBUF(buf1), &prev); ++ bch2_bkey_to_text(&PBUF(buf1), &up); + bch2_bkey_to_text(&PBUF(buf2), u.k); + + bch2_dump_bset(b, i, 0); +@@ -871,10 +869,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + /* XXX: repair this */ + } + +- if (!bkey_deleted(u.k)) +- prev_data = u.k->p; +- prev = *u.k; +- ++ prev = k; + k = bkey_next_skip_noops(k, vstruct_last(i)); + } + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5896f015ea09..2819b9a487f2 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1504,12 +1504,12 @@ static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) + struct btree_trans *trans = iter->trans; + struct btree_insert_entry *i; + +- trans_for_each_update(trans, i) ++ trans_for_each_update2(trans, i) + if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: + 
bkey_cmp(pos, i->k->k.p)) <= 0) + break; + +- return i < trans->updates + trans->nr_updates && ++ return i < trans->updates2 + trans->nr_updates2 && + iter->btree_id == i->iter->btree_id + ? bkey_i_to_s_c(i->k) + : bkey_s_c_null; +@@ -1821,7 +1821,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, + static int bch2_trans_realloc_iters(struct btree_trans *trans, + unsigned new_size) + { +- void *new_iters, *new_updates; ++ void *p, *new_iters, *new_updates, *new_updates2; + size_t iters_bytes; + size_t updates_bytes; + +@@ -1839,21 +1839,27 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, + iters_bytes = sizeof(struct btree_iter) * new_size; + updates_bytes = sizeof(struct btree_insert_entry) * new_size; + +- new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS); +- if (new_iters) ++ p = kmalloc(iters_bytes + ++ updates_bytes + ++ updates_bytes, GFP_NOFS); ++ if (p) + goto success; + +- new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); ++ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + new_size = BTREE_ITER_MAX; + + trans->used_mempool = true; + success: +- new_updates = new_iters + iters_bytes; ++ new_iters = p; p += iters_bytes; ++ new_updates = p; p += updates_bytes; ++ new_updates2 = p; p += updates_bytes; + + memcpy(new_iters, trans->iters, + sizeof(struct btree_iter) * trans->nr_iters); + memcpy(new_updates, trans->updates, + sizeof(struct btree_insert_entry) * trans->nr_updates); ++ memcpy(new_updates2, trans->updates2, ++ sizeof(struct btree_insert_entry) * trans->nr_updates2); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + memset(trans->iters, POISON_FREE, +@@ -1865,6 +1871,7 @@ success: + + trans->iters = new_iters; + trans->updates = new_updates; ++ trans->updates2 = new_updates2; + trans->size = new_size; + + if (trans->iters_live) { +@@ -2126,6 +2133,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + + trans->need_reset = 0; + trans->nr_updates = 0; ++ trans->nr_updates2 = 0; + trans->mem_top = 0; + + if (trans->fs_usage_deltas) { +@@ -2156,6 +2164,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + trans->size = ARRAY_SIZE(trans->iters_onstack); + trans->iters = trans->iters_onstack; + trans->updates = trans->updates_onstack; ++ trans->updates2 = trans->updates2_onstack; + trans->fs_usage_deltas = NULL; + + if (expected_nr_iters > trans->size) +@@ -2193,5 +2202,5 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) + return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * nr + + sizeof(struct btree_insert_entry) * nr + +- sizeof(u8) * nr); ++ sizeof(struct btree_insert_entry) * nr); + } +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 435e61b5cb27..51d579a4ffae 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -280,6 +280,7 @@ struct btree_trans { + + u8 nr_iters; + u8 nr_updates; ++ u8 nr_updates2; + u8 size; + unsigned used_mempool:1; + unsigned error:1; +@@ -292,6 +293,7 @@ struct btree_trans { + + struct btree_iter *iters; + struct btree_insert_entry *updates; ++ struct btree_insert_entry *updates2; + + /* update path: */ + struct journal_res journal_res; +@@ -305,6 +307,7 @@ struct btree_trans { + + struct btree_iter iters_onstack[2]; + struct btree_insert_entry updates_onstack[2]; ++ struct btree_insert_entry updates2_onstack[2]; + }; + + #define BTREE_FLAG(flag) \ +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index d1cd839ac08f..12127a33906b 100644 +--- 
a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -132,4 +132,9 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + (_i) < (_trans)->updates + (_trans)->nr_updates; \ + (_i)++) + ++#define trans_for_each_update2(_trans, _i) \ ++ for ((_i) = (_trans)->updates2; \ ++ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ ++ (_i)++) ++ + #endif /* _BCACHEFS_BTREE_UPDATE_H */ +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 657b3d310e89..c90fcd48eeb7 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -303,18 +303,23 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, + } + + static inline void push_whiteout(struct bch_fs *c, struct btree *b, +- struct bkey_packed *k) ++ struct bpos pos) + { +- unsigned u64s = bkeyp_key_u64s(&b->format, k); +- struct bkey_packed *dst; ++ struct bkey_packed k; + +- BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b)); ++ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + +- b->whiteout_u64s += bkeyp_key_u64s(&b->format, k); +- dst = unwritten_whiteouts_start(c, b); +- memcpy_u64s(dst, k, u64s); +- dst->u64s = u64s; +- dst->type = KEY_TYPE_deleted; ++ if (!bkey_pack_pos(&k, pos, b)) { ++ struct bkey *u = (void *) &k; ++ ++ bkey_init(u); ++ u->p = pos; ++ } ++ ++ k.needs_whiteout = true; ++ ++ b->whiteout_u64s += k.u64s; ++ bkey_copy(unwritten_whiteouts_start(c, b), &k); + } + + /* +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 59a1175c5411..389b5ee203c4 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -23,11 +23,10 @@ + static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) + { +- return i != trans->updates && ++ return i != trans->updates2 && + i[0].iter->l[0].b == i[-1].iter->l[0].b; + } + +- + inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) + { +@@ -61,6 +60,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); + EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || + bkey_cmp(insert->k.p, b->data->max_key) > 0); ++ EBUG_ON(insert->k.u64s > ++ bch_btree_keys_u64s_remaining(iter->trans->c, b)); ++ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && bkey_cmp_packed(b, k, &insert->k)) +@@ -79,7 +81,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + k->type = KEY_TYPE_deleted; + + if (k->needs_whiteout) +- push_whiteout(iter->trans->c, b, k); ++ push_whiteout(iter->trans->c, b, insert->k.p); + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { +@@ -195,20 +197,6 @@ void bch2_btree_journal_key(struct btree_trans *trans, + set_btree_node_dirty(b); + } + +-static void bch2_insert_fixup_key(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) +-{ +- struct btree_iter_level *l = &iter->l[0]; +- +- EBUG_ON(iter->level); +- EBUG_ON(insert->k.u64s > +- bch_btree_keys_u64s_remaining(trans->c, l->b)); +- +- if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert))) +- bch2_btree_journal_key(trans, iter, insert); +-} +- + /** + * btree_insert_key - insert a key one key into a leaf node + */ +@@ -223,12 +211,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans, + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, 
u64s_added; + ++ EBUG_ON(iter->level); ++ + insert->k.needs_whiteout = false; + +- if (!btree_node_is_extents(b)) +- bch2_insert_fixup_key(trans, iter, insert); +- else +- bch2_insert_fixup_extent(trans, iter, insert); ++ if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert))) ++ bch2_btree_journal_key(trans, iter, insert); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; +@@ -254,12 +242,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct bch_fs *c = trans->c; + + BUG_ON(iter->level); +- BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos)); +- EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && +- bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0); +- ++ BUG_ON(bkey_cmp(insert->k.p, iter->pos)); + BUG_ON(debug_check_bkeys(c) && +- !bkey_deleted(&insert->k) && + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); + } + +@@ -312,9 +296,16 @@ btree_key_can_insert(struct btree_trans *trans, + if (unlikely(btree_node_fake(b))) + return BTREE_INSERT_BTREE_NODE_FULL; + ++ /* ++ * old bch2_extent_sort_fix_overlapping() algorithm won't work with new ++ * style extent updates: ++ */ ++ if (unlikely(btree_node_old_extent_overwrite(b))) ++ return BTREE_INSERT_BTREE_NODE_FULL; ++ + ret = !btree_node_is_extents(b) + ? BTREE_INSERT_OK +- : bch2_extent_can_insert(trans, iter, insert, u64s); ++ : bch2_extent_can_insert(trans, iter, insert); + if (ret) + return ret; + +@@ -383,7 +374,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + prefetch(&trans->c->journal.flags); + +- trans_for_each_update(trans, i) { ++ trans_for_each_update2(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; +@@ -422,10 +413,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (journal_seq_verify(c)) +- trans_for_each_update(trans, i) ++ trans_for_each_update2(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if (inject_invalid_keys(c)) +- trans_for_each_update(trans, i) ++ trans_for_each_update2(trans, i) + i->k->k.version = MAX_VERSION; + } + +@@ -448,7 +439,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + if (unlikely(c->gc_pos.phase)) + bch2_trans_mark_gc(trans); + +- trans_for_each_update(trans, i) ++ trans_for_each_update2(trans, i) + do_btree_insert_one(trans, i->iter, i->k); + err: + if (marking) { +@@ -469,7 +460,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + struct btree_iter *iter; + int ret; + +- trans_for_each_update(trans, i) ++ trans_for_each_update2(trans, i) + BUG_ON(!btree_node_intent_locked(i->iter, 0)); + + ret = bch2_journal_preres_get(&trans->c->journal, +@@ -497,18 +488,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) +- trans_for_each_update(trans, i) ++ trans_for_each_update2(trans, i) + btree_insert_entry_checks(trans, i->iter, i->k); + bch2_btree_trans_verify_locks(trans); + +- trans_for_each_update(trans, i) ++ trans_for_each_update2(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_lock_for_insert(trans->c, + i->iter->l[0].b, i->iter); + + ret = bch2_trans_commit_write_locked(trans, stopped_at); + +- trans_for_each_update(trans, i) ++ trans_for_each_update2(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, + i->iter); +@@ -525,14 +516,14 @@ static inline 
int do_bch2_trans_commit(struct btree_trans *trans, + if (trans->flags & BTREE_INSERT_NOUNLOCK) + trans->nounlock = true; + +- trans_for_each_update(trans, i) ++ trans_for_each_update2(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_foreground_maybe_merge(trans->c, i->iter, + 0, trans->flags); + + trans->nounlock = false; + +- trans_for_each_update(trans, i) ++ trans_for_each_update2(trans, i) + bch2_btree_iter_downgrade(i->iter); + + return 0; +@@ -655,6 +646,135 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + ++static void bch2_trans_update2(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct btree_insert_entry *i, n = (struct btree_insert_entry) { ++ .iter = iter, .k = insert ++ }; ++ ++ btree_insert_entry_checks(trans, n.iter, n.k); ++ ++ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ ++ EBUG_ON(trans->nr_updates2 >= trans->nr_iters); ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ trans_for_each_update2(trans, i) { ++ if (btree_iter_cmp(n.iter, i->iter) == 0) { ++ *i = n; ++ return; ++ } ++ ++ if (btree_iter_cmp(n.iter, i->iter) <= 0) ++ break; ++ } ++ ++ array_insert_item(trans->updates2, trans->nr_updates2, ++ i - trans->updates2, n); ++} ++ ++static int extent_update_to_keys(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert) ++{ ++ struct btree_iter *iter; ++ ++ if (bkey_deleted(&insert->k)) ++ return 0; ++ ++ iter = bch2_trans_copy_iter(trans, orig_iter); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ iter->flags |= BTREE_ITER_INTENT; ++ __bch2_btree_iter_set_pos(iter, insert->k.p, false); ++ bch2_trans_update2(trans, iter, insert); ++ bch2_trans_iter_put(trans, iter); ++ return 0; ++} ++ ++static int extent_handle_overwrites(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos start, struct bpos end) ++{ ++ struct btree_iter *iter = NULL, *update_iter; ++ struct bkey_i *update; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_with_updates(iter); ++ ++ while (k.k && !(ret = bkey_err(k))) { ++ if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ bch2_cut_back(start, update); ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } ++ ++ if (bkey_cmp(k.k->p, end) > 0) { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ bch2_cut_front(end, update); ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } else { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ update->k = *k.k; ++ 
set_bkey_val_u64s(&update->k, 0); ++ update->k.type = KEY_TYPE_deleted; ++ update->k.size = 0; ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } ++ ++ k = bch2_btree_iter_next_with_updates(iter); ++ } ++err: ++ if (!IS_ERR_OR_NULL(iter)) ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ + int __bch2_trans_commit(struct btree_trans *trans) + { + struct btree_insert_entry *i = NULL; +@@ -724,7 +844,36 @@ int __bch2_trans_commit(struct btree_trans *trans) + } + } while (trans_trigger_run); + ++ /* Turn extents updates into keys: */ ++ trans_for_each_update(trans, i) ++ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ++ struct bpos start = bkey_start_pos(&i->k->k); ++ ++ while (i + 1 < trans->updates + trans->nr_updates && ++ i[0].iter->btree_id == i[1].iter->btree_id && ++ !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) ++ i++; ++ ++ ret = extent_handle_overwrites(trans, i->iter->btree_id, ++ start, i->k->k.p); ++ if (ret) ++ goto out; ++ } ++ + trans_for_each_update(trans, i) { ++ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ++ ret = extent_update_to_keys(trans, i->iter, i->k); ++ if (ret) ++ goto out; ++ } else { ++ bch2_trans_update2(trans, i->iter, i->k); ++ } ++ } ++ ++ trans_for_each_update2(trans, i) { ++ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); ++ BUG_ON(i->iter->locks_want < 1); ++ + u64s = jset_u64s(i->k->k.u64s); + if (0) + trans->journal_preres_u64s += u64s; +@@ -773,7 +922,10 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + .trigger_flags = flags, .iter = iter, .k = k + }; + +- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k))); ++ EBUG_ON(bkey_cmp(iter->pos, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? bkey_start_pos(&k->k) ++ : k->k.p)); + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index bca077e1bdea..4497c9840865 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1254,21 +1254,21 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_i *new, + struct bch_fs_usage *fs_usage, +- unsigned flags) ++ unsigned flags, ++ bool is_extents) + { + struct bch_fs *c = trans->c; +- struct btree *b = iter->l[0].b; + unsigned offset = 0; +- s64 sectors = 0; ++ s64 sectors = -((s64) old.k->size); + + flags |= BTREE_TRIGGER_OVERWRITE; + +- if (btree_node_is_extents(b) ++ if (is_extents + ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 + : bkey_cmp(new->k.p, old.k->p)) + return 0; + +- if (btree_node_is_extents(b)) { ++ if (is_extents) { + switch (bch2_extent_overlap(&new->k, old.k)) { + case BCH_EXTENT_OVERLAP_ALL: + offset = 0; +@@ -1341,7 +1341,8 @@ int bch2_mark_update(struct btree_trans *trans, + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + + ret = bch2_mark_overwrite(trans, iter, k, insert, +- fs_usage, flags); ++ fs_usage, flags, ++ btree_node_type_is_extents(iter->btree_id)); + if (ret <= 0) + break; + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index c1cc63af9feb..765650ce9d0a 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -268,7 +268,7 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, + + int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, struct bkey_i *, +- struct bch_fs_usage *, unsigned); ++ struct bch_fs_usage *, unsigned, bool); + int bch2_mark_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bch_fs_usage *, unsigned); + +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 846d77dc2530..fa6c0698f385 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -39,6 +39,12 @@ static int count_iters_for_insert(struct btree_trans *trans, + { + int ret = 0; + ++ /* ++ * The extent update path requires an _additional_ iterator for each ++ * extent we're inserting and overwriting: ++ */ ++ *nr_iters += 1; ++ + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: +@@ -167,402 +173,40 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) + enum btree_insert_ret + bch2_extent_can_insert(struct btree_trans *trans, + struct btree_iter *iter, +- struct bkey_i *insert, +- unsigned *u64s) ++ struct bkey_i *insert) + { + struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter = l->iter; + struct bkey_packed *_k; ++ struct bkey_s_c k; + struct bkey unpacked; + int sectors; + +- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, +- KEY_TYPE_discard))) { +- struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked); +- enum bch_extent_overlap overlap = +- bch2_extent_overlap(&insert->k, k.k); +- +- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) +- break; +- +- overlap = bch2_extent_overlap(&insert->k, k.k); +- +- /* +- * If we're overwriting an existing extent, we may need to emit +- * a whiteout - unless we're inserting a new extent at the same +- * position: +- */ +- if (k.k->needs_whiteout && +- (!bkey_whiteout(&insert->k) || +- bkey_cmp(k.k->p, insert->k.p))) +- *u64s += BKEY_U64s; +- +- /* +- * If we're partially overwriting an existing extent which has +- * been written out to disk, we'll need to emit a new version of +- * that extent: +- */ +- if (bkey_written(l->b, _k) && +- overlap != BCH_EXTENT_OVERLAP_ALL) +- *u64s += _k->u64s; +- +- /* And we may be splitting an existing extent: */ +- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) +- *u64s += _k->u64s; +- +- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && +- (sectors = bch2_bkey_sectors_compressed(k))) { +- int flags = trans->flags & BTREE_INSERT_NOFAIL +- ? 
BCH_DISK_RESERVATION_NOFAIL : 0; +- +- switch (bch2_disk_reservation_add(trans->c, +- trans->disk_res, +- sectors, flags)) { +- case 0: +- break; +- case -ENOSPC: +- return BTREE_INSERT_ENOSPC; +- default: +- BUG(); +- } +- } +- +- if (overlap == BCH_EXTENT_OVERLAP_FRONT || +- overlap == BCH_EXTENT_OVERLAP_MIDDLE) +- break; +- +- bch2_btree_node_iter_advance(&node_iter, l->b); +- } +- +- return BTREE_INSERT_OK; +-} +- +-static void verify_extent_nonoverlapping(struct bch_fs *c, +- struct btree *b, +- struct btree_node_iter *_iter, +- struct bkey_i *insert) +-{ +-#ifdef CONFIG_BCACHEFS_DEBUG +- struct btree_node_iter iter; +- struct bkey_packed *k; +- struct bkey uk; +- +- if (!expensive_debug_checks(c)) +- return; +- +- iter = *_iter; +- k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); +- BUG_ON(k && +- (uk = bkey_unpack_key(b, k), +- bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); +- +- iter = *_iter; +- k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); +-#if 0 +- BUG_ON(k && +- (uk = bkey_unpack_key(b, k), +- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +-#else +- if (k && +- (uk = bkey_unpack_key(b, k), +- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { +- char buf1[100]; +- char buf2[100]; +- +- bch2_bkey_to_text(&PBUF(buf1), &insert->k); +- bch2_bkey_to_text(&PBUF(buf2), &uk); +- +- bch2_dump_btree_node(b); +- panic("insert > next :\n" +- "insert %s\n" +- "next %s\n", +- buf1, buf2); +- } +-#endif +- +-#endif +-} +- +-static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, +- struct bkey_i *insert) +-{ +- struct btree_iter_level *l = &iter->l[0]; +- struct bkey_packed *k = +- bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); +- +- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); +- +- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); +- verify_extent_nonoverlapping(c, l->b, &l->iter, insert); +- +- if (debug_check_bkeys(c)) +- bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); +- +- bch2_bset_insert(l->b, &l->iter, k, insert, 0); +- bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); +-} +- +-static void pack_push_whiteout(struct bch_fs *c, struct btree *b, +- struct bpos pos) +-{ +- struct bkey_packed k; +- +- if (!bkey_pack_pos(&k, pos, b)) { +- struct bkey_i tmp; +- +- bkey_init(&tmp.k); +- tmp.k.p = pos; +- bkey_copy(&k, &tmp); +- } +- +- k.needs_whiteout = true; +- push_whiteout(c, b, &k); +-} +- +-static void +-extent_drop(struct bch_fs *c, struct btree_iter *iter, +- struct bkey_packed *_k, struct bkey_s k) +-{ +- struct btree_iter_level *l = &iter->l[0]; +- +- if (!bkey_whiteout(k.k)) +- btree_account_key_drop(l->b, _k); +- +- k.k->size = 0; +- k.k->type = KEY_TYPE_deleted; +- +- if (!btree_node_old_extent_overwrite(l->b) && +- k.k->needs_whiteout) { +- pack_push_whiteout(c, l->b, k.k->p); +- k.k->needs_whiteout = false; +- } +- +- if (_k >= btree_bset_last(l->b)->start) { +- unsigned u64s = _k->u64s; +- +- bch2_bset_delete(l->b, _k, _k->u64s); +- bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); +- } else { +- extent_save(l->b, _k, k.k); +- bch2_btree_iter_fix_key_modified(iter, l->b, _k); +- } +-} +- +-static void +-extent_squash(struct bch_fs *c, struct btree_iter *iter, +- struct bkey_i *insert, +- struct bkey_packed *_k, struct bkey_s k, +- enum bch_extent_overlap overlap) +-{ +- struct btree_iter_level *l = &iter->l[0]; +- struct bkey_on_stack tmp, split; +- +- bkey_on_stack_init(&tmp); +- bkey_on_stack_init(&split); +- +- if 
(!btree_node_old_extent_overwrite(l->b)) { +- if (!bkey_whiteout(&insert->k) && +- !bkey_cmp(k.k->p, insert->k.p)) { +- insert->k.needs_whiteout = k.k->needs_whiteout; +- k.k->needs_whiteout = false; +- } +- } else { +- insert->k.needs_whiteout |= k.k->needs_whiteout; +- } +- +- switch (overlap) { +- case BCH_EXTENT_OVERLAP_FRONT: +- if (bkey_written(l->b, _k)) { +- bkey_on_stack_reassemble(&tmp, c, k.s_c); +- bch2_cut_front(insert->k.p, tmp.k); +- +- /* +- * needs_whiteout was propagated to new version of @k, +- * @tmp: +- */ +- if (!btree_node_old_extent_overwrite(l->b)) +- k.k->needs_whiteout = false; +- +- extent_drop(c, iter, _k, k); +- extent_bset_insert(c, iter, tmp.k); +- } else { +- btree_keys_account_val_delta(l->b, _k, +- bch2_cut_front_s(insert->k.p, k)); +- +- extent_save(l->b, _k, k.k); +- /* +- * No need to call bset_fix_invalidated_key, start of +- * extent changed but extents are indexed by where they +- * end +- */ +- bch2_btree_iter_fix_key_modified(iter, l->b, _k); +- } +- break; +- case BCH_EXTENT_OVERLAP_BACK: +- if (bkey_written(l->b, _k)) { +- bkey_on_stack_reassemble(&tmp, c, k.s_c); +- bch2_cut_back(bkey_start_pos(&insert->k), tmp.k); +- +- /* +- * @tmp has different position than @k, needs_whiteout +- * should not be propagated: +- */ +- if (!btree_node_old_extent_overwrite(l->b)) +- tmp.k->k.needs_whiteout = false; +- +- extent_drop(c, iter, _k, k); +- extent_bset_insert(c, iter, tmp.k); +- } else { +- /* +- * position of @k is changing, emit a whiteout if +- * needs_whiteout is set: +- */ +- if (!btree_node_old_extent_overwrite(l->b) && +- k.k->needs_whiteout) { +- pack_push_whiteout(c, l->b, k.k->p); +- k.k->needs_whiteout = false; +- } +- +- btree_keys_account_val_delta(l->b, _k, +- bch2_cut_back_s(bkey_start_pos(&insert->k), k)); +- extent_save(l->b, _k, k.k); +- +- bch2_bset_fix_invalidated_key(l->b, _k); +- bch2_btree_node_iter_fix(iter, l->b, &l->iter, +- _k, _k->u64s, _k->u64s); +- } +- break; +- case BCH_EXTENT_OVERLAP_ALL: +- extent_drop(c, iter, _k, k); +- break; +- case BCH_EXTENT_OVERLAP_MIDDLE: +- bkey_on_stack_reassemble(&split, c, k.s_c); +- bch2_cut_back(bkey_start_pos(&insert->k), split.k); +- +- if (!btree_node_old_extent_overwrite(l->b)) +- split.k->k.needs_whiteout = false; +- +- /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */ +- if (bkey_written(l->b, _k)) { +- bkey_on_stack_reassemble(&tmp, c, k.s_c); +- bch2_cut_front(insert->k.p, tmp.k); +- +- if (!btree_node_old_extent_overwrite(l->b)) +- k.k->needs_whiteout = false; +- +- extent_drop(c, iter, _k, k); +- extent_bset_insert(c, iter, tmp.k); +- } else { +- btree_keys_account_val_delta(l->b, _k, +- bch2_cut_front_s(insert->k.p, k)); +- +- extent_save(l->b, _k, k.k); +- bch2_btree_iter_fix_key_modified(iter, l->b, _k); +- } +- +- extent_bset_insert(c, iter, split.k); +- break; +- } +- +- bkey_on_stack_exit(&split, c); +- bkey_on_stack_exit(&tmp, c); +-} ++ _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, ++ KEY_TYPE_discard); ++ if (!_k) ++ return BTREE_INSERT_OK; + +-/** +- * bch_extent_insert_fixup - insert a new extent and deal with overlaps +- * +- * this may result in not actually doing the insert, or inserting some subset +- * of the insert key. For cmpxchg operations this is where that logic lives. +- * +- * All subsets of @insert that need to be inserted are inserted using +- * bch2_btree_insert_and_journal(). If @b or @res fills up, this function +- * returns false, setting @iter->pos for the prefix of @insert that actually got +- * inserted. 
+- * +- * BSET INVARIANTS: this function is responsible for maintaining all the +- * invariants for bsets of extents in memory. things get really hairy with 0 +- * size extents +- * +- * within one bset: +- * +- * bkey_start_pos(bkey_next(k)) >= k +- * or bkey_start_offset(bkey_next(k)) >= k->offset +- * +- * i.e. strict ordering, no overlapping extents. +- * +- * multiple bsets (i.e. full btree node): +- * +- * ∀ k, j +- * k.size != 0 ∧ j.size != 0 → +- * ¬ (k > bkey_start_pos(j) ∧ k < j) +- * +- * i.e. no two overlapping keys _of nonzero size_ +- * +- * We can't realistically maintain this invariant for zero size keys because of +- * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j +- * there may be another 0 size key between them in another bset, and it will +- * thus overlap with the merged key. +- * +- * In addition, the end of iter->pos indicates how much has been processed. +- * If the end of iter->pos is not the same as the end of insert, then +- * key insertion needs to continue/be retried. +- */ +-void bch2_insert_fixup_extent(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) +-{ +- struct bch_fs *c = trans->c; +- struct btree_iter_level *l = &iter->l[0]; +- struct btree_node_iter node_iter = l->iter; +- bool do_update = !bkey_whiteout(&insert->k); +- struct bkey_packed *_k; +- struct bkey unpacked; ++ k = bkey_disassemble(l->b, _k, &unpacked); + +- EBUG_ON(iter->level); +- EBUG_ON(!insert->k.size); +- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); ++ /* Check if we're splitting a compressed extent: */ + +- while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, +- KEY_TYPE_discard))) { +- struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); +- enum bch_extent_overlap overlap = +- bch2_extent_overlap(&insert->k, k.k); ++ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && ++ bkey_cmp(insert->k.p, k.k->p) < 0 && ++ (sectors = bch2_bkey_sectors_compressed(k))) { ++ int flags = trans->flags & BTREE_INSERT_NOFAIL ++ ? 
BCH_DISK_RESERVATION_NOFAIL : 0; + +- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) ++ switch (bch2_disk_reservation_add(trans->c, trans->disk_res, ++ sectors, flags)) { ++ case 0: + break; +- +- if (!bkey_whiteout(k.k)) +- do_update = true; +- +- if (!do_update) { +- struct bpos cur_end = bpos_min(insert->k.p, k.k->p); +- +- bch2_cut_front(cur_end, insert); +- bch2_btree_iter_set_pos_same_leaf(iter, cur_end); +- } else { +- extent_squash(c, iter, insert, _k, k, overlap); ++ case -ENOSPC: ++ return BTREE_INSERT_ENOSPC; ++ default: ++ BUG(); + } +- +- node_iter = l->iter; +- +- if (overlap == BCH_EXTENT_OVERLAP_FRONT || +- overlap == BCH_EXTENT_OVERLAP_MIDDLE) +- break; + } + +- l->iter = node_iter; +- bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); +- +- if (do_update) { +- if (insert->k.type == KEY_TYPE_deleted) +- insert->k.type = KEY_TYPE_discard; +- +- if (!bkey_whiteout(&insert->k) || +- btree_node_old_extent_overwrite(l->b)) +- extent_bset_insert(c, iter, insert); +- +- bch2_btree_journal_key(trans, iter, insert); +- } +- +- bch2_cut_front(insert->k.p, insert); ++ return BTREE_INSERT_OK; + } +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +index e9dc8091ba3f..38dc084627d2 100644 +--- a/fs/bcachefs/extent_update.h ++++ b/fs/bcachefs/extent_update.h +@@ -11,9 +11,6 @@ int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); + + enum btree_insert_ret + bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, +- struct bkey_i *, unsigned *); +-void bch2_insert_fixup_extent(struct btree_trans *, +- struct btree_iter *, +- struct bkey_i *); ++ struct bkey_i *); + + #endif /* _BCACHEFS_EXTENT_UPDATE_H */ +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index eca723121a2c..902c8da9dc15 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -422,6 +422,42 @@ static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) + POS(inode_nr + 1, 0), NULL); + } + ++static int bch2_fix_overlapping_extent(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, struct bpos cut_at) ++{ ++ struct btree_iter *u_iter; ++ struct bkey_i *u; ++ int ret; ++ ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(u, k); ++ bch2_cut_front(cut_at, u); ++ ++ u_iter = bch2_trans_copy_iter(trans, iter); ++ ret = PTR_ERR_OR_ZERO(u_iter); ++ if (ret) ++ return ret; ++ ++ /* ++ * We don't want to go through the ++ * extent_handle_overwrites path: ++ */ ++ __bch2_btree_iter_set_pos(u_iter, u->k.p, false); ++ ++ /* ++ * XXX: this is going to leave disk space ++ * accounting slightly wrong ++ */ ++ ret = bch2_trans_update(trans, u_iter, u, 0); ++ bch2_trans_iter_put(trans, u_iter); ++ return ret; ++} ++ + /* + * Walk extents: verify that extents have a corresponding S_ISREG inode, and + * that i_size an i_sectors are consistent +@@ -433,6 +469,7 @@ static int check_extents(struct bch_fs *c) + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; ++ struct bkey prev = KEY(0, 0, 0); + u64 i_sectors; + int ret = 0; + +@@ -444,6 +481,25 @@ static int check_extents(struct bch_fs *c) + POS(BCACHEFS_ROOT_INO, 0), 0); + retry: + for_each_btree_key_continue(iter, 0, k, ret) { ++ if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) { ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_bkey_to_text(&PBUF(buf1), &prev); ++ bch2_bkey_to_text(&PBUF(buf2), k.k); ++ ++ if (fsck_err(c, "overlapping extents: %s, %s", buf1, buf2)) { ++ ret = 
__bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_fix_overlapping_extent(&trans, ++ iter, k, prev.p)); ++ if (ret) ++ goto err; ++ } ++ } ++ prev = *k.k; ++ + ret = walk_inode(&trans, &w, k.k->p.inode); + if (ret) + break; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index bd0edda7abf9..27378cc9cdd5 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -161,13 +161,16 @@ static void journal_entries_free(struct list_head *list) + } + } + ++/* ++ * When keys compare equal, oldest compares first: ++ */ + static int journal_sort_key_cmp(const void *_l, const void *_r) + { + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: +- bkey_cmp(l->pos, r->pos) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); + } +@@ -179,25 +182,11 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) + + return cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->btree_id, r->btree_id) ?: +- bkey_cmp(l->pos, r->pos); +-} +- +-static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i) +-{ +- while (i + 1 < keys->d + keys->nr && +- journal_sort_key_cmp(i, i + 1) > 0) { +- swap(i[0], i[1]); +- i++; +- } ++ bkey_cmp(l->k->k.p, r->k->k.p); + } + + static void journal_keys_free(struct journal_keys *keys) + { +- struct journal_key *i; +- +- for_each_journal_key(*keys, i) +- if (i->allocated) +- kfree(i->k); + kvfree(keys->d); + keys->d = NULL; + keys->nr = 0; +@@ -208,15 +197,15 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + struct journal_replay *p; + struct jset_entry *entry; + struct bkey_i *k, *_n; +- struct journal_keys keys = { NULL }, keys_deduped = { NULL }; +- struct journal_key *i; ++ struct journal_keys keys = { NULL }; ++ struct journal_key *src, *dst; + size_t nr_keys = 0; + + list_for_each_entry(p, journal_entries, list) + for_each_jset_key(k, _n, entry, &p->j) + nr_keys++; + +- keys.journal_seq_base = keys_deduped.journal_seq_base = ++ keys.journal_seq_base = + le64_to_cpu(list_first_entry(journal_entries, + struct journal_replay, + list)->j.seq); +@@ -225,96 +214,31 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + if (!keys.d) + goto err; + +- keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL); +- if (!keys_deduped.d) +- goto err; +- + list_for_each_entry(p, journal_entries, list) +- for_each_jset_key(k, _n, entry, &p->j) { +- if (bkey_deleted(&k->k) && +- btree_node_type_is_extents(entry->btree_id)) +- continue; +- ++ for_each_jset_key(k, _n, entry, &p->j) + keys.d[keys.nr++] = (struct journal_key) { + .btree_id = entry->btree_id, +- .pos = bkey_start_pos(&k->k), + .k = k, + .journal_seq = le64_to_cpu(p->j.seq) - + keys.journal_seq_base, + .journal_offset = k->_data - p->j._data, + }; +- } + + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); + +- i = keys.d; +- while (i < keys.d + keys.nr) { +- if (i + 1 < keys.d + keys.nr && +- i[0].btree_id == i[1].btree_id && +- !bkey_cmp(i[0].pos, i[1].pos)) { +- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { +- i++; +- } else { +- bch2_cut_front(i[1].k->k.p, i[0].k); +- i[0].pos = i[1].k->k.p; +- journal_keys_sift(&keys, i); +- } +- continue; +- } +- +- if (i + 1 < keys.d + keys.nr && +- i[0].btree_id == i[1].btree_id && +- bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) { +- if 
((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: +- cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { +- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { +- bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k); +- } else { +- struct bkey_i *split = +- kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); +- +- if (!split) +- goto err; +- +- bkey_copy(split, i[0].k); +- bch2_cut_back(bkey_start_pos(&i[1].k->k), split); +- keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { +- .btree_id = i[0].btree_id, +- .allocated = true, +- .pos = bkey_start_pos(&split->k), +- .k = split, +- .journal_seq = i[0].journal_seq, +- .journal_offset = i[0].journal_offset, +- }; +- +- bch2_cut_front(i[1].k->k.p, i[0].k); +- i[0].pos = i[1].k->k.p; +- journal_keys_sift(&keys, i); +- continue; +- } +- } else { +- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) { +- i[1] = i[0]; +- i++; +- continue; +- } else { +- bch2_cut_front(i[0].k->k.p, i[1].k); +- i[1].pos = i[0].k->k.p; +- journal_keys_sift(&keys, i + 1); +- continue; +- } +- } +- } ++ src = dst = keys.d; ++ while (src < keys.d + keys.nr) { ++ while (src + 1 < keys.d + keys.nr && ++ src[0].btree_id == src[1].btree_id && ++ !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) ++ src++; + +- keys_deduped.d[keys_deduped.nr++] = *i++; ++ *dst++ = *src++; + } + +- kvfree(keys.d); +- return keys_deduped; ++ keys.nr = dst - keys.d; + err: +- journal_keys_free(&keys_deduped); +- kvfree(keys.d); +- return (struct journal_keys) { NULL }; ++ return keys; + } + + /* journal replay: */ +@@ -365,11 +289,6 @@ retry: + + atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); + +- split_iter = bch2_trans_copy_iter(&trans, iter); +- ret = PTR_ERR_OR_ZERO(split_iter); +- if (ret) +- goto err; +- + split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); + ret = PTR_ERR_OR_ZERO(split); + if (ret) +@@ -388,12 +307,25 @@ retry: + } + + bkey_copy(split, k); +- bch2_cut_front(split_iter->pos, split); ++ bch2_cut_front(iter->pos, split); + bch2_cut_back(atomic_end, split); + ++ split_iter = bch2_trans_copy_iter(&trans, iter); ++ ret = PTR_ERR_OR_ZERO(split_iter); ++ if (ret) ++ goto err; ++ ++ /* ++ * It's important that we don't go through the ++ * extent_handle_overwrites() and extent_update_to_keys() path ++ * here: journal replay is supposed to treat extents like ++ * regular keys ++ */ ++ __bch2_btree_iter_set_pos(split_iter, split->k.p, false); + bch2_trans_update(&trans, split_iter, split, !remark + ? 
BTREE_TRIGGER_NORUN + : BTREE_TRIGGER_NOOVERWRITES); ++ + bch2_btree_iter_set_pos(iter, split->k.p); + } while (bkey_cmp(iter->pos, k->k.p) < 0); + +@@ -424,11 +356,18 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + struct btree_iter *iter; + int ret; + +- iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), +- BTREE_ITER_INTENT); ++ iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + ++ /* ++ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run ++ * extent_handle_overwrites() and extent_update_to_keys() - but we don't ++ * want that here, journal replay is supposed to treat extents like ++ * regular keys: ++ */ ++ __bch2_btree_iter_set_pos(iter, k->k.p, false); ++ + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(trans, iter); +@@ -459,7 +398,7 @@ static int bch2_journal_replay(struct bch_fs *c, + + if (i->btree_id == BTREE_ID_ALLOC) + ret = bch2_alloc_replay_key(c, i->k); +- else if (btree_node_type_is_extents(i->btree_id)) ++ else if (i->k->k.size) + ret = bch2_extent_replay_key(c, i->btree_id, i->k); + else + ret = bch2_journal_replay_key(c, i->btree_id, i->k); +@@ -859,6 +798,15 @@ int bch2_fs_recovery(struct bch_fs *c) + journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + ++ if (!c->sb.clean && ++ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { ++ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; ++ + ret = journal_replay_early(c, clean, &journal_entries); + if (ret) + goto err; +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index ccd84a8fe60d..c91309301563 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -5,8 +5,6 @@ + struct journal_keys { + struct journal_key { + enum btree_id btree_id:8; +- unsigned allocated:1; +- struct bpos pos; + struct bkey_i *k; + u32 journal_seq; + u32 journal_offset; +-- +cgit v1.2.3 + + +From ad9d51567c69b3077f2c2d84d4b118a9304d55f2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 15 Mar 2020 22:41:10 -0400 +Subject: bcachefs: Drop unused export + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update.h | 2 -- + fs/bcachefs/btree_update_leaf.c | 6 +++--- + 2 files changed, 3 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 12127a33906b..9f58d47ef5d6 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -12,8 +12,6 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, + struct btree_iter *); + bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bkey_i *); +-void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, +- struct bkey_i *); + + enum btree_insert_flags { + __BTREE_INSERT_NOUNLOCK, +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 389b5ee203c4..7c2f72a3a725 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -166,9 +166,9 @@ static inline void __btree_journal_key(struct btree_trans *trans, + *trans->journal_seq = seq; + } + +-void bch2_btree_journal_key(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) ++static void bch2_btree_journal_key(struct 
btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) + { + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; +-- +cgit v1.2.3 + + +From 217bd3b569656175d37ff985c8f315403e443256 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Mar 2020 14:49:52 -0400 +Subject: bcachefs: Fix a use after free in dio write path + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index e2b293a6ff91..e20e7ec722aa 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1831,7 +1831,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) + struct bio_vec *bv; + unsigned unaligned; + u64 new_i_size; +- bool sync; ++ bool sync = dio->sync; + long ret; + + if (dio->loop) +@@ -1879,7 +1879,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) + iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) { +- dio->sync = true; ++ dio->sync = sync = true; + goto do_io; + } + +@@ -1893,7 +1893,7 @@ do_io: + dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); + +- if (dio->sync) ++ if (sync) + wait_for_completion(&dio->done); + else + return -EIOCBQUEUED; +@@ -1927,7 +1927,6 @@ err: + if (dio->free_iov) + kfree(dio->iter.iov); + +- sync = dio->sync; + bio_put(bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ +-- +cgit v1.2.3 + + +From 4a174c9c59a23d1810555b902c1f3e5b055efd18 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Mar 2020 15:49:23 -0400 +Subject: bcachefs: Don't use peek_filter() unnecessarily + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 9 +++------ + fs/bcachefs/extent_update.c | 6 ++---- + 2 files changed, 5 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 4497c9840865..2e1df04c760d 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1335,8 +1335,7 @@ int bch2_mark_update(struct btree_trans *trans, + !bkey_deleted(&insert->k)) + return 0; + +- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, +- KEY_TYPE_discard))) { ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { + struct bkey unpacked; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + +@@ -1382,8 +1381,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + pr_err("overlapping with"); + + node_iter = iter->l[0].iter; +- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, +- KEY_TYPE_discard))) { ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { + struct bkey unpacked; + struct bkey_s_c k; + +@@ -1795,8 +1793,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, + if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) + return 0; + +- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, +- KEY_TYPE_discard))) { ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { + struct bkey unpacked; + struct bkey_s_c k; + unsigned offset = 0; +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index fa6c0698f385..beb3b694e33c 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -120,8 +120,7 @@ int bch2_extent_atomic_end(struct btree_iter *iter, + if (ret < 0) + return ret; + +- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, +- KEY_TYPE_discard))) { ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { + struct bkey unpacked; + struct bkey_s_c k = 
bkey_disassemble(b, _k, &unpacked); + unsigned offset = 0; +@@ -182,8 +181,7 @@ bch2_extent_can_insert(struct btree_trans *trans, + struct bkey unpacked; + int sectors; + +- _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, +- KEY_TYPE_discard); ++ _k = bch2_btree_node_iter_peek(&node_iter, l->b); + if (!_k) + return BTREE_INSERT_OK; + +-- +cgit v1.2.3 + + +From 85e5d14085cfef217db4a9a66d1a469c16e5b8f8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Mar 2020 15:48:58 -0400 +Subject: bcachefs: Fix another iterator leak + +This updates bch2_rbio_narrow_crcs() to the current style for +transactional btree code, and fixes a rare panic on iterator overflow. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 60 ++++++++++++++++++++++++++++++++------------------------ + 1 file changed, 34 insertions(+), 26 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index cc70b1f388fd..917705bada4c 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1699,33 +1699,39 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + } + } + +-static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) ++static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, ++ struct bch_read_bio *rbio) + { + struct bch_fs *c = rbio->c; +- struct btree_trans trans; +- struct btree_iter *iter; +- struct bkey_s_c k; +- struct bkey_on_stack new; +- struct bch_extent_crc_unpacked new_crc; + u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; +- int ret; ++ struct bch_extent_crc_unpacked new_crc; ++ struct btree_iter *iter = NULL; ++ struct bkey_i *new; ++ struct bkey_s_c k; ++ int ret = 0; + + if (crc_is_compressed(rbio->pick.crc)) +- return; +- +- bkey_on_stack_init(&new); +- bch2_trans_init(&trans, c, 0, 0); +-retry: +- bch2_trans_begin(&trans); ++ return 0; + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, ++ iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ if ((ret = PTR_ERR_OR_ZERO(iter))) ++ goto out; ++ + k = bch2_btree_iter_peek_slot(iter); +- if (IS_ERR_OR_NULL(k.k)) ++ if ((ret = bkey_err(k))) ++ goto out; ++ ++ /* ++ * going to be temporarily appending another checksum entry: ++ */ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + ++ BKEY_EXTENT_U64s_MAX * 8); ++ if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + +- bkey_on_stack_reassemble(&new, c, k); +- k = bkey_i_to_s_c(new.k); ++ bkey_reassemble(new, k); ++ k = bkey_i_to_s_c(new); + + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) +@@ -1741,21 +1747,23 @@ retry: + bkey_start_offset(k.k) - data_offset, k.k->size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); ++ ret = 0; + goto out; + } + +- if (!bch2_bkey_narrow_crcs(new.k, new_crc)) ++ if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + +- bch2_trans_update(&trans, iter, new.k, 0); +- ret = bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_NOWAIT); +- if (ret == -EINTR) +- goto retry; ++ bch2_trans_update(trans, iter, new, 0); + out: +- bch2_trans_exit(&trans); +- bkey_on_stack_exit(&new, c); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) ++{ ++ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ __bch2_rbio_narrow_crcs(&trans, rbio)); + } + + /* Inner part that may run in process context */ +-- +cgit 
v1.2.3 + + +From 2dfb404c3394e03737a2e8c0a55378dd983a4efb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Mar 2020 17:23:37 -0400 +Subject: bcachefs: Clear BCH_FEATURE_extents_above_btree_updates on clean + shutdown + +This is needed so that users can roll back to before "d9bb516b2d +bcachefs: Move extent overwrite handling out of core btree code", which +it appears may still be buggy. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 2 -- + fs/bcachefs/super-io.c | 2 ++ + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 27378cc9cdd5..02b381cb567b 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -805,8 +805,6 @@ int bch2_fs_recovery(struct bch_fs *c) + goto err; + } + +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; +- + ret = journal_replay_early(c, clean, &journal_entries); + if (ret) + goto err; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 180d9091a75b..20b699b85333 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -953,6 +953,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +@@ -1083,6 +1084,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) + + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; ++ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); + + u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; + +-- +cgit v1.2.3 + + +From 4c80049e8c413ba286f2d5abf6db8805cc83ac1b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 18 Mar 2020 11:40:07 -0400 +Subject: bcachefs: BCH_FEATURE_new_extent_overwrite is now required + +The patch "bcachefs: Move extent overwrite handling out of core btree +code" should have been flipping on this feature bit; extent btree nodes +in the old format have to be rewritten before we can insert into them +with the new extent update path. Not turning on this feature bit was +causing us to go into an infinite loop where we keep rewriting btree +nodes over and over. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 20b699b85333..b5e276539bd6 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -953,6 +953,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); +-- +cgit v1.2.3 + + +From 49602c48188a1648e4c29fa78269f5699c7952b2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 18 Mar 2020 11:46:46 -0400 +Subject: bcachefs: Shut down quicker + +Internal writes (i.e. copygc/rebalance operations) shouldn't be blocking +on the allocator when we're going RO. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 917705bada4c..ff1649e10ef9 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1068,6 +1068,12 @@ again: + BKEY_EXTENT_U64s_MAX)) + goto flush_io; + ++ if ((op->flags & BCH_WRITE_FROM_INTERNAL) && ++ percpu_ref_is_dying(&c->writes)) { ++ ret = -EROFS; ++ goto err; ++ } ++ + wp = bch2_alloc_sectors_start(c, + op->target, + op->opts.erasure_code, +-- +cgit v1.2.3 + + +From 2ac069fb3fe3586a3ff19aa7bc86eeafda659b73 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 18 Mar 2020 13:40:28 -0400 +Subject: bcachefs: Fix an iterator bug + +We were incorrectly not restarting the transaction when re-traversing +iterators. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 2819b9a487f2..6ed688cdcfde 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1068,7 +1068,14 @@ retry_all: + goto retry_all; + } + +- ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0; ++ if (hweight64(trans->iters_live) > 1) ++ ret = -EINTR; ++ else ++ trans_for_each_iter(trans, iter) ++ if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { ++ ret = -EINTR; ++ break; ++ } + out: + bch2_btree_cache_cannibalize_unlock(c); + return ret; +-- +cgit v1.2.3 + + +From db24fa30f50c47852499aaac0f08abd02df71f8b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 21 Mar 2020 14:08:01 -0400 +Subject: bcachefs: Fix count_iters_for_insert() + +This fixes a transaction iterator overflow. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extent_update.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index beb3b694e33c..8e5070d5a39b 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -44,6 +44,10 @@ static int count_iters_for_insert(struct btree_trans *trans, + * extent we're inserting and overwriting: + */ + *nr_iters += 1; ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } + + switch (k.k->type) { + case KEY_TYPE_extent: +-- +cgit v1.2.3 + + +From fecf8a22f305e3508c269e9208a0f0e8159487d4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 21 Mar 2020 14:47:00 -0400 +Subject: bcachefs: Fix a locking bug in fsck + +This works around a btree locking issue - we can't be holding read locks +while taking write locks, which currently means we can't have live +iterators holding read locks at commit time. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 902c8da9dc15..936e6366cb04 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -478,7 +478,8 @@ static int check_extents(struct bch_fs *c) + bch_verbose(c, "checking extents"); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, +- POS(BCACHEFS_ROOT_INO, 0), 0); ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT); + retry: + for_each_btree_key_continue(iter, 0, k, ret) { + if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) { +-- +cgit v1.2.3 + + +From 3aafdb290cfa487bd54f33c1856113014a35a0d4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 24 Mar 2020 17:00:48 -0400 +Subject: bcachefs: Disable extent merging + +Extent merging is currently broken, and will be reimplemented +differently soon - right now it only happens when btree nodes are being +compacted, which makes it difficult to test. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.h | 3 +-- + fs/bcachefs/bkey_sort.c | 64 ++++++++++++++----------------------------------- + 2 files changed, 19 insertions(+), 48 deletions(-) + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 9106bea9ac06..cbcfbd26bc58 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -400,9 +400,8 @@ static inline int bch2_compile_bkey_format(const struct bkey_format *format, + static inline void bkey_reassemble(struct bkey_i *dst, + struct bkey_s_c src) + { +- BUG_ON(bkey_packed(src.k)); + dst->k = *src.k; +- memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k)); ++ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); + } + + #define bkey_s_null ((struct bkey_s) { .k = NULL }) +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 68965a0f973a..839e78d1dc35 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -130,44 +130,21 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + return nr; + } + +-static void extent_sort_advance_prev(struct bkey_format *f, +- struct btree_nr_keys *nr, +- struct bkey_packed *start, +- struct bkey_packed **prev) +-{ +- if (*prev) { +- bch2_bkey_pack(*prev, (void *) *prev, f); +- +- btree_keys_account_key_add(nr, 0, *prev); +- *prev = bkey_next(*prev); +- } else { +- *prev = start; +- } +-} +- + static void extent_sort_append(struct bch_fs *c, + struct bkey_format *f, + struct btree_nr_keys *nr, +- struct bkey_packed *start, +- struct bkey_packed **prev, ++ struct bkey_packed **out, + struct bkey_s k) + { +- if (bkey_whiteout(k.k)) +- return; +- +- /* +- * prev is always unpacked, for key merging - until right before we +- * advance it: +- */ ++ if (!bkey_whiteout(k.k)) { ++ if (!bch2_bkey_pack_key(*out, k.k, f)) ++ memcpy_u64s_small(*out, k.k, BKEY_U64s); + +- if (*prev && +- bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) == +- BCH_MERGE_MERGE) +- return; ++ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); + +- extent_sort_advance_prev(f, nr, start, prev); +- +- bkey_reassemble((void *) *prev, k.s_c); ++ btree_keys_account_key_add(nr, 0, *out); ++ *out = bkey_next(*out); ++ } + } + + /* Sort + repack in a new format: */ +@@ -201,7 +178,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, + return nr; + } + +-/* Sort, repack, and merge: */ ++/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ + struct btree_nr_keys + bch2_sort_repack_merge(struct bch_fs *c, + struct bset *dst, struct btree *src, +@@ -209,7 
+186,7 @@ bch2_sort_repack_merge(struct bch_fs *c, + struct bkey_format *out_f, + bool filter_whiteouts) + { +- struct bkey_packed *prev = NULL, *k_packed; ++ struct bkey_packed *out = vstruct_last(dst), *k_packed; + struct bkey_on_stack k; + struct btree_nr_keys nr; + +@@ -234,13 +211,10 @@ bch2_sort_repack_merge(struct bch_fs *c, + bch2_bkey_normalize(c, bkey_i_to_s(k.k))) + continue; + +- extent_sort_append(c, out_f, &nr, vstruct_last(dst), +- &prev, bkey_i_to_s(k.k)); ++ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); + } + +- extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev); +- +- dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + bkey_on_stack_exit(&k, c); + return nr; + } +@@ -337,7 +311,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct btree *b = iter->b; + struct bkey_format *f = &b->format; + struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; +- struct bkey_packed *prev = NULL; ++ struct bkey_packed *out = dst->start; + struct bkey l_unpacked, r_unpacked; + struct bkey_s l, r; + struct btree_nr_keys nr; +@@ -360,7 +334,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + l = __bkey_disassemble(b, _l->k, &l_unpacked); + + if (iter->used == 1) { +- extent_sort_append(c, f, &nr, dst->start, &prev, l); ++ extent_sort_append(c, f, &nr, &out, l); + extent_iter_advance(iter, 0); + continue; + } +@@ -369,7 +343,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + + /* If current key and next key don't overlap, just append */ + if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { +- extent_sort_append(c, f, &nr, dst->start, &prev, l); ++ extent_sort_append(c, f, &nr, &out, l); + extent_iter_advance(iter, 0); + continue; + } +@@ -414,17 +388,15 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + __sort_iter_sift(iter, 0, + extent_sort_fix_overlapping_cmp); + +- extent_sort_append(c, f, &nr, dst->start, +- &prev, bkey_i_to_s(split.k)); ++ extent_sort_append(c, f, &nr, &out, ++ bkey_i_to_s(split.k)); + } else { + bch2_cut_back_s(bkey_start_pos(r.k), l); + extent_save(b, _l->k, l.k); + } + } + +- extent_sort_advance_prev(f, &nr, dst->start, &prev); +- +- dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + + bkey_on_stack_exit(&split, c); + return nr; +-- +cgit v1.2.3 + + +From 7fdc42bd0b538365d8ced04ed6adc9b5083d145c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 15 Mar 2020 23:29:43 -0400 +Subject: bcachefs: trans_commit() path can now insert to interior nodes + +This will be needed for the upcoming patches to journal updates to +interior btree nodes. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 5 +++++ + fs/bcachefs/btree_update_interior.c | 2 +- + fs/bcachefs/btree_update_leaf.c | 22 +++++++++------------- + 3 files changed, 15 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 51d579a4ffae..3e9b924c21b8 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -261,6 +261,11 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) + return iter->flags & BTREE_ITER_TYPE; + } + ++static inline struct btree_iter_level *iter_l(struct btree_iter *iter) ++{ ++ return iter->l + iter->level; ++} ++ + struct btree_insert_entry { + unsigned trigger_flags; + unsigned trans_triggers_run:1; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 65c83fc3145e..02e47828914b 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1630,7 +1630,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + unsigned flags) + { + struct btree_trans *trans = iter->trans; +- struct btree *b = iter->l[0].b; ++ struct btree *b = iter_l(iter)->b; + struct btree_update *as; + struct closure cl; + int ret = 0; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 7c2f72a3a725..e491972007e1 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -24,7 +24,7 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) + { + return i != trans->updates2 && +- i[0].iter->l[0].b == i[-1].iter->l[0].b; ++ iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; + } + + inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, +@@ -172,13 +172,12 @@ static void bch2_btree_journal_key(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; +- struct btree *b = iter->l[0].b; ++ struct btree *b = iter_l(iter)->b; + struct btree_write *w = btree_current_write(b); + u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + ? 
trans->journal_res.seq + : j->replay_journal_seq; + +- EBUG_ON(iter->level || b->level); + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); + +@@ -205,17 +204,15 @@ static void btree_insert_key_leaf(struct btree_trans *trans, + struct bkey_i *insert) + { + struct bch_fs *c = trans->c; +- struct btree *b = iter->l[0].b; ++ struct btree *b = iter_l(iter)->b; + struct bset_tree *t = bset_tree_last(b); + int old_u64s = bset_u64s(t); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + +- EBUG_ON(iter->level); +- + insert->k.needs_whiteout = false; + +- if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert))) ++ if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert))) + bch2_btree_journal_key(trans, iter, insert); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; +@@ -241,7 +238,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + +- BUG_ON(iter->level); + BUG_ON(bkey_cmp(insert->k.p, iter->pos)); + BUG_ON(debug_check_bkeys(c) && + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); +@@ -290,7 +286,7 @@ btree_key_can_insert(struct btree_trans *trans, + unsigned *u64s) + { + struct bch_fs *c = trans->c; +- struct btree *b = iter->l[0].b; ++ struct btree *b = iter_l(iter)->b; + static enum btree_insert_ret ret; + + if (unlikely(btree_node_fake(b))) +@@ -345,7 +341,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) +- if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) ++ if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b))) + bch2_mark_update(trans, i->iter, i->k, NULL, + i->trigger_flags|BTREE_TRIGGER_GC); + } +@@ -461,7 +457,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + int ret; + + trans_for_each_update2(trans, i) +- BUG_ON(!btree_node_intent_locked(i->iter, 0)); ++ BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); + + ret = bch2_journal_preres_get(&trans->c->journal, + &trans->journal_preres, trans->journal_preres_u64s, +@@ -495,13 +491,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + trans_for_each_update2(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_lock_for_insert(trans->c, +- i->iter->l[0].b, i->iter); ++ iter_l(i->iter)->b, i->iter); + + ret = bch2_trans_commit_write_locked(trans, stopped_at); + + trans_for_each_update2(trans, i) + if (!same_leaf_as_prev(trans, i)) +- bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, ++ bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, + i->iter); + + /* +-- +cgit v1.2.3 + + +From 41bc880a87e89255605128accdf3a513362bca32 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 27 Feb 2020 16:58:29 -0500 +Subject: bcachefs: Walk btree with keys from journal + +In this patch series, we'll start journalling updates to interior btree +nodes; this means that the initial GC must overlay keys from the journal +as it's walking the btree. + +Rather than try to shoehorn this into btree iterators, this implements +an open coded tree traversal that doesn't use btree iterators. + +We also add a new mode to the existing btree + journal iterator where it +can iterate over keys in a single btree node - using btree_node_iter +instead of btree_iter. 
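To make the overlay idea easier to follow before the diff: the merged view walks two sorted streams of keys, the ones already in the btree node and the ones found in the journal, in position order, and on equal positions the journal key shadows the btree key because it carries the newer update. Below is a minimal, self-contained sketch of just that rule; the names (demo_key, overlay_peek) are invented for illustration and are not the interface this patch adds, which operates on bkeys, a btree_node_iter and a journal_iter.

#include <stdio.h>
#include <stddef.h>

/*
 * Illustrative only, not the bcachefs API: a "key" reduced to the one
 * field the merge cares about (its position) plus a tag saying which
 * stream it came from.
 */
struct demo_key {
	unsigned long long	pos;
	const char		*src;
};

/*
 * Peek at the next key of the merged view: whichever stream has the
 * smaller position wins; on equal positions the journal key shadows the
 * btree key, because the journal holds the newer update.
 */
static const struct demo_key *
overlay_peek(const struct demo_key *btree, size_t *bi, size_t bn,
	     const struct demo_key *journal, size_t *ji, size_t jn)
{
	const struct demo_key *b = *bi < bn ? &btree[*bi]   : NULL;
	const struct demo_key *j = *ji < jn ? &journal[*ji] : NULL;

	if (b && j && b->pos == j->pos)
		(*bi)++;			/* drop the shadowed btree key */

	b = *bi < bn ? &btree[*bi] : NULL;	/* re-peek after the drop */

	if (!b)
		return j;
	if (!j)
		return b;
	return j->pos < b->pos ? j : b;
}

int main(void)
{
	const struct demo_key btree_keys[]   = { { 1, "btree" }, { 3, "btree" }, { 5, "btree" } };
	const struct demo_key journal_keys[] = { { 3, "journal" }, { 4, "journal" } };
	size_t bi = 0, ji = 0;
	const struct demo_key *k;

	while ((k = overlay_peek(btree_keys, &bi, 3, journal_keys, &ji, 2))) {
		printf("pos %llu from %s\n", k->pos, k->src);
		if (k->src[0] == 'j')	/* advance the stream the key came from */
			ji++;
		else
			bi++;
	}
	/* prints: 1 btree, 3 journal, 4 journal, 5 btree */
	return 0;
}

In the patch itself this peek/advance pairing is what bch2_btree_and_journal_iter_peek() and bch2_btree_and_journal_iter_advance() implement.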
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 2 +- + fs/bcachefs/btree_cache.c | 81 +++++++++++++++++++++-- + fs/bcachefs/btree_cache.h | 3 + + fs/bcachefs/btree_gc.c | 113 +++++++++++++++++++++---------- + fs/bcachefs/recovery.c | 161 ++++++++++++++++++++++++++++++--------------- + fs/bcachefs/recovery.h | 21 ++++-- + 6 files changed, 281 insertions(+), 100 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index c064cf468a9b..0aa3d3b9a281 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -134,7 +134,7 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + + const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) + { +- if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0) ++ if (bkey_cmp(k.k->p, b->data->min_key) < 0) + return "key before start of btree node"; + + if (bkey_cmp(k.k->p, b->data->max_key) > 0) +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index e9df7e82a766..a2fa92f5bc8a 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -588,6 +588,7 @@ err: + static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + struct btree_iter *iter, + const struct bkey_i *k, ++ enum btree_id btree_id, + unsigned level, + enum six_lock_type lock_type, + bool sync) +@@ -600,7 +601,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + * Parent node must be locked, else we could read in a btree node that's + * been freed: + */ +- if (!bch2_btree_node_relock(iter, level + 1)) ++ if (iter && !bch2_btree_node_relock(iter, level + 1)) + return ERR_PTR(-EINTR); + + b = bch2_btree_node_mem_alloc(c); +@@ -608,7 +609,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + return b; + + bkey_copy(&b->key, k); +- if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) { ++ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { + /* raced with another fill: */ + + /* mark as unhashed... 
*/ +@@ -628,7 +629,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + * + * XXX: ideally should be dropping all btree node locks here + */ +- if (btree_node_read_locked(iter, level + 1)) ++ if (iter && btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); + + bch2_btree_node_read(c, b, sync); +@@ -676,7 +677,8 @@ retry: + * else we could read in a btree node from disk that's been + * freed: + */ +- b = bch2_btree_node_fill(c, iter, k, level, lock_type, true); ++ b = bch2_btree_node_fill(c, iter, k, iter->btree_id, ++ level, lock_type, true); + + /* We raced and found the btree node in the cache */ + if (!b) +@@ -762,6 +764,74 @@ lock_node: + return b; + } + ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ b = bch2_btree_node_fill(c, NULL, k, btree_id, ++ level, SIX_LOCK_read, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ return b; ++ } else { ++lock_node: ++ six_lock_read(&b->lock, NULL, NULL); ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->btree_id != btree_id || ++ b->level != level)) { ++ six_unlock_read(&b->lock); ++ goto retry; ++ } ++ } ++ ++ /* XXX: waiting on IO with btree locks held: */ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_read(&b->lock); ++ return ERR_PTR(-EIO); ++ } ++ ++ EBUG_ON(b->btree_id != btree_id || ++ BTREE_NODE_LEVEL(b->data) != level || ++ bkey_cmp(b->data->max_key, k->k.p)); ++ ++ return b; ++} ++ + struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + struct btree_iter *iter, + struct btree *b, +@@ -876,7 +946,8 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, + if (b) + return; + +- bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false); ++ bch2_btree_node_fill(c, iter, k, iter->btree_id, ++ level, SIX_LOCK_read, false); + } + + void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index bc24d92678d3..132cc95a4c02 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -25,6 +25,9 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, unsigned, + enum six_lock_type); + ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, ++ enum btree_id, unsigned); ++ + struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, + struct btree *, enum btree_node_sibling); + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index c5a0c0ed22a0..8138df6c9d14 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -184,16 +184,8 @@ fsck_err: + return ret; + } + +-static bool 
pos_in_journal_keys(struct journal_keys *journal_keys, +- enum btree_id id, struct bpos pos) +-{ +- struct journal_key *k = journal_key_search(journal_keys, id, pos); +- +- return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos); +-} +- + static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, +- struct journal_keys *journal_keys, bool initial) ++ bool initial) + { + struct btree_node_iter iter; + struct bkey unpacked; +@@ -207,10 +199,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + + for_each_btree_node_key_unpack(b, k, &iter, + &unpacked) { +- if (!b->level && journal_keys && +- pos_in_journal_keys(journal_keys, b->btree_id, k.k->p)) +- continue; +- + bch2_bkey_debugcheck(c, b, k); + + ret = bch2_gc_mark_key(c, k, max_stale, initial); +@@ -222,7 +210,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + } + + static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, +- struct journal_keys *journal_keys, + bool initial, bool metadata_only) + { + struct btree_trans trans; +@@ -250,8 +237,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + + gc_pos_set(c, gc_pos_btree_node(b)); + +- ret = btree_gc_mark_node(c, b, &max_stale, +- journal_keys, initial); ++ ret = btree_gc_mark_node(c, b, &max_stale, initial); + if (ret) + break; + +@@ -287,6 +273,78 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + return ret; + } + ++static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, ++ struct journal_keys *journal_keys, ++ unsigned target_depth) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ bch2_bkey_debugcheck(c, b, k); ++ ++ ret = bch2_gc_mark_key(c, k, &max_stale, true); ++ if (ret) ++ break; ++ ++ if (b->level > target_depth) { ++ struct btree *child; ++ BKEY_PADDED(k) tmp; ++ ++ bkey_reassemble(&tmp.k, k); ++ ++ child = bch2_btree_node_get_noiter(c, &tmp.k, ++ b->btree_id, b->level - 1); ++ ret = PTR_ERR_OR_ZERO(child); ++ if (ret) ++ break; ++ ++ bch2_gc_btree_init_recurse(c, child, ++ journal_keys, target_depth); ++ six_unlock_read(&child->lock); ++ } ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ ++ return ret; ++} ++ ++static int bch2_gc_btree_init(struct bch_fs *c, ++ struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ bool metadata_only) ++{ ++ struct btree *b; ++ unsigned target_depth = metadata_only ? 1 ++ : expensive_debug_checks(c) ? 0 ++ : !btree_node_type_needs_gc(btree_id) ? 
1 ++ : 0; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ b = c->btree_roots[btree_id].b; ++ ++ if (btree_node_fake(b)) ++ return 0; ++ ++ six_lock_read(&b->lock, NULL, NULL); ++ if (b->level >= target_depth) ++ ret = bch2_gc_btree_init_recurse(c, b, ++ journal_keys, target_depth); ++ ++ if (!ret) ++ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), ++ &max_stale, true); ++ six_unlock_read(&b->lock); ++ ++ return ret; ++} ++ + static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) + { + return (int) btree_id_to_gc_phase(l) - +@@ -305,27 +363,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, + + for (i = 0; i < BTREE_ID_NR; i++) { + enum btree_id id = ids[i]; +- enum btree_node_type type = __btree_node_type(0, id); +- +- int ret = bch2_gc_btree(c, id, journal_keys, +- initial, metadata_only); ++ int ret = initial ++ ? bch2_gc_btree_init(c, journal_keys, ++ id, metadata_only) ++ : bch2_gc_btree(c, id, initial, metadata_only); + if (ret) + return ret; +- +- if (journal_keys && !metadata_only && +- btree_node_type_needs_gc(type)) { +- struct journal_key *j; +- u8 max_stale; +- int ret; +- +- for_each_journal_key(*journal_keys, j) +- if (j->btree_id == id) { +- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k), +- &max_stale, initial); +- if (ret) +- return ret; +- } +- } + } + + return 0; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 02b381cb567b..11083331fe65 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -27,30 +27,78 @@ + + /* iterate over keys read from the journal: */ + +-struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) ++static struct journal_key *journal_key_search(struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) + { +- while (iter->k) { +- if (iter->k->btree_id == iter->btree_id) +- return bkey_i_to_s_c(iter->k->k); ++ size_t l = 0, r = journal_keys->nr, m; + +- iter->k++; +- if (iter->k == iter->keys->d + iter->keys->nr) +- iter->k = NULL; ++ while (l < r) { ++ m = l + ((r - l) >> 1); ++ if ((cmp_int(id, journal_keys->d[m].btree_id) ?: ++ cmp_int(level, journal_keys->d[m].level) ?: ++ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) ++ l = m + 1; ++ else ++ r = m; + } + +- return bkey_s_c_null; ++ BUG_ON(l < journal_keys->nr && ++ (cmp_int(id, journal_keys->d[l].btree_id) ?: ++ cmp_int(level, journal_keys->d[l].level) ?: ++ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); ++ ++ BUG_ON(l && ++ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: ++ cmp_int(level, journal_keys->d[l - 1].level) ?: ++ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); ++ ++ return l < journal_keys->nr ? 
journal_keys->d + l : NULL; ++} ++ ++static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) ++{ ++ if (iter->k && ++ iter->k < iter->keys->d + iter->keys->nr && ++ iter->k->btree_id == iter->btree_id && ++ iter->k->level == iter->level) ++ return iter->k->k; ++ ++ iter->k = NULL; ++ return NULL; ++} ++ ++static void bch2_journal_iter_advance(struct journal_iter *iter) ++{ ++ if (iter->k) ++ iter->k++; + } + +-struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter) ++static void bch2_journal_iter_init(struct journal_iter *iter, ++ struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) + { +- if (!iter->k) +- return bkey_s_c_null; ++ iter->btree_id = id; ++ iter->level = level; ++ iter->keys = journal_keys; ++ iter->k = journal_key_search(journal_keys, id, level, pos); ++} + +- iter->k++; +- if (iter->k == iter->keys->d + iter->keys->nr) +- iter->k = NULL; ++static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) ++{ ++ return iter->btree ++ ? bch2_btree_iter_peek(iter->btree) ++ : bch2_btree_node_iter_peek_unpack(&iter->node_iter, ++ iter->b, &iter->unpacked); ++} + +- return bch2_journal_iter_peek(iter); ++static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) ++{ ++ if (iter->btree) ++ bch2_btree_iter_next(iter->btree); ++ else ++ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); + } + + void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) +@@ -59,10 +107,10 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) + case none: + break; + case btree: +- bch2_btree_iter_next(iter->btree); ++ bch2_journal_iter_advance_btree(iter); + break; + case journal: +- bch2_journal_iter_next(&iter->journal); ++ bch2_journal_iter_advance(&iter->journal); + break; + } + +@@ -74,14 +122,16 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * + struct bkey_s_c ret; + + while (1) { +- struct bkey_s_c btree_k = bch2_btree_iter_peek(iter->btree); +- struct bkey_s_c journal_k = bch2_journal_iter_peek(&iter->journal); ++ struct bkey_s_c btree_k = ++ bch2_journal_iter_peek_btree(iter); ++ struct bkey_s_c journal_k = ++ bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); + + if (btree_k.k && journal_k.k) { + int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); + + if (!cmp) +- bch2_btree_iter_next(iter->btree); ++ bch2_journal_iter_advance_btree(iter); + + iter->last = cmp < 0 ? btree : journal; + } else if (btree_k.k) { +@@ -94,6 +144,14 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * + } + + ret = iter->last == journal ? 
journal_k : btree_k; ++ ++ if (iter->b && ++ bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { ++ iter->journal.k = NULL; ++ iter->last = none; ++ return bkey_s_c_null; ++ } ++ + if (!bkey_deleted(ret.k)) + break; + +@@ -110,41 +168,32 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter * + return bch2_btree_and_journal_iter_peek(iter); + } + +-struct journal_key *journal_key_search(struct journal_keys *journal_keys, +- enum btree_id id, struct bpos pos) +-{ +- size_t l = 0, r = journal_keys->nr, m; +- +- while (l < r) { +- m = l + ((r - l) >> 1); +- if ((cmp_int(id, journal_keys->d[m].btree_id) ?: +- bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) +- l = m + 1; +- else +- r = m; +- } +- +- BUG_ON(l < journal_keys->nr && +- (cmp_int(id, journal_keys->d[l].btree_id) ?: +- bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); +- +- BUG_ON(l && +- (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: +- bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); +- +- return l < journal_keys->nr ? journal_keys->d + l : NULL; +-} +- + void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, + struct btree_trans *trans, + struct journal_keys *journal_keys, + enum btree_id id, struct bpos pos) + { +- iter->journal.keys = journal_keys; +- iter->journal.k = journal_key_search(journal_keys, id, pos); +- iter->journal.btree_id = id; ++ memset(iter, 0, sizeof(*iter)); + + iter->btree = bch2_trans_get_iter(trans, id, pos, 0); ++ bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); ++} ++ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct journal_keys *journal_keys, ++ struct btree *b) ++{ ++ struct bpos start = b->data->min_key; ++ ++ if (btree_node_type_is_extents(b->btree_id)) ++ start = bkey_successor(start); ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->b = b; ++ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); ++ bch2_journal_iter_init(&iter->journal, journal_keys, ++ b->btree_id, b->level, start); + } + + /* sort and dedup all keys in the journal: */ +@@ -169,7 +218,8 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) + const struct journal_key *l = _l; + const struct journal_key *r = _r; + +- return cmp_int(l->btree_id, r->btree_id) ?: ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ cmp_int(l->level, r->level) ?: + bkey_cmp(l->k->k.p, r->k->k.p) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +@@ -180,9 +230,10 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) + const struct journal_key *l = _l; + const struct journal_key *r = _r; + +- return cmp_int(l->journal_seq, r->journal_seq) ?: +- cmp_int(l->btree_id, r->btree_id) ?: +- bkey_cmp(l->k->k.p, r->k->k.p); ++ return cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->btree_id, r->btree_id) ?: ++ cmp_int(l->level, r->level) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p); + } + + static void journal_keys_free(struct journal_keys *keys) +@@ -218,6 +269,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + for_each_jset_key(k, _n, entry, &p->j) + keys.d[keys.nr++] = (struct journal_key) { + .btree_id = entry->btree_id, ++ .level = entry->level, + .k = k, + .journal_seq = le64_to_cpu(p->j.seq) - + keys.journal_seq_base, +@@ -229,7 +281,8 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + src = dst = keys.d; + while (src < keys.d + keys.nr) { + while (src + 1 < keys.d + keys.nr && +- src[0].btree_id 
== src[1].btree_id && ++ src[0].btree_id == src[1].btree_id && ++ src[0].level == src[1].level && + !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) + src++; + +@@ -864,7 +917,7 @@ int bch2_fs_recovery(struct bch_fs *c) + */ + bch_info(c, "starting metadata mark and sweep"); + err = "error in mark and sweep"; +- ret = bch2_gc(c, NULL, true, true); ++ ret = bch2_gc(c, &journal_keys, true, true); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index c91309301563..fa1f2818817d 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -5,6 +5,7 @@ + struct journal_keys { + struct journal_key { + enum btree_id btree_id:8; ++ unsigned level:8; + struct bkey_i *k; + u32 journal_seq; + u32 journal_offset; +@@ -17,15 +18,23 @@ struct journal_keys { + for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) + + struct journal_iter { ++ enum btree_id btree_id; ++ unsigned level; + struct journal_keys *keys; + struct journal_key *k; +- enum btree_id btree_id; + }; + +-struct btree_and_journal_iter { +- enum btree_id btree_id; ++/* ++ * Iterate over keys in the btree, with keys from the journal overlaid on top: ++ */ + ++struct btree_and_journal_iter { + struct btree_iter *btree; ++ ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey unpacked; ++ + struct journal_iter journal; + + enum last_key_returned { +@@ -38,12 +47,14 @@ struct btree_and_journal_iter { + void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); + struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); + struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); +-struct journal_key *journal_key_search(struct journal_keys *, +- enum btree_id, struct bpos); ++ + void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, + struct btree_trans *, + struct journal_keys *, + enum btree_id, struct bpos); ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct journal_keys *, ++ struct btree *); + + int bch2_fs_recovery(struct bch_fs *); + int bch2_fs_initialize(struct bch_fs *); +-- +cgit v1.2.3 + + +From b223c70e496b550948fd8fc4c5db58ead91518e9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 15 Mar 2020 22:32:03 -0400 +Subject: bcachefs: Replay interior node keys + +This slightly modifies the journal replay code so that it can replay +updates to interior nodes. 
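To make the new replay order concrete: with the comparator change in the diff below, keys sort by descending btree level before journal sequence, so interior-node keys are replayed before the leaf keys. A minimal stand-alone sketch of that ordering rule follows; struct demo_jkey and demo_sort_seq_cmp are simplified stand-ins (the real comparator also breaks ties on btree id and key position, omitted here).

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for struct journal_key: only the fields this sort looks at. */
struct demo_jkey {
	unsigned		level;
	unsigned long long	journal_seq;
};

/* Descending level first, then ascending journal sequence. */
static int demo_sort_seq_cmp(const void *_l, const void *_r)
{
	const struct demo_jkey *l = _l, *r = _r;

	if (l->level != r->level)
		return r->level > l->level ? 1 : -1;
	if (l->journal_seq != r->journal_seq)
		return l->journal_seq > r->journal_seq ? 1 : -1;
	return 0;
}

int main(void)
{
	struct demo_jkey keys[] = { { 0, 7 }, { 1, 5 }, { 0, 3 } };
	size_t i;

	qsort(keys, sizeof(keys) / sizeof(keys[0]), sizeof(keys[0]),
	      demo_sort_seq_cmp);

	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
		printf("level %u seq %llu\n", keys[i].level, keys[i].journal_seq);
	/* prints: level 1 seq 5, then level 0 seq 3, then level 0 seq 7 */
	return 0;
}

In the diff this corresponds to moving cmp_int(r->level, l->level) to the front of journal_sort_seq_cmp().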
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 11083331fe65..2b428ee73364 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -230,9 +230,9 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) + const struct journal_key *l = _l; + const struct journal_key *r = _r; + +- return cmp_int(l->journal_seq, r->journal_seq) ?: ++ return cmp_int(r->level, l->level) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->btree_id, r->btree_id) ?: +- cmp_int(l->level, r->level) ?: + bkey_cmp(l->k->k.p, r->k->k.p); + } + +@@ -404,12 +404,15 @@ err: + } + + static int __bch2_journal_replay_key(struct btree_trans *trans, +- enum btree_id id, struct bkey_i *k) ++ enum btree_id id, unsigned level, ++ struct bkey_i *k) + { + struct btree_iter *iter; + int ret; + +- iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT); ++ iter = bch2_trans_get_node_iter(trans, id, k->k.p, ++ BTREE_MAX_DEPTH, level, ++ BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + +@@ -428,13 +431,13 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + } + + static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, +- struct bkey_i *k) ++ unsigned level, struct bkey_i *k) + { + return bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY, +- __bch2_journal_replay_key(&trans, id, k)); ++ __bch2_journal_replay_key(&trans, id, level, k)); + } + + static int bch2_journal_replay(struct bch_fs *c, +@@ -446,15 +449,20 @@ static int bch2_journal_replay(struct bch_fs *c, + + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); + ++ replay_now_at(j, keys.journal_seq_base); ++ + for_each_journal_key(keys, i) { +- replay_now_at(j, keys.journal_seq_base + i->journal_seq); ++ if (!i->level) ++ replay_now_at(j, keys.journal_seq_base + i->journal_seq); + ++ if (i->level) ++ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + if (i->btree_id == BTREE_ID_ALLOC) + ret = bch2_alloc_replay_key(c, i->k); + else if (i->k->k.size) + ret = bch2_extent_replay_key(c, i->btree_id, i->k); + else +- ret = bch2_journal_replay_key(c, i->btree_id, i->k); ++ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + + if (ret) { + bch_err(c, "journal replay: error %d while replaying key", +-- +cgit v1.2.3 + + +From b64105a243dc14e45fcbc936839e1172cae6a433 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 8 Feb 2020 19:06:31 -0500 +Subject: bcachefs: Journal updates to interior nodes + +Previously, the btree has always been self contained and internally +consistent on disk without anything from the journal - the journal just +contained pointers to the btree roots. + +However, this meant that btree node split or compact operations - i.e. +anything that changes btree node topology and involves updates to +interior nodes - would require that interior btree node to be written +immediately, which means emitting a btree node write that's mostly empty +(using 4k of space on disk if the filesystemm blocksize is 4k to only +write perhaps ~100 bytes of new keys). + +More importantly, this meant most btree node writes had to be FUA, and +consumer drives have a history of slow and/or buggy FUA support - other +filesystes have been bit by this. 
+ +This patch changes the interior btree update path to journal updates to +interior nodes, after the writes for the new btree nodes have completed. +Best of all, it turns out to simplify the interior node update path +somewhat. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 3 +- + fs/bcachefs/btree_io.c | 11 +- + fs/bcachefs/btree_io.h | 9 +- + fs/bcachefs/btree_types.h | 3 - + fs/bcachefs/btree_update.h | 1 + + fs/bcachefs/btree_update_interior.c | 352 ++++++++++++------------------------ + fs/bcachefs/btree_update_interior.h | 16 +- + fs/bcachefs/btree_update_leaf.c | 23 ++- + fs/bcachefs/super-io.c | 2 + + 9 files changed, 146 insertions(+), 274 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 798f5c9ea164..a78988e3ded7 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1312,7 +1312,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + x(new_extent_overwrite, 9) \ + x(incompressible, 10) \ + x(btree_ptr_v2, 11) \ +- x(extents_above_btree_updates, 12) ++ x(extents_above_btree_updates, 12) \ ++ x(btree_updates_journalled, 13) + + #define BCH_SB_FEATURES_ALL \ + ((1ULL << BCH_FEATURE_new_siphash)| \ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 3f7c10420042..b48d48b8c27d 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1261,7 +1261,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + closure_put(&((struct btree_update *) new)->cl); + + bch2_journal_pin_drop(&c->journal, &w->journal); +- closure_wake_up(&w->wait); + } + + static void btree_node_write_done(struct bch_fs *c, struct btree *b) +@@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + wbio->wbio.bio.bi_end_io = btree_node_write_endio; + wbio->wbio.bio.bi_private = b; + +- if (b->level || !b->written) +- wbio->wbio.bio.bi_opf |= REQ_FUA; +- + bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); + + /* +@@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) { + unsigned long flags = READ_ONCE(b->flags); +- unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0; + + if (!(flags & (1 << BTREE_NODE_dirty))) + continue; + +- pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n", ++ pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", + b, + (flags & (1 << BTREE_NODE_dirty)) != 0, + (flags & (1 << BTREE_NODE_need_write)) != 0, +@@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) + b->written, + !list_empty_careful(&b->write_blocked), + b->will_make_reachable != 0, +- b->will_make_reachable & 1, +- b->writes[ idx].wait.list.first != NULL, +- b->writes[!idx].wait.list.first != NULL); ++ b->will_make_reachable & 1); + } + rcu_read_unlock(); + +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index e90e89eee273..fd719dda7d91 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + void bch2_btree_node_write(struct bch_fs *, struct btree *, + enum six_lock_type); + +-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b) ++static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_held) + { + while (b->written && + btree_node_need_write(b) && + btree_node_may_write(b)) { + if 
(!btree_node_write_in_flight(b)) { +- bch2_btree_node_write(c, b, SIX_LOCK_read); ++ bch2_btree_node_write(c, b, lock_held); + break; + } + + six_unlock_read(&b->lock); + btree_node_wait_on_io(b); +- btree_node_lock_type(c, b, SIX_LOCK_read); ++ btree_node_lock_type(c, b, lock_held); + } + } + +@@ -131,7 +132,7 @@ do { \ + new |= (1 << BTREE_NODE_need_write); \ + } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ + \ +- btree_node_write_if_need(_c, _b); \ ++ btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ + } while (0) + + void bch2_btree_flush_all_reads(struct bch_fs *); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 3e9b924c21b8..31a5c215ca34 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -53,7 +53,6 @@ struct bset_tree { + + struct btree_write { + struct journal_entry_pin journal; +- struct closure_waitlist wait; + }; + + struct btree_alloc { +@@ -544,8 +543,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) + struct btree_root { + struct btree *b; + +- struct btree_update *as; +- + /* On disk root - see async splits: */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + u8 level; +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 9f58d47ef5d6..11f7d02de622 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, + struct btree_iter *); + bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bkey_i *); ++void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); + + enum btree_insert_flags { + __BTREE_INSERT_NOUNLOCK, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 02e47828914b..7b00b26ed0f2 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -24,7 +24,6 @@ + static void btree_node_will_make_reachable(struct btree_update *, + struct btree *); + static void btree_update_drop_new_node(struct bch_fs *, struct btree *); +-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int); + + /* Debug code: */ + +@@ -260,16 +259,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, + } + + static void bch2_btree_node_free_ondisk(struct bch_fs *c, +- struct pending_btree_node_free *pending) ++ struct pending_btree_node_free *pending, ++ u64 journal_seq) + { + BUG_ON(!pending->index_update_done); + + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), +- 0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE); ++ 0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE); + + if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), +- 0, 0, NULL, 0, ++ 0, 0, NULL, journal_seq, + BTREE_TRIGGER_OVERWRITE| + BTREE_TRIGGER_GC); + } +@@ -585,10 +585,13 @@ static void bch2_btree_update_free(struct btree_update *as) + { + struct bch_fs *c = as->c; + ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); + bch2_journal_pin_flush(&c->journal, &as->journal); + +- BUG_ON(as->nr_new_nodes); +- BUG_ON(as->nr_pending); ++ BUG_ON((as->nr_new_nodes || as->nr_pending) && ++ !bch2_journal_error(&c->journal));; + + if (as->reserve) + bch2_btree_reserve_put(c, as->reserve); +@@ -603,13 +606,10 @@ static void bch2_btree_update_free(struct btree_update *as) + mutex_unlock(&c->btree_interior_update_lock); + } + +-static void 
btree_update_nodes_reachable(struct closure *cl) ++static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) + { +- struct btree_update *as = container_of(cl, struct btree_update, cl); + struct bch_fs *c = as->c; + +- bch2_journal_pin_drop(&c->journal, &as->journal); +- + mutex_lock(&c->btree_interior_update_lock); + + while (as->nr_new_nodes) { +@@ -630,39 +630,22 @@ static void btree_update_nodes_reachable(struct closure *cl) + } + + while (as->nr_pending) +- bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]); ++ bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending], ++ seq); + + mutex_unlock(&c->btree_interior_update_lock); +- +- closure_wake_up(&as->wait); +- +- bch2_btree_update_free(as); +-} +- +-static void btree_update_wait_on_journal(struct closure *cl) +-{ +- struct btree_update *as = container_of(cl, struct btree_update, cl); +- struct bch_fs *c = as->c; +- int ret; +- +- ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl); +- if (ret == -EAGAIN) { +- continue_at(cl, btree_update_wait_on_journal, system_wq); +- return; +- } +- if (ret < 0) +- goto err; +- +- bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl); +-err: +- continue_at(cl, btree_update_nodes_reachable, system_wq); + } + + static void btree_update_nodes_written(struct closure *cl) + { + struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct journal_res res = { 0 }; + struct bch_fs *c = as->c; + struct btree *b; ++ struct bset *i; ++ struct bkey_i *k; ++ unsigned journal_u64s = 0; ++ int ret; + + /* + * We did an update to a parent node where the pointers we added pointed +@@ -671,7 +654,7 @@ static void btree_update_nodes_written(struct closure *cl) + */ + mutex_lock(&c->btree_interior_update_lock); + as->nodes_written = true; +-retry: ++again: + as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, + struct btree_update, unwritten_list); + if (!as || !as->nodes_written) { +@@ -679,31 +662,53 @@ retry: + return; + } + ++ b = as->b; ++ if (b && !six_trylock_intent(&b->lock)) { ++ mutex_unlock(&c->btree_interior_update_lock); ++ btree_node_lock_type(c, b, SIX_LOCK_intent); ++ six_unlock_intent(&b->lock); ++ goto out; ++ } ++ ++ journal_u64s = 0; ++ ++ if (as->mode != BTREE_INTERIOR_UPDATING_ROOT) ++ for_each_keylist_key(&as->parent_keys, k) ++ journal_u64s += jset_u64s(k->k.u64s); ++ ++ ret = bch2_journal_res_get(&c->journal, &res, journal_u64s, ++ JOURNAL_RES_GET_RESERVED); ++ if (ret) { ++ BUG_ON(!bch2_journal_error(&c->journal)); ++ /* can't unblock btree writes */ ++ goto free_update; ++ } ++ ++ if (as->mode != BTREE_INTERIOR_UPDATING_ROOT) ++ for_each_keylist_key(&as->parent_keys, k) ++ bch2_journal_add_entry(&c->journal, &res, ++ BCH_JSET_ENTRY_btree_keys, ++ as->btree_id, ++ as->level, ++ k, k->k.u64s); ++ + switch (as->mode) { + case BTREE_INTERIOR_NO_UPDATE: + BUG(); + case BTREE_INTERIOR_UPDATING_NODE: +- /* The usual case: */ +- b = READ_ONCE(as->b); +- +- if (!six_trylock_read(&b->lock)) { +- mutex_unlock(&c->btree_interior_update_lock); +- btree_node_lock_type(c, b, SIX_LOCK_read); +- six_unlock_read(&b->lock); +- mutex_lock(&c->btree_interior_update_lock); +- goto retry; +- } +- +- BUG_ON(!btree_node_dirty(b)); +- closure_wait(&btree_current_write(b)->wait, &as->cl); ++ /* @b is the node we did the final insert into: */ ++ BUG_ON(!res.ref); + ++ six_lock_write(&b->lock, NULL, NULL); + list_del(&as->write_blocked_list); + +- /* +- * for flush_held_btree_writes() waiting on updates to flush or +- * nodes to 
be writeable: +- */ +- closure_wake_up(&c->btree_interior_update_wait); ++ i = btree_bset_last(b); ++ i->journal_seq = cpu_to_le64( ++ max(res.seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, res.seq); ++ six_unlock_write(&b->lock); + + list_del(&as->unwritten_list); + mutex_unlock(&c->btree_interior_update_lock); +@@ -712,82 +717,51 @@ retry: + * b->write_blocked prevented it from being written, so + * write it now if it needs to be written: + */ +- bch2_btree_node_write_cond(c, b, true); +- six_unlock_read(&b->lock); +- continue_at(&as->cl, btree_update_nodes_reachable, system_wq); ++ btree_node_write_if_need(c, b, SIX_LOCK_intent); ++ six_unlock_intent(&b->lock); + break; + + case BTREE_INTERIOR_UPDATING_AS: +- /* +- * The btree node we originally updated has been freed and is +- * being rewritten - so we need to write anything here, we just +- * need to signal to that btree_update that it's ok to make the +- * new replacement node visible: +- */ +- closure_put(&as->parent_as->cl); +- +- /* +- * and then we have to wait on that btree_update to finish: +- */ +- closure_wait(&as->parent_as->wait, &as->cl); ++ BUG_ON(b); + + list_del(&as->unwritten_list); + mutex_unlock(&c->btree_interior_update_lock); +- +- continue_at(&as->cl, btree_update_nodes_reachable, system_wq); + break; + +- case BTREE_INTERIOR_UPDATING_ROOT: +- /* b is the new btree root: */ +- b = READ_ONCE(as->b); +- +- if (!six_trylock_read(&b->lock)) { +- mutex_unlock(&c->btree_interior_update_lock); +- btree_node_lock_type(c, b, SIX_LOCK_read); +- six_unlock_read(&b->lock); +- mutex_lock(&c->btree_interior_update_lock); +- goto retry; +- } +- +- BUG_ON(c->btree_roots[b->btree_id].as != as); +- c->btree_roots[b->btree_id].as = NULL; ++ case BTREE_INTERIOR_UPDATING_ROOT: { ++ struct btree_root *r = &c->btree_roots[as->btree_id]; + +- bch2_btree_set_root_ondisk(c, b, WRITE); ++ BUG_ON(b); + +- /* +- * We don't have to wait anything anything here (before +- * btree_update_nodes_reachable frees the old nodes +- * ondisk) - we've ensured that the very next journal write will +- * have the pointer to the new root, and before the allocator +- * can reuse the old nodes it'll have to do a journal commit: +- */ +- six_unlock_read(&b->lock); ++ mutex_lock(&c->btree_root_lock); ++ bkey_copy(&r->key, as->parent_keys.keys); ++ r->level = as->level; ++ r->alive = true; ++ c->btree_roots_dirty = true; ++ mutex_unlock(&c->btree_root_lock); + + list_del(&as->unwritten_list); + mutex_unlock(&c->btree_interior_update_lock); +- +- /* +- * Bit of funny circularity going on here we have to break: +- * +- * We have to drop our journal pin before writing the journal +- * entry that points to the new btree root: else, we could +- * deadlock if the journal currently happens to be full. 
+- * +- * This mean we're dropping the journal pin _before_ the new +- * nodes are technically reachable - but this is safe, because +- * after the bch2_btree_set_root_ondisk() call above they will +- * be reachable as of the very next journal write: +- */ +- bch2_journal_pin_drop(&c->journal, &as->journal); +- +- as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal); +- +- btree_update_wait_on_journal(&as->cl); + break; + } ++ } + ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ ++ bch2_journal_res_put(&c->journal, &res); ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ btree_update_nodes_reachable(as, res.seq); ++free_update: ++ bch2_btree_update_free(as); ++ /* ++ * for flush_held_btree_writes() waiting on updates to flush or ++ * nodes to be writeable: ++ */ ++ closure_wake_up(&c->btree_interior_update_wait); ++out: + mutex_lock(&c->btree_interior_update_lock); +- goto retry; ++ goto again; + } + + /* +@@ -804,48 +778,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(!btree_node_dirty(b)); + +- as->mode = BTREE_INTERIOR_UPDATING_NODE; +- as->b = b; ++ as->mode = BTREE_INTERIOR_UPDATING_NODE; ++ as->b = b; ++ as->level = b->level; + list_add(&as->write_blocked_list, &b->write_blocked); + + mutex_unlock(&c->btree_interior_update_lock); +- +- /* +- * In general, when you're staging things in a journal that will later +- * be written elsewhere, and you also want to guarantee ordering: that +- * is, if you have updates a, b, c, after a crash you should never see c +- * and not a or b - there's a problem: +- * +- * If the final destination of the update(s) (i.e. btree node) can be +- * written/flushed _before_ the relevant journal entry - oops, that +- * breaks ordering, since the various leaf nodes can be written in any +- * order. +- * +- * Normally we use bset->journal_seq to deal with this - if during +- * recovery we find a btree node write that's newer than the newest +- * journal entry, we just ignore it - we don't need it, anything we're +- * supposed to have (that we reported as completed via fsync()) will +- * still be in the journal, and as far as the state of the journal is +- * concerned that btree node write never happened. +- * +- * That breaks when we're rewriting/splitting/merging nodes, since we're +- * mixing btree node writes that haven't happened yet with previously +- * written data that has been reported as completed to the journal. +- * +- * Thus, before making the new nodes reachable, we have to wait the +- * newest journal sequence number we have data for to be written (if it +- * hasn't been yet). 
+- */ +- bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); +-} +- +-static void interior_update_flush(struct journal *j, +- struct journal_entry_pin *pin, u64 seq) +-{ +- struct btree_update *as = +- container_of(pin, struct btree_update, journal); +- +- bch2_journal_flush_seq_async(j, as->journal_seq, NULL); + } + + static void btree_update_reparent(struct btree_update *as, +@@ -853,10 +791,10 @@ static void btree_update_reparent(struct btree_update *as, + { + struct bch_fs *c = as->c; + ++ lockdep_assert_held(&c->btree_interior_update_lock); ++ + child->b = NULL; + child->mode = BTREE_INTERIOR_UPDATING_AS; +- child->parent_as = as; +- closure_get(&as->cl); + + /* + * When we write a new btree root, we have to drop our journal pin +@@ -867,46 +805,24 @@ static void btree_update_reparent(struct btree_update *as, + * just transfer the journal pin to the new interior update so + * btree_update_nodes_written() can drop it. + */ +- bch2_journal_pin_copy(&c->journal, &as->journal, +- &child->journal, interior_update_flush); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); + bch2_journal_pin_drop(&c->journal, &child->journal); +- +- as->journal_seq = max(as->journal_seq, child->journal_seq); + } + +-static void btree_update_updated_root(struct btree_update *as) ++static void btree_update_updated_root(struct btree_update *as, struct btree *b) + { + struct bch_fs *c = as->c; +- struct btree_root *r = &c->btree_roots[as->btree_id]; +- +- mutex_lock(&c->btree_interior_update_lock); +- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ BUG_ON(!bch2_keylist_empty(&as->parent_keys)); + +- /* +- * Old root might not be persistent yet - if so, redirect its +- * btree_update operation to point to us: +- */ +- if (r->as) +- btree_update_reparent(as, r->as); +- +- as->mode = BTREE_INTERIOR_UPDATING_ROOT; +- as->b = r->b; +- r->as = as; ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + ++ as->mode = BTREE_INTERIOR_UPDATING_ROOT; ++ as->level = b->level; ++ bch2_keylist_add(&as->parent_keys, &b->key); + mutex_unlock(&c->btree_interior_update_lock); +- +- /* +- * When we're rewriting nodes and updating interior nodes, there's an +- * issue with updates that haven't been written in the journal getting +- * mixed together with older data - see btree_update_updated_node() +- * for the explanation. +- * +- * However, this doesn't affect us when we're writing a new btree root - +- * because to make that new root reachable we have to write out a new +- * journal entry, which must necessarily be newer than as->journal_seq. +- */ + } + + static void btree_node_will_make_reachable(struct btree_update *as, +@@ -983,10 +899,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + struct btree *b) + { + struct bch_fs *c = as->c; +- struct closure *cl, *cl_n; + struct btree_update *p, *n; + struct btree_write *w; +- struct bset_tree *t; + + set_btree_node_dying(b); + +@@ -995,18 +909,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + + btree_interior_update_add_node_reference(as, b); + +- /* +- * Does this node have data that hasn't been written in the journal? 
+- * +- * If so, we have to wait for the corresponding journal entry to be +- * written before making the new nodes reachable - we can't just carry +- * over the bset->journal_seq tracking, since we'll be mixing those keys +- * in with keys that aren't in the journal anymore: +- */ +- for_each_bset(b, t) +- as->journal_seq = max(as->journal_seq, +- le64_to_cpu(bset(b, t)->journal_seq)); +- + mutex_lock(&c->btree_interior_update_lock); + + /* +@@ -1030,16 +932,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + + clear_btree_node_dirty(b); + clear_btree_node_need_write(b); +- w = btree_current_write(b); +- +- /* +- * Does this node have any btree_update operations waiting on this node +- * to be written? +- * +- * If so, wake them up when this btree_update operation is reachable: +- */ +- llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list) +- llist_add(&cl->list, &as->wait.list); + + /* + * Does this node have unwritten data that has a pin on the journal? +@@ -1049,13 +941,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + * oldest pin of any of the nodes we're freeing. We'll release the pin + * when the new nodes are persistent and reachable on disk: + */ +- bch2_journal_pin_copy(&c->journal, &as->journal, +- &w->journal, interior_update_flush); ++ w = btree_current_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_drop(&c->journal, &w->journal); + + w = btree_prev_write(b); +- bch2_journal_pin_copy(&c->journal, &as->journal, +- &w->journal, interior_update_flush); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_drop(&c->journal, &w->journal); + + mutex_unlock(&c->btree_interior_update_lock); +@@ -1078,6 +969,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, + { + struct btree_reserve *reserve; + struct btree_update *as; ++ int ret; + + reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); + if (IS_ERR(reserve)) +@@ -1094,6 +986,15 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, + + bch2_keylist_init(&as->parent_keys, as->inline_keys); + ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0); ++ if (ret) { ++ bch2_btree_reserve_put(c, reserve); ++ closure_debug_destroy(&as->cl); ++ mempool_free(as, &c->btree_interior_update_pool); ++ return ERR_PTR(ret); ++ } ++ + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); +@@ -1153,22 +1054,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) + mutex_unlock(&c->btree_interior_update_lock); + } + +-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw) +-{ +- struct btree_root *r = &c->btree_roots[b->btree_id]; +- +- mutex_lock(&c->btree_root_lock); +- +- BUG_ON(b != r->b); +- bkey_copy(&r->key, &b->key); +- r->level = b->level; +- r->alive = true; +- if (rw == WRITE) +- c->btree_roots_dirty = true; +- +- mutex_unlock(&c->btree_root_lock); +-} +- + /** + * bch_btree_set_root - update the root in memory and on disk + * +@@ -1201,7 +1086,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, + + bch2_btree_set_root_inmem(as, b); + +- btree_update_updated_root(as); ++ btree_update_updated_root(as, b); + + /* + * Unlock old root after new root is visible: +@@ -1471,7 +1356,8 @@ static void btree_split(struct 
btree_update *as, struct btree *b, + bch2_btree_build_aux_trees(n1); + six_unlock_write(&n1->lock); + +- bch2_keylist_add(&as->parent_keys, &n1->key); ++ if (parent) ++ bch2_keylist_add(&as->parent_keys, &n1->key); + } + + bch2_btree_node_write(c, n1, SIX_LOCK_intent); +@@ -1545,12 +1431,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + (bkey_cmp_packed(b, k, &insert->k) >= 0)) + ; + +- while (!bch2_keylist_empty(keys)) { +- insert = bch2_keylist_front(keys); +- ++ for_each_keylist_key(keys, insert) + bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); +- bch2_keylist_pop_front(keys); +- } + + btree_update_updated_node(as, b); + +@@ -2107,7 +1989,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + bkey_copy(&b->key, new_key); + } + +- btree_update_updated_root(as); ++ btree_update_updated_root(as, b); + bch2_btree_node_unlock_write(b, iter); + } + +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index c90fcd48eeb7..0ac95dd80a38 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -69,8 +69,10 @@ struct btree_update { + unsigned nodes_written:1; + + enum btree_id btree_id; ++ u8 level; + + struct btree_reserve *reserve; ++ struct journal_preres journal_preres; + + /* + * BTREE_INTERIOR_UPDATING_NODE: +@@ -83,18 +85,6 @@ struct btree_update { + struct btree *b; + struct list_head write_blocked_list; + +- /* +- * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now +- * we're now blocking another btree_update +- * @parent_as - btree_update that's waiting on our nodes to finish +- * writing, before it can make new nodes visible on disk +- * @wait - list of child btree_updates that are waiting on this +- * btree_update to make all the new nodes visible before they can free +- * their old btree nodes +- */ +- struct btree_update *parent_as; +- struct closure_waitlist wait; +- + /* + * We may be freeing nodes that were dirty, and thus had journal entries + * pinned: we need to transfer the oldest of those pins to the +@@ -103,8 +93,6 @@ struct btree_update { + */ + struct journal_entry_pin journal; + +- u64 journal_seq; +- + /* + * Nodes being freed: + * Protected by c->btree_node_pending_free_lock +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e491972007e1..f94bc6a0b699 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, + return __btree_node_flush(j, pin, 1, seq); + } + ++inline void bch2_btree_add_journal_pin(struct bch_fs *c, ++ struct btree *b, u64 seq) ++{ ++ struct btree_write *w = btree_current_write(b); ++ ++ bch2_journal_pin_add(&c->journal, seq, &w->journal, ++ btree_node_write_idx(b) == 0 ++ ? btree_node_flush0 ++ : btree_node_flush1); ++} ++ + static inline void __btree_journal_key(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_i *insert) +@@ -173,10 +184,6 @@ static void bch2_btree_journal_key(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree *b = iter_l(iter)->b; +- struct btree_write *w = btree_current_write(b); +- u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) +- ? 
trans->journal_res.seq +- : j->replay_journal_seq; + + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); +@@ -187,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans, + cpu_to_le64(trans->journal_res.seq); + } + +- bch2_journal_pin_add(j, seq, &w->journal, +- btree_node_write_idx(b) == 0 +- ? btree_node_flush0 +- : btree_node_flush1); ++ bch2_btree_add_journal_pin(c, b, ++ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) ++ ? trans->journal_res.seq ++ : j->replay_journal_seq); + + if (unlikely(!btree_node_dirty(b))) + set_btree_node_dirty(b); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index b5e276539bd6..9c1aaa594cd1 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -955,6 +955,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) + c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +@@ -1086,6 +1087,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; + c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); ++ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); + + u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; + +-- +cgit v1.2.3 + + +From 70b71a55da8d4c8c2564952fb643a8dd644acbb1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 25 Mar 2020 17:57:29 -0400 +Subject: bcachefs: Fix an assertion when nothing to replay + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 2b428ee73364..56e23ef2c1ac 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -449,7 +449,8 @@ static int bch2_journal_replay(struct bch_fs *c, + + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); + +- replay_now_at(j, keys.journal_seq_base); ++ if (keys.nr) ++ replay_now_at(j, keys.journal_seq_base); + + for_each_journal_key(keys, i) { + if (!i->level) +-- +cgit v1.2.3 + + +From 97bdef9dcb33c69fad01756821e802c7aa15f286 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 25 Mar 2020 16:12:33 -0400 +Subject: bcachefs: Add an option for keeping journal entries after startup + +This will be used by the userspace debug tools. 
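As a rough illustration of what keeping the keys around makes possible: with keep_journal set, c->journal_keys stays populated after recovery and can be walked with the existing for_each_journal_key() macro. The helper below is a sketch only, assuming the in-tree bcachefs headers from this series; demo_dump_journal_keys is hypothetical and is not something this patch or the userspace tools add.

#include "bcachefs.h"
#include "recovery.h"

/* Hypothetical: dump the retained, sorted journal keys of a mounted fs. */
static void demo_dump_journal_keys(struct bch_fs *c)
{
	struct journal_key *i;

	for_each_journal_key(c->journal_keys, i)
		pr_info("btree %u level %u seq %u offset %u\n",
			i->btree_id, i->level,
			i->journal_seq, i->journal_offset);
}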
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 14 ++++++++++++++ + fs/bcachefs/opts.h | 5 +++++ + fs/bcachefs/recovery.c | 42 +++++++++++++++++++++--------------------- + fs/bcachefs/recovery.h | 15 +++------------ + fs/bcachefs/super.c | 4 ++++ + 5 files changed, 47 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index cce3d12f5283..43161028333b 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -523,6 +523,18 @@ struct journal_seq_blacklist_table { + } entries[0]; + }; + ++struct journal_keys { ++ struct journal_key { ++ enum btree_id btree_id:8; ++ unsigned level:8; ++ struct bkey_i *k; ++ u32 journal_seq; ++ u32 journal_offset; ++ } *d; ++ size_t nr; ++ u64 journal_seq_base; ++}; ++ + struct bch_fs { + struct closure cl; + +@@ -789,6 +801,8 @@ struct bch_fs { + mempool_t btree_bounce_pool; + + struct journal journal; ++ struct list_head journal_entries; ++ struct journal_keys journal_keys; + + u64 last_bucket_seq_cleanup; + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 1c05effa71e6..ba4903352343 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -255,6 +255,11 @@ enum opt_type { + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't replay the journal") \ ++ x(keep_journal, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't free journal entries/keys after startup")\ + x(noexcl, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 56e23ef2c1ac..5f27b8402d71 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -198,7 +198,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i + + /* sort and dedup all keys in the journal: */ + +-static void journal_entries_free(struct list_head *list) ++void bch2_journal_entries_free(struct list_head *list) + { + + while (!list_empty(list)) { +@@ -236,7 +236,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) + bkey_cmp(l->k->k.p, r->k->k.p); + } + +-static void journal_keys_free(struct journal_keys *keys) ++void bch2_journal_keys_free(struct journal_keys *keys) + { + kvfree(keys->d); + keys->d = NULL; +@@ -802,8 +802,6 @@ int bch2_fs_recovery(struct bch_fs *c) + const char *err = "cannot allocate memory"; + struct bch_sb_field_clean *clean = NULL; + u64 journal_seq; +- LIST_HEAD(journal_entries); +- struct journal_keys journal_keys = { NULL }; + bool wrote = false, write_sb = false; + int ret; + +@@ -825,30 +823,30 @@ int bch2_fs_recovery(struct bch_fs *c) + if (!c->sb.clean || c->opts.fsck) { + struct jset *j; + +- ret = bch2_journal_read(c, &journal_entries); ++ ret = bch2_journal_read(c, &c->journal_entries); + if (ret) + goto err; + +- if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c, ++ if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, + "filesystem marked clean but journal not empty")) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + } + +- if (!c->sb.clean && list_empty(&journal_entries)) { ++ if (!c->sb.clean && list_empty(&c->journal_entries)) { + bch_err(c, "no journal entries found"); + ret = BCH_FSCK_REPAIR_IMPOSSIBLE; + goto err; + } + +- journal_keys = journal_keys_sort(&journal_entries); +- if (!journal_keys.d) { ++ c->journal_keys = journal_keys_sort(&c->journal_entries); ++ if (!c->journal_keys.d) { + ret = -ENOMEM; + goto err; + } + +- j = 
&list_last_entry(&journal_entries, ++ j = &list_last_entry(&c->journal_entries, + struct journal_replay, list)->j; + + ret = verify_superblock_clean(c, &clean, j); +@@ -867,7 +865,7 @@ int bch2_fs_recovery(struct bch_fs *c) + goto err; + } + +- ret = journal_replay_early(c, clean, &journal_entries); ++ ret = journal_replay_early(c, clean, &c->journal_entries); + if (ret) + goto err; + +@@ -885,15 +883,15 @@ int bch2_fs_recovery(struct bch_fs *c) + + ret = bch2_blacklist_table_initialize(c); + +- if (!list_empty(&journal_entries)) { ++ if (!list_empty(&c->journal_entries)) { + ret = verify_journal_entries_not_blacklisted_or_missing(c, +- &journal_entries); ++ &c->journal_entries); + if (ret) + goto err; + } + + ret = bch2_fs_journal_start(&c->journal, journal_seq, +- &journal_entries); ++ &c->journal_entries); + if (ret) + goto err; + +@@ -903,14 +901,14 @@ int bch2_fs_recovery(struct bch_fs *c) + + bch_verbose(c, "starting alloc read"); + err = "error reading allocation information"; +- ret = bch2_alloc_read(c, &journal_keys); ++ ret = bch2_alloc_read(c, &c->journal_keys); + if (ret) + goto err; + bch_verbose(c, "alloc read done"); + + bch_verbose(c, "starting stripes_read"); + err = "error reading stripes"; +- ret = bch2_stripes_read(c, &journal_keys); ++ ret = bch2_stripes_read(c, &c->journal_keys); + if (ret) + goto err; + bch_verbose(c, "stripes_read done"); +@@ -926,7 +924,7 @@ int bch2_fs_recovery(struct bch_fs *c) + */ + bch_info(c, "starting metadata mark and sweep"); + err = "error in mark and sweep"; +- ret = bch2_gc(c, &journal_keys, true, true); ++ ret = bch2_gc(c, &c->journal_keys, true, true); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); +@@ -937,7 +935,7 @@ int bch2_fs_recovery(struct bch_fs *c) + test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + bch_info(c, "starting mark and sweep"); + err = "error in mark and sweep"; +- ret = bch2_gc(c, &journal_keys, true, false); ++ ret = bch2_gc(c, &c->journal_keys, true, false); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); +@@ -958,7 +956,7 @@ int bch2_fs_recovery(struct bch_fs *c) + + bch_verbose(c, "starting journal replay"); + err = "journal replay failed"; +- ret = bch2_journal_replay(c, journal_keys); ++ ret = bch2_journal_replay(c, c->journal_keys); + if (ret) + goto err; + bch_verbose(c, "journal replay done"); +@@ -1054,8 +1052,10 @@ fsck_err: + set_bit(BCH_FS_FSCK_DONE, &c->flags); + bch2_flush_fsck_errs(c); + +- journal_keys_free(&journal_keys); +- journal_entries_free(&journal_entries); ++ if (!c->opts.keep_journal) { ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(&c->journal_entries); ++ } + kfree(clean); + if (ret) + bch_err(c, "Error in recovery: %s (%i)", err, ret); +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index fa1f2818817d..19f2f172a26b 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -2,18 +2,6 @@ + #ifndef _BCACHEFS_RECOVERY_H + #define _BCACHEFS_RECOVERY_H + +-struct journal_keys { +- struct journal_key { +- enum btree_id btree_id:8; +- unsigned level:8; +- struct bkey_i *k; +- u32 journal_seq; +- u32 journal_offset; +- } *d; +- size_t nr; +- u64 journal_seq_base; +-}; +- + #define for_each_journal_key(keys, i) \ + for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) + +@@ -56,6 +44,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct journal_keys *, + struct btree *); + ++void bch2_journal_keys_free(struct journal_keys *); ++void bch2_journal_entries_free(struct 
list_head *); ++ + int bch2_fs_recovery(struct bch_fs *); + int bch2_fs_initialize(struct bch_fs *); + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index fae43f3c338c..dcca94f7b0a0 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -461,6 +461,8 @@ static void bch2_fs_free(struct bch_fs *c) + bch2_io_clock_exit(&c->io_clock[WRITE]); + bch2_io_clock_exit(&c->io_clock[READ]); + bch2_fs_compress_exit(c); ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(&c->journal_entries); + percpu_free_rwsem(&c->mark_lock); + kfree(c->usage_scratch); + free_percpu(c->usage[1]); +@@ -649,6 +651,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + INIT_WORK(&c->journal_seq_blacklist_gc_work, + bch2_blacklist_entries_gc); + ++ INIT_LIST_HEAD(&c->journal_entries); ++ + INIT_LIST_HEAD(&c->fsck_errors); + mutex_init(&c->fsck_error_lock); + +-- +cgit v1.2.3 + + +From 2e23de160617ecdd973ea912a5f74380dd0d456a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 25 Mar 2020 16:13:00 -0400 +Subject: bcachefs: Improve error message in fsck + +Seeing the extents that were overlapping is highly useful for figuring +out what went wrong. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 25 +++++++++++++++---------- + 1 file changed, 15 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 936e6366cb04..822541e6adfc 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" ++#include "bkey_on_stack.h" + #include "btree_update.h" + #include "dirent.h" + #include "error.h" +@@ -469,10 +470,12 @@ static int check_extents(struct bch_fs *c) + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- struct bkey prev = KEY(0, 0, 0); ++ struct bkey_on_stack prev; + u64 i_sectors; + int ret = 0; + ++ bkey_on_stack_init(&prev); ++ prev.k->k = KEY(0, 0, 0); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch_verbose(c, "checking extents"); +@@ -482,24 +485,24 @@ static int check_extents(struct bch_fs *c) + BTREE_ITER_INTENT); + retry: + for_each_btree_key_continue(iter, 0, k, ret) { +- if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) { +- char buf1[100]; +- char buf2[100]; ++ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ char buf1[200]; ++ char buf2[200]; + +- bch2_bkey_to_text(&PBUF(buf1), &prev); +- bch2_bkey_to_text(&PBUF(buf2), k.k); ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, k); + +- if (fsck_err(c, "overlapping extents: %s, %s", buf1, buf2)) { ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_fix_overlapping_extent(&trans, +- iter, k, prev.p)); ++ iter, k, prev.k->k.p)); + if (ret) + goto err; + } + } +- prev = *k.k; ++ bkey_on_stack_reassemble(&prev, c, k); + + ret = walk_inode(&trans, &w, k.k->p.inode); + if (ret) +@@ -525,7 +528,8 @@ retry: + !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && + w.inode.bi_sectors != + (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), +- c, "i_sectors wrong: got %llu, should be %llu", ++ c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", ++ w.inode.bi_inum, + w.inode.bi_sectors, i_sectors)) { + struct bkey_inode_buf p; + +@@ -567,6 +571,7 @@ err: + fsck_err: + if (ret == -EINTR) + goto retry; ++ bkey_on_stack_exit(&prev, c); + return 
bch2_trans_exit(&trans) ?: ret;
+ }
+ 
+-- 
+cgit v1.2.3
+
+
+From ee212ee45a9c7d420a0e34b95667387e69aa1d30 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet
+Date: Fri, 27 Mar 2020 17:38:51 -0400
+Subject: bcachefs: Use memalloc_nofs_save()
+
+vmalloc allocations don't always obey GFP_NOFS - memalloc_nofs_save() is
+the preferred approach for the future.
+
+Signed-off-by: Kent Overstreet
+---
+ fs/bcachefs/btree_io.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
+index b48d48b8c27d..bc586031e9e1 100644
+--- a/fs/bcachefs/btree_io.c
++++ b/fs/bcachefs/btree_io.c
+@@ -19,6 +19,7 @@
+ #include "journal_seq_blacklist.h"
+ #include "super-io.h"
+ 
++#include <linux/sched/mm.h>
+ #include 
+ 
+ static void verify_no_dups(struct btree *b,
+@@ -68,17 +69,19 @@ static void btree_bounce_free(struct bch_fs *c, unsigned order,
+ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
+ bool *used_mempool)
+ {
++ unsigned flags = memalloc_nofs_save();
+ void *p;
+ 
+ BUG_ON(order > btree_page_order(c));
+ 
+ *used_mempool = false;
+ p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
+- if (p)
+- return p;
+-
+- *used_mempool = true;
+- return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
++ if (!p) {
++ *used_mempool = true;
++ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
++ }
++ memalloc_nofs_restore(flags);
++ return p;
+ }
+ 
+ static void sort_bkey_ptrs(const struct btree *bt,
+-- 
+cgit v1.2.3
+
+
+From 6b1b454a54917deac35f494573640d60fce633de Mon Sep 17 00:00:00 2001
+From: Kent Overstreet
+Date: Sat, 28 Mar 2020 19:17:23 -0400
+Subject: bcachefs: Various fixes for interior update path
+
+The locking was wrong, and we could get a use after free in the error
+path where we weren't taking the entry being freed off the unwritten
+list. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 35 ++++++++++++++--------------------- + 1 file changed, 14 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 7b00b26ed0f2..36b78c27be29 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -581,7 +581,7 @@ err_free: + + /* Asynchronous interior node update machinery */ + +-static void bch2_btree_update_free(struct btree_update *as) ++static void __bch2_btree_update_free(struct btree_update *as) + { + struct bch_fs *c = as->c; + +@@ -596,28 +596,32 @@ static void bch2_btree_update_free(struct btree_update *as) + if (as->reserve) + bch2_btree_reserve_put(c, as->reserve); + +- mutex_lock(&c->btree_interior_update_lock); + list_del(&as->list); + + closure_debug_destroy(&as->cl); + mempool_free(as, &c->btree_interior_update_pool); + + closure_wake_up(&c->btree_interior_update_wait); +- mutex_unlock(&c->btree_interior_update_lock); + } + +-static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) ++static void bch2_btree_update_free(struct btree_update *as) + { + struct bch_fs *c = as->c; + + mutex_lock(&c->btree_interior_update_lock); ++ __bch2_btree_update_free(as); ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) ++{ ++ struct bch_fs *c = as->c; + + while (as->nr_new_nodes) { + struct btree *b = as->new_nodes[--as->nr_new_nodes]; + + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; +- mutex_unlock(&c->btree_interior_update_lock); + + /* + * b->will_make_reachable prevented it from being written, so +@@ -626,14 +630,11 @@ static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); + six_unlock_read(&b->lock); +- mutex_lock(&c->btree_interior_update_lock); + } + + while (as->nr_pending) + bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending], + seq); +- +- mutex_unlock(&c->btree_interior_update_lock); + } + + static void btree_update_nodes_written(struct closure *cl) +@@ -667,9 +668,12 @@ again: + mutex_unlock(&c->btree_interior_update_lock); + btree_node_lock_type(c, b, SIX_LOCK_intent); + six_unlock_intent(&b->lock); +- goto out; ++ mutex_lock(&c->btree_interior_update_lock); ++ goto again; + } + ++ list_del(&as->unwritten_list); ++ + journal_u64s = 0; + + if (as->mode != BTREE_INTERIOR_UPDATING_ROOT) +@@ -710,9 +714,6 @@ again: + bch2_btree_add_journal_pin(c, b, res.seq); + six_unlock_write(&b->lock); + +- list_del(&as->unwritten_list); +- mutex_unlock(&c->btree_interior_update_lock); +- + /* + * b->write_blocked prevented it from being written, so + * write it now if it needs to be written: +@@ -723,9 +724,6 @@ again: + + case BTREE_INTERIOR_UPDATING_AS: + BUG_ON(b); +- +- list_del(&as->unwritten_list); +- mutex_unlock(&c->btree_interior_update_lock); + break; + + case BTREE_INTERIOR_UPDATING_ROOT: { +@@ -739,9 +737,6 @@ again: + r->alive = true; + c->btree_roots_dirty = true; + mutex_unlock(&c->btree_root_lock); +- +- list_del(&as->unwritten_list); +- mutex_unlock(&c->btree_interior_update_lock); + break; + } + } +@@ -753,14 +748,12 @@ again: + + btree_update_nodes_reachable(as, res.seq); + free_update: +- bch2_btree_update_free(as); ++ __bch2_btree_update_free(as); + /* + * for flush_held_btree_writes() waiting on updates to 
flush or + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); +-out: +- mutex_lock(&c->btree_interior_update_lock); + goto again; + } + +-- +cgit v1.2.3 + + +From 6cd4b5fcf220e494dddeee98ef886b5b6799f198 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 28 Mar 2020 18:26:01 -0400 +Subject: bcachefs: Read journal when keep_journal on + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 5f27b8402d71..8cfae639e23f 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -820,7 +820,7 @@ int bch2_fs_recovery(struct bch_fs *c) + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + +- if (!c->sb.clean || c->opts.fsck) { ++ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + struct jset *j; + + ret = bch2_journal_read(c, &c->journal_entries); +-- +cgit v1.2.3 + + +From 704ac47efb6ca31086baab02e40cb1903693d90a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 29 Mar 2020 12:33:41 -0400 +Subject: bcachefs: Use kvpmalloc mempools for compression bounce + +This fixes an issue where mounting would fail because of memory +fragmentation - previously the compression bounce buffers were using +get_free_pages(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/compress.c | 25 +++++-------------------- + 1 file changed, 5 insertions(+), 20 deletions(-) + +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 81c69c1554f4..62e560a83a61 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -17,7 +17,6 @@ struct bbuf { + BB_NONE, + BB_VMAP, + BB_KMALLOC, +- BB_VMALLOC, + BB_MEMPOOL, + } type; + int rw; +@@ -33,17 +32,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) + if (b) + return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; + +- b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT); +- b = b ? page_address(b) : NULL; +- if (b) +- return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; +- +- b = vmalloc(size); +- if (b) +- return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw }; +- + b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); +- b = b ? 
page_address(b) : NULL;
+ if (b)
+ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
+ 
+@@ -129,12 +118,8 @@ static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
+ case BB_KMALLOC:
+ kfree(buf.b);
+ break;
+- case BB_VMALLOC:
+- vfree(buf.b);
+- break;
+ case BB_MEMPOOL:
+- mempool_free(virt_to_page(buf.b),
+- &c->compression_bounce[buf.rw]);
++ mempool_free(buf.b, &c->compression_bounce[buf.rw]);
+ break;
+ }
+ }
+@@ -561,15 +546,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
+ have_compressed:
+ 
+ if (!mempool_initialized(&c->compression_bounce[READ])) {
+- ret = mempool_init_page_pool(&c->compression_bounce[READ],
+- 1, order);
++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
++ 1, order);
+ if (ret)
+ goto out;
+ }
+ 
+ if (!mempool_initialized(&c->compression_bounce[WRITE])) {
+- ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
+- 1, order);
++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
++ 1, order);
+ if (ret)
+ goto out;
+ }
+-- 
+cgit v1.2.3
+
+
+From ca303b03865e3073bfed6aa87df74320eaffdc54 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet
+Date: Sun, 29 Mar 2020 14:21:44 -0400
+Subject: bcachefs: Switch a BUG_ON() to a warning
+
+This has popped and thus needs to be debugged, but the assertion firing
+isn't necessarily fatal so switch it to a warning.
+
+Signed-off-by: Kent Overstreet
+---
+ fs/bcachefs/journal.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
+index 9f03a479c9a2..0a4538b3dc60 100644
+--- a/fs/bcachefs/journal.c
++++ b/fs/bcachefs/journal.c
+@@ -376,7 +376,8 @@ unlock:
+ goto retry;
+ 
+ if (ret == -ENOSPC) {
+- BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED));
++ WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
++ "JOURNAL_RES_GET_RESERVED set but journal full");
+ 
+ /*
+ * Journal is full - can't rely on reclaim from work item due to
+-- 
+cgit v1.2.3
+
+
+From 8b305917f026472a514492da5e02874b2e90f6c6 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet
+Date: Tue, 7 Jan 2020 13:29:32 -0500
+Subject: bcachefs: Kill bkey_type_successor
+
+Previously, BTREE_ID_INODES was special - inodes were indexed by the
+inode field, which meant the offset field of struct bpos wasn't used,
+which led to special cases in e.g. the btree iterator code.
+
+Now, inodes in the inodes btree are indexed by the offset field.
+
+Also: previously min_key was special for extents btrees, min_key for
+extents would equal max_key for the previous node. Now, min_key =
+bkey_successor() of the previous node, same as non extent btrees.
+
+This means we can completely get rid of
+btree_type_successor/predecessor.
+
+Also make some improvements to the metadata IO validate/compat code. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 3 +- + fs/bcachefs/bkey_methods.c | 56 ++++++++++++++++ + fs/bcachefs/bkey_methods.h | 19 ++++++ + fs/bcachefs/btree_cache.c | 3 +- + fs/bcachefs/btree_gc.c | 9 +-- + fs/bcachefs/btree_io.c | 129 ++++++++++++++++++++++-------------- + fs/bcachefs/btree_io.h | 47 +++++++++++++ + fs/bcachefs/btree_iter.c | 35 ++++++---- + fs/bcachefs/btree_iter.h | 26 -------- + fs/bcachefs/btree_update_interior.c | 2 +- + fs/bcachefs/btree_update_leaf.c | 7 +- + fs/bcachefs/extent_update.c | 4 +- + fs/bcachefs/extents.c | 17 +++++ + fs/bcachefs/extents.h | 3 + + fs/bcachefs/fsck.c | 6 +- + fs/bcachefs/inode.c | 32 ++++----- + fs/bcachefs/journal_io.c | 39 ++++++----- + 17 files changed, 295 insertions(+), 142 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index a78988e3ded7..616863ef77d4 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1156,7 +1156,8 @@ enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, + bcachefs_metadata_version_new_versioning = 10, + bcachefs_metadata_version_bkey_renumber = 10, +- bcachefs_metadata_version_max = 11, ++ bcachefs_metadata_version_inode_btree_change = 11, ++ bcachefs_metadata_version_max = 12, + }; + + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 0aa3d3b9a281..c97e1e9002cb 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -273,3 +273,59 @@ void bch2_bkey_renumber(enum btree_node_type btree_node_type, + break; + } + } ++ ++void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ const struct bkey_ops *ops; ++ struct bkey uk; ++ struct bkey_s u; ++ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_key(f, k); ++ ++ if (version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) { ++ if (!bkey_packed(k)) { ++ struct bkey_i *u = packed_to_bkey(k); ++ swap(u->k.p.inode, u->k.p.offset); ++ } else if (f->bits_per_field[BKEY_FIELD_INODE] && ++ f->bits_per_field[BKEY_FIELD_OFFSET]) { ++ struct bkey_format tmp = *f, *in = f, *out = &tmp; ++ ++ swap(tmp.bits_per_field[BKEY_FIELD_INODE], ++ tmp.bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(tmp.field_offset[BKEY_FIELD_INODE], ++ tmp.field_offset[BKEY_FIELD_OFFSET]); ++ ++ if (!write) ++ swap(in, out); ++ ++ uk = __bch2_bkey_unpack_key(in, k); ++ swap(uk.p.inode, uk.p.offset); ++ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); ++ } ++ } ++ ++ if (!bkey_packed(k)) { ++ u = bkey_i_to_s(packed_to_bkey(k)); ++ } else { ++ uk = __bch2_bkey_unpack_key(f, k); ++ u.k = &uk; ++ u.v = bkeyp_val(f, k); ++ } ++ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_val(u); ++ ++ ops = &bch2_bkey_ops[k->type]; ++ ++ if (ops->compat) ++ ops->compat(btree_id, version, big_endian, write, u); ++} +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +index d36468b75223..0bca725ae3b8 100644 +--- a/fs/bcachefs/bkey_methods.h ++++ b/fs/bcachefs/bkey_methods.h +@@ -33,6 +33,9 @@ struct bkey_ops { + bool (*key_normalize)(struct bch_fs *, struct bkey_s); + enum merge_result (*key_merge)(struct bch_fs *, + struct bkey_s, struct bkey_s); ++ void (*compat)(enum 
btree_id id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s); + }; + + const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); +@@ -60,4 +63,20 @@ enum merge_result bch2_bkey_merge(struct bch_fs *, + + void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); + ++void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, ++ int, struct bkey_format *, struct bkey_packed *); ++ ++static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ if (version < bcachefs_metadata_version_current || ++ big_endian != CPU_BIG_ENDIAN) ++ __bch2_bkey_compat(level, btree_id, version, ++ big_endian, write, f, k); ++ ++} ++ + #endif /* _BCACHEFS_BKEY_METHODS_H */ +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index a2fa92f5bc8a..bc59f0363a21 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -923,8 +923,7 @@ out: + if (sib != btree_prev_sib) + swap(n1, n2); + +- BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id, +- n1->key.k.p), ++ BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), + n2->data->min_key)); + } + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 8138df6c9d14..64c99630b9d2 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -72,7 +72,7 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, + struct range_level *l = &r->l[b->level]; + + struct bpos expected_min = bkey_cmp(l->min, l->max) +- ? btree_type_successor(b->btree_id, l->max) ++ ? bkey_successor(l->max) + : l->max; + + bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, +@@ -103,8 +103,7 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, + + if (bkey_cmp(b->data->max_key, POS_MAX)) + l->min = l->max = +- btree_type_successor(b->btree_id, +- b->data->max_key); ++ bkey_successor(b->data->max_key); + } + } + +@@ -985,9 +984,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + n1->key.k.p = n1->data->max_key = + bkey_unpack_pos(n1, last); + +- n2->data->min_key = +- btree_type_successor(iter->btree_id, +- n1->data->max_key); ++ n2->data->min_key = bkey_successor(n1->data->max_key); + + memcpy_u64s(vstruct_last(s1), + s2->start, u64s); +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index bc586031e9e1..ac8b98861aae 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -709,83 +709,107 @@ out: \ + + static int validate_bset(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned sectors, +- unsigned *whiteout_u64s, int write, +- bool have_retry) ++ int write, bool have_retry) + { +- struct bkey_packed *k, *prev = NULL; +- bool seen_non_whiteout = false; +- unsigned version; ++ unsigned version = le16_to_cpu(i->version); + const char *err; + int ret = 0; + ++ btree_err_on((version != BCH_BSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, ++ BTREE_ERR_FATAL, c, b, i, ++ "unsupported bset version"); ++ ++ if (btree_err_on(b->written + sectors > c->opts.btree_node_size, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "bset past end of btree node")) { ++ i->u64s = 0; ++ return 0; ++ } ++ ++ btree_err_on(b->written && !i->u64s, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "empty bset"); ++ + if (!b->written) { ++ struct btree_node *bn = ++ container_of(i, struct btree_node, keys); + /* These indicate that we read the wrong 
btree node: */ +- btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id, ++ btree_err_on(BTREE_NODE_ID(bn) != b->btree_id, + BTREE_ERR_MUST_RETRY, c, b, i, + "incorrect btree id"); + +- btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level, ++ btree_err_on(BTREE_NODE_LEVEL(bn) != b->level, + BTREE_ERR_MUST_RETRY, c, b, i, + "incorrect level"); + + if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { +- u64 *p = (u64 *) &b->data->ptr; ++ u64 *p = (u64 *) &bn->ptr; + + *p = swab64(*p); +- bch2_bpos_swab(&b->data->min_key); +- bch2_bpos_swab(&b->data->max_key); + } + ++ if (!write) ++ compat_btree_node(b->level, b->btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, b, NULL, +- "incorrect min_key"); ++ "incorrect min_key: got %llu:%llu should be %llu:%llu", ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ bp->min_key.inode, ++ bp->min_key.offset); + } + +- btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p), ++ btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, b, i, + "incorrect max key"); + ++ if (write) ++ compat_btree_node(b->level, b->btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ + /* XXX: ideally we would be validating min_key too */ + #if 0 + /* + * not correct anymore, due to btree node write error + * handling + * +- * need to add b->data->seq to btree keys and verify ++ * need to add bn->seq to btree keys and verify + * against that + */ + btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), +- b->data->ptr), ++ bn->ptr), + BTREE_ERR_FATAL, c, b, i, + "incorrect backpointer"); + #endif +- err = bch2_bkey_format_validate(&b->data->format); ++ err = bch2_bkey_format_validate(&bn->format); + btree_err_on(err, + BTREE_ERR_FATAL, c, b, i, + "invalid bkey format: %s", err); +- } +- +- version = le16_to_cpu(i->version); +- btree_err_on((version != BCH_BSET_VERSION_OLD && +- version < bcachefs_metadata_version_min) || +- version >= bcachefs_metadata_version_max, +- BTREE_ERR_FATAL, c, b, i, +- "unsupported bset version"); + +- if (btree_err_on(b->written + sectors > c->opts.btree_node_size, +- BTREE_ERR_FIXABLE, c, b, i, +- "bset past end of btree node")) { +- i->u64s = 0; +- return 0; ++ compat_bformat(b->level, b->btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &bn->format); + } ++fsck_err: ++ return ret; ++} + +- btree_err_on(b->written && !i->u64s, +- BTREE_ERR_FIXABLE, c, b, i, +- "empty bset"); ++static int validate_bset_keys(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned *whiteout_u64s, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ struct bkey_packed *k, *prev = NULL; ++ bool seen_non_whiteout = false; ++ int ret = 0; + + if (!BSET_SEPARATE_WHITEOUTS(i)) { + seen_non_whiteout = true; +@@ -814,18 +838,14 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + continue; + } + +- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) +- bch2_bkey_swab_key(&b->format, k); +- +- if (!write && +- version < bcachefs_metadata_version_bkey_renumber) +- bch2_bkey_renumber(btree_node_type(b), k, write); ++ /* XXX: validate k->u64s */ ++ if (!write) ++ bch2_bkey_compat(b->level, b->btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); + + u = __bkey_disassemble(b, k, &tmp); + +- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) +- bch2_bkey_swab_val(u); +- + invalid = __bch2_bkey_invalid(c, 
u.s_c, btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, u.s_c) ?: + (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); +@@ -842,9 +862,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + continue; + } + +- if (write && +- version < bcachefs_metadata_version_bkey_renumber) +- bch2_bkey_renumber(btree_node_type(b), k, write); ++ if (write) ++ bch2_bkey_compat(b->level, b->btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); + + /* + * with the separate whiteouts thing (used for extents), the +@@ -875,8 +896,6 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + prev = k; + k = bkey_next_skip_noops(k, vstruct_last(i)); + } +- +- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + fsck_err: + return ret; + } +@@ -944,8 +963,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + set_btree_node_old_extent_overwrite(b); + + sectors = vstruct_sectors(b->data, c->block_bits); +- +- btree_node_set_format(b, b->data->format); + } else { + bne = write_block(b); + i = &bne->keys; +@@ -969,11 +986,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + sectors = vstruct_sectors(bne, c->block_bits); + } + +- ret = validate_bset(c, b, i, sectors, &whiteout_u64s, ++ ret = validate_bset(c, b, i, sectors, + READ, have_retry); + if (ret) + goto fsck_err; + ++ if (!b->written) ++ btree_node_set_format(b, b->data->format); ++ ++ ret = validate_bset_keys(c, b, i, &whiteout_u64s, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ + b->written += sectors; + + blacklisted = bch2_journal_seq_is_blacklisted(c, +@@ -1416,7 +1443,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) + return -1; + +- ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false); ++ ret = validate_bset(c, b, i, sectors, WRITE, false) ?: ++ validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); + if (ret) + bch2_inconsistent_error(c); + +@@ -1566,8 +1594,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + validate_before_checksum = true; + + /* validate_bset will be modifying: */ +- if (le16_to_cpu(i->version) < +- bcachefs_metadata_version_bkey_renumber) ++ if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) + validate_before_checksum = true; + + /* if we're going to be encrypting, check metadata validity first: */ +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index fd719dda7d91..1f16394fd5c3 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -2,6 +2,7 @@ + #ifndef _BCACHEFS_BTREE_IO_H + #define _BCACHEFS_BTREE_IO_H + ++#include "bkey_methods.h" + #include "bset.h" + #include "btree_locking.h" + #include "extents.h" +@@ -140,4 +141,50 @@ void bch2_btree_flush_all_writes(struct bch_fs *); + void bch2_btree_verify_flushed(struct bch_fs *); + ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); + ++static inline void compat_bformat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bkey_format *f) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) { ++ swap(f->bits_per_field[BKEY_FIELD_INODE], ++ f->bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(f->field_offset[BKEY_FIELD_INODE], ++ f->field_offset[BKEY_FIELD_OFFSET]); ++ } ++} ++ ++static inline void compat_bpos(unsigned level, enum btree_id btree_id, ++ unsigned version, 
unsigned big_endian, ++ int write, struct bpos *p) ++{ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bpos_swab(p); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) ++ swap(p->inode, p->offset); ++} ++ ++static inline void compat_btree_node(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct btree_node *bn) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bn->min_key, POS_MIN) && ++ write) ++ bn->min_key = bkey_predecessor(bn->min_key); ++ ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bn->min_key, POS_MIN) && ++ !write) ++ bn->min_key = bkey_successor(bn->min_key); ++} ++ + #endif /* _BCACHEFS_BTREE_IO_H */ +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 6ed688cdcfde..7345fec8a98f 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -39,7 +39,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) + static inline bool btree_iter_pos_before_node(struct btree_iter *iter, + struct btree *b) + { +- return bkey_cmp(iter->pos, b->data->min_key) < 0; ++ return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; + } + + static inline bool btree_iter_pos_after_node(struct btree_iter *iter, +@@ -1284,10 +1284,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + if (btree_node_read_locked(iter, iter->level)) + btree_node_unlock(iter, iter->level); + +- /* ick: */ +- iter->pos = iter->btree_id == BTREE_ID_INODES +- ? btree_type_successor(iter->btree_id, iter->pos) +- : bkey_successor(iter->pos); ++ iter->pos = bkey_successor(iter->pos); + iter->level = iter->min_depth; + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +@@ -1395,8 +1392,8 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) + iter->k.p = iter->pos = l->b->key.k.p; + + ret = bkey_cmp(iter->pos, POS_MAX) != 0; +- if (ret) +- iter->k.p = iter->pos = btree_type_successor(iter->btree_id, iter->pos); ++ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ iter->k.p = iter->pos = bkey_successor(iter->pos); + + btree_iter_pos_changed(iter, 1); + return ret; +@@ -1412,8 +1409,12 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + ret = bkey_cmp(iter->pos, POS_MIN) != 0; +- if (ret) +- iter->k.p = iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); ++ if (ret) { ++ iter->k.p = iter->pos = bkey_predecessor(iter->pos); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ iter->k.p = iter->pos = bkey_predecessor(iter->pos); ++ } + + btree_iter_pos_changed(iter, -1); + return ret; +@@ -1500,7 +1501,9 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, +- btree_type_successor(iter->btree_id, iter->k.p)); ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? 
iter->k.p ++ : bkey_successor(iter->k.p)); + + return bch2_btree_iter_peek(iter); + } +@@ -1553,7 +1556,9 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + + if (k.k && bkey_deleted(k.k)) { + bch2_btree_iter_set_pos(iter, +- btree_type_successor(iter->btree_id, iter->k.p)); ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); + continue; + } + +@@ -1582,7 +1587,9 @@ struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, +- btree_type_successor(iter->btree_id, iter->k.p)); ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); + + return bch2_btree_iter_peek_with_updates(iter); + } +@@ -1749,7 +1756,9 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, +- btree_type_successor(iter->btree_id, iter->k.p)); ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); + + return bch2_btree_iter_peek_slot(iter); + } +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 6f51ef35db75..1a3672a23b86 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -172,32 +172,6 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); + void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); + void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + +-static inline struct bpos btree_type_successor(enum btree_id id, +- struct bpos pos) +-{ +- if (id == BTREE_ID_INODES) { +- pos.inode++; +- pos.offset = 0; +- } else if (!btree_node_type_is_extents(id)) { +- pos = bkey_successor(pos); +- } +- +- return pos; +-} +- +-static inline struct bpos btree_type_predecessor(enum btree_id id, +- struct bpos pos) +-{ +- if (id == BTREE_ID_INODES) { +- --pos.inode; +- pos.offset = 0; +- } else { +- pos = bkey_predecessor(pos); +- } +- +- return pos; +-} +- + static inline int __btree_iter_cmp(enum btree_id id, + struct bpos pos, + const struct btree_iter *r) +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 36b78c27be29..51fa558a4f0b 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1193,7 +1193,7 @@ static struct btree *__btree_split_node(struct btree_update *as, + BUG_ON(!prev); + + btree_set_max(n1, bkey_unpack_pos(n1, prev)); +- btree_set_min(n2, btree_type_successor(n1->btree_id, n1->key.k.p)); ++ btree_set_min(n2, bkey_successor(n1->key.k.p)); + + set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); + set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f94bc6a0b699..da2b93b58eed 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -58,8 +58,11 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); +- EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || +- bkey_cmp(insert->k.p, b->data->max_key) > 0); ++ EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && ++ bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_predecessor(b->data->min_key)) < 0); ++ EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); ++ EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 
0); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(iter->trans->c, b)); + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 8e5070d5a39b..2a7d913bdda3 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -115,7 +115,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, + b = iter->l[0].b; + node_iter = iter->l[0].iter; + +- BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); ++ BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && ++ bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_predecessor(b->data->min_key)) < 0); + + *end = bpos_min(insert->k.p, b->key.k.p); + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index cb88dd15a86c..792c9c1e50b1 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -9,6 +9,7 @@ + #include "bcachefs.h" + #include "bkey_methods.h" + #include "btree_gc.h" ++#include "btree_io.h" + #include "btree_iter.h" + #include "buckets.h" + #include "checksum.h" +@@ -214,6 +215,22 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + bch2_bkey_ptrs_to_text(out, c, k); + } + ++void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s k) ++{ ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); ++ ++ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bp.v->min_key, POS_MIN)) ++ bp.v->min_key = write ++ ? bkey_predecessor(bp.v->min_key) ++ : bkey_successor(bp.v->min_key); ++} ++ + /* KEY_TYPE_extent: */ + + const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 70b7d70269dc..8ff2eac3ee2b 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -371,6 +371,8 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); ++void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, ++ int, struct bkey_s); + + #define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ +@@ -384,6 +386,7 @@ void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + .key_debugcheck = bch2_btree_ptr_debugcheck, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ ++ .compat = bch2_btree_ptr_v2_compat, \ + } + + /* KEY_TYPE_extent: */ +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 822541e6adfc..c7508e81188c 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1038,12 +1038,12 @@ retry: + if (!ret) + continue; + +- if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, ++ if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, + "unreachable directory found (inum %llu)", +- k.k->p.inode)) { ++ k.k->p.offset)) { + bch2_trans_unlock(&trans); + +- ret = reattach_inode(c, lostfound_inode, k.k->p.inode); ++ ret = reattach_inode(c, lostfound_inode, k.k->p.offset); + if (ret) { + goto err; + } +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 26171ff754a6..7d20f082ad45 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -98,7 +98,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, + unsigned bytes; + + 
bkey_inode_init(&packed->inode.k_i); +- packed->inode.k.p.inode = inode->bi_inum; ++ packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); + packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); +@@ -149,7 +149,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, + unsigned fieldnr = 0, field_bits; + int ret; + +- unpacked->bi_inum = inode.k->p.inode; ++ unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); +@@ -188,7 +188,7 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0), ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), + BTREE_ITER_SLOTS|flags); + if (IS_ERR(iter)) + return iter; +@@ -232,13 +232,13 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bch_inode_unpacked unpacked; + +- if (k.k->p.offset) +- return "nonzero offset"; ++ if (k.k->p.inode) ++ return "nonzero k.p.inode"; + + if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) + return "incorrect value size"; + +- if (k.k->p.inode < BLOCKDEV_INODE_MAX) ++ if (k.k->p.offset < BLOCKDEV_INODE_MAX) + return "fs inode in blockdev range"; + + if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) +@@ -280,8 +280,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, + const char *bch2_inode_generation_invalid(const struct bch_fs *c, + struct bkey_s_c k) + { +- if (k.k->p.offset) +- return "nonzero offset"; ++ if (k.k->p.inode) ++ return "nonzero k.p.inode"; + + if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) + return "incorrect value size"; +@@ -383,9 +383,9 @@ int bch2_inode_create(struct btree_trans *trans, + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + again: +- for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(start, 0), ++ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { +- if (iter->pos.inode > max) ++ if (bkey_cmp(iter->pos, POS(0, max)) > 0) + break; + + if (k.k->type != KEY_TYPE_inode) +@@ -405,8 +405,8 @@ again: + + return -ENOSPC; + found_slot: +- *hint = k.k->p.inode; +- inode_u->bi_inum = k.k->p.inode; ++ *hint = k.k->p.offset; ++ inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); + + bch2_inode_pack(inode_p, inode_u); +@@ -443,7 +443,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(inode_nr, 0), ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + do { + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); +@@ -475,10 +475,10 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) + + if (!bi_generation) { + bkey_init(&delete.k); +- delete.k.p.inode = inode_nr; ++ delete.k.p.offset = inode_nr; + } else { + bkey_inode_generation_init(&delete.k_i); +- delete.k.p.inode = inode_nr; ++ delete.k.p.offset = inode_nr; + delete.v.bi_generation = cpu_to_le32(bi_generation); + } + +@@ -500,7 +500,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, +- POS(inode_nr, 0), BTREE_ITER_SLOTS); ++ 
POS(0, inode_nr), BTREE_ITER_SLOTS); + if (IS_ERR(iter)) + return PTR_ERR(iter); + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 0974805c8923..39bb2154cce1 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + #include "bcachefs.h" + #include "alloc_foreground.h" ++#include "btree_io.h" + #include "buckets.h" + #include "checksum.h" + #include "error.h" +@@ -138,7 +139,8 @@ static void journal_entry_null_range(void *start, void *end) + + static int journal_validate_key(struct bch_fs *c, struct jset *jset, + struct jset_entry *entry, +- struct bkey_i *k, enum btree_node_type key_type, ++ unsigned level, enum btree_id btree_id, ++ struct bkey_i *k, + const char *type, int write) + { + void *next = vstruct_next(entry); +@@ -171,16 +173,13 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + return 0; + } + +- if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) { +- bch2_bkey_swab_key(NULL, bkey_to_packed(k)); +- bch2_bkey_swab_val(bkey_i_to_s(k)); +- } +- +- if (!write && +- version < bcachefs_metadata_version_bkey_renumber) +- bch2_bkey_renumber(key_type, bkey_to_packed(k), write); ++ if (!write) ++ bch2_bkey_compat(level, btree_id, version, ++ JSET_BIG_ENDIAN(jset), write, ++ NULL, bkey_to_packed(k)); + +- invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type); ++ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), ++ __btree_node_type(level, btree_id)); + if (invalid) { + char buf[160]; + +@@ -194,9 +193,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + return 0; + } + +- if (write && +- version < bcachefs_metadata_version_bkey_renumber) +- bch2_bkey_renumber(key_type, bkey_to_packed(k), write); ++ if (write) ++ bch2_bkey_compat(level, btree_id, version, ++ JSET_BIG_ENDIAN(jset), write, ++ NULL, bkey_to_packed(k)); + fsck_err: + return ret; + } +@@ -209,10 +209,10 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, + struct bkey_i *k; + + vstruct_for_each(entry, k) { +- int ret = journal_validate_key(c, jset, entry, k, +- __btree_node_type(entry->level, +- entry->btree_id), +- "key", write); ++ int ret = journal_validate_key(c, jset, entry, ++ entry->level, ++ entry->btree_id, ++ k, "key", write); + if (ret) + return ret; + } +@@ -242,7 +242,7 @@ static int journal_entry_validate_btree_root(struct bch_fs *c, + return 0; + } + +- return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE, ++ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, + "btree root", write); + fsck_err: + return ret; +@@ -1018,8 +1018,7 @@ void bch2_journal_write(struct closure *cl) + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) + validate_before_checksum = true; + +- if (le32_to_cpu(jset->version) < +- bcachefs_metadata_version_bkey_renumber) ++ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) + validate_before_checksum = true; + + if (validate_before_checksum && +-- +cgit v1.2.3 + + +From 87041e61b757bf7f608fb0691d611910eba90cb1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 29 Mar 2020 17:01:05 -0400 +Subject: bcachefs: Reduce max nr of btree iters when lockdep is on + +This is so we don't overflow MAX_LOCK_DEPTH. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 31a5c215ca34..e2649503cc9b 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -272,7 +272,11 @@ struct btree_insert_entry { + struct btree_iter *iter; + }; + ++#ifndef CONFIG_LOCKDEP + #define BTREE_ITER_MAX 64 ++#else ++#define BTREE_ITER_MAX 32 ++#endif + + struct btree_trans { + struct bch_fs *c; +-- +cgit v1.2.3 + + +From a8fdd5fc0cf6bc689d90750f99fc1ae50176dcea Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Mar 2020 12:33:30 -0400 +Subject: bcachefs: Don't allocate memory while holding journal reservation + +This fixes a lockdep splat - allocating memory can call +bch2_clear_page_bits() which takes mark_lock. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 17 ++++++++++------- + 1 file changed, 10 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 51fa558a4f0b..018171b522af 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -713,13 +713,6 @@ again: + + bch2_btree_add_journal_pin(c, b, res.seq); + six_unlock_write(&b->lock); +- +- /* +- * b->write_blocked prevented it from being written, so +- * write it now if it needs to be written: +- */ +- btree_node_write_if_need(c, b, SIX_LOCK_intent); +- six_unlock_intent(&b->lock); + break; + + case BTREE_INTERIOR_UPDATING_AS: +@@ -746,6 +739,16 @@ again: + bch2_journal_res_put(&c->journal, &res); + bch2_journal_preres_put(&c->journal, &as->journal_preres); + ++ /* Do btree write after dropping journal res: */ ++ if (b) { ++ /* ++ * b->write_blocked prevented it from being written, so ++ * write it now if it needs to be written: ++ */ ++ btree_node_write_if_need(c, b, SIX_LOCK_intent); ++ six_unlock_intent(&b->lock); ++ } ++ + btree_update_nodes_reachable(as, res.seq); + free_update: + __bch2_btree_update_free(as); +-- +cgit v1.2.3 + + +From f3141b39da3eaa7dd0a494c2c460f1e6ae281b07 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 29 Mar 2020 16:48:53 -0400 +Subject: bcachefs: Check btree topology at startup + +When initial btree gc was changed to overlay journal keys as it walks +the btree, it also stopped checking btree topology. + +Previously, checking btree topology was a fairly complicated affair - +but it's much easier now that btree_ptr_v2 has min_key in the pointer. + +This rewrites the old range_checks code and uses it in both runtime and +initial gc. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 162 +++++++++++++++++++++++++++---------------------- + 1 file changed, 91 insertions(+), 71 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 64c99630b9d2..5ca6851937de 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -47,64 +47,42 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) + __gc_pos_set(c, new_pos); + } + +-/* range_checks - for validating min/max pos of each btree node: */ +- +-struct range_checks { +- struct range_level { +- struct bpos min; +- struct bpos max; +- } l[BTREE_MAX_DEPTH]; +- unsigned depth; +-}; +- +-static void btree_node_range_checks_init(struct range_checks *r, unsigned depth) ++static int bch2_gc_check_topology(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bpos *expected_start, ++ struct bpos expected_end, ++ bool is_last) + { +- unsigned i; +- +- for (i = 0; i < BTREE_MAX_DEPTH; i++) +- r->l[i].min = r->l[i].max = POS_MIN; +- r->depth = depth; +-} +- +-static void btree_node_range_checks(struct bch_fs *c, struct btree *b, +- struct range_checks *r) +-{ +- struct range_level *l = &r->l[b->level]; +- +- struct bpos expected_min = bkey_cmp(l->min, l->max) +- ? bkey_successor(l->max) +- : l->max; +- +- bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, +- "btree node has incorrect min key: %llu:%llu != %llu:%llu", +- b->data->min_key.inode, +- b->data->min_key.offset, +- expected_min.inode, +- expected_min.offset); +- +- l->max = b->data->max_key; +- +- if (b->level > r->depth) { +- l = &r->l[b->level - 1]; ++ int ret = 0; + +- bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c, +- "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu", +- b->data->min_key.inode, +- b->data->min_key.offset, +- l->min.inode, +- l->min.offset); ++ if (k.k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + +- bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c, +- "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu", +- b->data->max_key.inode, +- b->data->max_key.offset, +- l->max.inode, +- l->max.offset); ++ if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, ++ "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", ++ bp.v->min_key.inode, ++ bp.v->min_key.offset, ++ expected_start->inode, ++ expected_start->offset)) { ++ BUG(); ++ } ++ } + +- if (bkey_cmp(b->data->max_key, POS_MAX)) +- l->min = l->max = +- bkey_successor(b->data->max_key); ++ *expected_start = bkey_cmp(k.k->p, POS_MAX) ++ ? 
bkey_successor(k.k->p) ++ : k.k->p; ++ ++ if (fsck_err_on(is_last && ++ bkey_cmp(k.k->p, expected_end), c, ++ "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", ++ k.k->p.inode, ++ k.k->p.offset, ++ expected_end.inode, ++ expected_end.offset)) { ++ BUG(); + } ++fsck_err: ++ return ret; + } + + /* marking of btree keys/nodes: */ +@@ -186,6 +164,7 @@ fsck_err: + static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + bool initial) + { ++ struct bpos next_node_start = b->data->min_key; + struct btree_node_iter iter; + struct bkey unpacked; + struct bkey_s_c k; +@@ -196,13 +175,25 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + if (!btree_node_type_needs_gc(btree_node_type(b))) + return 0; + +- for_each_btree_node_key_unpack(b, k, &iter, +- &unpacked) { ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ ++ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { + bch2_bkey_debugcheck(c, b, k); + + ret = bch2_gc_mark_key(c, k, max_stale, initial); + if (ret) + break; ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (b->level) { ++ ret = bch2_gc_check_topology(c, k, ++ &next_node_start, ++ b->data->max_key, ++ bch2_btree_node_iter_end(&iter)); ++ if (ret) ++ break; ++ } + } + + return ret; +@@ -214,7 +205,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + struct btree_trans trans; + struct btree_iter *iter; + struct btree *b; +- struct range_checks r; + unsigned depth = metadata_only ? 1 + : expensive_debug_checks(c) ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 +@@ -226,12 +216,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); + +- btree_node_range_checks_init(&r, depth); +- + __for_each_btree_node(&trans, iter, btree_id, POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b) { +- btree_node_range_checks(c, b, &r); +- + bch2_verify_btree_nr_keys(b); + + gc_pos_set(c, gc_pos_btree_node(b)); +@@ -273,11 +259,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + } + + static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, +- struct journal_keys *journal_keys, +- unsigned target_depth) ++ struct journal_keys *journal_keys, ++ unsigned target_depth) + { + struct btree_and_journal_iter iter; + struct bkey_s_c k; ++ struct bpos next_node_start = b->data->min_key; + u8 max_stale = 0; + int ret = 0; + +@@ -286,28 +273,46 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_debugcheck(c, b, k); + ++ BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); ++ + ret = bch2_gc_mark_key(c, k, &max_stale, true); + if (ret) + break; + +- if (b->level > target_depth) { ++ if (b->level) { + struct btree *child; + BKEY_PADDED(k) tmp; + + bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ bch2_btree_and_journal_iter_advance(&iter); + +- child = bch2_btree_node_get_noiter(c, &tmp.k, +- b->btree_id, b->level - 1); +- ret = PTR_ERR_OR_ZERO(child); ++ ret = bch2_gc_check_topology(c, k, ++ &next_node_start, ++ b->data->max_key, ++ !bch2_btree_and_journal_iter_peek(&iter).k); + if (ret) + break; + +- bch2_gc_btree_init_recurse(c, child, +- journal_keys, target_depth); +- six_unlock_read(&child->lock); +- } ++ if (b->level > target_depth) { ++ child = bch2_btree_node_get_noiter(c, &tmp.k, ++ b->btree_id, b->level - 1); ++ ret = 
PTR_ERR_OR_ZERO(child); ++ if (ret) ++ break; + +- bch2_btree_and_journal_iter_advance(&iter); ++ ret = bch2_gc_btree_init_recurse(c, child, ++ journal_keys, target_depth); ++ six_unlock_read(&child->lock); ++ ++ if (ret) ++ break; ++ } ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } + } + + return ret; +@@ -332,6 +337,20 @@ static int bch2_gc_btree_init(struct bch_fs *c, + return 0; + + six_lock_read(&b->lock, NULL, NULL); ++ if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, ++ "btree root with incorrect min_key: %llu:%llu", ++ b->data->min_key.inode, ++ b->data->min_key.offset)) { ++ BUG(); ++ } ++ ++ if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, ++ "btree root with incorrect min_key: %llu:%llu", ++ b->data->max_key.inode, ++ b->data->max_key.offset)) { ++ BUG(); ++ } ++ + if (b->level >= target_depth) + ret = bch2_gc_btree_init_recurse(c, b, + journal_keys, target_depth); +@@ -339,6 +358,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, + if (!ret) + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + &max_stale, true); ++fsck_err: + six_unlock_read(&b->lock); + + return ret; +-- +cgit v1.2.3 + + +From 4cb5daf2bb73440e6396d49bbe0d07abd064aa8f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Mar 2020 14:05:05 -0400 +Subject: bcachefs: Fix ec_stripe_update_ptrs() + +bch2_btree_iter_set_pos() invalidates the key returned by peek(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index d87e0093c7ee..4da1cdbc55db 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -804,8 +804,6 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + continue; + } + +- bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); +- + dev = s->key.v.ptrs[idx].dev; + + bkey_on_stack_reassemble(&sk, c, k); +@@ -820,6 +818,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + + extent_stripe_ptr_add(e, s, ec_ptr, idx); + ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + bch2_trans_update(&trans, iter, sk.k, 0); + + ret = bch2_trans_commit(&trans, NULL, NULL, +-- +cgit v1.2.3 + + +From 0c7749713147bf0c269616543767b51e64d54a45 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Mar 2020 14:29:06 -0400 +Subject: bcachefs: Fix inodes pass in fsck + +It wasn't updated for the patch that switched inodes to using the offset +field of struct bkey. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index c7508e81188c..3ab621c62c43 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1353,18 +1353,18 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, +- POS(range_start, 0), 0); ++ POS(0, range_start), 0); + nlinks_iter = genradix_iter_init(links, 0); + + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret2 = bkey_err(k))) { + peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); + +- if (!link && (!k.k || iter->pos.inode >= range_end)) ++ if (!link && (!k.k || iter->pos.offset >= range_end)) + break; + + nlinks_pos = range_start + nlinks_iter.pos; +- if (iter->pos.inode > nlinks_pos) { ++ if (iter->pos.offset > nlinks_pos) { + /* Should have been caught by dirents pass: */ + need_fsck_err_on(link && link->count, c, + "missing inode %llu (nlink %u)", +@@ -1373,7 +1373,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); + goto peek_nlinks; + } + +- if (iter->pos.inode < nlinks_pos || !link) ++ if (iter->pos.offset < nlinks_pos || !link) + link = &zero_links; + + if (k.k && k.k->type == KEY_TYPE_inode) { +@@ -1389,7 +1389,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); + nlinks_pos, link->count); + } + +- if (nlinks_pos == iter->pos.inode) ++ if (nlinks_pos == iter->pos.offset) + genradix_iter_advance(&nlinks_iter, links); + + bch2_btree_iter_next(iter); +-- +cgit v1.2.3 + + +From f4cfb1b4cbbd7e7b1fcc10791a3054f6e5400c39 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Mar 2020 17:43:21 -0400 +Subject: bcachefs: Fix a locking bug + +Dropping the wrong kind of lock can't lead to anything good... + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 1f16394fd5c3..337d2bdd29e8 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -114,7 +114,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + break; + } + +- six_unlock_read(&b->lock); ++ six_unlock_type(&b->lock, lock_held); + btree_node_wait_on_io(b); + btree_node_lock_type(c, b, lock_held); + } +-- +cgit v1.2.3 + + +From b9c53bd4e0833ef287e6f6bdf656be5450fff6c0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Mar 2020 18:11:13 -0400 +Subject: bcachefs: Fix iterating of journal keys within a btree node + +Extent btrees no longer have weird special behaviour for min_key. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 8cfae639e23f..a4d0eec2ea3e 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -183,17 +183,12 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i + struct journal_keys *journal_keys, + struct btree *b) + { +- struct bpos start = b->data->min_key; +- +- if (btree_node_type_is_extents(b->btree_id)) +- start = bkey_successor(start); +- + memset(iter, 0, sizeof(*iter)); + + iter->b = b; + bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); + bch2_journal_iter_init(&iter->journal, journal_keys, +- b->btree_id, b->level, start); ++ b->btree_id, b->level, b->data->min_key); + } + + /* sort and dedup all keys in the journal: */ +-- +cgit v1.2.3 + + +From 463f078d8d96241c88e5282cbb02217ef88c7070 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 31 Mar 2020 16:23:43 -0400 +Subject: bcachefs: Fix journalling of interior node updates + +We weren't journalling updates done while splitting/compacting nodes - +oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 99 +++++++++++++++++++------------------ + fs/bcachefs/btree_update_interior.h | 4 ++ + 2 files changed, 54 insertions(+), 49 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 018171b522af..8287ff6b80c6 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -27,43 +27,37 @@ static void btree_update_drop_new_node(struct bch_fs *, struct btree *); + + /* Debug code: */ + ++/* ++ * Verify that child nodes correctly span parent node's range: ++ */ + static void btree_node_interior_verify(struct btree *b) + { ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos next_node = b->data->min_key; + struct btree_node_iter iter; +- struct bkey_packed *k; ++ struct bkey_s_c k; ++ struct bkey_s_c_btree_ptr_v2 bp; ++ struct bkey unpacked; + + BUG_ON(!b->level); + +- bch2_btree_node_iter_init(&iter, b, &b->key.k.p); +-#if 1 +- BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || +- bkey_cmp_left_packed(b, k, &b->key.k.p)); ++ bch2_btree_node_iter_init_from_start(&iter, b); + +- BUG_ON((bch2_btree_node_iter_advance(&iter, b), +- !bch2_btree_node_iter_end(&iter))); +-#else +- const char *msg; ++ while (1) { ++ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); ++ bp = bkey_s_c_to_btree_ptr_v2(k); + +- msg = "not found"; +- k = bch2_btree_node_iter_peek(&iter, b); +- if (!k) +- goto err; ++ BUG_ON(bkey_cmp(next_node, bp.v->min_key)); + +- msg = "isn't what it should be"; +- if (bkey_cmp_left_packed(b, k, &b->key.k.p)) +- goto err; ++ bch2_btree_node_iter_advance(&iter, b); + +- bch2_btree_node_iter_advance(&iter, b); ++ if (bch2_btree_node_iter_end(&iter)) { ++ BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); ++ break; ++ } + +- msg = "isn't last key"; +- if (!bch2_btree_node_iter_end(&iter)) +- goto err; +- return; +-err: +- bch2_dump_btree_node(b); +- printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode, +- b->key.k.p.offset, msg); +- BUG(); ++ next_node = bkey_successor(k.k->p); ++ } + #endif + } + +@@ -644,8 +638,6 @@ static void btree_update_nodes_written(struct closure *cl) + struct bch_fs *c = as->c; + struct btree *b; + struct bset *i; +- struct bkey_i *k; +- unsigned journal_u64s = 0; + int ret; + + /* +@@ -674,13 +666,7 @@ again: + + 
list_del(&as->unwritten_list); + +- journal_u64s = 0; +- +- if (as->mode != BTREE_INTERIOR_UPDATING_ROOT) +- for_each_keylist_key(&as->parent_keys, k) +- journal_u64s += jset_u64s(k->k.u64s); +- +- ret = bch2_journal_res_get(&c->journal, &res, journal_u64s, ++ ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s, + JOURNAL_RES_GET_RESERVED); + if (ret) { + BUG_ON(!bch2_journal_error(&c->journal)); +@@ -688,13 +674,14 @@ again: + goto free_update; + } + +- if (as->mode != BTREE_INTERIOR_UPDATING_ROOT) +- for_each_keylist_key(&as->parent_keys, k) +- bch2_journal_add_entry(&c->journal, &res, +- BCH_JSET_ENTRY_btree_keys, +- as->btree_id, +- as->level, +- k, k->k.u64s); ++ { ++ struct journal_buf *buf = &c->journal.buf[res.idx]; ++ struct jset_entry *entry = vstruct_idx(buf->data, res.offset); ++ ++ res.offset += as->journal_u64s; ++ res.u64s -= as->journal_u64s; ++ memcpy_u64s(entry, as->journal_entries, as->journal_u64s); ++ } + + switch (as->mode) { + case BTREE_INTERIOR_NO_UPDATE: +@@ -983,7 +970,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, + bch2_keylist_init(&as->parent_keys, as->inline_keys); + + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, +- jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0); ++ ARRAY_SIZE(as->journal_entries), 0); + if (ret) { + bch2_btree_reserve_put(c, reserve); + closure_debug_destroy(&as->cl); +@@ -1103,10 +1090,21 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + { + struct bch_fs *c = as->c; + struct bch_fs_usage *fs_usage; ++ struct jset_entry *entry; + struct bkey_packed *k; + struct bkey tmp; + +- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ entry = (void *) &as->journal_entries[as->journal_u64s]; ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(insert->k.u64s); ++ entry->type = BCH_JSET_ENTRY_btree_keys; ++ entry->btree_id = b->btree_id; ++ entry->level = b->level; ++ memcpy_u64s_small(entry->_data, insert, insert->k.u64s); ++ as->journal_u64s += jset_u64s(insert->k.u64s); + + mutex_lock(&c->btree_interior_update_lock); + percpu_down_read(&c->mark_lock); +@@ -1255,6 +1253,14 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + struct bkey_packed *src, *dst, *n; + struct bset *i; + ++ /* ++ * XXX ++ * ++ * these updates must be journalled ++ * ++ * oops ++ */ ++ + BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); + + bch2_btree_node_iter_init(&node_iter, b, &k->k.p); +@@ -1262,11 +1268,6 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + while (!bch2_keylist_empty(keys)) { + k = bch2_keylist_front(keys); + +- BUG_ON(bch_keylist_u64s(keys) > +- bch_btree_keys_u64s_remaining(as->c, b)); +- BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0); +- BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0); +- + bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); + bch2_keylist_pop_front(keys); + } +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 0ac95dd80a38..aef8adf8c032 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -104,6 +104,10 @@ struct btree_update { + struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; + unsigned nr_new_nodes; + ++ unsigned journal_u64s; ++ u64 journal_entries[ ++ (BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2]; ++ + /* Only here to reduce stack usage on recursive 
splits: */ + struct keylist parent_keys; + /* +-- +cgit v1.2.3 + + +From 5fee5d3fe8c4ced15ac5a7fa8cb3b60af7f461ac Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 31 Mar 2020 16:25:30 -0400 +Subject: bcachefs: Add print method for bch2_btree_ptr_v2 + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 15 +++++++++++++++ + fs/bcachefs/extents.h | 5 ++++- + 2 files changed, 19 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 792c9c1e50b1..3f66457d2272 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -215,6 +215,21 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + bch2_bkey_ptrs_to_text(out, c, k); + } + ++void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ pr_buf(out, "seq %llu sectors %u written %u min_key ", ++ le64_to_cpu(bp.v->seq), ++ le16_to_cpu(bp.v->sectors), ++ le16_to_cpu(bp.v->sectors_written)); ++ ++ bch2_bpos_to_text(out, bp.v->min_key); ++ pr_buf(out, " "); ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ + void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, + unsigned big_endian, int write, + struct bkey_s k) +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 8ff2eac3ee2b..29b15365d19c 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -371,6 +371,9 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); ++ ++void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); + void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); + +@@ -384,7 +387,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .key_debugcheck = bch2_btree_ptr_debugcheck, \ +- .val_to_text = bch2_btree_ptr_to_text, \ ++ .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ + } +-- +cgit v1.2.3 + + +From 86f97fe47d7c2f7d64de8b995f2b18f64c31baf6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 1 Apr 2020 16:07:57 -0400 +Subject: bcachefs: Fix fallocate FL_INSERT_RANGE + +This was another bug because of bch2_btree_iter_set_pos() invalidating +iterators. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index e20e7ec722aa..32416ceb09aa 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2538,10 +2538,8 @@ reassemble: + bkey_on_stack_reassemble(©, c, k); + + if (insert && +- bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { ++ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) + bch2_cut_front(move_pos, copy.k); +- bch2_btree_iter_set_pos(src, bkey_start_pos(©.k->k)); +- } + + copy.k->k.p.offset += shift >> 9; + bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); +@@ -2561,8 +2559,9 @@ reassemble: + } + + bkey_init(&delete.k); +- delete.k.p = src->pos; +- bch2_key_resize(&delete.k, copy.k->k.size); ++ delete.k.p = copy.k->k.p; ++ delete.k.size = copy.k->k.size; ++ delete.k.p.offset -= shift >> 9; + + next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; + +@@ -2583,6 +2582,8 @@ reassemble: + BUG_ON(ret); + } + ++ bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); ++ + ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: + bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: + bch2_trans_commit(&trans, &disk_res, +-- +cgit v1.2.3 + + +From 08458a84280a4eddc639531b4ad360f4e3674cbd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 1 Apr 2020 17:14:14 -0400 +Subject: bcachefs: Trace where btree iterators are allocated + +This will help with iterator overflow bugs. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 13 +++++++------ + fs/bcachefs/btree_iter.h | 31 ++++++++++++++++++++++++++++--- + fs/bcachefs/btree_types.h | 1 + + 3 files changed, 36 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 7345fec8a98f..5528ba0f1d44 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1912,13 +1912,14 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) { +- pr_err("iter: btree %s pos %llu:%llu%s%s%s", ++ pr_err("iter: btree %s pos %llu:%llu%s%s%s %pf", + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, + (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", +- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : ""); ++ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", ++ (void *) iter->ip_allocated); + } + + panic("trans iter oveflow\n"); +@@ -2025,9 +2026,9 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + return iter; + } + +-struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans, +- enum btree_id btree_id, +- struct bpos pos, unsigned flags) ++struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos, unsigned flags) + { + struct btree_iter *iter = + __btree_trans_get_iter(trans, btree_id, pos, flags); +@@ -2064,7 +2065,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + return iter; + } + +-struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, ++struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, + struct btree_iter *src) + { + struct btree_iter *iter; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 1a3672a23b86..6456787a8f77 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -257,10 +257,35 @@ int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); + + void bch2_trans_unlink_iters(struct btree_trans *); + +-struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id, +- struct bpos, unsigned); +-struct btree_iter *bch2_trans_copy_iter(struct btree_trans *, ++struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, ++ struct bpos, unsigned); ++ ++static inline struct btree_iter * ++bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct btree_iter *iter = ++ __bch2_trans_get_iter(trans, btree_id, pos, flags); ++ ++ if (!IS_ERR(iter)) ++ iter->ip_allocated = _THIS_IP_; ++ return iter; ++} ++ ++struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, + struct btree_iter *); ++static inline struct btree_iter * ++bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter 
*src) ++{ ++ struct btree_iter *iter = ++ __bch2_trans_copy_iter(trans, src); ++ ++ if (!IS_ERR(iter)) ++ iter->ip_allocated = _THIS_IP_; ++ return iter; ++ ++} ++ + struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index e2649503cc9b..732cdc35aa7c 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -253,6 +253,7 @@ struct btree_iter { + * bch2_btree_iter_next_slot() can correctly advance pos. + */ + struct bkey k; ++ unsigned long ip_allocated; + }; + + static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) +-- +cgit v1.2.3 + + +From 84577e5c294c8d14c8ec8a90eca8d2df7099835a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 1 Apr 2020 17:28:39 -0400 +Subject: bcachefs: Add another mssing bch2_trans_iter_put() call + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 3d586e6a4e44..adae8a5bfa54 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -145,8 +145,6 @@ retry: + &inode->ei_journal_seq, + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); +- if (ret == -EINTR) +- goto retry; + + /* + * the btree node lock protects inode->ei_inode, not ei_update_lock; +@@ -155,6 +153,11 @@ retry: + if (!ret) + bch2_inode_update_after_write(c, inode, &inode_u, fields); + ++ bch2_trans_iter_put(&trans, iter); ++ ++ if (ret == -EINTR) ++ goto retry; ++ + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; + } +-- +cgit v1.2.3 + + +From 7a322d63d571dc978ba466a0904a94073a8df588 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Apr 2020 13:54:19 -0400 +Subject: bcachefs: Fix a null ptr deref during journal replay + +We were calling bch2_extent_can_insert() incorrectly; it should only be +called when the extents-to-keys pass is running because that's when we +could be splitting a compressed extent. Calling bch2_extent_can_insert() +without passing in a disk reservation was causing a null ptr deref. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index da2b93b58eed..b305c2e4013b 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -309,7 +309,7 @@ btree_key_can_insert(struct btree_trans *trans, + if (unlikely(btree_node_old_extent_overwrite(b))) + return BTREE_INSERT_BTREE_NODE_FULL; + +- ret = !btree_node_is_extents(b) ++ ret = !(iter->flags & BTREE_ITER_IS_EXTENTS) + ? BTREE_INSERT_OK + : bch2_extent_can_insert(trans, iter, insert); + if (ret) +-- +cgit v1.2.3 + + +From 64ef969ac8d93d56a57ca2a0b04927f8620bb0e8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Apr 2020 15:45:06 -0400 +Subject: bcachefs: Fix another error path locking bug + +btree_update_nodes_written() was leaking a btree node lock on failure to +get a journal reservation. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 8287ff6b80c6..90323a60e51c 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -725,7 +725,7 @@ again: + + bch2_journal_res_put(&c->journal, &res); + bch2_journal_preres_put(&c->journal, &as->journal_preres); +- ++free_update: + /* Do btree write after dropping journal res: */ + if (b) { + /* +@@ -736,8 +736,9 @@ again: + six_unlock_intent(&b->lock); + } + +- btree_update_nodes_reachable(as, res.seq); +-free_update: ++ if (!ret) ++ btree_update_nodes_reachable(as, res.seq); ++ + __bch2_btree_update_free(as); + /* + * for flush_held_btree_writes() waiting on updates to flush or +-- +cgit v1.2.3 + + +From 33c9ef1136ce95d9441be37e615f0c714dbd81f0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Apr 2020 15:49:42 -0400 +Subject: bcachefs: Fix a debug assertion + +This assertion was passing the wrong btree node type when inserting into +interior nodes. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index b305c2e4013b..7faf98fd2f64 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -250,7 +250,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, + + BUG_ON(bkey_cmp(insert->k.p, iter->pos)); + BUG_ON(debug_check_bkeys(c) && +- bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), ++ __btree_node_type(iter->level, iter->btree_id))); + } + + static noinline int +-- +cgit v1.2.3 + + +From 487ea7ade35e3853afd13b6604bb263fe37fdfc9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Apr 2020 16:47:59 -0400 +Subject: bcachefs: Fix a debug mode assertion + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 90323a60e51c..67f83dc95ca2 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -45,6 +45,8 @@ static void btree_node_interior_verify(struct btree *b) + + while (1) { + k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); ++ if (k.k->type != KEY_TYPE_btree_ptr_v2) ++ break; + bp = bkey_s_c_to_btree_ptr_v2(k); + + BUG_ON(bkey_cmp(next_node, bp.v->min_key)); +-- +cgit v1.2.3 + + +From 0e2f90f62613033c659bf5a269cf2822975cd364 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 5 Apr 2020 21:49:17 -0400 +Subject: bcachefs: Fix a deadlock on starting an interior btree update + +Not legal to block on a journal prereservation with btree locks held. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/btree_update_interior.c | 40 ++++++++++++++++++++++++------------- + fs/bcachefs/btree_update_interior.h | 8 +++++--- + 3 files changed, 32 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 5ca6851937de..59fc3a454cb9 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -933,7 +933,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + return; + } + +- as = bch2_btree_update_start(c, iter->btree_id, ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, + btree_update_reserve_required(c, parent) + nr_old_nodes, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 67f83dc95ca2..0d08d6fcd401 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -949,14 +949,34 @@ void bch2_btree_update_done(struct btree_update *as) + } + + struct btree_update * +-bch2_btree_update_start(struct bch_fs *c, enum btree_id id, ++bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + unsigned nr_nodes, unsigned flags, + struct closure *cl) + { ++ struct bch_fs *c = trans->c; ++ struct journal_preres journal_preres = { 0 }; + struct btree_reserve *reserve; + struct btree_update *as; + int ret; + ++ ret = bch2_journal_preres_get(&c->journal, &journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ JOURNAL_RES_GET_NONBLOCK); ++ if (ret == -EAGAIN) { ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, &journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ JOURNAL_RES_GET_NONBLOCK); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ if (!bch2_trans_relock(trans)) { ++ bch2_journal_preres_put(&c->journal, &journal_preres); ++ return ERR_PTR(-EINTR); ++ } ++ } ++ + reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); + if (IS_ERR(reserve)) + return ERR_CAST(reserve); +@@ -969,18 +989,10 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, + as->btree_id = id; + as->reserve = reserve; + INIT_LIST_HEAD(&as->write_blocked_list); ++ as->journal_preres = journal_preres; + + bch2_keylist_init(&as->parent_keys, as->inline_keys); + +- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, +- ARRAY_SIZE(as->journal_entries), 0); +- if (ret) { +- bch2_btree_reserve_put(c, reserve); +- closure_debug_destroy(&as->cl); +- mempool_free(as, &c->btree_interior_update_pool); +- return ERR_PTR(ret); +- } +- + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); +@@ -1551,7 +1563,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + goto out; + } + +- as = bch2_btree_update_start(c, iter->btree_id, ++ as = bch2_btree_update_start(trans, iter->btree_id, + btree_update_reserve_required(c, b), flags, + !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); + if (IS_ERR(as)) { +@@ -1663,7 +1675,7 @@ retry: + goto err_unlock; + } + +- as = bch2_btree_update_start(c, iter->btree_id, ++ as = bch2_btree_update_start(trans, iter->btree_id, + btree_update_reserve_required(c, parent) + 1, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, +@@ -1776,7 +1788,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + struct btree *n, *parent = btree_node_parent(iter, b); + struct btree_update *as; + +- as = bch2_btree_update_start(c, iter->btree_id, ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, + (parent + ? btree_update_reserve_required(c, parent) + : 0) + 1, +@@ -2043,7 +2055,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, + new_hash = bch2_btree_node_mem_alloc(c); + } + +- as = bch2_btree_update_start(c, iter->btree_id, ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, + parent ? btree_update_reserve_required(c, parent) : 0, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index aef8adf8c032..2fddf5d31eb9 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -32,6 +32,9 @@ struct pending_btree_node_free { + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + }; + ++#define BTREE_UPDATE_JOURNAL_RES \ ++ ((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2) ++ + /* + * Tracks an in progress split/rewrite of a btree node and the update to the + * parent node: +@@ -105,8 +108,7 @@ struct btree_update { + unsigned nr_new_nodes; + + unsigned journal_u64s; +- u64 journal_entries[ +- (BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2]; ++ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; + + /* Only here to reduce stack usage on recursive splits: */ + struct keylist parent_keys; +@@ -132,7 +134,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + + void bch2_btree_update_done(struct btree_update *); + struct btree_update * +-bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned, ++bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, + unsigned, struct closure *); + + void bch2_btree_interior_update_will_free_node(struct btree_update *, +-- +cgit v1.2.3 + + +From cc540a105b666d58820d248874d12f67c64b008f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Apr 2020 13:49:14 -0400 +Subject: bcachefs: Account for ioclock slop when throttling rebalance thread + +This should fix an issue where the rebalance thread was spinning + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/rebalance.c | 16 ++++++++++------ + 1 file changed, 10 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index ab1934325948..e15a2b1dc5d0 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -204,17 +204,21 @@ static int bch2_rebalance_thread(void *arg) + prev_run_time; + + if (w.dev_most_full_percent < 20 && throttle > 0) { +- r->state = REBALANCE_THROTTLED; + r->throttled_until_iotime = io_start + + div_u64(w.dev_most_full_capacity * + (20 - w.dev_most_full_percent), + 50); +- r->throttled_until_cputime = start + throttle; + +- bch2_kthread_io_clock_wait(clock, +- r->throttled_until_iotime, +- throttle); +- continue; ++ if (atomic_long_read(&clock->now) + clock->max_slop < ++ r->throttled_until_iotime) { ++ r->throttled_until_cputime = start + throttle; ++ r->state = REBALANCE_THROTTLED; ++ ++ 
bch2_kthread_io_clock_wait(clock, ++ r->throttled_until_iotime, ++ throttle); ++ continue; ++ } + } + + /* minimum 1 mb/sec: */ +-- +cgit v1.2.3 + + +From 6156db57767fb7dba7e73067d8f7a280ee88a9bd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Apr 2020 17:31:38 -0400 +Subject: bcachefs: Fix a locking bug in bch2_btree_ptr_debugcheck() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 3f66457d2272..d1a4ab04fbbf 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -180,7 +180,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) + return; + + bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && +- !bch2_bkey_replicas_marked(c, k, false), c, ++ !bch2_bkey_replicas_marked_locked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + +-- +cgit v1.2.3 + + +From 91be52e537ad2061741f935da6d499a5c0a2c92f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Apr 2020 17:27:12 -0400 +Subject: bcachefs: Fix another deadlock in the btree interior update path + +Can't take read locks on btree nodes while holding +btree_interior_update_lock. Also, fix a bug where we were leaking +journal prereservations. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 71 ++++++++++++++++++++++--------------- + 1 file changed, 42 insertions(+), 29 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 0d08d6fcd401..7815214ed317 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -609,33 +609,11 @@ static void bch2_btree_update_free(struct btree_update *as) + mutex_unlock(&c->btree_interior_update_lock); + } + +-static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) +-{ +- struct bch_fs *c = as->c; +- +- while (as->nr_new_nodes) { +- struct btree *b = as->new_nodes[--as->nr_new_nodes]; +- +- BUG_ON(b->will_make_reachable != (unsigned long) as); +- b->will_make_reachable = 0; +- +- /* +- * b->will_make_reachable prevented it from being written, so +- * write it now if it needs to be written: +- */ +- btree_node_lock_type(c, b, SIX_LOCK_read); +- bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); +- six_unlock_read(&b->lock); +- } +- +- while (as->nr_pending) +- bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending], +- seq); +-} +- + static void btree_update_nodes_written(struct closure *cl) + { + struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; ++ unsigned nr_new_nodes; + struct journal_res res = { 0 }; + struct bch_fs *c = as->c; + struct btree *b; +@@ -650,6 +628,7 @@ static void btree_update_nodes_written(struct closure *cl) + mutex_lock(&c->btree_interior_update_lock); + as->nodes_written = true; + again: ++ nr_new_nodes = 0; + as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, + struct btree_update, unwritten_list); + if (!as || !as->nodes_written) { +@@ -738,8 +717,23 @@ free_update: + six_unlock_intent(&b->lock); + } + +- if (!ret) +- btree_update_nodes_reachable(as, res.seq); ++ if (!ret) { ++ nr_new_nodes = as->nr_new_nodes; ++ memcpy(new_nodes, ++ as->new_nodes, ++ as->nr_new_nodes * sizeof(struct btree *)); ++ ++ while (as->nr_new_nodes) { ++ struct btree *b = 
as->new_nodes[--as->nr_new_nodes]; ++ ++ BUG_ON(b->will_make_reachable != (unsigned long) as); ++ b->will_make_reachable = 0; ++ } ++ ++ while (as->nr_pending) ++ bch2_btree_node_free_ondisk(c, ++ &as->pending[--as->nr_pending], res.seq); ++ } + + __bch2_btree_update_free(as); + /* +@@ -747,6 +741,20 @@ free_update: + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); ++ ++ /* ++ * Can't take btree node locks while holding btree_interior_update_lock: ++ * */ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ while (nr_new_nodes) { ++ struct btree *b = new_nodes[--nr_new_nodes]; ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); ++ six_unlock_read(&b->lock); ++ } ++ ++ mutex_lock(&c->btree_interior_update_lock); + goto again; + } + +@@ -963,11 +971,13 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + BTREE_UPDATE_JOURNAL_RES, + JOURNAL_RES_GET_NONBLOCK); + if (ret == -EAGAIN) { ++ if (flags & BTREE_INSERT_NOUNLOCK) ++ return ERR_PTR(-EINTR); ++ + bch2_trans_unlock(trans); + + ret = bch2_journal_preres_get(&c->journal, &journal_preres, +- BTREE_UPDATE_JOURNAL_RES, +- JOURNAL_RES_GET_NONBLOCK); ++ BTREE_UPDATE_JOURNAL_RES, 0); + if (ret) + return ERR_PTR(ret); + +@@ -978,8 +988,10 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + } + + reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); +- if (IS_ERR(reserve)) ++ if (IS_ERR(reserve)) { ++ bch2_journal_preres_put(&c->journal, &journal_preres); + return ERR_CAST(reserve); ++ } + + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); + memset(as, 0, sizeof(*as)); +@@ -1677,6 +1689,7 @@ retry: + + as = bch2_btree_update_start(trans, iter->btree_id, + btree_update_reserve_required(c, parent) + 1, ++ flags| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); +-- +cgit v1.2.3 + + +From 12aae2f315304616b149c41c290096c930a88801 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 11 Apr 2020 12:29:32 -0400 +Subject: bcachefs: Fix a locking bug in bch2_journal_pin_copy() + +There was a race where the src pin would be flushed - releasing the last +pin on that sequence number - before adding the new journal pin. Oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 17 +++++++++++++---- + 1 file changed, 13 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index db3afd908474..d34434f62454 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -322,14 +322,12 @@ void bch2_journal_pin_drop(struct journal *j, + spin_unlock(&j->lock); + } + +-void __bch2_journal_pin_add(struct journal *j, u64 seq, ++static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) + { + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + +- spin_lock(&j->lock); +- + __journal_pin_drop(j, pin); + + BUG_ON(!atomic_read(&pin_list->count)); +@@ -339,7 +337,14 @@ void __bch2_journal_pin_add(struct journal *j, u64 seq, + pin->flush = flush_fn; + + list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); ++} + ++void __bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ spin_lock(&j->lock); ++ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); + spin_unlock(&j->lock); + + /* +@@ -354,9 +359,13 @@ void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) + { ++ spin_lock(&j->lock); ++ + if (journal_pin_active(src) && + (!journal_pin_active(dst) || src->seq < dst->seq)) +- __bch2_journal_pin_add(j, src->seq, dst, flush_fn); ++ bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); ++ ++ spin_unlock(&j->lock); + } + + /** +-- +cgit v1.2.3 + + +From f3bd583ee93874fd5eacaa6009e01fdcd165f5c4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 11 Apr 2020 12:30:30 -0400 +Subject: bcachefs: Improve lockdep annotation in journalling code + +bch2_journal_res_get() in nonblocking mode is equivalent to a trylock. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.h | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index f0da2c52581c..78f5fac06bf5 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -344,7 +344,9 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re + return ret; + out: + if (!(flags & JOURNAL_RES_GET_CHECK)) { +- lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); ++ lock_acquire_shared(&j->res_map, 0, ++ (flags & JOURNAL_RES_GET_NONBLOCK) != 0, ++ NULL, _THIS_IP_); + EBUG_ON(!res->ref); + } + return 0; +-- +cgit v1.2.3 + + +From 0bfe8beb8d20f5ff1aa7b59782e42a92570a2a8c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 11 Apr 2020 12:31:16 -0400 +Subject: bcachefs: Slightly reduce btree split threshold + +2/3rds performs a lot better than 3/4ths on the tested workloda, leading +to significanly fewer btree node compactions. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 132cc95a4c02..98cca30778ea 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -94,7 +94,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) + return c->opts.btree_node_size >> c->block_bits; + } + +-#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 3 / 4) ++#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) + + #define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) + #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ +-- +cgit v1.2.3 + + +From 4fa92a1f74582d4fb5f1a3f4b833fc372568a945 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 11 Apr 2020 12:32:27 -0400 +Subject: bcachefs: Add a few tracepoints + +Transaction restart tracing should probably be overhaulled at some +point. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 6 +++++- + include/trace/events/bcachefs.h | 17 +++++++++++++++++ + 2 files changed, 22 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 7815214ed317..a5bd24bec059 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1555,8 +1555,10 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + /* Hack, because gc and splitting nodes doesn't mix yet: */ + if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && + !down_read_trylock(&c->gc_lock)) { +- if (flags & BTREE_INSERT_NOUNLOCK) ++ if (flags & BTREE_INSERT_NOUNLOCK) { ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); + return -EINTR; ++ } + + bch2_trans_unlock(trans); + down_read(&c->gc_lock); +@@ -1584,6 +1586,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + BUG_ON(flags & BTREE_INSERT_NOUNLOCK); + bch2_trans_unlock(trans); + ret = -EINTR; ++ ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); + } + goto out; + } +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index db828e9c1390..c30a60e1d805 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -499,6 +499,23 @@ TRACE_EVENT(copygc, + __entry->buckets_moved, __entry->buckets_not_moved) + ); + ++TRACE_EVENT(transaction_restart_ip, ++ TP_PROTO(unsigned long caller, unsigned long ip), ++ TP_ARGS(caller, ip), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, caller ) ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->caller = caller; ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) ++); ++ + DECLARE_EVENT_CLASS(transaction_restart, + TP_PROTO(unsigned long ip), + TP_ARGS(ip), +-- +cgit v1.2.3 + + +From cf93d7a568b4bce8f3288e9f2be2b7d79695d704 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 24 Apr 2020 14:08:18 -0400 +Subject: bcachefs: Fix for the bkey compat path + +In the write path, we were calling bch2_bkey_ops.compat() in the wrong +place. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 95 +++++++++++++++++++++++++++------------------- + 1 file changed, 55 insertions(+), 40 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index c97e1e9002cb..55ef4032b37c 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -283,49 +283,64 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, + const struct bkey_ops *ops; + struct bkey uk; + struct bkey_s u; +- +- if (big_endian != CPU_BIG_ENDIAN) +- bch2_bkey_swab_key(f, k); +- +- if (version < bcachefs_metadata_version_bkey_renumber) +- bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); +- +- if (version < bcachefs_metadata_version_inode_btree_change && +- btree_id == BTREE_ID_INODES) { ++ int i; ++ ++ /* ++ * Do these operations in reverse order in the write path: ++ */ ++ ++ for (i = 0; i < 4; i++) ++ switch (!write ? 
i : 3 - i) { ++ case 0: ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_key(f, k); ++ break; ++ case 1: ++ if (version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); ++ break; ++ case 2: ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) { ++ if (!bkey_packed(k)) { ++ struct bkey_i *u = packed_to_bkey(k); ++ swap(u->k.p.inode, u->k.p.offset); ++ } else if (f->bits_per_field[BKEY_FIELD_INODE] && ++ f->bits_per_field[BKEY_FIELD_OFFSET]) { ++ struct bkey_format tmp = *f, *in = f, *out = &tmp; ++ ++ swap(tmp.bits_per_field[BKEY_FIELD_INODE], ++ tmp.bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(tmp.field_offset[BKEY_FIELD_INODE], ++ tmp.field_offset[BKEY_FIELD_OFFSET]); ++ ++ if (!write) ++ swap(in, out); ++ ++ uk = __bch2_bkey_unpack_key(in, k); ++ swap(uk.p.inode, uk.p.offset); ++ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); ++ } ++ } ++ break; ++ case 3: + if (!bkey_packed(k)) { +- struct bkey_i *u = packed_to_bkey(k); +- swap(u->k.p.inode, u->k.p.offset); +- } else if (f->bits_per_field[BKEY_FIELD_INODE] && +- f->bits_per_field[BKEY_FIELD_OFFSET]) { +- struct bkey_format tmp = *f, *in = f, *out = &tmp; +- +- swap(tmp.bits_per_field[BKEY_FIELD_INODE], +- tmp.bits_per_field[BKEY_FIELD_OFFSET]); +- swap(tmp.field_offset[BKEY_FIELD_INODE], +- tmp.field_offset[BKEY_FIELD_OFFSET]); +- +- if (!write) +- swap(in, out); +- +- uk = __bch2_bkey_unpack_key(in, k); +- swap(uk.p.inode, uk.p.offset); +- BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); ++ u = bkey_i_to_s(packed_to_bkey(k)); ++ } else { ++ uk = __bch2_bkey_unpack_key(f, k); ++ u.k = &uk; ++ u.v = bkeyp_val(f, k); + } +- } + +- if (!bkey_packed(k)) { +- u = bkey_i_to_s(packed_to_bkey(k)); +- } else { +- uk = __bch2_bkey_unpack_key(f, k); +- u.k = &uk; +- u.v = bkeyp_val(f, k); +- } ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_val(u); + +- if (big_endian != CPU_BIG_ENDIAN) +- bch2_bkey_swab_val(u); ++ ops = &bch2_bkey_ops[k->type]; + +- ops = &bch2_bkey_ops[k->type]; +- +- if (ops->compat) +- ops->compat(btree_id, version, big_endian, write, u); ++ if (ops->compat) ++ ops->compat(btree_id, version, big_endian, write, u); ++ break; ++ default: ++ BUG(); ++ } + } +-- +cgit v1.2.3 + + +From 4eabb8a85a17d2706507e0adee4306ca7d2052ba Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 24 Apr 2020 14:08:56 -0400 +Subject: bcachefs: Handle -EINTR bch2_migrate_index_update() + +peek_slot() shouldn't return -EINTR when there's only a single live +iterator, but that's tricky to guarantee - we seem to be returning +-EINTR when we shouldn't, but it's easy enough to handle in the caller. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 4afda95f4017..67e495bc8aba 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -70,19 +70,26 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + while (1) { +- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ struct bkey_s_c k; + struct bkey_i *insert; +- struct bkey_i_extent *new = +- bkey_i_to_extent(bch2_keylist_front(keys)); ++ struct bkey_i_extent *new; + BKEY_PADDED(k) _new, _insert; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bool did_work = false; + int nr; + ++ bch2_trans_reset(&trans, 0); ++ ++ k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); +- if (ret) ++ if (ret) { ++ if (ret == -EINTR) ++ continue; + break; ++ } ++ ++ new = bkey_i_to_extent(bch2_keylist_front(keys)); + + if (bversion_cmp(k.k->version, new->k.version) || + !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) +-- +cgit v1.2.3 + + +From b393afab2d96f58bf715e6a481123c0d5e0d5e72 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 24 Apr 2020 18:25:11 -0400 +Subject: bcachefs: Fix a deadlock + +btree_node_lock_increment() was incorrectly skipping over the current +iter when checking if we should increment a node we already have locked. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_locking.h | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index fe8b58384a9e..bb4f66646da2 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -165,8 +165,7 @@ static inline bool btree_node_lock_increment(struct btree_iter *iter, + struct btree_iter *linked; + + trans_for_each_iter(iter->trans, linked) +- if (linked != iter && +- linked->l[level].b == b && ++ if (linked->l[level].b == b && + btree_node_locked_type(linked, level) >= want) { + six_lock_increment(&b->lock, want); + return true; +-- +cgit v1.2.3 + + +From 39427a4929e71e620c1db24ed72dc41aebeb693c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 24 Apr 2020 17:57:59 -0400 +Subject: bcachefs: More fixes for counting extent update iterators + +This is unfortunately really fragile - hopefully we'll be able to think +of a new approach at some point. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extent_update.c | 36 ++++++++++++++++++++++++------------ + 1 file changed, 24 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 2a7d913bdda3..d0af1bc17018 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -34,16 +34,10 @@ static int count_iters_for_insert(struct btree_trans *trans, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, +- unsigned max_iters, +- bool overwrite) ++ unsigned max_iters) + { +- int ret = 0; ++ int ret = 0, ret2 = 0; + +- /* +- * The extent update path requires an _additional_ iterator for each +- * extent we're inserting and overwriting: +- */ +- *nr_iters += 1; + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; +@@ -70,11 +64,14 @@ static int count_iters_for_insert(struct btree_trans *trans, + + for_each_btree_key(trans, iter, + BTREE_ID_REFLINK, POS(0, idx + offset), +- BTREE_ITER_SLOTS, r_k, ret) { ++ BTREE_ITER_SLOTS, r_k, ret2) { + if (bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) + break; + ++ /* extent_update_to_keys(), for the reflink_v update */ ++ *nr_iters += 1; ++ + *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); + + if (*nr_iters >= max_iters) { +@@ -92,7 +89,7 @@ static int count_iters_for_insert(struct btree_trans *trans, + } + } + +- return ret; ++ return ret2 ?: ret; + } + + #define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) +@@ -121,8 +118,11 @@ int bch2_extent_atomic_end(struct btree_iter *iter, + + *end = bpos_min(insert->k.p, b->key.k.p); + ++ /* extent_update_to_keys(): */ ++ nr_iters += 1; ++ + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, +- &nr_iters, EXTENT_ITERS_MAX / 2, false); ++ &nr_iters, EXTENT_ITERS_MAX / 2); + if (ret < 0) + return ret; + +@@ -139,8 +139,20 @@ int bch2_extent_atomic_end(struct btree_iter *iter, + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); + ++ /* extent_handle_overwrites(): */ ++ switch (bch2_extent_overlap(&insert->k, k.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ case BCH_EXTENT_OVERLAP_FRONT: ++ nr_iters += 1; ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ nr_iters += 2; ++ break; ++ } ++ + ret = count_iters_for_insert(trans, k, offset, end, +- &nr_iters, EXTENT_ITERS_MAX, true); ++ &nr_iters, EXTENT_ITERS_MAX); + if (ret) + break; + +-- +cgit v1.2.3 + + +From 6df5b51910576219cf70e50fc4d0feca73f3f7e5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 29 Apr 2020 15:28:25 -0400 +Subject: bcachefs: Don't issue writes that are more than 1 MB + +the bcachefs io path in io.c can't bounce writes larger than that. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 32416ceb09aa..22e34806f777 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1264,7 +1264,8 @@ do_io: + if (w->io && + (w->io->op.res.nr_replicas != nr_replicas_this_write || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || +- w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) || ++ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= ++ (BIO_MAX_VECS * PAGE_SIZE) || + bio_end_sector(&w->io->op.wbio.bio) != sector)) + bch2_writepage_do_io(w); + +@@ -1838,12 +1839,22 @@ static long bch2_dio_write_loop(struct dio_write *dio) + goto loop; + + while (1) { ++ size_t extra = dio->iter.count - ++ min(BIO_MAX_VECS * PAGE_SIZE, dio->iter.count); ++ + if (kthread) + kthread_use_mm(dio->mm); + BUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; + ++ /* ++ * Don't issue more than 2MB at once, the bcachefs io path in ++ * io.c can't bounce more than that: ++ */ ++ ++ dio->iter.count -= extra; + ret = bio_iov_iter_get_pages(bio, &dio->iter); ++ dio->iter.count += extra; + + current->faults_disabled_mapping = NULL; + if (kthread) +-- +cgit v1.2.3 + + +From e69a598b86613168d073f37d7d10d578e6d80de0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 29 Apr 2020 12:57:04 -0400 +Subject: bcachefs: Add some printks for error paths + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 10 ++++++++-- + fs/bcachefs/replicas.c | 17 ++++++++++++----- + 2 files changed, 20 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 59fc3a454cb9..769f3bc8faf9 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -699,8 +699,10 @@ static int bch2_gc_start(struct bch_fs *c, + + c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), + sizeof(u64), GFP_KERNEL); +- if (!c->usage_gc) ++ if (!c->usage_gc) { ++ bch_err(c, "error allocating c->usage_gc"); + return -ENOMEM; ++ } + + for_each_member_device(ca, c, i) { + BUG_ON(ca->buckets[1]); +@@ -711,19 +713,23 @@ static int bch2_gc_start(struct bch_fs *c, + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets[1]) { + percpu_ref_put(&ca->ref); ++ bch_err(c, "error allocating ca->buckets[gc]"); + return -ENOMEM; + } + + ca->usage[1] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[1]) { ++ bch_err(c, "error allocating ca->usage[gc]"); + percpu_ref_put(&ca->ref); + return -ENOMEM; + } + } + + ret = bch2_ec_mem_alloc(c, true); +- if (ret) ++ if (ret) { ++ bch_err(c, "error allocating ec gc mem"); + return ret; ++ } + + percpu_down_write(&c->mark_lock); + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index be4908575f72..67a7128fd9af 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -299,8 +299,10 @@ static int replicas_table_update(struct bch_fs *c, + GFP_NOIO)) || + !(new_scratch = kmalloc(bytes, GFP_NOIO)) || + (c->usage_gc && +- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) ++ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { ++ bch_err(c, "error updating replicas table: memory allocation failure"); + goto err; ++ } + + if (c->usage_base) + __replicas_table_update(new_base, new_r, +@@ -362,7 +364,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, + struct bch_replicas_entry *new_entry) + { + struct bch_replicas_cpu new_r, new_gc; +- int ret = -ENOMEM; ++ int ret = 0; + + 
verify_replicas_entry(new_entry); + +@@ -409,14 +411,16 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, + swap(new_gc, c->replicas_gc); + percpu_up_write(&c->mark_lock); + out: +- ret = 0; +-err: + mutex_unlock(&c->sb_lock); + + kfree(new_r.entries); + kfree(new_gc.entries); + + return ret; ++err: ++ bch_err(c, "error adding replicas entry: memory allocation failure"); ++ ret = -ENOMEM; ++ goto out; + } + + int bch2_mark_replicas(struct bch_fs *c, +@@ -561,6 +565,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) + GFP_NOIO); + if (!c->replicas_gc.entries) { + mutex_unlock(&c->sb_lock); ++ bch_err(c, "error allocating c->replicas_gc"); + return -ENOMEM; + } + +@@ -586,8 +591,10 @@ retry: + nr = READ_ONCE(c->replicas.nr); + new.entry_size = READ_ONCE(c->replicas.entry_size); + new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); +- if (!new.entries) ++ if (!new.entries) { ++ bch_err(c, "error allocating c->replicas_gc"); + return -ENOMEM; ++ } + + mutex_lock(&c->sb_lock); + percpu_down_write(&c->mark_lock); +-- +cgit v1.2.3 + + +From 14ed207af2b031cd4a73ed14135d6d1eb8cb844e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 1 May 2020 19:56:31 -0400 +Subject: bcachefs: Fix another deadlock in btree_update_nodes_written() + +We also can't be blocking on btree node write locks while holding +btree_interior_update_lock. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 41 ++++++++++++++++++++++++++++++++++--- + 1 file changed, 38 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index a5bd24bec059..007941f93b25 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -609,6 +609,19 @@ static void bch2_btree_update_free(struct btree_update *as) + mutex_unlock(&c->btree_interior_update_lock); + } + ++static inline bool six_trylock_intentwrite(struct six_lock *lock) ++{ ++ if (!six_trylock_intent(lock)) ++ return false; ++ ++ if (!six_trylock_write(lock)) { ++ six_unlock_intent(lock); ++ return false; ++ } ++ ++ return true; ++} ++ + static void btree_update_nodes_written(struct closure *cl) + { + struct btree_update *as = container_of(cl, struct btree_update, cl); +@@ -637,10 +650,15 @@ again: + } + + b = as->b; +- if (b && !six_trylock_intent(&b->lock)) { ++ if (b && !six_trylock_intentwrite(&b->lock)) { + mutex_unlock(&c->btree_interior_update_lock); ++ + btree_node_lock_type(c, b, SIX_LOCK_intent); ++ six_lock_write(&b->lock, NULL, NULL); ++ ++ six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); ++ + mutex_lock(&c->btree_interior_update_lock); + goto again; + } +@@ -648,7 +666,25 @@ again: + list_del(&as->unwritten_list); + + ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s, ++ JOURNAL_RES_GET_NONBLOCK| + JOURNAL_RES_GET_RESERVED); ++ if (ret == -EAGAIN) { ++ unsigned u64s = as->journal_u64s; ++ ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ ret = bch2_journal_res_get(&c->journal, &res, u64s, ++ JOURNAL_RES_GET_CHECK| ++ JOURNAL_RES_GET_RESERVED); ++ if (!ret) { ++ mutex_lock(&c->btree_interior_update_lock); ++ goto again; ++ } ++ } ++ + if (ret) { + BUG_ON(!bch2_journal_error(&c->journal)); + /* can't unblock btree writes */ +@@ -671,7 +707,6 @@ again: + /* @b is the node we did the final insert into: */ + BUG_ON(!res.ref); + +- six_lock_write(&b->lock, NULL, NULL); + list_del(&as->write_blocked_list); + + i = 
btree_bset_last(b); +@@ -680,7 +715,6 @@ again: + le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, res.seq); +- six_unlock_write(&b->lock); + break; + + case BTREE_INTERIOR_UPDATING_AS: +@@ -709,6 +743,7 @@ again: + free_update: + /* Do btree write after dropping journal res: */ + if (b) { ++ six_unlock_write(&b->lock); + /* + * b->write_blocked prevented it from being written, so + * write it now if it needs to be written: +-- +cgit v1.2.3 + + +From d8723df7f1510bf1ab94279e681fcb9a15c88db4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 2 May 2020 16:21:35 -0400 +Subject: bcachefs: Fix two more deadlocks + +Deadlock on shutdown: + +btree_update_nodes_written() unblocks btree nodes from being written; +after doing so, it has to check if they were marked as needing to be +written and if so kick off those writes - if that doesn't happen, we'll +never release journal pins and shutdown will get stuck when flushing the +journal. + +There was an error path where this didn't happen, because in the error +path we don't actually want those btree nodes write to happen; however, +we still have to kick off the write path so the journal pins get +released. The btree write path checks if we're in a journal error state +and doesn't do the actual write if we are. + +Also - there was another deadlock because btree_update_nodes_written() +was taking the btree update off of the unwritten_list too soon - before +getting a journal reservation, which could fail and have to be retried. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 5 ++ + fs/bcachefs/btree_update_interior.c | 112 +++++++++++++++++++----------------- + 2 files changed, 64 insertions(+), 53 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index ac8b98861aae..946fffb6f51e 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1626,6 +1626,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + * reflect that those writes were done and the data flushed from the + * journal: + * ++ * Also on journal error, the pending write may have updates that were ++ * never journalled (interior nodes, see btree_update_nodes_written()) - ++ * it's critical that we don't do the write in that case otherwise we ++ * will have updates visible that weren't in the journal: ++ * + * Make sure to update b->written so bch2_btree_init_next() doesn't + * break: + */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 007941f93b25..deb67c5e0ba4 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -586,12 +586,12 @@ static void __bch2_btree_update_free(struct btree_update *as) + bch2_journal_pin_drop(&c->journal, &as->journal); + bch2_journal_pin_flush(&c->journal, &as->journal); + +- BUG_ON((as->nr_new_nodes || as->nr_pending) && +- !bch2_journal_error(&c->journal));; ++ BUG_ON(as->nr_new_nodes || as->nr_pending); + + if (as->reserve) + bch2_btree_reserve_put(c, as->reserve); + ++ list_del(&as->unwritten_list); + list_del(&as->list); + + closure_debug_destroy(&as->cl); +@@ -625,12 +625,12 @@ static inline bool six_trylock_intentwrite(struct six_lock *lock) + static void btree_update_nodes_written(struct closure *cl) + { + struct btree_update *as = container_of(cl, struct btree_update, cl); +- struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; +- unsigned nr_new_nodes; ++ struct btree *nodes_need_write[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES + 1]; ++ unsigned 
nr_nodes_need_write; + struct journal_res res = { 0 }; + struct bch_fs *c = as->c; ++ struct btree_root *r; + struct btree *b; +- struct bset *i; + int ret; + + /* +@@ -641,7 +641,7 @@ static void btree_update_nodes_written(struct closure *cl) + mutex_lock(&c->btree_interior_update_lock); + as->nodes_written = true; + again: +- nr_new_nodes = 0; ++ nr_nodes_need_write = 0; + as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, + struct btree_update, unwritten_list); + if (!as || !as->nodes_written) { +@@ -663,16 +663,16 @@ again: + goto again; + } + +- list_del(&as->unwritten_list); +- + ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s, + JOURNAL_RES_GET_NONBLOCK| + JOURNAL_RES_GET_RESERVED); + if (ret == -EAGAIN) { + unsigned u64s = as->journal_u64s; + +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); ++ if (b) { ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); ++ } + + mutex_unlock(&c->btree_interior_update_lock); + +@@ -685,19 +685,22 @@ again: + } + } + +- if (ret) { +- BUG_ON(!bch2_journal_error(&c->journal)); +- /* can't unblock btree writes */ +- goto free_update; +- } +- +- { ++ if (!ret) { + struct journal_buf *buf = &c->journal.buf[res.idx]; + struct jset_entry *entry = vstruct_idx(buf->data, res.offset); + + res.offset += as->journal_u64s; + res.u64s -= as->journal_u64s; + memcpy_u64s(entry, as->journal_entries, as->journal_u64s); ++ } else { ++ /* ++ * On journal error we have to run most of the normal path so ++ * that shutdown works - unblocking btree node writes in ++ * particular and writing them if needed - except for ++ * journalling the update: ++ */ ++ ++ BUG_ON(!bch2_journal_error(&c->journal)); + } + + switch (as->mode) { +@@ -705,24 +708,41 @@ again: + BUG(); + case BTREE_INTERIOR_UPDATING_NODE: + /* @b is the node we did the final insert into: */ +- BUG_ON(!res.ref); ++ ++ /* ++ * On failure to get a journal reservation, we still have to ++ * unblock the write and allow most of the write path to happen ++ * so that shutdown works, but the i->journal_seq mechanism ++ * won't work to prevent the btree write from being visible (we ++ * didn't get a journal sequence number) - instead ++ * __bch2_btree_node_write() doesn't do the actual write if ++ * we're in journal error state: ++ */ + + list_del(&as->write_blocked_list); + +- i = btree_bset_last(b); +- i->journal_seq = cpu_to_le64( +- max(res.seq, +- le64_to_cpu(i->journal_seq))); ++ if (!ret) { ++ struct bset *i = btree_bset_last(b); ++ ++ i->journal_seq = cpu_to_le64( ++ max(res.seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, res.seq); ++ } ++ ++ nodes_need_write[nr_nodes_need_write++] = b; + +- bch2_btree_add_journal_pin(c, b, res.seq); ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); + break; + + case BTREE_INTERIOR_UPDATING_AS: + BUG_ON(b); + break; + +- case BTREE_INTERIOR_UPDATING_ROOT: { +- struct btree_root *r = &c->btree_roots[as->btree_id]; ++ case BTREE_INTERIOR_UPDATING_ROOT: ++ r = &c->btree_roots[as->btree_id]; + + BUG_ON(b); + +@@ -734,42 +754,25 @@ again: + mutex_unlock(&c->btree_root_lock); + break; + } +- } + + bch2_journal_pin_drop(&c->journal, &as->journal); + + bch2_journal_res_put(&c->journal, &res); + bch2_journal_preres_put(&c->journal, &as->journal_preres); +-free_update: +- /* Do btree write after dropping journal res: */ +- if (b) { +- six_unlock_write(&b->lock); +- /* +- * b->write_blocked prevented it from being written, so +- * write it now if it needs to be written: +- */ +- 
btree_node_write_if_need(c, b, SIX_LOCK_intent); +- six_unlock_intent(&b->lock); +- } + +- if (!ret) { +- nr_new_nodes = as->nr_new_nodes; +- memcpy(new_nodes, +- as->new_nodes, +- as->nr_new_nodes * sizeof(struct btree *)); ++ while (as->nr_new_nodes) { ++ b = as->new_nodes[--as->nr_new_nodes]; + +- while (as->nr_new_nodes) { +- struct btree *b = as->new_nodes[--as->nr_new_nodes]; ++ BUG_ON(b->will_make_reachable != (unsigned long) as); ++ b->will_make_reachable = 0; + +- BUG_ON(b->will_make_reachable != (unsigned long) as); +- b->will_make_reachable = 0; +- } +- +- while (as->nr_pending) +- bch2_btree_node_free_ondisk(c, +- &as->pending[--as->nr_pending], res.seq); ++ nodes_need_write[nr_nodes_need_write++] = b; + } + ++ while (as->nr_pending) ++ bch2_btree_node_free_ondisk(c, ++ &as->pending[--as->nr_pending], res.seq); ++ + __bch2_btree_update_free(as); + /* + * for flush_held_btree_writes() waiting on updates to flush or +@@ -782,8 +785,10 @@ free_update: + * */ + mutex_unlock(&c->btree_interior_update_lock); + +- while (nr_new_nodes) { +- struct btree *b = new_nodes[--nr_new_nodes]; ++ /* Do btree writes after dropping journal res/locks: */ ++ while (nr_nodes_need_write) { ++ b = nodes_need_write[--nr_nodes_need_write]; ++ + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); + six_unlock_read(&b->lock); +@@ -1036,6 +1041,7 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + as->btree_id = id; + as->reserve = reserve; + INIT_LIST_HEAD(&as->write_blocked_list); ++ INIT_LIST_HEAD(&as->unwritten_list); + as->journal_preres = journal_preres; + + bch2_keylist_init(&as->parent_keys, as->inline_keys); +-- +cgit v1.2.3 + + +From e16cc5dcc4d03a3d6c9e340237617977b04ad586 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 6 May 2020 15:37:04 -0400 +Subject: bcachefs: Some compression improvements + +In __bio_map_or_bounce(), the check for if the bio is physically +contiguous is improved; it's now more readable and handles multi page +but contiguous bios. + +Also when decompressing, we were doing a redundant memcpy in the case +where we were able to use vmap to map a bio contigiously. 
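+
+The contiguity test reduces to a simple invariant: each segment must begin
+exactly where the previous one ended. A minimal userspace sketch of that
+check (struct seg and segs_contig() are illustrative names, not the kernel
+API):
+
+  #include <stdbool.h>
+  #include <stddef.h>
+  #include <stdio.h>
+
+  struct seg {
+      const char *ptr;
+      size_t len;
+  };
+
+  static bool segs_contig(const struct seg *segs, size_t nr)
+  {
+      const char *expected = NULL;
+
+      for (size_t i = 0; i < nr; i++) {
+          /* Every segment after the first must start at the address
+           * where the previous segment ended: */
+          if (expected && segs[i].ptr != expected)
+              return false;
+          expected = segs[i].ptr + segs[i].len;
+      }
+      return true;
+  }
+
+  int main(void)
+  {
+      char buf[64];
+      struct seg contig[] = { { buf, 16 }, { buf + 16, 16 } };
+      struct seg split[]  = { { buf, 16 }, { buf + 32, 16 } };
+
+      printf("%d %d\n", (int)segs_contig(contig, 2),
+                        (int)segs_contig(split, 2));
+      return 0;
+  }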
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/compress.c | 53 +++++++++++++++++++++++++++++++++++--------------- + 1 file changed, 37 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 62e560a83a61..ed3879deafb6 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -39,6 +39,24 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) + BUG(); + } + ++static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ void *expected_start = NULL; ++ ++ __bio_for_each_bvec(bv, bio, iter, start) { ++ if (expected_start && ++ expected_start != page_address(bv.bv_page) + bv.bv_offset) ++ return false; ++ ++ expected_start = page_address(bv.bv_page) + ++ bv.bv_offset + bv.bv_len; ++ } ++ ++ return true; ++} ++ + static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + struct bvec_iter start, int rw) + { +@@ -48,27 +66,28 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + unsigned nr_pages = 0; + struct page *stack_pages[16]; + struct page **pages = NULL; +- bool first = true; +- unsigned prev_end = PAGE_SIZE; + void *data; + + BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); + +-#ifndef CONFIG_HIGHMEM +- __bio_for_each_bvec(bv, bio, iter, start) { +- if (bv.bv_len == start.bi_size) +- return (struct bbuf) { +- .b = page_address(bv.bv_page) + bv.bv_offset, +- .type = BB_NONE, .rw = rw +- }; +- } +-#endif ++ if (!IS_ENABLED(CONFIG_HIGHMEM) && ++ bio_phys_contig(bio, start)) ++ return (struct bbuf) { ++ .b = page_address(bio_iter_page(bio, start)) + ++ bio_iter_offset(bio, start), ++ .type = BB_NONE, .rw = rw ++ }; ++ ++ /* check if we can map the pages contiguously: */ + __bio_for_each_segment(bv, bio, iter, start) { +- if ((!first && bv.bv_offset) || +- prev_end != PAGE_SIZE) ++ if (iter.bi_size != start.bi_size && ++ bv.bv_offset) ++ goto bounce; ++ ++ if (bv.bv_len < iter.bi_size && ++ bv.bv_offset + bv.bv_len < PAGE_SIZE) + goto bounce; + +- prev_end = bv.bv_offset + bv.bv_len; + nr_pages++; + } + +@@ -264,7 +283,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, + if (ret) + goto err; + +- if (dst_data.type != BB_NONE) ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) + memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); + err: + bio_unmap_or_unbounce(c, dst_data); +@@ -407,7 +427,8 @@ static unsigned __bio_compress(struct bch_fs *c, + memset(dst_data.b + *dst_len, 0, pad); + *dst_len += pad; + +- if (dst_data.type != BB_NONE) ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) + memcpy_to_bio(dst, dst->bi_iter, dst_data.b); + + BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); +-- +cgit v1.2.3 + + +From 90959c01853b6c526c691a29f550360b54088dce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 8 May 2020 23:15:42 -0400 +Subject: bcachefs: Fix initialization of bounce mempools + +When they were converted to kvpmalloc pools they weren't converted to +pass the actual size of the allocation. Oops. + +Also, validate the real length in the zstd decompression path. 
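+
+The zstd part of the fix is essentially "validate an untrusted length header
+before using it". A hedged, self-contained sketch of that pattern (assuming a
+little-endian host; the function name is made up):
+
+  #include <stdint.h>
+  #include <stdio.h>
+  #include <string.h>
+
+  /* The compressed payload is prefixed with a 4-byte little-endian length.
+   * Reject it unless the claimed length actually fits in the buffer. */
+  static int get_payload_len(const uint8_t *src, size_t src_len, uint32_t *out)
+  {
+      uint32_t claimed;
+
+      if (src_len < 4)
+          return -1;
+      memcpy(&claimed, src, 4);
+
+      if (claimed > src_len - 4)
+          return -1;        /* header lies about the payload size */
+
+      *out = claimed;
+      return 0;
+  }
+
+  int main(void)
+  {
+      uint8_t buf[16] = { 8, 0, 0, 0 };    /* claims an 8-byte payload */
+      uint32_t len = 0;
+      int ret = get_payload_len(buf, sizeof(buf), &len);
+
+      printf("ret=%d len=%u\n", ret, (unsigned)len);
+      return 0;
+  }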
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/compress.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index ed3879deafb6..20bde73a17a8 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -191,20 +191,21 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + } + case BCH_COMPRESSION_TYPE_zstd: { + ZSTD_DCtx *ctx; +- size_t len; ++ size_t real_src_len = le32_to_cpup(src_data.b); ++ ++ if (real_src_len > src_len - 4) ++ goto err; + + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); + ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); + +- src_len = le32_to_cpup(src_data.b); +- + ret = zstd_decompress_dctx(ctx, + dst_data, dst_len, +- src_data.b + 4, src_len); ++ src_data.b + 4, real_src_len); + + mempool_free(workspace, &c->decompress_workspace); + +- if (len != dst_len) ++ if (ret != dst_len) + goto err; + break; + } +@@ -533,7 +534,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) + static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) + { + size_t max_extent = c->sb.encoded_extent_max << 9; +- size_t order = get_order(max_extent); + size_t decompress_workspace_size = 0; + bool decompress_workspace_needed; + ZSTD_parameters params = zstd_get_params(0, max_extent); +@@ -568,14 +568,14 @@ have_compressed: + + if (!mempool_initialized(&c->compression_bounce[READ])) { + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], +- 1, order); ++ 1, max_extent); + if (ret) + goto out; + } + + if (!mempool_initialized(&c->compression_bounce[WRITE])) { + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], +- 1, order); ++ 1, max_extent); + if (ret) + goto out; + } +-- +cgit v1.2.3 + + +From 67731b5e2f677538026f655a495eadd022b92bbf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 11 May 2020 20:01:07 -0400 +Subject: bcachefs: Fixes for startup on very full filesystems + + - Always pass BTREE_INSERT_USE_RESERVE when writing alloc btree keys + - Don't strand buckest on the copygc freelist until after recovery is + done and we're starting copygc. 
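+
+The second point can be pictured as a guard in the loop that hands freed
+buckets out to the per-reserve freelists: until startup has finished, the
+copygc reserve is skipped so buckets stay usable elsewhere. A toy model
+(RESERVE_MOVINGGC mirrors the patch, the rest of the names are invented):
+
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  enum reserve { RESERVE_MOVINGGC, RESERVE_NONE, RESERVE_NR };
+
+  static bool fs_started;
+
+  static void push_bucket(unsigned bucket)
+  {
+      for (int i = 0; i < RESERVE_NR; i++) {
+          /* Don't strand buckets on the copygc freelist before startup: */
+          if (!fs_started && i == RESERVE_MOVINGGC)
+              continue;
+
+          printf("bucket %u -> reserve %d\n", bucket, i);
+          return;
+      }
+  }
+
+  int main(void)
+  {
+      push_bucket(1);      /* goes to RESERVE_NONE while starting up */
+      fs_started = true;
+      push_bucket(2);      /* copygc reserve is allowed again */
+      return 0;
+  }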
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 16 ++++++++++++++-- + fs/bcachefs/super.c | 3 ++- + 2 files changed, 16 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 0da8de167ff1..98b87994ca64 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -315,7 +315,9 @@ retry: + bch2_trans_update(trans, iter, &a->k_i, + BTREE_TRIGGER_NORUN); + ret = bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL|flags); ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ flags); + err: + if (ret == -EINTR) + goto retry; +@@ -1033,7 +1035,16 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t + set_current_state(TASK_INTERRUPTIBLE); + + spin_lock(&c->freelist_lock); +- for (i = 0; i < RESERVE_NR; i++) ++ for (i = 0; i < RESERVE_NR; i++) { ++ ++ /* ++ * Don't strand buckets on the copygc freelist until ++ * after recovery is finished: ++ */ ++ if (!test_bit(BCH_FS_STARTED, &c->flags) && ++ i == RESERVE_MOVINGGC) ++ continue; ++ + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + +@@ -1043,6 +1054,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t + spin_unlock(&c->freelist_lock); + goto out; + } ++ } + + if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { + ca->allocator_state = ALLOCATOR_BLOCKED_FULL; +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index dcca94f7b0a0..4560398f8c27 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -849,6 +849,8 @@ int bch2_fs_start(struct bch_fs *c) + if (bch2_fs_init_fault("fs_start")) + goto err; + ++ set_bit(BCH_FS_STARTED, &c->flags); ++ + if (c->opts.read_only || c->opts.nochanges) { + bch2_fs_read_only(c); + } else { +@@ -860,7 +862,6 @@ int bch2_fs_start(struct bch_fs *c) + goto err; + } + +- set_bit(BCH_FS_STARTED, &c->flags); + print_mount_opts(c); + ret = 0; + out: +-- +cgit v1.2.3 + + +From c868d2461d5181c5c33a7f4541dd4d961743ee87 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 12 May 2020 18:34:16 -0400 +Subject: bcachefs: Validate that we read the correct btree node + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 946fffb6f51e..63063748d4f5 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -736,6 +736,17 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + struct btree_node *bn = + container_of(i, struct btree_node, keys); + /* These indicate that we read the wrong btree node: */ ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ /* XXX endianness */ ++ btree_err_on(bp->seq != bn->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "incorrect sequence number (wrong btree node)"); ++ } ++ + btree_err_on(BTREE_NODE_ID(bn) != b->btree_id, + BTREE_ERR_MUST_RETRY, c, b, i, + "incorrect btree id"); +-- +cgit v1.2.3 + + +From 7520d3d86c3e95bb4ccabd008556e9dc9c7ea2f3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 13 May 2020 00:15:28 -0400 +Subject: bcachefs: Fix a workqueue deadlock + +writes running out of a workqueue (via dio path) could block and prevent +other writes from calling bch2_write_index() and completing. 
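+
+The rule the fix applies can be modelled without any kernel infrastructure:
+work running on the queue itself must not block waiting for other work items,
+it has to re-queue itself and return. In this sketch the in_worker_context
+flag stands in for the PF_WQ_WORKER check; everything else is invented:
+
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  struct op {
+      bool in_worker_context;
+  };
+
+  static void requeue_for_later(struct op *op)
+  {
+      (void)op;
+      printf("requeued, worker stays free\n");
+  }
+
+  static void wait_synchronously(struct op *op)
+  {
+      (void)op;
+      printf("blocked until resources freed up\n");
+  }
+
+  static void handle_backpressure(struct op *op)
+  {
+      if (op->in_worker_context) {
+          /* Blocking here would stall the very queue that has to make
+           * progress for us to ever be woken: */
+          requeue_for_later(op);
+          return;
+      }
+      /* In ordinary caller context, blocking is the backpressure we want. */
+      wait_synchronously(op);
+  }
+
+  int main(void)
+  {
+      struct op from_worker = { .in_worker_context = true };
+      struct op from_caller = { .in_worker_context = false };
+
+      handle_backpressure(&from_worker);
+      handle_backpressure(&from_caller);
+      return 0;
+  }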
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 29 +++++++++++++++++++++++++++-- + fs/bcachefs/io.h | 1 + + 2 files changed, 28 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index ff1649e10ef9..de8d926b5f0d 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -588,7 +588,9 @@ static void bch2_write_index(struct closure *cl) + + __bch2_write_index(op); + +- if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { ++ if (!(op->flags & BCH_WRITE_DONE)) { ++ continue_at(cl, __bch2_write, index_update_wq(op)); ++ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { + bch2_journal_flush_seq_async(&c->journal, + *op_journal_seq(op), + cl); +@@ -1102,8 +1104,15 @@ again: + if (ret < 0) + goto err; + +- if (ret) ++ if (ret) { + skip_put = false; ++ } else { ++ /* ++ * for the skip_put optimization this has to be set ++ * before we submit the bio: ++ */ ++ op->flags |= BCH_WRITE_DONE; ++ } + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; +@@ -1126,16 +1135,30 @@ again: + return; + err: + op->error = ret; ++ op->flags |= BCH_WRITE_DONE; + + continue_at(cl, bch2_write_index, index_update_wq(op)); + return; + flush_io: ++ /* ++ * If the write can't all be submitted at once, we generally want to ++ * block synchronously as that signals backpressure to the caller. ++ * ++ * However, if we're running out of a workqueue, we can't block here ++ * because we'll be blocking other work items from completing: ++ */ ++ if (current->flags & PF_WQ_WORKER) { ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ return; ++ } ++ + closure_sync(cl); + + if (!bch2_keylist_empty(&op->insert_keys)) { + __bch2_write_index(op); + + if (op->error) { ++ op->flags |= BCH_WRITE_DONE; + continue_at_nobarrier(cl, bch2_write_done, NULL); + return; + } +@@ -1181,6 +1204,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) + bch2_keylist_push(&op->insert_keys); + + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; ++ op->flags |= BCH_WRITE_DONE; ++ + continue_at_nobarrier(cl, bch2_write_index, NULL); + return; + err: +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index e45dcf9635ae..c4c847306345 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -36,6 +36,7 @@ enum bch_write_flags { + /* Internal: */ + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), + BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), ++ BCH_WRITE_DONE = (1 << 12), + }; + + static inline u64 *op_journal_seq(struct bch_write_op *op) +-- +cgit v1.2.3 + + +From 6f7e561f65ac8314e6af30705588cdb5aa618304 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 13 May 2020 17:53:33 -0400 +Subject: bcachefs: Fix setquota + +We were returning -EINTR because we were failing to retry the btree +transaction. 
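+
+The shape of the fix is the usual transaction-retry idiom: run the
+transaction body in a loop and restart it whenever it reports that it had to
+drop locks, instead of handing that transient error back to the caller. A
+minimal sketch, with -EINTR standing in for the restart signal and all names
+invented:
+
+  #include <errno.h>
+  #include <stdio.h>
+
+  static int attempts;
+
+  /* Hypothetical transaction body: needs one restart before it succeeds. */
+  static int set_quota_trans(void)
+  {
+      return ++attempts < 2 ? -EINTR : 0;
+  }
+
+  static int trans_do(int (*fn)(void))
+  {
+      int ret;
+
+      do {
+          ret = fn();        /* rerun the whole body on restart */
+      } while (ret == -EINTR);
+
+      return ret;
+  }
+
+  int main(void)
+  {
+      int ret = trans_do(set_quota_trans);
+
+      printf("ret=%d after %d attempts\n", ret, attempts);
+      return 0;
+  }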
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/quota.c | 59 +++++++++++++++++++++++++++-------------------------- + 1 file changed, 30 insertions(+), 29 deletions(-) + +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index e7787c5063ce..d3032a46e7f3 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -710,25 +710,15 @@ found: + return ret; + } + +-static int bch2_set_quota(struct super_block *sb, struct kqid qid, +- struct qc_dqblk *qdq) ++static int bch2_set_quota_trans(struct btree_trans *trans, ++ struct bkey_i_quota *new_quota, ++ struct qc_dqblk *qdq) + { +- struct bch_fs *c = sb->s_fs_info; +- struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- struct bkey_i_quota new_quota; + int ret; + +- if (sb->s_flags & SB_RDONLY) +- return -EROFS; +- +- bkey_quota_init(&new_quota.k_i); +- new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); +- +- bch2_trans_init(&trans, c, 0, 0); +- +- iter = bch2_trans_get_iter(&trans, BTREE_ID_QUOTAS, new_quota.k.p, ++ iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + +@@ -736,32 +726,43 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, + if (unlikely(ret)) + return ret; + +- switch (k.k->type) { +- case KEY_TYPE_quota: +- new_quota.v = *bkey_s_c_to_quota(k).v; +- break; +- } ++ if (k.k->type == KEY_TYPE_quota) ++ new_quota->v = *bkey_s_c_to_quota(k).v; + + if (qdq->d_fieldmask & QC_SPC_SOFT) +- new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); ++ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); + if (qdq->d_fieldmask & QC_SPC_HARD) +- new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); ++ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); + + if (qdq->d_fieldmask & QC_INO_SOFT) +- new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); ++ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); + if (qdq->d_fieldmask & QC_INO_HARD) +- new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); ++ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); ++ ++ return bch2_trans_update(trans, iter, &new_quota->k_i, 0); ++} + +- bch2_trans_update(&trans, iter, &new_quota.k_i, 0); ++static int bch2_set_quota(struct super_block *sb, struct kqid qid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct btree_trans trans; ++ struct bkey_i_quota new_quota; ++ int ret; + +- ret = bch2_trans_commit(&trans, NULL, NULL, 0); ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; + +- bch2_trans_exit(&trans); ++ bkey_quota_init(&new_quota.k_i); ++ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); + +- if (ret) +- return ret; ++ bch2_trans_init(&trans, c, 0, 0); + +- ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, ++ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: ++ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); ++ ++ bch2_trans_exit(&trans); + + return ret; + } +-- +cgit v1.2.3 + + +From d45c032b49bb1a7193c66cd59d1a0d564dc549ad Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 14 May 2020 21:45:08 -0400 +Subject: bcachefs: Fix another iterator counting bug + +We were marking the end of where we could insert incorrectly for +indirect extents. 
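+
+Concretely, the end position may only advance by however much of the
+inserted extent remains, not by the full length of the indirect extent it
+points into. A small worked example of the clamp (all numbers made up):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }
+
+  int main(void)
+  {
+      uint64_t insert_start = 100;
+      uint64_t insert_size  = 8;    /* size of the extent being inserted */
+      uint64_t reflink_left = 32;   /* indirect extent extends much further */
+
+      /* Unclamped, the end runs 24 sectors past the insert: */
+      uint64_t end_wrong = insert_start + reflink_left;
+      /* Clamped to the insert's own size: */
+      uint64_t end_right = insert_start + min_u64(insert_size, reflink_left);
+
+      printf("unclamped end %llu, clamped end %llu\n",
+             (unsigned long long)end_wrong,
+             (unsigned long long)end_right);
+      return 0;
+  }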
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extent_update.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index d0af1bc17018..fd011df3cb99 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -76,7 +76,8 @@ static int count_iters_for_insert(struct btree_trans *trans, + + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); +- pos.offset += r_k.k->p.offset - idx; ++ pos.offset += min_t(u64, k.k->size, ++ r_k.k->p.offset - idx); + + *end = bpos_min(*end, pos); + ret = 1; +-- +cgit v1.2.3 + + +From cb376e0b28c4c40d7436b05297361cc709776cc6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 May 2020 17:23:40 -0400 +Subject: bcachefs: Wrap vmap() in memalloc_nofs_save()/restore() + +vmalloc() and vmap() don't take GFP_NOFS - this should be pushed further +up the IO path, but for now just doing the simple fix. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/compress.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 20bde73a17a8..6115e0294e4d 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -7,6 +7,7 @@ + #include "super-io.h" + + #include ++#include + #include + #include + +@@ -63,7 +64,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + struct bbuf ret; + struct bio_vec bv; + struct bvec_iter iter; +- unsigned nr_pages = 0; ++ unsigned nr_pages = 0, flags; + struct page *stack_pages[16]; + struct page **pages = NULL; + void *data; +@@ -103,7 +104,10 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + __bio_for_each_segment(bv, bio, iter, start) + pages[nr_pages++] = bv.bv_page; + ++ flags = memalloc_nofs_save(); + data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); ++ memalloc_nofs_restore(flags); ++ + if (pages != stack_pages) + kfree(pages); + +-- +cgit v1.2.3 + + +From 25871e11607f5c24f8b3fba73676e417b2f6b9c8 Mon Sep 17 00:00:00 2001 +From: Yuxuan Shui +Date: Fri, 22 May 2020 15:50:05 +0100 +Subject: bcachefs: fix stack corruption + +When a bkey_on_stack is passed to bch_read_indirect_extent, there is no +guarantee that it will be big enough to hold the bkey. And +bch_read_indirect_extent is not aware of bkey_on_stack to call realloc +on it. This cause a stack corruption. + +This commit makes bch_read_indirect_extent aware of bkey_on_stack so it +can call realloc when appropriate. 
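+
+The general pattern behind the fix: pass the callee a growable buffer object
+rather than a bare pointer, so it can reallocate when the data turns out to
+be bigger than the caller guessed. A self-contained userspace sketch (struct
+growbuf is an invented stand-in for bkey_on_stack):
+
+  #include <stdio.h>
+  #include <stdlib.h>
+  #include <string.h>
+
+  struct growbuf {
+      char   *data;
+      size_t  size;
+  };
+
+  static int growbuf_realloc(struct growbuf *buf, size_t want)
+  {
+      char *p;
+
+      if (want <= buf->size)
+          return 0;
+
+      p = realloc(buf->data, want);
+      if (!p)
+          return -1;
+
+      buf->data = p;
+      buf->size = want;
+      return 0;
+  }
+
+  /* The callee can grow the buffer itself instead of overrunning it: */
+  static int copy_value(struct growbuf *buf, const char *val, size_t len)
+  {
+      if (growbuf_realloc(buf, len))
+          return -1;
+      memcpy(buf->data, val, len);
+      return 0;
+  }
+
+  int main(void)
+  {
+      struct growbuf buf = { NULL, 0 };
+      const char big[] = "a value larger than the caller planned for";
+
+      if (!copy_value(&buf, big, sizeof(big)))
+          printf("%s\n", buf.data);
+
+      free(buf.data);
+      return 0;
+  }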
+ +Tested-by: Yuxuan Shui +Signed-off-by: Yuxuan Shui +--- + fs/bcachefs/fs-io.c | 2 +- + fs/bcachefs/fs.c | 2 +- + fs/bcachefs/io.c | 10 +++++----- + fs/bcachefs/io.h | 7 ++++--- + 4 files changed, 11 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 22e34806f777..9f7e011e4b35 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -871,7 +871,7 @@ retry: + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(trans, +- &offset_into_extent, sk.k); ++ &offset_into_extent, &sk); + if (ret) + break; + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index adae8a5bfa54..8b09fd55cbc3 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -917,7 +917,7 @@ retry: + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, +- &offset_into_extent, cur.k); ++ &offset_into_extent, &cur); + if (ret) + break; + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index de8d926b5f0d..2060a6a1bdea 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1640,7 +1640,7 @@ retry: + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, +- &offset_into_extent, sk.k); ++ &offset_into_extent, &sk); + if (ret) + break; + +@@ -1942,14 +1942,14 @@ static void bch2_read_endio(struct bio *bio) + + int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, +- struct bkey_i *orig_k) ++ struct bkey_on_stack *orig_k) + { + struct btree_iter *iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + +- reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + ++ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + + *offset_into_extent; + + iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, +@@ -1972,7 +1972,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + } + + *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); +- bkey_reassemble(orig_k, k); ++ bkey_on_stack_reassemble(orig_k, trans->c, k); + err: + bch2_trans_iter_put(trans, iter); + return ret; +@@ -2272,7 +2272,7 @@ retry: + k = bkey_i_to_s_c(sk.k); + + ret = bch2_read_indirect_extent(&trans, +- &offset_into_extent, sk.k); ++ &offset_into_extent, &sk); + if (ret) + goto err; + +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index c4c847306345..8814a8fb260f 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -3,6 +3,7 @@ + #define _BCACHEFS_IO_H + + #include "checksum.h" ++#include "bkey_on_stack.h" + #include "io_types.h" + + #define to_wbio(_bio) \ +@@ -110,13 +111,13 @@ struct cache_promote_op; + struct extent_ptr_decoded; + + int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, +- struct bkey_i *); ++ struct bkey_on_stack *); + + static inline int bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, +- struct bkey_i *k) ++ struct bkey_on_stack *k) + { +- return k->k.type == KEY_TYPE_reflink_p ++ return k->k->k.type == KEY_TYPE_reflink_p + ? 
__bch2_read_indirect_extent(trans, offset_into_extent, k) + : 0; + } +-- +cgit v1.2.3 + + +From 0bbf7576862ade2871fc032608044cefe89f6ca2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 23 May 2020 11:44:12 -0400 +Subject: bcachefs: Print out d_type in dirent_to_text() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index ae5c9fd8d9f7..f34bfda8ab0d 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -104,7 +104,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + + bch_scnmemcpy(out, d.v->d_name, + bch2_dirent_name_bytes(d)); +- pr_buf(out, " -> %llu", d.v->d_inum); ++ pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); + } + + static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, +-- +cgit v1.2.3 + + +From 18b6791eb825915e820a2cab5728179fe48f420a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 25 May 2020 18:47:21 -0400 +Subject: bcachefs: Add vmalloc fallback for decompress workspace + +--- + fs/bcachefs/compress.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 6115e0294e4d..920460a182b4 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -607,7 +607,7 @@ have_compressed: + } + + if (!mempool_initialized(&c->decompress_workspace)) { +- ret = mempool_init_kmalloc_pool( ++ ret = mempool_init_kvpmalloc_pool( + &c->decompress_workspace, + 1, decompress_workspace_size); + if (ret) +-- +cgit v1.2.3 + + +From ed46c35387c15557dc6cf02b427057ae79c5d514 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 25 May 2020 21:25:31 -0400 +Subject: bcachefs: Handle printing of null bkeys + +This fixes a null ptr deref. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 21 ++++++++++++++------- + 1 file changed, 14 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 55ef4032b37c..36e0c5152b47 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -176,13 +176,17 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) + + void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) + { +- pr_buf(out, "u64s %u type %s ", k->u64s, +- bch2_bkey_types[k->type]); ++ if (k) { ++ pr_buf(out, "u64s %u type %s ", k->u64s, ++ bch2_bkey_types[k->type]); + +- bch2_bpos_to_text(out, k->p); ++ bch2_bpos_to_text(out, k->p); + +- pr_buf(out, " snap %u len %u ver %llu", +- k->p.snapshot, k->size, k->version.lo); ++ pr_buf(out, " snap %u len %u ver %llu", ++ k->p.snapshot, k->size, k->version.lo); ++ } else { ++ pr_buf(out, "(null)"); ++ } + } + + void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, +@@ -198,8 +202,11 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) + { + bch2_bkey_to_text(out, k.k); +- pr_buf(out, ": "); +- bch2_val_to_text(out, c, k); ++ ++ if (k.k) { ++ pr_buf(out, ": "); ++ bch2_val_to_text(out, c, k); ++ } + } + + void bch2_bkey_swab_val(struct bkey_s k) +-- +cgit v1.2.3 + + +From 8e437d0c53c45ec65f9bad8d4998ec88807e2847 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 24 May 2020 13:37:44 -0400 +Subject: bcachefs: Be more rigorous about marking the filesystem clean + +Previously, there was at least one error path where we could mark the +filesystem clean when we hadn't sucessfully written out alloc info. 
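+
+The invariant being enforced is easy to state: "clean" may only be written
+if the alloc info write actually completed, and every error path must leave
+the flag clear. A toy model of that gating (the flag and function names here
+are illustrative, only the idea of BCH_FS_ALLOC_CLEAN comes from the patch):
+
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  static bool alloc_clean;
+
+  static int write_alloc_info(bool simulate_error)
+  {
+      if (simulate_error)
+          return -1;            /* error path: the flag stays clear */
+
+      alloc_clean = true;       /* only set after a successful write */
+      return 0;
+  }
+
+  static void maybe_mark_clean(void)
+  {
+      if (alloc_clean)
+          printf("marking filesystem clean\n");
+      else
+          printf("not clean: alloc info was never written\n");
+  }
+
+  int main(void)
+  {
+      (void)write_alloc_info(true);
+      maybe_mark_clean();
+
+      (void)write_alloc_info(false);
+      maybe_mark_clean();
+      return 0;
+  }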
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/super.c | 15 ++++++++++++--- + 2 files changed, 13 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 43161028333b..2608d6515d4a 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -479,6 +479,7 @@ struct bch_dev { + enum { + /* startup: */ + BCH_FS_ALLOC_READ_DONE, ++ BCH_FS_ALLOC_CLEAN, + BCH_FS_ALLOCATOR_STARTED, + BCH_FS_ALLOCATOR_RUNNING, + BCH_FS_INITIAL_GC_DONE, +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 4560398f8c27..e5c6cb80d7d6 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -185,8 +185,12 @@ static void __bch2_fs_read_only(struct bch_fs *c) + */ + bch2_journal_flush_all_pins(&c->journal); + ++ /* ++ * If the allocator threads didn't all start up, the btree updates to ++ * write out alloc info aren't going to work: ++ */ + if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) +- goto allocator_not_running; ++ goto nowrote_alloc; + + do { + wrote = false; +@@ -198,7 +202,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) + bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); + + if (ret) +- break; ++ goto nowrote_alloc; + + for_each_member_device(ca, c, i) + bch2_dev_allocator_quiesce(c, ca); +@@ -217,7 +221,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) + + clean_passes = wrote ? 0 : clean_passes + 1; + } while (clean_passes < 2); +-allocator_not_running: ++ ++ set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); ++nowrote_alloc: + for_each_member_device(ca, c, i) + bch2_dev_allocator_stop(ca); + +@@ -299,6 +305,7 @@ void bch2_fs_read_only(struct bch_fs *c) + !test_bit(BCH_FS_ERROR, &c->flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && + test_bit(BCH_FS_STARTED, &c->flags) && ++ test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && + !c->opts.norecovery) + bch2_fs_mark_clean(c); + +@@ -387,6 +394,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + if (ret) + goto err; + ++ clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); ++ + for_each_rw_member(ca, c, i) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); +-- +cgit v1.2.3 + + +From 8061a5692c1a18d11f260d5040bde528fc55f759 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 24 May 2020 14:20:00 -0400 +Subject: bcachefs: Better error messages on bucket sector count overflows + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 4 +++- + fs/bcachefs/buckets.c | 39 +++++++++++++++++++++++---------------- + 2 files changed, 26 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 769f3bc8faf9..0cc47783cb4e 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -579,8 +579,10 @@ static int bch2_gc_done(struct bch_fs *c, + #define copy_bucket_field(_f) \ + if (dst->b[b].mark._f != src->b[b].mark._f) { \ + if (verify) \ +- fsck_err(c, "dev %u bucket %zu has wrong " #_f \ ++ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", i, b, \ ++ dst->b[b].mark.gen, \ ++ bch2_data_types[dst->b[b].mark.data_type],\ + dst->b[b].mark._f, src->b[b].mark._f); \ + dst->b[b]._mark._f = src->b[b].mark._f; \ + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 2e1df04c760d..2fe33d744d33 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -778,29 +778,31 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + }) + + static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct 
bch_dev *ca, +- size_t b, enum bch_data_type type, ++ size_t b, enum bch_data_type data_type, + unsigned sectors, bool gc) + { + struct bucket *g = __bucket(ca, b, gc); + struct bucket_mark old, new; + bool overflow; + +- BUG_ON(type != BCH_DATA_SB && +- type != BCH_DATA_JOURNAL); ++ BUG_ON(data_type != BCH_DATA_SB && ++ data_type != BCH_DATA_JOURNAL); + + old = bucket_cmpxchg(g, new, ({ +- new.data_type = type; ++ new.data_type = data_type; + overflow = checked_add(new.dirty_sectors, sectors); + })); + + bch2_fs_inconsistent_on(old.data_type && +- old.data_type != type, c, ++ old.data_type != data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[old.data_type], +- bch2_data_types[type]); ++ bch2_data_types[data_type]); + + bch2_fs_inconsistent_on(overflow, c, +- "bucket sector count overflow: %u + %u > U16_MAX", ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", ++ ca->dev_idx, b, new.gen, ++ bch2_data_types[old.data_type ?: data_type], + old.dirty_sectors, sectors); + + if (c) +@@ -926,6 +928,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, + struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); ++ u16 *dst_sectors, orig_sectors; + bool overflow; + u64 v; + +@@ -953,10 +956,12 @@ static bool bch2_mark_pointer(struct bch_fs *c, + return true; + } + +- if (!p.ptr.cached) +- overflow = checked_add(new.dirty_sectors, sectors); +- else +- overflow = checked_add(new.cached_sectors, sectors); ++ dst_sectors = !p.ptr.cached ++ ? &new.dirty_sectors ++ : &new.cached_sectors; ++ orig_sectors = *dst_sectors; ++ ++ overflow = checked_add(*dst_sectors, sectors); + + if (!new.dirty_sectors && + !new.cached_sectors) { +@@ -987,10 +992,10 @@ static bool bch2_mark_pointer(struct bch_fs *c, + bch2_data_types[data_type]); + + bch2_fs_inconsistent_on(overflow, c, +- "bucket sector count overflow: %u + %lli > U16_MAX", +- !p.ptr.cached +- ? 
old.dirty_sectors +- : old.cached_sectors, sectors); ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), new.gen, ++ bch2_data_types[old.data_type ?: data_type], ++ orig_sectors, sectors); + + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + +@@ -1504,7 +1509,9 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + + if (checked_add(*dst_sectors, sectors)) { + bch2_fs_inconsistent(c, +- "bucket sector count overflow: %u + %lli > U16_MAX", ++ "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %lli > U16_MAX", ++ iter->pos.inode, iter->pos.offset, u.gen, ++ bch2_data_types[u.data_type ?: data_type], + orig_sectors, sectors); + /* return an error indicating that we need full fsck */ + ret = -EIO; +-- +cgit v1.2.3 + + +From 08fc76b1b0157d72f93f391ff088c71ef52c1f36 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 27 May 2020 14:10:27 -0400 +Subject: bcachefs: fix memalloc_nofs_restore() usage + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index bc59f0363a21..4868137ecc88 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -553,7 +553,6 @@ out_unlock: + + list_del_init(&b->list); + mutex_unlock(&bc->lock); +- memalloc_nofs_restore(flags); + out: + b->flags = 0; + b->written = 0; +@@ -566,6 +565,7 @@ out: + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + start_time); + ++ memalloc_nofs_restore(flags); + return b; + err: + /* Try to cannibalize another cached btree node: */ +@@ -581,6 +581,7 @@ err: + } + + mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); + return ERR_PTR(-ENOMEM); + } + +-- +cgit v1.2.3 + + +From f159ba22a5d900b48280a06e6e54a355ca225d72 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 24 May 2020 14:06:10 -0400 +Subject: bcachefs: Fix reading of alloc info after unclean shutdown + +When updates to interior nodes started being journalled, that meant that +after an unclean shutdown, until journal replay is done we can't walk +the btree without overlaying the updates from the journal. + +The initial btree gc was changed to walk the btree overlaying keys from +the journal - but bch2_alloc_read() and bch2_stripes_read() were missed. +Major whoops... 
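+
+"Walking the btree overlaying keys from the journal" is, at its core, a
+merge of two sorted key sets where the journal version wins on a tie. A
+simplified userspace model with flat arrays standing in for the btree and
+the journal (all names invented):
+
+  #include <stdio.h>
+
+  struct kv { int key; int val; };
+
+  static void walk_overlaid(const struct kv *base, int nb,
+                            const struct kv *overlay, int no)
+  {
+      int i = 0, j = 0;
+
+      while (i < nb || j < no) {
+          if (j >= no || (i < nb && base[i].key < overlay[j].key)) {
+              printf("key %d = %d (on disk)\n", base[i].key, base[i].val);
+              i++;
+          } else {
+              /* A journalled update shadows the on-disk version: */
+              if (i < nb && base[i].key == overlay[j].key)
+                  i++;
+              printf("key %d = %d (journal)\n",
+                     overlay[j].key, overlay[j].val);
+              j++;
+          }
+      }
+  }
+
+  int main(void)
+  {
+      struct kv base[]    = { { 1, 10 }, { 2, 20 }, { 4, 40 } };
+      struct kv overlay[] = { { 2, 21 }, { 3, 30 } };
+
+      walk_overlaid(base, 3, overlay, 2);
+      return 0;
+  }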
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 30 ++++++++---------- + fs/bcachefs/ec.c | 47 ++++++++++----------------- + fs/bcachefs/ec.h | 2 -- + fs/bcachefs/recovery.c | 72 ++++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/recovery.h | 7 ++++ + fs/bcachefs/super.c | 7 +++- + 6 files changed, 114 insertions(+), 51 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 98b87994ca64..4eba6897f02e 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -208,29 +208,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + get_alloc_field(a.v, &d, i)); + } + +-int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) ++static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k) + { +- struct btree_trans trans; +- struct btree_and_journal_iter iter; +- struct bkey_s_c k; +- struct bch_dev *ca; +- unsigned i; +- int ret = 0; +- +- bch2_trans_init(&trans, c, 0, 0); +- +- bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, +- BTREE_ID_ALLOC, POS_MIN); +- +- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ if (!level) + bch2_mark_key(c, k, 0, 0, NULL, 0, + BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); + +- bch2_btree_and_journal_iter_advance(&iter); +- } ++ return 0; ++} ++ ++int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; + +- ret = bch2_trans_exit(&trans) ?: ret; ++ ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, ++ NULL, bch2_alloc_read_fn); + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); + return ret; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 4da1cdbc55db..50a214d2b122 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1273,38 +1273,28 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) + return ret; + } + +-int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) ++static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k) + { +- struct btree_trans trans; +- struct btree_and_journal_iter iter; +- struct bkey_s_c k; +- int ret; +- +- ret = bch2_fs_ec_start(c); +- if (ret) +- return ret; +- +- bch2_trans_init(&trans, c, 0, 0); +- +- bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, +- BTREE_ID_EC, POS_MIN); +- ++ int ret = 0; + +- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { +- bch2_mark_key(c, k, 0, 0, NULL, 0, +- BTREE_TRIGGER_ALLOC_READ| +- BTREE_TRIGGER_NOATOMIC); ++ if (k.k->type == KEY_TYPE_stripe) ++ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: ++ bch2_mark_key(c, k, 0, 0, NULL, 0, ++ BTREE_TRIGGER_ALLOC_READ| ++ BTREE_TRIGGER_NOATOMIC); + +- bch2_btree_and_journal_iter_advance(&iter); +- } ++ return ret; ++} + +- ret = bch2_trans_exit(&trans) ?: ret; +- if (ret) { ++int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) ++{ ++ int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, ++ NULL, bch2_stripes_read_fn); ++ if (ret) + bch_err(c, "error reading stripes: %i", ret); +- return ret; +- } + +- return 0; ++ return ret; + } + + int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) +@@ -1343,11 +1333,6 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) + return 0; + } + +-int bch2_fs_ec_start(struct bch_fs *c) +-{ +- return bch2_ec_mem_alloc(c, false); +-} +- + void 
bch2_fs_ec_exit(struct bch_fs *c) + { + struct ec_stripe_head *h; +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index cf67abd48490..4dfaac034886 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -157,8 +157,6 @@ int bch2_stripes_write(struct bch_fs *, unsigned, bool *); + + int bch2_ec_mem_alloc(struct bch_fs *, bool); + +-int bch2_fs_ec_start(struct bch_fs *); +- + void bch2_fs_ec_exit(struct bch_fs *); + int bch2_fs_ec_init(struct bch_fs *); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index a4d0eec2ea3e..b4ccecdd3f77 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -191,6 +191,78 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i + b->btree_id, b->level, b->data->min_key); + } + ++/* Walk btree, overlaying keys from the journal: */ ++ ++static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, ++ struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ btree_walk_node_fn node_fn, ++ btree_walk_key_fn key_fn) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ ret = key_fn(c, btree_id, b->level, k); ++ if (ret) ++ break; ++ ++ if (b->level) { ++ struct btree *child; ++ BKEY_PADDED(k) tmp; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ if (b->level > 0) { ++ child = bch2_btree_node_get_noiter(c, &tmp.k, ++ b->btree_id, b->level - 1); ++ ret = PTR_ERR_OR_ZERO(child); ++ if (ret) ++ break; ++ ++ ret = (node_fn ? node_fn(c, b) : 0) ?: ++ bch2_btree_and_journal_walk_recurse(c, child, ++ journal_keys, btree_id, node_fn, key_fn); ++ six_unlock_read(&child->lock); ++ ++ if (ret) ++ break; ++ } ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ } ++ ++ return ret; ++} ++ ++int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ btree_walk_node_fn node_fn, ++ btree_walk_key_fn key_fn) ++{ ++ struct btree *b = c->btree_roots[btree_id].b; ++ int ret = 0; ++ ++ if (btree_node_fake(b)) ++ return 0; ++ ++ six_lock_read(&b->lock, NULL, NULL); ++ ret = (node_fn ? 
node_fn(c, b) : 0) ?: ++ bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, ++ node_fn, key_fn) ?: ++ key_fn(c, btree_id, b->level + 1, bkey_i_to_s_c(&b->key)); ++ six_unlock_read(&b->lock); ++ ++ return ret; ++} ++ + /* sort and dedup all keys in the journal: */ + + void bch2_journal_entries_free(struct list_head *list) +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index 19f2f172a26b..a66827c9addf 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -44,6 +44,13 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct journal_keys *, + struct btree *); + ++typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); ++typedef int (*btree_walk_key_fn)(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k); ++ ++int bch2_btree_and_journal_walk(struct bch_fs *, struct journal_keys *, enum btree_id, ++ btree_walk_node_fn, btree_walk_key_fn); ++ + void bch2_journal_keys_free(struct journal_keys *); + void bch2_journal_entries_free(struct list_head *); + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index e5c6cb80d7d6..5d37f3035aec 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -192,6 +192,8 @@ static void __bch2_fs_read_only(struct bch_fs *c) + if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) + goto nowrote_alloc; + ++ bch_verbose(c, "writing alloc info"); ++ + do { + wrote = false; + +@@ -222,6 +224,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) + clean_passes = wrote ? 0 : clean_passes + 1; + } while (clean_passes < 2); + ++ bch_verbose(c, "writing alloc info complete"); + set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); + nowrote_alloc: + for_each_member_device(ca, c, i) +@@ -306,8 +309,10 @@ void bch2_fs_read_only(struct bch_fs *c) + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && + test_bit(BCH_FS_STARTED, &c->flags) && + test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && +- !c->opts.norecovery) ++ !c->opts.norecovery) { ++ bch_verbose(c, "marking filesystem clean"); + bch2_fs_mark_clean(c); ++ } + + clear_bit(BCH_FS_RW, &c->flags); + } +-- +cgit v1.2.3 + + +From 9b20160c836b697436d55470b4a77d3c8828a9ce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 25 May 2020 19:29:48 -0400 +Subject: bcachefs: Add a mechanism for passing extra journal entries to + bch2_trans_commit() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 3 +++ + fs/bcachefs/btree_types.h | 3 +++ + fs/bcachefs/btree_update_leaf.c | 12 +++++++++++- + fs/bcachefs/journal.h | 11 ++++++++--- + 4 files changed, 25 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5528ba0f1d44..1cded0540af5 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2153,6 +2153,9 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + trans->nr_updates2 = 0; + trans->mem_top = 0; + ++ trans->extra_journal_entries = NULL; ++ trans->extra_journal_entry_u64s = 0; ++ + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset(&trans->fs_usage_deltas->memset_start, 0, +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 732cdc35aa7c..f957dd2cbbef 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -305,6 +305,9 @@ struct btree_trans { + struct btree_insert_entry *updates2; + + /* update path: */ ++ struct jset_entry *extra_journal_entries; ++ unsigned extra_journal_entry_u64s; ++ + struct journal_res journal_res; + struct journal_preres 
journal_preres; + u64 *journal_seq; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 7faf98fd2f64..6e402027c63f 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -413,6 +413,16 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + goto err; + } + ++ if (unlikely(trans->extra_journal_entry_u64s)) { ++ memcpy_u64s_small(bch2_journal_reservation_entry(&c->journal, ++ &trans->journal_res), ++ trans->extra_journal_entries, ++ trans->extra_journal_entry_u64s); ++ ++ trans->journal_res.offset += trans->extra_journal_entry_u64s; ++ trans->journal_res.u64s -= trans->extra_journal_entry_u64s; ++ } ++ + /* + * Not allowed to fail after we've gotten our journal reservation - we + * have to use it: +@@ -800,7 +810,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + +- trans->journal_u64s = 0; ++ trans->journal_u64s = trans->extra_journal_entry_u64s; + trans->journal_preres_u64s = 0; + + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 78f5fac06bf5..e4b7fe8ffa82 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -199,13 +199,18 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) + return entry; + } + ++static inline struct jset_entry * ++bch2_journal_reservation_entry(struct journal *j, struct journal_res *res) ++{ ++ return vstruct_idx(j->buf[res->idx].data, res->offset); ++} ++ + static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, + unsigned type, enum btree_id id, + unsigned level, + const void *data, unsigned u64s) + { +- struct journal_buf *buf = &j->buf[res->idx]; +- struct jset_entry *entry = vstruct_idx(buf->data, res->offset); ++ struct jset_entry *entry = bch2_journal_reservation_entry(j, res); + unsigned actual = jset_u64s(u64s); + + EBUG_ON(!res->ref); +@@ -219,7 +224,7 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res + entry->type = type; + entry->btree_id = id; + entry->level = level; +- memcpy_u64s(entry->_data, data, u64s); ++ memcpy_u64s_small(entry->_data, data, u64s); + } + + static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, +-- +cgit v1.2.3 + + +From 894762b1500b8cdae9cf6959332e75c15939689a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 25 May 2020 20:35:53 -0400 +Subject: bcachefs: Factor out bch2_fs_btree_interior_update_init() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 19 +++++++++++++++++++ + fs/bcachefs/btree_update_interior.h | 3 +++ + fs/bcachefs/super.c | 13 ++----------- + 3 files changed, 24 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index deb67c5e0ba4..86838826adc8 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -2246,3 +2246,22 @@ size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) + + return ret; + } ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *c) ++{ ++ mempool_exit(&c->btree_interior_update_pool); ++ mempool_exit(&c->btree_reserve_pool); ++} ++ ++int bch2_fs_btree_interior_update_init(struct bch_fs *c) ++{ ++ mutex_init(&c->btree_reserve_cache_lock); ++ INIT_LIST_HEAD(&c->btree_interior_update_list); ++ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); ++ 
mutex_init(&c->btree_interior_update_lock); ++ ++ return mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, ++ sizeof(struct btree_reserve)) ?: ++ mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, ++ sizeof(struct btree_update)); ++} +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 2fddf5d31eb9..739a5ac536b8 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -333,4 +333,7 @@ ssize_t bch2_btree_updates_print(struct bch_fs *, char *); + + size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); + ++void bch2_fs_btree_interior_update_exit(struct bch_fs *); ++int bch2_fs_btree_interior_update_init(struct bch_fs *); ++ + #endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 5d37f3035aec..2fbed2a6d8bb 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -469,6 +469,7 @@ static void bch2_fs_free(struct bch_fs *c) + bch2_fs_ec_exit(c); + bch2_fs_encryption_exit(c); + bch2_fs_io_exit(c); ++ bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_iter_exit(c); + bch2_fs_btree_cache_exit(c); + bch2_fs_journal_exit(&c->journal); +@@ -486,8 +487,6 @@ static void bch2_fs_free(struct bch_fs *c) + mempool_exit(&c->large_bkey_pool); + mempool_exit(&c->btree_bounce_pool); + bioset_exit(&c->btree_bio); +- mempool_exit(&c->btree_interior_update_pool); +- mempool_exit(&c->btree_reserve_pool); + mempool_exit(&c->fill_iter); + percpu_ref_exit(&c->writes); + kfree(c->replicas.entries); +@@ -649,11 +648,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + INIT_LIST_HEAD(&c->list); + +- INIT_LIST_HEAD(&c->btree_interior_update_list); +- INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); +- mutex_init(&c->btree_reserve_cache_lock); +- mutex_init(&c->btree_interior_update_lock); +- + mutex_init(&c->usage_scratch_lock); + + mutex_init(&c->bio_bounce_pages_lock); +@@ -726,10 +720,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || + percpu_ref_init(&c->writes, bch2_writes_disabled, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || +- mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, +- sizeof(struct btree_reserve)) || +- mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, +- sizeof(struct btree_update)) || + mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || + bioset_init(&c->btree_bio, 1, + max(offsetof(struct btree_read_bio, bio), +@@ -745,6 +735,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_fs_replicas_init(c) || + bch2_fs_btree_cache_init(c) || + bch2_fs_btree_iter_init(c) || ++ bch2_fs_btree_interior_update_init(c) || + bch2_fs_io_init(c) || + bch2_fs_encryption_init(c) || + bch2_fs_compress_init(c) || +-- +cgit v1.2.3 + + +From 3ee993e3f0a31c50c3bcca7bc1df2ff91a9331cc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 25 May 2020 14:57:06 -0400 +Subject: bcachefs: Interior btree updates are now fully transactional + +We now update the alloc info (bucket sector counts) atomically with +journalling the update to the interior btree nodes, and we also set new +btree roots atomically with the journalled part of the btree update. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 5 - + fs/bcachefs/bcachefs.h | 6 +- + fs/bcachefs/btree_gc.c | 12 +- + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update_interior.c | 816 +++++++++++++----------------------- + fs/bcachefs/btree_update_interior.h | 64 ++- + fs/bcachefs/btree_update_leaf.c | 7 +- + fs/bcachefs/buckets.c | 2 +- + fs/bcachefs/buckets.h | 2 - + fs/bcachefs/journal.c | 5 +- + fs/bcachefs/journal.h | 27 +- + fs/bcachefs/journal_io.c | 20 +- + fs/bcachefs/journal_reclaim.c | 2 +- + fs/bcachefs/journal_reclaim.h | 2 +- + fs/bcachefs/keylist.c | 4 +- + fs/bcachefs/keylist.h | 4 +- + fs/bcachefs/migrate.c | 11 +- + fs/bcachefs/move.c | 10 +- + fs/bcachefs/recovery.c | 7 +- + fs/bcachefs/super-io.c | 22 +- + fs/bcachefs/super.c | 5 + + 21 files changed, 410 insertions(+), 624 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 4eba6897f02e..e57b23ba1844 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1461,11 +1461,6 @@ again: + } + rcu_read_unlock(); + +- if (c->btree_roots_dirty) { +- bch2_journal_meta(&c->journal); +- goto again; +- } +- + return !nodes_unwritten && + !bch2_btree_interior_updates_nr_pending(c); + } +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 2608d6515d4a..9bcdf2658f95 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -603,13 +603,10 @@ struct bch_fs { + struct bio_set btree_bio; + + struct btree_root btree_roots[BTREE_ID_NR]; +- bool btree_roots_dirty; + struct mutex btree_root_lock; + + struct btree_cache btree_cache; + +- mempool_t btree_reserve_pool; +- + /* + * Cache of allocated btree nodes - if we allocate a btree node and + * don't use it, if we free it that space can't be reused until going +@@ -627,6 +624,9 @@ struct bch_fs { + struct mutex btree_interior_update_lock; + struct closure_waitlist btree_interior_update_wait; + ++ struct workqueue_struct *btree_interior_update_worker; ++ struct work_struct btree_interior_update_work; ++ + mempool_t btree_iters_pool; + + struct workqueue_struct *wq; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 0cc47783cb4e..c62fa3583b73 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -464,6 +464,7 @@ static void bch2_mark_superblocks(struct bch_fs *c) + mutex_unlock(&c->sb_lock); + } + ++#if 0 + /* Also see bch2_pending_btree_node_free_insert_done() */ + static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) + { +@@ -481,6 +482,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) + + mutex_unlock(&c->btree_interior_update_lock); + } ++#endif + + static void bch2_mark_allocator_buckets(struct bch_fs *c) + { +@@ -799,6 +801,10 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, + trace_gc_start(c); + + down_write(&c->gc_lock); ++ ++ /* flush interior btree updates: */ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); + again: + ret = bch2_gc_start(c, metadata_only); + if (ret) +@@ -810,7 +816,9 @@ again: + if (ret) + goto out; + ++#if 0 + bch2_mark_pending_btree_node_frees(c); ++#endif + bch2_mark_allocator_buckets(c); + + c->gc_count++; +@@ -1035,6 +1043,8 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + btree_node_reset_sib_u64s(n); + + bch2_btree_build_aux_trees(n); ++ ++ bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->lock); + + bch2_btree_node_write(c, n, 
SIX_LOCK_intent); +@@ -1083,7 +1093,7 @@ next: + bch2_btree_iter_node_replace(iter, new_nodes[0]); + + for (i = 0; i < nr_new_nodes; i++) +- bch2_open_buckets_put(c, &new_nodes[i]->ob); ++ bch2_btree_update_get_open_buckets(as, new_nodes[i]); + + /* Free the old nodes and update our sliding window */ + for (i = 0; i < nr_old_nodes; i++) { +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index f957dd2cbbef..8357b5251a43 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -307,6 +307,7 @@ struct btree_trans { + /* update path: */ + struct jset_entry *extra_journal_entries; + unsigned extra_journal_entry_u64s; ++ struct journal_entry_pin *journal_pin; + + struct journal_res journal_res; + struct journal_preres journal_preres; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 86838826adc8..c027c8106c81 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -21,10 +21,6 @@ + #include + #include + +-static void btree_node_will_make_reachable(struct btree_update *, +- struct btree *); +-static void btree_update_drop_new_node(struct bch_fs *, struct btree *); +- + /* Debug code: */ + + /* +@@ -124,74 +120,6 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, + + /* Btree node freeing/allocation: */ + +-static bool btree_key_matches(struct bch_fs *c, +- struct bkey_s_c l, +- struct bkey_s_c r) +-{ +- struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l); +- struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r); +- const struct bch_extent_ptr *ptr1, *ptr2; +- +- bkey_for_each_ptr(ptrs1, ptr1) +- bkey_for_each_ptr(ptrs2, ptr2) +- if (ptr1->dev == ptr2->dev && +- ptr1->gen == ptr2->gen && +- ptr1->offset == ptr2->offset) +- return true; +- +- return false; +-} +- +-/* +- * We're doing the index update that makes @b unreachable, update stuff to +- * reflect that: +- * +- * Must be called _before_ btree_update_updated_root() or +- * btree_update_updated_node: +- */ +-static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, +- struct bkey_s_c k, +- struct bch_fs_usage *stats) +-{ +- struct bch_fs *c = as->c; +- struct pending_btree_node_free *d; +- +- for (d = as->pending; d < as->pending + as->nr_pending; d++) +- if (!bkey_cmp(k.k->p, d->key.k.p) && +- btree_key_matches(c, k, bkey_i_to_s_c(&d->key))) +- goto found; +- BUG(); +-found: +- BUG_ON(d->index_update_done); +- d->index_update_done = true; +- +- /* +- * We're dropping @k from the btree, but it's still live until the +- * index update is persistent so we need to keep a reference around for +- * mark and sweep to find - that's primarily what the +- * btree_node_pending_free list is for. +- * +- * So here (when we set index_update_done = true), we're moving an +- * existing reference to a different part of the larger "gc keyspace" - +- * and the new position comes after the old position, since GC marks +- * the pending free list after it walks the btree. +- * +- * If we move the reference while mark and sweep is _between_ the old +- * and the new position, mark and sweep will see the reference twice +- * and it'll get double accounted - so check for that here and subtract +- * to cancel out one of mark and sweep's markings if necessary: +- */ +- +- if (gc_pos_cmp(c->gc_pos, b +- ? 
gc_pos_btree_node(b) +- : gc_pos_btree_root(as->btree_id)) >= 0 && +- gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) +- bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), +- 0, 0, NULL, 0, +- BTREE_TRIGGER_OVERWRITE| +- BTREE_TRIGGER_GC); +-} +- + static void __btree_node_free(struct bch_fs *c, struct btree *b) + { + trace_btree_node_free(c, b); +@@ -216,8 +144,6 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) + { + struct open_buckets ob = b->ob; + +- btree_update_drop_new_node(c, b); +- + b->ob.nr = 0; + + clear_btree_node_dirty(b); +@@ -237,39 +163,12 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, + trans_for_each_iter(iter->trans, linked) + BUG_ON(linked->l[b->level].b == b); + +- /* +- * Is this a node that isn't reachable on disk yet? +- * +- * Nodes that aren't reachable yet have writes blocked until they're +- * reachable - now that we've cancelled any pending writes and moved +- * things waiting on that write to wait on this update, we can drop this +- * node from the list of nodes that the other update is making +- * reachable, prior to freeing it: +- */ +- btree_update_drop_new_node(c, b); +- + six_lock_write(&b->lock, NULL, NULL); + __btree_node_free(c, b); + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); + } + +-static void bch2_btree_node_free_ondisk(struct bch_fs *c, +- struct pending_btree_node_free *pending, +- u64 journal_seq) +-{ +- BUG_ON(!pending->index_update_done); +- +- bch2_mark_key(c, bkey_i_to_s_c(&pending->key), +- 0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE); +- +- if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) +- bch2_mark_key(c, bkey_i_to_s_c(&pending->key), +- 0, 0, NULL, journal_seq, +- BTREE_TRIGGER_OVERWRITE| +- BTREE_TRIGGER_GC); +-} +- + static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + struct disk_reservation *res, + struct closure *cl, +@@ -357,9 +256,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + int ret; + + BUG_ON(level >= BTREE_MAX_DEPTH); +- BUG_ON(!as->reserve->nr); ++ BUG_ON(!as->nr_prealloc_nodes); + +- b = as->reserve->b[--as->reserve->nr]; ++ b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + + set_btree_node_accessed(b); + set_btree_node_dirty(b); +@@ -394,8 +293,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + + bch2_btree_build_aux_trees(b); + +- btree_node_will_make_reachable(as, b); +- + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); + BUG_ON(ret); + +@@ -466,19 +363,20 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) + btree_node_set_format(b, b->data->format); + bch2_btree_build_aux_trees(b); + ++ bch2_btree_update_add_new_node(as, b); + six_unlock_write(&b->lock); + + return b; + } + +-static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve) ++static void bch2_btree_reserve_put(struct btree_update *as) + { +- bch2_disk_reservation_put(c, &reserve->disk_res); ++ struct bch_fs *c = as->c; + + mutex_lock(&c->btree_reserve_cache_lock); + +- while (reserve->nr) { +- struct btree *b = reserve->b[--reserve->nr]; ++ while (as->nr_prealloc_nodes) { ++ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + + six_unlock_write(&b->lock); + +@@ -502,36 +400,14 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser + } + + mutex_unlock(&c->btree_reserve_cache_lock); +- +- mempool_free(reserve, &c->btree_reserve_pool); + } + +-static 
struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, +- unsigned nr_nodes, +- unsigned flags, +- struct closure *cl) ++static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, ++ unsigned flags, struct closure *cl) + { +- struct btree_reserve *reserve; ++ struct bch_fs *c = as->c; + struct btree *b; +- struct disk_reservation disk_res = { 0, 0 }; +- unsigned sectors = nr_nodes * c->opts.btree_node_size; +- int ret, disk_res_flags = 0; +- +- if (flags & BTREE_INSERT_NOFAIL) +- disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; +- +- /* +- * This check isn't necessary for correctness - it's just to potentially +- * prevent us from doing a lot of work that'll end up being wasted: +- */ +- ret = bch2_journal_error(&c->journal); +- if (ret) +- return ERR_PTR(ret); +- +- if (bch2_disk_reservation_get(c, &disk_res, sectors, +- c->opts.metadata_replicas, +- disk_res_flags)) +- return ERR_PTR(-ENOSPC); ++ int ret; + + BUG_ON(nr_nodes > BTREE_RESERVE_MAX); + +@@ -540,18 +416,11 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, + * open bucket reserve: + */ + ret = bch2_btree_cache_cannibalize_lock(c, cl); +- if (ret) { +- bch2_disk_reservation_put(c, &disk_res); +- return ERR_PTR(ret); +- } +- +- reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO); +- +- reserve->disk_res = disk_res; +- reserve->nr = 0; ++ if (ret) ++ return ret; + +- while (reserve->nr < nr_nodes) { +- b = __bch2_btree_node_alloc(c, &disk_res, ++ while (as->nr_prealloc_nodes < nr_nodes) { ++ b = __bch2_btree_node_alloc(c, &as->disk_res, + flags & BTREE_INSERT_NOWAIT + ? NULL : cl, flags); + if (IS_ERR(b)) { +@@ -563,21 +432,20 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, + if (ret) + goto err_free; + +- reserve->b[reserve->nr++] = b; ++ as->prealloc_nodes[as->nr_prealloc_nodes++] = b; + } + + bch2_btree_cache_cannibalize_unlock(c); +- return reserve; ++ return 0; + err_free: +- bch2_btree_reserve_put(c, reserve); + bch2_btree_cache_cannibalize_unlock(c); + trace_btree_reserve_get_fail(c, nr_nodes, cl); +- return ERR_PTR(ret); ++ return ret; + } + + /* Asynchronous interior node update machinery */ + +-static void __bch2_btree_update_free(struct btree_update *as) ++static void bch2_btree_update_free(struct btree_update *as) + { + struct bch_fs *c = as->c; + +@@ -585,14 +453,13 @@ static void __bch2_btree_update_free(struct btree_update *as) + + bch2_journal_pin_drop(&c->journal, &as->journal); + bch2_journal_pin_flush(&c->journal, &as->journal); ++ bch2_disk_reservation_put(c, &as->disk_res); ++ bch2_btree_reserve_put(as); + +- BUG_ON(as->nr_new_nodes || as->nr_pending); +- +- if (as->reserve) +- bch2_btree_reserve_put(c, as->reserve); +- ++ mutex_lock(&c->btree_interior_update_lock); + list_del(&as->unwritten_list); + list_del(&as->list); ++ mutex_unlock(&c->btree_interior_update_lock); + + closure_debug_destroy(&as->cl); + mempool_free(as, &c->btree_interior_update_pool); +@@ -600,37 +467,59 @@ static void __bch2_btree_update_free(struct btree_update *as) + closure_wake_up(&c->btree_interior_update_wait); + } + +-static void bch2_btree_update_free(struct btree_update *as) ++static void btree_update_will_delete_key(struct btree_update *as, ++ struct bkey_i *k) + { +- struct bch_fs *c = as->c; ++ BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > ++ ARRAY_SIZE(as->_old_keys)); ++ bch2_keylist_add(&as->old_keys, k); ++} + +- mutex_lock(&c->btree_interior_update_lock); +- __bch2_btree_update_free(as); +- mutex_unlock(&c->btree_interior_update_lock); 
++static void btree_update_will_add_key(struct btree_update *as, ++ struct bkey_i *k) ++{ ++ BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > ++ ARRAY_SIZE(as->_new_keys)); ++ bch2_keylist_add(&as->new_keys, k); + } + +-static inline bool six_trylock_intentwrite(struct six_lock *lock) ++/* ++ * The transactional part of an interior btree node update, where we journal the ++ * update we did to the interior node and update alloc info: ++ */ ++static int btree_update_nodes_written_trans(struct btree_trans *trans, ++ struct btree_update *as) + { +- if (!six_trylock_intent(lock)) +- return false; ++ struct bkey_i *k; ++ int ret; ++ ++ trans->extra_journal_entries = (void *) &as->journal_entries[0]; ++ trans->extra_journal_entry_u64s = as->journal_u64s; ++ trans->journal_pin = &as->journal; + +- if (!six_trylock_write(lock)) { +- six_unlock_intent(lock); +- return false; ++ for_each_keylist_key(&as->new_keys, k) { ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), ++ 0, 0, BTREE_TRIGGER_INSERT); ++ if (ret) ++ return ret; + } + +- return true; ++ for_each_keylist_key(&as->old_keys, k) { ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), ++ 0, 0, BTREE_TRIGGER_OVERWRITE); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; + } + +-static void btree_update_nodes_written(struct closure *cl) ++static void btree_update_nodes_written(struct btree_update *as) + { +- struct btree_update *as = container_of(cl, struct btree_update, cl); +- struct btree *nodes_need_write[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES + 1]; +- unsigned nr_nodes_need_write; +- struct journal_res res = { 0 }; + struct bch_fs *c = as->c; +- struct btree_root *r; +- struct btree *b; ++ struct btree *b = as->b; ++ u64 journal_seq = 0; ++ unsigned i; + int ret; + + /* +@@ -638,78 +527,17 @@ static void btree_update_nodes_written(struct closure *cl) + * to child nodes that weren't written yet: now, the child nodes have + * been written so we can write out the update to the interior node. 
+ */ +- mutex_lock(&c->btree_interior_update_lock); +- as->nodes_written = true; +-again: +- nr_nodes_need_write = 0; +- as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, +- struct btree_update, unwritten_list); +- if (!as || !as->nodes_written) { +- mutex_unlock(&c->btree_interior_update_lock); +- return; +- } +- +- b = as->b; +- if (b && !six_trylock_intentwrite(&b->lock)) { +- mutex_unlock(&c->btree_interior_update_lock); +- +- btree_node_lock_type(c, b, SIX_LOCK_intent); +- six_lock_write(&b->lock, NULL, NULL); +- +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); +- +- mutex_lock(&c->btree_interior_update_lock); +- goto again; +- } +- +- ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s, +- JOURNAL_RES_GET_NONBLOCK| +- JOURNAL_RES_GET_RESERVED); +- if (ret == -EAGAIN) { +- unsigned u64s = as->journal_u64s; +- +- if (b) { +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); +- } +- +- mutex_unlock(&c->btree_interior_update_lock); +- +- ret = bch2_journal_res_get(&c->journal, &res, u64s, +- JOURNAL_RES_GET_CHECK| +- JOURNAL_RES_GET_RESERVED); +- if (!ret) { +- mutex_lock(&c->btree_interior_update_lock); +- goto again; +- } +- } +- +- if (!ret) { +- struct journal_buf *buf = &c->journal.buf[res.idx]; +- struct jset_entry *entry = vstruct_idx(buf->data, res.offset); +- +- res.offset += as->journal_u64s; +- res.u64s -= as->journal_u64s; +- memcpy_u64s(entry, as->journal_entries, as->journal_u64s); +- } else { +- /* +- * On journal error we have to run most of the normal path so +- * that shutdown works - unblocking btree node writes in +- * particular and writing them if needed - except for +- * journalling the update: +- */ +- +- BUG_ON(!bch2_journal_error(&c->journal)); +- } +- +- switch (as->mode) { +- case BTREE_INTERIOR_NO_UPDATE: +- BUG(); +- case BTREE_INTERIOR_UPDATING_NODE: +- /* @b is the node we did the final insert into: */ +- ++ ret = bch2_trans_do(c, &as->disk_res, &journal_seq, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_JOURNAL_RESERVED, ++ btree_update_nodes_written_trans(&trans, as)); ++ BUG_ON(ret && !bch2_journal_error(&c->journal)); ++ ++ if (b) { + /* ++ * @b is the node we did the final insert into: ++ * + * On failure to get a journal reservation, we still have to + * unblock the write and allow most of the write path to happen + * so that shutdown works, but the i->journal_seq mechanism +@@ -719,83 +547,90 @@ again: + * we're in journal error state: + */ + ++ btree_node_lock_type(c, b, SIX_LOCK_intent); ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ mutex_lock(&c->btree_interior_update_lock); ++ + list_del(&as->write_blocked_list); + +- if (!ret) { ++ if (!ret && as->b == b) { + struct bset *i = btree_bset_last(b); + ++ BUG_ON(!b->level); ++ BUG_ON(!btree_node_dirty(b)); ++ + i->journal_seq = cpu_to_le64( +- max(res.seq, ++ max(journal_seq, + le64_to_cpu(i->journal_seq))); + +- bch2_btree_add_journal_pin(c, b, res.seq); ++ bch2_btree_add_journal_pin(c, b, journal_seq); + } + +- nodes_need_write[nr_nodes_need_write++] = b; +- ++ mutex_unlock(&c->btree_interior_update_lock); + six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); +- break; +- +- case BTREE_INTERIOR_UPDATING_AS: +- BUG_ON(b); +- break; +- +- case BTREE_INTERIOR_UPDATING_ROOT: +- r = &c->btree_roots[as->btree_id]; + +- BUG_ON(b); +- +- mutex_lock(&c->btree_root_lock); +- bkey_copy(&r->key, as->parent_keys.keys); +- r->level = as->level; +- r->alive = true; +- c->btree_roots_dirty = true; +- 
mutex_unlock(&c->btree_root_lock); +- break; ++ btree_node_write_if_need(c, b, SIX_LOCK_intent); ++ six_unlock_intent(&b->lock); + } + + bch2_journal_pin_drop(&c->journal, &as->journal); + +- bch2_journal_res_put(&c->journal, &res); + bch2_journal_preres_put(&c->journal, &as->journal_preres); + +- while (as->nr_new_nodes) { +- b = as->new_nodes[--as->nr_new_nodes]; ++ mutex_lock(&c->btree_interior_update_lock); ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; + + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; ++ } ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; + +- nodes_need_write[nr_nodes_need_write++] = b; ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ btree_node_write_if_need(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->lock); + } + +- while (as->nr_pending) +- bch2_btree_node_free_ondisk(c, +- &as->pending[--as->nr_pending], res.seq); ++ for (i = 0; i < as->nr_open_buckets; i++) ++ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); + +- __bch2_btree_update_free(as); +- /* +- * for flush_held_btree_writes() waiting on updates to flush or +- * nodes to be writeable: +- */ +- closure_wake_up(&c->btree_interior_update_wait); ++ bch2_btree_update_free(as); ++} + +- /* +- * Can't take btree node locks while holding btree_interior_update_lock: +- * */ +- mutex_unlock(&c->btree_interior_update_lock); ++static void btree_interior_update_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, btree_interior_update_work); ++ struct btree_update *as; + +- /* Do btree writes after dropping journal res/locks: */ +- while (nr_nodes_need_write) { +- b = nodes_need_write[--nr_nodes_need_write]; ++ while (1) { ++ mutex_lock(&c->btree_interior_update_lock); ++ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, ++ struct btree_update, unwritten_list); ++ if (as && !as->nodes_written) ++ as = NULL; ++ mutex_unlock(&c->btree_interior_update_lock); + +- btree_node_lock_type(c, b, SIX_LOCK_read); +- bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); +- six_unlock_read(&b->lock); ++ if (!as) ++ break; ++ ++ btree_update_nodes_written(as); + } ++} ++ ++static void btree_update_set_nodes_written(struct closure *cl) ++{ ++ struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct bch_fs *c = as->c; + + mutex_lock(&c->btree_interior_update_lock); +- goto again; ++ as->nodes_written = true; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); + } + + /* +@@ -814,7 +649,6 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) + + as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->b = b; +- as->level = b->level; + list_add(&as->write_blocked_list, &b->write_blocked); + + mutex_unlock(&c->btree_interior_update_lock); +@@ -845,25 +679,45 @@ static void btree_update_reparent(struct btree_update *as, + + static void btree_update_updated_root(struct btree_update *as, struct btree *b) + { ++ struct bkey_i *insert = &b->key; + struct bch_fs *c = as->c; + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); +- BUG_ON(!bch2_keylist_empty(&as->parent_keys)); ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_root, ++ b->btree_id, b->level, 
++ insert, insert->k.u64s); + + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + as->mode = BTREE_INTERIOR_UPDATING_ROOT; +- as->level = b->level; +- bch2_keylist_add(&as->parent_keys, &b->key); + mutex_unlock(&c->btree_interior_update_lock); + } + +-static void btree_node_will_make_reachable(struct btree_update *as, +- struct btree *b) ++/* ++ * bch2_btree_update_add_new_node: ++ * ++ * This causes @as to wait on @b to be written, before it gets to ++ * bch2_btree_update_nodes_written ++ * ++ * Additionally, it sets b->will_make_reachable to prevent any additional writes ++ * to @b from happening besides the first until @b is reachable on disk ++ * ++ * And it adds @b to the list of @as's new nodes, so that we can update sector ++ * counts in bch2_btree_update_nodes_written: ++ */ ++void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) + { + struct bch_fs *c = as->c; + ++ closure_get(&as->cl); ++ + mutex_lock(&c->btree_interior_update_lock); + BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); + BUG_ON(b->will_make_reachable); +@@ -871,10 +725,14 @@ static void btree_node_will_make_reachable(struct btree_update *as, + as->new_nodes[as->nr_new_nodes++] = b; + b->will_make_reachable = 1UL|(unsigned long) as; + +- closure_get(&as->cl); + mutex_unlock(&c->btree_interior_update_lock); ++ ++ btree_update_will_add_key(as, &b->key); + } + ++/* ++ * returns true if @b was a new node ++ */ + static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) + { + struct btree_update *as; +@@ -882,6 +740,11 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) + unsigned i; + + mutex_lock(&c->btree_interior_update_lock); ++ /* ++ * When b->will_make_reachable != 0, it owns a ref on as->cl that's ++ * dropped when it gets written by bch2_btree_complete_write - the ++ * xchg() is for synchronization with bch2_btree_complete_write: ++ */ + v = xchg(&b->will_make_reachable, 0); + as = (struct btree_update *) (v & ~1UL); + +@@ -903,25 +766,11 @@ found: + closure_put(&as->cl); + } + +-static void btree_interior_update_add_node_reference(struct btree_update *as, +- struct btree *b) ++void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) + { +- struct bch_fs *c = as->c; +- struct pending_btree_node_free *d; +- +- mutex_lock(&c->btree_interior_update_lock); +- +- /* Add this node to the list of nodes being freed: */ +- BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending)); +- +- d = &as->pending[as->nr_pending++]; +- d->index_update_done = false; +- d->seq = b->data->keys.seq; +- d->btree_id = b->btree_id; +- d->level = b->level; +- bkey_copy(&d->key, &b->key); +- +- mutex_unlock(&c->btree_interior_update_lock); ++ while (b->ob.nr) ++ as->open_buckets[as->nr_open_buckets++] = ++ b->ob.v[--b->ob.nr]; + } + + /* +@@ -941,8 +790,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + if (btree_node_fake(b)) + return; + +- btree_interior_update_add_node_reference(as, b); +- + mutex_lock(&c->btree_interior_update_lock); + + /* +@@ -984,16 +831,28 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + bch2_journal_pin_drop(&c->journal, &w->journal); + + mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * Is this a node that isn't reachable on disk yet? 
++ * ++ * Nodes that aren't reachable yet have writes blocked until they're ++ * reachable - now that we've cancelled any pending writes and moved ++ * things waiting on that write to wait on this update, we can drop this ++ * node from the list of nodes that the other update is making ++ * reachable, prior to freeing it: ++ */ ++ btree_update_drop_new_node(c, b); ++ ++ btree_update_will_delete_key(as, &b->key); + } + + void bch2_btree_update_done(struct btree_update *as) + { + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + +- bch2_btree_reserve_put(as->c, as->reserve); +- as->reserve = NULL; ++ bch2_btree_reserve_put(as); + +- continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq); ++ continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); + } + + struct btree_update * +@@ -1002,12 +861,32 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + struct closure *cl) + { + struct bch_fs *c = trans->c; +- struct journal_preres journal_preres = { 0 }; +- struct btree_reserve *reserve; + struct btree_update *as; +- int ret; ++ int ret, disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ++ ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ ++ /* ++ * This check isn't necessary for correctness - it's just to potentially ++ * prevent us from doing a lot of work that'll end up being wasted: ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); ++ memset(as, 0, sizeof(*as)); ++ closure_init(&as->cl, NULL); ++ as->c = c; ++ as->mode = BTREE_INTERIOR_NO_UPDATE; ++ as->btree_id = id; ++ INIT_LIST_HEAD(&as->list); ++ INIT_LIST_HEAD(&as->unwritten_list); ++ INIT_LIST_HEAD(&as->write_blocked_list); ++ bch2_keylist_init(&as->old_keys, as->_old_keys); ++ bch2_keylist_init(&as->new_keys, as->_new_keys); ++ bch2_keylist_init(&as->parent_keys, as->inline_keys); + +- ret = bch2_journal_preres_get(&c->journal, &journal_preres, ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, + JOURNAL_RES_GET_NONBLOCK); + if (ret == -EAGAIN) { +@@ -1016,46 +895,41 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + + bch2_trans_unlock(trans); + +- ret = bch2_journal_preres_get(&c->journal, &journal_preres, ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, 0); + if (ret) + return ERR_PTR(ret); + + if (!bch2_trans_relock(trans)) { +- bch2_journal_preres_put(&c->journal, &journal_preres); +- return ERR_PTR(-EINTR); ++ ret = -EINTR; ++ goto err; + } + } + +- reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); +- if (IS_ERR(reserve)) { +- bch2_journal_preres_put(&c->journal, &journal_preres); +- return ERR_CAST(reserve); +- } +- +- as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); +- memset(as, 0, sizeof(*as)); +- closure_init(&as->cl, NULL); +- as->c = c; +- as->mode = BTREE_INTERIOR_NO_UPDATE; +- as->btree_id = id; +- as->reserve = reserve; +- INIT_LIST_HEAD(&as->write_blocked_list); +- INIT_LIST_HEAD(&as->unwritten_list); +- as->journal_preres = journal_preres; ++ ret = bch2_disk_reservation_get(c, &as->disk_res, ++ nr_nodes * c->opts.btree_node_size, ++ c->opts.metadata_replicas, ++ disk_res_flags); ++ if (ret) ++ goto err; + +- bch2_keylist_init(&as->parent_keys, as->inline_keys); ++ ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); ++ if (ret) ++ goto err; + + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->list, &c->btree_interior_update_list); 
+ mutex_unlock(&c->btree_interior_update_lock); + + return as; ++err: ++ bch2_btree_update_free(as); ++ return ERR_PTR(ret); + } + + /* Btree root updates: */ + +-static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) ++static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) + { + /* Root nodes cannot be reaped */ + mutex_lock(&c->btree_cache.lock); +@@ -1073,38 +947,6 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) + bch2_recalc_btree_reserve(c); + } + +-static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) +-{ +- struct bch_fs *c = as->c; +- struct btree *old = btree_node_root(c, b); +- struct bch_fs_usage *fs_usage; +- +- __bch2_btree_set_root_inmem(c, b); +- +- mutex_lock(&c->btree_interior_update_lock); +- percpu_down_read(&c->mark_lock); +- fs_usage = bch2_fs_usage_scratch_get(c); +- +- bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), +- 0, 0, fs_usage, 0, +- BTREE_TRIGGER_INSERT); +- if (gc_visited(c, gc_pos_btree_root(b->btree_id))) +- bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), +- 0, 0, NULL, 0, +- BTREE_TRIGGER_INSERT| +- BTREE_TRIGGER_GC); +- +- if (old && !btree_node_fake(old)) +- bch2_btree_node_free_index(as, NULL, +- bkey_i_to_s_c(&old->key), +- fs_usage); +- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); +- +- bch2_fs_usage_scratch_put(c, fs_usage); +- percpu_up_read(&c->mark_lock); +- mutex_unlock(&c->btree_interior_update_lock); +-} +- + /** + * bch_btree_set_root - update the root in memory and on disk + * +@@ -1135,7 +977,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, + */ + bch2_btree_node_lock_write(old, iter); + +- bch2_btree_set_root_inmem(as, b); ++ bch2_btree_set_root_inmem(c, b); + + btree_update_updated_root(as, b); + +@@ -1156,57 +998,21 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + struct bkey_i *insert, + struct btree_node_iter *node_iter) + { +- struct bch_fs *c = as->c; +- struct bch_fs_usage *fs_usage; +- struct jset_entry *entry; + struct bkey_packed *k; +- struct bkey tmp; + + BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > + ARRAY_SIZE(as->journal_entries)); + +- entry = (void *) &as->journal_entries[as->journal_u64s]; +- memset(entry, 0, sizeof(*entry)); +- entry->u64s = cpu_to_le16(insert->k.u64s); +- entry->type = BCH_JSET_ENTRY_btree_keys; +- entry->btree_id = b->btree_id; +- entry->level = b->level; +- memcpy_u64s_small(entry->_data, insert, insert->k.u64s); +- as->journal_u64s += jset_u64s(insert->k.u64s); +- +- mutex_lock(&c->btree_interior_update_lock); +- percpu_down_read(&c->mark_lock); +- fs_usage = bch2_fs_usage_scratch_get(c); +- +- bch2_mark_key_locked(c, bkey_i_to_s_c(insert), +- 0, 0, fs_usage, 0, +- BTREE_TRIGGER_INSERT); +- +- if (gc_visited(c, gc_pos_btree_node(b))) +- bch2_mark_key_locked(c, bkey_i_to_s_c(insert), +- 0, 0, NULL, 0, +- BTREE_TRIGGER_INSERT| +- BTREE_TRIGGER_GC); ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_keys, ++ b->btree_id, b->level, ++ insert, insert->k.u64s); + + while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && + bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) + bch2_btree_node_iter_advance(node_iter, b); + +- /* +- * If we're overwriting, look up pending delete and mark so that gc +- * marks it on the pending delete list: +- */ +- if (k && !bkey_cmp_packed(b, k, &insert->k)) +- bch2_btree_node_free_index(as, b, +- bkey_disassemble(b, k, &tmp), +- 
fs_usage); +- +- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); +- +- bch2_fs_usage_scratch_put(c, fs_usage); +- percpu_up_read(&c->mark_lock); +- mutex_unlock(&c->btree_interior_update_lock); +- + bch2_btree_bset_insert_key(iter, b, node_iter, insert); + set_btree_node_dirty(b); + set_btree_node_need_write(b); +@@ -1226,6 +1032,7 @@ static struct btree *__btree_split_node(struct btree_update *as, + struct bkey_packed *k, *prev = NULL; + + n2 = bch2_btree_node_alloc(as, n1->level); ++ bch2_btree_update_add_new_node(as, n2); + + n2->data->max_key = n1->data->max_key; + n2->data->format = n1->format; +@@ -1321,14 +1128,6 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + struct bkey_packed *src, *dst, *n; + struct bset *i; + +- /* +- * XXX +- * +- * these updates must be journalled +- * +- * oops +- */ +- + BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); + + bch2_btree_node_iter_init(&node_iter, b, &k->k.p); +@@ -1380,6 +1179,7 @@ static void btree_split(struct btree_update *as, struct btree *b, + bch2_btree_interior_update_will_free_node(as, b); + + n1 = bch2_btree_node_alloc_replacement(as, b); ++ bch2_btree_update_add_new_node(as, n1); + + if (keys) + btree_split_insert_keys(as, n1, iter, keys); +@@ -1439,11 +1239,11 @@ static void btree_split(struct btree_update *as, struct btree *b, + bch2_btree_set_root(as, n1, iter); + } + +- bch2_open_buckets_put(c, &n1->ob); ++ bch2_btree_update_get_open_buckets(as, n1); + if (n2) +- bch2_open_buckets_put(c, &n2->ob); ++ bch2_btree_update_get_open_buckets(as, n2); + if (n3) +- bch2_open_buckets_put(c, &n3->ob); ++ bch2_btree_update_get_open_buckets(as, n3); + + /* Successful split, update the iterator to point to the new nodes: */ + +@@ -1538,7 +1338,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + + bch2_btree_node_lock_for_insert(c, b, iter); + +- if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) { ++ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + bch2_btree_node_unlock_write(b, iter); + goto split; + } +@@ -1749,6 +1549,7 @@ retry: + bch2_btree_interior_update_will_free_node(as, m); + + n = bch2_btree_node_alloc(as, b->level); ++ bch2_btree_update_add_new_node(as, n); + + btree_set_min(n, prev->data->min_key); + btree_set_max(n, next->data->max_key); +@@ -1771,7 +1572,7 @@ retry: + + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + +- bch2_open_buckets_put(c, &n->ob); ++ bch2_btree_update_get_open_buckets(as, n); + + six_lock_increment(&b->lock, SIX_LOCK_intent); + bch2_btree_iter_node_drop(iter, b); +@@ -1859,6 +1660,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + bch2_btree_interior_update_will_free_node(as, b); + + n = bch2_btree_node_alloc_replacement(as, b); ++ bch2_btree_update_add_new_node(as, n); + + bch2_btree_build_aux_trees(n); + six_unlock_write(&n->lock); +@@ -1874,7 +1676,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + bch2_btree_set_root(as, n, iter); + } + +- bch2_open_buckets_put(c, &n->ob); ++ bch2_btree_update_get_open_buckets(as, n); + + six_lock_increment(&b->lock, SIX_LOCK_intent); + bch2_btree_iter_node_drop(iter, b); +@@ -1949,49 +1751,8 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + struct btree *parent; + int ret; + +- /* +- * Two corner cases that need to be thought about here: +- * +- * @b may not be reachable yet - there might be another interior update +- * operation waiting on @b to be written, and we're 
gonna deliver the +- * write completion to that interior update operation _before_ +- * persisting the new_key update +- * +- * That ends up working without us having to do anything special here: +- * the reason is, we do kick off (and do the in memory updates) for the +- * update for @new_key before we return, creating a new interior_update +- * operation here. +- * +- * The new interior update operation here will in effect override the +- * previous one. The previous one was going to terminate - make @b +- * reachable - in one of two ways: +- * - updating the btree root pointer +- * In that case, +- * no, this doesn't work. argh. +- */ +- +- if (b->will_make_reachable) +- as->must_rewrite = true; +- +- btree_interior_update_add_node_reference(as, b); +- +- /* +- * XXX: the rest of the update path treats this like we're actually +- * inserting a new node and deleting the existing node, so the +- * reservation needs to include enough space for @b +- * +- * that is actually sketch as fuck though and I am surprised the code +- * seems to work like that, definitely need to go back and rework it +- * into something saner. +- * +- * (I think @b is just getting double counted until the btree update +- * finishes and "deletes" @b on disk) +- */ +- ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, +- c->opts.btree_node_size * +- bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)), +- BCH_DISK_RESERVATION_NOFAIL); +- BUG_ON(ret); ++ btree_update_will_delete_key(as, &b->key); ++ btree_update_will_add_key(as, new_key); + + parent = btree_node_parent(iter, b); + if (parent) { +@@ -2019,44 +1780,18 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + bkey_copy(&b->key, new_key); + } + } else { +- struct bch_fs_usage *fs_usage; +- + BUG_ON(btree_node_root(c, b) != b); + + bch2_btree_node_lock_write(b, iter); ++ bkey_copy(&b->key, new_key); + +- mutex_lock(&c->btree_interior_update_lock); +- percpu_down_read(&c->mark_lock); +- fs_usage = bch2_fs_usage_scratch_get(c); +- +- bch2_mark_key_locked(c, bkey_i_to_s_c(new_key), +- 0, 0, fs_usage, 0, +- BTREE_TRIGGER_INSERT); +- if (gc_visited(c, gc_pos_btree_root(b->btree_id))) +- bch2_mark_key_locked(c, bkey_i_to_s_c(new_key), +- 0, 0, NULL, 0, +- BTREE_TRIGGER_INSERT|| +- BTREE_TRIGGER_GC); +- +- bch2_btree_node_free_index(as, NULL, +- bkey_i_to_s_c(&b->key), +- fs_usage); +- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); +- +- bch2_fs_usage_scratch_put(c, fs_usage); +- percpu_up_read(&c->mark_lock); +- mutex_unlock(&c->btree_interior_update_lock); +- +- if (btree_ptr_hash_val(new_key) != b->hash_val) { ++ if (btree_ptr_hash_val(&b->key) != b->hash_val) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + +- bkey_copy(&b->key, new_key); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); +- } else { +- bkey_copy(&b->key, new_key); + } + + btree_update_updated_root(as, b); +@@ -2171,7 +1906,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) + { + BUG_ON(btree_node_root(c, b)); + +- __bch2_btree_set_root_inmem(c, b); ++ bch2_btree_set_root_inmem(c, b); + } + + void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +@@ -2210,7 +1945,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id); + BUG_ON(ret); + +- __bch2_btree_set_root_inmem(c, b); ++ bch2_btree_set_root_inmem(c, b); + + six_unlock_write(&b->lock); + 
six_unlock_intent(&b->lock); +@@ -2247,10 +1982,59 @@ size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) + return ret; + } + ++void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) ++{ ++ struct btree_root *r; ++ struct jset_entry *entry; ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ vstruct_for_each(jset, entry) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) { ++ r = &c->btree_roots[entry->btree_id]; ++ r->level = entry->level; ++ r->alive = true; ++ bkey_copy(&r->key, &entry->start[0]); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++} ++ ++struct jset_entry * ++bch2_btree_roots_to_journal_entries(struct bch_fs *c, ++ struct jset_entry *start, ++ struct jset_entry *end) ++{ ++ struct jset_entry *entry; ++ unsigned long have = 0; ++ unsigned i; ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) ++ __set_bit(entry->btree_id, &have); ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].alive && !test_bit(i, &have)) { ++ journal_entry_set(end, ++ BCH_JSET_ENTRY_btree_root, ++ i, c->btree_roots[i].level, ++ &c->btree_roots[i].key, ++ c->btree_roots[i].key.u64s); ++ end = vstruct_next(end); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++ ++ return end; ++} ++ + void bch2_fs_btree_interior_update_exit(struct bch_fs *c) + { ++ if (c->btree_interior_update_worker) ++ destroy_workqueue(c->btree_interior_update_worker); + mempool_exit(&c->btree_interior_update_pool); +- mempool_exit(&c->btree_reserve_pool); + } + + int bch2_fs_btree_interior_update_init(struct bch_fs *c) +@@ -2259,9 +2043,13 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) + INIT_LIST_HEAD(&c->btree_interior_update_list); + INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); + mutex_init(&c->btree_interior_update_lock); ++ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); ++ ++ c->btree_interior_update_worker = ++ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); ++ if (!c->btree_interior_update_worker) ++ return -ENOMEM; + +- return mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, +- sizeof(struct btree_reserve)) ?: +- mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, +- sizeof(struct btree_update)); ++ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, ++ sizeof(struct btree_update)); + } +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 739a5ac536b8..a6be62d3a18f 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -6,34 +6,13 @@ + #include "btree_locking.h" + #include "btree_update.h" + +-struct btree_reserve { +- struct disk_reservation disk_res; +- unsigned nr; +- struct btree *b[BTREE_RESERVE_MAX]; +-}; +- + void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); + bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, + struct bkey_format *); + +-/* Btree node freeing/allocation: */ +- +-/* +- * Tracks a btree node that has been (or is about to be) freed in memory, but +- * has _not_ yet been freed on disk (because the write that makes the new +- * node(s) visible and frees the old hasn't completed yet) +- */ +-struct pending_btree_node_free { +- bool index_update_done; +- +- __le64 seq; +- enum btree_id btree_id; +- unsigned level; +- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); +-}; ++#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) + 
+-#define BTREE_UPDATE_JOURNAL_RES \ +- ((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2) ++#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) + + /* + * Tracks an in progress split/rewrite of a btree node and the update to the +@@ -72,9 +51,8 @@ struct btree_update { + unsigned nodes_written:1; + + enum btree_id btree_id; +- u8 level; + +- struct btree_reserve *reserve; ++ struct disk_reservation disk_res; + struct journal_preres journal_preres; + + /* +@@ -96,17 +74,28 @@ struct btree_update { + */ + struct journal_entry_pin journal; + +- /* +- * Nodes being freed: +- * Protected by c->btree_node_pending_free_lock +- */ +- struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES]; +- unsigned nr_pending; ++ /* Preallocated nodes we reserve when we start the update: */ ++ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_prealloc_nodes; ++ ++ /* Nodes being freed: */ ++ struct keylist old_keys; ++ u64 _old_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ ++ /* Nodes being added: */ ++ struct keylist new_keys; ++ u64 _new_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_VAL_U64s_MAX]; + + /* New nodes, that will be made reachable by this update: */ +- struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; ++ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; + unsigned nr_new_nodes; + ++ u8 open_buckets[BTREE_UPDATE_NODES_MAX * ++ BCH_REPLICAS_MAX]; ++ u8 nr_open_buckets; ++ + unsigned journal_u64s; + u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; + +@@ -120,14 +109,12 @@ struct btree_update { + u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; + }; + +-#define for_each_pending_btree_node_free(c, as, p) \ +- list_for_each_entry(as, &c->btree_interior_update_list, list) \ +- for (p = as->pending; p < as->pending + as->nr_pending; p++) +- + void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, + struct btree_iter *); + void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); + ++void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); ++ + struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + struct btree *, + struct bkey_format); +@@ -139,6 +126,7 @@ bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, + + void bch2_btree_interior_update_will_free_node(struct btree_update *, + struct btree *); ++void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); + + void bch2_btree_insert_node(struct btree_update *, struct btree *, + struct btree_iter *, struct keylist *, +@@ -333,6 +321,10 @@ ssize_t bch2_btree_updates_print(struct bch_fs *, char *); + + size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); + ++void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); ++struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, ++ struct jset_entry *, struct jset_entry *); ++ + void bch2_fs_btree_interior_update_exit(struct bch_fs *); + int bch2_fs_btree_interior_update_init(struct bch_fs *); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 6e402027c63f..e343d80fede3 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -414,8 +414,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + } + + if (unlikely(trans->extra_journal_entry_u64s)) { +- memcpy_u64s_small(bch2_journal_reservation_entry(&c->journal, +- &trans->journal_res), ++ 
memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->extra_journal_entries, + trans->extra_journal_entry_u64s); + +@@ -521,6 +520,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, + i->iter); + ++ if (!ret && trans->journal_pin) ++ bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, ++ trans->journal_pin, NULL); ++ + /* + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 2fe33d744d33..1d8381656d81 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1180,7 +1180,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + +-int bch2_mark_key_locked(struct bch_fs *c, ++static int bch2_mark_key_locked(struct bch_fs *c, + struct bkey_s_c k, + unsigned offset, s64 sectors, + struct bch_fs_usage *fs_usage, +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 765650ce9d0a..97265fe90e96 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -259,8 +259,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + +-int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64, +- struct bch_fs_usage *, u64, unsigned); + int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, + struct bch_fs_usage *, u64, unsigned); + int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 0a4538b3dc60..32999161bdd8 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -959,15 +959,12 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) + + void bch2_fs_journal_stop(struct journal *j) + { +- struct bch_fs *c = container_of(j, struct bch_fs, journal); +- + bch2_journal_flush_all_pins(j); + + wait_event(j->wait, journal_entry_close(j)); + + /* do we need to write another journal entry? 
*/ +- if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) || +- c->btree_roots_dirty) ++ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) + bch2_journal_meta(j); + + journal_quiesce(j); +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index e4b7fe8ffa82..997a28ae862e 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -200,31 +200,38 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) + } + + static inline struct jset_entry * +-bch2_journal_reservation_entry(struct journal *j, struct journal_res *res) ++journal_res_entry(struct journal *j, struct journal_res *res) + { + return vstruct_idx(j->buf[res->idx].data, res->offset); + } + ++static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, ++ enum btree_id id, unsigned level, ++ const void *data, unsigned u64s) ++{ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ entry->type = type; ++ entry->btree_id = id; ++ entry->level = level; ++ memcpy_u64s_small(entry->_data, data, u64s); ++ ++ return jset_u64s(u64s); ++} ++ + static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, + unsigned type, enum btree_id id, + unsigned level, + const void *data, unsigned u64s) + { +- struct jset_entry *entry = bch2_journal_reservation_entry(j, res); +- unsigned actual = jset_u64s(u64s); ++ unsigned actual = journal_entry_set(journal_res_entry(j, res), ++ type, id, level, data, u64s); + + EBUG_ON(!res->ref); + EBUG_ON(actual > res->u64s); + + res->offset += actual; + res->u64s -= actual; +- +- memset(entry, 0, sizeof(*entry)); +- entry->u64s = cpu_to_le16(u64s); +- entry->type = type; +- entry->btree_id = id; +- entry->level = level; +- memcpy_u64s_small(entry->_data, data, u64s); + } + + static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 39bb2154cce1..b923efc42099 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -2,6 +2,7 @@ + #include "bcachefs.h" + #include "alloc_foreground.h" + #include "btree_io.h" ++#include "btree_update_interior.h" + #include "buckets.h" + #include "checksum.h" + #include "error.h" +@@ -993,8 +994,23 @@ void bch2_journal_write(struct closure *cl) + + j->write_start_time = local_clock(); + +- start = vstruct_last(jset); +- end = bch2_journal_super_entries_add_common(c, start, ++ /* ++ * New btree roots are set by journalling them; when the journal entry ++ * gets written we have to propagate them to c->btree_roots ++ * ++ * But, every journal entry we write has to contain all the btree roots ++ * (at least for now); so after we copy btree roots to c->btree_roots we ++ * have to get any missing btree roots and add them to this journal ++ * entry: ++ */ ++ ++ bch2_journal_entries_to_btree_roots(c, jset); ++ ++ start = end = vstruct_last(jset); ++ ++ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); ++ ++ end = bch2_journal_super_entries_add_common(c, end, + le64_to_cpu(jset->seq)); + u64s = (u64 *) end - (u64 *) start; + BUG_ON(u64s > j->entry_u64s_reserved); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index d34434f62454..d5eed53f1298 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -330,7 +330,7 @@ static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, + + __journal_pin_drop(j, pin); + +- BUG_ON(!atomic_read(&pin_list->count)); ++ BUG_ON(!atomic_read(&pin_list->count) && seq == 
journal_last_seq(j)); + + atomic_inc(&pin_list->count); + pin->seq = seq; +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index 883a0a5680af..3ef641f7ce30 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -38,7 +38,7 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) + { +- if (unlikely(!journal_pin_active(pin))) ++ if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) + __bch2_journal_pin_add(j, seq, pin, flush_fn); + } + +diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c +index 5da54ced9cad..864dfaa67b7a 100644 +--- a/fs/bcachefs/keylist.c ++++ b/fs/bcachefs/keylist.c +@@ -6,7 +6,7 @@ + int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, + size_t nr_inline_u64s, size_t new_u64s) + { +- size_t oldsize = bch_keylist_u64s(l); ++ size_t oldsize = bch2_keylist_u64s(l); + size_t newsize = oldsize + new_u64s; + u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; + u64 *new_keys; +@@ -52,7 +52,7 @@ void bch2_keylist_pop_front(struct keylist *l) + + memmove_u64s_down(l->keys, + bkey_next(l->keys), +- bch_keylist_u64s(l)); ++ bch2_keylist_u64s(l)); + } + + #ifdef CONFIG_BCACHEFS_DEBUG +diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h +index a7ff86b08abc..195799bb20bc 100644 +--- a/fs/bcachefs/keylist.h ++++ b/fs/bcachefs/keylist.h +@@ -36,14 +36,14 @@ static inline bool bch2_keylist_empty(struct keylist *l) + return l->top == l->keys; + } + +-static inline size_t bch_keylist_u64s(struct keylist *l) ++static inline size_t bch2_keylist_u64s(struct keylist *l) + { + return l->top_p - l->keys_p; + } + + static inline size_t bch2_keylist_bytes(struct keylist *l) + { +- return bch_keylist_u64s(l) * sizeof(u64); ++ return bch2_keylist_u64s(l) * sizeof(u64); + } + + static inline struct bkey_i *bch2_keylist_front(struct keylist *l) +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index e26fa1608f39..96c8690adc5b 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -151,15 +151,8 @@ retry: + } + + /* flush relevant btree updates */ +- while (1) { +- closure_wait_event(&c->btree_interior_update_wait, +- !bch2_btree_interior_updates_nr_pending(c) || +- c->btree_roots_dirty); +- if (c->btree_roots_dirty) +- bch2_journal_meta(&c->journal); +- if (!bch2_btree_interior_updates_nr_pending(c)) +- break; +- } ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); + + ret = 0; + err: +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 67e495bc8aba..11a92c099afd 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -775,14 +775,8 @@ int bch2_data_job(struct bch_fs *c, + + ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; + +- while (1) { +- closure_wait_event(&c->btree_interior_update_wait, +- !bch2_btree_interior_updates_nr_pending(c) || +- c->btree_roots_dirty); +- if (!bch2_btree_interior_updates_nr_pending(c)) +- break; +- bch2_journal_meta(&c->journal); +- } ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); + + ret = bch2_replicas_gc2(c) ?: ret; + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index b4ccecdd3f77..e7f65fa21151 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -763,6 +763,7 @@ static int verify_superblock_clean(struct bch_fs *c, + "superblock read clock doesn't match journal after clean shutdown"); + + for (i = 0; i < 
BTREE_ID_NR; i++) { ++ char buf1[200], buf2[200]; + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + +@@ -778,7 +779,11 @@ static int verify_superblock_clean(struct bch_fs *c, + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(k1)) || + l1 != l2, c, +- "superblock btree root doesn't match journal after clean shutdown"); ++ "superblock btree root %u doesn't match journal after clean shutdown\n" ++ "sb: l=%u %s\n" ++ "journal: l=%u %s\n", i, ++ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), ++ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); + } + fsck_err: + return ret; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 9c1aaa594cd1..fc35ba6116e7 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" ++#include "btree_update_interior.h" + #include "buckets.h" + #include "checksum.h" + #include "disk_groups.h" +@@ -952,7 +953,6 @@ int bch2_fs_mark_dirty(struct bch_fs *c) + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); +- c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; +@@ -986,27 +986,8 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry *entry, + u64 journal_seq) + { +- struct btree_root *r; + unsigned i; + +- mutex_lock(&c->btree_root_lock); +- +- for (r = c->btree_roots; +- r < c->btree_roots + BTREE_ID_NR; +- r++) +- if (r->alive) { +- entry_init_u64s(entry, r->key.u64s + 1); +- entry->btree_id = r - c->btree_roots; +- entry->level = r->level; +- entry->type = BCH_JSET_ENTRY_btree_root; +- bkey_copy(&entry->start[0], &r->key); +- +- entry = vstruct_next(entry); +- } +- c->btree_roots_dirty = false; +- +- mutex_unlock(&c->btree_root_lock); +- + percpu_down_write(&c->mark_lock); + + if (!journal_seq) { +@@ -1107,6 +1088,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) + + entry = sb_clean->start; + entry = bch2_journal_super_entries_add_common(c, entry, 0); ++ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + + memset(entry, 0, +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 2fbed2a6d8bb..a57afb55e90a 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -220,6 +220,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) + */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); ++ flush_work(&c->btree_interior_update_work); + + clean_passes = wrote ? 
0 : clean_passes + 1; + } while (clean_passes < 2); +@@ -227,6 +228,10 @@ static void __bch2_fs_read_only(struct bch_fs *c) + bch_verbose(c, "writing alloc info complete"); + set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); + nowrote_alloc: ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ flush_work(&c->btree_interior_update_work); ++ + for_each_member_device(ca, c, i) + bch2_dev_allocator_stop(ca); + +-- +cgit v1.2.3 + + +From d8ea313e92fb7e25fa960f42965987f84c872ca5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 28 May 2020 17:15:41 -0400 +Subject: bcachefs: fsck_error_lock requires GFP_NOFS + +this fixes a lockdep splat + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/error.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +index 5a5cfee623e2..1662a36244cd 100644 +--- a/fs/bcachefs/error.c ++++ b/fs/bcachefs/error.c +@@ -85,7 +85,7 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, + if (s->fmt == fmt) + goto found; + +- s = kzalloc(sizeof(*s), GFP_KERNEL); ++ s = kzalloc(sizeof(*s), GFP_NOFS); + if (!s) { + if (!c->fsck_alloc_err) + bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); +-- +cgit v1.2.3 + + +From a66c52405c09ea1a3595e84e39f4e1a63a820fe4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 28 May 2020 15:51:50 -0400 +Subject: bcachefs: Don't require alloc btree to be updated before buckets are + used + +This is to break a circular dependency in the shutdown path. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 11 ++++++++-- + fs/bcachefs/buckets.c | 47 +++++++++++++++++++++++++++++------------- + 2 files changed, 42 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index e57b23ba1844..876285e0eb38 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -843,7 +843,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + struct bkey_s_c k; + bool invalidating_cached_data; + size_t b; +- int ret; ++ int ret = 0; + + BUG_ON(!ca->alloc_heap.used || + !ca->alloc_heap.data[0].nr); +@@ -857,11 +857,18 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + + BUG_ON(!fifo_push(&ca->free_inc, b)); + ++ g = bucket(ca, b); ++ m = READ_ONCE(g->mark); ++ + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + + spin_unlock(&c->freelist_lock); + percpu_up_read(&c->mark_lock); + ++ invalidating_cached_data = m.cached_sectors != 0; ++ if (!invalidating_cached_data) ++ goto out; ++ + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); + + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); +@@ -915,7 +922,7 @@ retry: + flags); + if (ret == -EINTR) + goto retry; +- ++out: + if (!ret) { + /* remove from alloc_heap: */ + struct alloc_heap_entry e, *top = ca->alloc_heap.data; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 1d8381656d81..49a70ea21979 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1457,11 +1457,13 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + if (ret < 0) + return ret; + +- if (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) { ++ if (k.k->type != KEY_TYPE_alloc || ++ (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags)))) { + /* + * During journal replay, and if gc repairs alloc info at + * runtime, the alloc info in the btree might not be up to date +- * yet - so, trust the in memory mark: ++ * 
yet - so, trust the in memory mark - unless we're already ++ * updating that key: + */ + struct bucket *g; + struct bucket_mark m; +@@ -1472,22 +1474,39 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + u = alloc_mem_to_key(g, m); + percpu_up_read(&c->mark_lock); + } else { +- /* +- * Unless we're already updating that key: +- */ +- if (k.k->type != KEY_TYPE_alloc) { +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "pointer to nonexistent bucket %llu:%llu", +- iter->pos.inode, iter->pos.offset); +- ret = -1; +- goto out; +- } +- + u = bch2_alloc_unpack(k); + } + +- if (gen_after(u.gen, p.ptr.gen)) { ++ if (u.gen != p.ptr.gen) { + ret = 1; ++ ++ if (gen_after(p.ptr.gen, u.gen)) { ++ bch2_fs_inconsistent(c, ++ "bucket %llu:%llu gen %u data type %s: ptr gen %u newer than bucket gen", ++ iter->pos.inode, iter->pos.offset, u.gen, ++ bch2_data_types[u.data_type ?: data_type], ++ p.ptr.gen); ++ ret = -EIO; ++ } ++ ++ if (gen_cmp(u.gen, p.ptr.gen) >= 96U) { ++ bch2_fs_inconsistent(c, ++ "bucket %llu:%llu gen %u data type %s: ptr gen %u too stale", ++ iter->pos.inode, iter->pos.offset, u.gen, ++ bch2_data_types[u.data_type ?: data_type], ++ p.ptr.gen); ++ ret = -EIO; ++ } ++ ++ if (!p.ptr.cached) { ++ bch2_fs_inconsistent(c, ++ "bucket %llu:%llu gen %u data type %s: stale dirty ptr (gen %u)", ++ iter->pos.inode, iter->pos.offset, u.gen, ++ bch2_data_types[u.data_type ?: data_type], ++ p.ptr.gen); ++ ret = -EIO; ++ } ++ + goto out; + } + +-- +cgit v1.2.3 + + +From 6b62f8a2045f7b8c443d6c63709941b8c221e245 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 28 May 2020 16:06:13 -0400 +Subject: bcachefs: Fixes for going RO + +Now that interior btree updates are fully transactional, we don't need +to write out alloc info in a loop. However, interior btree updates do +put more things in the journal, so we still need a loop in the RO +sequence. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 11 ++++++++- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/journal_reclaim.c | 23 ++++++++++++++----- + fs/bcachefs/journal_reclaim.h | 6 ++--- + fs/bcachefs/super.c | 52 +++++++++++++++++++++++------------------- + 5 files changed, 60 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 876285e0eb38..03f8dceaa686 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -869,6 +869,15 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + if (!invalidating_cached_data) + goto out; + ++ /* ++ * If the read-only path is trying to shut down, we can't be generating ++ * new btree updates: ++ */ ++ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { ++ ret = 1; ++ goto out; ++ } ++ + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); + + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); +@@ -956,7 +965,7 @@ out: + percpu_up_read(&c->mark_lock); + } + +- return ret; ++ return ret < 0 ? 
ret : 0; + } + + static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 9bcdf2658f95..5b1537304abd 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -482,6 +482,7 @@ enum { + BCH_FS_ALLOC_CLEAN, + BCH_FS_ALLOCATOR_STARTED, + BCH_FS_ALLOCATOR_RUNNING, ++ BCH_FS_ALLOCATOR_STOPPING, + BCH_FS_INITIAL_GC_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index d5eed53f1298..5b3f2548561b 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -413,10 +413,12 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) + return ret; + } + +-static void journal_flush_pins(struct journal *j, u64 seq_to_flush, ++/* returns true if we did work */ ++static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_nr) + { + struct journal_entry_pin *pin; ++ bool ret = false; + u64 seq; + + lockdep_assert_held(&j->reclaim_lock); +@@ -431,7 +433,10 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush, + BUG_ON(j->flush_in_progress != pin); + j->flush_in_progress = NULL; + wake_up(&j->pin_flush_wait); ++ ret = true; + } ++ ++ return ret; + } + + /** +@@ -523,7 +528,8 @@ void bch2_journal_reclaim_work(struct work_struct *work) + mutex_unlock(&j->reclaim_lock); + } + +-static int journal_flush_done(struct journal *j, u64 seq_to_flush) ++static int journal_flush_done(struct journal *j, u64 seq_to_flush, ++ bool *did_work) + { + int ret; + +@@ -533,7 +539,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush) + + mutex_lock(&j->reclaim_lock); + +- journal_flush_pins(j, seq_to_flush, 0); ++ *did_work = journal_flush_pins(j, seq_to_flush, 0); + + spin_lock(&j->lock); + /* +@@ -551,12 +557,17 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush) + return ret; + } + +-void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) ++bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) + { ++ bool did_work = false; ++ + if (!test_bit(JOURNAL_STARTED, &j->flags)) +- return; ++ return false; ++ ++ closure_wait_event(&j->async_wait, ++ journal_flush_done(j, seq_to_flush, &did_work)); + +- closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush)); ++ return did_work; + } + + int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index 3ef641f7ce30..272ba8a37967 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -53,11 +53,11 @@ void bch2_journal_do_discards(struct journal *); + void bch2_journal_reclaim(struct journal *); + void bch2_journal_reclaim_work(struct work_struct *); + +-void bch2_journal_flush_pins(struct journal *, u64); ++bool bch2_journal_flush_pins(struct journal *, u64); + +-static inline void bch2_journal_flush_all_pins(struct journal *j) ++static inline bool bch2_journal_flush_all_pins(struct journal *j) + { +- bch2_journal_flush_pins(j, U64_MAX); ++ return bch2_journal_flush_pins(j, U64_MAX); + } + + int bch2_journal_flush_device_pins(struct journal *, int); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index a57afb55e90a..efc1cac1d412 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -168,7 +168,7 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) + static void __bch2_fs_read_only(struct bch_fs *c) + { + struct bch_dev *ca; +- bool wrote; ++ bool 
wrote = false; + unsigned i, clean_passes = 0; + int ret; + +@@ -193,39 +193,46 @@ static void __bch2_fs_read_only(struct bch_fs *c) + goto nowrote_alloc; + + bch_verbose(c, "writing alloc info"); ++ /* ++ * This should normally just be writing the bucket read/write clocks: ++ */ ++ ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: ++ bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); ++ bch_verbose(c, "writing alloc info complete"); + +- do { +- wrote = false; ++ if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) ++ bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); + +- ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: +- bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); ++ if (ret) ++ goto nowrote_alloc; + +- if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) +- bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); ++ bch_verbose(c, "flushing journal and stopping allocators"); + +- if (ret) +- goto nowrote_alloc; ++ bch2_journal_flush_all_pins(&c->journal); ++ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); + +- for_each_member_device(ca, c, i) +- bch2_dev_allocator_quiesce(c, ca); ++ do { ++ clean_passes++; + +- bch2_journal_flush_all_pins(&c->journal); ++ if (bch2_journal_flush_all_pins(&c->journal)) ++ clean_passes = 0; + + /* +- * We need to explicitly wait on btree interior updates to complete +- * before stopping the journal, flushing all journal pins isn't +- * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree +- * interior updates have to drop their journal pin before they're +- * fully complete: ++ * In flight interior btree updates will generate more journal ++ * updates and btree updates (alloc btree): + */ +- closure_wait_event(&c->btree_interior_update_wait, +- !bch2_btree_interior_updates_nr_pending(c)); ++ if (bch2_btree_interior_updates_nr_pending(c)) { ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ clean_passes = 0; ++ } + flush_work(&c->btree_interior_update_work); + +- clean_passes = wrote ? 0 : clean_passes + 1; ++ if (bch2_journal_flush_all_pins(&c->journal)) ++ clean_passes = 0; + } while (clean_passes < 2); ++ bch_verbose(c, "flushing journal and stopping allocators complete"); + +- bch_verbose(c, "writing alloc info complete"); + set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); + nowrote_alloc: + closure_wait_event(&c->btree_interior_update_wait, +@@ -236,11 +243,10 @@ nowrote_alloc: + bch2_dev_allocator_stop(ca); + + clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); ++ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); + + bch2_fs_journal_stop(&c->journal); + +- /* XXX: mark super that alloc info is persistent */ +- + /* + * the journal kicks off btree writes via reclaim - wait for in flight + * writes after stopping journal: +-- +cgit v1.2.3 + + +From 2701b564d7716ca40f11b4464d7952a132f06064 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 2 Jun 2020 16:30:54 -0400 +Subject: bcachefs: Add an option to disable reflink support + +Reflink might be buggy, so we're adding an option so users can help +bisect what's going on. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 2 ++ + fs/bcachefs/fs-io.c | 3 +++ + fs/bcachefs/opts.h | 5 +++++ + fs/bcachefs/reflink.c | 3 +++ + 4 files changed, 13 insertions(+) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 616863ef77d4..f808e63a713d 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1262,6 +1262,8 @@ LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); + + LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); + ++LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); ++ + /* 61-64 unused */ + + LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 9f7e011e4b35..e5088402c37d 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2870,6 +2870,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + u64 aligned_len; + loff_t ret = 0; + ++ if (!c->opts.reflink) ++ return -EOPNOTSUPP; ++ + if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) + return -EINVAL; + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index ba4903352343..71ebace78453 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -207,6 +207,11 @@ enum opt_type { + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false, \ + NULL, "Enable project quotas") \ ++ x(reflink, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_REFLINK, true, \ ++ NULL, "Enable reflink support") \ + x(degraded, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 2f223be74926..3c473f1380a6 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -167,6 +167,9 @@ s64 bch2_remap_range(struct bch_fs *c, + u64 src_done, dst_done; + int ret = 0, ret2 = 0; + ++ if (!c->opts.reflink) ++ return -EOPNOTSUPP; ++ + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; + +-- +cgit v1.2.3 + + +From 82b7540a1b7e919330c5098c87f47ec74d18e676 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Jun 2020 16:20:22 -0400 +Subject: bcachefs: Set filesystem features earlier in fs init path + +Before we were setting features after allocating btree nodes, which +meant we were using the old btree pointer format. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index e7f65fa21151..c478d19e5691 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1154,6 +1154,15 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_mark_dev_superblock(c, ca, 0); + mutex_unlock(&c->sb_lock); + ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version = c->disk_sb.sb->version_min = ++ le16_to_cpu(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + +@@ -1212,11 +1221,6 @@ int bch2_fs_initialize(struct bch_fs *c) + goto err; + + mutex_lock(&c->sb_lock); +- c->disk_sb.sb->version = c->disk_sb.sb->version_min = +- le16_to_cpu(bcachefs_metadata_version_current); +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; +- c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; +- + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + +-- +cgit v1.2.3 + + +From e6f533b5cad024d1331b55ea703789e5e5109829 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 2 Jun 2020 16:36:11 -0400 +Subject: bcachefs: Add debug code to print btree transactions + +Intented to help debug deadlocks, since we can't use lockdep to check +btree node lock ordering. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 +++ + fs/bcachefs/btree_iter.c | 62 ++++++++++++++++++++++++++++++++++++++++++++- + fs/bcachefs/btree_iter.h | 2 ++ + fs/bcachefs/btree_locking.h | 12 ++++++++- + fs/bcachefs/btree_types.h | 4 +++ + fs/bcachefs/clock.c | 2 +- + fs/bcachefs/journal.c | 4 +-- + fs/bcachefs/sysfs.c | 8 ++++++ + 8 files changed, 92 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 5b1537304abd..d58ee567bcd6 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -628,6 +628,9 @@ struct bch_fs { + struct workqueue_struct *btree_interior_update_worker; + struct work_struct btree_interior_update_work; + ++ /* btree_iter.c: */ ++ struct mutex btree_trans_lock; ++ struct list_head btree_trans_list; + mempool_t btree_iters_pool; + + struct workqueue_struct *wq; +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 1cded0540af5..29929298a1a9 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1912,7 +1912,7 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) { +- pr_err("iter: btree %s pos %llu:%llu%s%s%s %pf", ++ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, +@@ -2192,12 +2192,24 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + + if (expected_mem_bytes) + bch2_trans_preload_mem(trans, expected_mem_bytes); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ mutex_lock(&c->btree_trans_lock); ++ list_add(&trans->list, &c->btree_trans_list); ++ mutex_unlock(&c->btree_trans_lock); ++#endif + } + + int bch2_trans_exit(struct btree_trans *trans) + { + bch2_trans_unlock(trans); + ++#ifdef CONFIG_BCACHEFS_DEBUG ++ mutex_lock(&trans->c->btree_trans_lock); ++ list_del(&trans->list); ++ 
mutex_unlock(&trans->c->btree_trans_lock); ++#endif ++ + kfree(trans->fs_usage_deltas); + kfree(trans->mem); + if (trans->used_mempool) +@@ -2210,6 +2222,51 @@ int bch2_trans_exit(struct btree_trans *trans) + return trans->error ? -EIO : 0; + } + ++void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree_trans *trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned l; ++ ++ mutex_lock(&c->btree_trans_lock); ++ list_for_each_entry(trans, &c->btree_trans_list, list) { ++ pr_buf(out, "%ps\n", (void *) trans->ip); ++ ++ trans_for_each_iter(trans, iter) { ++ if (!iter->nodes_locked) ++ continue; ++ ++ pr_buf(out, " iter %s:", bch2_btree_ids[iter->btree_id]); ++ bch2_bpos_to_text(out, iter->pos); ++ pr_buf(out, "\n"); ++ ++ for (l = 0; l < BTREE_MAX_DEPTH; l++) { ++ if (btree_node_locked(iter, l)) { ++ b = iter->l[l].b; ++ ++ pr_buf(out, " %p l=%u %s ", ++ b, l, btree_node_intent_locked(iter, l) ? "i" : "r"); ++ bch2_bpos_to_text(out, b->key.k.p); ++ pr_buf(out, "\n"); ++ } ++ } ++ } ++ ++ b = READ_ONCE(trans->locking); ++ if (b) { ++ pr_buf(out, " locking %px l=%u %s:", ++ b, b->level, ++ bch2_btree_ids[b->btree_id]); ++ bch2_bpos_to_text(out, b->key.k.p); ++ pr_buf(out, "\n"); ++ } ++ } ++ mutex_unlock(&c->btree_trans_lock); ++#endif ++} ++ + void bch2_fs_btree_iter_exit(struct bch_fs *c) + { + mempool_exit(&c->btree_iters_pool); +@@ -2219,6 +2276,9 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) + { + unsigned nr = BTREE_ITER_MAX; + ++ INIT_LIST_HEAD(&c->btree_trans_list); ++ mutex_init(&c->btree_trans_lock); ++ + return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * nr + + sizeof(struct btree_insert_entry) * nr + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 6456787a8f77..841a5834f1a8 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -303,6 +303,8 @@ void *bch2_trans_kmalloc(struct btree_trans *, size_t); + void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); + int bch2_trans_exit(struct btree_trans *); + ++void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); ++ + void bch2_fs_btree_iter_exit(struct bch_fs *); + int bch2_fs_btree_iter_init(struct bch_fs *); + +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index bb4f66646da2..730a9dc89de8 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -182,11 +182,21 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, + struct btree_iter *iter, + enum six_lock_type type) + { ++ bool ret; ++ + EBUG_ON(level >= BTREE_MAX_DEPTH); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ iter->trans->locking = b; ++#endif + +- return likely(six_trylock_type(&b->lock, type)) || ++ ret = likely(six_trylock_type(&b->lock, type)) || + btree_node_lock_increment(iter, b, level, type) || + __bch2_btree_node_lock(b, pos, level, iter, type); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ iter->trans->locking = NULL; ++#endif ++ return ret; + } + + bool __bch2_btree_node_relock(struct btree_iter *, unsigned); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 8357b5251a43..b86d7369eb2d 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -281,6 +281,10 @@ struct btree_insert_entry { + + struct btree_trans { + struct bch_fs *c; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct list_head list; ++ struct btree *locking; ++#endif + unsigned long ip; + + u64 iters_linked; +diff --git 
a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +index d9de0d1302e2..a9f5d5696622 100644 +--- a/fs/bcachefs/clock.c ++++ b/fs/bcachefs/clock.c +@@ -162,7 +162,7 @@ ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) + now = atomic_long_read(&clock->now); + + for (i = 0; i < clock->timers.used; i++) +- pr_buf(&out, "%pf:\t%li\n", ++ pr_buf(&out, "%ps:\t%li\n", + clock->timers.data[i]->fn, + clock->timers.data[i]->expire - now); + spin_unlock(&clock->timer_lock); +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 32999161bdd8..17dc60d98dc3 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1235,14 +1235,14 @@ ssize_t bch2_journal_print_pins(struct journal *j, char *buf) + i, atomic_read(&pin_list->count)); + + list_for_each_entry(pin, &pin_list->list, list) +- pr_buf(&out, "\t%p %pf\n", ++ pr_buf(&out, "\t%px %ps\n", + pin, pin->flush); + + if (!list_empty(&pin_list->flushed)) + pr_buf(&out, "flushed:\n"); + + list_for_each_entry(pin, &pin_list->flushed, list) +- pr_buf(&out, "\t%p %pf\n", ++ pr_buf(&out, "\t%px %ps\n", + pin, pin->flush); + } + spin_unlock(&j->lock); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index d78ffcc0e8a4..5f2bc933b0e9 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -166,6 +166,7 @@ read_attribute(journal_debug); + read_attribute(journal_pins); + read_attribute(btree_updates); + read_attribute(dirty_btree_nodes); ++read_attribute(btree_transactions); + + read_attribute(internal_uuid); + +@@ -401,6 +402,12 @@ SHOW(bch2_fs) + + if (attr == &sysfs_dirty_btree_nodes) + return bch2_dirty_btree_nodes_print(c, buf); ++ if (attr == &sysfs_btree_transactions) { ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ bch2_btree_trans_to_text(&out, c); ++ return out.pos - buf; ++ } + + if (attr == &sysfs_compression_stats) + return bch2_compression_stats(c, buf); +@@ -571,6 +578,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_journal_pins, + &sysfs_btree_updates, + &sysfs_dirty_btree_nodes, ++ &sysfs_btree_transactions, + + &sysfs_read_realloc_races, + &sysfs_extent_migrate_done, +-- +cgit v1.2.3 + + +From d4c9d10c42b37b723efbd9150175ba5b6e234c04 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 2 Jun 2020 19:41:47 -0400 +Subject: bcachefs: Fix a deadlock in bch2_btree_node_get_sibling() + +There was a bad interaction with bch2_btree_iter_set_pos_same_leaf(), +which can leave a btree node locked that is just outside iter->pos, +breaking the lock ordering checks in __bch2_btree_node_lock(). Ideally +we should get rid of this corner case, but for now fix it locally with +verbose comments. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 12 ++++++++++++ + fs/bcachefs/btree_iter.c | 18 +++++++++++++++--- + fs/bcachefs/btree_iter.h | 9 +-------- + fs/bcachefs/btree_types.h | 1 + + 4 files changed, 29 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 4868137ecc88..1f62c4556ca7 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -850,6 +850,18 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + if (!parent) + return NULL; + ++ /* ++ * There's a corner case where a btree_iter might have a node locked ++ * that is just outside its current pos - when ++ * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. 
++ * ++ * But the lock ordering checks in __bch2_btree_node_lock() go off of ++ * iter->pos, not the node's key: so if the iterator is marked as ++ * needing to be traversed, we risk deadlock if we don't bail out here: ++ */ ++ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) ++ return ERR_PTR(-EINTR); ++ + if (!bch2_btree_node_relock(iter, level + 1)) { + ret = ERR_PTR(-EINTR); + goto out; +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 29929298a1a9..6abcbe3debe5 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -205,8 +205,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + if (!linked->nodes_locked) + continue; + +- /* * Must lock btree nodes in key order: */ +- if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) ++ /* Must lock btree nodes in key order: */ ++ if ((cmp_int(iter->btree_id, linked->btree_id) ?: ++ bkey_cmp(pos, linked->pos)) < 0) + ret = false; + + /* +@@ -1320,6 +1321,16 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ + + btree_iter_advance_to_pos(iter, l, -1); + ++ /* ++ * XXX: ++ * keeping a node locked that's outside (even just outside) iter->pos ++ * breaks __bch2_btree_node_lock(). This seems to only affect ++ * bch2_btree_node_get_sibling so for now it's fixed there, but we ++ * should try to get rid of this corner case. ++ * ++ * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) ++ */ ++ + if (bch2_btree_node_iter_end(&l->iter) && + btree_iter_pos_after_node(iter, l->b)) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +@@ -2194,6 +2205,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + bch2_trans_preload_mem(trans, expected_mem_bytes); + + #ifdef CONFIG_BCACHEFS_DEBUG ++ trans->pid = current->pid; + mutex_lock(&c->btree_trans_lock); + list_add(&trans->list, &c->btree_trans_list); + mutex_unlock(&c->btree_trans_lock); +@@ -2232,7 +2244,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { +- pr_buf(out, "%ps\n", (void *) trans->ip); ++ pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); + + trans_for_each_iter(trans, iter) { + if (!iter->nodes_locked) +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 841a5834f1a8..ab35fcd8b8b4 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -172,17 +172,10 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); + void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); + void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + +-static inline int __btree_iter_cmp(enum btree_id id, +- struct bpos pos, +- const struct btree_iter *r) +-{ +- return cmp_int(id, r->btree_id) ?: bkey_cmp(pos, r->pos); +-} +- + static inline int btree_iter_cmp(const struct btree_iter *l, + const struct btree_iter *r) + { +- return __btree_iter_cmp(l->btree_id, l->pos, r); ++ return cmp_int(l->btree_id, r->btree_id) ?: bkey_cmp(l->pos, r->pos); + } + + /* +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index b86d7369eb2d..e97248ca3aa2 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -284,6 +284,7 @@ struct btree_trans { + #ifdef CONFIG_BCACHEFS_DEBUG + struct list_head list; + struct btree *locking; ++ pid_t pid; + #endif + unsigned long ip; + +-- +cgit v1.2.3 + + +From 115d7721b8f778bdeb91896b92be99df38de0d6c Mon Sep 17 00:00:00 2001 +From: Kent 
Overstreet +Date: Wed, 3 Jun 2020 18:27:07 -0400 +Subject: bcachefs: Improve assorted error messages + +This also consolidates the various checks in bch2_mark_pointer() and +bch2_trans_mark_pointer(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 15 +-- + fs/bcachefs/buckets.c | 243 +++++++++++++++++++++++-------------------------- + fs/bcachefs/error.h | 1 + + fs/bcachefs/extents.c | 2 +- + fs/bcachefs/fsck.c | 2 +- + 5 files changed, 127 insertions(+), 136 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 63063748d4f5..6a42ce2522fd 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -631,14 +631,14 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct btree *b, struct bset *i, + unsigned offset, int write) + { +- pr_buf(out, "error validating btree node %s" +- "at btree %u level %u/%u\n" +- "pos %llu:%llu node offset %u", ++ pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" ++ "pos ", + write ? "before write " : "", + b->btree_id, b->level, +- c->btree_roots[b->btree_id].level, +- b->key.k.p.inode, b->key.k.p.offset, +- b->written); ++ c->btree_roots[b->btree_id].level); ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ ++ pr_buf(out, " node offset %u", b->written); + if (i) + pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + } +@@ -944,7 +944,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + + btree_err_on(b->data->keys.seq != bp->seq, + BTREE_ERR_MUST_RETRY, c, b, NULL, +- "got wrong btree node"); ++ "got wrong btree node (seq %llx want %llx)", ++ b->data->keys.seq, bp->seq); + } + + while (b->written < c->opts.btree_node_size) { +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 49a70ea21979..41e91bd70dde 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -918,61 +918,117 @@ static void bucket_set_stripe(struct bch_fs *c, + } + } + +-static bool bch2_mark_pointer(struct bch_fs *c, +- struct extent_ptr_decoded p, +- s64 sectors, enum bch_data_type data_type, +- struct bch_fs_usage *fs_usage, +- u64 journal_seq, unsigned flags) ++static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 bucket_gen, u8 *bucket_data_type, ++ u16 *dirty_sectors, u16 *cached_sectors) ++{ ++ u16 *dst_sectors = !p.ptr.cached ++ ? 
dirty_sectors ++ : cached_sectors; ++ u16 orig_sectors = *dst_sectors; ++ char buf[200]; ++ ++ if (gen_after(p.ptr.gen, bucket_gen)) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (gen_cmp(bucket_gen, p.ptr.gen) >= 96U) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (bucket_gen != p.ptr.gen && !p.ptr.cached) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (bucket_gen != p.ptr.gen) ++ return 1; ++ ++ if (*bucket_data_type && *bucket_data_type != ptr_data_type) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type], ++ bch2_data_types[ptr_data_type], ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (checked_add(*dst_sectors, sectors)) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ orig_sectors, sectors, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ *bucket_data_type = *dirty_sectors || *cached_sectors ++ ? 
ptr_data_type : 0; ++ return 0; ++} ++ ++static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; + struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); +- u16 *dst_sectors, orig_sectors; +- bool overflow; ++ u8 bucket_data_type; + u64 v; ++ int ret; + + v = atomic64_read(&g->_mark.v); + do { + new.v.counter = old.v.counter = v; ++ bucket_data_type = new.data_type; + +- /* +- * Check this after reading bucket mark to guard against +- * the allocator invalidating a bucket after we've already +- * checked the gen +- */ +- if (gen_after(p.ptr.gen, new.gen)) { +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "pointer gen in the future"); +- return true; +- } +- +- if (new.gen != p.ptr.gen) { +- /* XXX write repair code for this */ +- if (!p.ptr.cached && +- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "stale dirty pointer"); +- return true; +- } +- +- dst_sectors = !p.ptr.cached +- ? &new.dirty_sectors +- : &new.cached_sectors; +- orig_sectors = *dst_sectors; +- +- overflow = checked_add(*dst_sectors, sectors); ++ ret = __mark_pointer(c, k, p, sectors, data_type, new.gen, ++ &bucket_data_type, ++ &new.dirty_sectors, ++ &new.cached_sectors); ++ if (ret) ++ return ret; + +- if (!new.dirty_sectors && +- !new.cached_sectors) { +- new.data_type = 0; ++ new.data_type = bucket_data_type; + +- if (journal_seq) { +- new.journal_seq_valid = 1; +- new.journal_seq = journal_seq; +- } +- } else { +- new.data_type = data_type; ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; + } + + if (flags & BTREE_TRIGGER_NOATOMIC) { +@@ -983,25 +1039,11 @@ static bool bch2_mark_pointer(struct bch_fs *c, + old.v.counter, + new.v.counter)) != old.v.counter); + +- if (old.data_type && old.data_type != data_type) +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", +- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), +- new.gen, +- bch2_data_types[old.data_type], +- bch2_data_types[data_type]); +- +- bch2_fs_inconsistent_on(overflow, c, +- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX", +- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), new.gen, +- bch2_data_types[old.data_type ?: data_type], +- orig_sectors, sectors); +- + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + + BUG_ON(!gc && bucket_became_unavailable(old, new)); + +- return false; ++ return 0; + } + + static int bch2_mark_stripe_ptr(struct bch_fs *c, +@@ -1065,6 +1107,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, + struct extent_ptr_decoded p; + struct bch_replicas_padded r; + s64 dirty_sectors = 0; ++ bool stale; + int ret; + + r.e.data_type = data_type; +@@ -1077,8 +1120,13 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, + s64 disk_sectors = data_type == BCH_DATA_BTREE + ? 
sectors + : ptr_disk_sectors_delta(p, offset, sectors, flags); +- bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, +- fs_usage, journal_seq, flags); ++ ++ ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, ++ fs_usage, journal_seq, flags); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; + + if (p.ptr.cached) { + if (!stale) +@@ -1439,25 +1487,24 @@ static int trans_get_key(struct btree_trans *trans, + } + + static int bch2_trans_mark_pointer(struct btree_trans *trans, +- struct extent_ptr_decoded p, ++ struct bkey_s_c k, struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) + { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct btree_iter *iter; +- struct bkey_s_c k; ++ struct bkey_s_c k_a; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; +- u16 *dst_sectors, orig_sectors; + int ret; + + ret = trans_get_key(trans, BTREE_ID_ALLOC, + POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), +- &iter, &k); ++ &iter, &k_a); + if (ret < 0) + return ret; + +- if (k.k->type != KEY_TYPE_alloc || ++ if (k_a.k->type != KEY_TYPE_alloc || + (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags)))) { + /* + * During journal replay, and if gc repairs alloc info at +@@ -1474,71 +1521,13 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + u = alloc_mem_to_key(g, m); + percpu_up_read(&c->mark_lock); + } else { +- u = bch2_alloc_unpack(k); ++ u = bch2_alloc_unpack(k_a); + } + +- if (u.gen != p.ptr.gen) { +- ret = 1; +- +- if (gen_after(p.ptr.gen, u.gen)) { +- bch2_fs_inconsistent(c, +- "bucket %llu:%llu gen %u data type %s: ptr gen %u newer than bucket gen", +- iter->pos.inode, iter->pos.offset, u.gen, +- bch2_data_types[u.data_type ?: data_type], +- p.ptr.gen); +- ret = -EIO; +- } +- +- if (gen_cmp(u.gen, p.ptr.gen) >= 96U) { +- bch2_fs_inconsistent(c, +- "bucket %llu:%llu gen %u data type %s: ptr gen %u too stale", +- iter->pos.inode, iter->pos.offset, u.gen, +- bch2_data_types[u.data_type ?: data_type], +- p.ptr.gen); +- ret = -EIO; +- } +- +- if (!p.ptr.cached) { +- bch2_fs_inconsistent(c, +- "bucket %llu:%llu gen %u data type %s: stale dirty ptr (gen %u)", +- iter->pos.inode, iter->pos.offset, u.gen, +- bch2_data_types[u.data_type ?: data_type], +- p.ptr.gen); +- ret = -EIO; +- } +- +- goto out; +- } +- +- if (u.data_type && u.data_type != data_type) { +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s", +- iter->pos.inode, iter->pos.offset, +- u.gen, +- bch2_data_types[u.data_type], +- bch2_data_types[data_type]); +- ret = -1; +- goto out; +- } +- +- dst_sectors = !p.ptr.cached +- ? &u.dirty_sectors +- : &u.cached_sectors; +- orig_sectors = *dst_sectors; +- +- if (checked_add(*dst_sectors, sectors)) { +- bch2_fs_inconsistent(c, +- "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %lli > U16_MAX", +- iter->pos.inode, iter->pos.offset, u.gen, +- bch2_data_types[u.data_type ?: data_type], +- orig_sectors, sectors); +- /* return an error indicating that we need full fsck */ +- ret = -EIO; ++ ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, ++ &u.dirty_sectors, &u.cached_sectors); ++ if (ret) + goto out; +- } +- +- u.data_type = u.dirty_sectors || u.cached_sectors +- ? data_type : 0; + + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = PTR_ERR_OR_ZERO(a); +@@ -1623,7 +1612,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + ? 
sectors + : ptr_disk_sectors_delta(p, offset, sectors, flags); + +- ret = bch2_trans_mark_pointer(trans, p, disk_sectors, ++ ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, + data_type); + if (ret < 0) + return ret; +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index de319794ccd1..94b53312fbbd 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -102,6 +102,7 @@ struct fsck_err_state { + #define FSCK_CAN_IGNORE (1 << 1) + #define FSCK_NEED_FSCK (1 << 2) + ++__printf(3, 4) __cold + enum fsck_err_ret bch2_fsck_err(struct bch_fs *, + unsigned, const char *, ...); + void bch2_flush_fsck_errs(struct bch_fs *); +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index d1a4ab04fbbf..251d4af773a5 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -220,7 +220,7 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + +- pr_buf(out, "seq %llu sectors %u written %u min_key ", ++ pr_buf(out, "seq %llx sectors %u written %u min_key ", + le64_to_cpu(bp.v->seq), + le16_to_cpu(bp.v->sectors), + le16_to_cpu(bp.v->sectors_written)); +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 3ab621c62c43..c6ca5968a2e0 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1169,7 +1169,7 @@ static int check_inode_nlink(struct bch_fs *c, + } + + if (!S_ISDIR(u->bi_mode) && link->dir_count) { +- need_fsck_err(c, "non directory with subdirectories", ++ need_fsck_err(c, "non directory with subdirectories (inum %llu)", + u->bi_inum); + return 0; + } +-- +cgit v1.2.3 + + +From e0b56324d1f1e70e57f8a21776752f0bf23e941a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Jun 2020 22:11:10 -0400 +Subject: bcachefs: Kill old allocator startup code + +It's not needed anymore since we can now write to buckets before +updating the alloc btree. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 235 ----------------------------------------- + fs/bcachefs/alloc_background.h | 4 +- + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/journal_reclaim.c | 8 -- + fs/bcachefs/super.c | 10 -- + 5 files changed, 1 insertion(+), 257 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 03f8dceaa686..482520761f76 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -968,31 +968,6 @@ out: + return ret < 0 ? 
ret : 0; + } + +-static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, +- size_t bucket, u64 *flush_seq) +-{ +- struct bucket_mark m; +- +- percpu_down_read(&c->mark_lock); +- spin_lock(&c->freelist_lock); +- +- bch2_invalidate_bucket(c, ca, bucket, &m); +- +- verify_not_on_freelist(c, ca, bucket); +- BUG_ON(!fifo_push(&ca->free_inc, bucket)); +- +- spin_unlock(&c->freelist_lock); +- +- bucket_io_clock_reset(c, ca, bucket, READ); +- bucket_io_clock_reset(c, ca, bucket, WRITE); +- +- percpu_up_read(&c->mark_lock); +- +- *flush_seq = max(*flush_seq, bucket_journal_seq(c, m)); +- +- return m.cached_sectors != 0; +-} +- + /* + * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: + */ +@@ -1448,216 +1423,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca) + return 0; + } + +-static bool flush_held_btree_writes(struct bch_fs *c) +-{ +- struct bucket_table *tbl; +- struct rhash_head *pos; +- struct btree *b; +- bool nodes_unwritten; +- size_t i; +-again: +- cond_resched(); +- nodes_unwritten = false; +- +- if (bch2_journal_error(&c->journal)) +- return true; +- +- rcu_read_lock(); +- for_each_cached_btree(b, c, tbl, i, pos) +- if (btree_node_need_write(b)) { +- if (btree_node_may_write(b)) { +- rcu_read_unlock(); +- btree_node_lock_type(c, b, SIX_LOCK_read); +- bch2_btree_node_write(c, b, SIX_LOCK_read); +- six_unlock_read(&b->lock); +- goto again; +- } else { +- nodes_unwritten = true; +- } +- } +- rcu_read_unlock(); +- +- return !nodes_unwritten && +- !bch2_btree_interior_updates_nr_pending(c); +-} +- +-static void allocator_start_issue_discards(struct bch_fs *c) +-{ +- struct bch_dev *ca; +- unsigned dev_iter; +- size_t bu; +- +- for_each_rw_member(ca, c, dev_iter) +- while (fifo_pop(&ca->free_inc, bu)) +- blkdev_issue_discard(ca->disk_sb.bdev, +- bucket_to_sector(ca, bu), +- ca->mi.bucket_size, GFP_NOIO, 0); +-} +- +-static int resize_free_inc(struct bch_dev *ca) +-{ +- alloc_fifo free_inc; +- +- if (!fifo_full(&ca->free_inc)) +- return 0; +- +- if (!init_fifo(&free_inc, +- ca->free_inc.size * 2, +- GFP_KERNEL)) +- return -ENOMEM; +- +- fifo_move(&free_inc, &ca->free_inc); +- swap(free_inc, ca->free_inc); +- free_fifo(&free_inc); +- return 0; +-} +- +-static bool bch2_fs_allocator_start_fast(struct bch_fs *c) +-{ +- struct bch_dev *ca; +- unsigned dev_iter; +- bool ret = true; +- +- if (test_alloc_startup(c)) +- return false; +- +- down_read(&c->gc_lock); +- +- /* Scan for buckets that are already invalidated: */ +- for_each_rw_member(ca, c, dev_iter) { +- struct bucket_array *buckets; +- struct bucket_mark m; +- long bu; +- +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); +- +- for (bu = buckets->first_bucket; +- bu < buckets->nbuckets; bu++) { +- m = READ_ONCE(buckets->b[bu].mark); +- +- if (!buckets->b[bu].gen_valid || +- !is_available_bucket(m) || +- m.cached_sectors || +- (ca->buckets_nouse && +- test_bit(bu, ca->buckets_nouse))) +- continue; +- +- percpu_down_read(&c->mark_lock); +- bch2_mark_alloc_bucket(c, ca, bu, true, +- gc_pos_alloc(c, NULL), 0); +- percpu_up_read(&c->mark_lock); +- +- fifo_push(&ca->free_inc, bu); +- +- discard_invalidated_buckets(c, ca); +- +- if (fifo_full(&ca->free[RESERVE_BTREE])) +- break; +- } +- up_read(&ca->bucket_lock); +- } +- +- up_read(&c->gc_lock); +- +- /* did we find enough buckets? 
*/ +- for_each_rw_member(ca, c, dev_iter) +- if (!fifo_full(&ca->free[RESERVE_BTREE])) +- ret = false; +- +- return ret; +-} +- +-int bch2_fs_allocator_start(struct bch_fs *c) +-{ +- struct bch_dev *ca; +- unsigned dev_iter; +- u64 journal_seq = 0; +- bool wrote; +- long bu; +- int ret = 0; +- +- if (!test_alloc_startup(c) && +- bch2_fs_allocator_start_fast(c)) +- return 0; +- +- pr_debug("not enough empty buckets; scanning for reclaimable buckets"); +- +- /* +- * We're moving buckets to freelists _before_ they've been marked as +- * invalidated on disk - we have to so that we can allocate new btree +- * nodes to mark them as invalidated on disk. +- * +- * However, we can't _write_ to any of these buckets yet - they might +- * have cached data in them, which is live until they're marked as +- * invalidated on disk: +- */ +- set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); +- +- down_read(&c->gc_lock); +- do { +- wrote = false; +- +- for_each_rw_member(ca, c, dev_iter) { +- find_reclaimable_buckets(c, ca); +- +- while (!fifo_full(&ca->free[RESERVE_BTREE]) && +- (bu = next_alloc_bucket(ca)) >= 0) { +- ret = resize_free_inc(ca); +- if (ret) { +- percpu_ref_put(&ca->io_ref); +- up_read(&c->gc_lock); +- goto err; +- } +- +- bch2_invalidate_one_bucket(c, ca, bu, +- &journal_seq); +- +- fifo_push(&ca->free[RESERVE_BTREE], bu); +- } +- } +- +- pr_debug("done scanning for reclaimable buckets"); +- +- /* +- * XXX: it's possible for this to deadlock waiting on journal reclaim, +- * since we're holding btree writes. What then? +- */ +- ret = bch2_alloc_write(c, +- BTREE_INSERT_NOCHECK_RW| +- BTREE_INSERT_USE_ALLOC_RESERVE| +- BTREE_INSERT_NOWAIT, &wrote); +- +- /* +- * If bch2_alloc_write() did anything, it may have used some +- * buckets, and we need the RESERVE_BTREE freelist full - so we +- * need to loop and scan again. 
+- * And if it errored, it may have been because there weren't +- * enough buckets, so just scan and loop again as long as it +- * made some progress: +- */ +- } while (wrote); +- up_read(&c->gc_lock); +- +- if (ret) +- goto err; +- +- pr_debug("flushing journal"); +- +- ret = bch2_journal_flush(&c->journal); +- if (ret) +- goto err; +- +- pr_debug("issuing discards"); +- allocator_start_issue_discards(c); +-err: +- clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); +- closure_wait_event(&c->btree_interior_update_wait, +- flush_held_btree_writes(c)); +- +- return ret; +-} +- + void bch2_fs_allocator_background_init(struct bch_fs *c) + { + spin_lock_init(&c->freelist_lock); +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 501c444353fb..b53a27450889 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -70,8 +70,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) + static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, + size_t bucket) + { +- if (expensive_debug_checks(c) && +- test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { ++ if (expensive_debug_checks(c)) { + size_t iter; + long i; + unsigned j; +@@ -94,7 +93,6 @@ void bch2_dev_allocator_stop(struct bch_dev *); + int bch2_dev_allocator_start(struct bch_dev *); + + int bch2_alloc_write(struct bch_fs *, unsigned, bool *); +-int bch2_fs_allocator_start(struct bch_fs *); + void bch2_fs_allocator_background_init(struct bch_fs *); + + #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index d58ee567bcd6..49519badf3e9 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -480,7 +480,6 @@ enum { + /* startup: */ + BCH_FS_ALLOC_READ_DONE, + BCH_FS_ALLOC_CLEAN, +- BCH_FS_ALLOCATOR_STARTED, + BCH_FS_ALLOCATOR_RUNNING, + BCH_FS_ALLOCATOR_STOPPING, + BCH_FS_INITIAL_GC_DONE, +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 5b3f2548561b..0cae90d6e053 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -28,17 +28,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, + struct journal_device *ja, + enum journal_space_from from) + { +- struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned available = (journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr; + +- /* +- * Allocator startup needs some journal space before we can do journal +- * replay: +- */ +- if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) +- --available; +- + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index efc1cac1d412..b939166ddbe6 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -416,16 +416,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + +- if (!test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { +- ret = bch2_fs_allocator_start(c); +- if (ret) { +- bch_err(c, "error initializing allocator"); +- goto err; +- } +- +- set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); +- } +- + for_each_rw_member(ca, c, i) { + ret = bch2_dev_allocator_start(ca); + if (ret) { +-- +cgit v1.2.3 + + +From 142c1ee7f11081cd8a2cd8181d82a7924fd55beb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Jun 2020 23:46:15 -0400 +Subject: bcachefs: Always increment bucket gen on bucket reuse + +Not doing so 
confuses copygc + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 38 ++++++++++++++++++++++++++++---------- + fs/bcachefs/buckets.c | 30 +++++++++++++++++++----------- + 2 files changed, 47 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 482520761f76..bdebb0eccd9c 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -860,12 +860,22 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + g = bucket(ca, b); + m = READ_ONCE(g->mark); + +- bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); ++ invalidating_cached_data = m.cached_sectors != 0; ++ ++ /* ++ * If we're not invalidating cached data, we only increment the bucket ++ * gen in memory here, the incremented gen will be updated in the btree ++ * by bch2_trans_mark_pointer(): ++ */ ++ ++ if (!invalidating_cached_data) ++ bch2_invalidate_bucket(c, ca, b, &m); ++ else ++ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + + spin_unlock(&c->freelist_lock); + percpu_up_read(&c->mark_lock); + +- invalidating_cached_data = m.cached_sectors != 0; + if (!invalidating_cached_data) + goto out; + +@@ -887,18 +897,26 @@ retry: + if (ret) + return ret; + +- /* +- * The allocator has to start before journal replay is finished - thus, +- * we have to trust the in memory bucket @m, not the version in the +- * btree: +- */ + percpu_down_read(&c->mark_lock); +- g = bucket(ca, b); ++ g = bucket(ca, iter->pos.offset); + m = READ_ONCE(g->mark); +- u = alloc_mem_to_key(g, m); ++ ++ if (unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) { ++ /* ++ * During journal replay, and if gc repairs alloc info at ++ * runtime, the alloc info in the btree might not be up to date ++ * yet - so, trust the in memory mark: ++ */ ++ u = alloc_mem_to_key(g, m); ++ } else { ++ u = bch2_alloc_unpack(k); ++ u.read_time = g->io_time[READ]; ++ u.write_time = g->io_time[WRITE]; ++ } ++ + percpu_up_read(&c->mark_lock); + +- invalidating_cached_data = m.cached_sectors != 0; ++ invalidating_cached_data = u.cached_sectors != 0; + + u.gen++; + u.data_type = 0; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 41e91bd70dde..0b15c0468892 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1496,6 +1496,8 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k_a; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; ++ struct bucket *g; ++ struct bucket_mark m; + int ret; + + ret = trans_get_key(trans, BTREE_ID_ALLOC, +@@ -1504,26 +1506,32 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + if (ret < 0) + return ret; + +- if (k_a.k->type != KEY_TYPE_alloc || +- (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags)))) { ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, iter->pos.offset); ++ m = READ_ONCE(g->mark); ++ ++ if (unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags) && !ret)) { + /* + * During journal replay, and if gc repairs alloc info at + * runtime, the alloc info in the btree might not be up to date + * yet - so, trust the in memory mark - unless we're already + * updating that key: + */ +- struct bucket *g; +- struct bucket_mark m; +- +- percpu_down_read(&c->mark_lock); +- g = bucket(ca, iter->pos.offset); +- m = READ_ONCE(g->mark); +- u = alloc_mem_to_key(g, m); +- percpu_up_read(&c->mark_lock); ++ u = alloc_mem_to_key(g, m); + } else { +- u = bch2_alloc_unpack(k_a); ++ u = bch2_alloc_unpack(k_a); ++ u.read_time = 
g->io_time[READ]; ++ u.write_time = g->io_time[WRITE]; + } + ++ percpu_up_read(&c->mark_lock); ++ ++ /* ++ * Incrementing the bucket gen can be done lazily: ++ */ ++ if (gen_after(m.gen, u.gen) && !u.data_type) ++ u.gen = m.gen; ++ + ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, + &u.dirty_sectors, &u.cached_sectors); + if (ret) +-- +cgit v1.2.3 + + +From d1b0216bbbce92eeea95b6a8bfd43a42278e0c25 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Jun 2020 23:47:50 -0400 +Subject: bcachefs: Improve warning for copygc failing to move data + +This will help narrow down which code is at fault when this happens. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 5 ++++- + fs/bcachefs/move_types.h | 1 + + fs/bcachefs/movinggc.c | 17 +++++++++++++++-- + 3 files changed, 20 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 11a92c099afd..b42350f9e9fb 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -178,9 +178,12 @@ next: + } + continue; + nomatch: +- if (m->ctxt) ++ if (m->ctxt) { ++ BUG_ON(k.k->p.offset <= iter->pos.offset); ++ atomic64_inc(&m->ctxt->stats->keys_raced); + atomic64_add(k.k->p.offset - iter->pos.offset, + &m->ctxt->stats->sectors_raced); ++ } + atomic_long_inc(&c->extent_migrate_raced); + trace_move_race(&new->k); + bch2_btree_iter_next_slot(iter); +diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h +index 6788170d3f95..fc0de165af9f 100644 +--- a/fs/bcachefs/move_types.h ++++ b/fs/bcachefs/move_types.h +@@ -8,6 +8,7 @@ struct bch_move_stats { + struct bpos pos; + + atomic64_t keys_moved; ++ atomic64_t keys_raced; + atomic64_t sectors_moved; + atomic64_t sectors_seen; + atomic64_t sectors_raced; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index e9cb2304576f..0a87cd7405dd 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -78,7 +78,17 @@ static bool __copygc_pred(struct bch_dev *ca, + ssize_t i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, &search); ++#if 0 ++ /* eytzinger search verify code: */ ++ ssize_t j = -1, k; + ++ for (k = 0; k < h->used; k++) ++ if (h->data[k].offset <= ptr->offset && ++ (j < 0 || h->data[k].offset > h->data[j].offset)) ++ j = k; ++ ++ BUG_ON(i != j); ++#endif + return (i >= 0 && + ptr->offset < h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].gen); +@@ -203,9 +213,12 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) + + if (sectors_not_moved && !ret) + bch_warn_ratelimited(c, +- "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", ++ "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", + sectors_not_moved, sectors_to_move, +- buckets_not_moved, buckets_to_move); ++ buckets_not_moved, buckets_to_move, ++ atomic64_read(&move_stats.sectors_moved), ++ atomic64_read(&move_stats.keys_raced), ++ atomic64_read(&move_stats.sectors_raced)); + + trace_copygc(ca, + atomic64_read(&move_stats.sectors_moved), sectors_not_moved, +-- +cgit v1.2.3 + + +From 1e8f8cc3e8256d84310b90c577e4a0070f1936e9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 8 Jun 2020 13:26:48 -0400 +Subject: bcachefs: bch2_trans_downgrade() + +bch2_btree_iter_downgrade() was looping over all iterators in a +transaction; bch2_trans_downgrade() should be doing that. 
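An aside on the patch that follows: the change is purely about where the "for each iterator" loop lives - the per-iterator helper keeps only the single-iterator work, and a new transaction-level helper owns the loop. A minimal userspace sketch of that split, using simplified stand-in types (struct iter / struct trans here are invented, not the real bcachefs structures):

#include <stdbool.h>
#include <stdio.h>

struct iter  { bool intent; unsigned locks_want; };
struct trans { struct iter iters[4]; unsigned nr; };

/* per-iterator: drop locks_want to the minimum this iterator needs */
static void iter_downgrade(struct iter *it)
{
        unsigned want = it->intent ? 1 : 0;

        if (it->locks_want > want)
                it->locks_want = want;
}

/* transaction-level: the "for each iterator" loop lives here now */
static void trans_downgrade(struct trans *t)
{
        for (unsigned i = 0; i < t->nr; i++)
                iter_downgrade(&t->iters[i]);
}

int main(void)
{
        struct trans t = { .iters = { { true, 3 }, { false, 2 } }, .nr = 2 };

        trans_downgrade(&t);
        for (unsigned i = 0; i < t.nr; i++)
                printf("iter %u: locks_want=%u\n", i, t.iters[i].locks_want);
        return 0;
}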
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 41 +++++++++++++++++++---------------------- + fs/bcachefs/btree_iter.h | 2 ++ + fs/bcachefs/btree_update_leaf.c | 3 +-- + 3 files changed, 22 insertions(+), 24 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 6abcbe3debe5..814b4f154c2c 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -349,31 +349,20 @@ bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, + void __bch2_btree_iter_downgrade(struct btree_iter *iter, + unsigned downgrade_to) + { +- struct btree_iter *linked; +- unsigned l; +- +- /* +- * We downgrade linked iterators as well because btree_iter_upgrade +- * might have had to modify locks_want on linked iterators due to lock +- * ordering: +- */ +- trans_for_each_iter(iter->trans, linked) { +- unsigned new_locks_want = downgrade_to ?: +- (linked->flags & BTREE_ITER_INTENT ? 1 : 0); +- +- if (linked->locks_want <= new_locks_want) +- continue; ++ unsigned l, new_locks_want = downgrade_to ?: ++ (iter->flags & BTREE_ITER_INTENT ? 1 : 0); + +- linked->locks_want = new_locks_want; ++ if (iter->locks_want < downgrade_to) { ++ iter->locks_want = new_locks_want; + +- while (linked->nodes_locked && +- (l = __fls(linked->nodes_locked)) >= linked->locks_want) { +- if (l > linked->level) { +- btree_node_unlock(linked, l); ++ while (iter->nodes_locked && ++ (l = __fls(iter->nodes_locked)) >= iter->locks_want) { ++ if (l > iter->level) { ++ btree_node_unlock(iter, l); + } else { +- if (btree_node_intent_locked(linked, l)) { +- six_lock_downgrade(&linked->l[l].b->lock); +- linked->nodes_intent_locked ^= 1 << l; ++ if (btree_node_intent_locked(iter, l)) { ++ six_lock_downgrade(&iter->l[l].b->lock); ++ iter->nodes_intent_locked ^= 1 << l; + } + break; + } +@@ -383,6 +372,14 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, + bch2_btree_trans_verify_locks(iter->trans); + } + ++void bch2_trans_downgrade(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ bch2_btree_iter_downgrade(iter); ++} ++ + /* Btree transaction locking: */ + + bool bch2_trans_relock(struct btree_trans *trans) +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index ab35fcd8b8b4..b11d2a30d9c7 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -136,6 +136,8 @@ static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) + __bch2_btree_iter_downgrade(iter, 0); + } + ++void bch2_trans_downgrade(struct btree_trans *); ++ + void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); + void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e343d80fede3..c44559cb412d 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -543,8 +543,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + trans->nounlock = false; + +- trans_for_each_update2(trans, i) +- bch2_btree_iter_downgrade(i->iter); ++ bch2_trans_downgrade(trans); + + return 0; + } +-- +cgit v1.2.3 + + +From 0e0663a90265a3dd5b26b4ee5eaa2a77461a8f6b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 8 Jun 2020 14:28:16 -0400 +Subject: bcachefs: Call bch2_btree_iter_traverse() if necessary in commit path + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git 
a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index c44559cb412d..5933ac1a6552 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -830,9 +830,9 @@ int __bch2_trans_commit(struct btree_trans *trans) + trans_trigger_run = false; + + trans_for_each_update(trans, i) { +- if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK)) { ++ if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && ++ (ret = bch2_btree_iter_traverse(i->iter)))) { + trace_trans_restart_traverse(trans->ip); +- ret = -EINTR; + goto out; + } + +-- +cgit v1.2.3 + + +From e41a5836e41c0c1a77b622fdaaf7939260aaed3a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 5 Jun 2020 09:01:23 -0400 +Subject: bcachefs: Check gfp_flags correctly in bch2_btree_cache_scan() + +bch2_btree_node_mem_alloc() uses memalloc_nofs_save()/GFP_NOFS, but +GFP_NOFS does include __GFP_IO - oops. We used to use GFP_NOIO, but as +we're a filesystem now GFP_NOFS makes more sense now and is looser. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 1f62c4556ca7..ea23cc23753e 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -241,7 +241,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + return SHRINK_STOP; + + /* Return -1 if we can't do anything right now */ +- if (sc->gfp_mask & __GFP_IO) ++ if (sc->gfp_mask & __GFP_FS) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) + return -1; +-- +cgit v1.2.3 + + +From 610743e77a0cd557e6e0eb5b7abc7973bf57e02c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Jun 2020 15:59:03 -0400 +Subject: bcachefs: btree_update_nodes_written() requires alloc reserve + +Also, in the btree_update_start() path, if we already have a journal +pre-reservation we don't want to take another - that's a deadlock. 
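The deadlock mentioned above is the usual re-entrant reservation problem: a path that already holds a slot in a bounded reservation pool must not block waiting for a second slot from the same pool. A minimal sketch of the guard the patch below adds, with invented names standing in for the real flag and API:

#include <stdbool.h>
#include <stdio.h>

#define JOURNAL_RESERVED (1 << 0)       /* caller already holds a pre-reservation */

/* stand-in: pretend this may block when the pool is exhausted */
static bool preres_get(void) { return true; }

static int update_start(unsigned flags)
{
        /*
         * Only take a new pre-reservation if the caller doesn't already
         * hold one; blocking for a second slot while holding the first
         * can deadlock once the pool is full.
         */
        if (!(flags & JOURNAL_RESERVED) && !preres_get())
                return -1;
        return 0;
}

int main(void)
{
        printf("%d\n", update_start(JOURNAL_RESERVED));
        return 0;
}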
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index c027c8106c81..57c47bca6333 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -529,6 +529,8 @@ static void btree_update_nodes_written(struct btree_update *as) + */ + ret = bch2_trans_do(c, &as->disk_res, &journal_seq, + BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RESERVED, + btree_update_nodes_written_trans(&trans, as)); +@@ -886,9 +888,10 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + bch2_keylist_init(&as->new_keys, as->_new_keys); + bch2_keylist_init(&as->parent_keys, as->inline_keys); + +- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, +- BTREE_UPDATE_JOURNAL_RES, +- JOURNAL_RES_GET_NONBLOCK); ++ if (!(flags & BTREE_INSERT_JOURNAL_RESERVED)) ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ JOURNAL_RES_GET_NONBLOCK); + if (ret == -EAGAIN) { + if (flags & BTREE_INSERT_NOUNLOCK) + return ERR_PTR(-EINTR); +-- +cgit v1.2.3 + + +From aa5c367ee417e32bd431cb8952f184e74e6dd92e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Jun 2020 15:46:22 -0400 +Subject: bcachefs: Make open bucket reserves more conservative + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 697d576802b6..979aba30bc9d 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -212,9 +212,9 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) + case RESERVE_ALLOC: + return 0; + case RESERVE_BTREE: +- return BTREE_NODE_OPEN_BUCKET_RESERVE; ++ return OPEN_BUCKETS_COUNT / 4; + default: +- return BTREE_NODE_OPEN_BUCKET_RESERVE * 2; ++ return OPEN_BUCKETS_COUNT / 2; + } + } + +-- +cgit v1.2.3 + + +From dbb33c26615d75a8a0d67f635a71735027e2b50d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Jun 2020 16:25:07 -0400 +Subject: bcachefs: Fix a linked list bug + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 57c47bca6333..943c27abfef1 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -803,7 +803,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + * operations complete + */ + list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { +- list_del(&p->write_blocked_list); ++ list_del_init(&p->write_blocked_list); + btree_update_reparent(as, p); + + /* +-- +cgit v1.2.3 + + +From b8000f221737482fc258f44dc58eb0649db31e29 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Jun 2020 17:49:24 -0400 +Subject: bcachefs: Don't allocate memory under the btree cache lock + +The btree cache lock is needed for reclaiming from the btree node cache, +and memory allocation can potentially spin and sleep (for 100 ms at a +time), so.. don't do that. 
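The shape the patch below moves to is the standard one for allocating while a lock protects a shared structure: drop the lock, do the potentially sleeping allocation, then retake the lock to publish the result (and handle losing the race). A small self-contained sketch of that pattern, with a pthread mutex standing in for the btree cache lock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static void *cached_buf;

static int cache_fill(size_t bytes)
{
        void *buf;

        /* allocate outside the lock: the allocator may block or sleep */
        buf = malloc(bytes);
        if (!buf)
                return -1;

        pthread_mutex_lock(&cache_lock);
        if (!cached_buf) {
                cached_buf = buf;       /* publish under the lock */
                buf = NULL;
        }
        pthread_mutex_unlock(&cache_lock);

        free(buf);                      /* lost the race: discard our copy */
        return 0;
}

int main(void)
{
        cache_fill(4096);
        printf("%p\n", cached_buf);
        return 0;
}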
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 87 +++++++++++++++++++++++++++++++---------------- + 1 file changed, 58 insertions(+), 29 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index ea23cc23753e..80718ffba767 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -72,24 +72,33 @@ static const struct rhashtable_params bch_btree_cache_params = { + .obj_cmpfn = bch2_btree_cache_cmp_fn, + }; + +-static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) + { +- struct btree_cache *bc = &c->btree_cache; ++ BUG_ON(b->data || b->aux_data); + + b->data = kvpmalloc(btree_bytes(c), gfp); + if (!b->data) +- goto err; ++ return -ENOMEM; + +- if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) +- goto err; ++ if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) { ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ return -ENOMEM; ++ } + +- bc->used++; +- list_move(&b->list, &bc->freeable); +- return; +-err: +- kvpfree(b->data, btree_bytes(c)); +- b->data = NULL; +- list_move(&b->list, &bc->freed); ++ return 0; ++} ++ ++static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (!__btree_node_data_alloc(c, b, gfp)) { ++ bc->used++; ++ list_move(&b->list, &bc->freeable); ++ } else { ++ list_move(&b->list, &bc->freed); ++ } + } + + static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) +@@ -524,35 +533,47 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) + */ + list_for_each_entry(b, &bc->freeable, list) + if (!btree_node_reclaim(c, b)) +- goto out_unlock; ++ goto got_node; + + /* + * We never free struct btree itself, just the memory that holds the on + * disk node. 
Check the freed list before allocating a new one: + */ + list_for_each_entry(b, &bc->freed, list) +- if (!btree_node_reclaim(c, b)) { +- btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); +- if (b->data) +- goto out_unlock; ++ if (!btree_node_reclaim(c, b)) ++ goto got_node; + +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); ++ b = NULL; ++got_node: ++ if (b) ++ list_del_init(&b->list); ++ mutex_unlock(&bc->lock); ++ ++ if (!b) { ++ b = kzalloc(sizeof(struct btree), GFP_KERNEL); ++ if (!b) + goto err; +- } + +- b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO); +- if (!b) +- goto err; ++ bkey_btree_ptr_init(&b->key); ++ six_lock_init(&b->lock); ++ INIT_LIST_HEAD(&b->list); ++ INIT_LIST_HEAD(&b->write_blocked); ++ ++ BUG_ON(!six_trylock_intent(&b->lock)); ++ BUG_ON(!six_trylock_write(&b->lock)); ++ } ++ ++ if (!b->data) { ++ if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) ++ goto err; ++ ++ mutex_lock(&bc->lock); ++ bc->used++; ++ mutex_unlock(&bc->lock); ++ } + +- BUG_ON(!six_trylock_intent(&b->lock)); +- BUG_ON(!six_trylock_write(&b->lock)); +-out_unlock: + BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_write_in_flight(b)); +- +- list_del_init(&b->list); +- mutex_unlock(&bc->lock); + out: + b->flags = 0; + b->written = 0; +@@ -568,6 +589,14 @@ out: + memalloc_nofs_restore(flags); + return b; + err: ++ mutex_lock(&bc->lock); ++ ++ if (b) { ++ list_add(&b->list, &bc->freed); ++ six_unlock_write(&b->lock); ++ six_unlock_intent(&b->lock); ++ } ++ + /* Try to cannibalize another cached btree node: */ + if (bc->alloc_lock == current) { + b = btree_node_cannibalize(c); +-- +cgit v1.2.3 + + +From 13c8d4859b11a6c4581a26633823534a8a7685f9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Jun 2020 15:44:03 -0400 +Subject: bcachefs: More open buckets + +We need a larger open bucket reserve now that the btree interior update +path holds onto open bucket references; filesystems with many high +through devices may need more open buckets now. 
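One detail worth noting in the patch below: raising OPEN_BUCKETS_COUNT from 256 to 1024 is what forces the freelist indices from u8 to the wider open_bucket_idx_t, since an 8-bit index can only address 256 entries. A two-minute illustration of the wraparound that a u8 index would otherwise hit:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint8_t  idx8  = 0;
        uint16_t idx16 = 0;

        for (int i = 0; i < 1024; i++) {
                idx8++;
                idx16++;
        }

        /* idx8 has wrapped back to 0 (1024 % 256 == 0); idx16 has not */
        printf("u8 index: %u, u16 index: %u\n", idx8, idx16);
        return 0;
}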
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_types.h | 16 +++++++++++----- + fs/bcachefs/bcachefs.h | 8 ++++---- + fs/bcachefs/btree_update_interior.h | 4 ++-- + 3 files changed, 17 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index 832568dc9551..4f1465077994 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -46,16 +46,22 @@ enum alloc_reserve { + + typedef FIFO(long) alloc_fifo; + +-/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ +-#define OPEN_BUCKETS_COUNT 256 ++#define OPEN_BUCKETS_COUNT 1024 + + #define WRITE_POINT_HASH_NR 32 + #define WRITE_POINT_MAX 32 + ++typedef u16 open_bucket_idx_t; ++ + struct open_bucket { + spinlock_t lock; + atomic_t pin; +- u8 freelist; ++ open_bucket_idx_t freelist; ++ ++ /* ++ * When an open bucket has an ec_stripe attached, this is the index of ++ * the block in the stripe this open_bucket corresponds to: ++ */ + u8 ec_idx; + u8 type; + unsigned valid:1; +@@ -68,8 +74,8 @@ struct open_bucket { + #define OPEN_BUCKET_LIST_MAX 15 + + struct open_buckets { +- u8 nr; +- u8 v[OPEN_BUCKET_LIST_MAX]; ++ open_bucket_idx_t nr; ++ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; + }; + + struct dev_stripe_state { +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 49519badf3e9..e1622ba8b9d9 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -428,8 +428,8 @@ struct bch_dev { + alloc_fifo free[RESERVE_NR]; + alloc_fifo free_inc; + +- u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; +- unsigned open_buckets_partial_nr; ++ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_partial_nr; + + size_t fifo_last_bucket; + +@@ -688,8 +688,8 @@ struct bch_fs { + struct closure_waitlist freelist_wait; + u64 blocked_allocate; + u64 blocked_allocate_open_bucket; +- u8 open_buckets_freelist; +- u8 open_buckets_nr_free; ++ open_bucket_idx_t open_buckets_freelist; ++ open_bucket_idx_t open_buckets_nr_free; + struct closure_waitlist open_buckets_wait; + struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; + +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index a6be62d3a18f..e00dc51ff3eb 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -92,9 +92,9 @@ struct btree_update { + struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; + unsigned nr_new_nodes; + +- u8 open_buckets[BTREE_UPDATE_NODES_MAX * ++ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * + BCH_REPLICAS_MAX]; +- u8 nr_open_buckets; ++ open_bucket_idx_t nr_open_buckets; + + unsigned journal_u64s; + u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; +-- +cgit v1.2.3 + + +From 67b401879e57ff14ae7e33e8ef529eddb8514674 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Jun 2020 20:54:36 -0400 +Subject: bcachefs: Always give out journal pre-res if we already have one + +This is better than skipping the journal pre-reservation if we already +have one - we should still acount for the journal reservation we're +going to have to get. 
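The fast path touched by the patch below is a lock-free reservation counter: speculatively add the needed units, and only commit the new state with a compare-and-exchange if the limit check passes - or unconditionally when called from the reclaim path, since refusing reclaim would just recurse back into reclaim. The real code packs reserved/remaining into one 64-bit word and uses atomic64_cmpxchg; here is a simplified C11 sketch of the same shape with a single counter and invented names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define GET_RECLAIM 1

static _Atomic unsigned reserved;
static unsigned remaining = 128;

static bool preres_get_fast(unsigned u64s, unsigned flags)
{
        unsigned old = atomic_load(&reserved), new;

        do {
                new = old + u64s;
                /* reclaim must always succeed to avoid deadlock */
                if (!(flags & GET_RECLAIM) && new > remaining)
                        return false;
        } while (!atomic_compare_exchange_weak(&reserved, &old, new));

        return true;
}

int main(void)
{
        printf("%d %d\n", preres_get_fast(100, 0), preres_get_fast(100, 0));
        printf("%d\n", preres_get_fast(100, GET_RECLAIM));
        return 0;
}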
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 15 +++++++++------ + fs/bcachefs/journal.c | 10 ++++++---- + fs/bcachefs/journal.h | 20 +++++++++++++++----- + 3 files changed, 30 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 943c27abfef1..9f849f52969f 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -864,8 +864,11 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + { + struct bch_fs *c = trans->c; + struct btree_update *as; +- int ret, disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ++ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) ++ ? JOURNAL_RES_GET_RECLAIM : 0; ++ int ret = 0; + + /* + * This check isn't necessary for correctness - it's just to potentially +@@ -888,10 +891,9 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + bch2_keylist_init(&as->new_keys, as->_new_keys); + bch2_keylist_init(&as->parent_keys, as->inline_keys); + +- if (!(flags & BTREE_INSERT_JOURNAL_RESERVED)) +- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, +- BTREE_UPDATE_JOURNAL_RES, +- JOURNAL_RES_GET_NONBLOCK); ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags|JOURNAL_RES_GET_NONBLOCK); + if (ret == -EAGAIN) { + if (flags & BTREE_INSERT_NOUNLOCK) + return ERR_PTR(-EINTR); +@@ -899,7 +901,8 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + bch2_trans_unlock(trans); + + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, +- BTREE_UPDATE_JOURNAL_RES, 0); ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags); + if (ret) + return ERR_PTR(ret); + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 17dc60d98dc3..ab4134305bba 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -428,9 +428,10 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, + + static bool journal_preres_available(struct journal *j, + struct journal_preres *res, +- unsigned new_u64s) ++ unsigned new_u64s, ++ unsigned flags) + { +- bool ret = bch2_journal_preres_get_fast(j, res, new_u64s); ++ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); + + if (!ret) + bch2_journal_reclaim_work(&j->reclaim_work.work); +@@ -440,13 +441,14 @@ static bool journal_preres_available(struct journal *j, + + int __bch2_journal_preres_get(struct journal *j, + struct journal_preres *res, +- unsigned new_u64s) ++ unsigned new_u64s, ++ unsigned flags) + { + int ret; + + closure_wait_event(&j->preres_wait, + (ret = bch2_journal_error(j)) || +- journal_preres_available(j, res, new_u64s)); ++ journal_preres_available(j, res, new_u64s, flags)); + return ret; + } + +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 997a28ae862e..30de6d96188e 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -299,6 +299,7 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, + #define JOURNAL_RES_GET_NONBLOCK (1 << 0) + #define JOURNAL_RES_GET_CHECK (1 << 1) + #define JOURNAL_RES_GET_RESERVED (1 << 2) ++#define JOURNAL_RES_GET_RECLAIM (1 << 3) + + static inline int journal_res_get_fast(struct journal *j, + struct journal_res *res, +@@ -406,11 +407,12 @@ static inline void bch2_journal_preres_put(struct journal *j, + } + + int 
__bch2_journal_preres_get(struct journal *, +- struct journal_preres *, unsigned); ++ struct journal_preres *, unsigned, unsigned); + + static inline int bch2_journal_preres_get_fast(struct journal *j, + struct journal_preres *res, +- unsigned new_u64s) ++ unsigned new_u64s, ++ unsigned flags) + { + int d = new_u64s - res->u64s; + union journal_preres_state old, new; +@@ -421,7 +423,15 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, + + new.reserved += d; + +- if (new.reserved > new.remaining) ++ /* ++ * If we're being called from the journal reclaim path, we have ++ * to unconditionally give out the pre-reservation, there's ++ * nothing else sensible we can do - otherwise we'd recurse back ++ * into the reclaim path and deadlock: ++ */ ++ ++ if (!(flags & JOURNAL_RES_GET_RECLAIM) && ++ new.reserved > new.remaining) + return 0; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, + old.v, new.v)) != old.v); +@@ -438,13 +448,13 @@ static inline int bch2_journal_preres_get(struct journal *j, + if (new_u64s <= res->u64s) + return 0; + +- if (bch2_journal_preres_get_fast(j, res, new_u64s)) ++ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) + return 0; + + if (flags & JOURNAL_RES_GET_NONBLOCK) + return -EAGAIN; + +- return __bch2_journal_preres_get(j, res, new_u64s); ++ return __bch2_journal_preres_get(j, res, new_u64s, flags); + } + + /* journal_entry_res: */ +-- +cgit v1.2.3 + + +From bd0f3df4a4f23ab8e030a129a847dc349230036e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Jun 2020 21:00:29 -0400 +Subject: bcachefs: Refactor btree insert path + +This splits out the journalling code from the btree update code; prep +work for the btree key cache. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 90 +++++++++++++++++------------------------ + 1 file changed, 38 insertions(+), 52 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5933ac1a6552..9c2b7c030544 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -159,71 +159,32 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, + : btree_node_flush1); + } + +-static inline void __btree_journal_key(struct btree_trans *trans, +- enum btree_id btree_id, +- struct bkey_i *insert) +-{ +- struct journal *j = &trans->c->journal; +- u64 seq = trans->journal_res.seq; +- bool needs_whiteout = insert->k.needs_whiteout; +- +- /* ick */ +- insert->k.needs_whiteout = false; +- bch2_journal_add_keys(j, &trans->journal_res, +- btree_id, insert); +- insert->k.needs_whiteout = needs_whiteout; +- +- bch2_journal_set_has_inode(j, &trans->journal_res, +- insert->k.p.inode); +- +- if (trans->journal_seq) +- *trans->journal_seq = seq; +-} +- +-static void bch2_btree_journal_key(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) +-{ +- struct bch_fs *c = trans->c; +- struct journal *j = &c->journal; +- struct btree *b = iter_l(iter)->b; +- +- EBUG_ON(trans->journal_res.ref != +- !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); +- +- if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { +- __btree_journal_key(trans, iter->btree_id, insert); +- btree_bset_last(b)->journal_seq = +- cpu_to_le64(trans->journal_res.seq); +- } +- +- bch2_btree_add_journal_pin(c, b, +- likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) +- ? 
trans->journal_res.seq +- : j->replay_journal_seq); +- +- if (unlikely(!btree_node_dirty(b))) +- set_btree_node_dirty(b); +-} +- + /** + * btree_insert_key - insert a key one key into a leaf node + */ +-static void btree_insert_key_leaf(struct btree_trans *trans, ++static bool btree_insert_key_leaf(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) + { + struct bch_fs *c = trans->c; + struct btree *b = iter_l(iter)->b; + struct bset_tree *t = bset_tree_last(b); ++ struct bset *i = bset(b, t); + int old_u64s = bset_u64s(t); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + +- insert->k.needs_whiteout = false; ++ if (unlikely(!bch2_btree_bset_insert_key(iter, b, ++ &iter_l(iter)->iter, insert))) ++ return false; ++ ++ i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, ++ le64_to_cpu(i->journal_seq))); + +- if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert))) +- bch2_btree_journal_key(trans, iter, insert); ++ bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); ++ ++ if (unlikely(!btree_node_dirty(b))) ++ set_btree_node_dirty(b); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; +@@ -238,6 +199,7 @@ static void btree_insert_key_leaf(struct btree_trans *trans, + bch2_btree_iter_reinit_node(iter, b); + + trace_btree_insert_key(c, b, insert); ++ return true; + } + + /* Normal update interface: */ +@@ -326,7 +288,29 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) + { +- btree_insert_key_leaf(trans, iter, insert); ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ bool did_work; ++ ++ EBUG_ON(trans->journal_res.ref != ++ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); ++ ++ insert->k.needs_whiteout = false; ++ ++ did_work = btree_insert_key_leaf(trans, iter, insert); ++ if (!did_work) ++ return; ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ bch2_journal_add_keys(j, &trans->journal_res, ++ iter->btree_id, insert); ++ ++ bch2_journal_set_has_inode(j, &trans->journal_res, ++ insert->k.p.inode); ++ ++ if (trans->journal_seq) ++ *trans->journal_seq = trans->journal_res.seq; ++ } + } + + static inline bool iter_has_trans_triggers(struct btree_iter *iter) +@@ -411,6 +395,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + JOURNAL_RES_GET_NONBLOCK); + if (ret) + goto err; ++ } else { ++ trans->journal_res.seq = c->journal.replay_journal_seq; + } + + if (unlikely(trans->extra_journal_entry_u64s)) { +-- +cgit v1.2.3 + + +From dc05fbaa1e8729348bcdb56619a2df786c473f91 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Jun 2020 20:27:33 -0400 +Subject: bcachefs: Hacky io-in-flight throttling + +We've been seeing btree updates get stuck, due to some sort of bug; when +this happens, buffered writeback will keep queueing up writes that lead +to the system running out of memory. + +Not sure if this kind of throttling is something we'll want to keep and +improve, or get rid of when the bug with btree updates getting stuck is +fixed. For now it should make debugging easier. 
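The throttle in the patch below is just a counting semaphore: normal write paths take a slot before submitting and release it on completion, capping the number of buffered writes in flight, while internal writers (copygc) skip it to avoid deadlock. A small POSIX-semaphore sketch of the same mechanism, with invented names and the same limit of 64:

#include <semaphore.h>
#include <stdio.h>

#define IO_IN_FLIGHT_MAX 64

static sem_t io_in_flight;

static void submit_write(int internal)
{
        if (!internal)
                sem_wait(&io_in_flight);   /* blocks once 64 writes are in flight */
        /* ... issue the write ... */
}

static void write_done(int internal)
{
        if (!internal)
                sem_post(&io_in_flight);   /* free a slot for the next writer */
}

int main(void)
{
        sem_init(&io_in_flight, 0, IO_IN_FLIGHT_MAX);
        submit_write(0);
        write_done(0);
        sem_destroy(&io_in_flight);
        return 0;
}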
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 2 ++ + fs/bcachefs/io.c | 9 +++++++++ + fs/bcachefs/super.c | 2 ++ + 3 files changed, 13 insertions(+) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index e1622ba8b9d9..20cc9d20f643 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -191,6 +191,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -725,6 +726,7 @@ struct bch_fs { + struct rw_semaphore gc_lock; + + /* IO PATH */ ++ struct semaphore io_in_flight; + struct bio_set bio_read; + struct bio_set bio_read_split; + struct bio_set bio_write; +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 2060a6a1bdea..0d95975780cd 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -500,6 +500,9 @@ static void bch2_write_done(struct closure *cl) + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ up(&c->io_in_flight); ++ + if (op->end_io) { + EBUG_ON(cl->parent); + closure_debug_destroy(cl); +@@ -1257,6 +1260,12 @@ void bch2_write(struct closure *cl) + goto err; + } + ++ /* ++ * Can't ratelimit copygc - we'd deadlock: ++ */ ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ down(&c->io_in_flight); ++ + bch2_increment_clock(c, bio_sectors(bio), WRITE); + + data_len = min_t(u64, bio->bi_iter.bi_size, +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index b939166ddbe6..8b37c16370c2 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -674,6 +674,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + seqcount_init(&c->usage_lock); + ++ sema_init(&c->io_in_flight, 64); ++ + c->copy_gc_enabled = 1; + c->rebalance.enabled = 1; + c->promote_whole_extents = true; +-- +cgit v1.2.3 + + +From a15a9eb457d5bad2ea245f6dc2b40b3875dac06b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 12 Jun 2020 14:58:07 -0400 +Subject: bcachefs: Fix a deadlock + +__bch2_btree_node_lock() was incorrectly using iter->pos as a proxy for +btree node lock ordering, this caused an off by one error that was +triggered by bch2_btree_node_get_sibling() getting the previous node. + +This refactors the code to compare against btree node keys directly. 
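The ordering rule the fix below enforces: btree nodes are ordered by (btree id, node key), and it is only safe to block waiting for a lock if every node already held sorts strictly before the one being requested; comparing against iter->pos instead of the held node's actual key is what produced the off-by-one. A small sketch of that comparison, with simplified stand-ins for bpos/bkey_cmp:

#include <stdbool.h>
#include <stdio.h>

struct pos { unsigned btree_id; unsigned long long offset; };

static int pos_cmp(struct pos a, struct pos b)
{
        if (a.btree_id != b.btree_id)
                return a.btree_id < b.btree_id ? -1 : 1;
        if (a.offset != b.offset)
                return a.offset < b.offset ? -1 : 1;
        return 0;
}

/* may we sleep waiting for 'want' while already holding 'held'? */
static bool may_block(struct pos held, struct pos want)
{
        return pos_cmp(held, want) < 0;
}

int main(void)
{
        struct pos held = { 1, 100 }, want = { 1, 100 };

        /* equal keys do not sort strictly before, so blocking is refused */
        printf("%d\n", may_block(held, want));
        return 0;
}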
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 69 ++++++++++++++++++++++++++++------------- + fs/bcachefs/btree_locking.h | 24 ++++++++------ + fs/bcachefs/btree_types.h | 4 +++ + fs/bcachefs/btree_update_leaf.c | 2 +- + 4 files changed, 67 insertions(+), 32 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 814b4f154c2c..aa0473e0df7a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -101,7 +101,7 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) + + if (six_relock_type(&b->lock, want, iter->l[level].lock_seq) || + (btree_node_lock_seq_matches(iter, b, level) && +- btree_node_lock_increment(iter, b, level, want))) { ++ btree_node_lock_increment(iter->trans, b, level, want))) { + mark_btree_node_locked(iter, level, want); + return true; + } else { +@@ -130,7 +130,7 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) + goto success; + + if (btree_node_lock_seq_matches(iter, b, level) && +- btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { ++ btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(iter, level); + goto success; + } +@@ -193,23 +193,18 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, + + /* Slowpath: */ + bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, +- unsigned level, +- struct btree_iter *iter, +- enum six_lock_type type) ++ unsigned level, struct btree_iter *iter, ++ enum six_lock_type type) + { ++ struct btree_trans *trans = iter->trans; + struct btree_iter *linked; + bool ret = true; + + /* Check if it's safe to block: */ +- trans_for_each_iter(iter->trans, linked) { ++ trans_for_each_iter(trans, linked) { + if (!linked->nodes_locked) + continue; + +- /* Must lock btree nodes in key order: */ +- if ((cmp_int(iter->btree_id, linked->btree_id) ?: +- bkey_cmp(pos, linked->pos)) < 0) +- ret = false; +- + /* + * Can't block taking an intent lock if we have _any_ nodes read + * locked: +@@ -224,13 +219,15 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { +- if (!(iter->trans->nounlock)) { ++ if (!(trans->nounlock)) { + linked->locks_want = max_t(unsigned, + linked->locks_want, + __fls(linked->nodes_locked) + 1); +- btree_iter_get_locks(linked, true, false); ++ if (!btree_iter_get_locks(linked, true, false)) ++ ret = false; ++ } else { ++ ret = false; + } +- ret = false; + } + + /* +@@ -240,14 +237,36 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + */ + if (linked->btree_id == iter->btree_id && + level > __fls(linked->nodes_locked)) { +- if (!(iter->trans->nounlock)) { ++ if (!(trans->nounlock)) { + linked->locks_want = + max(level + 1, max_t(unsigned, + linked->locks_want, + iter->locks_want)); +- btree_iter_get_locks(linked, true, false); ++ if (!btree_iter_get_locks(linked, true, false)) ++ ret = false; ++ } else { ++ ret = false; + } ++ } ++ ++ /* Must lock btree nodes in key order: */ ++ if (iter->btree_id < linked->btree_id) ++ ret = false; ++ ++ if (iter->btree_id == linked->btree_id && ++ btree_node_locked(linked, level) && ++ bkey_cmp(pos, linked->l[level].b->key.k.p) <= 0) + ret = false; ++ ++ /* ++ * Recheck if this is a node we already have locked - since one ++ * of the get_locks() calls might've successfully ++ * upgraded/relocked it: ++ */ ++ if (linked->l[level].b == b && ++ btree_node_locked_type(linked, 
level) >= type) { ++ six_lock_increment(&b->lock, type); ++ return true; + } + } + +@@ -2241,13 +2260,15 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { +- pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); ++ pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); + + trans_for_each_iter(trans, iter) { + if (!iter->nodes_locked) + continue; + +- pr_buf(out, " iter %s:", bch2_btree_ids[iter->btree_id]); ++ pr_buf(out, " iter %u %s:", ++ iter->idx, ++ bch2_btree_ids[iter->btree_id]); + bch2_bpos_to_text(out, iter->pos); + pr_buf(out, "\n"); + +@@ -2255,8 +2276,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + if (btree_node_locked(iter, l)) { + b = iter->l[l].b; + +- pr_buf(out, " %p l=%u %s ", +- b, l, btree_node_intent_locked(iter, l) ? "i" : "r"); ++ pr_buf(out, " %px %s l=%u ", ++ b, btree_node_intent_locked(iter, l) ? "i" : "r", l); + bch2_bpos_to_text(out, b->key.k.p); + pr_buf(out, "\n"); + } +@@ -2265,7 +2286,13 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + b = READ_ONCE(trans->locking); + if (b) { +- pr_buf(out, " locking %px l=%u %s:", ++ pr_buf(out, " locking iter %u l=%u %s:", ++ trans->locking_iter_idx, ++ trans->locking_level, ++ bch2_btree_ids[trans->locking_btree_id]); ++ bch2_bpos_to_text(out, trans->locking_pos); ++ ++ pr_buf(out, " node %px l=%u %s:", + b, b->level, + bch2_btree_ids[b->btree_id]); + bch2_bpos_to_text(out, b->key.k.p); +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index 730a9dc89de8..64f46461e624 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -158,15 +158,15 @@ static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, + * Lock a btree node if we already have it locked on one of our linked + * iterators: + */ +-static inline bool btree_node_lock_increment(struct btree_iter *iter, ++static inline bool btree_node_lock_increment(struct btree_trans *trans, + struct btree *b, unsigned level, + enum btree_node_locked_type want) + { +- struct btree_iter *linked; ++ struct btree_iter *iter; + +- trans_for_each_iter(iter->trans, linked) +- if (linked->l[level].b == b && +- btree_node_locked_type(linked, level) >= want) { ++ trans_for_each_iter(trans, iter) ++ if (iter->l[level].b == b && ++ btree_node_locked_type(iter, level) >= want) { + six_lock_increment(&b->lock, want); + return true; + } +@@ -182,19 +182,23 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, + struct btree_iter *iter, + enum six_lock_type type) + { ++ struct btree_trans *trans = iter->trans; + bool ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + #ifdef CONFIG_BCACHEFS_DEBUG +- iter->trans->locking = b; ++ trans->locking = b; ++ trans->locking_iter_idx = iter->idx; ++ trans->locking_pos = pos; ++ trans->locking_btree_id = iter->btree_id; ++ trans->locking_level = level; + #endif +- +- ret = likely(six_trylock_type(&b->lock, type)) || +- btree_node_lock_increment(iter, b, level, type) || ++ ret = likely(six_trylock_type(&b->lock, type)) || ++ btree_node_lock_increment(trans, b, level, type) || + __bch2_btree_node_lock(b, pos, level, iter, type); + + #ifdef CONFIG_BCACHEFS_DEBUG +- iter->trans->locking = NULL; ++ trans->locking = NULL; + #endif + return ret; + } +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index e97248ca3aa2..047b7b0776a1 100644 +--- a/fs/bcachefs/btree_types.h ++++ 
b/fs/bcachefs/btree_types.h +@@ -284,6 +284,10 @@ struct btree_trans { + #ifdef CONFIG_BCACHEFS_DEBUG + struct list_head list; + struct btree *locking; ++ unsigned locking_iter_idx; ++ struct bpos locking_pos; ++ u8 locking_btree_id; ++ u8 locking_level; + pid_t pid; + #endif + unsigned long ip; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 9c2b7c030544..283c10feb81f 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -481,7 +481,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + * or anything else that might call bch2_trans_relock(), since that + * would just retake the read locks: + */ +- trans_for_each_iter_all(trans, iter) { ++ trans_for_each_iter(trans, iter) { + if (iter->nodes_locked != iter->nodes_intent_locked) { + EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + EBUG_ON(trans->iters_live & (1ULL << iter->idx)); +-- +cgit v1.2.3 + + +From 7a9b210ef70a2ce84c235a49640d1ea7df8d958e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 12 Jun 2020 22:29:48 -0400 +Subject: bcachefs: Don't deadlock when btree node reuse changes lock ordering + +Btree node lock ordering is based on the logical key. However, 'struct +btree' may be reused for a different btree node under memory pressure. +This patch uses the new six lock callback to check if a btree node is no +longer the node we wanted to lock before blocking. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 19 +++++++++++++++++-- + fs/bcachefs/btree_iter.c | 38 ++++++++++++++++++++++++++++++------- + fs/bcachefs/btree_locking.h | 19 ++++++++++++------- + fs/bcachefs/btree_update_interior.c | 2 ++ + 4 files changed, 62 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 80718ffba767..dc169a845da7 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -677,6 +677,14 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + return b; + } + ++static int lock_node_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, lock); ++ const struct bkey_i *k = p; ++ ++ return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; ++} ++ + /** + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. 
+@@ -749,8 +757,12 @@ lock_node: + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); + +- if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) ++ if (!btree_node_lock(b, k->k.p, level, iter, lock_type, ++ lock_node_check_fn, (void *) k)) { ++ if (b->hash_val != btree_ptr_hash_val(k)) ++ goto retry; + return ERR_PTR(-EINTR); ++ } + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->level != level || +@@ -802,6 +814,7 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; ++ int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + +@@ -822,7 +835,9 @@ retry: + return b; + } else { + lock_node: +- six_lock_read(&b->lock, NULL, NULL); ++ ret = six_lock_read(&b->lock, lock_node_check_fn, (void *) k); ++ if (ret) ++ goto retry; + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->btree_id != btree_id || +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index aa0473e0df7a..acbd7a31ba0e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -194,10 +194,13 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, + /* Slowpath: */ + bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + unsigned level, struct btree_iter *iter, +- enum six_lock_type type) ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, ++ void *p) + { + struct btree_trans *trans = iter->trans; + struct btree_iter *linked; ++ u64 start_time = local_clock(); + bool ret = true; + + /* Check if it's safe to block: */ +@@ -275,7 +278,14 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + return false; + } + +- __btree_node_lock_type(iter->trans->c, b, type); ++ if (six_trylock_type(&b->lock, type)) ++ return true; ++ ++ if (six_lock_type(&b->lock, type, should_sleep_fn, p)) ++ return false; ++ ++ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], ++ start_time); + return true; + } + +@@ -286,6 +296,11 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter) + { + unsigned l; + ++ if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { ++ BUG_ON(iter->nodes_locked); ++ return; ++ } ++ + for (l = 0; btree_iter_node(iter, l); l++) { + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && + !btree_node_locked(iter, l)) +@@ -300,7 +315,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) + { + struct btree_iter *iter; + +- trans_for_each_iter(trans, iter) ++ trans_for_each_iter_all(trans, iter) + bch2_btree_iter_verify_locks(iter); + } + #else +@@ -892,18 +907,26 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) + __btree_iter_init(linked, b->level); + } + ++static int lock_root_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, lock); ++ struct btree **rootp = p; ++ ++ return b == *rootp ? 
0 : -1; ++} ++ + static inline int btree_iter_lock_root(struct btree_iter *iter, + unsigned depth_want) + { + struct bch_fs *c = iter->trans->c; +- struct btree *b; ++ struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; + enum six_lock_type lock_type; + unsigned i; + + EBUG_ON(iter->nodes_locked); + + while (1) { +- b = READ_ONCE(c->btree_roots[iter->btree_id].b); ++ b = READ_ONCE(*rootp); + iter->level = READ_ONCE(b->level); + + if (unlikely(iter->level < depth_want)) { +@@ -921,10 +944,11 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, + + lock_type = __btree_lock_want(iter, iter->level); + if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, +- iter, lock_type))) ++ iter, lock_type, ++ lock_root_check_fn, rootp))) + return -EINTR; + +- if (likely(b == c->btree_roots[iter->btree_id].b && ++ if (likely(b == READ_ONCE(*rootp) && + b->level == iter->level && + !race_fault())) { + for (i = 0; i < iter->level; i++) +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index 64f46461e624..da2a0ebbc24f 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -175,17 +175,21 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, + } + + bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, +- struct btree_iter *, enum six_lock_type); +- +-static inline bool btree_node_lock(struct btree *b, struct bpos pos, +- unsigned level, +- struct btree_iter *iter, +- enum six_lock_type type) ++ struct btree_iter *, enum six_lock_type, ++ six_lock_should_sleep_fn, void *); ++ ++static inline bool btree_node_lock(struct btree *b, ++ struct bpos pos, unsigned level, ++ struct btree_iter *iter, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) + { + struct btree_trans *trans = iter->trans; + bool ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); ++ EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); ++ + #ifdef CONFIG_BCACHEFS_DEBUG + trans->locking = b; + trans->locking_iter_idx = iter->idx; +@@ -195,7 +199,8 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, + #endif + ret = likely(six_trylock_type(&b->lock, type)) || + btree_node_lock_increment(trans, b, level, type) || +- __bch2_btree_node_lock(b, pos, level, iter, type); ++ __bch2_btree_node_lock(b, pos, level, iter, type, ++ should_sleep_fn, p); + + #ifdef CONFIG_BCACHEFS_DEBUG + trans->locking = NULL; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 9f849f52969f..4d38943a4f0c 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -135,6 +135,8 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) + + bch2_btree_node_hash_remove(&c->btree_cache, b); + ++ six_lock_wakeup_all(&b->lock); ++ + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); +-- +cgit v1.2.3 + + +From 8c8ce618e0877d0ce45f40bc483b9808bab0d30e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 13 Jun 2020 18:43:14 -0400 +Subject: bcachefs: Add an internal option for reading entire journal + +To be used the debug tool that dumps the contents of the journal. 
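The option added below simply disables the "drop anything older than last_seq" filtering when the journal is read, so a debug tool can see every entry still present on disk. A tiny sketch of that filter decision, with invented names:

#include <stdbool.h>
#include <stdio.h>

struct opts { bool read_entire_journal; };

static bool keep_entry(const struct opts *o,
                       unsigned long long seq,
                       unsigned long long last_seq)
{
        /* normally only dirty entries (seq >= last_seq) are kept */
        return o->read_entire_journal || seq >= last_seq;
}

int main(void)
{
        struct opts all = { .read_entire_journal = true };
        struct opts dirty_only = { .read_entire_journal = false };

        printf("%d %d\n",
               keep_entry(&all, 5, 10),
               keep_entry(&dirty_only, 5, 10));
        return 0;
}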
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 9 +++++---- + fs/bcachefs/journal_io.c | 26 ++++++++++++++------------ + fs/bcachefs/opts.h | 5 +++++ + fs/bcachefs/recovery.c | 26 ++++++++++++++++++++------ + 4 files changed, 44 insertions(+), 22 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index ab4134305bba..b4f7b61ba9ac 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -987,9 +987,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + u64 last_seq = cur_seq, nr, seq; + + if (!list_empty(journal_entries)) +- last_seq = le64_to_cpu(list_first_entry(journal_entries, +- struct journal_replay, +- list)->j.seq); ++ last_seq = le64_to_cpu(list_last_entry(journal_entries, ++ struct journal_replay, list)->j.last_seq); + + nr = cur_seq - last_seq; + +@@ -1018,8 +1017,10 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + + list_for_each_entry(i, journal_entries, list) { + seq = le64_to_cpu(i->j.seq); ++ BUG_ON(seq >= cur_seq); + +- BUG_ON(seq < last_seq || seq >= cur_seq); ++ if (seq < last_seq) ++ continue; + + journal_seq_pin(j, seq)->devs = i->devs; + } +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index b923efc42099..b7625285b3ad 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -41,19 +41,21 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + list)->j.last_seq + : 0; + +- /* Is this entry older than the range we need? */ +- if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { +- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; +- goto out; +- } ++ if (!c->opts.read_entire_journal) { ++ /* Is this entry older than the range we need? */ ++ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { ++ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; ++ goto out; ++ } + +- /* Drop entries we don't need anymore */ +- list_for_each_entry_safe(i, pos, jlist->head, list) { +- if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) +- break; +- list_del(&i->list); +- kvpfree(i, offsetof(struct journal_replay, j) + +- vstruct_bytes(&i->j)); ++ /* Drop entries we don't need anymore */ ++ list_for_each_entry_safe(i, pos, jlist->head, list) { ++ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) ++ break; ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } + } + + list_for_each_entry_reverse(i, jlist->head, list) { +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 71ebace78453..3b051e7a8f1d 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -265,6 +265,11 @@ enum opt_type { + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't free journal entries/keys after startup")\ ++ x(read_entire_journal, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Read all journal entries, not just dirty ones")\ + x(noexcl, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index c478d19e5691..1f26d9e19fe9 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -319,20 +319,30 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + struct journal_key *src, *dst; + size_t nr_keys = 0; + +- list_for_each_entry(p, journal_entries, list) ++ if (list_empty(journal_entries)) ++ return keys; ++ ++ keys.journal_seq_base = ++ le64_to_cpu(list_last_entry(journal_entries, ++ struct journal_replay, list)->j.last_seq); ++ ++ list_for_each_entry(p, journal_entries, list) { ++ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) ++ 
continue; ++ + for_each_jset_key(k, _n, entry, &p->j) + nr_keys++; ++ } + +- keys.journal_seq_base = +- le64_to_cpu(list_first_entry(journal_entries, +- struct journal_replay, +- list)->j.seq); + + keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); + if (!keys.d) + goto err; + +- list_for_each_entry(p, journal_entries, list) ++ list_for_each_entry(p, journal_entries, list) { ++ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) ++ continue; ++ + for_each_jset_key(k, _n, entry, &p->j) + keys.d[keys.nr++] = (struct journal_key) { + .btree_id = entry->btree_id, +@@ -342,6 +352,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + keys.journal_seq_base, + .journal_offset = k->_data - p->j._data, + }; ++ } + + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); + +@@ -568,6 +579,9 @@ verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, + int ret = 0; + + list_for_each_entry(i, journal, list) { ++ if (le64_to_cpu(i->j.seq) < start_seq) ++ continue; ++ + fsck_err_on(seq != le64_to_cpu(i->j.seq), c, + "journal entries %llu-%llu missing! (replaying %llu-%llu)", + seq, le64_to_cpu(i->j.seq) - 1, +-- +cgit v1.2.3 + + +From 09f540d4e0c37e56ff2ecee2b8e7960cce7fb6d4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Jun 2020 14:58:47 -0400 +Subject: bcachefs: Turn c->state_lock into an rwsem + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 4 ++-- + fs/bcachefs/btree_gc.c | 1 + + fs/bcachefs/buckets.c | 7 +++--- + fs/bcachefs/error.c | 4 ++-- + fs/bcachefs/fs.c | 12 +++++----- + fs/bcachefs/super.c | 60 +++++++++++++++++++++++++------------------------- + fs/bcachefs/sysfs.c | 19 +++++----------- + 7 files changed, 50 insertions(+), 57 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 20cc9d20f643..09c69e7a0ae2 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -552,8 +552,8 @@ struct bch_fs { + struct super_block *vfs_sb; + char name[40]; + +- /* ro/rw, add/remove devices: */ +- struct mutex state_lock; ++ /* ro/rw, add/remove/resize devices: */ ++ struct rw_semaphore state_lock; + + /* Counts outstanding writes, for clean transition to read-only */ + struct percpu_ref writes; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index c62fa3583b73..47ef7d031a13 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -798,6 +798,7 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, + unsigned i, iter = 0; + int ret; + ++ lockdep_assert_held(&c->state_lock); + trace_gc_start(c); + + down_write(&c->gc_lock); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 0b15c0468892..2da60ba3b7cb 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1983,6 +1983,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + int ret = -ENOMEM; + unsigned i; + ++ lockdep_assert_held(&c->state_lock); ++ + memset(&free, 0, sizeof(free)); + memset(&free_inc, 0, sizeof(free_inc)); + memset(&alloc_heap, 0, sizeof(alloc_heap)); +@@ -2009,7 +2011,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + bch2_copygc_stop(ca); + + if (resize) { +- down_write(&c->gc_lock); + down_write(&ca->bucket_lock); + percpu_down_write(&c->mark_lock); + } +@@ -2052,10 +2053,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + + nbuckets = ca->mi.nbuckets; + +- if (resize) { ++ if (resize) + up_write(&ca->bucket_lock); +- 
up_write(&c->gc_lock); +- } + + if (start_copygc && + bch2_copygc_start(c, ca)) +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +index 1662a36244cd..cd46706fb6f5 100644 +--- a/fs/bcachefs/error.c ++++ b/fs/bcachefs/error.c +@@ -37,7 +37,7 @@ void bch2_io_error_work(struct work_struct *work) + struct bch_fs *c = ca->fs; + bool dev; + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, + BCH_FORCE_IF_DEGRADED); + if (dev +@@ -47,7 +47,7 @@ void bch2_io_error_work(struct work_struct *work) + bch_err(ca, + "too many IO errors, setting %s RO", + dev ? "device" : "filesystem"); +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + } + + void bch2_io_error(struct bch_dev *ca) +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 8b09fd55cbc3..f5fd9bb3b66f 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1341,16 +1341,16 @@ static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * cons + if (IS_ERR(c)) + return c; + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + + if (!test_bit(BCH_FS_STARTED, &c->flags)) { +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + closure_put(&c->cl); + pr_err("err mounting %s: incomplete filesystem", dev_name); + return ERR_PTR(-EINVAL); + } + +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + + set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); + return c; +@@ -1399,7 +1399,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) + return ret; + + if (opts.read_only != c->opts.read_only) { +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + + if (opts.read_only) { + bch2_fs_read_only(c); +@@ -1409,7 +1409,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return -EINVAL; + } + +@@ -1418,7 +1418,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) + + c->opts.read_only = opts.read_only; + +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + } + + if (opts.errors >= 0) +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 8b37c16370c2..480c865f3e8e 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -333,9 +333,9 @@ static void bch2_fs_read_only_work(struct work_struct *work) + struct bch_fs *c = + container_of(work, struct bch_fs, read_only_work); + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + bch2_fs_read_only(c); +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + } + + static void bch2_fs_read_only_async(struct bch_fs *c) +@@ -526,9 +526,9 @@ void bch2_fs_stop(struct bch_fs *c) + + cancel_work_sync(&c->journal_seq_blacklist_gc_work); + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + bch2_fs_read_only(c); +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + + for_each_member_device(ca, c, i) + if (ca->kobj.state_in_sysfs && +@@ -599,7 +599,7 @@ static const char *bch2_fs_online(struct bch_fs *c) + bch2_opts_create_sysfs_files(&c->opts_dir)) + return "error creating sysfs objects"; + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + + err = "error creating sysfs objects"; + __for_each_member_device(ca, c, i, NULL) +@@ -609,7 +609,7 @@ static const char *bch2_fs_online(struct bch_fs *c) + list_add(&c->list, &bch_fs_list); + err = NULL; + err: +- mutex_unlock(&c->state_lock); ++ 
up_write(&c->state_lock); + return err; + } + +@@ -631,7 +631,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + c->minor = -1; + c->disk_sb.fs_sb = true; + +- mutex_init(&c->state_lock); ++ init_rwsem(&c->state_lock); + mutex_init(&c->sb_lock); + mutex_init(&c->replicas_gc_lock); + mutex_init(&c->btree_root_lock); +@@ -823,7 +823,7 @@ int bch2_fs_start(struct bch_fs *c) + unsigned i; + int ret = -EINVAL; + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + + BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); + +@@ -873,7 +873,7 @@ int bch2_fs_start(struct bch_fs *c) + print_mount_opts(c); + ret = 0; + out: +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return ret; + err: + switch (ret) { +@@ -1369,9 +1369,9 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + { + int ret; + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + ret = __bch2_dev_set_state(c, ca, new_state, flags); +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + + return ret; + } +@@ -1384,7 +1384,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + unsigned dev_idx = ca->dev_idx, data; + int ret = -EINVAL; + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + + /* + * We consume a reference to ca->ref, regardless of whether we succeed +@@ -1474,13 +1474,13 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + bch2_write_super(c); + + mutex_unlock(&c->sb_lock); +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return 0; + err: + if (ca->mi.state == BCH_MEMBER_STATE_RW && + !percpu_ref_is_zero(&ca->io_ref)) + __bch2_dev_read_write(c, ca); +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return ret; + } + +@@ -1556,7 +1556,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + + dev_usage_clear(ca); + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + mutex_lock(&c->sb_lock); + + err = "insufficient space in new superblock"; +@@ -1617,12 +1617,12 @@ have_slot: + goto err_late; + } + +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return 0; + + err_unlock: + mutex_unlock(&c->sb_lock); +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + err: + if (ca) + bch2_dev_free(ca); +@@ -1645,11 +1645,11 @@ int bch2_dev_online(struct bch_fs *c, const char *path) + const char *err; + int ret; + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + + ret = bch2_read_super(path, &opts, &sb); + if (ret) { +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return ret; + } + +@@ -1680,10 +1680,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return 0; + err: +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + bch2_free_super(&sb); + bch_err(c, "error bringing %s online: %s", path, err); + return -EINVAL; +@@ -1691,23 +1691,23 @@ err: + + int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) + { +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + + if (!bch2_dev_is_online(ca)) { + bch_err(ca, "Already offline"); +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return 0; + } + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { + bch_err(ca, "Cannot offline required disk"); +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return -EINVAL; + } + + __bch2_dev_offline(c, ca); + +- 
mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return 0; + } + +@@ -1716,7 +1716,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + struct bch_member *mi; + int ret = 0; + +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + + if (nbuckets < ca->mi.nbuckets) { + bch_err(ca, "Cannot shrink yet"); +@@ -1747,7 +1747,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + + bch2_recalc_capacity(c); + err: +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + return ret; + } + +@@ -1827,13 +1827,13 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + goto err; + + err = "bch2_dev_online() error"; +- mutex_lock(&c->state_lock); ++ down_write(&c->state_lock); + for (i = 0; i < nr_devices; i++) + if (bch2_dev_attach_bdev(c, &sb[i])) { +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + goto err_print; + } +- mutex_unlock(&c->state_lock); ++ up_write(&c->state_lock); + + err = "insufficient devices"; + if (!bch2_fs_may_start(c)) +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 5f2bc933b0e9..8456064e4864 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -427,7 +427,7 @@ SHOW(bch2_fs) + return 0; + } + +-STORE(__bch2_fs) ++STORE(bch2_fs) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + +@@ -485,8 +485,11 @@ STORE(__bch2_fs) + if (attr == &sysfs_trigger_btree_coalesce) + bch2_coalesce(c); + +- if (attr == &sysfs_trigger_gc) ++ if (attr == &sysfs_trigger_gc) { ++ down_read(&c->state_lock); + bch2_gc(c, NULL, false, false); ++ up_read(&c->state_lock); ++ } + + if (attr == &sysfs_trigger_alloc_write) { + bool wrote; +@@ -501,6 +504,7 @@ STORE(__bch2_fs) + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); + } ++ + #ifdef CONFIG_BCACHEFS_TESTS + if (attr == &sysfs_perf_test) { + char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; +@@ -522,17 +526,6 @@ STORE(__bch2_fs) + #endif + return size; + } +- +-STORE(bch2_fs) +-{ +- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); +- +- mutex_lock(&c->state_lock); +- size = __bch2_fs_store(kobj, attr, buf, size); +- mutex_unlock(&c->state_lock); +- +- return size; +-} + SYSFS_OPS(bch2_fs); + + struct attribute *bch2_fs_files[] = { +-- +cgit v1.2.3 + + +From ef4a50585a565e73767295615dd970a62beb916c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Jun 2020 15:10:54 -0400 +Subject: bcachefs: Implement a new gc that only recalcs oldest gen + +Full mark and sweep gc doesn't (yet?) work with the new btree key cache +code, but it also blocks updates to interior btree nodes for the +duration and isn't really necessary in practice; we aren't currently +attempting to repair errors in allocation info at runtime. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 83 +++++++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/btree_gc.h | 1 + + fs/bcachefs/buckets_types.h | 1 + + fs/bcachefs/sysfs.c | 7 ++++ + 4 files changed, 92 insertions(+) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 47ef7d031a13..ede58aa2e4bb 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -885,6 +885,82 @@ out: + return ret; + } + ++/* ++ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree ++ * node pointers currently never have cached pointers that can become stale: ++ */ ++static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ percpu_down_read(&c->mark_lock); ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ ++ if (gen_after(g->gc_gen, ptr->gen)) ++ g->gc_gen = ptr->gen; ++ ++ if (gen_after(g->mark.gen, ptr->gen) > 32) { ++ /* rewrite btree node */ ++ ++ } ++ } ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_gc_gens(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ unsigned i; ++ int ret; ++ ++ down_read(&c->state_lock); ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->gc_gen = g->mark.gen; ++ up_read(&ca->bucket_lock); ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (btree_node_type_needs_gc(i)) { ++ ret = bch2_gc_btree_gens(c, i); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->oldest_gen = g->gc_gen; ++ up_read(&ca->bucket_lock); ++ } ++err: ++ up_read(&c->state_lock); ++ return ret; ++} ++ + /* Btree coalescing */ + + static void recalc_packed_keys(struct btree *b) +@@ -1260,7 +1336,14 @@ static int bch2_gc_thread(void *arg) + last = atomic_long_read(&clock->now); + last_kick = atomic_read(&c->kick_gc); + ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 + ret = bch2_gc(c, NULL, false, false); ++#else ++ ret = bch2_gc_gens(c); ++#endif + if (ret) + bch_err(c, "btree gc failed: %i", ret); + +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index bd5f2752954f..e09af2fda3b6 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -8,6 +8,7 @@ void bch2_coalesce(struct bch_fs *); + + struct journal_keys; + int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); ++int bch2_gc_gens(struct bch_fs *); + void bch2_gc_thread_stop(struct bch_fs *); + int bch2_gc_thread_start(struct bch_fs *); + void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index f3ff4a18b1fd..59e92a6d26be 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -39,6 +39,7 @@ struct bucket { + + u16 io_time[2]; + u8 oldest_gen; ++ u8 gc_gen; + unsigned gen_valid:1; + }; + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 
8456064e4864..26b061381e23 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -486,9 +486,16 @@ STORE(bch2_fs) + bch2_coalesce(c); + + if (attr == &sysfs_trigger_gc) { ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 + down_read(&c->state_lock); + bch2_gc(c, NULL, false, false); + up_read(&c->state_lock); ++#else ++ bch2_gc_gens(c); ++#endif + } + + if (attr == &sysfs_trigger_alloc_write) { +-- +cgit v1.2.3 + + +From 0b6befe0978795fee59a4e89af290198752027d3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 6 Jun 2020 12:28:01 -0400 +Subject: bcachefs: btree_bkey_cached_common + +This is prep work for the btree key cache: btree iterators will point to +either struct btree, or a new struct bkey_cached. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 74 ++++++++++++------------ + fs/bcachefs/btree_cache.h | 2 +- + fs/bcachefs/btree_gc.c | 40 ++++++------- + fs/bcachefs/btree_gc.h | 2 +- + fs/bcachefs/btree_io.c | 42 +++++++------- + fs/bcachefs/btree_io.h | 2 +- + fs/bcachefs/btree_iter.c | 82 +++++++++++++------------- + fs/bcachefs/btree_iter.h | 8 +-- + fs/bcachefs/btree_locking.h | 24 ++++---- + fs/bcachefs/btree_types.h | 15 +++-- + fs/bcachefs/btree_update_interior.c | 111 ++++++++++++++++++------------------ + fs/bcachefs/btree_update_interior.h | 6 +- + fs/bcachefs/btree_update_leaf.c | 2 +- + fs/bcachefs/debug.c | 4 +- + fs/bcachefs/recovery.c | 18 +++--- + include/trace/events/bcachefs.h | 6 +- + 16 files changed, 221 insertions(+), 217 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index dc169a845da7..b6a716cd4b6d 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -28,7 +28,7 @@ void bch2_recalc_btree_reserve(struct bch_fs *c) + for (i = 0; i < BTREE_ID_NR; i++) + if (c->btree_roots[i].b) + reserve += min_t(unsigned, 1, +- c->btree_roots[i].b->level) * 8; ++ c->btree_roots[i].b->c.level) * 8; + + c->btree_cache.reserve = reserve; + } +@@ -108,7 +108,7 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) + return NULL; + + bkey_btree_ptr_init(&b->key); +- six_lock_init(&b->lock); ++ six_lock_init(&b->c.lock); + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->write_blocked); + +@@ -140,8 +140,8 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, + { + int ret; + +- b->level = level; +- b->btree_id = id; ++ b->c.level = level; ++ b->c.btree_id = id; + + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); +@@ -172,10 +172,10 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) + + lockdep_assert_held(&bc->lock); + +- if (!six_trylock_intent(&b->lock)) ++ if (!six_trylock_intent(&b->c.lock)) + return -ENOMEM; + +- if (!six_trylock_write(&b->lock)) ++ if (!six_trylock_write(&b->c.lock)) + goto out_unlock_intent; + + if (btree_node_noevict(b)) +@@ -216,9 +216,9 @@ out: + trace_btree_node_reap(c, b); + return ret; + out_unlock: +- six_unlock_write(&b->lock); ++ six_unlock_write(&b->c.lock); + out_unlock_intent: +- six_unlock_intent(&b->lock); ++ six_unlock_intent(&b->c.lock); + ret = -ENOMEM; + goto out; + } +@@ -276,8 +276,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + if (++i > 3 && + !btree_node_reclaim(c, b)) { + btree_node_data_free(c, b); +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + freed++; + } + } +@@ -303,8 +303,8 @@ restart: + 
mutex_unlock(&bc->lock); + + bch2_btree_node_hash_remove(bc, b); +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + + if (freed >= nr) + goto out; +@@ -555,12 +555,12 @@ got_node: + goto err; + + bkey_btree_ptr_init(&b->key); +- six_lock_init(&b->lock); ++ six_lock_init(&b->c.lock); + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->write_blocked); + +- BUG_ON(!six_trylock_intent(&b->lock)); +- BUG_ON(!six_trylock_write(&b->lock)); ++ BUG_ON(!six_trylock_intent(&b->c.lock)); ++ BUG_ON(!six_trylock_write(&b->c.lock)); + } + + if (!b->data) { +@@ -593,8 +593,8 @@ err: + + if (b) { + list_add(&b->list, &bc->freed); +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + } + + /* Try to cannibalize another cached btree node: */ +@@ -649,8 +649,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + list_add(&b->list, &bc->freeable); + mutex_unlock(&bc->lock); + +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + return NULL; + } + +@@ -664,22 +664,22 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + + bch2_btree_node_read(c, b, sync); + +- six_unlock_write(&b->lock); ++ six_unlock_write(&b->c.lock); + + if (!sync) { +- six_unlock_intent(&b->lock); ++ six_unlock_intent(&b->c.lock); + return NULL; + } + + if (lock_type == SIX_LOCK_read) +- six_lock_downgrade(&b->lock); ++ six_lock_downgrade(&b->c.lock); + + return b; + } + + static int lock_node_check_fn(struct six_lock *lock, void *p) + { +- struct btree *b = container_of(lock, struct btree, lock); ++ struct btree *b = container_of(lock, struct btree, c.lock); + const struct bkey_i *k = p; + + return b->hash_val == btree_ptr_hash_val(k) ? 
0 : -1; +@@ -765,9 +765,9 @@ lock_node: + } + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || +- b->level != level || ++ b->c.level != level || + race_fault())) { +- six_unlock_type(&b->lock, lock_type); ++ six_unlock_type(&b->c.lock, lock_type); + if (bch2_btree_node_relock(iter, level + 1)) + goto retry; + +@@ -795,11 +795,11 @@ lock_node: + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { +- six_unlock_type(&b->lock, lock_type); ++ six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(-EIO); + } + +- EBUG_ON(b->btree_id != iter->btree_id || ++ EBUG_ON(b->c.btree_id != iter->btree_id || + BTREE_NODE_LEVEL(b->data) != level || + bkey_cmp(b->data->max_key, k->k.p)); + +@@ -835,14 +835,14 @@ retry: + return b; + } else { + lock_node: +- ret = six_lock_read(&b->lock, lock_node_check_fn, (void *) k); ++ ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); + if (ret) + goto retry; + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || +- b->btree_id != btree_id || +- b->level != level)) { +- six_unlock_read(&b->lock); ++ b->c.btree_id != btree_id || ++ b->c.level != level)) { ++ six_unlock_read(&b->c.lock); + goto retry; + } + } +@@ -866,11 +866,11 @@ lock_node: + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { +- six_unlock_read(&b->lock); ++ six_unlock_read(&b->c.lock); + return ERR_PTR(-EIO); + } + +- EBUG_ON(b->btree_id != btree_id || ++ EBUG_ON(b->c.btree_id != btree_id || + BTREE_NODE_LEVEL(b->data) != level || + bkey_cmp(b->data->max_key, k->k.p)); + +@@ -888,7 +888,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + struct bkey_packed *k; + BKEY_PADDED(k) tmp; + struct btree *ret = NULL; +- unsigned level = b->level; ++ unsigned level = b->c.level; + + parent = btree_iter_node(iter, level + 1); + if (!parent) +@@ -911,7 +911,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + goto out; + } + +- node_iter = iter->l[parent->level].iter; ++ node_iter = iter->l[parent->c.level].iter; + + k = bch2_btree_node_iter_peek_all(&node_iter, parent); + BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); +@@ -958,7 +958,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + + if (!IS_ERR(ret)) { +- six_unlock_intent(&ret->lock); ++ six_unlock_intent(&ret->c.lock); + ret = ERR_PTR(-EINTR); + } + } +@@ -1019,7 +1019,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + pr_buf(out, + "l %u %llu:%llu - %llu:%llu:\n" + " ptrs: ", +- b->level, ++ b->c.level, + b->data->min_key.inode, + b->data->min_key.offset, + b->data->max_key.inode, +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 98cca30778ea..2160012c734f 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -101,7 +101,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) + +-#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b) ++#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) + + void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, + struct btree *); +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index ede58aa2e4bb..7293b8fedd27 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -186,7 +186,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + + bch2_btree_node_iter_advance(&iter, b); + +- if 
(b->level) { ++ if (b->c.level) { + ret = bch2_gc_check_topology(c, k, + &next_node_start, + b->data->max_key, +@@ -252,7 +252,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + if (!btree_node_fake(b)) + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + &max_stale, initial); +- gc_pos_set(c, gc_pos_btree_root(b->btree_id)); ++ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); + mutex_unlock(&c->btree_root_lock); + + return ret; +@@ -280,7 +280,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + if (ret) + break; + +- if (b->level) { ++ if (b->c.level) { + struct btree *child; + BKEY_PADDED(k) tmp; + +@@ -296,16 +296,16 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + if (ret) + break; + +- if (b->level > target_depth) { ++ if (b->c.level > target_depth) { + child = bch2_btree_node_get_noiter(c, &tmp.k, +- b->btree_id, b->level - 1); ++ b->c.btree_id, b->c.level - 1); + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; + + ret = bch2_gc_btree_init_recurse(c, child, + journal_keys, target_depth); +- six_unlock_read(&child->lock); ++ six_unlock_read(&child->c.lock); + + if (ret) + break; +@@ -336,7 +336,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, + if (btree_node_fake(b)) + return 0; + +- six_lock_read(&b->lock, NULL, NULL); ++ six_lock_read(&b->c.lock, NULL, NULL); + if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, + "btree root with incorrect min_key: %llu:%llu", + b->data->min_key.inode, +@@ -351,7 +351,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, + BUG(); + } + +- if (b->level >= target_depth) ++ if (b->c.level >= target_depth) + ret = bch2_gc_btree_init_recurse(c, b, + journal_keys, target_depth); + +@@ -359,7 +359,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + &max_stale, true); + fsck_err: +- six_unlock_read(&b->lock); ++ six_unlock_read(&b->c.lock); + + return ret; + } +@@ -1084,9 +1084,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + + set_btree_bset_end(n1, n1->set); + +- six_unlock_write(&n2->lock); ++ six_unlock_write(&n2->c.lock); + bch2_btree_node_free_never_inserted(c, n2); +- six_unlock_intent(&n2->lock); ++ six_unlock_intent(&n2->c.lock); + + memmove(new_nodes + i - 1, + new_nodes + i, +@@ -1122,7 +1122,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + bch2_btree_build_aux_trees(n); + + bch2_btree_update_add_new_node(as, n); +- six_unlock_write(&n->lock); ++ six_unlock_write(&n->c.lock); + + bch2_btree_node_write(c, n, SIX_LOCK_intent); + } +@@ -1165,7 +1165,7 @@ next: + + BUG_ON(!bch2_keylist_empty(&keylist)); + +- BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]); ++ BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); + + bch2_btree_iter_node_replace(iter, new_nodes[0]); + +@@ -1190,7 +1190,7 @@ next: + } + + for (i = 0; i < nr_new_nodes; i++) +- six_unlock_intent(&new_nodes[i]->lock); ++ six_unlock_intent(&new_nodes[i]->c.lock); + + bch2_btree_update_done(as); + bch2_keylist_free(&keylist, NULL); +@@ -1231,11 +1231,11 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) + + for (i = 1; i < GC_MERGE_NODES; i++) { + if (!merge[i] || +- !six_relock_intent(&merge[i]->lock, lock_seq[i])) ++ !six_relock_intent(&merge[i]->c.lock, lock_seq[i])) + break; + +- if (merge[i]->level != merge[0]->level) { +- six_unlock_intent(&merge[i]->lock); ++ if (merge[i]->c.level != merge[0]->c.level) { ++ 
six_unlock_intent(&merge[i]->c.lock); + break; + } + } +@@ -1244,11 +1244,11 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) + bch2_coalesce_nodes(c, iter, merge); + + for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { +- lock_seq[i] = merge[i]->lock.state.seq; +- six_unlock_intent(&merge[i]->lock); ++ lock_seq[i] = merge[i]->c.lock.state.seq; ++ six_unlock_intent(&merge[i]->c.lock); + } + +- lock_seq[0] = merge[0]->lock.state.seq; ++ lock_seq[0] = merge[0]->c.lock.state.seq; + + if (kthread && kthread_should_stop()) { + bch2_trans_exit(&trans); +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index e09af2fda3b6..3694a3df62a8 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -82,7 +82,7 @@ static inline struct gc_pos gc_pos_btree(enum btree_id id, + */ + static inline struct gc_pos gc_pos_btree_node(struct btree *b) + { +- return gc_pos_btree(b->btree_id, b->key.k.p, b->level); ++ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); + } + + /* +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 6a42ce2522fd..5fc9137b822e 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -584,8 +584,8 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, + struct btree_node_entry *bne; + bool did_sort; + +- EBUG_ON(!(b->lock.state.seq & 1)); +- EBUG_ON(iter && iter->l[b->level].b != b); ++ EBUG_ON(!(b->c.lock.state.seq & 1)); ++ EBUG_ON(iter && iter->l[b->c.level].b != b); + + did_sort = btree_node_compact(c, b, iter); + +@@ -634,8 +634,8 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" + "pos ", + write ? "before write " : "", +- b->btree_id, b->level, +- c->btree_roots[b->btree_id].level); ++ b->c.btree_id, b->c.level, ++ c->btree_roots[b->c.btree_id].level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + + pr_buf(out, " node offset %u", b->written); +@@ -747,11 +747,11 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + "incorrect sequence number (wrong btree node)"); + } + +- btree_err_on(BTREE_NODE_ID(bn) != b->btree_id, ++ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, + BTREE_ERR_MUST_RETRY, c, b, i, + "incorrect btree id"); + +- btree_err_on(BTREE_NODE_LEVEL(bn) != b->level, ++ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, + BTREE_ERR_MUST_RETRY, c, b, i, + "incorrect level"); + +@@ -762,7 +762,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + } + + if (!write) +- compat_btree_node(b->level, b->btree_id, version, ++ compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { +@@ -783,7 +783,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + "incorrect max key"); + + if (write) +- compat_btree_node(b->level, b->btree_id, version, ++ compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + + /* XXX: ideally we would be validating min_key too */ +@@ -805,7 +805,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + BTREE_ERR_FATAL, c, b, i, + "invalid bkey format: %s", err); + +- compat_bformat(b->level, b->btree_id, version, ++ compat_bformat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &bn->format); + } +@@ -851,7 +851,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + + /* XXX: validate k->u64s */ + if (!write) +- bch2_bkey_compat(b->level, b->btree_id, version, ++ 
bch2_bkey_compat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); + +@@ -874,7 +874,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + } + + if (write) +- bch2_bkey_compat(b->level, b->btree_id, version, ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); + +@@ -1280,8 +1280,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + + bch2_btree_set_root_for_read(c, b); + err: +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + + return ret; + } +@@ -1325,15 +1325,15 @@ static void bch2_btree_node_write_error(struct bch_fs *c, + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_node_iter(&trans, b->btree_id, b->key.k.p, +- BTREE_MAX_DEPTH, b->level, 0); ++ iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, ++ BTREE_MAX_DEPTH, b->c.level, 0); + retry: + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + /* has node been freed? */ +- if (iter->l[b->level].b != b) { ++ if (iter->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + goto out; +@@ -1764,18 +1764,18 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, + BUG_ON(lock_type_held == SIX_LOCK_write); + + if (lock_type_held == SIX_LOCK_intent || +- six_lock_tryupgrade(&b->lock)) { ++ six_lock_tryupgrade(&b->c.lock)) { + __bch2_btree_node_write(c, b, SIX_LOCK_intent); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && +- six_trylock_write(&b->lock)) { ++ six_trylock_write(&b->c.lock)) { + bch2_btree_post_write_cleanup(c, b); +- six_unlock_write(&b->lock); ++ six_unlock_write(&b->c.lock); + } + + if (lock_type_held == SIX_LOCK_read) +- six_lock_downgrade(&b->lock); ++ six_lock_downgrade(&b->c.lock); + } else { + __bch2_btree_node_write(c, b, SIX_LOCK_read); + } +@@ -1845,7 +1845,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) + b, + (flags & (1 << BTREE_NODE_dirty)) != 0, + (flags & (1 << BTREE_NODE_need_write)) != 0, +- b->level, ++ b->c.level, + b->written, + !list_empty_careful(&b->write_blocked), + b->will_make_reachable != 0, +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 337d2bdd29e8..f3d7ec749b61 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -114,7 +114,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + break; + } + +- six_unlock_type(&b->lock, lock_held); ++ six_unlock_type(&b->c.lock, lock_held); + btree_node_wait_on_io(b); + btree_node_lock_type(c, b, lock_held); + } +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index acbd7a31ba0e..93d710faddae 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -51,7 +51,7 @@ static inline bool btree_iter_pos_after_node(struct btree_iter *iter, + static inline bool btree_iter_pos_in_node(struct btree_iter *iter, + struct btree *b) + { +- return iter->btree_id == b->btree_id && ++ return iter->btree_id == b->c.btree_id && + !btree_iter_pos_before_node(iter, b) && + !btree_iter_pos_after_node(iter, b); + } +@@ -68,11 +68,11 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) + struct btree_iter *linked; + unsigned readers = 0; + +- EBUG_ON(!btree_node_intent_locked(iter, b->level)); ++ EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); + + trans_for_each_iter(iter->trans, linked) +- if (linked->l[b->level].b == b && 
+- btree_node_read_locked(linked, b->level)) ++ if (linked->l[b->c.level].b == b && ++ btree_node_read_locked(linked, b->c.level)) + readers++; + + /* +@@ -82,10 +82,10 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) + * locked: + */ + atomic64_sub(__SIX_VAL(read_lock, readers), +- &b->lock.state.counter); ++ &b->c.lock.state.counter); + btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); + atomic64_add(__SIX_VAL(read_lock, readers), +- &b->lock.state.counter); ++ &b->c.lock.state.counter); + } + + bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) +@@ -99,7 +99,7 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) + if (race_fault()) + return false; + +- if (six_relock_type(&b->lock, want, iter->l[level].lock_seq) || ++ if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || + (btree_node_lock_seq_matches(iter, b, level) && + btree_node_lock_increment(iter->trans, b, level, want))) { + mark_btree_node_locked(iter, level, want); +@@ -125,8 +125,8 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) + return false; + + if (btree_node_locked(iter, level) +- ? six_lock_tryupgrade(&b->lock) +- : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq)) ++ ? six_lock_tryupgrade(&b->c.lock) ++ : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) + goto success; + + if (btree_node_lock_seq_matches(iter, b, level) && +@@ -162,7 +162,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, + ? 0 + : (unsigned long) iter->l[l].b, + is_btree_node(iter, l) +- ? iter->l[l].b->lock.state.seq ++ ? iter->l[l].b->c.lock.state.seq + : 0); + + fail_idx = l; +@@ -268,7 +268,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + */ + if (linked->l[level].b == b && + btree_node_locked_type(linked, level) >= type) { +- six_lock_increment(&b->lock, type); ++ six_lock_increment(&b->c.lock, type); + return true; + } + } +@@ -278,10 +278,10 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + return false; + } + +- if (six_trylock_type(&b->lock, type)) ++ if (six_trylock_type(&b->c.lock, type)) + return true; + +- if (six_lock_type(&b->lock, type, should_sleep_fn, p)) ++ if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) + return false; + + bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], +@@ -395,7 +395,7 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, + btree_node_unlock(iter, l); + } else { + if (btree_node_intent_locked(iter, l)) { +- six_lock_downgrade(&iter->l[l].b->lock); ++ six_lock_downgrade(&iter->l[l].b->c.lock); + iter->nodes_intent_locked ^= 1 << l; + } + break; +@@ -545,7 +545,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) + return; + + trans_for_each_iter_with_node(trans, b, iter) +- bch2_btree_iter_verify_level(iter, b->level); ++ bch2_btree_iter_verify_level(iter, b->c.level); + } + + #else +@@ -576,7 +576,7 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + struct btree *b, + struct bkey_packed *where) + { +- struct btree_iter_level *l = &iter->l[b->level]; ++ struct btree_iter_level *l = &iter->l[b->c.level]; + struct bpos pos = btree_iter_search_key(iter); + + if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) +@@ -596,7 +596,7 @@ void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + + trans_for_each_iter_with_node(iter->trans, b, linked) { + __bch2_btree_iter_fix_key_modified(linked, b, where); 
+- bch2_btree_iter_verify_level(linked, b->level); ++ bch2_btree_iter_verify_level(linked, b->c.level); + } + } + +@@ -666,7 +666,7 @@ fixup_done: + */ + if (!bch2_btree_node_iter_end(node_iter) && + iter_current_key_modified && +- (b->level || ++ (b->c.level || + btree_node_type_is_extents(iter->btree_id))) { + struct bset_tree *t; + struct bkey_packed *k, *k2, *p; +@@ -693,7 +693,7 @@ fixup_done: + } + } + +- if (!b->level && ++ if (!b->c.level && + node_iter == &iter->l[0].iter && + iter_current_key_modified) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +@@ -709,7 +709,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, + struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct btree_iter *linked; + +- if (node_iter != &iter->l[b->level].iter) { ++ if (node_iter != &iter->l[b->c.level].iter) { + __bch2_btree_node_iter_fix(iter, b, node_iter, t, + where, clobber_u64s, new_u64s); + +@@ -719,9 +719,9 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, + + trans_for_each_iter_with_node(iter->trans, b, linked) { + __bch2_btree_node_iter_fix(linked, b, +- &linked->l[b->level].iter, t, ++ &linked->l[b->c.level].iter, t, + where, clobber_u64s, new_u64s); +- bch2_btree_iter_verify_level(linked, b->level); ++ bch2_btree_iter_verify_level(linked, b->c.level); + } + } + +@@ -805,7 +805,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + return; + +- plevel = b->level + 1; ++ plevel = b->c.level + 1; + if (!btree_iter_node(iter, plevel)) + return; + +@@ -828,7 +828,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) + } + + if (!parent_locked) +- btree_node_unlock(iter, b->level + 1); ++ btree_node_unlock(iter, b->c.level + 1); + } + + static inline void __btree_iter_init(struct btree_iter *iter, +@@ -848,11 +848,11 @@ static inline void btree_iter_node_set(struct btree_iter *iter, + btree_iter_verify_new_node(iter, b); + + EBUG_ON(!btree_iter_pos_in_node(iter, b)); +- EBUG_ON(b->lock.state.seq & 1); ++ EBUG_ON(b->c.lock.state.seq & 1); + +- iter->l[b->level].lock_seq = b->lock.state.seq; +- iter->l[b->level].b = b; +- __btree_iter_init(iter, b->level); ++ iter->l[b->c.level].lock_seq = b->c.lock.state.seq; ++ iter->l[b->c.level].b = b; ++ __btree_iter_init(iter, b->c.level); + } + + /* +@@ -871,12 +871,12 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) + * the old node we're replacing has already been + * unlocked and the pointer invalidated + */ +- BUG_ON(btree_node_locked(linked, b->level)); ++ BUG_ON(btree_node_locked(linked, b->c.level)); + +- t = btree_lock_want(linked, b->level); ++ t = btree_lock_want(linked, b->c.level); + if (t != BTREE_NODE_UNLOCKED) { +- six_lock_increment(&b->lock, t); +- mark_btree_node_locked(linked, b->level, t); ++ six_lock_increment(&b->c.lock, t); ++ mark_btree_node_locked(linked, b->c.level, t); + } + + btree_iter_node_set(linked, b); +@@ -886,7 +886,7 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) + void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) + { + struct btree_iter *linked; +- unsigned level = b->level; ++ unsigned level = b->c.level; + + trans_for_each_iter(iter->trans, linked) + if (linked->l[level].b == b) { +@@ -904,12 +904,12 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) + struct btree_iter *linked; + + trans_for_each_iter_with_node(iter->trans, b, linked) +- __btree_iter_init(linked, b->level); ++ 
__btree_iter_init(linked, b->c.level); + } + + static int lock_root_check_fn(struct six_lock *lock, void *p) + { +- struct btree *b = container_of(lock, struct btree, lock); ++ struct btree *b = container_of(lock, struct btree, c.lock); + struct btree **rootp = p; + + return b == *rootp ? 0 : -1; +@@ -927,7 +927,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, + + while (1) { + b = READ_ONCE(*rootp); +- iter->level = READ_ONCE(b->level); ++ iter->level = READ_ONCE(b->c.level); + + if (unlikely(iter->level < depth_want)) { + /* +@@ -949,7 +949,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, + return -EINTR; + + if (likely(b == READ_ONCE(*rootp) && +- b->level == iter->level && ++ b->c.level == iter->level && + !race_fault())) { + for (i = 0; i < iter->level; i++) + iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; +@@ -962,7 +962,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, + return 0; + } + +- six_unlock_type(&b->lock, lock_type); ++ six_unlock_type(&b->c.lock, lock_type); + } + } + +@@ -2002,7 +2002,7 @@ static inline void btree_iter_copy(struct btree_iter *dst, + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(dst, i)) +- six_lock_increment(&dst->l[i].b->lock, ++ six_lock_increment(&dst->l[i].b->c.lock, + __btree_lock_want(dst, i)); + + dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; +@@ -2317,8 +2317,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + bch2_bpos_to_text(out, trans->locking_pos); + + pr_buf(out, " node %px l=%u %s:", +- b, b->level, +- bch2_btree_ids[b->btree_id]); ++ b, b->c.level, ++ bch2_btree_ids[b->c.btree_id]); + bch2_bpos_to_text(out, b->key.k.p); + pr_buf(out, "\n"); + } +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index b11d2a30d9c7..bc408f1272e7 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -27,13 +27,13 @@ static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, + * that write lock. 
The lock sequence number is incremented by taking + * and releasing write locks and is even when unlocked: + */ +- return iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1; ++ return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; + } + + static inline struct btree *btree_node_parent(struct btree_iter *iter, + struct btree *b) + { +- return btree_iter_node(iter, b->level + 1); ++ return btree_iter_node(iter, b->c.level + 1); + } + + static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) +@@ -73,8 +73,8 @@ __trans_next_iter(struct btree_trans *trans, unsigned idx) + static inline bool __iter_has_node(const struct btree_iter *iter, + const struct btree *b) + { +- return iter->l[b->level].b == b && +- btree_node_lock_seq_matches(iter, b, b->level); ++ return iter->l[b->c.level].b == b && ++ btree_node_lock_seq_matches(iter, b, b->c.level); + } + + static inline struct btree_iter * +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index da2a0ebbc24f..81fbf3e18647 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -102,7 +102,7 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) + EBUG_ON(level >= BTREE_MAX_DEPTH); + + if (lock_type != BTREE_NODE_UNLOCKED) +- six_unlock_type(&iter->l[level].b->lock, lock_type); ++ six_unlock_type(&iter->l[level].b->c.lock, lock_type); + mark_btree_node_unlocked(iter, level); + } + +@@ -143,14 +143,14 @@ static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, + { + u64 start_time = local_clock(); + +- six_lock_type(&b->lock, type, NULL, NULL); ++ six_lock_type(&b->c.lock, type, NULL, NULL); + bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); + } + + static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, + enum six_lock_type type) + { +- if (!six_trylock_type(&b->lock, type)) ++ if (!six_trylock_type(&b->c.lock, type)) + __btree_node_lock_type(c, b, type); + } + +@@ -167,7 +167,7 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, + trans_for_each_iter(trans, iter) + if (iter->l[level].b == b && + btree_node_locked_type(iter, level) >= want) { +- six_lock_increment(&b->lock, want); ++ six_lock_increment(&b->c.lock, want); + return true; + } + +@@ -197,7 +197,7 @@ static inline bool btree_node_lock(struct btree *b, + trans->locking_btree_id = iter->btree_id; + trans->locking_level = level; + #endif +- ret = likely(six_trylock_type(&b->lock, type)) || ++ ret = likely(six_trylock_type(&b->c.lock, type)) || + btree_node_lock_increment(trans, b, level, type) || + __bch2_btree_node_lock(b, pos, level, iter, type, + should_sleep_fn, p); +@@ -230,13 +230,13 @@ bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) + { + struct btree_iter *linked; + +- EBUG_ON(iter->l[b->level].b != b); +- EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); ++ EBUG_ON(iter->l[b->c.level].b != b); ++ EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + + trans_for_each_iter_with_node(iter->trans, b, linked) +- linked->l[b->level].lock_seq += 2; ++ linked->l[b->c.level].lock_seq += 2; + +- six_unlock_write(&b->lock); ++ six_unlock_write(&b->c.lock); + } + + void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); +@@ -245,10 +245,10 @@ void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); + + static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) + { +- 
EBUG_ON(iter->l[b->level].b != b); +- EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq); ++ EBUG_ON(iter->l[b->c.level].b != b); ++ EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); + +- if (unlikely(!six_trylock_write(&b->lock))) ++ if (unlikely(!six_trylock_write(&b->c.lock))) + __bch2_btree_node_lock_write(b, iter); + } + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 047b7b0776a1..9ca4032f49a6 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -60,17 +60,20 @@ struct btree_alloc { + BKEY_PADDED(k); + }; + ++struct btree_bkey_cached_common { ++ struct six_lock lock; ++ u8 level; ++ u8 btree_id; ++}; ++ + struct btree { +- /* Hottest entries first */ ++ struct btree_bkey_cached_common c; ++ + struct rhash_head hash; + u64 hash_val; + +- struct six_lock lock; +- + unsigned long flags; + u16 written; +- u8 level; +- u8 btree_id; + u8 nsets; + u8 nr_key_bits; + +@@ -495,7 +498,7 @@ static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_ + /* Type of keys @b contains: */ + static inline enum btree_node_type btree_node_type(struct btree *b) + { +- return __btree_node_type(b->level, b->btree_id); ++ return __btree_node_type(b->c.level, b->c.btree_id); + } + + static inline bool btree_node_type_is_extents(enum btree_node_type type) +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 4d38943a4f0c..a626a7698d13 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -35,7 +35,7 @@ static void btree_node_interior_verify(struct btree *b) + struct bkey_s_c_btree_ptr_v2 bp; + struct bkey unpacked; + +- BUG_ON(!b->level); ++ BUG_ON(!b->c.level); + + bch2_btree_node_iter_init_from_start(&iter, b); + +@@ -135,7 +135,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) + + bch2_btree_node_hash_remove(&c->btree_cache, b); + +- six_lock_wakeup_all(&b->lock); ++ six_lock_wakeup_all(&b->c.lock); + + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); +@@ -152,7 +152,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) + + btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); +- six_unlock_write(&b->lock); ++ six_unlock_write(&b->c.lock); + + bch2_open_buckets_put(c, &ob); + } +@@ -163,12 +163,12 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, + struct btree_iter *linked; + + trans_for_each_iter(iter->trans, linked) +- BUG_ON(linked->l[b->level].b == b); ++ BUG_ON(linked->l[b->c.level].b == b); + +- six_lock_write(&b->lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); + __btree_node_free(c, b); +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + } + + static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, +@@ -267,8 +267,8 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + set_btree_node_need_write(b); + + bch2_bset_init_first(b, &b->data->keys); +- b->level = level; +- b->btree_id = as->btree_id; ++ b->c.level = level; ++ b->c.btree_id = as->btree_id; + + memset(&b->nr, 0, sizeof(b->nr)); + b->data->magic = cpu_to_le64(bset_magic(c)); +@@ -321,7 +321,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, + { + struct btree *n; + +- n = bch2_btree_node_alloc(as, b->level); ++ n = bch2_btree_node_alloc(as, b->c.level); + + SET_BTREE_NODE_SEQ(n->data, 
BTREE_NODE_SEQ(b->data) + 1); + +@@ -366,7 +366,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) + bch2_btree_build_aux_trees(b); + + bch2_btree_update_add_new_node(as, b); +- six_unlock_write(&b->lock); ++ six_unlock_write(&b->c.lock); + + return b; + } +@@ -380,7 +380,7 @@ static void bch2_btree_reserve_put(struct btree_update *as) + while (as->nr_prealloc_nodes) { + struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + +- six_unlock_write(&b->lock); ++ six_unlock_write(&b->c.lock); + + if (c->btree_reserve_cache_nr < + ARRAY_SIZE(c->btree_reserve_cache)) { +@@ -396,9 +396,9 @@ static void bch2_btree_reserve_put(struct btree_update *as) + + btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); +- six_unlock_write(&b->lock); ++ six_unlock_write(&b->c.lock); + +- six_unlock_intent(&b->lock); ++ six_unlock_intent(&b->c.lock); + } + + mutex_unlock(&c->btree_reserve_cache_lock); +@@ -560,7 +560,7 @@ static void btree_update_nodes_written(struct btree_update *as) + if (!ret && as->b == b) { + struct bset *i = btree_bset_last(b); + +- BUG_ON(!b->level); ++ BUG_ON(!b->c.level); + BUG_ON(!btree_node_dirty(b)); + + i->journal_seq = cpu_to_le64( +@@ -571,10 +571,10 @@ static void btree_update_nodes_written(struct btree_update *as) + } + + mutex_unlock(&c->btree_interior_update_lock); +- six_unlock_write(&b->lock); ++ six_unlock_write(&b->c.lock); + + btree_node_write_if_need(c, b, SIX_LOCK_intent); +- six_unlock_intent(&b->lock); ++ six_unlock_intent(&b->c.lock); + } + + bch2_journal_pin_drop(&c->journal, &as->journal); +@@ -595,7 +595,7 @@ static void btree_update_nodes_written(struct btree_update *as) + + btree_node_lock_type(c, b, SIX_LOCK_read); + btree_node_write_if_need(c, b, SIX_LOCK_read); +- six_unlock_read(&b->lock); ++ six_unlock_read(&b->c.lock); + } + + for (i = 0; i < as->nr_open_buckets; i++) +@@ -694,7 +694,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) + as->journal_u64s += + journal_entry_set((void *) &as->journal_entries[as->journal_u64s], + BCH_JSET_ENTRY_btree_root, +- b->btree_id, b->level, ++ b->c.btree_id, b->c.level, + insert, insert->k.u64s); + + mutex_lock(&c->btree_interior_update_lock); +@@ -946,7 +946,7 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) + + mutex_lock(&c->btree_root_lock); + BUG_ON(btree_node_root(c, b) && +- (b->level < btree_node_root(c, b)->level || ++ (b->c.level < btree_node_root(c, b)->c.level || + !btree_node_dying(btree_node_root(c, b)))); + + btree_node_root(c, b) = b; +@@ -1014,7 +1014,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + as->journal_u64s += + journal_entry_set((void *) &as->journal_entries[as->journal_u64s], + BCH_JSET_ENTRY_btree_keys, +- b->btree_id, b->level, ++ b->c.btree_id, b->c.level, + insert, insert->k.u64s); + + while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && +@@ -1039,7 +1039,7 @@ static struct btree *__btree_split_node(struct btree_update *as, + struct bset *set1, *set2; + struct bkey_packed *k, *prev = NULL; + +- n2 = bch2_btree_node_alloc(as, n1->level); ++ n2 = bch2_btree_node_alloc(as, n1->c.level); + bch2_btree_update_add_new_node(as, n2); + + n2->data->max_key = n1->data->max_key; +@@ -1108,7 +1108,7 @@ static struct btree *__btree_split_node(struct btree_update *as, + bch2_verify_btree_nr_keys(n1); + bch2_verify_btree_nr_keys(n2); + +- if (n1->level) { ++ if (n1->c.level) { + btree_node_interior_verify(n1); + 
btree_node_interior_verify(n2); + } +@@ -1182,7 +1182,7 @@ static void btree_split(struct btree_update *as, struct btree *b, + u64 start_time = local_clock(); + + BUG_ON(!parent && (b != btree_node_root(c, b))); +- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); ++ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); + + bch2_btree_interior_update_will_free_node(as, b); + +@@ -1199,8 +1199,8 @@ static void btree_split(struct btree_update *as, struct btree *b, + + bch2_btree_build_aux_trees(n2); + bch2_btree_build_aux_trees(n1); +- six_unlock_write(&n2->lock); +- six_unlock_write(&n1->lock); ++ six_unlock_write(&n2->c.lock); ++ six_unlock_write(&n1->c.lock); + + bch2_btree_node_write(c, n2, SIX_LOCK_intent); + +@@ -1214,7 +1214,7 @@ static void btree_split(struct btree_update *as, struct btree *b, + + if (!parent) { + /* Depth increases, make a new root */ +- n3 = __btree_root_alloc(as, b->level + 1); ++ n3 = __btree_root_alloc(as, b->c.level + 1); + + n3->sib_u64s[0] = U16_MAX; + n3->sib_u64s[1] = U16_MAX; +@@ -1227,7 +1227,7 @@ static void btree_split(struct btree_update *as, struct btree *b, + trace_btree_compact(c, b); + + bch2_btree_build_aux_trees(n1); +- six_unlock_write(&n1->lock); ++ six_unlock_write(&n1->c.lock); + + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); +@@ -1255,7 +1255,7 @@ static void btree_split(struct btree_update *as, struct btree *b, + + /* Successful split, update the iterator to point to the new nodes: */ + +- six_lock_increment(&b->lock, SIX_LOCK_intent); ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); + bch2_btree_iter_node_drop(iter, b); + if (n3) + bch2_btree_iter_node_replace(iter, n3); +@@ -1272,10 +1272,10 @@ static void btree_split(struct btree_update *as, struct btree *b, + bch2_btree_node_free_inmem(c, b, iter); + + if (n3) +- six_unlock_intent(&n3->lock); ++ six_unlock_intent(&n3->c.lock); + if (n2) +- six_unlock_intent(&n2->lock); +- six_unlock_intent(&n1->lock); ++ six_unlock_intent(&n2->c.lock); ++ six_unlock_intent(&n1->c.lock); + + bch2_btree_trans_verify_locks(iter->trans); + +@@ -1293,7 +1293,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + struct bkey_packed *k; + + /* Don't screw up @iter's position: */ +- node_iter = iter->l[b->level].iter; ++ node_iter = iter->l[b->c.level].iter; + + /* + * btree_split(), btree_gc_coalesce() will insert keys before +@@ -1310,7 +1310,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + btree_update_updated_node(as, b); + + trans_for_each_iter_with_node(iter->trans, b, linked) +- bch2_btree_node_iter_peek(&linked->l[b->level].iter, b); ++ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); + + bch2_btree_trans_verify_iters(iter->trans, b); + } +@@ -1336,8 +1336,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + +- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); +- BUG_ON(!b->level); ++ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); ++ BUG_ON(!b->c.level); + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + +@@ -1374,7 +1374,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + * the btree iterator yet, so the merge path's unlock/wait/relock dance + * won't work: + */ +- bch2_foreground_maybe_merge(c, iter, b->level, ++ bch2_foreground_maybe_merge(c, iter, b->c.level, + flags|BTREE_INSERT_NOUNLOCK); + 
return; + split: +@@ -1526,7 +1526,7 @@ retry: + b->sib_u64s[sib] = sib_u64s; + + if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { +- six_unlock_intent(&m->lock); ++ six_unlock_intent(&m->c.lock); + goto out; + } + +@@ -1556,7 +1556,7 @@ retry: + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_interior_update_will_free_node(as, m); + +- n = bch2_btree_node_alloc(as, b->level); ++ n = bch2_btree_node_alloc(as, b->c.level); + bch2_btree_update_add_new_node(as, n); + + btree_set_min(n, prev->data->min_key); +@@ -1569,7 +1569,7 @@ retry: + bch2_btree_sort_into(c, n, next); + + bch2_btree_build_aux_trees(n); +- six_unlock_write(&n->lock); ++ six_unlock_write(&n->c.lock); + + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; +@@ -1582,7 +1582,7 @@ retry: + + bch2_btree_update_get_open_buckets(as, n); + +- six_lock_increment(&b->lock, SIX_LOCK_intent); ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); + bch2_btree_iter_node_drop(iter, b); + bch2_btree_iter_node_drop(iter, m); + +@@ -1593,7 +1593,7 @@ retry: + bch2_btree_node_free_inmem(c, b, iter); + bch2_btree_node_free_inmem(c, m, iter); + +- six_unlock_intent(&n->lock); ++ six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as); + +@@ -1615,7 +1615,7 @@ out: + return; + + err_cycle_gc_lock: +- six_unlock_intent(&m->lock); ++ six_unlock_intent(&m->c.lock); + + if (flags & BTREE_INSERT_NOUNLOCK) + goto out; +@@ -1628,7 +1628,7 @@ err_cycle_gc_lock: + goto err; + + err_unlock: +- six_unlock_intent(&m->lock); ++ six_unlock_intent(&m->c.lock); + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); + err: +@@ -1671,7 +1671,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + bch2_btree_update_add_new_node(as, n); + + bch2_btree_build_aux_trees(n); +- six_unlock_write(&n->lock); ++ six_unlock_write(&n->c.lock); + + trace_btree_gc_rewrite_node(c, b); + +@@ -1686,11 +1686,11 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + + bch2_btree_update_get_open_buckets(as, n); + +- six_lock_increment(&b->lock, SIX_LOCK_intent); ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); + bch2_btree_iter_node_drop(iter, b); + bch2_btree_iter_node_replace(iter, n); + bch2_btree_node_free_inmem(c, b, iter); +- six_unlock_intent(&n->lock); ++ six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as); + return 0; +@@ -1767,7 +1767,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + if (new_hash) { + bkey_copy(&new_hash->key, new_key); + ret = bch2_btree_node_hash_insert(&c->btree_cache, +- new_hash, b->level, b->btree_id); ++ new_hash, b->c.level, b->c.btree_id); + BUG_ON(ret); + } + +@@ -1893,8 +1893,8 @@ err: + list_move(&new_hash->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + +- six_unlock_write(&new_hash->lock); +- six_unlock_intent(&new_hash->lock); ++ six_unlock_write(&new_hash->c.lock); ++ six_unlock_intent(&new_hash->c.lock); + } + up_read(&c->gc_lock); + closure_sync(&cl); +@@ -1934,8 +1934,8 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) + bch2_btree_cache_cannibalize_unlock(c); + + set_btree_node_fake(b); +- b->level = 0; +- b->btree_id = id; ++ b->c.level = 0; ++ b->c.btree_id = id; + + bkey_btree_ptr_init(&b->key); + b->key.k.p = POS_MAX; +@@ -1950,13 +1950,14 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) + b->data->format = bch2_btree_calc_format(b); + btree_node_set_format(b, b->data->format); + +- ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, 
b->btree_id); ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, ++ b->c.level, b->c.btree_id); + BUG_ON(ret); + + bch2_btree_set_root_inmem(c, b); + +- six_unlock_write(&b->lock); +- six_unlock_intent(&b->lock); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + } + + ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index e00dc51ff3eb..4a5b9dcfbdd0 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -173,7 +173,7 @@ void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); + static inline unsigned btree_update_reserve_required(struct bch_fs *c, + struct btree *b) + { +- unsigned depth = btree_node_root(c, b)->level + 1; ++ unsigned depth = btree_node_root(c, b)->c.level + 1; + + /* + * Number of nodes we might have to allocate in a worst case btree +@@ -181,9 +181,9 @@ static inline unsigned btree_update_reserve_required(struct bch_fs *c, + * a new root, unless we're already at max depth: + */ + if (depth < BTREE_MAX_DEPTH) +- return (depth - b->level) * 2 + 1; ++ return (depth - b->c.level) * 2 + 1; + else +- return (depth - b->level) * 2 - 1; ++ return (depth - b->c.level) * 2 - 1; + } + + static inline void btree_node_reset_sib_u64s(struct btree *b) +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 283c10feb81f..1a1fd230e4b9 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -135,7 +135,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write_cond(c, b, + (btree_current_write(b) == w && w->journal.seq == seq)); +- six_unlock_read(&b->lock); ++ six_unlock_read(&b->c.lock); + } + + static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 69b123bad83b..4e0d14e37287 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -52,8 +52,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + + bkey_copy(&v->key, &b->key); + v->written = 0; +- v->level = b->level; +- v->btree_id = b->btree_id; ++ v->c.level = b->c.level; ++ v->c.btree_id = b->c.btree_id; + bch2_btree_keys_init(v, &c->expensive_debug_checks); + + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 1f26d9e19fe9..26e5767aa5de 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -188,7 +188,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i + iter->b = b; + bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); + bch2_journal_iter_init(&iter->journal, journal_keys, +- b->btree_id, b->level, b->data->min_key); ++ b->c.btree_id, b->c.level, b->data->min_key); + } + + /* Walk btree, overlaying keys from the journal: */ +@@ -206,11 +206,11 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { +- ret = key_fn(c, btree_id, b->level, k); ++ ret = key_fn(c, btree_id, b->c.level, k); + if (ret) + break; + +- if (b->level) { ++ if (b->c.level) { + struct btree *child; + BKEY_PADDED(k) tmp; + +@@ -219,9 +219,9 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + + 
bch2_btree_and_journal_iter_advance(&iter); + +- if (b->level > 0) { ++ if (b->c.level > 0) { + child = bch2_btree_node_get_noiter(c, &tmp.k, +- b->btree_id, b->level - 1); ++ b->c.btree_id, b->c.level - 1); + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; +@@ -229,7 +229,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + ret = (node_fn ? node_fn(c, b) : 0) ?: + bch2_btree_and_journal_walk_recurse(c, child, + journal_keys, btree_id, node_fn, key_fn); +- six_unlock_read(&child->lock); ++ six_unlock_read(&child->c.lock); + + if (ret) + break; +@@ -253,12 +253,12 @@ int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_k + if (btree_node_fake(b)) + return 0; + +- six_lock_read(&b->lock, NULL, NULL); ++ six_lock_read(&b->c.lock, NULL, NULL); + ret = (node_fn ? node_fn(c, b) : 0) ?: + bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, + node_fn, key_fn) ?: +- key_fn(c, btree_id, b->level + 1, bkey_i_to_s_c(&b->key)); +- six_unlock_read(&b->lock); ++ key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); ++ six_unlock_read(&b->c.lock); + + return ret; + } +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index c30a60e1d805..05ec550806f8 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -144,8 +144,8 @@ DECLARE_EVENT_CLASS(btree_node, + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); +- __entry->level = b->level; +- __entry->id = b->btree_id; ++ __entry->level = b->c.level; ++ __entry->id = b->c.btree_id; + __entry->inode = b->key.k.p.inode; + __entry->offset = b->key.k.p.offset; + ), +@@ -262,7 +262,7 @@ TRACE_EVENT(btree_insert_key, + ), + + TP_fast_assign( +- __entry->id = b->btree_id; ++ __entry->id = b->c.btree_id; + __entry->inode = k->k.p.inode; + __entry->offset = k->k.p.offset; + __entry->size = k->k.size; +-- +cgit v1.2.3 + + +From 5b5faeb468ed3d333076b0e5da3d3fac2dd05288 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Mar 2019 19:46:10 -0500 +Subject: bcachefs: Btree key cache + +This introduces a new kind of btree iterator, cached iterators, which +point to keys cached in a hash table. The cache also acts as a write +cache - in the update path, we journal the update but defer updating the +btree until the cached entry is flushed by journal reclaim. + +Cache coherency is for now up to the users to handle, which isn't ideal +but should be good enough for now. + +These new iterators will be used for updating inodes and alloc info (the +alloc and stripes btrees). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/Makefile | 1 + + fs/bcachefs/bcachefs.h | 3 + + fs/bcachefs/btree_iter.c | 115 ++++++--- + fs/bcachefs/btree_iter.h | 16 +- + fs/bcachefs/btree_key_cache.c | 494 ++++++++++++++++++++++++++++++++++++ + fs/bcachefs/btree_key_cache.h | 23 ++ + fs/bcachefs/btree_types.h | 53 +++- + fs/bcachefs/btree_update.h | 5 + + fs/bcachefs/btree_update_interior.c | 9 + + fs/bcachefs/btree_update_leaf.c | 67 ++++- + fs/bcachefs/buckets.c | 7 + + fs/bcachefs/journal_reclaim.c | 31 +++ + fs/bcachefs/journal_reclaim.h | 4 + + fs/bcachefs/super.c | 4 + + 14 files changed, 787 insertions(+), 45 deletions(-) + create mode 100644 fs/bcachefs/btree_key_cache.c + create mode 100644 fs/bcachefs/btree_key_cache.h + +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index c7727d05cf49..d85ced62c0dd 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -13,6 +13,7 @@ bcachefs-y := \ + btree_gc.o \ + btree_io.o \ + btree_iter.o \ ++ btree_key_cache.o \ + btree_update_interior.o \ + btree_update_leaf.o \ + buckets.o \ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 09c69e7a0ae2..b25bc1d6c659 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -484,6 +484,7 @@ enum { + BCH_FS_ALLOCATOR_RUNNING, + BCH_FS_ALLOCATOR_STOPPING, + BCH_FS_INITIAL_GC_DONE, ++ BCH_FS_BTREE_INTERIOR_REPLAY_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, + BCH_FS_RW, +@@ -633,6 +634,8 @@ struct bch_fs { + struct list_head btree_trans_list; + mempool_t btree_iters_pool; + ++ struct btree_key_cache btree_key_cache; ++ + struct workqueue_struct *wq; + /* copygc needs its own workqueue for index updates.. */ + struct workqueue_struct *copygc_wq; +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 93d710faddae..e620088d3116 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -4,22 +4,16 @@ + #include "bkey_methods.h" + #include "btree_cache.h" + #include "btree_iter.h" ++#include "btree_key_cache.h" + #include "btree_locking.h" + #include "btree_update.h" + #include "debug.h" + #include "extents.h" ++#include "journal.h" + + #include + #include + +-#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) +-#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) +-#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) +-#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) +-#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) +-#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) +-#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) +- + static inline bool is_btree_node(struct btree_iter *iter, unsigned l) + { + return l < BTREE_MAX_DEPTH && +@@ -253,7 +247,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + } + + /* Must lock btree nodes in key order: */ +- if (iter->btree_id < linked->btree_id) ++ if ((cmp_int(iter->btree_id, linked->btree_id) ?: ++ -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) + ret = false; + + if (iter->btree_id == linked->btree_id && +@@ -301,7 +296,7 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter) + return; + } + +- for (l = 0; btree_iter_node(iter, l); l++) { ++ for (l = 0; is_btree_node(iter, l); l++) { + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && + !btree_node_locked(iter, l)) + continue; +@@ -323,7 +318,7 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} + #endif + + __flatten +-static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) ++bool bch2_btree_iter_relock(struct 
btree_iter *iter, bool trace) + { + return btree_iter_get_locks(iter, false, trace); + } +@@ -845,6 +840,8 @@ static inline void __btree_iter_init(struct btree_iter *iter, + static inline void btree_iter_node_set(struct btree_iter *iter, + struct btree *b) + { ++ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); ++ + btree_iter_verify_new_node(iter, b); + + EBUG_ON(!btree_iter_pos_in_node(iter, b)); +@@ -865,7 +862,8 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) + struct btree_iter *linked; + + trans_for_each_iter(iter->trans, linked) +- if (btree_iter_pos_in_node(linked, b)) { ++ if (btree_iter_type(linked) != BTREE_ITER_CACHED && ++ btree_iter_pos_in_node(linked, b)) { + /* + * bch2_btree_iter_node_drop() has already been called - + * the old node we're replacing has already been +@@ -1057,24 +1055,28 @@ static void btree_iter_up(struct btree_iter *iter) + + static int btree_iter_traverse_one(struct btree_iter *); + +-static int __btree_iter_traverse_all(struct btree_trans *trans, +- struct btree_iter *orig_iter, int ret) ++static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) + { + struct bch_fs *c = trans->c; + struct btree_iter *iter; + u8 sorted[BTREE_ITER_MAX]; + unsigned i, nr_sorted = 0; + ++ if (trans->in_traverse_all) ++ return -EINTR; ++ ++ trans->in_traverse_all = true; ++retry_all: ++ nr_sorted = 0; ++ + trans_for_each_iter(trans, iter) +- sorted[nr_sorted++] = iter - trans->iters; ++ sorted[nr_sorted++] = iter->idx; + + #define btree_iter_cmp_by_idx(_l, _r) \ + btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) + + bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); + #undef btree_iter_cmp_by_idx +- +-retry_all: + bch2_trans_unlock(trans); + + if (unlikely(ret == -ENOMEM)) { +@@ -1090,11 +1092,6 @@ retry_all: + + if (unlikely(ret == -EIO)) { + trans->error = true; +- if (orig_iter) { +- orig_iter->flags |= BTREE_ITER_ERROR; +- orig_iter->l[orig_iter->level].b = +- BTREE_ITER_NO_NODE_ERROR; +- } + goto out; + } + +@@ -1102,9 +1099,16 @@ retry_all: + + /* Now, redo traversals in correct order: */ + for (i = 0; i < nr_sorted; i++) { +- iter = &trans->iters[sorted[i]]; ++ unsigned idx = sorted[i]; ++ ++ /* ++ * sucessfully traversing one iterator can cause another to be ++ * unlinked, in btree_key_cache_fill() ++ */ ++ if (!(trans->iters_linked & (1ULL << idx))) ++ continue; + +- ret = btree_iter_traverse_one(iter); ++ ret = btree_iter_traverse_one(&trans->iters[idx]); + if (ret) + goto retry_all; + } +@@ -1119,12 +1123,14 @@ retry_all: + } + out: + bch2_btree_cache_cannibalize_unlock(c); ++ ++ trans->in_traverse_all = false; + return ret; + } + + int bch2_btree_iter_traverse_all(struct btree_trans *trans) + { +- return __btree_iter_traverse_all(trans, NULL, 0); ++ return __btree_iter_traverse_all(trans, 0); + } + + static inline bool btree_iter_good_node(struct btree_iter *iter, +@@ -1169,9 +1175,6 @@ static int btree_iter_traverse_one(struct btree_iter *iter) + { + unsigned depth_want = iter->level; + +- if (unlikely(iter->level >= BTREE_MAX_DEPTH)) +- return 0; +- + /* + * if we need interior nodes locked, call btree_iter_relock() to make + * sure we walk back up enough that we lock them: +@@ -1180,9 +1183,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter) + iter->locks_want > 1) + bch2_btree_iter_relock(iter, false); + ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) ++ return bch2_btree_iter_traverse_cached(iter); ++ + if (iter->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + ++ if 
(unlikely(iter->level >= BTREE_MAX_DEPTH)) ++ return 0; ++ + /* + * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos + * here unnecessary +@@ -1216,7 +1225,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter) + return 0; + + iter->level = depth_want; +- iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN; ++ ++ if (ret == -EIO) { ++ iter->flags |= BTREE_ITER_ERROR; ++ iter->l[iter->level].b = ++ BTREE_ITER_NO_NODE_ERROR; ++ } else { ++ iter->l[iter->level].b = ++ BTREE_ITER_NO_NODE_DOWN; ++ } + return ret; + } + } +@@ -1229,12 +1246,13 @@ static int btree_iter_traverse_one(struct btree_iter *iter) + + int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; + int ret; + +- ret = bch2_trans_cond_resched(iter->trans) ?: ++ ret = bch2_trans_cond_resched(trans) ?: + btree_iter_traverse_one(iter); + if (unlikely(ret)) +- ret = __btree_iter_traverse_all(iter->trans, iter, ret); ++ ret = __btree_iter_traverse_all(trans, ret); + + return ret; + } +@@ -1383,6 +1401,13 @@ static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) + if (!cmp) + goto out; + ++ if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { ++ btree_node_unlock(iter, 0); ++ iter->l[0].b = BTREE_ITER_NO_NODE_UP; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ return; ++ } ++ + l = btree_iter_up_until_good_node(iter, cmp); + + if (btree_iter_node(iter, l)) { +@@ -1814,6 +1839,26 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + return bch2_btree_iter_peek_slot(iter); + } + ++struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) ++{ ++ struct bkey_cached *ck; ++ int ret; ++ ++ bch2_btree_iter_checks(iter, BTREE_ITER_CACHED); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ ck = (void *) iter->l[0].b; ++ ++ EBUG_ON(iter->btree_id != ck->key.btree_id || ++ bkey_cmp(iter->pos, ck->key.pos)); ++ BUG_ON(!ck->valid); ++ ++ return bkey_i_to_s_c(ck->k); ++} ++ + static inline void bch2_btree_iter_init(struct btree_trans *trans, + struct btree_iter *iter, enum btree_id btree_id, + struct bpos pos, unsigned flags) +@@ -1999,6 +2044,7 @@ static inline void btree_iter_copy(struct btree_iter *dst, + + *dst = *src; + dst->idx = idx; ++ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(dst, i)) +@@ -2057,8 +2103,9 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + iter = best; + } + +- iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); +- iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); ++ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ iter->flags &= ~BTREE_ITER_USER_FLAGS; ++ iter->flags |= flags & BTREE_ITER_USER_FLAGS; + + if (iter->flags & BTREE_ITER_INTENT) + bch2_btree_iter_upgrade(iter, 1); +@@ -2262,6 +2309,8 @@ int bch2_trans_exit(struct btree_trans *trans) + mutex_unlock(&trans->c->btree_trans_lock); + #endif + ++ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ + kfree(trans->fs_usage_deltas); + kfree(trans->mem); + if (trans->used_mempool) +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index bc408f1272e7..bd9ec3ec9a92 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -110,6 +110,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bkey_packed *, + unsigned, 
unsigned); + ++bool bch2_btree_iter_relock(struct btree_iter *, bool); + bool bch2_trans_relock(struct btree_trans *); + void bch2_trans_unlock(struct btree_trans *); + +@@ -170,6 +171,8 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); + ++struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); ++ + void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); + void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); + void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); +@@ -177,7 +180,9 @@ void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + static inline int btree_iter_cmp(const struct btree_iter *l, + const struct btree_iter *r) + { +- return cmp_int(l->btree_id, r->btree_id) ?: bkey_cmp(l->pos, r->pos); ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: ++ bkey_cmp(l->pos, r->pos); + } + + /* +@@ -211,9 +216,12 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) + static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, + unsigned flags) + { +- return flags & BTREE_ITER_SLOTS +- ? bch2_btree_iter_peek_slot(iter) +- : bch2_btree_iter_peek(iter); ++ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) ++ return bch2_btree_iter_peek_cached(iter); ++ else ++ return flags & BTREE_ITER_SLOTS ++ ? bch2_btree_iter_peek_slot(iter) ++ : bch2_btree_iter_peek(iter); + } + + static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +new file mode 100644 +index 000000000000..0b03262acd1e +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.c +@@ -0,0 +1,494 @@ ++ ++#include "bcachefs.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++ ++#include ++ ++static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct bkey_cached *ck = obj; ++ const struct bkey_cached_key *key = arg->key; ++ ++ return cmp_int(ck->key.btree_id, key->btree_id) ?: ++ bkey_cmp(ck->key.pos, key->pos); ++} ++ ++static const struct rhashtable_params bch2_btree_key_cache_params = { ++ .head_offset = offsetof(struct bkey_cached, hash), ++ .key_offset = offsetof(struct bkey_cached, key), ++ .key_len = sizeof(struct bkey_cached_key), ++ .obj_cmpfn = bch2_btree_key_cache_cmp_fn, ++}; ++ ++__flatten ++static inline struct bkey_cached * ++btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) ++{ ++ struct bkey_cached_key key = { ++ .btree_id = btree_id, ++ .pos = pos, ++ }; ++ ++ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, ++ bch2_btree_key_cache_params); ++} ++ ++static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) ++{ ++ if (!six_trylock_intent(&ck->c.lock)) ++ return false; ++ ++ if (!six_trylock_write(&ck->c.lock)) { ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void bkey_cached_evict(struct btree_key_cache *c, ++ struct bkey_cached *ck) ++{ ++ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, ++ 
bch2_btree_key_cache_params)); ++ memset(&ck->key, ~0, sizeof(ck->key)); ++} ++ ++static void bkey_cached_free(struct btree_key_cache *c, ++ struct bkey_cached *ck) ++{ ++ list_move(&ck->list, &c->freed); ++ ++ kfree(ck->k); ++ ck->k = NULL; ++ ck->u64s = 0; ++ ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++} ++ ++static struct bkey_cached * ++bkey_cached_alloc(struct btree_key_cache *c) ++{ ++ struct bkey_cached *ck; ++ ++ list_for_each_entry(ck, &c->freed, list) ++ if (bkey_cached_lock_for_evict(ck)) ++ return ck; ++ ++ list_for_each_entry(ck, &c->clean, list) ++ if (bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(c, ck); ++ return ck; ++ } ++ ++ ck = kzalloc(sizeof(*ck), GFP_NOFS); ++ if (!ck) ++ return NULL; ++ ++ INIT_LIST_HEAD(&ck->list); ++ six_lock_init(&ck->c.lock); ++ BUG_ON(!six_trylock_intent(&ck->c.lock)); ++ BUG_ON(!six_trylock_write(&ck->c.lock)); ++ ++ return ck; ++} ++ ++static struct bkey_cached * ++btree_key_cache_create(struct btree_key_cache *c, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct bkey_cached *ck; ++ ++ ck = bkey_cached_alloc(c); ++ if (!ck) ++ return ERR_PTR(-ENOMEM); ++ ++ ck->c.level = 0; ++ ck->c.btree_id = btree_id; ++ ck->key.btree_id = btree_id; ++ ck->key.pos = pos; ++ ck->valid = false; ++ ++ BUG_ON(ck->flags); ++ ++ if (rhashtable_lookup_insert_fast(&c->table, ++ &ck->hash, ++ bch2_btree_key_cache_params)) { ++ /* We raced with another fill: */ ++ bkey_cached_free(c, ck); ++ return NULL; ++ } ++ ++ list_move(&ck->list, &c->clean); ++ six_unlock_write(&ck->c.lock); ++ ++ return ck; ++} ++ ++static int btree_key_cache_fill(struct btree_trans *trans, ++ struct btree_iter *ck_iter, ++ struct bkey_cached *ck) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ unsigned new_u64s = 0; ++ struct bkey_i *new_k = NULL; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, ck->key.btree_id, ++ ck->key.pos, BTREE_ITER_SLOTS); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++ } ++ ++ if (!bch2_btree_node_relock(ck_iter, 0)) { ++ bch2_trans_iter_put(trans, iter); ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ return -EINTR; ++ } ++ ++ if (k.k->u64s > ck->u64s) { ++ new_u64s = roundup_pow_of_two(k.k->u64s); ++ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) { ++ bch2_trans_iter_put(trans, iter); ++ return -ENOMEM; ++ } ++ } ++ ++ bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); ++ if (new_k) { ++ kfree(ck->k); ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ } ++ ++ bkey_reassemble(ck->k, k); ++ ck->valid = true; ++ bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); ++ ++ /* We're not likely to need this iterator again: */ ++ bch2_trans_iter_free(trans, iter); ++ ++ return 0; ++} ++ ++static int bkey_cached_check_fn(struct six_lock *lock, void *p) ++{ ++ struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); ++ const struct btree_iter *iter = p; ++ ++ return ck->key.btree_id == iter->btree_id && ++ !bkey_cmp(ck->key.pos, iter->pos) ? 
0 : -1; ++} ++ ++int bch2_btree_iter_traverse_cached(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck; ++ int ret = 0; ++ ++ BUG_ON(iter->level); ++ ++ if (btree_node_locked(iter, 0)) { ++ ck = (void *) iter->l[0].b; ++ goto fill; ++ } ++retry: ++ ck = btree_key_cache_find(c, iter->btree_id, iter->pos); ++ if (!ck) { ++ if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { ++ iter->l[0].b = NULL; ++ return 0; ++ } ++ ++ mutex_lock(&c->btree_key_cache.lock); ++ ck = btree_key_cache_create(&c->btree_key_cache, ++ iter->btree_id, iter->pos); ++ mutex_unlock(&c->btree_key_cache.lock); ++ ++ ret = PTR_ERR_OR_ZERO(ck); ++ if (ret) ++ goto err; ++ if (!ck) ++ goto retry; ++ ++ mark_btree_node_locked(iter, 0, SIX_LOCK_intent); ++ iter->locks_want = 1; ++ } else { ++ enum six_lock_type lock_want = __btree_lock_want(iter, 0); ++ ++ if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, ++ bkey_cached_check_fn, iter)) { ++ if (ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)) { ++ goto retry; ++ } ++ ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ ret = -EINTR; ++ goto err; ++ } ++ ++ if (ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)) { ++ six_unlock_type(&ck->c.lock, lock_want); ++ goto retry; ++ } ++ ++ mark_btree_node_locked(iter, 0, lock_want); ++ } ++ ++ iter->l[0].lock_seq = ck->c.lock.state.seq; ++ iter->l[0].b = (void *) ck; ++fill: ++ if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { ++ if (!btree_node_intent_locked(iter, 0)) ++ bch2_btree_iter_upgrade(iter, 1); ++ if (!btree_node_intent_locked(iter, 0)) { ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ ret = -EINTR; ++ goto err; ++ } ++ ++ ret = btree_key_cache_fill(trans, iter, ck); ++ if (ret) ++ goto err; ++ } ++ ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ bch2_btree_iter_downgrade(iter); ++ return ret; ++err: ++ if (ret != -EINTR) { ++ btree_node_unlock(iter, 0); ++ iter->flags |= BTREE_ITER_ERROR; ++ iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; ++ } ++ return ret; ++} ++ ++static int btree_key_cache_flush_pos(struct btree_trans *trans, ++ struct bkey_cached_key key, ++ u64 journal_seq, ++ bool evict) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct btree_iter *c_iter = NULL, *b_iter = NULL; ++ struct bkey_cached *ck; ++ int ret; ++ ++ b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(b_iter); ++ if (ret) ++ goto out; ++ ++ c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_CACHED_NOCREATE| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(c_iter); ++ if (ret) ++ goto out; ++retry: ++ ret = bch2_btree_iter_traverse(c_iter); ++ if (ret) ++ goto err; ++ ++ ck = (void *) c_iter->l[0].b; ++ if (!ck || ++ (journal_seq && ck->journal.seq != journal_seq)) ++ goto out; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ if (!evict) ++ goto out; ++ goto evict; ++ } ++ ++ ret = bch2_btree_iter_traverse(b_iter) ?: ++ bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_JOURNAL_RESERVED| ++ BTREE_INSERT_JOURNAL_RECLAIM); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ BUG_ON(ret && 
!bch2_journal_error(j)); ++ ++ if (ret) ++ goto out; ++ ++ bch2_journal_pin_drop(j, &ck->journal); ++ bch2_journal_preres_put(j, &ck->res); ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ ++ if (!evict) { ++ mutex_lock(&c->btree_key_cache.lock); ++ list_move_tail(&ck->list, &c->btree_key_cache.clean); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } else { ++evict: ++ BUG_ON(!btree_node_intent_locked(c_iter, 0)); ++ ++ mark_btree_node_unlocked(c_iter, 0); ++ c_iter->l[0].b = NULL; ++ ++ six_lock_write(&ck->c.lock, NULL, NULL); ++ ++ mutex_lock(&c->btree_key_cache.lock); ++ bkey_cached_evict(&c->btree_key_cache, ck); ++ bkey_cached_free(&c->btree_key_cache, ck); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } ++out: ++ bch2_trans_iter_put(trans, b_iter); ++ bch2_trans_iter_put(trans, c_iter); ++ return ret; ++} ++ ++static void btree_key_cache_journal_flush(struct journal *j, ++ struct journal_entry_pin *pin, ++ u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bkey_cached *ck = ++ container_of(pin, struct bkey_cached, journal); ++ struct bkey_cached_key key; ++ struct btree_trans trans; ++ ++ six_lock_read(&ck->c.lock, NULL, NULL); ++ key = ck->key; ++ ++ if (ck->journal.seq != seq || ++ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_read(&ck->c.lock); ++ return; ++ } ++ six_unlock_read(&ck->c.lock); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ btree_key_cache_flush_pos(&trans, key, seq, false); ++ bch2_trans_exit(&trans); ++} ++ ++/* ++ * Flush and evict a key from the key cache: ++ */ ++int bch2_btree_key_cache_flush(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached_key key = { id, pos }; ++ ++ /* Fastpath - assume it won't be found: */ ++ if (!btree_key_cache_find(c, id, pos)) ++ return 0; ++ ++ return btree_key_cache_flush_pos(trans, key, 0, true); ++} ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ ++ BUG_ON(insert->u64s > ck->u64s); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ int difference; ++ ++ BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); ++ ++ difference = jset_u64s(insert->u64s) - ck->res.u64s; ++ if (difference > 0) { ++ trans->journal_preres.u64s -= difference; ++ ck->res.u64s += difference; ++ } ++ } ++ ++ bkey_copy(ck->k, insert); ++ ck->valid = true; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ mutex_lock(&c->btree_key_cache.lock); ++ list_del_init(&ck->list); ++ ++ set_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } ++ ++ bch2_journal_pin_update(&c->journal, trans->journal_res.seq, ++ &ck->journal, btree_key_cache_journal_flush); ++ return true; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) ++{ ++ BUG_ON(btree_key_cache_find(trans->c, id, pos)); ++} ++#endif ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) ++{ ++ struct bkey_cached *ck, *n; ++ ++ mutex_lock(&c->lock); ++ list_for_each_entry_safe(ck, n, &c->clean, list) { ++ kfree(ck->k); ++ kfree(ck); ++ } ++ list_for_each_entry_safe(ck, n, &c->freed, list) ++ kfree(ck); ++ mutex_unlock(&c->lock); ++ ++ rhashtable_destroy(&c->table); ++} ++ ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) ++{ ++ mutex_init(&c->lock); ++ 
INIT_LIST_HEAD(&c->freed); ++ INIT_LIST_HEAD(&c->clean); ++} ++ ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) ++{ ++ return rhashtable_init(&c->table, &bch2_btree_key_cache_params); ++} +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +new file mode 100644 +index 000000000000..fbc29336091f +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.h +@@ -0,0 +1,23 @@ ++#ifndef _BCACHEFS_BTREE_KEY_CACHE_H ++#define _BCACHEFS_BTREE_KEY_CACHE_H ++ ++int bch2_btree_iter_traverse_cached(struct btree_iter *); ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *, ++ struct btree_iter *, struct bkey_i *); ++int bch2_btree_key_cache_flush(struct btree_trans *, ++ enum btree_id, struct bpos); ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_key_cache_verify_clean(struct btree_trans *, ++ enum btree_id, struct bpos); ++#else ++static inline void ++bch2_btree_key_cache_verify_clean(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) {} ++#endif ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *); ++ ++#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 9ca4032f49a6..ba47f51263f9 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -183,6 +183,7 @@ struct btree_node_iter { + enum btree_iter_type { + BTREE_ITER_KEYS, + BTREE_ITER_NODES, ++ BTREE_ITER_CACHED, + }; + + #define BTREE_ITER_TYPE ((1 << 2) - 1) +@@ -214,6 +215,15 @@ enum btree_iter_type { + #define BTREE_ITER_IS_EXTENTS (1 << 6) + #define BTREE_ITER_ERROR (1 << 7) + #define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) ++#define BTREE_ITER_CACHED_NOFILL (1 << 9) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 10) ++ ++#define BTREE_ITER_USER_FLAGS \ ++ (BTREE_ITER_SLOTS \ ++ |BTREE_ITER_INTENT \ ++ |BTREE_ITER_PREFETCH \ ++ |BTREE_ITER_CACHED_NOFILL \ ++ |BTREE_ITER_CACHED_NOCREATE) + + enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, +@@ -222,6 +232,14 @@ enum btree_iter_uptodate { + BTREE_ITER_NEED_TRAVERSE = 3, + }; + ++#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) ++#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) ++#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) ++#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) ++#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) ++#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) ++#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) ++ + /* + * @pos - iterator's current position + * @level - current btree depth +@@ -259,7 +277,8 @@ struct btree_iter { + unsigned long ip_allocated; + }; + +-static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) ++static inline enum btree_iter_type ++btree_iter_type(const struct btree_iter *iter) + { + return iter->flags & BTREE_ITER_TYPE; + } +@@ -269,6 +288,37 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter) + return iter->l + iter->level; + } + ++struct btree_key_cache { ++ struct mutex lock; ++ struct rhashtable table; ++ struct list_head freed; ++ struct list_head clean; ++}; ++ ++struct bkey_cached_key { ++ u32 btree_id; ++ struct bpos pos; ++} __packed; ++ ++#define BKEY_CACHED_DIRTY 0 ++ ++struct bkey_cached { ++ struct btree_bkey_cached_common c; ++ ++ unsigned long flags; ++ u8 u64s; ++ bool valid; ++ struct bkey_cached_key key; ++ ++ struct rhash_head hash; ++ struct list_head list; ++ ++ struct 
journal_preres res; ++ struct journal_entry_pin journal; ++ ++ struct bkey_i *k; ++}; ++ + struct btree_insert_entry { + unsigned trigger_flags; + unsigned trans_triggers_run:1; +@@ -307,6 +357,7 @@ struct btree_trans { + unsigned error:1; + unsigned nounlock:1; + unsigned need_reset:1; ++ unsigned in_traverse_all:1; + + unsigned mem_top; + unsigned mem_bytes; +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 11f7d02de622..e0b1bde37484 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -23,6 +23,7 @@ enum btree_insert_flags { + __BTREE_INSERT_USE_ALLOC_RESERVE, + __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_JOURNAL_RESERVED, ++ __BTREE_INSERT_JOURNAL_RECLAIM, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, + __BCH_HASH_SET_MUST_CREATE, +@@ -47,8 +48,12 @@ enum btree_insert_flags { + /* Insert is for journal replay - don't get journal reservations: */ + #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) + ++/* Indicates that we have pre-reserved space in the journal: */ + #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) + ++/* Insert is being called from journal reclaim path: */ ++#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) ++ + /* Don't block on allocation failure (for new btree nodes: */ + #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) + #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index a626a7698d13..9e6006d07585 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -529,11 +529,20 @@ static void btree_update_nodes_written(struct btree_update *as) + * to child nodes that weren't written yet: now, the child nodes have + * been written so we can write out the update to the interior node. + */ ++ ++ /* ++ * We can't call into journal reclaim here: we'd block on the journal ++ * reclaim lock, but we may need to release the open buckets we have ++ * pinned in order for other btree updates to make forward progress, and ++ * journal reclaim does btree updates when flushing bkey_cached entries, ++ * which may require allocations as well. 
++ */ + ret = bch2_trans_do(c, &as->disk_res, &journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_JOURNAL_RESERVED, + btree_update_nodes_written_trans(&trans, as)); + BUG_ON(ret && !bch2_journal_error(&c->journal)); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 1a1fd230e4b9..2e28f0c9cc42 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -6,6 +6,7 @@ + #include "btree_gc.h" + #include "btree_io.h" + #include "btree_iter.h" ++#include "btree_key_cache.h" + #include "btree_locking.h" + #include "buckets.h" + #include "debug.h" +@@ -32,6 +33,9 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, + { + bch2_btree_node_lock_write(b, iter); + ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) ++ return; ++ + if (unlikely(btree_node_just_written(b)) && + bch2_btree_post_write_cleanup(c, b)) + bch2_btree_iter_reinit_node(iter, b); +@@ -202,6 +206,8 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + return true; + } + ++/* Cached btree updates: */ ++ + /* Normal update interface: */ + + static inline void btree_insert_entry_checks(struct btree_trans *trans, +@@ -284,6 +290,31 @@ btree_key_can_insert(struct btree_trans *trans, + return BTREE_INSERT_OK; + } + ++static enum btree_insert_ret ++btree_key_can_insert_cached(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ unsigned *u64s) ++{ ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ unsigned new_u64s; ++ struct bkey_i *new_k; ++ ++ BUG_ON(iter->level); ++ ++ if (*u64s <= ck->u64s) ++ return BTREE_INSERT_OK; ++ ++ new_u64s = roundup_pow_of_two(*u64s); ++ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) ++ return -ENOMEM; ++ ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ return BTREE_INSERT_OK; ++} ++ + static inline void do_btree_insert_one(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) +@@ -297,7 +328,9 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + + insert->k.needs_whiteout = false; + +- did_work = btree_insert_key_leaf(trans, iter, insert); ++ did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) ++ ? btree_insert_key_leaf(trans, iter, insert) ++ : bch2_btree_insert_key_cached(trans, iter, insert); + if (!did_work) + return; + +@@ -335,10 +368,16 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + +- trans_for_each_update(trans, i) +- if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b))) ++ trans_for_each_update(trans, i) { ++ /* ++ * XXX: synchronization of cached update triggers with gc ++ */ ++ BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); ++ ++ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i->iter, i->k, NULL, + i->trigger_flags|BTREE_TRIGGER_GC); ++ } + } + + static inline int +@@ -371,7 +410,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + u64s = 0; + + u64s += i->k->k.u64s; +- ret = btree_key_can_insert(trans, i->iter, i->k, &u64s); ++ ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED ++ ? 
btree_key_can_insert(trans, i->iter, i->k, &u64s) ++ : btree_key_can_insert_cached(trans, i->iter, i->k, &u64s); + if (ret) { + *stopped_at = i; + return ret; +@@ -467,7 +508,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + ret = bch2_journal_preres_get(&trans->c->journal, + &trans->journal_preres, trans->journal_preres_u64s, +- JOURNAL_RES_GET_NONBLOCK); ++ JOURNAL_RES_GET_NONBLOCK| ++ ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) ++ ? JOURNAL_RES_GET_RECLAIM : 0)); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, + trans->journal_preres_u64s); +@@ -523,7 +566,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + trans->nounlock = true; + + trans_for_each_update2(trans, i) +- if (!same_leaf_as_prev(trans, i)) ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && ++ !same_leaf_as_prev(trans, i)) + bch2_foreground_maybe_merge(trans->c, i->iter, + 0, trans->flags); + +@@ -808,6 +852,14 @@ int __bch2_trans_commit(struct btree_trans *trans) + return ret; + } + ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && ++ !(i->trigger_flags & BTREE_TRIGGER_NORUN)) ++ bch2_btree_key_cache_verify_clean(trans, ++ i->iter->btree_id, i->iter->pos); ++#endif ++ + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: +@@ -880,7 +932,8 @@ int __bch2_trans_commit(struct btree_trans *trans) + BUG_ON(i->iter->locks_want < 1); + + u64s = jset_u64s(i->k->k.u64s); +- if (0) ++ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && ++ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) + trans->journal_preres_u64s += u64s; + trans->journal_u64s += u64s; + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 2da60ba3b7cb..86ce91ef76a9 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1816,6 +1816,13 @@ int bch2_trans_mark_update(struct btree_trans *trans, + if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) + return 0; + ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ ++ return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), ++ 0, 0, BTREE_TRIGGER_OVERWRITE); ++ } ++ + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { + struct bkey unpacked; + struct bkey_s_c k; +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 0cae90d6e053..357f42d31aaf 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -346,6 +346,37 @@ void __bch2_journal_pin_add(struct journal *j, u64 seq, + journal_wake(j); + } + ++void bch2_journal_pin_update(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (journal_pin_active(pin) && pin->seq < seq) ++ return; ++ ++ spin_lock(&j->lock); ++ ++ if (pin->seq != seq) { ++ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); ++ } else { ++ struct journal_entry_pin_list *pin_list = ++ journal_seq_pin(j, seq); ++ ++ /* ++ * If the pin is already pinning the right sequence number, it ++ * still might've already been flushed: ++ */ ++ list_move(&pin->list, &pin_list->list); ++ } ++ ++ spin_unlock(&j->lock); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); ++} ++ + void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, +diff --git a/fs/bcachefs/journal_reclaim.h 
b/fs/bcachefs/journal_reclaim.h +index 272ba8a37967..8128907a7623 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -42,6 +42,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq, + __bch2_journal_pin_add(j, seq, pin, flush_fn); + } + ++void bch2_journal_pin_update(struct journal *, u64, ++ struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ + void bch2_journal_pin_copy(struct journal *, + struct journal_entry_pin *, + struct journal_entry_pin *, +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 480c865f3e8e..872434eceb8b 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -13,6 +13,7 @@ + #include "bkey_sort.h" + #include "btree_cache.h" + #include "btree_gc.h" ++#include "btree_key_cache.h" + #include "btree_update_interior.h" + #include "btree_io.h" + #include "chardev.h" +@@ -472,6 +473,7 @@ static void bch2_fs_free(struct bch_fs *c) + bch2_fs_io_exit(c); + bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_iter_exit(c); ++ bch2_fs_btree_key_cache_exit(&c->btree_key_cache); + bch2_fs_btree_cache_exit(c); + bch2_fs_journal_exit(&c->journal); + bch2_io_clock_exit(&c->io_clock[WRITE]); +@@ -642,6 +644,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); + ++ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_allocator_background_init(c); + bch2_fs_allocator_foreground_init(c); + bch2_fs_rebalance_init(c); +@@ -737,6 +740,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_fs_journal_init(&c->journal) || + bch2_fs_replicas_init(c) || + bch2_fs_btree_cache_init(c) || ++ bch2_fs_btree_key_cache_init(&c->btree_key_cache) || + bch2_fs_btree_iter_init(c) || + bch2_fs_btree_interior_update_init(c) || + bch2_fs_io_init(c) || +-- +cgit v1.2.3 + + +From bdf229e272eae9dca517b86eabbbfce1bfde6c57 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 5 Oct 2019 12:54:53 -0400 +Subject: bcachefs: Use cached iterators for alloc btree + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 51 +++++------------- + fs/bcachefs/alloc_background.h | 1 - + fs/bcachefs/btree_update_leaf.c | 3 ++ + fs/bcachefs/buckets.c | 85 +++++++++++++++-------------- + fs/bcachefs/journal_reclaim.c | 3 ++ + fs/bcachefs/journal_types.h | 1 + + fs/bcachefs/recovery.c | 117 +++++++++++++++++++++++++++++++--------- + fs/bcachefs/super.c | 30 +++++++++-- + fs/bcachefs/sysfs.c | 8 --- + 9 files changed, 184 insertions(+), 115 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index bdebb0eccd9c..8b1207dd1282 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -4,6 +4,7 @@ + #include "alloc_foreground.h" + #include "btree_cache.h" + #include "btree_io.h" ++#include "btree_key_cache.h" + #include "btree_update.h" + #include "btree_update_interior.h" + #include "btree_gc.h" +@@ -276,6 +277,13 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + struct bkey_i_alloc *a; + int ret; + retry: ++ bch2_trans_begin(trans); ++ ++ ret = bch2_btree_key_cache_flush(trans, ++ BTREE_ID_ALLOC, iter->pos); ++ if (ret) ++ goto err; ++ + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) +@@ -330,7 +338,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) + + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); + +- bch2_trans_init(&trans, c, 0, 0); ++ 
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +@@ -364,25 +372,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) + return ret < 0 ? ret : 0; + } + +-int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) +-{ +- struct btree_trans trans; +- struct btree_iter *iter; +- int ret; +- +- bch2_trans_init(&trans, c, 0, 0); +- +- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- +- ret = bch2_alloc_write_key(&trans, iter, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_JOURNAL_REPLAY); +- bch2_trans_exit(&trans); +- return ret < 0 ? ret : 0; +-} +- + /* Bucket IO clocks: */ + + static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) +@@ -840,7 +829,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + struct bkey_alloc_unpacked u; + struct bucket *g; + struct bucket_mark m; +- struct bkey_s_c k; + bool invalidating_cached_data; + size_t b; + int ret = 0; +@@ -892,27 +880,14 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); + retry: +- k = bch2_btree_iter_peek_slot(iter); +- ret = bkey_err(k); ++ ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + percpu_down_read(&c->mark_lock); + g = bucket(ca, iter->pos.offset); + m = READ_ONCE(g->mark); +- +- if (unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) { +- /* +- * During journal replay, and if gc repairs alloc info at +- * runtime, the alloc info in the btree might not be up to date +- * yet - so, trust the in memory mark: +- */ +- u = alloc_mem_to_key(g, m); +- } else { +- u = bch2_alloc_unpack(k); +- u.read_time = g->io_time[READ]; +- u.write_time = g->io_time[WRITE]; +- } ++ u = alloc_mem_to_key(g, m); + + percpu_up_read(&c->mark_lock); + +@@ -1000,7 +975,9 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, + POS(ca->dev_idx, 0), +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); + + /* Only use nowait if we've already invalidated at least one bucket: */ + while (!ret && +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index b53a27450889..f6b9f27f0713 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -54,7 +54,6 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + struct journal_keys; + int bch2_alloc_read(struct bch_fs *, struct journal_keys *); +-int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *); + + static inline void bch2_wake_allocator(struct bch_dev *ca) + { +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 2e28f0c9cc42..e82d4df9ccab 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -178,6 +178,9 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + ++ EBUG_ON(!iter->level && ++ !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); ++ + if (unlikely(!bch2_btree_bset_insert_key(iter, b, + &iter_l(iter)->iter, insert))) + return false; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 86ce91ef76a9..085e0af30fc0 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ 
-1455,13 +1455,11 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + + /* trans_mark: */ + +-static int trans_get_key(struct btree_trans *trans, +- enum btree_id btree_id, struct bpos pos, +- struct btree_iter **iter, +- struct bkey_s_c *k) ++static struct btree_iter *trans_get_update(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ struct bkey_s_c *k) + { + struct btree_insert_entry *i; +- int ret; + + trans_for_each_update(trans, i) + if (i->iter->btree_id == btree_id && +@@ -1469,17 +1467,33 @@ static int trans_get_key(struct btree_trans *trans, + ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && + bkey_cmp(pos, i->k->k.p) < 0 + : !bkey_cmp(pos, i->iter->pos))) { +- *iter = i->iter; +- *k = bkey_i_to_s_c(i->k); +- return 1; ++ *k = bkey_i_to_s_c(i->k); ++ return i->iter; + } + ++ return NULL; ++} ++ ++static int trans_get_key(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ struct btree_iter **iter, ++ struct bkey_s_c *k) ++{ ++ unsigned flags = btree_id != BTREE_ID_ALLOC ++ ? BTREE_ITER_SLOTS ++ : BTREE_ITER_CACHED; ++ int ret; ++ ++ *iter = trans_get_update(trans, btree_id, pos, k); ++ if (*iter) ++ return 1; ++ + *iter = bch2_trans_get_iter(trans, btree_id, pos, +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ flags|BTREE_ITER_INTENT); + if (IS_ERR(*iter)) + return PTR_ERR(*iter); + +- *k = bch2_btree_iter_peek_slot(*iter); ++ *k = __bch2_btree_iter_peek(*iter, flags); + ret = bkey_err(*k); + if (ret) + bch2_trans_iter_put(trans, *iter); +@@ -1492,45 +1506,34 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)); + struct btree_iter *iter; + struct bkey_s_c k_a; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; + struct bucket *g; +- struct bucket_mark m; + int ret; + +- ret = trans_get_key(trans, BTREE_ID_ALLOC, +- POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), +- &iter, &k_a); +- if (ret < 0) +- return ret; +- +- percpu_down_read(&c->mark_lock); +- g = bucket(ca, iter->pos.offset); +- m = READ_ONCE(g->mark); +- +- if (unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags) && !ret)) { +- /* +- * During journal replay, and if gc repairs alloc info at +- * runtime, the alloc info in the btree might not be up to date +- * yet - so, trust the in memory mark - unless we're already +- * updating that key: +- */ +- u = alloc_mem_to_key(g, m); ++ iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a); ++ if (iter) { ++ u = bch2_alloc_unpack(k_a); + } else { +- u = bch2_alloc_unpack(k_a); +- u.read_time = g->io_time[READ]; +- u.write_time = g->io_time[WRITE]; +- } +- +- percpu_up_read(&c->mark_lock); ++ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto out; + +- /* +- * Incrementing the bucket gen can be done lazily: +- */ +- if (gen_after(m.gen, u.gen) && !u.data_type) +- u.gen = m.gen; ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, pos.offset); ++ u = alloc_mem_to_key(g, READ_ONCE(g->mark)); ++ percpu_up_read(&c->mark_lock); ++ } + + ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, + &u.dirty_sectors, &u.cached_sectors); +@@ -1543,7 +1546,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + goto out; + + bkey_alloc_init(&a->k_i); +- 
a->k.p = iter->pos; ++ a->k.p = pos; + bch2_alloc_pack(a, u); + bch2_trans_update(trans, iter, &a->k_i, 0); + out: +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 357f42d31aaf..4811ab9f879e 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -416,6 +416,9 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret = NULL; + ++ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) ++ return NULL; ++ + spin_lock(&j->lock); + + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 8eea12a03c06..154b51b891d3 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -125,6 +125,7 @@ union journal_preres_state { + enum { + JOURNAL_REPLAY_DONE, + JOURNAL_STARTED, ++ JOURNAL_RECLAIM_STARTED, + JOURNAL_NEED_WRITE, + JOURNAL_NOT_EMPTY, + JOURNAL_MAY_GET_UNRESERVED, +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 26e5767aa5de..41b864dcdc39 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -292,17 +292,6 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) + cmp_int(l->journal_offset, r->journal_offset); + } + +-static int journal_sort_seq_cmp(const void *_l, const void *_r) +-{ +- const struct journal_key *l = _l; +- const struct journal_key *r = _r; +- +- return cmp_int(r->level, l->level) ?: +- cmp_int(l->journal_seq, r->journal_seq) ?: +- cmp_int(l->btree_id, r->btree_id) ?: +- bkey_cmp(l->k->k.p, r->k->k.p); +-} +- + void bch2_journal_keys_free(struct journal_keys *keys) + { + kvfree(keys->d); +@@ -518,11 +507,48 @@ static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, + __bch2_journal_replay_key(&trans, id, level, k)); + } + ++static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter) ?: ++ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) ++{ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY, ++ __bch2_alloc_replay_key(&trans, k)); ++} ++ ++static int journal_sort_seq_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return cmp_int(r->level, l->level) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->btree_id, r->btree_id) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p); ++} ++ + static int bch2_journal_replay(struct bch_fs *c, + struct journal_keys keys) + { + struct journal *j = &c->journal; + struct journal_key *i; ++ u64 seq; + int ret; + + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); +@@ -530,26 +556,63 @@ static int bch2_journal_replay(struct bch_fs *c, + if (keys.nr) + replay_now_at(j, keys.journal_seq_base); + ++ seq = j->replay_journal_seq; ++ ++ /* ++ * First replay updates to the alloc btree - these will only update the ++ * btree key cache: ++ */ + for_each_journal_key(keys, i) { +- if (!i->level) +- replay_now_at(j, keys.journal_seq_base + i->journal_seq); ++ cond_resched(); + +- if (i->level) +- ret = 
bch2_journal_replay_key(c, i->btree_id, i->level, i->k); +- if (i->btree_id == BTREE_ID_ALLOC) ++ if (!i->level && i->btree_id == BTREE_ID_ALLOC) { ++ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; + ret = bch2_alloc_replay_key(c, i->k); +- else if (i->k->k.size) +- ret = bch2_extent_replay_key(c, i->btree_id, i->k); +- else +- ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ if (ret) ++ goto err; ++ } ++ } + +- if (ret) { +- bch_err(c, "journal replay: error %d while replaying key", +- ret); +- return ret; ++ /* ++ * Next replay updates to interior btree nodes: ++ */ ++ for_each_journal_key(keys, i) { ++ cond_resched(); ++ ++ if (i->level) { ++ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; ++ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ if (ret) ++ goto err; + } ++ } ++ ++ /* ++ * Now that the btree is in a consistent state, we can start journal ++ * reclaim (which will be flushing entries from the btree key cache back ++ * to the btree: ++ */ ++ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); ++ set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); ++ ++ j->replay_journal_seq = seq; + ++ /* ++ * Now replay leaf node updates: ++ */ ++ for_each_journal_key(keys, i) { + cond_resched(); ++ ++ if (i->level || i->btree_id == BTREE_ID_ALLOC) ++ continue; ++ ++ replay_now_at(j, keys.journal_seq_base + i->journal_seq); ++ ++ ret = i->k->k.size ++ ? bch2_extent_replay_key(c, i->btree_id, i->k) ++ : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ if (ret) ++ goto err; + } + + replay_now_at(j, j->replay_journal_seq_end); +@@ -558,6 +621,9 @@ static int bch2_journal_replay(struct bch_fs *c, + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); + return bch2_journal_error(j); ++err: ++ bch_err(c, "journal replay: error %d while replaying key", ret); ++ return ret; + } + + static bool journal_empty(struct list_head *journal) +@@ -1183,6 +1249,9 @@ int bch2_fs_initialize(struct bch_fs *c) + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + ++ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); ++ set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); ++ + err = "unable to allocate journal buckets"; + for_each_online_member(ca, c, i) { + ret = bch2_dev_journal_alloc(ca); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 872434eceb8b..4123727178e1 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1382,6 +1382,31 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + + /* Device add/removal: */ + ++int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ size_t i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < ca->mi.nbuckets; i++) { ++ ret = bch2_btree_key_cache_flush(&trans, ++ BTREE_ID_ALLOC, POS(ca->dev_idx, i)); ++ if (ret) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return ret; ++ ++ return bch2_btree_delete_range(c, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, 0), ++ POS(ca->dev_idx + 1, 0), ++ NULL); ++} ++ + int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + { + struct bch_sb_field_members *mi; +@@ -1415,10 +1440,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + goto err; + } + +- ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC, +- POS(ca->dev_idx, 0), +- POS(ca->dev_idx + 1, 0), +- NULL); ++ ret = bch2_dev_remove_alloc(c, ca); + if (ret) { + bch_err(ca, "Remove failed, error deleting alloc info"); + goto err; +diff --git 
a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 26b061381e23..15c5dc1dd46b 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -134,7 +134,6 @@ do { \ + write_attribute(trigger_journal_flush); + write_attribute(trigger_btree_coalesce); + write_attribute(trigger_gc); +-write_attribute(trigger_alloc_write); + write_attribute(prune_cache); + rw_attribute(btree_gc_periodic); + +@@ -498,12 +497,6 @@ STORE(bch2_fs) + #endif + } + +- if (attr == &sysfs_trigger_alloc_write) { +- bool wrote; +- +- bch2_alloc_write(c, 0, &wrote); +- } +- + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + +@@ -587,7 +580,6 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_trigger_journal_flush, + &sysfs_trigger_btree_coalesce, + &sysfs_trigger_gc, +- &sysfs_trigger_alloc_write, + &sysfs_prune_cache, + + &sysfs_copy_gc_enabled, +-- +cgit v1.2.3 + + +From 4ffc53fb592f8682c00379da5638f9a8a4ad826d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Jun 2020 16:59:36 -0400 +Subject: bcachefs: Give bkey_cached_key same attributes as bpos + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index ba47f51263f9..ea25b04b7517 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -298,7 +298,7 @@ struct btree_key_cache { + struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +-} __packed; ++} __attribute__((packed, aligned(4))); + + #define BKEY_CACHED_DIRTY 0 + +-- +cgit v1.2.3 + + +From 504b555ff8e1b030f00d37ba3af3b1043d3e1728 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Jun 2020 17:38:26 -0400 +Subject: bcachefs: Increase size of btree node reserve + +Also tweak the allocator to be more aggressive about keeping it full. +The recent changes to make updates to interior nodes transactional (and +thus generate updates to the alloc btree) all put more stress on the +btree node reserves. 
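
As a rough, self-contained sketch of the allocator tweak described above (the hunk below is the real change): the allocator thread now stops waiting either when free_inc needs refilling, or when anything is available at all and the btree-node reserve still has room. The struct and names here are simplified stand-ins, not the kernel's per-device FIFOs:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in state; the real code checks the free_inc and
 * free[RESERVE_BTREE] FIFOs on each bch_dev. */
struct dev_state {
        long long buckets_available;  /* buckets the allocator could still invalidate */
        long long free_inc_space;     /* slots left in the free_inc FIFO */
        bool      btree_reserve_full; /* is the btree-node freelist already full? */
};

/* New wake condition: proceed when free_inc has room to fill, or when any
 * bucket is available and the btree reserve is not yet full. */
static bool allocator_should_proceed(const struct dev_state *d)
{
        long long available = d->buckets_available > 0 ? d->buckets_available : 0;

        return available > d->free_inc_space ||
               (available && !d->btree_reserve_full);
}

int main(void)
{
        struct dev_state d = {
                .buckets_available  = 3,
                .free_inc_space     = 8,
                .btree_reserve_full = false,
        };

        printf("keep allocating: %d\n", allocator_should_proceed(&d));
        return 0;
}

The same patch also quadruples BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4), so this condition is what keeps the enlarged reserve topped up.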
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 9 ++++++--- + fs/bcachefs/bcachefs.h | 2 +- + 2 files changed, 7 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 8b1207dd1282..d9088be6923a 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -501,6 +501,7 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) + static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + { + unsigned long gc_count = c->gc_count; ++ u64 available; + int ret = 0; + + ca->allocator_state = ALLOCATOR_BLOCKED; +@@ -516,9 +517,11 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + if (gc_count != c->gc_count) + ca->inc_gen_really_needs_gc = 0; + +- if ((ssize_t) (dev_buckets_available(c, ca) - +- ca->inc_gen_really_needs_gc) >= +- (ssize_t) fifo_free(&ca->free_inc)) ++ available = max_t(s64, 0, dev_buckets_available(c, ca) - ++ ca->inc_gen_really_needs_gc); ++ ++ if (available > fifo_free(&ca->free_inc) || ++ (available && !fifo_full(&ca->free[RESERVE_BTREE]))) + break; + + up_read(&c->gc_lock); +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index b25bc1d6c659..27efe4c342ce 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -341,7 +341,7 @@ enum bch_time_stats { + #define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) + + /* Size of the freelist we allocate btree nodes from: */ +-#define BTREE_NODE_RESERVE BTREE_RESERVE_MAX ++#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) + + #define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) + +-- +cgit v1.2.3 + + +From 6725c5bddf2a9f1fdea1bfd68bb9033ba629ea71 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Jun 2020 17:59:09 -0400 +Subject: bcachefs: delete a slightly faulty assertion + +state lock isn't held at startup + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 085e0af30fc0..a9935cc3df2e 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1993,8 +1993,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + int ret = -ENOMEM; + unsigned i; + +- lockdep_assert_held(&c->state_lock); +- + memset(&free, 0, sizeof(free)); + memset(&free_inc, 0, sizeof(free_inc)); + memset(&alloc_heap, 0, sizeof(alloc_heap)); +-- +cgit v1.2.3 + + +From 7b475f6b4a0fdf106f16343b610b2f68aabe2e47 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Jun 2020 19:53:46 -0400 +Subject: bcachefs: Fix lock ordering with new btree cache code + +The code that checks lock ordering was recently changed to go off of the +pos of the btree node, rather than the iterator, but the btree cache +code didn't update to handle iterators that point to cached bkeys. Oops + +Also, update various debug code. 
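
Below is a minimal, compilable sketch of the idea behind this fix, using simplified stand-in types rather than the real bcachefs structs: the lock-ordering check compares positions, so it has to read the position from either a btree node or a cached key, depending on what the iterator level points at. In the patch itself this dispatch is the new btree_node_pos() helper, keyed on btree_iter_type():

#include <stdio.h>

/* Simplified stand-ins; not the real bcachefs structs or iterator layout. */
struct bpos { unsigned long long inode, offset; };

enum iter_type { ITER_BTREE_NODE, ITER_CACHED };

struct btree       { struct bpos max_key; }; /* real code: b->key.k.p   */
struct bkey_cached { struct bpos pos;     }; /* real code: ck->key.pos  */

/* Fetch the position to compare against, according to what the iterator
 * actually holds at this level: a btree node or a cached key. */
static struct bpos locked_node_pos(enum iter_type type, const void *node)
{
        return type == ITER_CACHED
                ? ((const struct bkey_cached *) node)->pos
                : ((const struct btree *) node)->max_key;
}

int main(void)
{
        struct btree b        = { .max_key = { 1, 100 } };
        struct bkey_cached ck = { .pos     = { 1,  42 } };

        struct bpos p1 = locked_node_pos(ITER_BTREE_NODE, &b);
        struct bpos p2 = locked_node_pos(ITER_CACHED, &ck);

        printf("node %llu:%llu, cached key %llu:%llu\n",
               p1.inode, p1.offset, p2.inode, p2.offset);
        return 0;
}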
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 94 +++++++++++++++++++++++++++++++++---------- + fs/bcachefs/btree_key_cache.c | 25 ++++++++++++ + fs/bcachefs/btree_key_cache.h | 2 + + fs/bcachefs/sysfs.c | 11 +++++ + 4 files changed, 110 insertions(+), 22 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index e620088d3116..e98a6480969e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -185,6 +185,14 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, + return iter->uptodate < BTREE_ITER_NEED_RELOCK; + } + ++static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, ++ enum btree_iter_type type) ++{ ++ return type != BTREE_ITER_CACHED ++ ? container_of(_b, struct btree, c)->key.k.p ++ : container_of(_b, struct bkey_cached, c)->key.pos; ++} ++ + /* Slowpath: */ + bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + unsigned level, struct btree_iter *iter, +@@ -253,7 +261,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + + if (iter->btree_id == linked->btree_id && + btree_node_locked(linked, level) && +- bkey_cmp(pos, linked->l[level].b->key.k.p) <= 0) ++ bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, ++ btree_iter_type(linked))) <= 0) + ret = false; + + /* +@@ -435,6 +444,22 @@ void bch2_trans_unlock(struct btree_trans *trans) + + #ifdef CONFIG_BCACHEFS_DEBUG + ++static void bch2_btree_iter_verify_cached(struct btree_iter *iter) ++{ ++ struct bkey_cached *ck; ++ bool locked = btree_node_locked(iter, 0); ++ ++ if (!bch2_btree_node_relock(iter, 0)) ++ return; ++ ++ ck = (void *) iter->l[0].b; ++ BUG_ON(ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)); ++ ++ if (!locked) ++ btree_node_unlock(iter, 0); ++} ++ + static void bch2_btree_iter_verify_level(struct btree_iter *iter, + unsigned level) + { +@@ -449,6 +474,12 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, + if (!debug_check_iterators(iter->trans->c)) + return; + ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ if (!level) ++ bch2_btree_iter_verify_cached(iter); ++ return; ++ } ++ + BUG_ON(iter->level < iter->min_depth); + + if (!btree_iter_node(iter, level)) +@@ -1257,13 +1288,14 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) + return ret; + } + +-static inline void bch2_btree_iter_checks(struct btree_iter *iter, +- enum btree_iter_type type) ++static inline void bch2_btree_iter_checks(struct btree_iter *iter) + { ++ enum btree_iter_type type = btree_iter_type(iter); ++ + EBUG_ON(iter->btree_id >= BTREE_ID_NR); +- EBUG_ON(btree_iter_type(iter) != type); + +- BUG_ON(type == BTREE_ITER_KEYS && ++ BUG_ON((type == BTREE_ITER_KEYS || ++ type == BTREE_ITER_CACHED) && + (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || + bkey_cmp(iter->pos, iter->k.p) > 0)); + +@@ -1278,7 +1310,8 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + struct btree *b; + int ret; + +- bch2_btree_iter_checks(iter, BTREE_ITER_NODES); ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); ++ bch2_btree_iter_checks(iter); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return iter->l[iter->level].b; +@@ -1306,7 +1339,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + struct btree *b; + int ret; + +- bch2_btree_iter_checks(iter, BTREE_ITER_NODES); ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); ++ bch2_btree_iter_checks(iter); + + /* already got to end? 
*/ + if (!btree_iter_node(iter, iter->level)) +@@ -1534,7 +1568,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + struct bkey_s_c k; + int ret; + +- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); + + if (iter->uptodate == BTREE_ITER_UPTODATE && + !bkey_deleted(&iter->k)) +@@ -1621,7 +1656,8 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + struct bkey_s_c k; + int ret; + +- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); + + while (1) { + ret = bch2_btree_iter_traverse(iter); +@@ -1681,7 +1717,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + struct bkey_s_c k; + int ret; + +- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); + + if (iter->uptodate == BTREE_ITER_UPTODATE && + !bkey_deleted(&iter->k)) +@@ -1717,7 +1754,8 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) + { + struct bpos pos = bkey_start_pos(&iter->k); + +- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); + + if (unlikely(!bkey_cmp(pos, POS_MIN))) + return bkey_s_c_null; +@@ -1798,7 +1836,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + struct bkey_s_c k; + int ret; + +- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); +@@ -1844,7 +1883,8 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) + struct bkey_cached *ck; + int ret; + +- bch2_btree_iter_checks(iter, BTREE_ITER_CACHED); ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); ++ bch2_btree_iter_checks(iter); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) +@@ -2323,6 +2363,15 @@ int bch2_trans_exit(struct btree_trans *trans) + return trans->error ? -EIO : 0; + } + ++static void bch2_btree_iter_node_to_text(struct printbuf *out, ++ struct btree_bkey_cached_common *_b, ++ enum btree_iter_type type) ++{ ++ pr_buf(out, " %px l=%u %s:", ++ _b, _b->level, bch2_btree_ids[_b->btree_id]); ++ bch2_bpos_to_text(out, btree_node_pos(_b, type)); ++} ++ + void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + { + #ifdef CONFIG_BCACHEFS_DEBUG +@@ -2347,11 +2396,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + if (btree_node_locked(iter, l)) { +- b = iter->l[l].b; +- +- pr_buf(out, " %px %s l=%u ", +- b, btree_node_intent_locked(iter, l) ? "i" : "r", l); +- bch2_bpos_to_text(out, b->key.k.p); ++ pr_buf(out, " %s l=%u ", ++ btree_node_intent_locked(iter, l) ? 
"i" : "r", l); ++ bch2_btree_iter_node_to_text(out, ++ (void *) iter->l[l].b, ++ btree_iter_type(iter)); + pr_buf(out, "\n"); + } + } +@@ -2365,10 +2414,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + bch2_btree_ids[trans->locking_btree_id]); + bch2_bpos_to_text(out, trans->locking_pos); + +- pr_buf(out, " node %px l=%u %s:", +- b, b->c.level, +- bch2_btree_ids[b->c.btree_id]); +- bch2_bpos_to_text(out, b->key.k.p); ++ ++ pr_buf(out, " node "); ++ bch2_btree_iter_node_to_text(out, ++ (void *) b, ++ btree_iter_type(&trans->iters[trans->locking_iter_idx])); + pr_buf(out, "\n"); + } + } +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 0b03262acd1e..61662750dfc0 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -1,5 +1,6 @@ + + #include "bcachefs.h" ++#include "btree_cache.h" + #include "btree_iter.h" + #include "btree_key_cache.h" + #include "btree_locking.h" +@@ -492,3 +493,27 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) + { + return rhashtable_init(&c->table, &bch2_btree_key_cache_params); + } ++ ++void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) ++{ ++ struct bucket_table *tbl; ++ struct bkey_cached *ck; ++ struct rhash_head *pos; ++ size_t i; ++ ++ mutex_lock(&c->lock); ++ tbl = rht_dereference_rcu(c->table.tbl, &c->table); ++ ++ for (i = 0; i < tbl->size; i++) { ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ pr_buf(out, "%s:", ++ bch2_btree_ids[ck->key.btree_id]); ++ bch2_bpos_to_text(out, ck->key.pos); ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) ++ pr_buf(out, " journal seq %llu", ck->journal.seq); ++ pr_buf(out, "\n"); ++ } ++ } ++ mutex_unlock(&c->lock); ++} +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index fbc29336091f..b1756c6c622c 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -20,4 +20,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); + void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); + int bch2_fs_btree_key_cache_init(struct btree_key_cache *); + ++void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); ++ + #endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 15c5dc1dd46b..b163064f0c5c 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -14,6 +14,7 @@ + #include "btree_cache.h" + #include "btree_io.h" + #include "btree_iter.h" ++#include "btree_key_cache.h" + #include "btree_update.h" + #include "btree_update_interior.h" + #include "btree_gc.h" +@@ -165,6 +166,7 @@ read_attribute(journal_debug); + read_attribute(journal_pins); + read_attribute(btree_updates); + read_attribute(dirty_btree_nodes); ++read_attribute(btree_key_cache); + read_attribute(btree_transactions); + + read_attribute(internal_uuid); +@@ -401,6 +403,14 @@ SHOW(bch2_fs) + + if (attr == &sysfs_dirty_btree_nodes) + return bch2_dirty_btree_nodes_print(c, buf); ++ ++ if (attr == &sysfs_btree_key_cache) { ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); ++ return out.pos - buf; ++ } ++ + if (attr == &sysfs_btree_transactions) { + struct printbuf out = _PBUF(buf, PAGE_SIZE); + +@@ -571,6 +581,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_journal_pins, + &sysfs_btree_updates, + &sysfs_dirty_btree_nodes, ++ &sysfs_btree_key_cache, + &sysfs_btree_transactions, + + &sysfs_read_realloc_races, +-- 
+cgit v1.2.3 + + +From aabff1b7e1cc2e5e3e59b5aac8e0f4a10a3ff9db Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Jun 2020 20:18:02 -0400 +Subject: bcachefs: Fix incorrect gfp check + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index b6a716cd4b6d..d3addd3a8964 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -309,7 +309,7 @@ restart: + if (freed >= nr) + goto out; + +- if (sc->gfp_mask & __GFP_IO) ++ if (sc->gfp_mask & __GFP_FS) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) + goto out; +-- +cgit v1.2.3 + + +From 3402a268fff164a81c273f7d0a7a638b58e58568 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 17 Jun 2020 17:30:38 -0400 +Subject: bcachefs: Fix a deadlock in the RO path + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 9 +++++++-- + fs/bcachefs/buckets.c | 5 ++++- + 2 files changed, 11 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 7293b8fedd27..8771ef1f07cc 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -930,7 +930,12 @@ int bch2_gc_gens(struct bch_fs *c) + unsigned i; + int ret; + +- down_read(&c->state_lock); ++ /* ++ * Ideally we would be using state_lock and not gc_lock here, but that ++ * introduces a deadlock in the RO path - we currently take the state ++ * lock at the start of going RO, thus the gc thread may get stuck: ++ */ ++ down_read(&c->gc_lock); + + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); +@@ -957,7 +962,7 @@ int bch2_gc_gens(struct bch_fs *c) + up_read(&ca->bucket_lock); + } + err: +- up_read(&c->state_lock); ++ up_read(&c->gc_lock); + return ret; + } + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index a9935cc3df2e..f75edb3c175a 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -2019,6 +2019,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + bch2_copygc_stop(ca); + + if (resize) { ++ down_write(&c->gc_lock); + down_write(&ca->bucket_lock); + percpu_down_write(&c->mark_lock); + } +@@ -2041,8 +2042,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + + swap(ca->buckets_nouse, buckets_nouse); + +- if (resize) ++ if (resize) { + percpu_up_write(&c->mark_lock); ++ up_write(&c->gc_lock); ++ } + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) { +-- +cgit v1.2.3 + + +From 9ea58d589940b89d77c17021858cfc5ef588dcc1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 17 Jun 2020 17:33:53 -0400 +Subject: bcachefs: Change bch2_dump_bset() to also print key values + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 41 ++++++++++++++++++++--------------------- + fs/bcachefs/bset.h | 4 ++-- + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/debug.c | 6 +++--- + 4 files changed, 26 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index a18b00a5ec90..651394a330a3 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -64,21 +64,27 @@ struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) + * by the time we actually do the insert will all be deleted. 
+ */ + +-void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) ++void bch2_dump_bset(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned set) + { + struct bkey_packed *_k, *_n; +- struct bkey k, n; +- char buf[120]; ++ struct bkey uk, n; ++ struct bkey_s_c k; ++ char buf[200]; + + if (!i->u64s) + return; + +- for (_k = i->start, k = bkey_unpack_key(b, _k); ++ for (_k = i->start; + _k < vstruct_last(i); +- _k = _n, k = n) { ++ _k = _n) { + _n = bkey_next_skip_noops(_k, vstruct_last(i)); + +- bch2_bkey_to_text(&PBUF(buf), &k); ++ k = bkey_disassemble(b, _k, &uk); ++ if (c) ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ else ++ bch2_bkey_to_text(&PBUF(buf), k.k); + printk(KERN_ERR "block %u key %5zu: %s\n", set, + _k->_data - i->_data, buf); + +@@ -87,31 +93,24 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) + + n = bkey_unpack_key(b, _n); + +- if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) { ++ if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { + printk(KERN_ERR "Key skipped backwards\n"); + continue; + } + +- /* +- * Weird check for duplicate non extent keys: extents are +- * deleted iff they have 0 size, so if it has zero size and it's +- * not deleted these aren't extents: +- */ +- if (((!k.size && !bkey_deleted(&k)) || +- (!n.size && !bkey_deleted(&n))) && +- !bkey_deleted(&k) && +- !bkey_cmp(n.p, k.p)) ++ if (!bkey_deleted(k.k) && ++ !bkey_cmp(n.p, k.k->p)) + printk(KERN_ERR "Duplicate keys\n"); + } + } + +-void bch2_dump_btree_node(struct btree *b) ++void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) + { + struct bset_tree *t; + + console_lock(); + for_each_bset(b, t) +- bch2_dump_bset(b, bset(b, t), t - b->set); ++ bch2_dump_bset(c, b, bset(b, t), t - b->set); + console_unlock(); + } + +@@ -170,7 +169,7 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, + struct bkey nu = bkey_unpack_key(b, n); + char buf1[80], buf2[80]; + +- bch2_dump_btree_node(b); ++ bch2_dump_btree_node(NULL, b); + bch2_bkey_to_text(&PBUF(buf1), &ku); + bch2_bkey_to_text(&PBUF(buf2), &nu); + printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", +@@ -248,7 +247,7 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + char buf1[100]; + char buf2[100]; + +- bch2_dump_btree_node(b); ++ bch2_dump_btree_node(NULL, b); + bch2_bkey_to_text(&PBUF(buf1), &k1); + bch2_bkey_to_text(&PBUF(buf2), &k2); + +@@ -269,7 +268,7 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + char buf1[100]; + char buf2[100]; + +- bch2_dump_btree_node(b); ++ bch2_dump_btree_node(NULL, b); + bch2_bkey_to_text(&PBUF(buf1), &k1); + bch2_bkey_to_text(&PBUF(buf2), &k2); + +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 7338ccbc8cbd..652ffed4adfb 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -600,8 +600,8 @@ void bch2_bfloat_to_text(struct printbuf *, struct btree *, + + /* Debug stuff */ + +-void bch2_dump_bset(struct btree *, struct bset *, unsigned); +-void bch2_dump_btree_node(struct btree *); ++void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); ++void bch2_dump_btree_node(struct bch_fs *, struct btree *); + void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); + + #ifdef CONFIG_BCACHEFS_DEBUG +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 5fc9137b822e..bb3aeccef67e 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -897,7 +897,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + 
bch2_bkey_to_text(&PBUF(buf1), &up); + bch2_bkey_to_text(&PBUF(buf2), u.k); + +- bch2_dump_bset(b, i, 0); ++ bch2_dump_bset(c, b, i, 0); + btree_err(BTREE_ERR_FATAL, c, b, i, + "keys out of order: %s > %s", + buf1, buf2); +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 4e0d14e37287..aa10591a3b1a 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -97,10 +97,10 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + console_lock(); + + printk(KERN_ERR "*** in memory:\n"); +- bch2_dump_bset(b, inmemory, 0); ++ bch2_dump_bset(c, b, inmemory, 0); + + printk(KERN_ERR "*** read back in:\n"); +- bch2_dump_bset(v, sorted, 0); ++ bch2_dump_bset(c, v, sorted, 0); + + while (offset < b->written) { + if (!offset ) { +@@ -117,7 +117,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + } + + printk(KERN_ERR "*** on disk block %u:\n", offset); +- bch2_dump_bset(b, i, offset); ++ bch2_dump_bset(c, b, i, offset); + + offset += sectors; + } +-- +cgit v1.2.3 + + +From cb66df4079376cb0d09c1b578ae1d402476e2094 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 17 Jun 2020 18:20:26 -0400 +Subject: bcachefs: Add a kthread_should_stop() check to allocator thread + +Turns out it's possible during shutdown for the allocator to get stuck +spinning on bch2_invalidate_buckets() without hitting any of the other +checks. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index d9088be6923a..22d690b5a242 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1104,6 +1104,8 @@ static int bch2_allocator_thread(void *arg) + + while (1) { + cond_resched(); ++ if (kthread_should_stop()) ++ break; + + pr_debug("discarding %zu invalidated buckets", + fifo_used(&ca->free_inc)); +-- +cgit v1.2.3 + + +From 6c5083e531d53f65328670b70d1b252ac774e60e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 18 Jun 2020 17:16:29 -0400 +Subject: bcachefs: Use btree reserve when appropriate + +Whenever we're doing an update that has pointers, that generally means +we need to do the update in order to release open bucket references - so +we should be using the btree open bucket reserve. 
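
A small, self-contained sketch of the policy stated above, with hypothetical btree ids and a stand-in predicate in place of btree_node_type_needs_gc(); it only illustrates the shape of the check the hunk below adds to bch2_btree_split_leaf():

#include <stdbool.h>
#include <stdio.h>

/* Illustrative ids and flag only; the real enums live in the bcachefs source. */
enum btree_id { ID_EXTENTS, ID_ALLOC, ID_DIRENTS, ID_NR };

#define INSERT_USE_RESERVE (1u << 0)

/* Stand-in for btree_node_type_needs_gc(): which btrees hold keys with
 * pointers, and therefore pin open buckets until their update completes. */
static bool update_releases_open_buckets(enum btree_id id)
{
        return id == ID_EXTENTS || id == ID_ALLOC;
}

/* Scan every queued update in the transaction, not just extent iterators. */
static unsigned split_leaf_flags(const enum btree_id *updates, int nr)
{
        unsigned flags = 0;

        for (int i = 0; i < nr; i++)
                if (update_releases_open_buckets(updates[i]))
                        flags |= INSERT_USE_RESERVE;

        return flags;
}

int main(void)
{
        enum btree_id updates[] = { ID_DIRENTS, ID_ALLOC };

        printf("flags = %#x\n", split_leaf_flags(updates, 2));
        return 0;
}

The real change walks trans_for_each_update() instead of the old trans_for_each_iter() test for BTREE_ID_EXTENTS, so any queued update that carries pointers switches the split to BTREE_INSERT_USE_RESERVE.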
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 9e6006d07585..a8cd6ffb6c7c 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1398,14 +1398,14 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + struct btree_update *as; + struct closure cl; + int ret = 0; +- struct btree_iter *linked; ++ struct btree_insert_entry *i; + + /* + * We already have a disk reservation and open buckets pinned; this + * allocation must not block: + */ +- trans_for_each_iter(trans, linked) +- if (linked->btree_id == BTREE_ID_EXTENTS) ++ trans_for_each_update(trans, i) ++ if (btree_node_type_needs_gc(i->iter->btree_id)) + flags |= BTREE_INSERT_USE_RESERVE; + + closure_init_stack(&cl); +-- +cgit v1.2.3 + + +From 137c1341af951025091c788bcc86afef4655f557 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 18 Jun 2020 21:06:42 -0400 +Subject: bcachefs: Track sectors of erasure coded data + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_ioctl.h | 4 ++++ + fs/bcachefs/buckets.c | 31 +++++++++++++++++++------------ + fs/bcachefs/buckets_types.h | 4 +++- + fs/bcachefs/chardev.c | 9 ++++++--- + fs/bcachefs/sysfs.c | 2 ++ + 5 files changed, 34 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +index ba8c75706bf1..d71157a3e073 100644 +--- a/fs/bcachefs/bcachefs_ioctl.h ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -275,9 +275,13 @@ struct bch_ioctl_dev_usage { + + __u32 bucket_size; + __u64 nr_buckets; ++ __u64 available_buckets; + + __u64 buckets[BCH_DATA_NR]; + __u64 sectors[BCH_DATA_NR]; ++ ++ __u64 ec_buckets; ++ __u64 ec_sectors; + }; + + /* +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index f75edb3c175a..4c9371991fa6 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -374,6 +374,11 @@ static inline int is_fragmented_bucket(struct bucket_mark m, + return 0; + } + ++static inline int bucket_stripe_sectors(struct bucket_mark m) ++{ ++ return m.stripe ? 
m.dirty_sectors : 0; ++} ++ + static inline enum bch_data_type bucket_type(struct bucket_mark m) + { + return m.cached_sectors && !m.dirty_sectors +@@ -441,33 +446,35 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bucket_mark old, struct bucket_mark new, + bool gc) + { +- struct bch_dev_usage *dev_usage; ++ struct bch_dev_usage *u; + + percpu_rwsem_assert_held(&c->mark_lock); + + preempt_disable(); +- dev_usage = this_cpu_ptr(ca->usage[gc]); ++ u = this_cpu_ptr(ca->usage[gc]); + + if (bucket_type(old)) +- account_bucket(fs_usage, dev_usage, bucket_type(old), ++ account_bucket(fs_usage, u, bucket_type(old), + -1, -ca->mi.bucket_size); + + if (bucket_type(new)) +- account_bucket(fs_usage, dev_usage, bucket_type(new), ++ account_bucket(fs_usage, u, bucket_type(new), + 1, ca->mi.bucket_size); + +- dev_usage->buckets_alloc += ++ u->buckets_alloc += + (int) new.owned_by_allocator - (int) old.owned_by_allocator; +- dev_usage->buckets_ec += +- (int) new.stripe - (int) old.stripe; +- dev_usage->buckets_unavailable += ++ u->buckets_unavailable += + is_unavailable_bucket(new) - is_unavailable_bucket(old); + +- dev_usage->sectors[old.data_type] -= old.dirty_sectors; +- dev_usage->sectors[new.data_type] += new.dirty_sectors; +- dev_usage->sectors[BCH_DATA_CACHED] += ++ u->buckets_ec += (int) new.stripe - (int) old.stripe; ++ u->sectors_ec += bucket_stripe_sectors(new) - ++ bucket_stripe_sectors(old); ++ ++ u->sectors[old.data_type] -= old.dirty_sectors; ++ u->sectors[new.data_type] += new.dirty_sectors; ++ u->sectors[BCH_DATA_CACHED] += + (int) new.cached_sectors - (int) old.cached_sectors; +- dev_usage->sectors_fragmented += ++ u->sectors_fragmented += + is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); + preempt_enable(); + +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 59e92a6d26be..53f22726893d 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -53,12 +53,14 @@ struct bucket_array { + struct bch_dev_usage { + u64 buckets[BCH_DATA_NR]; + u64 buckets_alloc; +- u64 buckets_ec; + u64 buckets_unavailable; + + /* _compressed_ sectors: */ + u64 sectors[BCH_DATA_NR]; + u64 sectors_fragmented; ++ ++ u64 buckets_ec; ++ u64 sectors_ec; + }; + + struct bch_fs_usage { +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index 5028d0dcc2d6..3af521947502 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -470,9 +470,12 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, + + src = bch2_dev_usage_read(c, ca); + +- arg.state = ca->mi.state; +- arg.bucket_size = ca->mi.bucket_size; +- arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; ++ arg.state = ca->mi.state; ++ arg.bucket_size = ca->mi.bucket_size; ++ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; ++ arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; ++ arg.ec_buckets = src.buckets_ec; ++ arg.ec_sectors = src.sectors_ec; + + for (i = 0; i < BCH_DATA_NR; i++) { + arg.buckets[i] = src.buckets[i]; +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index b163064f0c5c..c169d282a1f9 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -846,6 +846,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) + " meta: %llu\n" + " user: %llu\n" + " cached: %llu\n" ++ " erasure coded: %llu\n" + " fragmented: %llu\n" + " copygc threshold: %llu\n" + "freelist_wait: %s\n" +@@ -872,6 +873,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) + 
stats.sectors[BCH_DATA_BTREE], + stats.sectors[BCH_DATA_USER], + stats.sectors[BCH_DATA_CACHED], ++ stats.sectors_ec, + stats.sectors_fragmented, + ca->copygc_threshold, + c->freelist_wait.list.first ? "waiting" : "empty", +-- +cgit v1.2.3 + + +From 304031c9745c0829eea9b46ae8b0c2eb094128d0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 26 Jun 2020 13:56:21 -0400 +Subject: bcachefs: Fix a null ptr deref in bch2_btree_iter_traverse_one() + +We use sentinal values that aren't NULL to indicate there's a btree node +at a higher level; occasionally, this may result in +btree_iter_up_until_good_node() stopping at one of those sentinal +values. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index e98a6480969e..6fab76c3220c 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1235,7 +1235,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter) + * + * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary + */ +- if (btree_iter_node(iter, iter->level)) { ++ if (is_btree_node(iter, iter->level)) { + BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); + + btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); +-- +cgit v1.2.3 + + +From c1a3ea42e1ebab766ab44da3b569190d713feb2a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 28 Jun 2020 18:11:12 -0400 +Subject: bcachefs: Fix bch2_extent_can_insert() not being called + +It's supposed to check whether we're splitting a compressed extent and +if so get a bigger disk reservation - hence this fixes a "disk usage +increased by x without a reservaiton" bug. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 10 +++++++++ + fs/bcachefs/btree_update_leaf.c | 26 +++++++++++----------- + fs/bcachefs/buckets.c | 48 +++++++++++++++++++++++------------------ + 3 files changed, 49 insertions(+), 35 deletions(-) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index ea25b04b7517..16c4d058358b 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -568,6 +568,16 @@ static inline bool btree_node_is_extents(struct btree *b) + return btree_node_type_is_extents(btree_node_type(b)); + } + ++static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) ++{ ++ return __btree_node_type(iter->level, iter->btree_id); ++} ++ ++static inline bool btree_iter_is_extents(struct btree_iter *iter) ++{ ++ return btree_node_type_is_extents(btree_iter_key_type(iter)); ++} ++ + #define BTREE_NODE_TYPE_HAS_TRIGGERS \ + ((1U << BKEY_TYPE_EXTENTS)| \ + (1U << BKEY_TYPE_ALLOC)| \ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e82d4df9ccab..57c0311b184f 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -265,11 +265,10 @@ static enum btree_insert_ret + btree_key_can_insert(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, +- unsigned *u64s) ++ unsigned u64s) + { + struct bch_fs *c = trans->c; + struct btree *b = iter_l(iter)->b; +- static enum btree_insert_ret ret; + + if (unlikely(btree_node_fake(b))) + return BTREE_INSERT_BTREE_NODE_FULL; +@@ -281,13 +280,7 @@ btree_key_can_insert(struct btree_trans *trans, + if (unlikely(btree_node_old_extent_overwrite(b))) + return BTREE_INSERT_BTREE_NODE_FULL; + +- ret = !(iter->flags & BTREE_ITER_IS_EXTENTS) +- ? 
BTREE_INSERT_OK +- : bch2_extent_can_insert(trans, iter, insert); +- if (ret) +- return ret; +- +- if (*u64s > bch_btree_keys_u64s_remaining(c, b)) ++ if (unlikely(u64s > bch_btree_keys_u64s_remaining(c, b))) + return BTREE_INSERT_BTREE_NODE_FULL; + + return BTREE_INSERT_OK; +@@ -297,7 +290,7 @@ static enum btree_insert_ret + btree_key_can_insert_cached(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, +- unsigned *u64s) ++ unsigned u64s) + { + struct bkey_cached *ck = (void *) iter->l[0].b; + unsigned new_u64s; +@@ -305,10 +298,10 @@ btree_key_can_insert_cached(struct btree_trans *trans, + + BUG_ON(iter->level); + +- if (*u64s <= ck->u64s) ++ if (u64s <= ck->u64s) + return BTREE_INSERT_OK; + +- new_u64s = roundup_pow_of_two(*u64s); ++ new_u64s = roundup_pow_of_two(u64s); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) + return -ENOMEM; +@@ -414,8 +407,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + u64s += i->k->k.u64s; + ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED +- ? btree_key_can_insert(trans, i->iter, i->k, &u64s) +- : btree_key_can_insert_cached(trans, i->iter, i->k, &u64s); ++ ? btree_key_can_insert(trans, i->iter, i->k, u64s) ++ : btree_key_can_insert_cached(trans, i->iter, i->k, u64s); + if (ret) { + *stopped_at = i; + return ret; +@@ -733,6 +726,11 @@ static int extent_update_to_keys(struct btree_trans *trans, + struct bkey_i *insert) + { + struct btree_iter *iter; ++ int ret; ++ ++ ret = bch2_extent_can_insert(trans, orig_iter, insert); ++ if (ret) ++ return ret; + + if (bkey_deleted(&insert->k)) + return 0; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 4c9371991fa6..0ec194b93c71 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1367,8 +1367,8 @@ int bch2_mark_update(struct btree_trans *trans, + unsigned flags) + { + struct bch_fs *c = trans->c; +- struct btree *b = iter->l[0].b; +- struct btree_node_iter node_iter = iter->l[0].iter; ++ struct btree *b = iter_l(iter)->b; ++ struct btree_node_iter node_iter = iter_l(iter)->iter; + struct bkey_packed *_k; + int ret = 0; + +@@ -1430,32 +1430,38 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + disk_res_sectors); + + trans_for_each_update(trans, i) { +- struct btree_iter *iter = i->iter; +- struct btree *b = iter->l[0].b; +- struct btree_node_iter node_iter = iter->l[0].iter; +- struct bkey_packed *_k; +- + pr_err("while inserting"); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + pr_err("%s", buf); + pr_err("overlapping with"); + +- node_iter = iter->l[0].iter; +- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { +- struct bkey unpacked; +- struct bkey_s_c k; ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { ++ struct btree *b = iter_l(i->iter)->b; ++ struct btree_node_iter node_iter = iter_l(i->iter)->iter; ++ struct bkey_packed *_k; + +- k = bkey_disassemble(b, _k, &unpacked); ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k; + +- if (btree_node_is_extents(b) +- ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 +- : bkey_cmp(i->k->k.p, k.k->p)) +- break; ++ pr_info("_k %px format %u", _k, _k->format); ++ k = bkey_disassemble(b, _k, &unpacked); + +- bch2_bkey_val_to_text(&PBUF(buf), c, k); +- pr_err("%s", buf); ++ if (btree_node_is_extents(b) ++ ? 
bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 ++ : bkey_cmp(i->k->k.p, k.k->p)) ++ break; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ pr_err("%s", buf); + +- bch2_btree_node_iter_advance(&node_iter, b); ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ } else { ++ struct bkey_cached *ck = (void *) i->iter->l[0].b; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); ++ pr_err("%s", buf); + } + } + } +@@ -1807,8 +1813,8 @@ int bch2_trans_mark_update(struct btree_trans *trans, + struct bkey_i *insert, + unsigned flags) + { +- struct btree *b = iter->l[0].b; +- struct btree_node_iter node_iter = iter->l[0].iter; ++ struct btree *b = iter_l(iter)->b; ++ struct btree_node_iter node_iter = iter_l(iter)->iter; + struct bkey_packed *_k; + int ret; + +-- +cgit v1.2.3 + + +From a523ea8e7672538ff54b0f8492d2660a25c7c0c8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 29 Jun 2020 18:22:06 -0400 +Subject: bcachefs: Refactor dio write code to reinit bch_write_op + +This fixes a bug where the BCH_WRITE_SKIP_CLOSURE_PUT was set +incorrectly, causing the completion to be delivered multiple times. +oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 65 +++++++++++++++++++++++------------------------------ + fs/bcachefs/io.c | 6 ++--- + fs/bcachefs/io.h | 11 +++++---- + 3 files changed, 35 insertions(+), 47 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index e5088402c37d..a4974f999ea1 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -63,6 +63,7 @@ struct dio_write { + sync:1, + free_iov:1; + struct quota_res quota_res; ++ u64 written; + + struct iov_iter iter; + struct iovec inline_vecs[2]; +@@ -1820,18 +1821,19 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) + + /* O_DIRECT writes */ + ++static void bch2_dio_write_loop_async(struct bch_write_op *); ++ + static long bch2_dio_write_loop(struct dio_write *dio) + { + bool kthread = (current->flags & PF_KTHREAD) != 0; +- struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct address_space *mapping = req->ki_filp->f_mapping; + struct bch_inode_info *inode = file_bch_inode(req->ki_filp); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; + struct bio_vec *bv; + unsigned unaligned; +- u64 new_i_size; + bool sync = dio->sync; + long ret; + +@@ -1878,8 +1880,24 @@ static long bch2_dio_write_loop(struct dio_write *dio) + goto err; + } + +- dio->op.pos = POS(inode->v.i_ino, +- (req->ki_pos >> 9) + dio->op.written); ++ bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); ++ dio->op.end_io = bch2_dio_write_loop_async; ++ dio->op.target = dio->op.opts.foreground_target; ++ op_journal_seq_set(&dio->op, &inode->ei_journal_seq); ++ dio->op.write_point = writepoint_hashed((unsigned long) current); ++ dio->op.nr_replicas = dio->op.opts.data_replicas; ++ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); ++ ++ if ((req->ki_flags & IOCB_DSYNC) && ++ !c->opts.journal_flush_disabled) ++ dio->op.flags |= BCH_WRITE_FLUSH; ++ ++ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), ++ dio->op.opts.data_replicas, 0); ++ if (unlikely(ret) && ++ !bch2_check_range_allocated(c, dio->op.pos, ++ bio_sectors(bio), dio->op.opts.data_replicas)) ++ goto err; + + task_io_account_write(bio->bi_iter.bi_size); + +@@ -1911,13 +1929,12 @@ do_io: + loop: + i_sectors_acct(c, inode, &dio->quota_res, + dio->op.i_sectors_delta); +- dio->op.i_sectors_delta = 0; +- +- new_i_size = 
req->ki_pos + ((u64) dio->op.written << 9); ++ req->ki_pos += (u64) dio->op.written << 9; ++ dio->written += dio->op.written; + + spin_lock(&inode->v.i_lock); +- if (new_i_size > inode->v.i_size) +- i_size_write(&inode->v, new_i_size); ++ if (req->ki_pos > inode->v.i_size) ++ i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + + bio_for_each_segment_all(bv, bio, iter) +@@ -1929,10 +1946,9 @@ loop: + reinit_completion(&dio->done); + } + +- ret = dio->op.error ?: ((long) dio->op.written << 9); ++ ret = dio->op.error ?: ((long) dio->written << 9); + err: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); +- bch2_disk_reservation_put(c, &dio->op.res); + bch2_quota_reservation_put(c, inode, &dio->quota_res); + + if (dio->free_iov) +@@ -1967,7 +1983,6 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; +- struct bch_io_opts opts = io_opts(c, &inode->ei_inode); + struct dio_write *dio; + struct bio *bio; + bool locked = true, extending; +@@ -2015,35 +2030,14 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) + dio->sync = is_sync_kiocb(req) || extending; + dio->free_iov = false; + dio->quota_res.sectors = 0; ++ dio->written = 0; + dio->iter = *iter; + +- bch2_write_op_init(&dio->op, c, opts); +- dio->op.end_io = bch2_dio_write_loop_async; +- dio->op.target = opts.foreground_target; +- op_journal_seq_set(&dio->op, &inode->ei_journal_seq); +- dio->op.write_point = writepoint_hashed((unsigned long) current); +- dio->op.flags |= BCH_WRITE_NOPUT_RESERVATION; +- +- if ((req->ki_flags & IOCB_DSYNC) && +- !c->opts.journal_flush_disabled) +- dio->op.flags |= BCH_WRITE_FLUSH; +- + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + iter->count >> 9, true); + if (unlikely(ret)) + goto err_put_bio; + +- dio->op.nr_replicas = dio->op.opts.data_replicas; +- +- ret = bch2_disk_reservation_get(c, &dio->op.res, iter->count >> 9, +- dio->op.opts.data_replicas, 0); +- if (unlikely(ret) && +- !bch2_check_range_allocated(c, POS(inode->v.i_ino, +- req->ki_pos >> 9), +- iter->count >> 9, +- dio->op.opts.data_replicas)) +- goto err_put_bio; +- + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); +@@ -2054,12 +2048,9 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) + err: + if (locked) + inode_unlock(&inode->v); +- if (ret > 0) +- req->ki_pos += ret; + return ret; + err_put_bio: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); +- bch2_disk_reservation_put(c, &dio->op.res); + bch2_quota_reservation_put(c, inode, &dio->quota_res); + bio_put(bio); + inode_dio_end(&inode->v); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 0d95975780cd..92967280be38 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -493,8 +493,7 @@ static void bch2_write_done(struct closure *cl) + if (!op->error && (op->flags & BCH_WRITE_FLUSH)) + op->error = bch2_journal_error(&c->journal); + +- if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) +- bch2_disk_reservation_put(c, &op->res); ++ bch2_disk_reservation_put(c, &op->res); + percpu_ref_put(&c->writes); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + +@@ -1280,8 +1279,7 @@ void bch2_write(struct closure *cl) + continue_at_nobarrier(cl, __bch2_write, NULL); + return; + err: +- if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) +- bch2_disk_reservation_put(c, &op->res); ++ 
bch2_disk_reservation_put(c, &op->res); + + if (op->end_io) { + EBUG_ON(cl->parent); +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 8814a8fb260f..0ad293bd6295 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -30,14 +30,13 @@ enum bch_write_flags { + BCH_WRITE_PAGES_STABLE = (1 << 4), + BCH_WRITE_PAGES_OWNED = (1 << 5), + BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), +- BCH_WRITE_NOPUT_RESERVATION = (1 << 7), +- BCH_WRITE_WROTE_DATA_INLINE = (1 << 8), +- BCH_WRITE_FROM_INTERNAL = (1 << 9), ++ BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), ++ BCH_WRITE_FROM_INTERNAL = (1 << 8), + + /* Internal: */ +- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), +- BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), +- BCH_WRITE_DONE = (1 << 12), ++ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), ++ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), ++ BCH_WRITE_DONE = (1 << 11), + }; + + static inline u64 *op_journal_seq(struct bch_write_op *op) +-- +cgit v1.2.3 + + +From c13b13edb8937777bb8d1b6591ae1261afe97cd7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 30 Jun 2020 10:12:45 -0400 +Subject: bcachefs: Don't cap ios in dio write path at 2 MB + +It appears this was erronious, a different bug was responsible + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 10 ---------- + 1 file changed, 10 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index a4974f999ea1..b5bdb26db575 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1841,22 +1841,12 @@ static long bch2_dio_write_loop(struct dio_write *dio) + goto loop; + + while (1) { +- size_t extra = dio->iter.count - +- min(BIO_MAX_VECS * PAGE_SIZE, dio->iter.count); +- + if (kthread) + kthread_use_mm(dio->mm); + BUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; + +- /* +- * Don't issue more than 2MB at once, the bcachefs io path in +- * io.c can't bounce more than that: +- */ +- +- dio->iter.count -= extra; + ret = bio_iov_iter_get_pages(bio, &dio->iter); +- dio->iter.count += extra; + + current->faults_disabled_mapping = NULL; + if (kthread) +-- +cgit v1.2.3 + + +From 9bec244c5fa6ba54a8bb552ab90a4c396bb7e823 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 2 Jul 2020 13:43:58 -0400 +Subject: bcachefs: Use blk_status_to_str() + +Improved error messages are always a good thing + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 7 ++++--- + fs/bcachefs/ec.c | 4 +++- + fs/bcachefs/io.c | 6 ++++-- + fs/bcachefs/journal_io.c | 3 ++- + fs/bcachefs/super-io.c | 3 ++- + 5 files changed, 15 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index bb3aeccef67e..a133fa98a917 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1139,7 +1139,8 @@ static void btree_node_read_work(struct work_struct *work) + bio->bi_status = BLK_STS_REMOVED; + } + start: +- bch2_dev_io_err_on(bio->bi_status, ca, "btree read"); ++ bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", ++ blk_status_to_str(bio->bi_status)); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; +@@ -1423,8 +1424,8 @@ static void btree_node_write_endio(struct bio *bio) + if (wbio->have_ioref) + bch2_latency_acct(ca, wbio->submit_time, WRITE); + +- if (bio->bi_status == BLK_STS_REMOVED || +- bch2_dev_io_err_on(bio->bi_status, ca, "btree write") || ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", ++ blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("btree")) { + spin_lock_irqsave(&c->btree_write_error_lock, 
flags); + bch2_dev_list_add_dev(&orig->failed, wbio->dev); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 50a214d2b122..6a8f440526fd 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -360,7 +360,9 @@ static void ec_block_endio(struct bio *bio) + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding")) ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", ++ bio_data_dir(bio) ? "write" : "read", ++ blk_status_to_str(bio->bi_status))) + clear_bit(ec_bio->idx, ec_bio->buf->valid); + + bio_put(&ec_bio->bio); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 92967280be38..82ea3642b8c5 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -611,7 +611,8 @@ static void bch2_write_endio(struct bio *bio) + struct bch_fs *c = wbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", ++ blk_status_to_str(bio->bi_status))) + set_bit(wbio->dev, op->failed.d); + + if (wbio->have_ioref) { +@@ -1920,7 +1921,8 @@ static void bch2_read_endio(struct bio *bio) + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) { ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", ++ blk_status_to_str(bio->bi_status))) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + return; + } +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index b7625285b3ad..c298c2b7721d 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -961,7 +961,8 @@ static void journal_write_endio(struct bio *bio) + struct bch_dev *ca = bio->bi_private; + struct journal *j = &ca->fs->journal; + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", ++ blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("journal")) { + struct journal_buf *w = journal_prev_buf(j); + unsigned long flags; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index fc35ba6116e7..e4ea12fc0bfa 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -633,7 +633,8 @@ static void write_super_endio(struct bio *bio) + + /* XXX: return errors directly */ + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write")) ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", ++ blk_status_to_str(bio->bi_status))) + ca->sb_write_error = 1; + + closure_put(&ca->fs->sb_write); +-- +cgit v1.2.3 + + +From b0f5318effb93b2ad876a56b333e8f4e807c7dd4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 3 Jul 2020 16:32:00 -0400 +Subject: bcachefs: Mark btree nodes as needing rewrite when not all replicas + are RW + +This fixes a bug where recovery fails when one of the devices is read +only. + +Also - consolidate the "must rewrite this node to insert it" behind a +new btree node flag. 
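
A brief, self-contained sketch of the replica check this commit introduces; the state names are simplified stand-ins for the real BCH_MEMBER_STATE_* values, and the real code walks the node key's pointers rather than a plain array:

#include <stdbool.h>
#include <stdio.h>

/* Simplified member states; stand-ins for BCH_MEMBER_STATE_* in bcachefs. */
enum member_state { STATE_RW, STATE_RO, STATE_FAILED };

/* After reading a node, inspect the devices holding its replicas; if any of
 * them is not read-write, flag the node so it gets rewritten onto writable
 * devices. */
static bool node_needs_rewrite(const enum member_state *replica_state, int nr)
{
        for (int i = 0; i < nr; i++)
                if (replica_state[i] != STATE_RW)
                        return true;

        return false;
}

int main(void)
{
        enum member_state replicas[] = { STATE_RW, STATE_RO };

        printf("need_rewrite = %d\n", node_needs_rewrite(replicas, 2));
        return 0;
}

In the patch, the new need_rewrite flag also absorbs the older special cases in btree_key_can_insert(): fake nodes and old-style extent-overwrite nodes now report "node full" through the same flag.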
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 12 +++++++++++- + fs/bcachefs/btree_types.h | 2 ++ + fs/bcachefs/btree_update_interior.c | 5 ++++- + fs/bcachefs/btree_update_leaf.c | 13 ++----------- + 4 files changed, 19 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index a133fa98a917..d2c28eb75bde 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -917,6 +917,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + struct sort_iter *iter; + struct btree_node *sorted; + struct bkey_packed *k; ++ struct bch_extent_ptr *ptr; + struct bset *i; + bool used_mempool, blacklisted; + unsigned u64s; +@@ -971,8 +972,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + bset_encrypt(c, i, b->written << 9); + + if (btree_node_is_extents(b) && +- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { + set_btree_node_old_extent_overwrite(b); ++ set_btree_node_need_rewrite(b); ++ } + + sectors = vstruct_sectors(b->data, c->block_bits); + } else { +@@ -1098,6 +1101,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + set_needs_whiteout(btree_bset_first(b), true); + + btree_node_reset_sib_u64s(b); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_RW) ++ set_btree_node_need_rewrite(b); ++ } + out: + mempool_free(iter, &c->fill_iter); + return retry_read; +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 16c4d058358b..98611b1da1ed 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -409,6 +409,7 @@ enum btree_flags { + BTREE_NODE_dying, + BTREE_NODE_fake, + BTREE_NODE_old_extent_overwrite, ++ BTREE_NODE_need_rewrite, + }; + + BTREE_FLAG(read_in_flight); +@@ -423,6 +424,7 @@ BTREE_FLAG(just_written); + BTREE_FLAG(dying); + BTREE_FLAG(fake); + BTREE_FLAG(old_extent_overwrite); ++BTREE_FLAG(need_rewrite); + + static inline struct btree_write *btree_current_write(struct btree *b) + { +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index a8cd6ffb6c7c..b41916f93c9b 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -290,8 +290,10 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); + + if (btree_node_is_extents(b) && +- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { + set_btree_node_old_extent_overwrite(b); ++ set_btree_node_need_rewrite(b); ++ } + + bch2_btree_build_aux_trees(b); + +@@ -1943,6 +1945,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) + bch2_btree_cache_cannibalize_unlock(c); + + set_btree_node_fake(b); ++ set_btree_node_need_rewrite(b); + b->c.level = 0; + b->c.btree_id = id; + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 57c0311b184f..cf4105e83eda 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -270,17 +270,8 @@ btree_key_can_insert(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree *b = iter_l(iter)->b; + +- if (unlikely(btree_node_fake(b))) +- return BTREE_INSERT_BTREE_NODE_FULL; +- +- /* +- * old bch2_extent_sort_fix_overlapping() algorithm won't work with new +- * style 
extent updates: +- */ +- if (unlikely(btree_node_old_extent_overwrite(b))) +- return BTREE_INSERT_BTREE_NODE_FULL; +- +- if (unlikely(u64s > bch_btree_keys_u64s_remaining(c, b))) ++ if (unlikely(btree_node_need_rewrite(b)) || ++ unlikely(u64s > bch_btree_keys_u64s_remaining(c, b))) + return BTREE_INSERT_BTREE_NODE_FULL; + + return BTREE_INSERT_OK; +-- +cgit v1.2.3 + + +From 40b5ed77060e0de243c4717d1a4cfb9a83773503 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 6 Jul 2020 17:02:37 -0400 +Subject: bcachefs: Kill BTREE_TRIGGER_NOOVERWRITES + +This is prep work for reworking the triggers machinery - we have +triggers that need to know both the old and the new key. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 2 -- + fs/bcachefs/buckets.c | 8 +------- + fs/bcachefs/buckets.h | 3 --- + fs/bcachefs/recovery.c | 13 ++++++++++--- + 4 files changed, 11 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 98611b1da1ed..5b9b47700a15 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -595,7 +595,6 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) + + enum btree_trigger_flags { + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ +- __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */ + + __BTREE_TRIGGER_INSERT, + __BTREE_TRIGGER_OVERWRITE, +@@ -608,7 +607,6 @@ enum btree_trigger_flags { + }; + + #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) +-#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES) + + #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) + #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 0ec194b93c71..1babd919b90a 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1309,7 +1309,7 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, + return ret; + } + +-inline int bch2_mark_overwrite(struct btree_trans *trans, ++static int bch2_mark_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c old, + struct bkey_i *new, +@@ -1383,9 +1383,6 @@ int bch2_mark_update(struct btree_trans *trans, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|flags); + +- if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) +- return 0; +- + /* + * For non extents, we only mark the new key, not the key being + * overwritten - unless we're actually deleting: +@@ -1829,9 +1826,6 @@ int bch2_trans_mark_update(struct btree_trans *trans, + if (ret) + return ret; + +- if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) +- return 0; +- + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + struct bkey_cached *ck = (void *) iter->l[0].b; + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 97265fe90e96..d029bdbbf858 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -264,9 +264,6 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, + int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, + struct disk_reservation *, unsigned); + +-int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, +- struct bkey_s_c, struct bkey_i *, +- struct bch_fs_usage *, unsigned, bool); + int bch2_mark_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bch_fs_usage *, unsigned); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 41b864dcdc39..1695a609ecd9 100644 +--- a/fs/bcachefs/recovery.c ++++ 
b/fs/bcachefs/recovery.c +@@ -442,11 +442,18 @@ retry: + * regular keys + */ + __bch2_btree_iter_set_pos(split_iter, split->k.p, false); +- bch2_trans_update(&trans, split_iter, split, !remark +- ? BTREE_TRIGGER_NORUN +- : BTREE_TRIGGER_NOOVERWRITES); ++ bch2_trans_update(&trans, split_iter, split, ++ BTREE_TRIGGER_NORUN); + + bch2_btree_iter_set_pos(iter, split->k.p); ++ ++ if (remark) { ++ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), ++ 0, split->k.size, ++ BTREE_TRIGGER_INSERT); ++ if (ret) ++ goto err; ++ } + } while (bkey_cmp(iter->pos, k->k.p) < 0); + + if (remark) { +-- +cgit v1.2.3 + + +From 41270e7b002d36ff4616897a371040949815ce02 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 6 Jul 2020 19:16:25 -0400 +Subject: bcachefs: Rework triggers interface + +The trigger for stripe keys is shortly going to need both the old and +the new key passed to the trigger - this patch does that rework. + +For now, this just changes the in memory triggers, and this doesn't +change how extent triggers work. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 297 ++++++++++++++++++++++++++++---------------------- + fs/bcachefs/buckets.h | 4 +- + 2 files changed, 169 insertions(+), 132 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 1babd919b90a..371acedd7eba 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -713,7 +713,8 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + preempt_enable(); + } + +-static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, ++static int bch2_mark_alloc(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { +@@ -721,7 +722,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + struct bkey_alloc_unpacked u; + struct bch_dev *ca; + struct bucket *g; +- struct bucket_mark old, m; ++ struct bucket_mark old_m, m; ++ ++ /* We don't do anything for deletions - do we?: */ ++ if (new.k->type != KEY_TYPE_alloc) ++ return 0; + + /* + * alloc btree is read in by bch2_alloc_read, not gc: +@@ -730,15 +735,15 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) + return 0; + +- ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ca = bch_dev_bkey_exists(c, new.k->p.inode); + +- if (k.k->p.offset >= ca->mi.nbuckets) ++ if (new.k->p.offset >= ca->mi.nbuckets) + return 0; + +- g = __bucket(ca, k.k->p.offset, gc); +- u = bch2_alloc_unpack(k); ++ g = __bucket(ca, new.k->p.offset, gc); ++ u = bch2_alloc_unpack(new); + +- old = bucket_cmpxchg(g, m, ({ ++ old_m = bucket_cmpxchg(g, m, ({ + m.gen = u.gen; + m.data_type = u.data_type; + m.dirty_sectors = u.dirty_sectors; +@@ -751,7 +756,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + })); + + if (!(flags & BTREE_TRIGGER_ALLOC_READ)) +- bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); ++ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); + + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; +@@ -764,11 +769,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + */ + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && +- old.cached_sectors) { ++ old_m.cached_sectors) { + update_cached_sectors(c, fs_usage, ca->dev_idx, +- -old.cached_sectors); +- trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), +- old.cached_sectors); ++ -old_m.cached_sectors); ++ trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), ++ 
old_m.cached_sectors); + } + + return 0; +@@ -881,9 +886,9 @@ static void bucket_set_stripe(struct bch_fs *c, + const struct bch_stripe *v, + struct bch_fs_usage *fs_usage, + u64 journal_seq, +- unsigned flags) ++ unsigned flags, ++ bool enabled) + { +- bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE); + bool gc = flags & BTREE_TRIGGER_GC; + unsigned i; + +@@ -1103,12 +1108,14 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, + return 0; + } + +-static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, ++static int bch2_mark_extent(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, + unsigned offset, s64 sectors, + enum bch_data_type data_type, + struct bch_fs_usage *fs_usage, + unsigned journal_seq, unsigned flags) + { ++ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; +@@ -1177,72 +1184,88 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + +-static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, ++static int bch2_mark_stripe(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; +- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); +- size_t idx = s.k->p.offset; ++ size_t idx = new.k->p.offset; ++ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ++ ? bkey_s_c_to_stripe(old).v : NULL; ++ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ++ ? bkey_s_c_to_stripe(new).v : NULL; + struct stripe *m = genradix_ptr(&c->stripes[gc], idx); + unsigned i; + +- spin_lock(&c->ec_stripes_heap_lock); +- +- if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) { +- spin_unlock(&c->ec_stripes_heap_lock); ++ if (!m || (old_s && !m->alive)) { + bch_err_ratelimited(c, "error marking nonexistent stripe %zu", + idx); + return -1; + } + +- if (!(flags & BTREE_TRIGGER_OVERWRITE)) { +- m->sectors = le16_to_cpu(s.v->sectors); +- m->algorithm = s.v->algorithm; +- m->nr_blocks = s.v->nr_blocks; +- m->nr_redundant = s.v->nr_redundant; ++ if (!new_s) { ++ /* Deleting: */ ++ bucket_set_stripe(c, old_s, fs_usage, ++ journal_seq, flags, false); + +- bch2_bkey_to_replicas(&m->r.e, k); ++ if (!gc) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_del(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } + +- /* +- * XXX: account for stripes somehow here +- */ +-#if 0 +- update_replicas(c, fs_usage, &m->r.e, stripe_sectors); +-#endif ++ memset(m, 0, sizeof(*m)); ++ } else { ++ BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); ++ BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); ++ ++ if (!old_s) ++ bucket_set_stripe(c, new_s, fs_usage, ++ journal_seq, flags, true); ++ ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->algorithm = new_s->algorithm; ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ ++ bch2_bkey_to_replicas(&m->r.e, new); + + /* gc recalculates these fields: */ + if (!(flags & BTREE_TRIGGER_GC)) { +- for (i = 0; i < s.v->nr_blocks; i++) { ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) { + m->block_sectors[i] = +- stripe_blockcount_get(s.v, i); ++ stripe_blockcount_get(new_s, i); + m->blocks_nonempty += !!m->block_sectors[i]; + } + } + +- if (!gc) ++ if (!gc) { ++ spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, idx); ++ 
spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ + m->alive = true; +- } else { +- if (!gc) +- bch2_stripes_heap_del(c, m, idx); +- memset(m, 0, sizeof(*m)); + } + +- spin_unlock(&c->ec_stripes_heap_lock); +- +- bucket_set_stripe(c, s.v, fs_usage, 0, flags); + return 0; + } + + static int bch2_mark_key_locked(struct bch_fs *c, +- struct bkey_s_c k, ++ struct bkey_s_c old, ++ struct bkey_s_c new, + unsigned offset, s64 sectors, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { ++ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + int ret = 0; + ++ BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); ++ + preempt_disable(); + + if (!fs_usage || (flags & BTREE_TRIGGER_GC)) +@@ -1251,7 +1274,7 @@ static int bch2_mark_key_locked(struct bch_fs *c, + + switch (k.k->type) { + case KEY_TYPE_alloc: +- ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); ++ ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: +@@ -1259,16 +1282,16 @@ static int bch2_mark_key_locked(struct bch_fs *c, + ? c->opts.btree_node_size + : -c->opts.btree_node_size; + +- ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE, +- fs_usage, journal_seq, flags); ++ ret = bch2_mark_extent(c, old, new, offset, sectors, ++ BCH_DATA_BTREE, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: +- ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, +- fs_usage, journal_seq, flags); ++ ret = bch2_mark_extent(c, old, new, offset, sectors, ++ BCH_DATA_USER, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_stripe: +- ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); ++ ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_inode: + if (!(flags & BTREE_TRIGGER_OVERWRITE)) +@@ -1294,82 +1317,38 @@ static int bch2_mark_key_locked(struct bch_fs *c, + return ret; + } + +-int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, ++int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, + unsigned offset, s64 sectors, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { ++ struct bkey deleted; ++ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + int ret; + ++ bkey_init(&deleted); ++ + percpu_down_read(&c->mark_lock); +- ret = bch2_mark_key_locked(c, k, offset, sectors, +- fs_usage, journal_seq, flags); ++ ret = bch2_mark_key_locked(c, old, new, offset, sectors, ++ fs_usage, journal_seq, ++ BTREE_TRIGGER_INSERT|flags); + percpu_up_read(&c->mark_lock); + + return ret; + } + +-static int bch2_mark_overwrite(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_s_c old, +- struct bkey_i *new, +- struct bch_fs_usage *fs_usage, +- unsigned flags, +- bool is_extents) +-{ +- struct bch_fs *c = trans->c; +- unsigned offset = 0; +- s64 sectors = -((s64) old.k->size); +- +- flags |= BTREE_TRIGGER_OVERWRITE; +- +- if (is_extents +- ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 +- : bkey_cmp(new->k.p, old.k->p)) +- return 0; +- +- if (is_extents) { +- switch (bch2_extent_overlap(&new->k, old.k)) { +- case BCH_EXTENT_OVERLAP_ALL: +- offset = 0; +- sectors = -((s64) old.k->size); +- break; +- case BCH_EXTENT_OVERLAP_BACK: +- offset = bkey_start_offset(&new->k) - +- bkey_start_offset(old.k); +- sectors = bkey_start_offset(&new->k) - +- old.k->p.offset; +- break; +- case BCH_EXTENT_OVERLAP_FRONT: +- offset = 0; +- sectors = bkey_start_offset(old.k) - +- new->k.p.offset; +- break; +- case BCH_EXTENT_OVERLAP_MIDDLE: +- offset = bkey_start_offset(&new->k) - +- bkey_start_offset(old.k); +- sectors = -((s64) new->k.size); +- flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; +- break; +- } +- +- BUG_ON(sectors >= 0); +- } +- +- return bch2_mark_key_locked(c, old, offset, sectors, fs_usage, +- trans->journal_res.seq, flags) ?: 1; +-} +- + int bch2_mark_update(struct btree_trans *trans, + struct btree_iter *iter, +- struct bkey_i *insert, ++ struct bkey_i *new, + struct bch_fs_usage *fs_usage, + unsigned flags) + { + struct bch_fs *c = trans->c; + struct btree *b = iter_l(iter)->b; + struct btree_node_iter node_iter = iter_l(iter)->iter; +- struct bkey_packed *_k; ++ struct bkey_packed *_old; ++ struct bkey_s_c old; ++ struct bkey unpacked; + int ret = 0; + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) +@@ -1378,31 +1357,87 @@ int bch2_mark_update(struct btree_trans *trans, + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + +- bch2_mark_key_locked(c, bkey_i_to_s_c(insert), +- 0, insert->k.size, +- fs_usage, trans->journal_res.seq, +- BTREE_TRIGGER_INSERT|flags); ++ bkey_init(&unpacked); ++ old = (struct bkey_s_c) { &unpacked, NULL }; + +- /* +- * For non extents, we only mark the new key, not the key being +- * overwritten - unless we're actually deleting: +- */ +- if ((iter->btree_id == BTREE_ID_ALLOC || +- iter->btree_id == BTREE_ID_EC) && +- !bkey_deleted(&insert->k)) +- return 0; ++ if (!btree_node_type_is_extents(iter->btree_id)) { ++ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { ++ _old = bch2_btree_node_iter_peek(&node_iter, b); ++ if (_old) ++ old = bkey_disassemble(b, _old, &unpacked); ++ } else { ++ struct bkey_cached *ck = (void *) iter->l[0].b; + +- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { +- struct bkey unpacked; +- struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ++ if (ck->valid) ++ old = bkey_i_to_s_c(ck->k); ++ } + +- ret = bch2_mark_overwrite(trans, iter, k, insert, +- fs_usage, flags, +- btree_node_type_is_extents(iter->btree_id)); +- if (ret <= 0) +- break; ++ if (old.k->type == new->k.type) { ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + +- bch2_btree_node_iter_advance(&node_iter, b); ++ } else { ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|flags); ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ } else { ++ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), ++ 0, new->k.size, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|flags); ++ ++ while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { ++ unsigned offset = 0; ++ s64 sectors; ++ ++ old = bkey_disassemble(b, _old, &unpacked); ++ sectors = -((s64) old.k->size); ++ ++ flags |= 
BTREE_TRIGGER_OVERWRITE; ++ ++ if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) ++ return 0; ++ ++ switch (bch2_extent_overlap(&new->k, old.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ offset = 0; ++ sectors = -((s64) old.k->size); ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = bkey_start_offset(&new->k) - ++ old.k->p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_FRONT: ++ offset = 0; ++ sectors = bkey_start_offset(old.k) - ++ new->k.p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = -((s64) new->k.size); ++ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; ++ break; ++ } ++ ++ BUG_ON(sectors >= 0); ++ ++ ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), ++ offset, sectors, fs_usage, ++ trans->journal_res.seq, flags) ?: 1; ++ if (ret <= 0) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } + } + + return ret; +@@ -1457,8 +1492,10 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + } else { + struct bkey_cached *ck = (void *) i->iter->l[0].b; + +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); +- pr_err("%s", buf); ++ if (ck->valid) { ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); ++ pr_err("%s", buf); ++ } + } + } + } +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index d029bdbbf858..4bde58130d39 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -259,8 +259,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + +-int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, +- struct bch_fs_usage *, u64, unsigned); ++int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, ++ s64, struct bch_fs_usage *, u64, unsigned); + int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, + struct disk_reservation *, unsigned); + +-- +cgit v1.2.3 + + +From 36ab543a26ee48c93689c79c6496d929713b01f6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 6 Jul 2020 20:18:13 -0400 +Subject: bcachefs: Improve stripe triggers/heap code + +Soon we'll be able to modify existing stripes - replacing empty blocks +with new blocks and new p/q blocks. This patch updates the trigger code +to handle pointers changing in an existing stripe; also, it +significantly improves how the stripes heap works, which means we can +get rid of the stripe creation/deletion lock. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/btree_gc.c | 5 +- + fs/bcachefs/buckets.c | 103 ++++++++++++++++++++--------------------- + fs/bcachefs/ec.c | 122 +++++++++++++++++++++++++++++++------------------ + fs/bcachefs/ec.h | 2 + + fs/bcachefs/ec_types.h | 1 + + fs/bcachefs/super.c | 1 - + fs/bcachefs/sysfs.c | 9 ++++ + 8 files changed, 146 insertions(+), 98 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 27efe4c342ce..081c17f35b33 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -755,7 +755,6 @@ struct bch_fs { + + /* STRIPES: */ + GENRADIX(struct stripe) stripes[2]; +- struct mutex ec_stripe_create_lock; + + ec_stripes_heap ec_stripes_heap; + spinlock_t ec_stripes_heap_lock; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 8771ef1f07cc..dac1c3a3c527 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -617,8 +617,11 @@ static int bch2_gc_done(struct bch_fs *c, + copy_stripe_field(block_sectors[i], + "block_sectors[%u]", i); + +- if (dst->alive) ++ if (dst->alive) { ++ spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_insert(c, dst, dst_iter.pos); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } + + genradix_iter_advance(&dst_iter, &c->stripes[0]); + genradix_iter_advance(&src_iter, &c->stripes[1]); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 371acedd7eba..dbf2a3f1d904 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -883,51 +883,46 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, + } + + static void bucket_set_stripe(struct bch_fs *c, +- const struct bch_stripe *v, ++ const struct bch_extent_ptr *ptr, + struct bch_fs_usage *fs_usage, + u64 journal_seq, + unsigned flags, + bool enabled) + { + bool gc = flags & BTREE_TRIGGER_GC; +- unsigned i; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, gc); ++ struct bucket_mark new, old; + +- for (i = 0; i < v->nr_blocks; i++) { +- const struct bch_extent_ptr *ptr = v->ptrs + i; +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, gc); +- struct bucket_mark new, old; +- +- old = bucket_cmpxchg(g, new, ({ +- new.stripe = enabled; +- if (journal_seq) { +- new.journal_seq_valid = 1; +- new.journal_seq = journal_seq; +- } +- })); ++ old = bucket_cmpxchg(g, new, ({ ++ new.stripe = enabled; ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; ++ } ++ })); + +- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + +- /* +- * XXX write repair code for these, flag stripe as possibly bad +- */ +- if (old.gen != ptr->gen) +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "stripe with stale pointer"); ++ /* ++ * XXX write repair code for these, flag stripe as possibly bad ++ */ ++ if (old.gen != ptr->gen) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "stripe with stale pointer"); + #if 0 +- /* +- * We'd like to check for these, but these checks don't work +- * yet: +- */ +- if (old.stripe && enabled) +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "multiple stripes using same bucket"); +- +- if (!old.stripe && !enabled) +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "deleting stripe but bucket not marked as stripe bucket"); ++ /* ++ * We'd like to check for these, but these checks don't work ++ * yet: ++ */ ++ if (old.stripe && enabled) ++ 
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "multiple stripes using same bucket"); ++ ++ if (!old.stripe && !enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "deleting stripe but bucket not marked as stripe bucket"); + #endif +- } + } + + static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, +@@ -1069,8 +1064,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, + { + bool gc = flags & BTREE_TRIGGER_GC; + struct stripe *m; +- unsigned old, new; +- int blocks_nonempty_delta; ++ unsigned i, blocks_nonempty = 0; + + m = genradix_ptr(&c->stripes[gc], p.idx); + +@@ -1089,20 +1083,17 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, + *nr_parity = m->nr_redundant; + *r = m->r; + +- old = m->block_sectors[p.block]; + m->block_sectors[p.block] += sectors; +- new = m->block_sectors[p.block]; + +- blocks_nonempty_delta = (int) !!new - (int) !!old; +- if (blocks_nonempty_delta) { +- m->blocks_nonempty += blocks_nonempty_delta; ++ for (i = 0; i < m->nr_blocks; i++) ++ blocks_nonempty += m->block_sectors[i] != 0; + ++ if (m->blocks_nonempty != blocks_nonempty) { ++ m->blocks_nonempty = blocks_nonempty; + if (!gc) + bch2_stripes_heap_update(c, m, p.idx); + } + +- m->dirty = true; +- + spin_unlock(&c->ec_stripes_heap_lock); + + return 0; +@@ -1206,10 +1197,11 @@ static int bch2_mark_stripe(struct bch_fs *c, + + if (!new_s) { + /* Deleting: */ +- bucket_set_stripe(c, old_s, fs_usage, +- journal_seq, flags, false); ++ for (i = 0; i < old_s->nr_blocks; i++) ++ bucket_set_stripe(c, old_s->ptrs + i, fs_usage, ++ journal_seq, flags, false); + +- if (!gc) { ++ if (!gc && m->on_heap) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_del(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); +@@ -1220,10 +1212,21 @@ static int bch2_mark_stripe(struct bch_fs *c, + BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); + BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); + +- if (!old_s) +- bucket_set_stripe(c, new_s, fs_usage, +- journal_seq, flags, true); ++ for (i = 0; i < new_s->nr_blocks; i++) { ++ if (!old_s || ++ memcmp(new_s->ptrs + i, ++ old_s->ptrs + i, ++ sizeof(struct bch_extent_ptr))) { ++ ++ if (old_s) ++ bucket_set_stripe(c, old_s->ptrs + i, fs_usage, ++ journal_seq, flags, false); ++ bucket_set_stripe(c, new_s->ptrs + i, fs_usage, ++ journal_seq, flags, true); ++ } ++ } + ++ m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; +@@ -1247,8 +1250,6 @@ static int bch2_mark_stripe(struct bch_fs *c, + bch2_stripes_heap_update(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); + } +- +- m->alive = true; + } + + return 0; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 6a8f440526fd..5735ae9f5f83 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -607,39 +607,16 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) + BUG_ON(h->data[m->heap_idx].idx != idx); + } + +-void bch2_stripes_heap_update(struct bch_fs *c, +- struct stripe *m, size_t idx) +-{ +- ec_stripes_heap *h = &c->ec_stripes_heap; +- size_t i; +- +- if (m->alive) { +- heap_verify_backpointer(c, idx); +- +- h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; +- +- i = m->heap_idx; +- heap_sift_up(h, i, ec_stripes_heap_cmp, +- ec_stripes_heap_set_backpointer); +- heap_sift_down(h, i, ec_stripes_heap_cmp, +- ec_stripes_heap_set_backpointer); +- +- heap_verify_backpointer(c, idx); +- } else { +- bch2_stripes_heap_insert(c, m, idx); +- } +- +- if (stripe_idx_to_delete(c) >= 0 && 
+- !percpu_ref_is_dying(&c->writes)) +- schedule_work(&c->ec_stripe_delete_work); +-} +- + void bch2_stripes_heap_del(struct bch_fs *c, + struct stripe *m, size_t idx) + { ++ if (!m->on_heap) ++ return; ++ ++ m->on_heap = false; ++ + heap_verify_backpointer(c, idx); + +- m->alive = false; + heap_del(&c->ec_stripes_heap, m->heap_idx, + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); +@@ -648,19 +625,49 @@ void bch2_stripes_heap_del(struct bch_fs *c, + void bch2_stripes_heap_insert(struct bch_fs *c, + struct stripe *m, size_t idx) + { ++ if (m->on_heap) ++ return; ++ + BUG_ON(heap_full(&c->ec_stripes_heap)); + ++ m->on_heap = true; ++ + heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { + .idx = idx, + .blocks_nonempty = m->blocks_nonempty, + }), + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); +- m->alive = true; + + heap_verify_backpointer(c, idx); + } + ++void bch2_stripes_heap_update(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ size_t i; ++ ++ if (!m->on_heap) ++ return; ++ ++ heap_verify_backpointer(c, idx); ++ ++ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; ++ ++ i = m->heap_idx; ++ heap_sift_up(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ heap_sift_down(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++ ++ if (stripe_idx_to_delete(c) >= 0 && ++ !percpu_ref_is_dying(&c->writes)) ++ schedule_work(&c->ec_stripe_delete_work); ++} ++ + /* stripe deletion */ + + static int ec_stripe_delete(struct bch_fs *c, size_t idx) +@@ -677,23 +684,20 @@ static void ec_stripe_delete_work(struct work_struct *work) + container_of(work, struct bch_fs, ec_stripe_delete_work); + ssize_t idx; + +- down_read(&c->gc_lock); +- mutex_lock(&c->ec_stripe_create_lock); +- + while (1) { + spin_lock(&c->ec_stripes_heap_lock); + idx = stripe_idx_to_delete(c); +- spin_unlock(&c->ec_stripes_heap_lock); +- +- if (idx < 0) ++ if (idx < 0) { ++ spin_unlock(&c->ec_stripes_heap_lock); + break; ++ } ++ ++ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); ++ spin_unlock(&c->ec_stripes_heap_lock); + + if (ec_stripe_delete(c, idx)) + break; + } +- +- mutex_unlock(&c->ec_stripe_create_lock); +- up_read(&c->gc_lock); + } + + /* stripe creation: */ +@@ -846,6 +850,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) + struct bch_fs *c = s->c; + struct open_bucket *ob; + struct bkey_i *k; ++ struct stripe *m; + struct bch_stripe *v = &s->stripe.key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + struct closure cl; +@@ -882,12 +887,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) + goto err_put_writes; + } + +- mutex_lock(&c->ec_stripe_create_lock); +- + ret = ec_stripe_bkey_insert(c, &s->stripe.key); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); +- goto err_unlock; ++ goto err_put_writes; + } + + for_each_keylist_key(&s->keys, k) { +@@ -896,8 +899,11 @@ static void ec_stripe_create(struct ec_stripe_new *s) + break; + } + +-err_unlock: +- mutex_unlock(&c->ec_stripe_create_lock); ++ spin_lock(&c->ec_stripes_heap_lock); ++ m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); ++ BUG_ON(m->on_heap); ++ bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); + err_put_writes: + percpu_ref_put(&c->writes); + err: +@@ -1280,11 +1286,21 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, + { + int ret = 0; + +- if 
(k.k->type == KEY_TYPE_stripe) ++ if (k.k->type == KEY_TYPE_stripe) { ++ struct stripe *m; ++ + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: + bch2_mark_key(c, k, 0, 0, NULL, 0, + BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); ++ if (ret) ++ return ret; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ m = genradix_ptr(&c->stripes[0], k.k->p.offset); ++ bch2_stripes_heap_insert(c, m, k.k->p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } + + return ret; + } +@@ -1335,6 +1351,24 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) + return 0; + } + ++void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m; ++ size_t i; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ for (i = 0; i < min(h->used, 20UL); i++) { ++ m = genradix_ptr(&c->stripes[0], h->data[i].idx); ++ ++ pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, ++ h->data[i].blocks_nonempty, ++ m->nr_blocks - m->nr_redundant, ++ m->nr_redundant); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++} ++ + void bch2_fs_ec_exit(struct bch_fs *c) + { + struct ec_stripe_head *h; +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 4dfaac034886..36444cb14190 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -157,6 +157,8 @@ int bch2_stripes_write(struct bch_fs *, unsigned, bool *); + + int bch2_ec_mem_alloc(struct bch_fs *, bool); + ++void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); ++ + void bch2_fs_ec_exit(struct bch_fs *); + int bch2_fs_ec_init(struct bch_fs *); + +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +index 5c3f77c8aac7..e4d633fca5bf 100644 +--- a/fs/bcachefs/ec_types.h ++++ b/fs/bcachefs/ec_types.h +@@ -22,6 +22,7 @@ struct stripe { + + unsigned alive:1; + unsigned dirty:1; ++ unsigned on_heap:1; + u8 blocks_nonempty; + u16 block_sectors[EC_STRIPE_MAX]; + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 4123727178e1..41bbd81c4dcb 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -670,7 +670,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + INIT_LIST_HEAD(&c->ec_new_stripe_list); + mutex_init(&c->ec_new_stripe_lock); +- mutex_init(&c->ec_stripe_create_lock); + spin_lock_init(&c->ec_stripes_heap_lock); + + seqcount_init(&c->gc_pos_lock); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index c169d282a1f9..63b0c9715e9c 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -168,6 +168,7 @@ read_attribute(btree_updates); + read_attribute(dirty_btree_nodes); + read_attribute(btree_key_cache); + read_attribute(btree_transactions); ++read_attribute(stripes_heap); + + read_attribute(internal_uuid); + +@@ -418,6 +419,13 @@ SHOW(bch2_fs) + return out.pos - buf; + } + ++ if (attr == &sysfs_stripes_heap) { ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ bch2_stripes_heap_to_text(&out, c); ++ return out.pos - buf; ++ } ++ + if (attr == &sysfs_compression_stats) + return bch2_compression_stats(c, buf); + +@@ -583,6 +591,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_dirty_btree_nodes, + &sysfs_btree_key_cache, + &sysfs_btree_transactions, ++ &sysfs_stripes_heap, + + &sysfs_read_realloc_races, + &sysfs_extent_migrate_done, +-- +cgit v1.2.3 + + +From 51660038c4623b2e9e11b806381a532cf0e61983 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 6 Jul 2020 20:59:46 -0400 +Subject: bcachefs: Move stripe creation to workqueue + +This is mainly to solve a lock ordering issue, and also simplifies 
the +code a bit. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 2 +- + fs/bcachefs/bcachefs.h | 9 +++- + fs/bcachefs/ec.c | 93 ++++++++++++++++++++++++------------------ + fs/bcachefs/ec.h | 5 +-- + fs/bcachefs/super.c | 8 +++- + fs/bcachefs/sysfs.c | 26 ++++++------ + 6 files changed, 82 insertions(+), 61 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 979aba30bc9d..97f620019f1c 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -582,7 +582,7 @@ got_bucket: + nr_effective, have_cache, flags, ob); + atomic_inc(&h->s->pin); + out_put_head: +- bch2_ec_stripe_head_put(h); ++ bch2_ec_stripe_head_put(c, h); + } + + /* Sector allocator */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 081c17f35b33..271add3ade9b 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -760,8 +760,13 @@ struct bch_fs { + spinlock_t ec_stripes_heap_lock; + + /* ERASURE CODING */ +- struct list_head ec_new_stripe_list; +- struct mutex ec_new_stripe_lock; ++ struct list_head ec_stripe_head_list; ++ struct mutex ec_stripe_head_lock; ++ ++ struct list_head ec_stripe_new_list; ++ struct mutex ec_stripe_new_lock; ++ ++ struct work_struct ec_stripe_create_work; + u64 ec_stripe_hint; + + struct bio_set ec_bioset; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 5735ae9f5f83..e7d1caa8885e 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -861,7 +861,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) + closure_init_stack(&cl); + + if (s->err) { +- bch_err(c, "error creating stripe: error writing data buckets"); ++ if (s->err != -EROFS) ++ bch_err(c, "error creating stripe: error writing data buckets"); + goto err; + } + +@@ -916,30 +917,50 @@ err: + + bch2_keylist_free(&s->keys, s->inline_keys); + +- mutex_lock(&s->h->lock); +- list_del(&s->list); +- mutex_unlock(&s->h->lock); +- + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) + kvpfree(s->stripe.data[i], s->stripe.size << 9); + kfree(s); + } + +-static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) ++static void ec_stripe_create_work(struct work_struct *work) + { +- struct ec_stripe_new *s = h->s; ++ struct bch_fs *c = container_of(work, ++ struct bch_fs, ec_stripe_create_work); ++ struct ec_stripe_new *s, *n; ++restart: ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) ++ if (!atomic_read(&s->pin)) { ++ list_del(&s->list); ++ mutex_unlock(&c->ec_stripe_new_lock); ++ ec_stripe_create(s); ++ goto restart; ++ } ++ mutex_unlock(&c->ec_stripe_new_lock); ++} + +- list_add(&s->list, &h->stripes); +- h->s = NULL; ++static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) ++{ ++ BUG_ON(atomic_read(&s->pin) <= 0); + +- return s; ++ if (atomic_dec_and_test(&s->pin)) { ++ BUG_ON(!s->pending); ++ queue_work(system_long_wq, &c->ec_stripe_create_work); ++ } + } + +-static void ec_stripe_new_put(struct ec_stripe_new *s) ++static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) + { +- BUG_ON(atomic_read(&s->pin) <= 0); +- if (atomic_dec_and_test(&s->pin)) +- ec_stripe_create(s); ++ struct ec_stripe_new *s = h->s; ++ ++ h->s = NULL; ++ s->pending = true; ++ ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_add(&s->list, &c->ec_stripe_new_list); ++ mutex_unlock(&c->ec_stripe_new_lock); ++ ++ ec_stripe_new_put(c, s); + } + + /* have a full bucket - hand it off to be erasure coded: */ +@@ -950,7 +971,7 @@ void 
bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) + if (ob->sectors_free) + s->err = -1; + +- ec_stripe_new_put(s); ++ ec_stripe_new_put(c, s); + } + + void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +@@ -1106,7 +1127,6 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + + mutex_init(&h->lock); + mutex_lock(&h->lock); +- INIT_LIST_HEAD(&h->stripes); + + h->target = target; + h->algo = algo; +@@ -1126,23 +1146,18 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + h->nr_active_devs++; + + rcu_read_unlock(); +- list_add(&h->list, &c->ec_new_stripe_list); ++ list_add(&h->list, &c->ec_stripe_head_list); + return h; + } + +-void bch2_ec_stripe_head_put(struct ec_stripe_head *h) ++void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) + { +- struct ec_stripe_new *s = NULL; +- + if (h->s && + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr) == h->s->blocks.nr) +- s = ec_stripe_set_pending(h); ++ ec_stripe_set_pending(c, h); + + mutex_unlock(&h->lock); +- +- if (s) +- ec_stripe_new_put(s); + } + + struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, +@@ -1155,8 +1170,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + if (!redundancy) + return NULL; + +- mutex_lock(&c->ec_new_stripe_lock); +- list_for_each_entry(h, &c->ec_new_stripe_list, list) ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) + if (h->target == target && + h->algo == algo && + h->redundancy == redundancy) { +@@ -1166,7 +1181,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + + h = ec_new_stripe_head_alloc(c, target, algo, redundancy); + found: +- mutex_unlock(&c->ec_new_stripe_lock); ++ mutex_unlock(&c->ec_stripe_head_lock); + return h; + } + +@@ -1176,9 +1191,8 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) + struct open_bucket *ob; + unsigned i; + +- mutex_lock(&c->ec_new_stripe_lock); +- list_for_each_entry(h, &c->ec_new_stripe_list, list) { +- struct ec_stripe_new *s = NULL; ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { + + mutex_lock(&h->lock); + bch2_open_buckets_stop_dev(c, ca, &h->blocks); +@@ -1195,15 +1209,12 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) + goto found; + goto unlock; + found: +- h->s->err = -1; +- s = ec_stripe_set_pending(h); ++ h->s->err = -EROFS; ++ ec_stripe_set_pending(c, h); + unlock: + mutex_unlock(&h->lock); +- +- if (s) +- ec_stripe_new_put(s); + } +- mutex_unlock(&c->ec_new_stripe_lock); ++ mutex_unlock(&c->ec_stripe_head_lock); + } + + static int __bch2_stripe_write_key(struct btree_trans *trans, +@@ -1374,20 +1385,21 @@ void bch2_fs_ec_exit(struct bch_fs *c) + struct ec_stripe_head *h; + + while (1) { +- mutex_lock(&c->ec_new_stripe_lock); +- h = list_first_entry_or_null(&c->ec_new_stripe_list, ++ mutex_lock(&c->ec_stripe_head_lock); ++ h = list_first_entry_or_null(&c->ec_stripe_head_list, + struct ec_stripe_head, list); + if (h) + list_del(&h->list); +- mutex_unlock(&c->ec_new_stripe_lock); ++ mutex_unlock(&c->ec_stripe_head_lock); + if (!h) + break; + + BUG_ON(h->s); +- BUG_ON(!list_empty(&h->stripes)); + kfree(h); + } + ++ BUG_ON(!list_empty(&c->ec_stripe_new_list)); ++ + free_heap(&c->ec_stripes_heap); + genradix_free(&c->stripes[0]); + bioset_exit(&c->ec_bioset); +@@ -1395,6 +1407,7 @@ void bch2_fs_ec_exit(struct bch_fs *c) + + int bch2_fs_ec_init(struct bch_fs *c) + { ++ INIT_WORK(&c->ec_stripe_create_work, 
ec_stripe_create_work); + INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); + + return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 36444cb14190..6f9354f82656 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -92,6 +92,7 @@ struct ec_stripe_new { + atomic_t pin; + + int err; ++ bool pending; + + unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; + +@@ -108,8 +109,6 @@ struct ec_stripe_head { + struct list_head list; + struct mutex lock; + +- struct list_head stripes; +- + unsigned target; + unsigned algo; + unsigned redundancy; +@@ -139,7 +138,7 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); + + int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); + +-void bch2_ec_stripe_head_put(struct ec_stripe_head *); ++void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); + struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, + unsigned, unsigned); + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 41bbd81c4dcb..b7b4c5c0a2fa 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -668,8 +668,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + INIT_LIST_HEAD(&c->fsck_errors); + mutex_init(&c->fsck_error_lock); + +- INIT_LIST_HEAD(&c->ec_new_stripe_list); +- mutex_init(&c->ec_new_stripe_lock); ++ INIT_LIST_HEAD(&c->ec_stripe_head_list); ++ mutex_init(&c->ec_stripe_head_lock); ++ ++ INIT_LIST_HEAD(&c->ec_stripe_new_list); ++ mutex_init(&c->ec_stripe_new_lock); ++ + spin_lock_init(&c->ec_stripes_heap_lock); + + seqcount_init(&c->gc_pos_lock); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 63b0c9715e9c..9a4b93433170 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -320,8 +320,8 @@ static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) + struct ec_stripe_head *h; + struct ec_stripe_new *s; + +- mutex_lock(&c->ec_new_stripe_lock); +- list_for_each_entry(h, &c->ec_new_stripe_list, list) { ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { + out += scnprintf(out, end - out, + "target %u algo %u redundancy %u:\n", + h->target, h->algo, h->redundancy); +@@ -332,19 +332,19 @@ static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) + h->s->blocks.nr, + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr)); ++ } ++ mutex_unlock(&c->ec_stripe_head_lock); + +- mutex_lock(&h->lock); +- list_for_each_entry(s, &h->stripes, list) +- out += scnprintf(out, end - out, +- "\tin flight: blocks %u allocated %u pin %u\n", +- s->blocks.nr, +- bitmap_weight(s->blocks_allocated, +- s->blocks.nr), +- atomic_read(&s->pin)); +- mutex_unlock(&h->lock); +- ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry(h, &c->ec_stripe_new_list, list) { ++ out += scnprintf(out, end - out, ++ "\tin flight: blocks %u allocated %u pin %u\n", ++ s->blocks.nr, ++ bitmap_weight(s->blocks_allocated, ++ s->blocks.nr), ++ atomic_read(&s->pin)); + } +- mutex_unlock(&c->ec_new_stripe_lock); ++ mutex_unlock(&c->ec_stripe_new_lock); + + return out - buf; + } +-- +cgit v1.2.3 + + +From 395582f745f605b3096d0b10c273ac07eef5b63d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 6 Jul 2020 22:33:54 -0400 +Subject: bcachefs: Refactor stripe creation + +Prep work for the patch to update existing stripes with new data blocks. 
+This moves allocating new stripes into ec.c, and also sets up the data +structures so that we can handly only allocating some of the blocks in a +stripe. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 103 +++------------------ + fs/bcachefs/alloc_foreground.h | 5 + + fs/bcachefs/ec.c | 205 +++++++++++++++++++++++++++++++---------- + fs/bcachefs/ec.h | 6 +- + 4 files changed, 180 insertions(+), 139 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 97f620019f1c..d08820effc8f 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -344,10 +344,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, + struct bch_devs_mask *devs) + { + struct dev_alloc_list ret = { .nr = 0 }; +- struct bch_dev *ca; + unsigned i; + +- for_each_member_device_rcu(ca, c, i, devs) ++ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) + ret.devs[ret.nr++] = i; + + bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); +@@ -396,16 +395,16 @@ static void add_new_bucket(struct bch_fs *c, + ob_push(c, ptrs, ob); + } + +-static int bch2_bucket_alloc_set(struct bch_fs *c, +- struct open_buckets *ptrs, +- struct dev_stripe_state *stripe, +- struct bch_devs_mask *devs_may_alloc, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, +- enum alloc_reserve reserve, +- unsigned flags, +- struct closure *cl) ++int bch2_bucket_alloc_set(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) + { + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); +@@ -455,74 +454,6 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, + + /* Allocate from stripes: */ + +-/* +- * XXX: use a higher watermark for allocating open buckets here: +- */ +-static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +-{ +- struct bch_devs_mask devs; +- struct open_bucket *ob; +- unsigned i, nr_have = 0, nr_data = +- min_t(unsigned, h->nr_active_devs, +- EC_STRIPE_MAX) - h->redundancy; +- bool have_cache = true; +- int ret = 0; +- +- BUG_ON(h->blocks.nr > nr_data); +- BUG_ON(h->parity.nr > h->redundancy); +- +- devs = h->devs; +- +- open_bucket_for_each(c, &h->parity, ob, i) +- __clear_bit(ob->ptr.dev, devs.d); +- open_bucket_for_each(c, &h->blocks, ob, i) +- __clear_bit(ob->ptr.dev, devs.d); +- +- percpu_down_read(&c->mark_lock); +- rcu_read_lock(); +- +- if (h->parity.nr < h->redundancy) { +- nr_have = h->parity.nr; +- +- ret = bch2_bucket_alloc_set(c, &h->parity, +- &h->parity_stripe, +- &devs, +- h->redundancy, +- &nr_have, +- &have_cache, +- RESERVE_NONE, +- 0, +- NULL); +- if (ret) +- goto err; +- } +- +- if (h->blocks.nr < nr_data) { +- nr_have = h->blocks.nr; +- +- ret = bch2_bucket_alloc_set(c, &h->blocks, +- &h->block_stripe, +- &devs, +- nr_data, +- &nr_have, +- &have_cache, +- RESERVE_NONE, +- 0, +- NULL); +- if (ret) +- goto err; +- } +- +- rcu_read_unlock(); +- percpu_up_read(&c->mark_lock); +- +- return bch2_ec_stripe_new_alloc(c, h); +-err: +- rcu_read_unlock(); +- percpu_up_read(&c->mark_lock); +- return -1; +-} +- + /* + * if we can't allocate a new stripe because there are already too many + * partially filled stripes, force allocating from an existing stripe even when +@@ -555,27 +486,23 @@ static void bucket_alloc_from_stripe(struct bch_fs *c, + if 
(ec_open_bucket(c, ptrs)) + return; + +- h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1); ++ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); + if (!h) + return; + +- if (!h->s && ec_stripe_alloc(c, h)) +- goto out_put_head; +- +- rcu_read_lock(); + devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); +- rcu_read_unlock(); + + for (i = 0; i < devs_sorted.nr; i++) + open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) + if (ob->ptr.dev == devs_sorted.devs[i] && +- !test_and_set_bit(ec_idx, h->s->blocks_allocated)) ++ !test_and_set_bit(h->s->data_block_idx[ec_idx], ++ h->s->blocks_allocated)) + goto got_bucket; + goto out_put_head; + got_bucket: + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + +- ob->ec_idx = ec_idx; ++ ob->ec_idx = h->s->data_block_idx[ec_idx]; + ob->ec = h->s; + + add_new_bucket(c, ptrs, devs_may_alloc, +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index 687f973e4b3a..17a6869bb8cd 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -92,6 +92,11 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, + } + } + ++int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, ++ struct dev_stripe_state *, struct bch_devs_mask *, ++ unsigned, unsigned *, bool *, enum alloc_reserve, ++ unsigned, struct closure *); ++ + struct write_point *bch2_alloc_sectors_start(struct bch_fs *, + unsigned, unsigned, + struct write_point_specifier, +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index e7d1caa8885e..a897be3eb7fe 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -200,40 +200,6 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) + return false; + } + +-static void ec_stripe_key_init(struct bch_fs *c, +- struct bkey_i_stripe *s, +- struct open_buckets *blocks, +- struct open_buckets *parity, +- unsigned stripe_size) +-{ +- struct open_bucket *ob; +- unsigned i, u64s; +- +- bkey_stripe_init(&s->k_i); +- s->v.sectors = cpu_to_le16(stripe_size); +- s->v.algorithm = 0; +- s->v.nr_blocks = parity->nr + blocks->nr; +- s->v.nr_redundant = parity->nr; +- s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); +- s->v.csum_type = BCH_CSUM_CRC32C; +- s->v.pad = 0; +- +- open_bucket_for_each(c, blocks, ob, i) +- s->v.ptrs[i] = ob->ptr; +- +- open_bucket_for_each(c, parity, ob, i) +- s->v.ptrs[blocks->nr + i] = ob->ptr; +- +- while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { +- BUG_ON(1 << s->v.csum_granularity_bits >= +- le16_to_cpu(s->v.sectors) || +- s->v.csum_granularity_bits == U8_MAX); +- s->v.csum_granularity_bits++; +- } +- +- set_bkey_val_u64s(&s->k, u64s); +-} +- + /* Checksumming: */ + + static void ec_generate_checksums(struct ec_stripe_buf *buf) +@@ -866,6 +832,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) + goto err; + } + ++ BUG_ON(!s->allocated); ++ + if (!percpu_ref_tryget(&c->writes)) + goto err; + +@@ -953,6 +921,8 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) + { + struct ec_stripe_new *s = h->s; + ++ BUG_ON(!s->allocated && !s->err); ++ + h->s = NULL; + s->pending = true; + +@@ -1063,14 +1033,38 @@ static unsigned pick_blocksize(struct bch_fs *c, + return best.size; + } + +-int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) ++static void ec_stripe_key_init(struct bch_fs *c, ++ struct bkey_i_stripe *s, ++ unsigned nr_data, ++ unsigned nr_parity, ++ unsigned stripe_size) ++{ ++ unsigned u64s; ++ ++ bkey_stripe_init(&s->k_i); ++ s->v.sectors = 
cpu_to_le16(stripe_size); ++ s->v.algorithm = 0; ++ s->v.nr_blocks = nr_data + nr_parity; ++ s->v.nr_redundant = nr_parity; ++ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); ++ s->v.csum_type = BCH_CSUM_CRC32C; ++ s->v.pad = 0; ++ ++ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { ++ BUG_ON(1 << s->v.csum_granularity_bits >= ++ le16_to_cpu(s->v.sectors) || ++ s->v.csum_granularity_bits == U8_MAX); ++ s->v.csum_granularity_bits++; ++ } ++ ++ set_bkey_val_u64s(&s->k, u64s); ++} ++ ++static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) + { + struct ec_stripe_new *s; + unsigned i; + +- BUG_ON(h->parity.nr != h->redundancy); +- BUG_ON(!h->blocks.nr); +- BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX); + lockdep_assert_held(&h->lock); + + s = kzalloc(sizeof(*s), GFP_KERNEL); +@@ -1081,11 +1075,9 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) + atomic_set(&s->pin, 1); + s->c = c; + s->h = h; +- s->blocks = h->blocks; +- s->parity = h->parity; +- +- memset(&h->blocks, 0, sizeof(h->blocks)); +- memset(&h->parity, 0, sizeof(h->parity)); ++ s->nr_data = min_t(unsigned, h->nr_active_devs, ++ EC_STRIPE_MAX) - h->redundancy; ++ s->nr_parity = h->redundancy; + + bch2_keylist_init(&s->keys, s->inline_keys); + +@@ -1093,9 +1085,8 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) + s->stripe.size = h->blocksize; + memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); + +- ec_stripe_key_init(c, &s->stripe.key, +- &s->blocks, &s->parity, +- h->blocksize); ++ ec_stripe_key_init(c, &s->stripe.key, s->nr_data, ++ s->nr_parity, h->blocksize); + + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { + s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); +@@ -1153,6 +1144,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) + { + if (h->s && ++ h->s->allocated && + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr) == h->s->blocks.nr) + ec_stripe_set_pending(c, h); +@@ -1160,7 +1152,7 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) + mutex_unlock(&h->lock); + } + +-struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ++struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +@@ -1185,6 +1177,122 @@ found: + return h; + } + ++/* ++ * XXX: use a higher watermark for allocating open buckets here: ++ */ ++static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ unsigned i, nr_have, nr_data = ++ min_t(unsigned, h->nr_active_devs, ++ EC_STRIPE_MAX) - h->redundancy; ++ bool have_cache = true; ++ int ret = 0; ++ ++ devs = h->devs; ++ ++ for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { ++ __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); ++ --nr_data; ++ } ++ ++ BUG_ON(h->s->blocks.nr > nr_data); ++ BUG_ON(h->s->parity.nr > h->redundancy); ++ ++ open_bucket_for_each(c, &h->s->parity, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ open_bucket_for_each(c, &h->s->blocks, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ ++ percpu_down_read(&c->mark_lock); ++ rcu_read_lock(); ++ ++ if (h->s->parity.nr < h->redundancy) { ++ nr_have = h->s->parity.nr; ++ ++ ret = bch2_bucket_alloc_set(c, &h->s->parity, ++ &h->parity_stripe, ++ &devs, ++ h->redundancy, ++ &nr_have, ++ &have_cache, ++ RESERVE_NONE, ++ 0, ++ NULL); ++ 
if (ret) ++ goto err; ++ } ++ ++ if (h->s->blocks.nr < nr_data) { ++ nr_have = h->s->blocks.nr; ++ ++ ret = bch2_bucket_alloc_set(c, &h->s->blocks, ++ &h->block_stripe, ++ &devs, ++ nr_data, ++ &nr_have, ++ &have_cache, ++ RESERVE_NONE, ++ 0, ++ NULL); ++ if (ret) ++ goto err; ++ } ++err: ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ return ret; ++} ++ ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy) ++{ ++ struct closure cl; ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ unsigned i, data_idx = 0; ++ ++ closure_init_stack(&cl); ++ ++ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); ++ if (!h) ++ return NULL; ++ ++ if (!h->s && ec_new_stripe_alloc(c, h)) { ++ bch2_ec_stripe_head_put(c, h); ++ return NULL; ++ } ++ ++ if (!h->s->allocated) { ++ if (new_stripe_alloc_buckets(c, h)) { ++ bch2_ec_stripe_head_put(c, h); ++ h = NULL; ++ goto out; ++ } ++ ++ open_bucket_for_each(c, &h->s->blocks, ob, i) { ++ data_idx = find_next_zero_bit(h->s->blocks_allocated, ++ h->s->nr_data, data_idx); ++ BUG_ON(data_idx >= h->s->nr_data); ++ ++ h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; ++ h->s->data_block_idx[i] = data_idx; ++ data_idx++; ++ } ++ ++ open_bucket_for_each(c, &h->s->parity, ob, i) ++ h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; ++ ++ h->s->allocated = true; ++ } ++out: ++ closure_sync(&cl); ++ return h; ++} ++ + void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) + { + struct ec_stripe_head *h; +@@ -1195,9 +1303,6 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) + list_for_each_entry(h, &c->ec_stripe_head_list, list) { + + mutex_lock(&h->lock); +- bch2_open_buckets_stop_dev(c, ca, &h->blocks); +- bch2_open_buckets_stop_dev(c, ca, &h->parity); +- + if (!h->s) + goto unlock; + +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 6f9354f82656..d7396885792e 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -92,11 +92,15 @@ struct ec_stripe_new { + atomic_t pin; + + int err; +- bool pending; + ++ u8 nr_data; ++ u8 nr_parity; ++ bool allocated; ++ bool pending; + unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; + + struct open_buckets blocks; ++ u8 data_block_idx[EC_STRIPE_MAX]; + struct open_buckets parity; + + struct keylist keys; +-- +cgit v1.2.3 + + +From 607b0a15693285d0862140d27c563d5c43b5815b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 30 Jun 2020 14:44:19 -0400 +Subject: bcachefs: Allow existing stripes to be updated with new data buckets + +This solves internal fragmentation within stripes. We already have +copygc, which evacuates buckets that are partially or mostly empty, but +it's up to the ec code that manages stripes to deal with stripes that +have empty buckets in them. + +This patch changes the path for creating new stripes to check if there's +existing stripes with empty buckets - and if so, update them with new +data buckets instead of creating new stripes. + +TODO: improve the disk space accounting so that we can only use this +(more expensive path) when we have too much fragmentation in existing +stripes. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- + fs/bcachefs/ec.h | 6 ++-- + 2 files changed, 98 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index a897be3eb7fe..8922a3da4151 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -638,6 +638,7 @@ void bch2_stripes_heap_update(struct bch_fs *c, + + static int ec_stripe_delete(struct bch_fs *c, size_t idx) + { ++ //pr_info("deleting stripe %zu", idx); + return bch2_btree_delete_range(c, BTREE_ID_EC, + POS(0, idx), + POS(0, idx + 1), +@@ -756,6 +757,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + ++ /* XXX this doesn't support the reflink btree */ ++ + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(pos), + BTREE_ITER_INTENT); +@@ -856,7 +859,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) + goto err_put_writes; + } + +- ret = ec_stripe_bkey_insert(c, &s->stripe.key); ++ ret = s->existing_stripe ++ ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, ++ NULL, NULL, BTREE_INSERT_NOFAIL) ++ : ec_stripe_bkey_insert(c, &s->stripe.key); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_put_writes; +@@ -864,12 +870,19 @@ static void ec_stripe_create(struct ec_stripe_new *s) + + for_each_keylist_key(&s->keys, k) { + ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); +- if (ret) ++ if (ret) { ++ bch_err(c, "error creating stripe: error updating pointers"); + break; ++ } + } + + spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); ++#if 0 ++ pr_info("created a %s stripe %llu", ++ s->existing_stripe ? 
"existing" : "new", ++ s->stripe.key.k.p.offset); ++#endif + BUG_ON(m->on_heap); + bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); + spin_unlock(&c->ec_stripes_heap_lock); +@@ -975,6 +988,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, + if (!ob) + return; + ++ //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); ++ + ec = ob->ec; + mutex_lock(&ec->lock); + +@@ -1033,6 +1048,11 @@ static unsigned pick_blocksize(struct bch_fs *c, + return best.size; + } + ++static bool may_create_new_stripe(struct bch_fs *c) ++{ ++ return false; ++} ++ + static void ec_stripe_key_init(struct bch_fs *c, + struct bkey_i_stripe *s, + unsigned nr_data, +@@ -1245,6 +1265,59 @@ err: + return ret; + } + ++/* XXX: doesn't obey target: */ ++static s64 get_existing_stripe(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m; ++ size_t heap_idx; ++ u64 stripe_idx; ++ ++ if (may_create_new_stripe(c)) ++ return -1; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ for (heap_idx = 0; heap_idx < h->used; heap_idx++) { ++ if (!h->data[heap_idx].blocks_nonempty) ++ continue; ++ ++ stripe_idx = h->data[heap_idx].idx; ++ m = genradix_ptr(&c->stripes[0], stripe_idx); ++ ++ if (m->algorithm == algo && ++ m->nr_redundant == redundancy && ++ m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { ++ bch2_stripes_heap_del(c, m, stripe_idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ return stripe_idx; ++ } ++ } ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ return -1; ++} ++ ++static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (!ret) ++ bkey_reassemble(&stripe->key.k_i, k); ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ + struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, +@@ -1254,6 +1327,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i, data_idx = 0; ++ s64 idx; + + closure_init_stack(&cl); + +@@ -1267,6 +1341,24 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + } + + if (!h->s->allocated) { ++ if (!h->s->existing_stripe && ++ (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { ++ //pr_info("got existing stripe %llu", idx); ++ ++ h->s->existing_stripe = true; ++ h->s->existing_stripe_idx = idx; ++ if (get_stripe_key(c, idx, &h->s->stripe)) { ++ /* btree error */ ++ BUG(); ++ } ++ ++ for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) ++ if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { ++ __set_bit(i, h->s->blocks_allocated); ++ ec_block_io(c, &h->s->stripe, READ, i, &cl); ++ } ++ } ++ + if (new_stripe_alloc_buckets(c, h)) { + bch2_ec_stripe_head_put(c, h); + h = NULL; +@@ -1286,6 +1378,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + open_bucket_for_each(c, &h->s->parity, ob, i) + h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; + ++ //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); + h->s->allocated = true; + } + out: +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index d7396885792e..ad9078fdb045 100644 +--- a/fs/bcachefs/ec.h ++++ 
b/fs/bcachefs/ec.h +@@ -97,6 +97,9 @@ struct ec_stripe_new { + u8 nr_parity; + bool allocated; + bool pending; ++ bool existing_stripe; ++ u64 existing_stripe_idx; ++ + unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; + + struct open_buckets blocks; +@@ -125,9 +128,6 @@ struct ec_stripe_head { + struct dev_stripe_state block_stripe; + struct dev_stripe_state parity_stripe; + +- struct open_buckets blocks; +- struct open_buckets parity; +- + struct ec_stripe_new *s; + }; + +-- +cgit v1.2.3 + + +From ecc1a189363bc43df9b6c4767862738604892bf5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 9 Jul 2020 13:54:58 -0400 +Subject: bcachefs: Fix short buffered writes + +In the buffered write path, we have to check for short writes that write +to the full page, where the page wasn't UpToDate; when this happens, the +page is partly garbage, so we have to zero it out and revert that part +of the write. + +This check was wrong - we reverted total from copied, but didn't revert +the iov_iter, probably also leading to corrupted writes. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index b5bdb26db575..c5aa63e4044c 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1541,23 +1541,23 @@ retry_reservation: + if (!pg_copied) + break; + ++ if (!PageUptodate(page) && ++ pg_copied != PAGE_SIZE && ++ pos + copied + pg_copied < inode->v.i_size) { ++ zero_user(page, 0, PAGE_SIZE); ++ break; ++ } ++ + flush_dcache_page(page); + copied += pg_copied; ++ ++ if (pg_copied != pg_len) ++ break; + } + + if (!copied) + goto out; + +- if (copied < len && +- ((offset + copied) & (PAGE_SIZE - 1))) { +- struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; +- +- if (!PageUptodate(page)) { +- zero_user(page, 0, PAGE_SIZE); +- copied -= (offset + copied) & (PAGE_SIZE - 1); +- } +- } +- + spin_lock(&inode->v.i_lock); + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); +@@ -1654,6 +1654,7 @@ again: + } + pos += ret; + written += ret; ++ ret = 0; + + balance_dirty_pages_ratelimited(mapping); + } while (iov_iter_count(iter)); +-- +cgit v1.2.3 + + +From 840cc11445b55f979498971e1db2a6a6aac3ed55 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 9 Jul 2020 18:28:11 -0400 +Subject: bcachefs: Use x-macros for data types + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 8 ++++---- + fs/bcachefs/alloc_foreground.c | 18 +++++++++--------- + fs/bcachefs/bcachefs_format.h | 19 ++++++++++++------- + fs/bcachefs/btree_gc.c | 14 +++++++------- + fs/bcachefs/btree_io.c | 4 ++-- + fs/bcachefs/buckets.c | 40 ++++++++++++++++++++-------------------- + fs/bcachefs/buckets.h | 4 ++-- + fs/bcachefs/ec.c | 2 +- + fs/bcachefs/extents.c | 4 ++-- + fs/bcachefs/io.c | 4 ++-- + fs/bcachefs/journal.c | 4 ++-- + fs/bcachefs/journal_io.c | 10 +++++----- + fs/bcachefs/journal_reclaim.c | 8 ++++---- + fs/bcachefs/move.c | 10 +++++----- + fs/bcachefs/movinggc.c | 2 +- + fs/bcachefs/opts.c | 9 +++------ + fs/bcachefs/replicas.c | 22 +++++++++++----------- + fs/bcachefs/replicas.h | 2 +- + fs/bcachefs/super-io.c | 4 ++-- + fs/bcachefs/super.c | 4 ++-- + fs/bcachefs/sysfs.c | 24 ++++++++++++------------ + 21 files changed, 109 insertions(+), 107 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 22d690b5a242..88ef65559765 100644 +--- a/fs/bcachefs/alloc_background.c 
++++ b/fs/bcachefs/alloc_background.c +@@ -53,10 +53,10 @@ static void pd_controllers_update(struct work_struct *work) + * reclaimed by copy GC + */ + s64 fragmented = (bucket_to_sector(ca, +- stats.buckets[BCH_DATA_USER] + +- stats.buckets[BCH_DATA_CACHED]) - +- (stats.sectors[BCH_DATA_USER] + +- stats.sectors[BCH_DATA_CACHED])) << 9; ++ stats.buckets[BCH_DATA_user] + ++ stats.buckets[BCH_DATA_cached]) - ++ (stats.sectors[BCH_DATA_user] + ++ stats.sectors[BCH_DATA_cached])) << 9; + + fragmented = max(0LL, fragmented); + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index d08820effc8f..cc5e6d3d0012 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -534,7 +534,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c, + if (*nr_effective < nr_replicas && + test_bit(ob->ptr.dev, devs_may_alloc->d) && + (ca->mi.durability || +- (wp->type == BCH_DATA_USER && !*have_cache)) && ++ (wp->type == BCH_DATA_user && !*have_cache)) && + (ob->ec || !need_ec)) { + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, +@@ -813,11 +813,11 @@ retry: + + wp = writepoint_find(c, write_point.v); + +- if (wp->type == BCH_DATA_USER) ++ if (wp->type == BCH_DATA_user) + ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; + + /* metadata may not allocate on cache devices: */ +- if (wp->type != BCH_DATA_USER) ++ if (wp->type != BCH_DATA_user) + have_cache = true; + + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { +@@ -856,7 +856,7 @@ alloc_done: + + /* Free buckets we didn't use: */ + open_bucket_for_each(c, &wp->ptrs, ob, i) +- open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER); ++ open_bucket_free_unused(c, ob, wp->type == BCH_DATA_user); + + wp->ptrs = ptrs; + +@@ -876,7 +876,7 @@ err: + ob_push(c, &ptrs, ob); + else + open_bucket_free_unused(c, ob, +- wp->type == BCH_DATA_USER); ++ wp->type == BCH_DATA_user); + wp->ptrs = ptrs; + + mutex_unlock(&wp->lock); +@@ -907,7 +907,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + struct bch_extent_ptr tmp = ob->ptr; + + tmp.cached = !ca->mi.durability && +- wp->type == BCH_DATA_USER; ++ wp->type == BCH_DATA_user; + + tmp.offset += ca->mi.bucket_size - ob->sectors_free; + bch2_bkey_append_ptr(k, tmp); +@@ -956,12 +956,12 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) + c->open_buckets_freelist = ob - c->open_buckets; + } + +- writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); +- writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); ++ writepoint_init(&c->btree_write_point, BCH_DATA_btree); ++ writepoint_init(&c->rebalance_write_point, BCH_DATA_user); + + for (wp = c->write_points; + wp < c->write_points + c->write_points_nr; wp++) { +- writepoint_init(wp, BCH_DATA_USER); ++ writepoint_init(wp, BCH_DATA_user); + + wp->last_used = sched_clock(); + wp->write_point = (unsigned long) wp; +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index f808e63a713d..d5a2230e403c 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1026,14 +1026,19 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); + + /* BCH_SB_FIELD_replicas: */ + ++#define BCH_DATA_TYPES() \ ++ x(none, 0) \ ++ x(sb, 1) \ ++ x(journal, 2) \ ++ x(btree, 3) \ ++ x(user, 4) \ ++ x(cached, 5) ++ + enum bch_data_type { +- BCH_DATA_NONE = 0, +- BCH_DATA_SB = 1, +- BCH_DATA_JOURNAL = 2, +- BCH_DATA_BTREE = 3, +- BCH_DATA_USER = 4, +- BCH_DATA_CACHED = 5, +- BCH_DATA_NR = 6, ++#define x(t, 
n) BCH_DATA_##t, ++ BCH_DATA_TYPES() ++#undef x ++ BCH_DATA_NR + }; + + struct bch_replicas_entry_v0 { +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index dac1c3a3c527..7ede033d40cc 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -433,16 +433,16 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, + + if (offset == BCH_SB_SECTOR) + mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, +- BCH_DATA_SB, flags); ++ BCH_DATA_sb, flags); + + mark_metadata_sectors(c, ca, offset, + offset + (1 << layout->sb_max_size_bits), +- BCH_DATA_SB, flags); ++ BCH_DATA_sb, flags); + } + + for (i = 0; i < ca->journal.nr; i++) { + b = ca->journal.buckets[i]; +- bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, ++ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), flags); + } +@@ -676,8 +676,8 @@ static int bch2_gc_done(struct bch_fs *c, + char buf[80]; + + if (metadata_only && +- (e->data_type == BCH_DATA_USER || +- e->data_type == BCH_DATA_CACHED)) ++ (e->data_type == BCH_DATA_user || ++ e->data_type == BCH_DATA_cached)) + continue; + + bch2_replicas_entry_to_text(&PBUF(buf), e); +@@ -762,8 +762,8 @@ static int bch2_gc_start(struct bch_fs *c, + d->gen_valid = s->gen_valid; + + if (metadata_only && +- (s->mark.data_type == BCH_DATA_USER || +- s->mark.data_type == BCH_DATA_CACHED)) { ++ (s->mark.data_type == BCH_DATA_user || ++ s->mark.data_type == BCH_DATA_cached)) { + d->_mark = s->mark; + d->_mark.owned_by_allocator = 0; + } +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index d2c28eb75bde..b23246087e4e 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1231,7 +1231,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + set_btree_node_read_in_flight(b); + + if (rb->have_ioref) { +- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE], ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], + bio_sectors(bio)); + bio_set_dev(bio, ca->disk_sb.bdev); + +@@ -1700,7 +1700,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + b->written += sectors_to_write; + + /* XXX: submitting IO with btree locks held: */ +- bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); ++ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); + return; + err: + set_btree_node_noevict(b); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index dbf2a3f1d904..6b0fbbfd6a35 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -133,13 +133,13 @@ void bch2_fs_usage_initialize(struct bch_fs *c) + cpu_replicas_entry(&c->replicas, i); + + switch (e->data_type) { +- case BCH_DATA_BTREE: ++ case BCH_DATA_btree: + usage->btree += usage->replicas[i]; + break; +- case BCH_DATA_USER: ++ case BCH_DATA_user: + usage->data += usage->replicas[i]; + break; +- case BCH_DATA_CACHED: ++ case BCH_DATA_cached: + usage->cached += usage->replicas[i]; + break; + } +@@ -367,7 +367,7 @@ static inline int is_fragmented_bucket(struct bucket_mark m, + struct bch_dev *ca) + { + if (!m.owned_by_allocator && +- m.data_type == BCH_DATA_USER && ++ m.data_type == BCH_DATA_user && + bucket_sectors_used(m)) + return max_t(int, 0, (int) ca->mi.bucket_size - + bucket_sectors_used(m)); +@@ -382,7 +382,7 @@ static inline int bucket_stripe_sectors(struct bucket_mark m) + static inline enum bch_data_type bucket_type(struct bucket_mark m) + { + return m.cached_sectors && !m.dirty_sectors +- ? BCH_DATA_CACHED ++ ? 
BCH_DATA_cached + : m.data_type; + } + +@@ -435,7 +435,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, + enum bch_data_type type, + int nr, s64 size) + { +- if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) ++ if (type == BCH_DATA_sb || type == BCH_DATA_journal) + fs_usage->hidden += size; + + dev_usage->buckets[type] += nr; +@@ -472,7 +472,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + + u->sectors[old.data_type] -= old.dirty_sectors; + u->sectors[new.data_type] += new.dirty_sectors; +- u->sectors[BCH_DATA_CACHED] += ++ u->sectors[BCH_DATA_cached] += + (int) new.cached_sectors - (int) old.cached_sectors; + u->sectors_fragmented += + is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); +@@ -520,13 +520,13 @@ static inline int update_replicas(struct bch_fs *c, + return 0; + + switch (r->data_type) { +- case BCH_DATA_BTREE: ++ case BCH_DATA_btree: + fs_usage->btree += sectors; + break; +- case BCH_DATA_USER: ++ case BCH_DATA_user: + fs_usage->data += sectors; + break; +- case BCH_DATA_CACHED: ++ case BCH_DATA_cached: + fs_usage->cached += sectors; + break; + } +@@ -797,8 +797,8 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + struct bucket_mark old, new; + bool overflow; + +- BUG_ON(data_type != BCH_DATA_SB && +- data_type != BCH_DATA_JOURNAL); ++ BUG_ON(data_type != BCH_DATA_sb && ++ data_type != BCH_DATA_journal); + + old = bucket_cmpxchg(g, new, ({ + new.data_type = data_type; +@@ -829,8 +829,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + unsigned sectors, struct gc_pos pos, + unsigned flags) + { +- BUG_ON(type != BCH_DATA_SB && +- type != BCH_DATA_JOURNAL); ++ BUG_ON(type != BCH_DATA_sb && ++ type != BCH_DATA_journal); + + preempt_disable(); + +@@ -1122,7 +1122,7 @@ static int bch2_mark_extent(struct bch_fs *c, + BUG_ON(!sectors); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- s64 disk_sectors = data_type == BCH_DATA_BTREE ++ s64 disk_sectors = data_type == BCH_DATA_btree + ? sectors + : ptr_disk_sectors_delta(p, offset, sectors, flags); + +@@ -1284,12 +1284,12 @@ static int bch2_mark_key_locked(struct bch_fs *c, + : -c->opts.btree_node_size; + + ret = bch2_mark_extent(c, old, new, offset, sectors, +- BCH_DATA_BTREE, fs_usage, journal_seq, flags); ++ BCH_DATA_btree, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + ret = bch2_mark_extent(c, old, new, offset, sectors, +- BCH_DATA_USER, fs_usage, journal_seq, flags); ++ BCH_DATA_user, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_stripe: + ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); +@@ -1667,7 +1667,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + BUG_ON(!sectors); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- s64 disk_sectors = data_type == BCH_DATA_BTREE ++ s64 disk_sectors = data_type == BCH_DATA_btree + ? 
sectors + : ptr_disk_sectors_delta(p, offset, sectors, flags); + +@@ -1809,11 +1809,11 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + : -c->opts.btree_node_size; + + return bch2_trans_mark_extent(trans, k, offset, sectors, +- flags, BCH_DATA_BTREE); ++ flags, BCH_DATA_btree); + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return bch2_trans_mark_extent(trans, k, offset, sectors, +- flags, BCH_DATA_USER); ++ flags, BCH_DATA_user); + case KEY_TYPE_inode: + d = replicas_deltas_realloc(trans, 0); + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 4bde58130d39..fe342f0d2c88 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -99,9 +99,9 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k, + { + if (k->type == KEY_TYPE_btree_ptr || + k->type == KEY_TYPE_btree_ptr_v2) +- return BCH_DATA_BTREE; ++ return BCH_DATA_btree; + +- return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER; ++ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; + } + + static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 8922a3da4151..1aa8d5cd427b 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1144,7 +1144,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + h->redundancy = redundancy; + + rcu_read_lock(); +- h->devs = target_rw_devs(c, BCH_DATA_USER, target); ++ h->devs = target_rw_devs(c, BCH_DATA_user, target); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (!ca->mi.durability) +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 251d4af773a5..b001498f404c 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -194,7 +194,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) + goto err; + + err = "inconsistent"; +- if (mark.data_type != BCH_DATA_BTREE || ++ if (mark.data_type != BCH_DATA_btree || + mark.dirty_sectors < c->opts.btree_node_size) + goto err; + } +@@ -289,7 +289,7 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) + "key too stale: %i", stale); + + bch2_fs_inconsistent_on(!stale && +- (mark.data_type != BCH_DATA_USER || ++ (mark.data_type != BCH_DATA_user || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 82ea3642b8c5..ae1e157c591b 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1129,7 +1129,7 @@ again: + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); + +- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, ++ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + key_to_write); + } while (ret); + +@@ -2175,7 +2175,7 @@ get_bio: + goto out; + } + +- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index b4f7b61ba9ac..e84d80a4dcd1 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -847,7 +847,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + +- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, ++ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + 0); +@@ -1196,7 +1196,7 @@ ssize_t bch2_journal_print_debug(struct 
journal *j, char *buf) + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + + for_each_member_device_rcu(ca, c, iter, +- &c->rw_devs[BCH_DATA_JOURNAL]) { ++ &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index c298c2b7721d..d32b4d5d88cf 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -661,7 +661,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) + + for_each_member_device(ca, c, iter) { + if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && +- !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) ++ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) + continue; + + if ((ca->mi.state == BCH_MEMBER_STATE_RW || +@@ -695,7 +695,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) + * the devices - this is wrong: + */ + +- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); + + if (!degraded && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || +@@ -796,7 +796,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, + rcu_read_lock(); + + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, +- &c->rw_devs[BCH_DATA_JOURNAL]); ++ &c->rw_devs[BCH_DATA_journal]); + + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); +@@ -914,7 +914,7 @@ static void journal_write_done(struct closure *cl) + goto err; + } + +- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs); ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); + + if (bch2_mark_replicas(c, &replicas.e)) + goto err; +@@ -1106,7 +1106,7 @@ retry_alloc: + continue; + } + +- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], + sectors); + + bio = ca->journal.bio; +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 4811ab9f879e..57591983eebd 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -70,7 +70,7 @@ static struct journal_space { + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, +- &c->rw_devs[BCH_DATA_JOURNAL]) { ++ &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; + unsigned buckets_this_device, sectors_this_device; + +@@ -139,7 +139,7 @@ void bch2_journal_space_available(struct journal *j) + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, +- &c->rw_devs[BCH_DATA_JOURNAL]) { ++ &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) +@@ -618,7 +618,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) + return ret; + + mutex_lock(&c->replicas_gc_lock); +- bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); ++ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); + + seq = 0; + +@@ -627,7 +627,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) + struct bch_replicas_padded replicas; + + seq = max(seq, journal_last_seq(j)); +- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + journal_seq_pin(j, seq)->devs); + seq++; + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index b42350f9e9fb..486ba34af3c1 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -517,7 +517,7 @@ static int __bch2_move_data(struct bch_fs *c, + bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + +- 
stats->data_type = BCH_DATA_USER; ++ stats->data_type = BCH_DATA_user; + stats->btree_id = btree_id; + stats->pos = POS_MIN; + +@@ -642,7 +642,7 @@ int bch2_move_data(struct bch_fs *c, + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); + +- stats->data_type = BCH_DATA_USER; ++ stats->data_type = BCH_DATA_user; + + ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, + pred, arg, stats, BTREE_ID_EXTENTS) ?: +@@ -677,7 +677,7 @@ static int bch2_move_btree(struct bch_fs *c, + + bch2_trans_init(&trans, c, 0, 0); + +- stats->data_type = BCH_DATA_BTREE; ++ stats->data_type = BCH_DATA_btree; + + for (id = 0; id < BTREE_ID_NR; id++) { + stats->btree_id = id; +@@ -773,7 +773,7 @@ int bch2_data_job(struct bch_fs *c, + + switch (op.op) { + case BCH_DATA_OP_REREPLICATE: +- stats->data_type = BCH_DATA_JOURNAL; ++ stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, -1); + + ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; +@@ -794,7 +794,7 @@ int bch2_data_job(struct bch_fs *c, + if (op.migrate.dev >= c->sb.nr_devices) + return -EINVAL; + +- stats->data_type = BCH_DATA_JOURNAL; ++ stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); + + ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 0a87cd7405dd..27e966edac23 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -160,7 +160,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) + struct copygc_heap_entry e; + + if (m.owned_by_allocator || +- m.data_type != BCH_DATA_USER || ++ m.data_type != BCH_DATA_user || + !bucket_sectors_used(m) || + bucket_sectors_used(m) >= ca->mi.bucket_size) + continue; +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index 94d6c044a27d..afe25cd26c06 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -45,12 +45,9 @@ const char * const bch2_str_hash_types[] = { + }; + + const char * const bch2_data_types[] = { +- "none", +- "sb", +- "journal", +- "btree", +- "data", +- "cached", ++#define x(t, n) #t, ++ BCH_DATA_TYPES() ++#undef x + NULL + }; + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 67a7128fd9af..be44a25e595e 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -113,16 +113,16 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: +- e->data_type = BCH_DATA_BTREE; ++ e->data_type = BCH_DATA_btree; + extent_to_replicas(k, e); + break; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: +- e->data_type = BCH_DATA_USER; ++ e->data_type = BCH_DATA_user; + extent_to_replicas(k, e); + break; + case KEY_TYPE_stripe: +- e->data_type = BCH_DATA_USER; ++ e->data_type = BCH_DATA_user; + stripe_to_replicas(k, e); + break; + } +@@ -137,7 +137,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, + unsigned i; + + BUG_ON(!data_type || +- data_type == BCH_DATA_SB || ++ data_type == BCH_DATA_sb || + data_type >= BCH_DATA_NR); + + e->data_type = data_type; +@@ -611,7 +611,7 @@ retry: + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + +- if (e->data_type == BCH_DATA_JOURNAL || ++ if (e->data_type == BCH_DATA_journal || + c->usage_base->replicas[i] || + percpu_u64_get(&c->usage[0]->replicas[i]) || + percpu_u64_get(&c->usage[1]->replicas[i])) +@@ -1037,13 +1037,13 @@ static bool have_enough_devs(struct replicas_status s, + + bool 
bch2_have_enough_devs(struct replicas_status s, unsigned flags) + { +- return (have_enough_devs(s, BCH_DATA_JOURNAL, ++ return (have_enough_devs(s, BCH_DATA_journal, + flags & BCH_FORCE_IF_METADATA_DEGRADED, + flags & BCH_FORCE_IF_METADATA_LOST) && +- have_enough_devs(s, BCH_DATA_BTREE, ++ have_enough_devs(s, BCH_DATA_btree, + flags & BCH_FORCE_IF_METADATA_DEGRADED, + flags & BCH_FORCE_IF_METADATA_LOST) && +- have_enough_devs(s, BCH_DATA_USER, ++ have_enough_devs(s, BCH_DATA_user, + flags & BCH_FORCE_IF_DATA_DEGRADED, + flags & BCH_FORCE_IF_DATA_LOST)); + } +@@ -1053,9 +1053,9 @@ int bch2_replicas_online(struct bch_fs *c, bool meta) + struct replicas_status s = bch2_replicas_status(c); + + return (meta +- ? min(s.replicas[BCH_DATA_JOURNAL].redundancy, +- s.replicas[BCH_DATA_BTREE].redundancy) +- : s.replicas[BCH_DATA_USER].redundancy) + 1; ++ ? min(s.replicas[BCH_DATA_journal].redundancy, ++ s.replicas[BCH_DATA_btree].redundancy) ++ : s.replicas[BCH_DATA_user].redundancy) + 1; + } + + unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +index 8527d82841bb..deda5f5c6e20 100644 +--- a/fs/bcachefs/replicas.h ++++ b/fs/bcachefs/replicas.h +@@ -36,7 +36,7 @@ int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); + static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, + unsigned dev) + { +- e->data_type = BCH_DATA_CACHED; ++ e->data_type = BCH_DATA_cached; + e->nr_devs = 1; + e->nr_required = 1; + e->devs[0] = dev; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index e4ea12fc0bfa..b600668258e2 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -654,7 +654,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) + bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); + bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); + +- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB], ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], + bio_sectors(bio)); + + percpu_ref_get(&ca->io_ref); +@@ -682,7 +682,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) + roundup((size_t) vstruct_bytes(sb), + bdev_logical_block_size(ca->disk_sb.bdev))); + +- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB], ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], + bio_sectors(bio)); + + percpu_ref_get(&ca->io_ref); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index b7b4c5c0a2fa..1440ad9fc25e 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1067,7 +1067,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + + init_rwsem(&ca->bucket_lock); + +- writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); ++ writepoint_init(&ca->copygc_write_point, BCH_DATA_user); + + bch2_dev_copygc_init(ca); + +@@ -1200,7 +1200,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) + return ret; + + if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && +- !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) { ++ !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { + mutex_lock(&c->sb_lock); + bch2_mark_dev_superblock(ca->fs, ca, 0); + mutex_unlock(&c->sb_lock); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 9a4b93433170..0c3f8c357129 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -870,18 +870,18 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) + fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + 
ca->mi.nbuckets - ca->mi.first_bucket, + stats.buckets_alloc, +- stats.buckets[BCH_DATA_SB], +- stats.buckets[BCH_DATA_JOURNAL], +- stats.buckets[BCH_DATA_BTREE], +- stats.buckets[BCH_DATA_USER], +- stats.buckets[BCH_DATA_CACHED], ++ stats.buckets[BCH_DATA_sb], ++ stats.buckets[BCH_DATA_journal], ++ stats.buckets[BCH_DATA_btree], ++ stats.buckets[BCH_DATA_user], ++ stats.buckets[BCH_DATA_cached], + stats.buckets_ec, + ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, +- stats.sectors[BCH_DATA_SB], +- stats.sectors[BCH_DATA_JOURNAL], +- stats.sectors[BCH_DATA_BTREE], +- stats.sectors[BCH_DATA_USER], +- stats.sectors[BCH_DATA_CACHED], ++ stats.sectors[BCH_DATA_sb], ++ stats.sectors[BCH_DATA_journal], ++ stats.sectors[BCH_DATA_btree], ++ stats.sectors[BCH_DATA_user], ++ stats.sectors[BCH_DATA_cached], + stats.sectors_ec, + stats.sectors_fragmented, + ca->copygc_threshold, +@@ -889,8 +889,8 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) + c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, + BTREE_NODE_OPEN_BUCKET_RESERVE, + c->open_buckets_wait.list.first ? "waiting" : "empty", +- nr[BCH_DATA_BTREE], +- nr[BCH_DATA_USER], ++ nr[BCH_DATA_btree], ++ nr[BCH_DATA_user], + c->btree_reserve_cache_nr); + } + +-- +cgit v1.2.3 + + +From 9c64a9e3112ecb69a9a22dea856b37a32c2b961a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 11 Jul 2020 15:35:04 -0400 +Subject: bcachefs: Fix extent_ptr_durability() calculation for erasure coded + data + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index b001498f404c..a4191ccf565b 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -724,7 +724,7 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + if (WARN_ON(!s)) + goto out; + +- durability = max_t(unsigned, durability, s->nr_redundant); ++ durability += s->nr_redundant; + } + out: + return durability; +-- +cgit v1.2.3 + + +From 699ce325cf9021fb714789d29e4a45103812678e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 11 Jul 2020 13:23:17 -0400 +Subject: bcachefs: Drop extra pointers when marking data as in a stripe + +We ideally want the buckets used for the extra initial replicas to be +reused right away. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 1aa8d5cd427b..f1f41056844f 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -784,12 +784,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + bkey_on_stack_reassemble(&sk, c, k); + e = bkey_i_to_s_extent(sk.k); + +- extent_for_each_ptr(e, ptr) { +- if (ptr->dev == dev) +- ec_ptr = ptr; +- else +- ptr->cached = true; +- } ++ bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); ++ ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); ++ BUG_ON(!ec_ptr); + + extent_stripe_ptr_add(e, s, ec_ptr, idx); + +-- +cgit v1.2.3 + + +From 0fe86ca43793a1e86441e4267b2dd4baa295c6c0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 11 Jul 2020 16:28:54 -0400 +Subject: bcachefs: Make copygc thread global + +Per device copygc threads don't move data to different devices and they +make fragmentation works - they don't make much sense anymore. 
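+
+To make the per-filesystem scan concrete, here is a minimal user-space
+sketch (illustrative only, not the kernel code): candidate buckets are
+keyed by (dev, offset), matching the reworked copygc_heap_entry and
+bucket_offset_cmp(), with a plain sorted array and binary search
+standing in for the eytzinger lookup:
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  struct copygc_heap_entry {
+          uint8_t  dev;
+          uint8_t  gen;
+          uint32_t sectors;
+          uint64_t offset;
+  };
+
+  /* compare a (dev, offset) search key against an entry */
+  static int key_cmp(uint8_t dev, uint64_t offset,
+                     const struct copygc_heap_entry *e)
+  {
+          if (dev != e->dev)
+                  return dev < e->dev ? -1 : 1;
+          if (offset != e->offset)
+                  return offset < e->offset ? -1 : 1;
+          return 0;
+  }
+
+  /* index of the last entry <= (dev, offset), or -1 if none */
+  static int find_le(const struct copygc_heap_entry *h, int n,
+                     uint8_t dev, uint64_t offset)
+  {
+          int lo = 0, hi = n, ret = -1;
+
+          while (lo < hi) {
+                  int mid = lo + (hi - lo) / 2;
+
+                  if (key_cmp(dev, offset, &h[mid]) >= 0) {
+                          ret = mid;
+                          lo = mid + 1;
+                  } else {
+                          hi = mid;
+                  }
+          }
+          return ret;
+  }
+
+  int main(void)
+  {
+          /* two candidate buckets on two different devices */
+          const struct copygc_heap_entry h[] = {
+                  { .dev = 0, .gen = 3, .sectors = 12, .offset = 1024 },
+                  { .dev = 1, .gen = 7, .sectors = 40, .offset = 512 },
+          };
+          const uint64_t bucket_size = 256;
+          uint8_t ptr_dev = 1, ptr_gen = 7;
+          uint64_t ptr_offset = 612;  /* inside the dev 1 bucket */
+          int i = find_le(h, 2, ptr_dev, ptr_offset);
+
+          if (i >= 0 &&
+              h[i].dev == ptr_dev &&
+              ptr_offset < h[i].offset + bucket_size &&
+              ptr_gen == h[i].gen)
+                  printf("rewrite extent via dev %u\n", (unsigned) ptr_dev);
+          else
+                  printf("skip\n");
+          return 0;
+  }
+
+That walk is what __copygc_pred() now performs for every pointer in a
+key, so one thread can pick rewrite targets across all rw devices.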
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 20 ++-- + fs/bcachefs/alloc_foreground.c | 5 +- + fs/bcachefs/bcachefs.h | 14 +-- + fs/bcachefs/buckets.c | 19 +--- + fs/bcachefs/buckets_types.h | 1 + + fs/bcachefs/movinggc.c | 213 ++++++++++++++++++++++------------------ + fs/bcachefs/movinggc.h | 6 +- + fs/bcachefs/super.c | 29 ++---- + fs/bcachefs/sysfs.c | 18 ++-- + include/trace/events/bcachefs.h | 6 +- + 10 files changed, 159 insertions(+), 172 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 88ef65559765..fd5b932c84a5 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -41,29 +41,26 @@ static void pd_controllers_update(struct work_struct *work) + struct bch_fs, + pd_controllers_update); + struct bch_dev *ca; ++ s64 free = 0, fragmented = 0; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + +- u64 free = bucket_to_sector(ca, ++ free += bucket_to_sector(ca, + __dev_buckets_free(ca, stats)) << 9; + /* + * Bytes of internal fragmentation, which can be + * reclaimed by copy GC + */ +- s64 fragmented = (bucket_to_sector(ca, ++ fragmented += max_t(s64, 0, (bucket_to_sector(ca, + stats.buckets[BCH_DATA_user] + + stats.buckets[BCH_DATA_cached]) - + (stats.sectors[BCH_DATA_user] + +- stats.sectors[BCH_DATA_cached])) << 9; +- +- fragmented = max(0LL, fragmented); +- +- bch2_pd_controller_update(&ca->copygc_pd, +- free, fragmented, -1); ++ stats.sectors[BCH_DATA_cached])) << 9); + } + ++ bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); + schedule_delayed_work(&c->pd_controllers_update, + c->pd_controllers_update_seconds * HZ); + } +@@ -1191,7 +1188,7 @@ stop: + void bch2_recalc_capacity(struct bch_fs *c) + { + struct bch_dev *ca; +- u64 capacity = 0, reserved_sectors = 0, gc_reserve; ++ u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; + unsigned bucket_size_max = 0; + unsigned long ra_pages = 0; + unsigned i, j; +@@ -1234,7 +1231,7 @@ void bch2_recalc_capacity(struct bch_fs *c) + + dev_reserve *= ca->mi.bucket_size; + +- ca->copygc_threshold = dev_reserve; ++ copygc_threshold += dev_reserve; + + capacity += bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket); +@@ -1253,6 +1250,7 @@ void bch2_recalc_capacity(struct bch_fs *c) + + reserved_sectors = min(reserved_sectors, capacity); + ++ c->copygc_threshold = copygc_threshold; + c->capacity = capacity - reserved_sectors; + + c->bucket_size_max = bucket_size_max; +@@ -1312,7 +1310,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + bch2_writepoint_stop(c, ca, &c->write_points[i]); + +- bch2_writepoint_stop(c, ca, &ca->copygc_write_point); ++ bch2_writepoint_stop(c, ca, &c->copygc_write_point); + bch2_writepoint_stop(c, ca, &c->rebalance_write_point); + bch2_writepoint_stop(c, ca, &c->btree_write_point); + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index cc5e6d3d0012..10bc5bfaf2fa 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -956,8 +956,9 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) + c->open_buckets_freelist = ob - c->open_buckets; + } + +- writepoint_init(&c->btree_write_point, BCH_DATA_btree); +- writepoint_init(&c->rebalance_write_point, BCH_DATA_user); ++ writepoint_init(&c->btree_write_point, BCH_DATA_btree); ++ writepoint_init(&c->rebalance_write_point, 
BCH_DATA_user); ++ writepoint_init(&c->copygc_write_point, BCH_DATA_user); + + for (wp = c->write_points; + wp < c->write_points + c->write_points_nr; wp++) { +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 271add3ade9b..ee79b9294461 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -453,13 +453,6 @@ struct bch_dev { + + alloc_heap alloc_heap; + +- /* Copying GC: */ +- struct task_struct *copygc_thread; +- copygc_heap copygc_heap; +- struct bch_pd_controller copygc_pd; +- struct write_point copygc_write_point; +- u64 copygc_threshold; +- + atomic64_t rebalance_work; + + struct journal_device journal; +@@ -753,6 +746,13 @@ struct bch_fs { + /* REBALANCE */ + struct bch_fs_rebalance rebalance; + ++ /* COPYGC */ ++ struct task_struct *copygc_thread; ++ copygc_heap copygc_heap; ++ struct bch_pd_controller copygc_pd; ++ struct write_point copygc_write_point; ++ u64 copygc_threshold; ++ + /* STRIPES: */ + GENRADIX(struct stripe) stripes[2]; + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 6b0fbbfd6a35..ddace47a2f77 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -2024,7 +2024,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + alloc_fifo free[RESERVE_NR]; + alloc_fifo free_inc; + alloc_heap alloc_heap; +- copygc_heap copygc_heap; + + size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / c->opts.btree_node_size); +@@ -2033,15 +2032,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); + size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), + btree_reserve * 2); +- bool resize = ca->buckets[0] != NULL, +- start_copygc = ca->copygc_thread != NULL; ++ bool resize = ca->buckets[0] != NULL; + int ret = -ENOMEM; + unsigned i; + + memset(&free, 0, sizeof(free)); + memset(&free_inc, 0, sizeof(free_inc)); + memset(&alloc_heap, 0, sizeof(alloc_heap)); +- memset(©gc_heap, 0, sizeof(copygc_heap)); + + if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + + nbuckets * sizeof(struct bucket), +@@ -2054,14 +2051,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + copygc_reserve, GFP_KERNEL) || + !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || + !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || +- !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || +- !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL)) ++ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) + goto err; + + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = nbuckets; + +- bch2_copygc_stop(ca); ++ bch2_copygc_stop(c); + + if (resize) { + down_write(&c->gc_lock); +@@ -2104,21 +2100,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + /* with gc lock held, alloc_heap can't be in use: */ + swap(ca->alloc_heap, alloc_heap); + +- /* and we shut down copygc: */ +- swap(ca->copygc_heap, copygc_heap); +- + nbuckets = ca->mi.nbuckets; + + if (resize) + up_write(&ca->bucket_lock); + +- if (start_copygc && +- bch2_copygc_start(c, ca)) +- bch_err(ca, "error restarting copygc thread"); +- + ret = 0; + err: +- free_heap(©gc_heap); + free_heap(&alloc_heap); + free_fifo(&free_inc); + for (i = 0; i < RESERVE_NR; i++) +@@ -2135,7 +2123,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) + { + unsigned i; + +- free_heap(&ca->copygc_heap); + free_heap(&ca->alloc_heap); + free_fifo(&ca->free_inc); + for (i = 0; 
i < RESERVE_NR; i++) +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 53f22726893d..4ebe80b05ffc 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -123,6 +123,7 @@ struct disk_reservation { + }; + + struct copygc_heap_entry { ++ u8 dev; + u8 gen; + u32 sectors; + u64 offset; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 27e966edac23..4429b2bb2df8 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -43,13 +43,6 @@ + #define COPYGC_BUCKETS_PER_ITER(ca) \ + ((ca)->free[RESERVE_MOVINGGC].size / 2) + +-/* +- * Max sectors to move per iteration: Have to take into account internal +- * fragmentation from the multiple write points for each generation: +- */ +-#define COPYGC_SECTORS_PER_ITER(ca) \ +- ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) +- + static inline int sectors_used_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) +@@ -62,18 +55,22 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) + const struct copygc_heap_entry *l = _l; + const struct copygc_heap_entry *r = _r; + +- return cmp_int(l->offset, r->offset); ++ return cmp_int(l->dev, r->dev) ?: ++ cmp_int(l->offset, r->offset); + } + +-static bool __copygc_pred(struct bch_dev *ca, +- struct bkey_s_c k) ++static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) + { +- copygc_heap *h = &ca->copygc_heap; +- const struct bch_extent_ptr *ptr = +- bch2_bkey_has_device(k, ca->dev_idx); +- +- if (ptr) { +- struct copygc_heap_entry search = { .offset = ptr->offset }; ++ copygc_heap *h = &c->copygc_heap; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct copygc_heap_entry search = { ++ .dev = ptr->dev, ++ .offset = ptr->offset ++ }; + + ssize_t i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), +@@ -89,12 +86,13 @@ static bool __copygc_pred(struct bch_dev *ca, + + BUG_ON(i != j); + #endif +- return (i >= 0 && +- ptr->offset < h->data[i].offset + ca->mi.bucket_size && +- ptr->gen == h->data[i].gen); ++ if (i >= 0 && ++ ptr->offset < h->data[i].offset + ca->mi.bucket_size && ++ ptr->gen == h->data[i].gen) ++ return ptr->dev; + } + +- return false; ++ return -1; + } + + static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, +@@ -102,14 +100,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) + { +- struct bch_dev *ca = arg; +- +- if (!__copygc_pred(ca, k)) ++ int dev_idx = __copygc_pred(c, k); ++ if (dev_idx < 0) + return DATA_SKIP; + +- data_opts->target = dev_to_target(ca->dev_idx); ++ /* XXX: use io_opts for this inode */ ++ data_opts->target = dev_to_target(dev_idx); + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; +- data_opts->rewrite_dev = ca->dev_idx; ++ data_opts->rewrite_dev = dev_idx; + return DATA_REWRITE; + } + +@@ -125,20 +123,21 @@ static bool have_copygc_reserve(struct bch_dev *ca) + return ret; + } + +-static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) ++static void bch2_copygc(struct bch_fs *c) + { +- copygc_heap *h = &ca->copygc_heap; ++ copygc_heap *h = &c->copygc_heap; + struct copygc_heap_entry e, *i; + struct bucket_array *buckets; + struct bch_move_stats move_stats; + u64 sectors_to_move = 0, sectors_not_moved = 0; ++ u64 sectors_reserved = 0; + u64 buckets_to_move, buckets_not_moved = 0; +- size_t 
b; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ size_t b, heap_size = 0; + int ret; + + memset(&move_stats, 0, sizeof(move_stats)); +- closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); +- + /* + * Find buckets with lowest sector counts, skipping completely + * empty buckets, by building a maxheap sorted by sector count, +@@ -147,38 +146,51 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) + */ + h->used = 0; + +- /* +- * We need bucket marks to be up to date - gc can't be recalculating +- * them: +- */ +- down_read(&c->gc_lock); +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); +- +- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { +- struct bucket_mark m = READ_ONCE(buckets->b[b].mark); +- struct copygc_heap_entry e; +- +- if (m.owned_by_allocator || +- m.data_type != BCH_DATA_user || +- !bucket_sectors_used(m) || +- bucket_sectors_used(m) >= ca->mi.bucket_size) +- continue; ++ for_each_rw_member(ca, c, dev_idx) ++ heap_size += ca->mi.nbuckets >> 7; + +- e = (struct copygc_heap_entry) { +- .gen = m.gen, +- .sectors = bucket_sectors_used(m), +- .offset = bucket_to_sector(ca, b), +- }; +- heap_add_or_replace(h, e, -sectors_used_cmp, NULL); ++ if (h->size < heap_size) { ++ free_heap(&c->copygc_heap); ++ if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { ++ bch_err(c, "error allocating copygc heap"); ++ return; ++ } ++ } ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); ++ ++ spin_lock(&ca->fs->freelist_lock); ++ sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; ++ spin_unlock(&ca->fs->freelist_lock); ++ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ struct copygc_heap_entry e; ++ ++ if (m.owned_by_allocator || ++ m.data_type != BCH_DATA_user || ++ !bucket_sectors_used(m) || ++ bucket_sectors_used(m) >= ca->mi.bucket_size) ++ continue; ++ ++ e = (struct copygc_heap_entry) { ++ .gen = m.gen, ++ .sectors = bucket_sectors_used(m), ++ .offset = bucket_to_sector(ca, b), ++ }; ++ heap_add_or_replace(h, e, -sectors_used_cmp, NULL); ++ } ++ up_read(&ca->bucket_lock); + } +- up_read(&ca->bucket_lock); +- up_read(&c->gc_lock); + + for (i = h->data; i < h->data + h->used; i++) + sectors_to_move += i->sectors; + +- while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { ++ while (sectors_to_move > sectors_reserved) { + BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); + sectors_to_move -= e.sectors; + } +@@ -192,24 +204,26 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) + sizeof(h->data[0]), + bucket_offset_cmp, NULL); + +- ret = bch2_move_data(c, &ca->copygc_pd.rate, +- writepoint_ptr(&ca->copygc_write_point), ++ ret = bch2_move_data(c, &c->copygc_pd.rate, ++ writepoint_ptr(&c->copygc_write_point), + POS_MIN, POS_MAX, +- copygc_pred, ca, ++ copygc_pred, NULL, + &move_stats); + +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); +- for (i = h->data; i < h->data + h->used; i++) { +- size_t b = sector_to_bucket(ca, i->offset); +- struct bucket_mark m = READ_ONCE(buckets->b[b].mark); +- +- if (i->gen == m.gen && bucket_sectors_used(m)) { +- sectors_not_moved += bucket_sectors_used(m); +- buckets_not_moved++; ++ for_each_rw_member(ca, c, dev_idx) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ for (i = h->data; i < h->data + h->used; i++) { ++ size_t b = 
sector_to_bucket(ca, i->offset); ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ ++ if (i->gen == m.gen && bucket_sectors_used(m)) { ++ sectors_not_moved += bucket_sectors_used(m); ++ buckets_not_moved++; ++ } + } ++ up_read(&ca->bucket_lock); + } +- up_read(&ca->bucket_lock); + + if (sectors_not_moved && !ret) + bch_warn_ratelimited(c, +@@ -220,7 +234,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) + atomic64_read(&move_stats.keys_raced), + atomic64_read(&move_stats.sectors_raced)); + +- trace_copygc(ca, ++ trace_copygc(c, + atomic64_read(&move_stats.sectors_moved), sectors_not_moved, + buckets_to_move, buckets_not_moved); + } +@@ -239,20 +253,27 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) + * often and continually reduce the amount of fragmented space as the device + * fills up. So, we increase the threshold by half the current free space. + */ +-unsigned long bch2_copygc_wait_amount(struct bch_dev *ca) ++unsigned long bch2_copygc_wait_amount(struct bch_fs *c) + { +- struct bch_fs *c = ca->fs; +- struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); +- u64 fragmented_allowed = ca->copygc_threshold + +- ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ u64 fragmented_allowed = c->copygc_threshold; ++ u64 fragmented = 0; ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); ++ ++ fragmented_allowed += ((__dev_buckets_available(ca, usage) * ++ ca->mi.bucket_size) >> 1); ++ fragmented += usage.sectors_fragmented; ++ } + +- return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented); ++ return max_t(s64, 0, fragmented_allowed - fragmented); + } + + static int bch2_copygc_thread(void *arg) + { +- struct bch_dev *ca = arg; +- struct bch_fs *c = ca->fs; ++ struct bch_fs *c = arg; + struct io_clock *clock = &c->io_clock[WRITE]; + unsigned long last, wait; + +@@ -263,7 +284,7 @@ static int bch2_copygc_thread(void *arg) + break; + + last = atomic_long_read(&clock->now); +- wait = bch2_copygc_wait_amount(ca); ++ wait = bch2_copygc_wait_amount(c); + + if (wait > clock->max_slop) { + bch2_kthread_io_clock_wait(clock, last + wait, +@@ -271,29 +292,29 @@ static int bch2_copygc_thread(void *arg) + continue; + } + +- bch2_copygc(c, ca); ++ bch2_copygc(c); + } + + return 0; + } + +-void bch2_copygc_stop(struct bch_dev *ca) ++void bch2_copygc_stop(struct bch_fs *c) + { +- ca->copygc_pd.rate.rate = UINT_MAX; +- bch2_ratelimit_reset(&ca->copygc_pd.rate); ++ c->copygc_pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&c->copygc_pd.rate); + +- if (ca->copygc_thread) { +- kthread_stop(ca->copygc_thread); +- put_task_struct(ca->copygc_thread); ++ if (c->copygc_thread) { ++ kthread_stop(c->copygc_thread); ++ put_task_struct(c->copygc_thread); + } +- ca->copygc_thread = NULL; ++ c->copygc_thread = NULL; + } + +-int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) ++int bch2_copygc_start(struct bch_fs *c) + { + struct task_struct *t; + +- if (ca->copygc_thread) ++ if (c->copygc_thread) + return 0; + + if (c->opts.nochanges) +@@ -302,21 +323,21 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) + if (bch2_fs_init_fault("copygc_start")) + return -ENOMEM; + +- t = kthread_create(bch2_copygc_thread, ca, +- "bch_copygc[%s]", ca->name); ++ t = kthread_create(bch2_copygc_thread, c, ++ "bch_copygc[%s]", c->name); + if (IS_ERR(t)) + return PTR_ERR(t); + + get_task_struct(t); + +- ca->copygc_thread = t; +- 
wake_up_process(ca->copygc_thread); ++ c->copygc_thread = t; ++ wake_up_process(c->copygc_thread); + + return 0; + } + +-void bch2_dev_copygc_init(struct bch_dev *ca) ++void bch2_fs_copygc_init(struct bch_fs *c) + { +- bch2_pd_controller_init(&ca->copygc_pd); +- ca->copygc_pd.d_term = 0; ++ bch2_pd_controller_init(&c->copygc_pd); ++ c->copygc_pd.d_term = 0; + } +diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h +index dcd479632cf1..922738247d03 100644 +--- a/fs/bcachefs/movinggc.h ++++ b/fs/bcachefs/movinggc.h +@@ -2,8 +2,8 @@ + #ifndef _BCACHEFS_MOVINGGC_H + #define _BCACHEFS_MOVINGGC_H + +-void bch2_copygc_stop(struct bch_dev *); +-int bch2_copygc_start(struct bch_fs *, struct bch_dev *); +-void bch2_dev_copygc_init(struct bch_dev *); ++void bch2_copygc_stop(struct bch_fs *); ++int bch2_copygc_start(struct bch_fs *); ++void bch2_fs_copygc_init(struct bch_fs *); + + #endif /* _BCACHEFS_MOVINGGC_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 1440ad9fc25e..24a0fe266a85 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -174,10 +174,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) + int ret; + + bch2_rebalance_stop(c); +- +- for_each_member_device(ca, c, i) +- bch2_copygc_stop(ca); +- ++ bch2_copygc_stop(c); + bch2_gc_thread_stop(c); + + /* +@@ -357,8 +354,6 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) + + static int bch2_fs_read_write_late(struct bch_fs *c) + { +- struct bch_dev *ca; +- unsigned i; + int ret; + + ret = bch2_gc_thread_start(c); +@@ -367,13 +362,10 @@ static int bch2_fs_read_write_late(struct bch_fs *c) + return ret; + } + +- for_each_rw_member(ca, c, i) { +- ret = bch2_copygc_start(c, ca); +- if (ret) { +- bch_err(c, "error starting copygc threads"); +- percpu_ref_put(&ca->io_ref); +- return ret; +- } ++ ret = bch2_copygc_start(c); ++ if (ret) { ++ bch_err(c, "error starting copygc thread"); ++ return ret; + } + + ret = bch2_rebalance_start(c); +@@ -496,6 +488,7 @@ static void bch2_fs_free(struct bch_fs *c) + kfree(c->replicas_gc.entries); + kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); ++ free_heap(&c->copygc_heap); + + if (c->journal_reclaim_wq) + destroy_workqueue(c->journal_reclaim_wq); +@@ -644,6 +637,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); + ++ bch2_fs_copygc_init(c); + bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_allocator_background_init(c); + bch2_fs_allocator_foreground_init(c); +@@ -1067,10 +1061,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + + init_rwsem(&ca->bucket_lock); + +- writepoint_init(&ca->copygc_write_point, BCH_DATA_user); +- +- bch2_dev_copygc_init(ca); +- + INIT_WORK(&ca->io_error_work, bch2_io_error_work); + + bch2_time_stats_init(&ca->io_latency[READ]); +@@ -1311,8 +1301,6 @@ static bool bch2_fs_may_start(struct bch_fs *c) + + static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) + { +- bch2_copygc_stop(ca); +- + /* + * The allocator thread itself allocates btree nodes, so stop it first: + */ +@@ -1333,9 +1321,6 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) + if (bch2_dev_allocator_start(ca)) + return "error starting allocator thread"; + +- if (bch2_copygc_start(c, ca)) +- return "error starting copygc thread"; +- + return NULL; + } + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 0c3f8c357129..fadfcae473f8 100644 
+--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -379,6 +379,7 @@ SHOW(bch2_fs) + + sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); + sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ ++ sysfs_pd_controller_show(copy_gc, &c->copygc_pd); + + if (attr == &sysfs_rebalance_work) + return bch2_rebalance_work_show(c, buf); +@@ -460,14 +461,11 @@ STORE(bch2_fs) + } + + if (attr == &sysfs_copy_gc_enabled) { +- struct bch_dev *ca; +- unsigned i; + ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) + ?: (ssize_t) size; + +- for_each_member_device(ca, c, i) +- if (ca->copygc_thread) +- wake_up_process(ca->copygc_thread); ++ if (c->copygc_thread) ++ wake_up_process(c->copygc_thread); + return ret; + } + +@@ -482,6 +480,7 @@ STORE(bch2_fs) + sysfs_strtoul(pd_controllers_update_seconds, + c->pd_controllers_update_seconds); + sysfs_pd_controller_store(rebalance, &c->rebalance.pd); ++ sysfs_pd_controller_store(copy_gc, &c->copygc_pd); + + sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); + +@@ -607,6 +606,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_rebalance_enabled, + &sysfs_rebalance_work, + sysfs_pd_controller_files(rebalance), ++ sysfs_pd_controller_files(copy_gc), + + &sysfs_new_stripes, + +@@ -884,7 +884,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) + stats.sectors[BCH_DATA_cached], + stats.sectors_ec, + stats.sectors_fragmented, +- ca->copygc_threshold, ++ c->copygc_threshold, + c->freelist_wait.list.first ? "waiting" : "empty", + c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, + BTREE_NODE_OPEN_BUCKET_RESERVE, +@@ -951,8 +951,6 @@ SHOW(bch2_dev) + return out.pos - buf; + } + +- sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); +- + if (attr == &sysfs_cache_replacement_policy) { + bch2_string_opt_to_text(&out, + bch2_cache_replacement_policies, +@@ -1006,8 +1004,6 @@ STORE(bch2_dev) + struct bch_fs *c = ca->fs; + struct bch_member *mi; + +- sysfs_pd_controller_store(copy_gc, &ca->copygc_pd); +- + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + +@@ -1092,8 +1088,6 @@ struct attribute *bch2_dev_files[] = { + /* debug: */ + &sysfs_alloc_debug, + &sysfs_wake_allocator, +- +- sysfs_pd_controller_files(copy_gc), + NULL + }; + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 05ec550806f8..55d1eff1108b 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -470,10 +470,10 @@ TRACE_EVENT(move_data, + ); + + TRACE_EVENT(copygc, +- TP_PROTO(struct bch_dev *ca, ++ TP_PROTO(struct bch_fs *c, + u64 sectors_moved, u64 sectors_not_moved, + u64 buckets_moved, u64 buckets_not_moved), +- TP_ARGS(ca, ++ TP_ARGS(c, + sectors_moved, sectors_not_moved, + buckets_moved, buckets_not_moved), + +@@ -486,7 +486,7 @@ TRACE_EVENT(copygc, + ), + + TP_fast_assign( +- memcpy(__entry->uuid, ca->uuid.b, 16); ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->sectors_moved = sectors_moved; + __entry->sectors_not_moved = sectors_not_moved; + __entry->buckets_moved = buckets_moved; +-- +cgit v1.2.3 + + +From 2289c2982cac8033e6d8a00e30605a7b7f1271de Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 20 Jul 2020 15:51:05 -0400 +Subject: bcachefs: Add an option for rebuilding the replicas section + +There is a bug where we cnan end up clearing the data_has field in the +superblock members section, which causes us to skip reading the journal +and thus journal replay fails. This option tells the recovery path to +not trust those fields. 
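+
+Since the option is declared OPT_MOUNT it is intended as a one-off
+repair knob passed at mount time; assuming the usual mount-option
+plumbing, usage would look something like (device and mount point are
+placeholders):
+
+  # mount -t bcachefs -o rebuild_replicas /dev/sdb1 /mnt
+
+With the option set, recovery takes the same path as a filesystem with
+no replicas entries at all and rebuilds the section rather than
+trusting what is on disk.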
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/opts.h | 5 +++++ + fs/bcachefs/recovery.c | 3 ++- + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 3b051e7a8f1d..d6a832a38b20 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -260,6 +260,11 @@ enum opt_type { + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't replay the journal") \ ++ x(rebuild_replicas, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Rebuild the superblock replicas section") \ + x(keep_journal, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 1695a609ecd9..28972f30e198 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -974,7 +974,8 @@ int bch2_fs_recovery(struct bch_fs *c) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); + +- if (!c->replicas.entries) { ++ if (!c->replicas.entries || ++ c->opts.rebuild_replicas) { + bch_info(c, "building replicas info"); + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } +-- +cgit v1.2.3 + + +From 8282ccb66c50d15575ff84be96f9f10b1ae612bb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 20 Jul 2020 13:00:15 -0400 +Subject: bcachefs: Wrap write path in memalloc_nofs_save() + +This fixes a lockdep splat where we're allocating memory with vmalloc in +the compression bounce path, which doesn't always obey GFP_NOFS. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/compress.c | 6 +----- + fs/bcachefs/io.c | 12 +++++++++--- + 2 files changed, 10 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 920460a182b4..595d76aa3956 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -7,7 +7,6 @@ + #include "super-io.h" + + #include +-#include + #include + #include + +@@ -64,7 +63,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + struct bbuf ret; + struct bio_vec bv; + struct bvec_iter iter; +- unsigned nr_pages = 0, flags; ++ unsigned nr_pages = 0; + struct page *stack_pages[16]; + struct page **pages = NULL; + void *data; +@@ -104,10 +103,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + __bio_for_each_segment(bv, bio, iter, start) + pages[nr_pages++] = bv.bv_page; + +- flags = memalloc_nofs_save(); + data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); +- memalloc_nofs_restore(flags); +- + if (pages != stack_pages) + kfree(pages); + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index ae1e157c591b..bab018b81d09 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -31,6 +31,7 @@ + + #include + #include ++#include + + #include + +@@ -1053,7 +1054,10 @@ static void __bch2_write(struct closure *cl) + struct write_point *wp; + struct bio *bio; + bool skip_put = true; ++ unsigned nofs_flags; + int ret; ++ ++ nofs_flags = memalloc_nofs_save(); + again: + memset(&op->failed, 0, sizeof(op->failed)); + +@@ -1135,13 +1139,15 @@ again: + + if (!skip_put) + continue_at(cl, bch2_write_index, index_update_wq(op)); ++out: ++ memalloc_nofs_restore(nofs_flags); + return; + err: + op->error = ret; + op->flags |= BCH_WRITE_DONE; + + continue_at(cl, bch2_write_index, index_update_wq(op)); +- return; ++ goto out; + flush_io: + /* + * If the write can't all be submitted at once, we generally want to +@@ -1152,7 +1158,7 @@ flush_io: + */ + if (current->flags & PF_WQ_WORKER) { + continue_at(cl, bch2_write_index, index_update_wq(op)); +- return; 
++ goto out; + } + + closure_sync(cl); +@@ -1163,7 +1169,7 @@ flush_io: + if (op->error) { + op->flags |= BCH_WRITE_DONE; + continue_at_nobarrier(cl, bch2_write_done, NULL); +- return; ++ goto out; + } + } + +-- +cgit v1.2.3 + + +From 0bfa693f42434bb1321b827505d97cfb891cb154 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 21 Jul 2020 11:51:17 -0400 +Subject: bcachefs: Fix a faulty assertion + +Now that updates to interior nodes are journalled, we shouldn't be +checking topology of interior nodes until we've finished replaying +updates to that node. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index b41916f93c9b..69c87cf1faa1 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -26,7 +26,7 @@ + /* + * Verify that child nodes correctly span parent node's range: + */ +-static void btree_node_interior_verify(struct btree *b) ++static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + { + #ifdef CONFIG_BCACHEFS_DEBUG + struct bpos next_node = b->data->min_key; +@@ -37,6 +37,9 @@ static void btree_node_interior_verify(struct btree *b) + + BUG_ON(!b->c.level); + ++ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) ++ return; ++ + bch2_btree_node_iter_init_from_start(&iter, b); + + while (1) { +@@ -1120,8 +1123,8 @@ static struct btree *__btree_split_node(struct btree_update *as, + bch2_verify_btree_nr_keys(n2); + + if (n1->c.level) { +- btree_node_interior_verify(n1); +- btree_node_interior_verify(n2); ++ btree_node_interior_verify(as->c, n1); ++ btree_node_interior_verify(as->c, n2); + } + + return n2; +@@ -1180,7 +1183,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + BUG_ON(b->nsets != 1 || + b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); + +- btree_node_interior_verify(b); ++ btree_node_interior_verify(as->c, b); + } + + static void btree_split(struct btree_update *as, struct btree *b, +@@ -1378,7 +1381,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + + bch2_btree_node_unlock_write(b, iter); + +- btree_node_interior_verify(b); ++ btree_node_interior_verify(c, b); + + /* + * when called from the btree_split path the new nodes aren't added to +-- +cgit v1.2.3 + + +From c31e77d8a66740a84b85e9c2f5b07599675842d4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 21 Jul 2020 13:34:22 -0400 +Subject: bcachefs: Add bch2_blk_status_to_str() + +We define our own BLK_STS_REMOVED, so we need our own to_str helper too. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 4 ++-- + fs/bcachefs/ec.c | 2 +- + fs/bcachefs/io.c | 11 +++++++++-- + fs/bcachefs/io.h | 2 ++ + fs/bcachefs/journal_io.c | 3 ++- + fs/bcachefs/super-io.c | 2 +- + 6 files changed, 17 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index b23246087e4e..8827f04836e0 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1150,7 +1150,7 @@ static void btree_node_read_work(struct work_struct *work) + } + start: + bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", +- blk_status_to_str(bio->bi_status)); ++ bch2_blk_status_to_str(bio->bi_status)); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; +@@ -1435,7 +1435,7 @@ static void btree_node_write_endio(struct bio *bio) + bch2_latency_acct(ca, wbio->submit_time, WRITE); + + if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", +- blk_status_to_str(bio->bi_status)) || ++ bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("btree")) { + spin_lock_irqsave(&c->btree_write_error_lock, flags); + bch2_dev_list_add_dev(&orig->failed, wbio->dev); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index f1f41056844f..084cb4dfc2ed 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -328,7 +328,7 @@ static void ec_block_endio(struct bio *bio) + + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", + bio_data_dir(bio) ? "write" : "read", +- blk_status_to_str(bio->bi_status))) ++ bch2_blk_status_to_str(bio->bi_status))) + clear_bit(ec_bio->idx, ec_bio->buf->valid); + + bio_put(&ec_bio->bio); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index bab018b81d09..5d5ab85f01f0 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -35,6 +35,13 @@ + + #include + ++const char *bch2_blk_status_to_str(blk_status_t status) ++{ ++ if (status == BLK_STS_REMOVED) ++ return "device removed"; ++ return blk_status_to_str(status); ++} ++ + static bool bch2_target_congested(struct bch_fs *c, u16 target) + { + const struct bch_devs_mask *devs; +@@ -613,7 +620,7 @@ static void bch2_write_endio(struct bio *bio) + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + + if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", +- blk_status_to_str(bio->bi_status))) ++ bch2_blk_status_to_str(bio->bi_status))) + set_bit(wbio->dev, op->failed.d); + + if (wbio->have_ioref) { +@@ -1928,7 +1935,7 @@ static void bch2_read_endio(struct bio *bio) + rbio->bio.bi_end_io = rbio->end_io; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", +- blk_status_to_str(bio->bi_status))) { ++ bch2_blk_status_to_str(bio->bi_status))) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + return; + } +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 0ad293bd6295..ded468d70f09 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -22,6 +22,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + + #define BLK_STS_REMOVED ((__force blk_status_t)128) + ++const char *bch2_blk_status_to_str(blk_status_t); ++ + enum bch_write_flags { + BCH_WRITE_ALLOC_NOWAIT = (1 << 0), + BCH_WRITE_CACHED = (1 << 1), +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index d32b4d5d88cf..b62a4f292fbb 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -6,6 +6,7 @@ + #include "buckets.h" + #include "checksum.h" + #include "error.h" ++#include "io.h" + #include "journal.h" + #include "journal_io.h" + #include 
"journal_reclaim.h" +@@ -962,7 +963,7 @@ static void journal_write_endio(struct bio *bio) + struct journal *j = &ca->fs->journal; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", +- blk_status_to_str(bio->bi_status)) || ++ bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("journal")) { + struct journal_buf *w = journal_prev_buf(j); + unsigned long flags; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index b600668258e2..f969b5df0b23 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -634,7 +634,7 @@ static void write_super_endio(struct bio *bio) + /* XXX: return errors directly */ + + if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", +- blk_status_to_str(bio->bi_status))) ++ bch2_blk_status_to_str(bio->bi_status))) + ca->sb_write_error = 1; + + closure_put(&ca->fs->sb_write); +-- +cgit v1.2.3 + + +From afbbfaca9222e1d96da8bae17ede15e5db85d3db Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 11 Jul 2020 18:52:14 -0400 +Subject: bcachefs: Don't restrict copygc writes to the same device + +This no longer makes any sense, since copygc is now one thread per +filesystem, not per device, with a single write point. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 92 +++++++++++++++++++++--------------------- + fs/bcachefs/alloc_foreground.h | 16 ++++++-- + fs/bcachefs/move.c | 9 +++-- + fs/bcachefs/movinggc.c | 2 +- + 4 files changed, 66 insertions(+), 53 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 10bc5bfaf2fa..3fb849b44ddd 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -70,12 +70,6 @@ + #include + #include + +-enum bucket_alloc_ret { +- ALLOC_SUCCESS, +- OPEN_BUCKETS_EMPTY, +- FREELIST_EMPTY, /* Allocator thread not keeping up */ +-}; +- + /* + * Open buckets represent a bucket that's currently being allocated from. 
They + * serve two purposes: +@@ -395,21 +389,22 @@ static void add_new_bucket(struct bch_fs *c, + ob_push(c, ptrs, ob); + } + +-int bch2_bucket_alloc_set(struct bch_fs *c, +- struct open_buckets *ptrs, +- struct dev_stripe_state *stripe, +- struct bch_devs_mask *devs_may_alloc, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, +- enum alloc_reserve reserve, +- unsigned flags, +- struct closure *cl) ++enum bucket_alloc_ret ++bch2_bucket_alloc_set(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) + { + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); + struct bch_dev *ca; +- bool alloc_failure = false; ++ enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; + unsigned i; + + BUG_ON(*nr_effective >= nr_replicas); +@@ -427,16 +422,10 @@ int bch2_bucket_alloc_set(struct bch_fs *c, + ob = bch2_bucket_alloc(c, ca, reserve, + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); + if (IS_ERR(ob)) { +- enum bucket_alloc_ret ret = -PTR_ERR(ob); +- +- WARN_ON(reserve == RESERVE_MOVINGGC && +- ret != OPEN_BUCKETS_EMPTY); ++ ret = -PTR_ERR(ob); + + if (cl) +- return -EAGAIN; +- if (ret == OPEN_BUCKETS_EMPTY) +- return -ENOSPC; +- alloc_failure = true; ++ return ret; + continue; + } + +@@ -446,10 +435,10 @@ int bch2_bucket_alloc_set(struct bch_fs *c, + bch2_dev_stripe_increment(c, ca, stripe); + + if (*nr_effective >= nr_replicas) +- return 0; ++ return ALLOC_SUCCESS; + } + +- return alloc_failure ? -ENOSPC : -EROFS; ++ return ret; + } + + /* Allocate from stripes: */ +@@ -546,24 +535,25 @@ static void get_buckets_from_writepoint(struct bch_fs *c, + wp->ptrs = ptrs_skip; + } + +-static int open_bucket_add_buckets(struct bch_fs *c, +- struct open_buckets *ptrs, +- struct write_point *wp, +- struct bch_devs_list *devs_have, +- u16 target, +- unsigned erasure_code, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, +- enum alloc_reserve reserve, +- unsigned flags, +- struct closure *_cl) ++static enum bucket_alloc_ret ++open_bucket_add_buckets(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_list *devs_have, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *_cl) + { + struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; ++ enum bucket_alloc_ret ret; + unsigned i; +- int ret; + + rcu_read_lock(); + devs = target_rw_devs(c, wp->type, target); +@@ -608,7 +598,7 @@ retry_blocking: + ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + reserve, flags, cl); +- if (ret && ret != -EROFS && !cl && _cl) { ++ if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { + cl = _cl; + goto retry_blocking; + } +@@ -799,7 +789,8 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, + unsigned nr_effective, write_points_nr; + unsigned ob_flags = 0; + bool have_cache; +- int ret, i; ++ enum bucket_alloc_ret ret; ++ int i; + + if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ob_flags |= BUCKET_ALLOC_USE_DURABILITY; +@@ -844,10 +835,13 @@ retry: + alloc_done: + BUG_ON(!ret && nr_effective < nr_replicas); + ++ WARN_ON(reserve == RESERVE_MOVINGGC && ++ ret == FREELIST_EMPTY); ++ + if 
(erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + +- if (ret == -EROFS && ++ if (ret == INSUFFICIENT_DEVICES && + nr_effective >= nr_replicas_required) + ret = 0; + +@@ -881,11 +875,19 @@ err: + + mutex_unlock(&wp->lock); + +- if (ret == -ENOSPC && ++ if (ret == FREELIST_EMPTY && + try_decrease_writepoints(c, write_points_nr)) + goto retry; + +- return ERR_PTR(ret); ++ switch (ret) { ++ case OPEN_BUCKETS_EMPTY: ++ case FREELIST_EMPTY: ++ return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); ++ case INSUFFICIENT_DEVICES: ++ return ERR_PTR(-EROFS); ++ default: ++ BUG(); ++ } + } + + /* +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index 17a6869bb8cd..e8357ec0b333 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -12,6 +12,13 @@ struct bch_dev; + struct bch_fs; + struct bch_devs_List; + ++enum bucket_alloc_ret { ++ ALLOC_SUCCESS, ++ OPEN_BUCKETS_EMPTY, ++ FREELIST_EMPTY, /* Allocator thread not keeping up */ ++ INSUFFICIENT_DEVICES, ++}; ++ + struct dev_alloc_list { + unsigned nr; + u8 devs[BCH_SB_MEMBERS_MAX]; +@@ -92,10 +99,11 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, + } + } + +-int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, +- struct dev_stripe_state *, struct bch_devs_mask *, +- unsigned, unsigned *, bool *, enum alloc_reserve, +- unsigned, struct closure *); ++enum bucket_alloc_ret ++bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, ++ struct dev_stripe_state *, struct bch_devs_mask *, ++ unsigned, unsigned *, bool *, enum alloc_reserve, ++ unsigned, struct closure *); + + struct write_point *bch2_alloc_sectors_start(struct bch_fs *, + unsigned, unsigned, +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 486ba34af3c1..9e515326793e 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -247,11 +247,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + m->op.target = data_opts.target, + m->op.write_point = wp; + +- if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) ++ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { + m->op.alloc_reserve = RESERVE_MOVINGGC; ++ } else { ++ /* XXX: this should probably be passed in */ ++ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; ++ } + +- m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| +- BCH_WRITE_PAGES_STABLE| ++ m->op.flags |= BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_FROM_INTERNAL; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 4429b2bb2df8..ba708bd8e60a 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -105,7 +105,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + return DATA_SKIP; + + /* XXX: use io_opts for this inode */ +- data_opts->target = dev_to_target(dev_idx); ++ data_opts->target = io_opts->background_target; + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; + data_opts->rewrite_dev = dev_idx; + return DATA_REWRITE; +-- +cgit v1.2.3 + + +From 1694df6114ffe74fd616e9b60efc4362c6b3aaac Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 10 Jul 2020 16:13:52 -0400 +Subject: bcachefs: Refactor replicas code + +Awhile back the mechanism for garbage collecting unused replicas entries +was significantly improved, but some cleanup was missed - this patch +does that now. 
+ +This is also prep work for a patch to account for erasure coded parity +blocks separately - we need to consolidate the logic for +checking/marking the various replicas entries from one bkey into a +single function. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/extents.c | 10 ------ + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/replicas.c | 79 +++++++++++++++++------------------------------- + fs/bcachefs/replicas.h | 8 ++--- + 5 files changed, 31 insertions(+), 70 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 7ede033d40cc..6c03f2aa7713 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -109,7 +109,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + atomic64_set(&c->key_version, k.k->version.lo); + + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || +- fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c, ++ fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, + "superblock not marked as containing replicas (type %u)", + k.k->type)) { + ret = bch2_mark_bkey_replicas(c, k); +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index a4191ccf565b..568f039edcff 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -179,11 +179,6 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) + if (!percpu_down_read_trylock(&c->mark_lock)) + return; + +- bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && +- !bch2_bkey_replicas_marked_locked(c, k, false), c, +- "btree key bad (replicas not marked in superblock):\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- + bkey_for_each_ptr(ptrs, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + +@@ -267,11 +262,6 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) + if (!percpu_down_read_trylock(&c->mark_lock)) + return; + +- bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && +- !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, +- "extent key bad (replicas not marked in superblock):\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); +- + extent_for_each_ptr_decode(e, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index b62a4f292fbb..9357f207f9e2 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -700,7 +700,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) + + if (!degraded && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || +- fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, ++ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, + "superblock not marked as containing replicas %s", + (bch2_replicas_entry_to_text(&PBUF(buf), + &replicas.e), buf)))) { +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index be44a25e595e..6b6506c68609 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -213,29 +213,20 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r, + return __replicas_entry_idx(r, search) >= 0; + } + +-static bool bch2_replicas_marked_locked(struct bch_fs *c, +- struct bch_replicas_entry *search, +- bool check_gc_replicas) ++bool bch2_replicas_marked(struct bch_fs *c, ++ struct bch_replicas_entry *search) + { ++ bool marked; ++ + if (!search->nr_devs) + return true; + + verify_replicas_entry(search); + +- return __replicas_has_entry(&c->replicas, search) && +- 
(!check_gc_replicas || +- likely((!c->replicas_gc.entries)) || +- __replicas_has_entry(&c->replicas_gc, search)); +-} +- +-bool bch2_replicas_marked(struct bch_fs *c, +- struct bch_replicas_entry *search, +- bool check_gc_replicas) +-{ +- bool marked; +- + percpu_down_read(&c->mark_lock); +- marked = bch2_replicas_marked_locked(c, search, check_gc_replicas); ++ marked = __replicas_has_entry(&c->replicas, search) && ++ (likely((!c->replicas_gc.entries)) || ++ __replicas_has_entry(&c->replicas_gc, search)); + percpu_up_read(&c->mark_lock); + + return marked; +@@ -423,66 +414,50 @@ err: + goto out; + } + +-int bch2_mark_replicas(struct bch_fs *c, +- struct bch_replicas_entry *r) ++static int __bch2_mark_replicas(struct bch_fs *c, ++ struct bch_replicas_entry *r, ++ bool check) + { +- return likely(bch2_replicas_marked(c, r, true)) +- ? 0 ++ return likely(bch2_replicas_marked(c, r)) ? 0 ++ : check ? -1 + : bch2_mark_replicas_slowpath(c, r); + } + +-bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, +- struct bkey_s_c k, +- bool check_gc_replicas) ++int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) ++{ ++ return __bch2_mark_replicas(c, r, false); ++} ++ ++static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, ++ bool check) + { + struct bch_replicas_padded search; + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; ++ int ret; + + for (i = 0; i < cached.nr; i++) { + bch2_replicas_entry_cached(&search.e, cached.devs[i]); + +- if (!bch2_replicas_marked_locked(c, &search.e, +- check_gc_replicas)) +- return false; ++ ret = __bch2_mark_replicas(c, &search.e, check); ++ if (ret) ++ return ret; + } + + bch2_bkey_to_replicas(&search.e, k); + +- return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); ++ return __bch2_mark_replicas(c, &search.e, check); + } + + bool bch2_bkey_replicas_marked(struct bch_fs *c, +- struct bkey_s_c k, +- bool check_gc_replicas) ++ struct bkey_s_c k) + { +- bool marked; +- +- percpu_down_read(&c->mark_lock); +- marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas); +- percpu_up_read(&c->mark_lock); +- +- return marked; ++ return __bch2_mark_bkey_replicas(c, k, true) == 0; + } + + int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) + { +- struct bch_replicas_padded search; +- struct bch_devs_list cached = bch2_bkey_cached_devs(k); +- unsigned i; +- int ret; +- +- for (i = 0; i < cached.nr; i++) { +- bch2_replicas_entry_cached(&search.e, cached.devs[i]); +- +- ret = bch2_mark_replicas(c, &search.e); +- if (ret) +- return ret; +- } +- +- bch2_bkey_to_replicas(&search.e, k); +- +- return bch2_mark_replicas(c, &search.e); ++ return __bch2_mark_bkey_replicas(c, k, false); + } + + int bch2_replicas_gc_end(struct bch_fs *c, int ret) +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +index deda5f5c6e20..8b95164fbb56 100644 +--- a/fs/bcachefs/replicas.h ++++ b/fs/bcachefs/replicas.h +@@ -21,16 +21,12 @@ int bch2_replicas_entry_idx(struct bch_fs *, + void bch2_devlist_to_replicas(struct bch_replicas_entry *, + enum bch_data_type, + struct bch_devs_list); +-bool bch2_replicas_marked(struct bch_fs *, +- struct bch_replicas_entry *, bool); ++bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); + int bch2_mark_replicas(struct bch_fs *, + struct bch_replicas_entry *); + +-bool bch2_bkey_replicas_marked_locked(struct bch_fs *, +- struct bkey_s_c, bool); + void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); +-bool 
bch2_bkey_replicas_marked(struct bch_fs *, +- struct bkey_s_c, bool); ++bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); + int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); + + static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, +-- +cgit v1.2.3 + + +From e97b6b3ae6fa4c98834a01d2d443b5340ade9a55 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 22 Jul 2020 18:26:04 -0400 +Subject: bcachefs: Fix an error path + +We were missing a 'goto retry' and continuing on with an error pointer. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 19 ++++++++++--------- + 1 file changed, 10 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 69c87cf1faa1..8947f06f1b31 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1869,7 +1869,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, + + new_hash = bch2_btree_node_mem_alloc(c); + } +- ++retry: + as = bch2_btree_update_start(iter->trans, iter->btree_id, + parent ? btree_update_reserve_required(c, parent) : 0, + BTREE_INSERT_NOFAIL| +@@ -1882,16 +1882,17 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, + if (ret == -EAGAIN) + ret = -EINTR; + +- if (ret != -EINTR) +- goto err; ++ if (ret == -EINTR) { ++ bch2_trans_unlock(iter->trans); ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ down_read(&c->gc_lock); + +- bch2_trans_unlock(iter->trans); +- up_read(&c->gc_lock); +- closure_sync(&cl); +- down_read(&c->gc_lock); ++ if (bch2_trans_relock(iter->trans)) ++ goto retry; ++ } + +- if (!bch2_trans_relock(iter->trans)) +- goto err; ++ goto err; + } + + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); +-- +cgit v1.2.3 + + +From f590b509b95df2b189642e852b922422e7af320f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 22 Jul 2020 13:27:00 -0400 +Subject: bcachefs: Delete unused arguments + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 4 ++-- + fs/bcachefs/alloc_foreground.c | 6 +++--- + fs/bcachefs/alloc_foreground.h | 3 +-- + fs/bcachefs/buckets.c | 2 +- + fs/bcachefs/buckets.h | 10 +++++----- + fs/bcachefs/chardev.c | 2 +- + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/movinggc.c | 2 +- + fs/bcachefs/sysfs.c | 4 ++-- + 9 files changed, 17 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index fd5b932c84a5..17aa1ea860ee 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -45,7 +45,7 @@ static void pd_controllers_update(struct work_struct *work) + unsigned i; + + for_each_member_device(ca, c, i) { +- struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); ++ struct bch_dev_usage stats = bch2_dev_usage_read(ca); + + free += bucket_to_sector(ca, + __dev_buckets_free(ca, stats)) << 9; +@@ -514,7 +514,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + if (gc_count != c->gc_count) + ca->inc_gen_really_needs_gc = 0; + +- available = max_t(s64, 0, dev_buckets_available(c, ca) - ++ available = max_t(s64, 0, dev_buckets_available(ca) - + ca->inc_gen_really_needs_gc); + + if (available > fifo_free(&ca->free_inc) || +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 3fb849b44ddd..84c0d16b574d 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -347,11 +347,11 @@ struct 
dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, + return ret; + } + +-void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, ++void bch2_dev_stripe_increment(struct bch_dev *ca, + struct dev_stripe_state *stripe) + { + u64 *v = stripe->next_alloc + ca->dev_idx; +- u64 free_space = dev_buckets_free(c, ca); ++ u64 free_space = dev_buckets_free(ca); + u64 free_space_inv = free_space + ? div64_u64(1ULL << 48, free_space) + : 1ULL << 48; +@@ -432,7 +432,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, flags, ob); + +- bch2_dev_stripe_increment(c, ca, stripe); ++ bch2_dev_stripe_increment(ca, stripe); + + if (*nr_effective >= nr_replicas) + return ALLOC_SUCCESS; +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index e8357ec0b333..dc8574a1a76a 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -27,8 +27,7 @@ struct dev_alloc_list { + struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *); +-void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *, +- struct dev_stripe_state *); ++void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); + + long bch2_bucket_alloc_new_fs(struct bch_dev *); + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index ddace47a2f77..97a8af31ded1 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -179,7 +179,7 @@ out_pool: + return ret; + } + +-struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) ++struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) + { + struct bch_dev_usage ret; + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index fe342f0d2c88..653f6761862e 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -182,7 +182,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, + + /* Device usage: */ + +-struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); ++struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); + + void bch2_dev_usage_from_buckets(struct bch_fs *); + +@@ -202,9 +202,9 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, + /* + * Number of reclaimable buckets - only for use by the allocator thread: + */ +-static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca) ++static inline u64 dev_buckets_available(struct bch_dev *ca) + { +- return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca)); ++ return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); + } + + static inline u64 __dev_buckets_free(struct bch_dev *ca, +@@ -215,9 +215,9 @@ static inline u64 __dev_buckets_free(struct bch_dev *ca, + fifo_used(&ca->free_inc); + } + +-static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) ++static inline u64 dev_buckets_free(struct bch_dev *ca) + { +- return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca)); ++ return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); + } + + /* Filesystem usage: */ +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index 3af521947502..0377f9018d27 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -468,7 +468,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, + if (IS_ERR(ca)) + return PTR_ERR(ca); + +- src = bch2_dev_usage_read(c, ca); ++ src = bch2_dev_usage_read(ca); + + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; +diff --git 
a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 9357f207f9e2..89585833c846 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -760,7 +760,7 @@ static void __journal_write_alloc(struct journal *j, + sectors > ja->sectors_free) + continue; + +- bch2_dev_stripe_increment(c, ca, &j->wp.stripe); ++ bch2_dev_stripe_increment(ca, &j->wp.stripe); + + bch2_bkey_append_ptr(&w->key, + (struct bch_extent_ptr) { +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index ba708bd8e60a..deb7c27b316e 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -261,7 +261,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) + u64 fragmented = 0; + + for_each_rw_member(ca, c, dev_idx) { +- struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); ++ struct bch_dev_usage usage = bch2_dev_usage_read(ca); + + fragmented_allowed += ((__dev_buckets_available(ca, usage) * + ca->mi.bucket_size) >> 1); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index fadfcae473f8..4406dfccd7be 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -826,7 +826,7 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) + static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) + { + struct bch_fs *c = ca->fs; +- struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); ++ struct bch_dev_usage stats = bch2_dev_usage_read(ca); + unsigned i, nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); +@@ -876,7 +876,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) + stats.buckets[BCH_DATA_user], + stats.buckets[BCH_DATA_cached], + stats.buckets_ec, +- ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, ++ __dev_buckets_available(ca, stats), + stats.sectors[BCH_DATA_sb], + stats.sectors[BCH_DATA_journal], + stats.sectors[BCH_DATA_btree], +-- +cgit v1.2.3 + + +From ac5328f84f8e15653aee8c3c5bce5d5e399816d9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 21 Jul 2020 17:12:39 -0400 +Subject: bcachefs: Don't let copygc buckets be stolen by other threads + +And assorted other copygc fixes. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 4 +++- + fs/bcachefs/alloc_foreground.c | 46 +++++++++++++++++++++++++++--------------- + fs/bcachefs/alloc_foreground.h | 7 ------- + fs/bcachefs/alloc_types.h | 1 + + fs/bcachefs/btree_gc.c | 6 +++++- + fs/bcachefs/move.c | 1 + + fs/bcachefs/movinggc.c | 34 ++++++++++++++++++++++--------- + fs/bcachefs/super.c | 8 ++++++++ + 8 files changed, 72 insertions(+), 35 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 17aa1ea860ee..bf034168eb8b 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -518,7 +518,9 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + ca->inc_gen_really_needs_gc); + + if (available > fifo_free(&ca->free_inc) || +- (available && !fifo_full(&ca->free[RESERVE_BTREE]))) ++ (available && ++ (!fifo_full(&ca->free[RESERVE_BTREE]) || ++ !fifo_full(&ca->free[RESERVE_MOVINGGC])))) + break; + + up_read(&c->gc_lock); +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 84c0d16b574d..1e888719aeaf 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -144,12 +144,13 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) + } + + static void open_bucket_free_unused(struct bch_fs *c, +- struct open_bucket *ob, +- bool may_realloc) ++ struct write_point *wp, ++ struct open_bucket *ob) + { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ bool may_realloc = wp->type == BCH_DATA_user; + +- BUG_ON(ca->open_buckets_partial_nr >= ++ BUG_ON(ca->open_buckets_partial_nr > + ARRAY_SIZE(ca->open_buckets_partial)); + + if (ca->open_buckets_partial_nr < +@@ -228,13 +229,22 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + + spin_lock(&c->freelist_lock); + +- if (may_alloc_partial && +- ca->open_buckets_partial_nr) { +- ob = c->open_buckets + +- ca->open_buckets_partial[--ca->open_buckets_partial_nr]; +- ob->on_partial_list = false; +- spin_unlock(&c->freelist_lock); +- return ob; ++ if (may_alloc_partial) { ++ int i; ++ ++ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { ++ ob = c->open_buckets + ca->open_buckets_partial[i]; ++ ++ if (reserve <= ob->alloc_reserve) { ++ array_remove_item(ca->open_buckets_partial, ++ ca->open_buckets_partial_nr, ++ i); ++ ob->on_partial_list = false; ++ ob->alloc_reserve = reserve; ++ spin_unlock(&c->freelist_lock); ++ return ob; ++ } ++ } + } + + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { +@@ -291,6 +301,7 @@ out: + + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; ++ ob->alloc_reserve = reserve; + ob->ptr = (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = buckets->b[bucket].mark.gen, +@@ -835,9 +846,6 @@ retry: + alloc_done: + BUG_ON(!ret && nr_effective < nr_replicas); + +- WARN_ON(reserve == RESERVE_MOVINGGC && +- ret == FREELIST_EMPTY); +- + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + +@@ -850,7 +858,7 @@ alloc_done: + + /* Free buckets we didn't use: */ + open_bucket_for_each(c, &wp->ptrs, ob, i) +- open_bucket_free_unused(c, ob, wp->type == BCH_DATA_user); ++ open_bucket_free_unused(c, wp, ob); + + wp->ptrs = ptrs; + +@@ -869,8 +877,7 @@ err: + if (ptrs.nr < ARRAY_SIZE(ptrs.v)) + ob_push(c, &ptrs, ob); + else +- open_bucket_free_unused(c, ob, +- wp->type == BCH_DATA_user); ++ open_bucket_free_unused(c, wp, ob); + wp->ptrs = 
ptrs; + + mutex_unlock(&wp->lock); +@@ -938,6 +945,13 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) + bch2_open_buckets_put(c, &ptrs); + } + ++static inline void writepoint_init(struct write_point *wp, ++ enum bch_data_type type) ++{ ++ mutex_init(&wp->lock); ++ wp->type = type; ++} ++ + void bch2_fs_allocator_foreground_init(struct bch_fs *c) + { + struct open_bucket *ob; +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index dc8574a1a76a..c658295cb8e0 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -133,13 +133,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp + return (struct write_point_specifier) { .v = (unsigned long) wp }; + } + +-static inline void writepoint_init(struct write_point *wp, +- enum bch_data_type type) +-{ +- mutex_init(&wp->lock); +- wp->type = type; +-} +- + void bch2_fs_allocator_foreground_init(struct bch_fs *); + + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index 4f1465077994..20705460bb0a 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -66,6 +66,7 @@ struct open_bucket { + u8 type; + unsigned valid:1; + unsigned on_partial_list:1; ++ int alloc_reserve:3; + unsigned sectors_free; + struct bch_extent_ptr ptr; + struct ec_stripe_new *ec; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 6c03f2aa7713..4f581130270c 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -952,8 +952,10 @@ int bch2_gc_gens(struct bch_fs *c) + for (i = 0; i < BTREE_ID_NR; i++) + if (btree_node_type_needs_gc(i)) { + ret = bch2_gc_btree_gens(c, i); +- if (ret) ++ if (ret) { ++ bch_err(c, "error recalculating oldest_gen: %i", ret); + goto err; ++ } + } + + for_each_member_device(ca, c, i) { +@@ -964,6 +966,8 @@ int bch2_gc_gens(struct bch_fs *c) + g->oldest_gen = g->gc_gen; + up_read(&ca->bucket_lock); + } ++ ++ c->gc_count++; + err: + up_read(&c->gc_lock); + return ret; +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 9e515326793e..2f3be487ef65 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -249,6 +249,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { + m->op.alloc_reserve = RESERVE_MOVINGGC; ++ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; + } else { + /* XXX: this should probably be passed in */ + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index deb7c27b316e..55aa463f992f 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -12,6 +12,7 @@ + #include "buckets.h" + #include "clock.h" + #include "disk_groups.h" ++#include "error.h" + #include "extents.h" + #include "eytzinger.h" + #include "io.h" +@@ -104,7 +105,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + if (dev_idx < 0) + return DATA_SKIP; + +- /* XXX: use io_opts for this inode */ + data_opts->target = io_opts->background_target; + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; + data_opts->rewrite_dev = dev_idx; +@@ -123,7 +123,7 @@ static bool have_copygc_reserve(struct bch_dev *ca) + return ret; + } + +-static void bch2_copygc(struct bch_fs *c) ++static int bch2_copygc(struct bch_fs *c) + { + copygc_heap *h = &c->copygc_heap; + struct copygc_heap_entry e, *i; +@@ -153,7 +153,7 @@ static void bch2_copygc(struct bch_fs *c) + 
free_heap(&c->copygc_heap); + if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { + bch_err(c, "error allocating copygc heap"); +- return; ++ return 0; + } + } + +@@ -178,6 +178,7 @@ static void bch2_copygc(struct bch_fs *c) + continue; + + e = (struct copygc_heap_entry) { ++ .dev = dev_idx, + .gen = m.gen, + .sectors = bucket_sectors_used(m), + .offset = bucket_to_sector(ca, b), +@@ -187,6 +188,11 @@ static void bch2_copygc(struct bch_fs *c) + up_read(&ca->bucket_lock); + } + ++ if (!sectors_reserved) { ++ bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); ++ return -1; ++ } ++ + for (i = h->data; i < h->data + h->used; i++) + sectors_to_move += i->sectors; + +@@ -198,7 +204,7 @@ static void bch2_copygc(struct bch_fs *c) + buckets_to_move = h->used; + + if (!buckets_to_move) +- return; ++ return 0; + + eytzinger0_sort(h->data, h->used, + sizeof(h->data[0]), +@@ -214,10 +220,17 @@ static void bch2_copygc(struct bch_fs *c) + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + for (i = h->data; i < h->data + h->used; i++) { +- size_t b = sector_to_bucket(ca, i->offset); +- struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ struct bucket_mark m; ++ size_t b; + +- if (i->gen == m.gen && bucket_sectors_used(m)) { ++ if (i->dev != dev_idx) ++ continue; ++ ++ b = sector_to_bucket(ca, i->offset); ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (i->gen == m.gen && ++ bucket_sectors_used(m)) { + sectors_not_moved += bucket_sectors_used(m); + buckets_not_moved++; + } +@@ -237,6 +250,7 @@ static void bch2_copygc(struct bch_fs *c) + trace_copygc(c, + atomic64_read(&move_stats.sectors_moved), sectors_not_moved, + buckets_to_move, buckets_not_moved); ++ return 0; + } + + /* +@@ -292,7 +306,8 @@ static int bch2_copygc_thread(void *arg) + continue; + } + +- bch2_copygc(c); ++ if (bch2_copygc(c)) ++ break; + } + + return 0; +@@ -323,8 +338,7 @@ int bch2_copygc_start(struct bch_fs *c) + if (bch2_fs_init_fault("copygc_start")) + return -ENOMEM; + +- t = kthread_create(bch2_copygc_thread, c, +- "bch_copygc[%s]", c->name); ++ t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); + if (IS_ERR(t)) + return PTR_ERR(t); + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 24a0fe266a85..f0ed7c9ea623 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1301,12 +1301,20 @@ static bool bch2_fs_may_start(struct bch_fs *c) + + static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) + { ++ /* ++ * Device going read only means the copygc reserve get smaller, so we ++ * don't want that happening while copygc is in progress: ++ */ ++ bch2_copygc_stop(c); ++ + /* + * The allocator thread itself allocates btree nodes, so stop it first: + */ + bch2_dev_allocator_stop(ca); + bch2_dev_allocator_remove(c, ca); + bch2_dev_journal_stop(&c->journal, ca); ++ ++ bch2_copygc_start(c); + } + + static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +-- +cgit v1.2.3 + + +From 6cefff53c8ad6eb834f614d75afa2ceb0eedd100 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 22 Jul 2020 22:40:32 -0400 +Subject: bcachefs: Fix a race with BCH_WRITE_SKIP_CLOSURE_PUT + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 5d5ab85f01f0..91fdae1a3628 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1111,6 +1111,16 @@ again: + goto flush_io; + } + ++ /* ++ * It's possible for the allocator to fail, put us on the ++ * freelist 
waitlist, and then succeed in one of various retry ++ * paths: if that happens, we need to disable the skip_put ++ * optimization because otherwise there won't necessarily be a ++ * barrier before we free the bch_write_op: ++ */ ++ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) ++ skip_put = false; ++ + bch2_open_bucket_get(c, wp, &op->open_buckets); + ret = bch2_write_extent(op, wp, &bio); + bch2_alloc_sectors_done(c, wp); +-- +cgit v1.2.3 + + +From becb364688756ce9b50a13215348f805ff0911e2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 23 Jul 2020 11:31:01 -0400 +Subject: bcachefs: Ensure we only allocate one EC bucket per writepoint + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 26 +++++++++++++++----------- + 1 file changed, 15 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 1e888719aeaf..4a048828869b 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -578,18 +578,22 @@ open_bucket_add_buckets(struct bch_fs *c, + __clear_bit(ob->ptr.dev, devs.d); + + if (erasure_code) { +- get_buckets_from_writepoint(c, ptrs, wp, &devs, +- nr_replicas, nr_effective, +- have_cache, flags, true); +- if (*nr_effective >= nr_replicas) +- return 0; ++ if (!ec_open_bucket(c, ptrs)) { ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, true); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } + +- bucket_alloc_from_stripe(c, ptrs, wp, &devs, +- target, erasure_code, +- nr_replicas, nr_effective, +- have_cache, flags); +- if (*nr_effective >= nr_replicas) +- return 0; ++ if (!ec_open_bucket(c, ptrs)) { ++ bucket_alloc_from_stripe(c, ptrs, wp, &devs, ++ target, erasure_code, ++ nr_replicas, nr_effective, ++ have_cache, flags); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } + } + + get_buckets_from_writepoint(c, ptrs, wp, &devs, +-- +cgit v1.2.3 + + +From 5f5cb89a5d112f61e0cfa7efb8483d328b008745 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Jul 2020 14:19:37 -0400 +Subject: bcachefs: Fix bch2_btree_node_insert_fits() + +It should be checking for the recently added flag +btree_node_needs_rewrite. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.h | 2 +- + fs/bcachefs/btree_update_leaf.c | 9 +++------ + 2 files changed, 4 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 4a5b9dcfbdd0..812bafdc2d04 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -311,7 +311,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, + static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, + struct btree *b, unsigned u64s) + { +- if (unlikely(btree_node_fake(b))) ++ if (unlikely(btree_node_need_rewrite(b))) + return false; + + return u64s <= bch_btree_keys_u64s_remaining(c, b); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index cf4105e83eda..cd699c257244 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -264,14 +264,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, + static enum btree_insert_ret + btree_key_can_insert(struct btree_trans *trans, + struct btree_iter *iter, +- struct bkey_i *insert, + unsigned u64s) + { + struct bch_fs *c = trans->c; + struct btree *b = iter_l(iter)->b; + +- if (unlikely(btree_node_need_rewrite(b)) || +- unlikely(u64s > bch_btree_keys_u64s_remaining(c, b))) ++ if (!bch2_btree_node_insert_fits(c, b, u64s)) + return BTREE_INSERT_BTREE_NODE_FULL; + + return BTREE_INSERT_OK; +@@ -280,7 +278,6 @@ btree_key_can_insert(struct btree_trans *trans, + static enum btree_insert_ret + btree_key_can_insert_cached(struct btree_trans *trans, + struct btree_iter *iter, +- struct bkey_i *insert, + unsigned u64s) + { + struct bkey_cached *ck = (void *) iter->l[0].b; +@@ -398,8 +395,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + u64s += i->k->k.u64s; + ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED +- ? btree_key_can_insert(trans, i->iter, i->k, u64s) +- : btree_key_can_insert_cached(trans, i->iter, i->k, u64s); ++ ? 
btree_key_can_insert(trans, i->iter, u64s) ++ : btree_key_can_insert_cached(trans, i->iter, u64s); + if (ret) { + *stopped_at = i; + return ret; +-- +cgit v1.2.3 + + +From 0e48418185c80bfdeec9c8f1e698e5652a1bd35e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Jul 2020 15:37:14 -0400 +Subject: bcachefs: Ensure we wake up threads locking node when reusing it + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 2 ++ + fs/bcachefs/btree_update_interior.c | 2 -- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index d3addd3a8964..88fdade3a18c 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -124,6 +124,8 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) + + /* Cause future lookups for this node to fail: */ + b->hash_val = 0; ++ ++ six_lock_wakeup_all(&b->c.lock); + } + + int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 8947f06f1b31..17651d027858 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -138,8 +138,6 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) + + bch2_btree_node_hash_remove(&c->btree_cache, b); + +- six_lock_wakeup_all(&b->c.lock); +- + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); +-- +cgit v1.2.3 + + +From 0b31fef0c39c7e3d714851a92750c2b2b4821757 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Jul 2020 15:07:37 -0400 +Subject: bcachefs: Remove some uses of PAGE_SIZE in the btree code + +For portability to userspace, we should try to avoid working in kernel +pages. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 56 -------------------------------------------- + fs/bcachefs/bset.h | 34 +++++++++++++++++++++++++-- + fs/bcachefs/btree_cache.c | 57 +++++++++++++++++++++++---------------------- + fs/bcachefs/btree_cache.h | 7 +----- + fs/bcachefs/btree_io.c | 59 ++++++++++++++++++++++------------------------- + fs/bcachefs/btree_io.h | 3 ++- + fs/bcachefs/btree_types.h | 2 +- + fs/bcachefs/io_types.h | 1 - + 8 files changed, 93 insertions(+), 126 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 651394a330a3..f7c2841ed8a7 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -313,44 +313,6 @@ struct rw_aux_tree { + struct bpos k; + }; + +-/* +- * BSET_CACHELINE was originally intended to match the hardware cacheline size - +- * it used to be 64, but I realized the lookup code would touch slightly less +- * memory if it was 128. +- * +- * It definites the number of bytes (in struct bset) per struct bkey_float in +- * the auxiliar search tree - when we're done searching the bset_float tree we +- * have this many bytes left that we do a linear search over. +- * +- * Since (after level 5) every level of the bset_tree is on a new cacheline, +- * we're touching one fewer cacheline in the bset tree in exchange for one more +- * cacheline in the linear search - but the linear search might stop before it +- * gets to the second cacheline. 
+- */ +- +-#define BSET_CACHELINE 128 +- +-/* Space required for the btree node keys */ +-static inline size_t btree_keys_bytes(struct btree *b) +-{ +- return PAGE_SIZE << b->page_order; +-} +- +-static inline size_t btree_keys_cachelines(struct btree *b) +-{ +- return btree_keys_bytes(b) / BSET_CACHELINE; +-} +- +-static inline size_t btree_aux_data_bytes(struct btree *b) +-{ +- return btree_keys_cachelines(b) * 8; +-} +- +-static inline size_t btree_aux_data_u64s(struct btree *b) +-{ +- return btree_aux_data_bytes(b) / sizeof(u64); +-} +- + static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) + { + BUG_ON(t->aux_data_offset == U16_MAX); +@@ -426,24 +388,6 @@ static void bset_aux_tree_verify(struct btree *b) + #endif + } + +-/* Memory allocation */ +- +-void bch2_btree_keys_free(struct btree *b) +-{ +- vfree(b->aux_data); +- b->aux_data = NULL; +-} +- +-int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) +-{ +- b->page_order = page_order; +- b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); +- if (!b->aux_data) +- return -ENOMEM; +- +- return 0; +-} +- + void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) + { + unsigned i; +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 652ffed4adfb..5921cf689105 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -184,6 +184,38 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree + } + } + ++/* ++ * BSET_CACHELINE was originally intended to match the hardware cacheline size - ++ * it used to be 64, but I realized the lookup code would touch slightly less ++ * memory if it was 128. ++ * ++ * It definites the number of bytes (in struct bset) per struct bkey_float in ++ * the auxiliar search tree - when we're done searching the bset_float tree we ++ * have this many bytes left that we do a linear search over. ++ * ++ * Since (after level 5) every level of the bset_tree is on a new cacheline, ++ * we're touching one fewer cacheline in the bset tree in exchange for one more ++ * cacheline in the linear search - but the linear search might stop before it ++ * gets to the second cacheline. 
++ */ ++ ++#define BSET_CACHELINE 128 ++ ++static inline size_t btree_keys_cachelines(struct btree *b) ++{ ++ return (1U << b->byte_order) / BSET_CACHELINE; ++} ++ ++static inline size_t btree_aux_data_bytes(struct btree *b) ++{ ++ return btree_keys_cachelines(b) * 8; ++} ++ ++static inline size_t btree_aux_data_u64s(struct btree *b) ++{ ++ return btree_aux_data_bytes(b) / sizeof(u64); ++} ++ + typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); + + static inline void +@@ -334,8 +366,6 @@ static inline struct bset *bset_next_set(struct btree *b, + return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); + } + +-void bch2_btree_keys_free(struct btree *); +-int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t); + void bch2_btree_keys_init(struct btree *, bool *); + + void bch2_bset_init_first(struct btree *, struct bset *); +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 88fdade3a18c..736671112861 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -44,7 +44,8 @@ static void __btree_node_data_free(struct bch_fs *c, struct btree *b) + + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; +- bch2_btree_keys_free(b); ++ vfree(b->aux_data); ++ b->aux_data = NULL; + } + + static void btree_node_data_free(struct bch_fs *c, struct btree *b) +@@ -72,7 +73,7 @@ static const struct rhashtable_params bch_btree_cache_params = { + .obj_cmpfn = bch2_btree_cache_cmp_fn, + }; + +-static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) + { + BUG_ON(b->data || b->aux_data); + +@@ -80,7 +81,8 @@ static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) + if (!b->data) + return -ENOMEM; + +- if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) { ++ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); ++ if (!b->aux_data) { + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; + return -ENOMEM; +@@ -89,21 +91,9 @@ static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) + return 0; + } + +-static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++static struct btree *__btree_node_mem_alloc(struct bch_fs *c) + { +- struct btree_cache *bc = &c->btree_cache; +- +- if (!__btree_node_data_alloc(c, b, gfp)) { +- bc->used++; +- list_move(&b->list, &bc->freeable); +- } else { +- list_move(&b->list, &bc->freed); +- } +-} +- +-static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) +-{ +- struct btree *b = kzalloc(sizeof(struct btree), gfp); ++ struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); + if (!b) + return NULL; + +@@ -111,9 +101,25 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) + six_lock_init(&b->c.lock); + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->write_blocked); ++ b->byte_order = ilog2(btree_bytes(c)); ++ return b; ++} + +- btree_node_data_alloc(c, b, gfp); +- return b->data ? 
b : NULL; ++static struct btree *btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b = __btree_node_mem_alloc(c); ++ if (!b) ++ return NULL; ++ ++ if (btree_node_data_alloc(c, b, GFP_KERNEL)) { ++ kfree(b); ++ return NULL; ++ } ++ ++ bc->used++; ++ list_add(&b->list, &bc->freeable); ++ return b; + } + + /* Btree in memory cache - hash table */ +@@ -404,7 +410,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) + bch2_recalc_btree_reserve(c); + + for (i = 0; i < bc->reserve; i++) +- if (!btree_node_mem_alloc(c, GFP_KERNEL)) { ++ if (!btree_node_mem_alloc(c)) { + ret = -ENOMEM; + goto out; + } +@@ -420,7 +426,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) + goto out; + } + +- c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL); ++ c->verify_data = btree_node_mem_alloc(c); + if (!c->verify_data) { + ret = -ENOMEM; + goto out; +@@ -552,21 +558,16 @@ got_node: + mutex_unlock(&bc->lock); + + if (!b) { +- b = kzalloc(sizeof(struct btree), GFP_KERNEL); ++ b = __btree_node_mem_alloc(c); + if (!b) + goto err; + +- bkey_btree_ptr_init(&b->key); +- six_lock_init(&b->c.lock); +- INIT_LIST_HEAD(&b->list); +- INIT_LIST_HEAD(&b->write_blocked); +- + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); + } + + if (!b->data) { +- if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) ++ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) + goto err; + + mutex_lock(&bc->lock); +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 2160012c734f..d0d3a85bb8be 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -79,14 +79,9 @@ static inline size_t btree_max_u64s(struct bch_fs *c) + return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); + } + +-static inline size_t btree_page_order(struct bch_fs *c) +-{ +- return get_order(btree_bytes(c)); +-} +- + static inline size_t btree_pages(struct bch_fs *c) + { +- return 1 << btree_page_order(c); ++ return btree_bytes(c) / PAGE_SIZE; + } + + static inline unsigned btree_blocks(struct bch_fs *c) +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 8827f04836e0..76c9cbb97894 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -57,25 +57,25 @@ static void set_needs_whiteout(struct bset *i, int v) + k->needs_whiteout = v; + } + +-static void btree_bounce_free(struct bch_fs *c, unsigned order, ++static void btree_bounce_free(struct bch_fs *c, size_t size, + bool used_mempool, void *p) + { + if (used_mempool) + mempool_free(p, &c->btree_bounce_pool); + else +- vpfree(p, PAGE_SIZE << order); ++ vpfree(p, size); + } + +-static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, ++static void *btree_bounce_alloc(struct bch_fs *c, size_t size, + bool *used_mempool) + { + unsigned flags = memalloc_nofs_save(); + void *p; + +- BUG_ON(order > btree_page_order(c)); ++ BUG_ON(size > btree_bytes(c)); + + *used_mempool = false; +- p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); ++ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + if (!p) { + *used_mempool = true; + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); +@@ -125,16 +125,14 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) + { + struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; + bool used_mempool = false; +- unsigned order; ++ size_t bytes = b->whiteout_u64s * sizeof(u64); + + if (!b->whiteout_u64s) + return; + +- order = get_order(b->whiteout_u64s * sizeof(u64)); ++ new_whiteouts = 
btree_bounce_alloc(c, bytes, &used_mempool); + +- new_whiteouts = btree_bounce_alloc(c, order, &used_mempool); +- +- ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order)); ++ ptrs = ptrs_end = ((void *) new_whiteouts + bytes); + + for (k = unwritten_whiteouts_start(c, b); + k != unwritten_whiteouts_end(c, b); +@@ -158,7 +156,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) + memcpy_u64s(unwritten_whiteouts_start(c, b), + new_whiteouts, b->whiteout_u64s); + +- btree_bounce_free(c, order, used_mempool, new_whiteouts); ++ btree_bounce_free(c, bytes, used_mempool, new_whiteouts); + } + + static bool should_compact_bset(struct btree *b, struct bset_tree *t, +@@ -187,7 +185,7 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, + struct bkey_packed *whiteouts = NULL; + struct bkey_packed *u_start, *u_pos; + struct sort_iter sort_iter; +- unsigned order, whiteout_u64s = 0, u64s; ++ unsigned bytes, whiteout_u64s = 0, u64s; + bool used_mempool, compacting = false; + + BUG_ON(!btree_node_is_extents(b)); +@@ -204,9 +202,9 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, + sort_iter_init(&sort_iter, b); + + whiteout_u64s += b->whiteout_u64s; +- order = get_order(whiteout_u64s * sizeof(u64)); ++ bytes = whiteout_u64s * sizeof(u64); + +- whiteouts = btree_bounce_alloc(c, order, &used_mempool); ++ whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); + u_start = u_pos = whiteouts; + + memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), +@@ -306,7 +304,7 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, + unwritten_whiteouts_end(c, b), + true); + +- btree_bounce_free(c, order, used_mempool, whiteouts); ++ btree_bounce_free(c, bytes, used_mempool, whiteouts); + + bch2_btree_build_aux_trees(b); + +@@ -401,7 +399,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, + struct bset *start_bset = bset(b, &b->set[start_idx]); + bool used_mempool = false; + u64 start_time, seq = 0; +- unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; ++ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; + bool sorting_entire_node = start_idx == 0 && + end_idx == b->nsets; + +@@ -416,11 +414,11 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, + btree_bkey_last(b, t)); + } + +- order = sorting_entire_node +- ? btree_page_order(c) +- : get_order(__vstruct_bytes(struct btree_node, u64s)); ++ bytes = sorting_entire_node ++ ? 
btree_bytes(c) ++ : __vstruct_bytes(struct btree_node, u64s); + +- out = btree_bounce_alloc(c, order, &used_mempool); ++ out = btree_bounce_alloc(c, bytes, &used_mempool); + + start_time = local_clock(); + +@@ -435,7 +433,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, + + out->keys.u64s = cpu_to_le16(u64s); + +- BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); ++ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); + + if (sorting_entire_node) + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], +@@ -449,7 +447,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, + if (sorting_entire_node) { + unsigned u64s = le16_to_cpu(out->keys.u64s); + +- BUG_ON(order != btree_page_order(c)); ++ BUG_ON(bytes != btree_bytes(c)); + + /* + * Our temporary buffer is the same size as the btree node's +@@ -484,7 +482,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, + set_btree_bset_end(b, &b->set[start_idx]); + bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); + +- btree_bounce_free(c, order, used_mempool, out); ++ btree_bounce_free(c, bytes, used_mempool, out); + + bch2_verify_btree_nr_keys(b); + } +@@ -1043,7 +1041,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + BTREE_ERR_WANT_RETRY, c, b, NULL, + "found bset signature after last bset"); + +- sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool); ++ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted->keys.u64s = 0; + + set_btree_bset(b, b->set, &b->data->keys); +@@ -1061,7 +1059,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + + BUG_ON(b->nr.live_u64s != u64s); + +- btree_bounce_free(c, btree_page_order(c), used_mempool, sorted); ++ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + + i = &b->data->keys; + for (k = i->start; k != vstruct_last(i);) { +@@ -1403,7 +1401,7 @@ static void btree_node_write_work(struct work_struct *work) + struct btree *b = wbio->wbio.bio.bi_private; + + btree_bounce_free(c, +- wbio->wbio.order, ++ wbio->bytes, + wbio->wbio.used_mempool, + wbio->data); + +@@ -1486,7 +1484,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + struct bch_extent_ptr *ptr; + struct sort_iter sort_iter; + struct nonce nonce; +- unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; ++ unsigned bytes_to_write, sectors_to_write, bytes, u64s; + u64 seq = 0; + bool used_mempool; + unsigned long old, new; +@@ -1556,8 +1554,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + seq = max(seq, le64_to_cpu(i->journal_seq)); + } + +- order = get_order(bytes); +- data = btree_bounce_alloc(c, order, &used_mempool); ++ data = btree_bounce_alloc(c, bytes, &used_mempool); + + if (!b->written) { + bn = data; +@@ -1669,7 +1666,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + struct btree_write_bio, wbio.bio); + wbio_init(&wbio->wbio.bio); + wbio->data = data; +- wbio->wbio.order = order; ++ wbio->bytes = bytes; + wbio->wbio.used_mempool = used_mempool; + wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; + wbio->wbio.bio.bi_end_io = btree_node_write_endio; +@@ -1706,7 +1703,7 @@ err: + set_btree_node_noevict(b); + b->written += sectors_to_write; + nowrite: +- btree_bounce_free(c, order, used_mempool, data); ++ btree_bounce_free(c, bytes, used_mempool, data); + btree_node_write_done(c, b); + } + +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index f3d7ec749b61..db013dc28eec 100644 +--- 
a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -23,8 +23,9 @@ struct btree_read_bio { + }; + + struct btree_write_bio { +- void *data; + struct work_struct work; ++ void *data; ++ unsigned bytes; + struct bch_write_bio wbio; + }; + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 5b9b47700a15..683b416ef427 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -94,7 +94,7 @@ struct btree { + struct btree_nr_keys nr; + u16 sib_u64s[2]; + u16 whiteout_u64s; +- u8 page_order; ++ u8 byte_order; + u8 unpack_fn_len; + + /* +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +index 684e4c9a5d98..b23727d212b9 100644 +--- a/fs/bcachefs/io_types.h ++++ b/fs/bcachefs/io_types.h +@@ -78,7 +78,6 @@ struct bch_write_bio { + u64 submit_time; + + struct bch_devs_list failed; +- u8 order; + u8 dev; + + unsigned split:1, +-- +cgit v1.2.3 + + +From 693144948bbd40059bdfc5fc27bd5dc389e1651a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Jul 2020 17:06:11 -0400 +Subject: bcachefs: Convert various code to printbuf + +printbufs know how big the buffer is that was allocated, so we can get +rid of the random PAGE_SIZEs all over the place. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 7 +- + fs/bcachefs/btree_io.h | 2 +- + fs/bcachefs/btree_update_interior.c | 7 +- + fs/bcachefs/btree_update_interior.h | 2 +- + fs/bcachefs/clock.c | 7 +- + fs/bcachefs/clock.h | 2 +- + fs/bcachefs/ec.c | 29 +++++ + fs/bcachefs/ec.h | 1 + + fs/bcachefs/journal.c | 36 +++--- + fs/bcachefs/journal.h | 4 +- + fs/bcachefs/rebalance.c | 19 ++-- + fs/bcachefs/rebalance.h | 2 +- + fs/bcachefs/sysfs.c | 220 ++++++++++++++++-------------------- + fs/bcachefs/util.c | 25 ++-- + fs/bcachefs/util.h | 2 +- + 15 files changed, 177 insertions(+), 188 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 76c9cbb97894..887e40574c93 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1834,9 +1834,8 @@ void bch2_btree_verify_flushed(struct bch_fs *c) + rcu_read_unlock(); + } + +-ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) ++void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) + { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; +@@ -1849,7 +1848,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) + if (!(flags & (1 << BTREE_NODE_dirty))) + continue; + +- pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", ++ pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", + b, + (flags & (1 << BTREE_NODE_dirty)) != 0, + (flags & (1 << BTREE_NODE_need_write)) != 0, +@@ -1860,6 +1859,4 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) + b->will_make_reachable & 1); + } + rcu_read_unlock(); +- +- return out.pos - buf; + } +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index db013dc28eec..66ebdd39f5b3 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -140,7 +140,7 @@ do { \ + void bch2_btree_flush_all_reads(struct bch_fs *); + void bch2_btree_flush_all_writes(struct bch_fs *); + void bch2_btree_verify_flushed(struct bch_fs *); +-ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); ++void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); + + static inline void compat_bformat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, +diff --git 
a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 17651d027858..a2604b0ce2d8 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1974,22 +1974,19 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) + six_unlock_intent(&b->c.lock); + } + +-ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) ++void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) + { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + struct btree_update *as; + + mutex_lock(&c->btree_interior_update_lock); + list_for_each_entry(as, &c->btree_interior_update_list, list) +- pr_buf(&out, "%p m %u w %u r %u j %llu\n", ++ pr_buf(out, "%p m %u w %u r %u j %llu\n", + as, + as->mode, + as->nodes_written, + atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, + as->journal.seq); + mutex_unlock(&c->btree_interior_update_lock); +- +- return out.pos - buf; + } + + size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 812bafdc2d04..7668225e72c6 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -317,7 +317,7 @@ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, + return u64s <= bch_btree_keys_u64s_remaining(c, b); + } + +-ssize_t bch2_btree_updates_print(struct bch_fs *, char *); ++void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); + + size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); + +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +index a9f5d5696622..1d1590de55e8 100644 +--- a/fs/bcachefs/clock.c ++++ b/fs/bcachefs/clock.c +@@ -152,9 +152,8 @@ void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) + timer->fn(timer); + } + +-ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) ++void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) + { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + unsigned long now; + unsigned i; + +@@ -162,12 +161,10 @@ ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) + now = atomic_long_read(&clock->now); + + for (i = 0; i < clock->timers.used; i++) +- pr_buf(&out, "%ps:\t%li\n", ++ pr_buf(out, "%ps:\t%li\n", + clock->timers.data[i]->fn, + clock->timers.data[i]->expire - now); + spin_unlock(&clock->timer_lock); +- +- return out.pos - buf; + } + + void bch2_io_clock_exit(struct io_clock *clock) +diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h +index da50afe206cc..70a0f7436c84 100644 +--- a/fs/bcachefs/clock.h ++++ b/fs/bcachefs/clock.h +@@ -30,7 +30,7 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); + __ret; \ + }) + +-ssize_t bch2_io_timers_show(struct io_clock *, char *); ++void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); + + void bch2_io_clock_exit(struct io_clock *); + int bch2_io_clock_init(struct io_clock *); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 084cb4dfc2ed..66e97b347634 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1575,6 +1575,35 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) + spin_unlock(&c->ec_stripes_heap_lock); + } + ++void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ struct ec_stripe_new *s; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { ++ pr_buf(out, "target %u algo %u redundancy %u:\n", ++ 
h->target, h->algo, h->redundancy); ++ ++ if (h->s) ++ pr_buf(out, "\tpending: blocks %u allocated %u\n", ++ h->s->blocks.nr, ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->blocks.nr)); ++ } ++ mutex_unlock(&c->ec_stripe_head_lock); ++ ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry(h, &c->ec_stripe_new_list, list) { ++ pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", ++ s->blocks.nr, ++ bitmap_weight(s->blocks_allocated, ++ s->blocks.nr), ++ atomic_read(&s->pin)); ++ } ++ mutex_unlock(&c->ec_stripe_new_lock); ++} ++ + void bch2_fs_ec_exit(struct bch_fs *c) + { + struct ec_stripe_head *h; +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index ad9078fdb045..f8fc3d616cd7 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -161,6 +161,7 @@ int bch2_stripes_write(struct bch_fs *, unsigned, bool *); + int bch2_ec_mem_alloc(struct bch_fs *, bool); + + void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); ++void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); + + void bch2_fs_ec_exit(struct bch_fs *); + int bch2_fs_ec_init(struct bch_fs *); +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index e84d80a4dcd1..210ad1b0c469 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1135,9 +1135,8 @@ out: + + /* debug: */ + +-ssize_t bch2_journal_print_debug(struct journal *j, char *buf) ++void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union journal_res_state s; + struct bch_dev *ca; +@@ -1147,7 +1146,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) + spin_lock(&j->lock); + s = READ_ONCE(j->reservations); + +- pr_buf(&out, ++ pr_buf(out, + "active journal entries:\t%llu\n" + "seq:\t\t\t%llu\n" + "last_seq:\t\t%llu\n" +@@ -1165,31 +1164,31 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: +- pr_buf(&out, "error\n"); ++ pr_buf(out, "error\n"); + break; + case JOURNAL_ENTRY_CLOSED_VAL: +- pr_buf(&out, "closed\n"); ++ pr_buf(out, "closed\n"); + break; + default: +- pr_buf(&out, "%u/%u\n", ++ pr_buf(out, "%u/%u\n", + s.cur_entry_offset, + j->cur_entry_u64s); + break; + } + +- pr_buf(&out, ++ pr_buf(out, + "current entry refs:\t%u\n" + "prev entry unwritten:\t", + journal_state_count(s, s.idx)); + + if (s.prev_buf_unwritten) +- pr_buf(&out, "yes, ref %u sectors %u\n", ++ pr_buf(out, "yes, ref %u sectors %u\n", + journal_state_count(s, !s.idx), + journal_prev_buf(j)->sectors); + else +- pr_buf(&out, "no\n"); ++ pr_buf(out, "no\n"); + +- pr_buf(&out, ++ pr_buf(out, + "need write:\t\t%i\n" + "replay done:\t\t%i\n", + test_bit(JOURNAL_NEED_WRITE, &j->flags), +@@ -1202,7 +1201,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) + if (!ja->nr) + continue; + +- pr_buf(&out, ++ pr_buf(out, + "dev %u:\n" + "\tnr\t\t%u\n" + "\tavailable\t%u:%u\n" +@@ -1221,34 +1220,29 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) + + spin_unlock(&j->lock); + rcu_read_unlock(); +- +- return out.pos - buf; + } + +-ssize_t bch2_journal_print_pins(struct journal *j, char *buf) ++void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) + { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + u64 i; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(pin_list, &j->pin, i) { +- pr_buf(&out, 
"%llu: count %u\n", ++ pr_buf(out, "%llu: count %u\n", + i, atomic_read(&pin_list->count)); + + list_for_each_entry(pin, &pin_list->list, list) +- pr_buf(&out, "\t%px %ps\n", ++ pr_buf(out, "\t%px %ps\n", + pin, pin->flush); + + if (!list_empty(&pin_list->flushed)) +- pr_buf(&out, "flushed:\n"); ++ pr_buf(out, "flushed:\n"); + + list_for_each_entry(pin, &pin_list->flushed, list) +- pr_buf(&out, "\t%px %ps\n", ++ pr_buf(out, "\t%px %ps\n", + pin, pin->flush); + } + spin_unlock(&j->lock); +- +- return out.pos - buf; + } +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 30de6d96188e..56438840efd7 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -499,8 +499,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j) + void bch2_journal_unblock(struct journal *); + void bch2_journal_block(struct journal *); + +-ssize_t bch2_journal_print_debug(struct journal *, char *); +-ssize_t bch2_journal_print_pins(struct journal *, char *); ++void bch2_journal_debug_to_text(struct printbuf *, struct journal *); ++void bch2_journal_pins_to_text(struct printbuf *, struct journal *); + + int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index e15a2b1dc5d0..56a1f761271f 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -249,45 +249,42 @@ static int bch2_rebalance_thread(void *arg) + return 0; + } + +-ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) ++void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) + { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + struct bch_fs_rebalance *r = &c->rebalance; + struct rebalance_work w = rebalance_work(c); + char h1[21], h2[21]; + + bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); + bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); +- pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", ++ pr_buf(out, "fullest_dev (%i):\t%s/%s\n", + w.dev_most_full_idx, h1, h2); + + bch2_hprint(&PBUF(h1), w.total_work << 9); + bch2_hprint(&PBUF(h2), c->capacity << 9); +- pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); ++ pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); + +- pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); ++ pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); + + switch (r->state) { + case REBALANCE_WAITING: +- pr_buf(&out, "waiting\n"); ++ pr_buf(out, "waiting\n"); + break; + case REBALANCE_THROTTLED: + bch2_hprint(&PBUF(h1), + (r->throttled_until_iotime - + atomic_long_read(&c->io_clock[WRITE].now)) << 9); +- pr_buf(&out, "throttled for %lu sec or %s io\n", ++ pr_buf(out, "throttled for %lu sec or %s io\n", + (r->throttled_until_cputime - jiffies) / HZ, + h1); + break; + case REBALANCE_RUNNING: +- pr_buf(&out, "running\n"); +- pr_buf(&out, "pos %llu:%llu\n", ++ pr_buf(out, "running\n"); ++ pr_buf(out, "pos %llu:%llu\n", + r->move_stats.pos.inode, + r->move_stats.pos.offset); + break; + } +- +- return out.pos - buf; + } + + void bch2_rebalance_stop(struct bch_fs *c) +diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h +index 99e2a1fb6084..7ade0bb81cce 100644 +--- a/fs/bcachefs/rebalance.h ++++ b/fs/bcachefs/rebalance.h +@@ -19,7 +19,7 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, + struct bch_io_opts *); + void bch2_rebalance_add_work(struct bch_fs *, u64); + +-ssize_t bch2_rebalance_work_show(struct bch_fs *, char *); ++void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); + + void bch2_rebalance_stop(struct bch_fs *); + int 
bch2_rebalance_start(struct bch_fs *); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 4406dfccd7be..0cb29f43d99d 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -75,7 +75,6 @@ do { \ + #define sysfs_hprint(file, val) \ + do { \ + if (attr == &sysfs_ ## file) { \ +- struct printbuf out = _PBUF(buf, PAGE_SIZE); \ + bch2_hprint(&out, val); \ + pr_buf(&out, "\n"); \ + return out.pos - buf; \ +@@ -239,24 +238,22 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) + return ret; + } + +-static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) ++static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) + { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); + + if (!fs_usage) + return -ENOMEM; + +- bch2_fs_usage_to_text(&out, c, fs_usage); ++ bch2_fs_usage_to_text(out, c, fs_usage); + + percpu_up_read(&c->mark_lock); + + kfree(fs_usage); +- +- return out.pos - buf; ++ return 0; + } + +-static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) ++static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -299,59 +296,26 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) + if (ret) + return ret; + +- return scnprintf(buf, PAGE_SIZE, +- "uncompressed data:\n" +- " nr extents: %llu\n" +- " size (bytes): %llu\n" +- "compressed data:\n" +- " nr extents: %llu\n" +- " compressed size (bytes): %llu\n" +- " uncompressed size (bytes): %llu\n", +- nr_uncompressed_extents, +- uncompressed_sectors << 9, +- nr_compressed_extents, +- compressed_sectors_compressed << 9, +- compressed_sectors_uncompressed << 9); +-} +- +-static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) +-{ +- char *out = buf, *end = buf + PAGE_SIZE; +- struct ec_stripe_head *h; +- struct ec_stripe_new *s; +- +- mutex_lock(&c->ec_stripe_head_lock); +- list_for_each_entry(h, &c->ec_stripe_head_list, list) { +- out += scnprintf(out, end - out, +- "target %u algo %u redundancy %u:\n", +- h->target, h->algo, h->redundancy); +- +- if (h->s) +- out += scnprintf(out, end - out, +- "\tpending: blocks %u allocated %u\n", +- h->s->blocks.nr, +- bitmap_weight(h->s->blocks_allocated, +- h->s->blocks.nr)); +- } +- mutex_unlock(&c->ec_stripe_head_lock); +- +- mutex_lock(&c->ec_stripe_new_lock); +- list_for_each_entry(h, &c->ec_stripe_new_list, list) { +- out += scnprintf(out, end - out, +- "\tin flight: blocks %u allocated %u pin %u\n", +- s->blocks.nr, +- bitmap_weight(s->blocks_allocated, +- s->blocks.nr), +- atomic_read(&s->pin)); +- } +- mutex_unlock(&c->ec_stripe_new_lock); +- +- return out - buf; ++ pr_buf(out, ++ "uncompressed data:\n" ++ " nr extents: %llu\n" ++ " size (bytes): %llu\n" ++ "compressed data:\n" ++ " nr extents: %llu\n" ++ " compressed size (bytes): %llu\n" ++ " uncompressed size (bytes): %llu\n", ++ nr_uncompressed_extents, ++ uncompressed_sectors << 9, ++ nr_compressed_extents, ++ compressed_sectors_compressed << 9, ++ compressed_sectors_uncompressed << 9); ++ return 0; + } + + SHOW(bch2_fs) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); + + sysfs_print(minor, c->minor); + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); +@@ -381,8 +345,10 @@ SHOW(bch2_fs) + sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ + sysfs_pd_controller_show(copy_gc, &c->copygc_pd); + +- if (attr == &sysfs_rebalance_work) +- return 
bch2_rebalance_work_show(c, buf); ++ if (attr == &sysfs_rebalance_work) { ++ bch2_rebalance_work_to_text(&out, c); ++ return out.pos - buf; ++ } + + sysfs_print(promote_whole_extents, c->promote_whole_extents); + +@@ -392,51 +358,61 @@ SHOW(bch2_fs) + /* Debugging: */ + + if (attr == &sysfs_alloc_debug) +- return show_fs_alloc_debug(c, buf); ++ return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; + +- if (attr == &sysfs_journal_debug) +- return bch2_journal_print_debug(&c->journal, buf); ++ if (attr == &sysfs_journal_debug) { ++ bch2_journal_debug_to_text(&out, &c->journal); ++ return out.pos - buf; ++ } + +- if (attr == &sysfs_journal_pins) +- return bch2_journal_print_pins(&c->journal, buf); ++ if (attr == &sysfs_journal_pins) { ++ bch2_journal_pins_to_text(&out, &c->journal); ++ return out.pos - buf; ++ } + +- if (attr == &sysfs_btree_updates) +- return bch2_btree_updates_print(c, buf); ++ if (attr == &sysfs_btree_updates) { ++ bch2_btree_updates_to_text(&out, c); ++ return out.pos - buf; ++ } + +- if (attr == &sysfs_dirty_btree_nodes) +- return bch2_dirty_btree_nodes_print(c, buf); ++ if (attr == &sysfs_dirty_btree_nodes) { ++ bch2_dirty_btree_nodes_to_text(&out, c); ++ return out.pos - buf; ++ } + + if (attr == &sysfs_btree_key_cache) { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); +- + bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); + return out.pos - buf; + } + + if (attr == &sysfs_btree_transactions) { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); +- + bch2_btree_trans_to_text(&out, c); + return out.pos - buf; + } + + if (attr == &sysfs_stripes_heap) { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); +- + bch2_stripes_heap_to_text(&out, c); + return out.pos - buf; + } + +- if (attr == &sysfs_compression_stats) +- return bch2_compression_stats(c, buf); ++ if (attr == &sysfs_compression_stats) { ++ bch2_compression_stats_to_text(&out, c); ++ return out.pos - buf; ++ } + +- if (attr == &sysfs_new_stripes) +- return bch2_new_stripes(c, buf); ++ if (attr == &sysfs_new_stripes) { ++ bch2_new_stripes_to_text(&out, c); ++ return out.pos - buf; ++ } + +- if (attr == &sysfs_io_timers_read) +- return bch2_io_timers_show(&c->io_clock[READ], buf); +- if (attr == &sysfs_io_timers_write) +- return bch2_io_timers_show(&c->io_clock[WRITE], buf); ++ if (attr == &sysfs_io_timers_read) { ++ bch2_io_timers_to_text(&out, &c->io_clock[READ]); ++ return out.pos - buf; ++ } ++ if (attr == &sysfs_io_timers_write) { ++ bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); ++ return out.pos - buf; ++ } + + #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); + BCH_DEBUG_PARAMS() +@@ -705,11 +681,13 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) + SHOW(bch2_fs_time_stats) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); + +-#define x(name) \ +- if (attr == &sysfs_time_stat_##name) \ +- return bch2_time_stats_print(&c->times[BCH_TIME_##name],\ +- buf, PAGE_SIZE); ++#define x(name) \ ++ if (attr == &sysfs_time_stat_##name) { \ ++ bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ ++ return out.pos - buf; \ ++ } + BCH_TIME_STATS() + #undef x + +@@ -762,13 +740,13 @@ static int unsigned_cmp(const void *_l, const void *_r) + return cmp_int(*l, *r); + } + +-static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, +- char *buf, bucket_map_fn *fn, void *private) ++static int quantiles_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bch_dev *ca, ++ bucket_map_fn *fn, void 
*private) + { + size_t i, n; + /* Compute 31 quantiles */ + unsigned q[31], *p; +- ssize_t ret = 0; + + down_read(&ca->bucket_lock); + n = ca->mi.nbuckets; +@@ -795,35 +773,30 @@ static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, + vfree(p); + + for (i = 0; i < ARRAY_SIZE(q); i++) +- ret += scnprintf(buf + ret, PAGE_SIZE - ret, +- "%u ", q[i]); +- buf[ret - 1] = '\n'; +- +- return ret; ++ pr_buf(out, "%u ", q[i]); ++ pr_buf(out, "\n"); ++ return 0; + } + +-static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) ++static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) + { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + enum alloc_reserve i; + + spin_lock(&ca->fs->freelist_lock); + +- pr_buf(&out, "free_inc:\t%zu\t%zu\n", ++ pr_buf(out, "free_inc:\t%zu\t%zu\n", + fifo_used(&ca->free_inc), + ca->free_inc.size); + + for (i = 0; i < RESERVE_NR; i++) +- pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i, ++ pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, + fifo_used(&ca->free[i]), + ca->free[i].size); + + spin_unlock(&ca->fs->freelist_lock); +- +- return out.pos - buf; + } + +-static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) ++static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + { + struct bch_fs *c = ca->fs; + struct bch_dev_usage stats = bch2_dev_usage_read(ca); +@@ -834,7 +807,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) + for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].type]++; + +- return scnprintf(buf, PAGE_SIZE, ++ pr_buf(out, + "free_inc: %zu/%zu\n" + "free[RESERVE_BTREE]: %zu/%zu\n" + "free[RESERVE_MOVINGGC]: %zu/%zu\n" +@@ -900,21 +873,18 @@ static const char * const bch2_rw[] = { + NULL + }; + +-static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) ++static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) + { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + int rw, i; + + for (rw = 0; rw < 2; rw++) { +- pr_buf(&out, "%s:\n", bch2_rw[rw]); ++ pr_buf(out, "%s:\n", bch2_rw[rw]); + + for (i = 1; i < BCH_DATA_NR; i++) +- pr_buf(&out, "%-12s:%12llu\n", ++ pr_buf(out, "%-12s:%12llu\n", + bch2_data_types[i], + percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); + } +- +- return out.pos - buf; + } + + SHOW(bch2_dev) +@@ -966,34 +936,44 @@ SHOW(bch2_dev) + return out.pos - buf; + } + +- if (attr == &sysfs_iodone) +- return show_dev_iodone(ca, buf); ++ if (attr == &sysfs_iodone) { ++ dev_iodone_to_text(&out, ca); ++ return out.pos - buf; ++ } + + sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); + sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); + +- if (attr == &sysfs_io_latency_stats_read) +- return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE); +- if (attr == &sysfs_io_latency_stats_write) +- return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE); ++ if (attr == &sysfs_io_latency_stats_read) { ++ bch2_time_stats_to_text(&out, &ca->io_latency[READ]); ++ return out.pos - buf; ++ } ++ if (attr == &sysfs_io_latency_stats_write) { ++ bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); ++ return out.pos - buf; ++ } + + sysfs_printf(congested, "%u%%", + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + + if (attr == &sysfs_bucket_quantiles_last_read) +- return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); ++ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; + if (attr == 
&sysfs_bucket_quantiles_last_write) +- return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); ++ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; + if (attr == &sysfs_bucket_quantiles_fragmentation) +- return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); ++ return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; + if (attr == &sysfs_bucket_quantiles_oldest_gen) +- return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); ++ return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; + +- if (attr == &sysfs_reserve_stats) +- return show_reserve_stats(ca, buf); +- if (attr == &sysfs_alloc_debug) +- return show_dev_alloc_debug(ca, buf); ++ if (attr == &sysfs_reserve_stats) { ++ reserve_stats_to_text(&out, ca); ++ return out.pos - buf; ++ } ++ if (attr == &sysfs_alloc_debug) { ++ dev_alloc_debug_to_text(&out, ca); ++ return out.pos - buf; ++ } + + return 0; + } +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index e69d03d1109f..fd4044a6a08f 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -318,43 +318,40 @@ static void pr_time_units(struct printbuf *out, u64 ns) + pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); + } + +-size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len) ++void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) + { +- struct printbuf out = _PBUF(buf, len); + const struct time_unit *u; + u64 freq = READ_ONCE(stats->average_frequency); + u64 q, last_q = 0; + int i; + +- pr_buf(&out, "count:\t\t%llu\n", ++ pr_buf(out, "count:\t\t%llu\n", + stats->count); +- pr_buf(&out, "rate:\t\t%llu/sec\n", ++ pr_buf(out, "rate:\t\t%llu/sec\n", + freq ? div64_u64(NSEC_PER_SEC, freq) : 0); + +- pr_buf(&out, "frequency:\t"); +- pr_time_units(&out, freq); ++ pr_buf(out, "frequency:\t"); ++ pr_time_units(out, freq); + +- pr_buf(&out, "\navg duration:\t"); +- pr_time_units(&out, stats->average_duration); ++ pr_buf(out, "\navg duration:\t"); ++ pr_time_units(out, stats->average_duration); + +- pr_buf(&out, "\nmax duration:\t"); +- pr_time_units(&out, stats->max_duration); ++ pr_buf(out, "\nmax duration:\t"); ++ pr_time_units(out, stats->max_duration); + + i = eytzinger0_first(NR_QUANTILES); + u = pick_time_units(stats->quantiles.entries[i].m); + +- pr_buf(&out, "\nquantiles (%s):\t", u->name); ++ pr_buf(out, "\nquantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + q = max(stats->quantiles.entries[i].m, last_q); +- pr_buf(&out, "%llu%s", ++ pr_buf(out, "%llu%s", + div_u64(q, u->nsecs), + is_last ? 
"\n" : " "); + last_q = q; + } +- +- return out.pos - buf; + } + + void bch2_time_stats_exit(struct time_stats *stats) +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index b2f423e49954..0f3be4d59e97 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -397,7 +397,7 @@ static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) + __bch2_time_stats_update(stats, start, local_clock()); + } + +-size_t bch2_time_stats_print(struct time_stats *, char *, size_t); ++void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); + + void bch2_time_stats_exit(struct time_stats *); + void bch2_time_stats_init(struct time_stats *); +-- +cgit v1.2.3 + + +From 4a4045c0609dffd5b191e32b98aa5fc05d9ff1a0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 3 Aug 2020 13:37:11 -0400 +Subject: bcachefs: Fix maximum btree node size + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/opts.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index d6a832a38b20..014c608ca0c6 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -83,7 +83,7 @@ enum opt_type { + "size", NULL) \ + x(btree_node_size, u16, \ + OPT_FORMAT, \ +- OPT_SECTORS(1, 128), \ ++ OPT_SECTORS(1, 512), \ + BCH_SB_BTREE_NODE_SIZE, 512, \ + "size", "Btree node size, default 256k") \ + x(errors, u8, \ +-- +cgit v1.2.3 + + +From 03ba49669cb12888262b5b8b3e1b512de70c3daf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 3 Aug 2020 13:58:36 -0400 +Subject: bcachefs: Don't disallow btree writes to RO devices + +There's an inherent race with setting devices RO when they have dirty +btree nodes on them. We already check if a btree node is on an RO device +before we dirty it, so this patch just allows those writes so that we +don't have errors forcing the entire filesystem read only when trying to +remove a device. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 91fdae1a3628..be373b55743c 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -471,7 +471,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + + n->c = c; + n->dev = ptr->dev; +- n->have_ioref = bch2_dev_get_ioref(ca, WRITE); ++ n->have_ioref = bch2_dev_get_ioref(ca, ++ type == BCH_DATA_btree ? READ : WRITE); + n->submit_time = local_clock(); + n->bio.bi_iter.bi_sector = ptr->offset; + +-- +cgit v1.2.3 + + +From 32de81081b803078ea584b08c5a8a26c9697deef Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Aug 2020 23:12:49 -0400 +Subject: bcachefs: Fix bch2_new_stripes_to_text() + +painful looking typo, fortunately difficult to hit. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 66e97b347634..abb714456060 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1594,7 +1594,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) + mutex_unlock(&c->ec_stripe_head_lock); + + mutex_lock(&c->ec_stripe_new_lock); +- list_for_each_entry(h, &c->ec_stripe_new_list, list) { ++ list_for_each_entry(s, &c->ec_stripe_new_list, list) { + pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", + s->blocks.nr, + bitmap_weight(s->blocks_allocated, +-- +cgit v1.2.3 + + +From 978a6b6130b8179fec5112df3cc88d3597c1e298 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Aug 2020 23:10:08 -0400 +Subject: bcachefs: Fix a bug with the journal_seq_blacklist mechanism + +Previously, we would start doing btree updates before writing the first +journal entry; if this was after an unclean shutdown, this could cause +those btree updates to not be blacklisted. + +Also, move some code to headers for userspace debug tools. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 28 ---------------------------- + fs/bcachefs/btree_io.h | 29 +++++++++++++++++++++++++++++ + fs/bcachefs/journal_seq_blacklist.c | 9 --------- + fs/bcachefs/journal_seq_blacklist.h | 9 +++++++++ + fs/bcachefs/recovery.c | 5 +++++ + fs/bcachefs/super.c | 9 ++++++++- + 6 files changed, 51 insertions(+), 38 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 887e40574c93..2f5097218f9c 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -597,34 +597,6 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, + bch2_btree_iter_reinit_node(iter, b); + } + +-static struct nonce btree_nonce(struct bset *i, unsigned offset) +-{ +- return (struct nonce) {{ +- [0] = cpu_to_le32(offset), +- [1] = ((__le32 *) &i->seq)[0], +- [2] = ((__le32 *) &i->seq)[1], +- [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, +- }}; +-} +- +-static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +-{ +- struct nonce nonce = btree_nonce(i, offset); +- +- if (!offset) { +- struct btree_node *bn = container_of(i, struct btree_node, keys); +- unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; +- +- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, +- bytes); +- +- nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); +- } +- +- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, +- vstruct_end(i) - (void *) i->_data); +-} +- + static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct btree *b, struct bset *i, + unsigned offset, int write) +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 66ebdd39f5b3..626d0f071b70 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -5,6 +5,7 @@ + #include "bkey_methods.h" + #include "bset.h" + #include "btree_locking.h" ++#include "checksum.h" + #include "extents.h" + #include "io_types.h" + +@@ -82,6 +83,34 @@ static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree * + return false; + } + ++static inline struct nonce btree_nonce(struct bset *i, unsigned offset) ++{ ++ return (struct nonce) {{ ++ [0] = cpu_to_le32(offset), ++ [1] = ((__le32 *) &i->seq)[0], ++ [2] = ((__le32 *) &i->seq)[1], ++ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, ++ }}; ++} ++ ++static inline void bset_encrypt(struct bch_fs *c, struct bset *i, 
unsigned offset) ++{ ++ struct nonce nonce = btree_nonce(i, offset); ++ ++ if (!offset) { ++ struct btree_node *bn = container_of(i, struct btree_node, keys); ++ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; ++ ++ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, ++ bytes); ++ ++ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); ++ } ++ ++ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, ++ vstruct_end(i) - (void *) i->_data); ++} ++ + void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); + + void bch2_btree_build_aux_trees(struct btree *); +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index a21de0088753..d0f1bbf8f6a7 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -36,15 +36,6 @@ + * that bset, until that btree node is rewritten. + */ + +-static unsigned +-blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) +-{ +- return bl +- ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / +- sizeof(struct journal_seq_blacklist_entry)) +- : 0; +-} +- + static unsigned sb_blacklist_u64s(unsigned nr) + { + struct bch_sb_field_journal_seq_blacklist *bl; +diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h +index 03f4b97247fd..afb886ec8e25 100644 +--- a/fs/bcachefs/journal_seq_blacklist.h ++++ b/fs/bcachefs/journal_seq_blacklist.h +@@ -2,6 +2,15 @@ + #ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H + #define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H + ++static inline unsigned ++blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) ++{ ++ return bl ++ ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / ++ sizeof(struct journal_seq_blacklist_entry)) ++ : 0; ++} ++ + bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); + int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); + int bch2_blacklist_table_initialize(struct bch_fs *); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 28972f30e198..6e829bf0a31f 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1039,6 +1039,11 @@ int bch2_fs_recovery(struct bch_fs *c) + } + + journal_seq += 4; ++ ++ /* ++ * The superblock needs to be written before we do any btree ++ * node writes: it will be in the read_write() path ++ */ + } + + ret = bch2_blacklist_table_initialize(c); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index f0ed7c9ea623..feac88eae540 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -345,8 +345,8 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) + { + bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); + +- bch2_fs_read_only_async(c); + bch2_journal_halt(&c->journal); ++ bch2_fs_read_only_async(c); + + wake_up(&bch_read_only_wait); + return ret; +@@ -403,6 +403,13 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + if (ret) + goto err; + ++ /* ++ * We need to write out a journal entry before we start doing btree ++ * updates, to ensure that on unclean shutdown new journal blacklist ++ * entries are created: ++ */ ++ bch2_journal_meta(&c->journal); ++ + clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); + + for_each_rw_member(ca, c, i) +-- +cgit v1.2.3 + + +From 47d0d2856537816d8d1e4b2eb4e4e62a051f4ff1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 12 Aug 2020 13:48:02 -0400 +Subject: bcachefs: Don't block on allocations when only writing to specific + device + +Since the copygc thread is now global and not per device, we're not 
+freeing up space on any one device in bounded time - and indeed we never +really were, since rebalance wasn't moving data around between devices +with that objective. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index be373b55743c..75904f2ce92c 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1091,6 +1091,11 @@ again: + goto err; + } + ++ /* ++ * The copygc thread is now global, which means it's no longer ++ * freeing up space on specific disks, which means that ++ * allocations for specific disks may hang arbitrarily long: ++ */ + wp = bch2_alloc_sectors_start(c, + op->target, + op->opts.erasure_code, +@@ -1100,7 +1105,8 @@ again: + op->nr_replicas_required, + op->alloc_reserve, + op->flags, +- (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); ++ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| ++ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl); + EBUG_ON(!wp); + + if (unlikely(IS_ERR(wp))) { +-- +cgit v1.2.3 + + +From 2f8c0c5a7c864c64f0a522934853be9005f37002 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 12 Aug 2020 13:49:09 -0400 +Subject: bcachefs: Change copygc to consider bucket fragmentation + +When devices have different sized buckets this is more correct. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets_types.h | 1 + + fs/bcachefs/movinggc.c | 20 +++++++++++--------- + 2 files changed, 12 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 4ebe80b05ffc..d5215b14d7d9 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -125,6 +125,7 @@ struct disk_reservation { + struct copygc_heap_entry { + u8 dev; + u8 gen; ++ u16 fragmentation; + u32 sectors; + u64 offset; + }; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 55aa463f992f..de0a7974ec9f 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -44,13 +44,6 @@ + #define COPYGC_BUCKETS_PER_ITER(ca) \ + ((ca)->free[RESERVE_MOVINGGC].size / 2) + +-static inline int sectors_used_cmp(copygc_heap *heap, +- struct copygc_heap_entry l, +- struct copygc_heap_entry r) +-{ +- return cmp_int(l.sectors, r.sectors); +-} +- + static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) + { + const struct copygc_heap_entry *l = _l; +@@ -123,6 +116,13 @@ static bool have_copygc_reserve(struct bch_dev *ca) + return ret; + } + ++static inline int fragmentation_cmp(copygc_heap *heap, ++ struct copygc_heap_entry l, ++ struct copygc_heap_entry r) ++{ ++ return cmp_int(l.fragmentation, r.fragmentation); ++} ++ + static int bch2_copygc(struct bch_fs *c) + { + copygc_heap *h = &c->copygc_heap; +@@ -180,10 +180,12 @@ static int bch2_copygc(struct bch_fs *c) + e = (struct copygc_heap_entry) { + .dev = dev_idx, + .gen = m.gen, ++ .fragmentation = bucket_sectors_used(m) * (1U << 15) ++ / ca->mi.bucket_size, + .sectors = bucket_sectors_used(m), + .offset = bucket_to_sector(ca, b), + }; +- heap_add_or_replace(h, e, -sectors_used_cmp, NULL); ++ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); + } + up_read(&ca->bucket_lock); + } +@@ -197,7 +199,7 @@ static int bch2_copygc(struct bch_fs *c) + sectors_to_move += i->sectors; + + while (sectors_to_move > sectors_reserved) { +- BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); ++ BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); + sectors_to_move -= e.sectors; + } + +-- +cgit v1.2.3 + + +From 4ab2a455b9a8e071d7ae7049be123873f5595075 
Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 12 Aug 2020 15:00:08 -0400 +Subject: bcachefs: Fix disk groups not being updated when set via sysfs + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/disk_groups.c | 7 ++++++- + fs/bcachefs/disk_groups.h | 3 +++ + 2 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c +index 4a4ec8f46108..b6e3152312ba 100644 +--- a/fs/bcachefs/disk_groups.c ++++ b/fs/bcachefs/disk_groups.c +@@ -387,6 +387,7 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) + { + struct bch_member *mi; + int v = -1; ++ int ret = 0; + + mutex_lock(&c->sb_lock); + +@@ -399,14 +400,18 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) + return v; + } + ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ goto unlock; + write_sb: + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + SET_BCH_MEMBER_GROUP(mi, v + 1); + + bch2_write_super(c); ++unlock: + mutex_unlock(&c->sb_lock); + +- return 0; ++ return ret; + } + + int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) +diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h +index c8e0c37a5e1a..3d84f23c34ed 100644 +--- a/fs/bcachefs/disk_groups.h ++++ b/fs/bcachefs/disk_groups.h +@@ -71,7 +71,10 @@ static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, + bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); + + int bch2_disk_path_find(struct bch_sb_handle *, const char *); ++ ++/* Exported for userspace bcachefs-tools: */ + int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); ++ + void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, + unsigned); + +-- +cgit v1.2.3 + + +From 61182c8c140539981b53872a93ec7478a12e7ffe Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 6 Aug 2020 15:22:24 -0400 +Subject: bcachefs: Fix a couple null ptr derefs when no disk groups exist + +Normally successfully parsing a target means disk groups should exist, +but we don't want a BUG() or null ptr deref if we end up with an invalid +target. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/disk_groups.c | 4 ++-- + fs/bcachefs/io.c | 4 +++- + 2 files changed, 5 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c +index b6e3152312ba..c52b6faac9b4 100644 +--- a/fs/bcachefs/disk_groups.c ++++ b/fs/bcachefs/disk_groups.c +@@ -183,7 +183,7 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + +- return t.group < g->nr && !g->entries[t.group].deleted ++ return g && t.group < g->nr && !g->entries[t.group].deleted + ? &g->entries[t.group].devs + : NULL; + } +@@ -208,7 +208,7 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) + + rcu_read_lock(); + g = rcu_dereference(c->disk_groups); +- m = t.group < g->nr && !g->entries[t.group].deleted ++ m = g && t.group < g->nr && !g->entries[t.group].deleted + ? 
&g->entries[t.group].devs + : NULL; + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 75904f2ce92c..590e15dc776f 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -54,7 +54,9 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) + return false; + + rcu_read_lock(); +- devs = bch2_target_to_mask(c, target); ++ devs = bch2_target_to_mask(c, target) ?: ++ &c->rw_devs[BCH_DATA_user]; ++ + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) +-- +cgit v1.2.3 + + +From eb5fa942bee58593f58b9b98313373fcef957ba7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 12 Aug 2020 15:08:17 -0400 +Subject: bcachefs: Add a cond_resched() to bch2_alloc_write() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index bf034168eb8b..d5f0607733fe 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -350,6 +350,8 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) + bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); + + while (1) { ++ bch2_trans_cond_resched(&trans); ++ + ret = bch2_alloc_write_key(&trans, iter, flags); + if (ret < 0 || ret == ALLOC_END) + break; +-- +cgit v1.2.3 + + +From 86e1943f94dba107f3f8f721b6fdb030d4742205 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 15 Aug 2020 22:41:35 -0400 +Subject: bcachefs: Don't report inodes to statfs + +We don't have a limit on the number of inodes in a filesystem, so this +is apparently the right way to report that. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index f5fd9bb3b66f..11cb8c76f653 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1261,8 +1261,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) + buf->f_blocks = usage.capacity >> shift; + buf->f_bfree = (usage.capacity - usage.used) >> shift; + buf->f_bavail = buf->f_bfree; +- buf->f_files = usage.nr_inodes; +- buf->f_ffree = U64_MAX; ++ buf->f_files = 0; ++ buf->f_ffree = 0; + + fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ + le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); +-- +cgit v1.2.3 + + +From 2770844d2c541b1219c3c6636594742da61e7661 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 24 Aug 2020 14:57:48 -0400 +Subject: bcachefs: Some project id fixes + +Inode options that are accessible via the xattr interface are stored +with a +1 bias, so that a value of 0 means unset. We weren't handling +this consistently. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-ioctl.c | 6 +++++- + fs/bcachefs/xattr.c | 6 +++++- + 2 files changed, 10 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +index 75fdb2fe861e..9988fe2e8c45 100644 +--- a/fs/bcachefs/fs-ioctl.c ++++ b/fs/bcachefs/fs-ioctl.c +@@ -138,6 +138,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, + if (fa.fsx_projid >= U32_MAX) + return -EINVAL; + ++ /* ++ * inode fields accessible via the xattr interface are stored with a +1 ++ * bias, so that 0 means unset: ++ */ + s.projid = fa.fsx_projid + 1; + + ret = mnt_want_write_file(file); +@@ -151,7 +155,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, + } + + mutex_lock(&inode->ei_update_lock); +- ret = bch2_set_projid(c, inode, s.projid); ++ ret = bch2_set_projid(c, inode, fa.fsx_projid); + if (ret) + goto err_unlock; + +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 3f383039765f..2c4034f12147 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -513,7 +513,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, + + mutex_lock(&inode->ei_update_lock); + if (inode_opt_id == Inode_opt_project) { +- ret = bch2_set_projid(c, inode, s.v); ++ /* ++ * inode fields accessible via the xattr interface are stored ++ * with a +1 bias, so that 0 means unset: ++ */ ++ ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0); + if (ret) + goto err; + } +-- +cgit v1.2.3 + + +From c7376b2e08bcfcf461ecbece916d0f97afe3d569 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 24 Aug 2020 15:16:32 -0400 +Subject: bcachefs: Make sure to go rw if lazy in fsck + +The paths where we delete or truncate inodes don't pass commit flags for +BTREE_INSERT_LAZY_RW, so just go rw if necessary in the fsck code. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 4 ++++ + fs/bcachefs/super.h | 9 +++++++++ + 2 files changed, 13 insertions(+) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index c6ca5968a2e0..5a6df3d1973a 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1265,6 +1265,8 @@ static int check_inode(struct btree_trans *trans, + u.bi_inum))) { + bch_verbose(c, "deleting inode %llu", u.bi_inum); + ++ bch2_fs_lazy_rw(c); ++ + ret = bch2_inode_rm(c, u.bi_inum); + if (ret) + bch_err(c, "error in fsck: error %i while deleting inode", ret); +@@ -1277,6 +1279,8 @@ static int check_inode(struct btree_trans *trans, + u.bi_inum))) { + bch_verbose(c, "truncating inode %llu", u.bi_inum); + ++ bch2_fs_lazy_rw(c); ++ + /* + * XXX: need to truncate partial blocks too here - or ideally + * just switch units to bytes and that issue goes away +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +index 543cc5422d9e..3adab04d236a 100644 +--- a/fs/bcachefs/super.h ++++ b/fs/bcachefs/super.h +@@ -221,6 +221,15 @@ void bch2_fs_read_only(struct bch_fs *); + int bch2_fs_read_write(struct bch_fs *); + int bch2_fs_read_write_early(struct bch_fs *); + ++/* ++ * Only for use in the recovery/fsck path: ++ */ ++static inline void bch2_fs_lazy_rw(struct bch_fs *c) ++{ ++ if (percpu_ref_is_zero(&c->writes)) ++ bch2_fs_read_write_early(c); ++} ++ + void bch2_fs_stop(struct bch_fs *); + + int bch2_fs_start(struct bch_fs *); +-- +cgit v1.2.3 + + +From fb4934e854c6f98684649cb2c804f50ad1e69505 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 24 Aug 2020 15:58:26 -0400 +Subject: bcachefs: Improvements to the journal read error paths + + - Print out more information in error messages + - On checksum error, keep the journal entry but mark it bad so that we + can prefer entries from other devices that don't have bad checksums + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 84 +++++++++++++++++++++++++++++++++--------------- + fs/bcachefs/journal_io.h | 2 ++ + 2 files changed, 60 insertions(+), 26 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 89585833c846..bd0e6b371701 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -29,9 +29,11 @@ struct journal_list { + * be replayed: + */ + static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, +- struct journal_list *jlist, struct jset *j) ++ struct journal_list *jlist, struct jset *j, ++ bool bad) + { + struct journal_replay *i, *pos; ++ struct bch_devs_list devs = { .nr = 0 }; + struct list_head *where; + size_t bytes = vstruct_bytes(j); + __le64 last_seq; +@@ -60,8 +62,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + } + + list_for_each_entry_reverse(i, jlist->head, list) { +- /* Duplicate? */ +- if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { ++ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { ++ where = &i->list; ++ goto add; ++ } ++ } ++ ++ where = jlist->head; ++add: ++ i = where->next != jlist->head ++ ? container_of(where->next, struct journal_replay, list) ++ : NULL; ++ ++ /* ++ * Duplicate journal entries? 
If so we want the one that didn't have a ++ * checksum error: ++ */ ++ if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { ++ if (i->bad) { ++ devs = i->devs; ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } else if (bad) { ++ goto found; ++ } else { + fsck_err_on(bytes != vstruct_bytes(&i->j) || + memcmp(j, &i->j, bytes), c, + "found duplicate but non identical journal entries (seq %llu)", +@@ -69,14 +94,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + goto found; + } + +- if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { +- where = &i->list; +- goto add; +- } + } + +- where = jlist->head; +-add: + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) { + ret = -ENOMEM; +@@ -84,7 +103,8 @@ add: + } + + list_add(&i->list, where); +- i->devs.nr = 0; ++ i->devs = devs; ++ i->bad = bad; + memcpy(&i->j, j, bytes); + found: + if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) +@@ -391,6 +411,7 @@ fsck_err: + } + + static int jset_validate(struct bch_fs *c, ++ struct bch_dev *ca, + struct jset *jset, u64 sector, + unsigned bucket_sectors_left, + unsigned sectors_read, +@@ -405,16 +426,19 @@ static int jset_validate(struct bch_fs *c, + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); +- if ((version != BCH_JSET_VERSION_OLD && +- version < bcachefs_metadata_version_min) || +- version >= bcachefs_metadata_version_max) { +- bch_err(c, "unknown journal entry version %u", jset->version); +- return BCH_FSCK_UNKNOWN_VERSION; ++ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, c, ++ "%s sector %llu seq %llu: unknown journal entry version %u", ++ ca->name, sector, le64_to_cpu(jset->seq), ++ version)) { ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; + } + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, +- "journal entry too big (%zu bytes), sector %lluu", +- bytes, sector)) { ++ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ++ ca->name, sector, le64_to_cpu(jset->seq), bytes)) { + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; + } +@@ -423,13 +447,15 @@ static int jset_validate(struct bch_fs *c, + return JOURNAL_ENTRY_REREAD; + + if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, +- "journal entry with unknown csum type %llu sector %lluu", +- JSET_CSUM_TYPE(jset), sector)) ++ "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ++ ca->name, sector, le64_to_cpu(jset->seq), ++ JSET_CSUM_TYPE(jset))) + return JOURNAL_ENTRY_BAD; + + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, +- "journal checksum bad, sector %llu", sector)) { ++ "%s sector %llu seq %llu: journal checksum bad", ++ ca->name, sector, le64_to_cpu(jset->seq))) { + /* XXX: retry IO, when we start retrying checksum errors */ + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; +@@ -440,8 +466,10 @@ static int jset_validate(struct bch_fs *c, + vstruct_end(jset) - (void *) jset->encrypted_start); + + if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, +- "invalid journal entry: last_seq > seq")) ++ "invalid journal entry: last_seq > seq")) { + jset->last_seq = jset->seq; ++ return JOURNAL_ENTRY_BAD; ++ } + + return 0; + fsck_err: 
+@@ -516,11 +544,12 @@ reread: + j = buf->data; + } + +- ret = jset_validate(c, j, offset, ++ ret = jset_validate(c, ca, j, offset, + end - offset, sectors_read, + READ); + switch (ret) { + case BCH_FSCK_OK: ++ sectors = vstruct_sectors(j, c->block_bits); + break; + case JOURNAL_ENTRY_REREAD: + if (vstruct_bytes(j) > buf->size) { +@@ -537,8 +566,13 @@ reread: + goto next_block; + case JOURNAL_ENTRY_BAD: + saw_bad = true; ++ /* ++ * On checksum error we don't really trust the size ++ * field of the journal entry we read, so try reading ++ * again at next block boundary: ++ */ + sectors = c->opts.block_size; +- goto next_block; ++ break; + default: + return ret; + } +@@ -555,7 +589,7 @@ reread: + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + mutex_lock(&jlist->lock); +- ret = journal_entry_add(c, ca, jlist, j); ++ ret = journal_entry_add(c, ca, jlist, j, ret != 0); + mutex_unlock(&jlist->lock); + + switch (ret) { +@@ -566,8 +600,6 @@ reread: + default: + return ret; + } +- +- sectors = vstruct_sectors(j, c->block_bits); + next_block: + pr_debug("next"); + offset += sectors; +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +index 72e575f360af..6958ee0f8cf2 100644 +--- a/fs/bcachefs/journal_io.h ++++ b/fs/bcachefs/journal_io.h +@@ -9,6 +9,8 @@ + struct journal_replay { + struct list_head list; + struct bch_devs_list devs; ++ /* checksum error, but we may want to try using it anyways: */ ++ bool bad; + /* must be last: */ + struct jset j; + }; +-- +cgit v1.2.3 + + +From 94ad047046d61953fb0547894968a4f2139b1de9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Sep 2020 22:58:28 -0400 +Subject: bcachefs: Don't fail mount if device has been removed + +Also - make sure to show the devices we actually have open in /proc + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 20 +++++++++++++++++++- + fs/bcachefs/super.c | 18 ++++++++++++++++-- + 2 files changed, 35 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 11cb8c76f653..cf4655ede0d7 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1427,6 +1427,24 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) + return ret; + } + ++static int bch2_show_devname(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ struct bch_dev *ca; ++ unsigned i; ++ bool first = true; ++ ++ for_each_online_member(ca, c, i) { ++ if (!first) ++ seq_putc(seq, ':'); ++ first = false; ++ seq_puts(seq, "/dev/"); ++ seq_puts(seq, ca->name); ++ } ++ ++ return 0; ++} ++ + static int bch2_show_options(struct seq_file *seq, struct dentry *root) + { + struct bch_fs *c = root->d_sb->s_fs_info; +@@ -1450,7 +1468,6 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) + } + + return 0; +- + } + + static const struct super_operations bch_super_operations = { +@@ -1460,6 +1477,7 @@ static const struct super_operations bch_super_operations = { + .evict_inode = bch2_evict_inode, + .sync_fs = bch2_sync_fs, + .statfs = bch2_statfs, ++ .show_devname = bch2_show_devname, + .show_options = bch2_show_options, + .remount_fs = bch2_remount, + #if 0 +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index feac88eae540..ff83abe318e8 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1783,7 +1783,6 @@ err: + /* return with ref on ca->ref: */ + struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) + { +- + struct bch_dev *ca; + dev_t dev; + unsigned i; +@@ -1809,6 +1808,7 @@ struct bch_fs 
*bch2_fs_open(char * const *devices, unsigned nr_devices, + { + struct bch_sb_handle *sb = NULL; + struct bch_fs *c = NULL; ++ struct bch_sb_field_members *mi; + unsigned i, best_sb = 0; + const char *err; + int ret = -ENOMEM; +@@ -1844,10 +1844,24 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + le64_to_cpu(sb[best_sb].sb->seq)) + best_sb = i; + +- for (i = 0; i < nr_devices; i++) { ++ mi = bch2_sb_get_members(sb[best_sb].sb); ++ ++ i = 0; ++ while (i < nr_devices) { ++ if (i != best_sb && ++ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { ++ char buf[BDEVNAME_SIZE]; ++ pr_info("%s has been removed, skipping", ++ bdevname(sb[i].bdev, buf)); ++ bch2_free_super(&sb[i]); ++ array_remove_item(sb, nr_devices, i); ++ continue; ++ } ++ + err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); + if (err) + goto err_print; ++ i++; + } + + ret = -ENOMEM; +-- +cgit v1.2.3 + + +From afbf741b92ce5b638b99073280f0846ddfe684b2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 8 Sep 2020 18:30:32 -0400 +Subject: bcachefs: Fix unmount path + +There was a long standing race in the mount/unmount code - the VFS +intends for mount/unmount synchronizatino to be handled by the list of +superblocks, but we were still holding devices open after tearing down +our superblock in the unmount path. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/fs.c | 161 ++++++++++++++++++++++--------------------------- + fs/bcachefs/super.c | 35 ++++++++--- + fs/bcachefs/super.h | 2 + + 4 files changed, 100 insertions(+), 99 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index ee79b9294461..a6178bdfd362 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -492,7 +492,6 @@ enum { + BCH_FS_ERRORS_FIXED, + + /* misc: */ +- BCH_FS_BDEV_MOUNTED, + BCH_FS_FIXED_GENS, + BCH_FS_ALLOC_WRITTEN, + BCH_FS_REBUILD_REPLICAS, +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index cf4655ede0d7..292329a1486b 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1299,91 +1299,36 @@ static struct bch_fs *bch2_path_to_fs(const char *path) + return ERR_PTR(ret); + + c = bch2_dev_to_fs(dev); +- return c ?: ERR_PTR(-ENOENT); +-} +- +-static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, +- unsigned nr_devs, struct bch_opts opts) +-{ +- struct bch_fs *c, *c1, *c2; +- size_t i; +- +- if (!nr_devs) +- return ERR_PTR(-EINVAL); +- +- c = bch2_fs_open(devs, nr_devs, opts); +- +- if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { +- /* +- * Already open? 
+- * Look up each block device, make sure they all belong to a +- * filesystem and they all belong to the _same_ filesystem +- */ +- +- c1 = bch2_path_to_fs(devs[0]); +- if (IS_ERR(c1)) +- return c; +- +- for (i = 1; i < nr_devs; i++) { +- c2 = bch2_path_to_fs(devs[i]); +- if (!IS_ERR(c2)) +- closure_put(&c2->cl); +- +- if (c1 != c2) { +- closure_put(&c1->cl); +- return c; +- } +- } +- +- c = c1; +- } +- +- if (IS_ERR(c)) +- return c; +- +- down_write(&c->state_lock); +- +- if (!test_bit(BCH_FS_STARTED, &c->flags)) { +- up_write(&c->state_lock); ++ if (c) + closure_put(&c->cl); +- pr_err("err mounting %s: incomplete filesystem", dev_name); +- return ERR_PTR(-EINVAL); +- } +- +- up_write(&c->state_lock); +- +- set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); +- return c; ++ return c ?: ERR_PTR(-ENOENT); + } + +-static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, +- struct bch_opts opts) ++static char **split_devs(const char *_dev_name, unsigned *nr) + { + char *dev_name = NULL, **devs = NULL, *s; +- struct bch_fs *c = ERR_PTR(-ENOMEM); + size_t i, nr_devs = 0; + + dev_name = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) +- goto err; ++ return NULL; + + for (s = dev_name; s; s = strchr(s + 1, ':')) + nr_devs++; + +- devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); +- if (!devs) +- goto err; ++ devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); ++ if (!devs) { ++ kfree(dev_name); ++ return NULL; ++ } + + for (i = 0, s = dev_name; + s; + (s = strchr(s, ':')) && (*s++ = '\0')) + devs[i++] = s; + +- c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); +-err: +- kfree(devs); +- kfree(dev_name); +- return c; ++ *nr = nr_devs; ++ return devs; + } + + static int bch2_remount(struct super_block *sb, int *flags, char *data) +@@ -1470,6 +1415,13 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) + return 0; + } + ++static void bch2_put_super(struct super_block *sb) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ __bch2_fs_stop(c); ++} ++ + static const struct super_operations bch_super_operations = { + .alloc_inode = bch2_alloc_inode, + .destroy_inode = bch2_destroy_inode, +@@ -1480,24 +1432,39 @@ static const struct super_operations bch_super_operations = { + .show_devname = bch2_show_devname, + .show_options = bch2_show_options, + .remount_fs = bch2_remount, +-#if 0 + .put_super = bch2_put_super, ++#if 0 + .freeze_fs = bch2_freeze, + .unfreeze_fs = bch2_unfreeze, + #endif + }; + +-static int bch2_test_super(struct super_block *s, void *data) +-{ +- return s->s_fs_info == data; +-} +- + static int bch2_set_super(struct super_block *s, void *data) + { + s->s_fs_info = data; + return 0; + } + ++static int bch2_noset_super(struct super_block *s, void *data) ++{ ++ return -EBUSY; ++} ++ ++static int bch2_test_super(struct super_block *s, void *data) ++{ ++ struct bch_fs *c = s->s_fs_info; ++ struct bch_fs **devs = data; ++ unsigned i; ++ ++ if (!c) ++ return false; ++ ++ for (i = 0; devs[i]; i++) ++ if (c != devs[i]) ++ return false; ++ return true; ++} ++ + static struct dentry *bch2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) + { +@@ -1506,7 +1473,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, + struct super_block *sb; + struct inode *vinode; + struct bch_opts opts = bch2_opts_empty(); +- unsigned i; ++ char **devs; ++ struct bch_fs **devs_to_fs = NULL; ++ unsigned i, nr_devs; + int ret; + + opt_set(opts, read_only, (flags & SB_RDONLY) != 0); +@@ -1515,21 +1484,41 @@ 
static struct dentry *bch2_mount(struct file_system_type *fs_type, + if (ret) + return ERR_PTR(ret); + +- c = bch2_open_as_blockdevs(dev_name, opts); +- if (IS_ERR(c)) +- return ERR_CAST(c); ++ devs = split_devs(dev_name, &nr_devs); ++ if (!devs) ++ return ERR_PTR(-ENOMEM); + +- sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); +- if (IS_ERR(sb)) { +- closure_put(&c->cl); +- return ERR_CAST(sb); ++ devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); ++ if (!devs_to_fs) { ++ sb = ERR_PTR(-ENOMEM); ++ goto got_sb; + } + +- BUG_ON(sb->s_fs_info != c); ++ for (i = 0; i < nr_devs; i++) ++ devs_to_fs[i] = bch2_path_to_fs(devs[i]); + +- if (sb->s_root) { +- closure_put(&c->cl); ++ sb = sget(fs_type, bch2_test_super, bch2_noset_super, ++ flags|SB_NOSEC, devs_to_fs); ++ if (!IS_ERR(sb)) ++ goto got_sb; ++ ++ c = bch2_fs_open(devs, nr_devs, opts); ++ ++ if (!IS_ERR(c)) ++ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); ++ else ++ sb = ERR_CAST(c); ++got_sb: ++ kfree(devs_to_fs); ++ kfree(devs[0]); ++ kfree(devs); ++ ++ if (IS_ERR(sb)) ++ return ERR_CAST(sb); ++ ++ c = sb->s_fs_info; + ++ if (sb->s_root) { + if ((flags ^ sb->s_flags) & SB_RDONLY) { + ret = -EBUSY; + goto err_put_super; +@@ -1602,11 +1591,7 @@ static void bch2_kill_sb(struct super_block *sb) + struct bch_fs *c = sb->s_fs_info; + + generic_shutdown_super(sb); +- +- if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) +- bch2_fs_stop(c); +- else +- closure_put(&c->cl); ++ bch2_fs_free(c); + } + + static struct file_system_type bcache_fs_type = { +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index ff83abe318e8..ffa6ac8ef32b 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -458,7 +458,7 @@ int bch2_fs_read_write_early(struct bch_fs *c) + + /* Filesystem startup/shutdown: */ + +-static void bch2_fs_free(struct bch_fs *c) ++static void __bch2_fs_free(struct bch_fs *c) + { + unsigned i; + +@@ -514,10 +514,10 @@ static void bch2_fs_release(struct kobject *kobj) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + +- bch2_fs_free(c); ++ __bch2_fs_free(c); + } + +-void bch2_fs_stop(struct bch_fs *c) ++void __bch2_fs_stop(struct bch_fs *c) + { + struct bch_dev *ca; + unsigned i; +@@ -547,13 +547,6 @@ void bch2_fs_stop(struct bch_fs *c) + kobject_put(&c->opts_dir); + kobject_put(&c->internal); + +- mutex_lock(&bch_fs_list_lock); +- list_del(&c->list); +- mutex_unlock(&bch_fs_list_lock); +- +- closure_sync(&c->cl); +- closure_debug_destroy(&c->cl); +- + /* btree prefetch might have kicked off reads in the background: */ + bch2_btree_flush_all_reads(c); + +@@ -564,6 +557,22 @@ void bch2_fs_stop(struct bch_fs *c) + cancel_delayed_work_sync(&c->pd_controllers_update); + cancel_work_sync(&c->read_only_work); + ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (c->devs[i]) ++ bch2_free_super(&c->devs[i]->disk_sb); ++} ++ ++void bch2_fs_free(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ mutex_lock(&bch_fs_list_lock); ++ list_del(&c->list); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ closure_sync(&c->cl); ++ closure_debug_destroy(&c->cl); ++ + for (i = 0; i < c->sb.nr_devices; i++) + if (c->devs[i]) + bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); +@@ -573,6 +582,12 @@ void bch2_fs_stop(struct bch_fs *c) + kobject_put(&c->kobj); + } + ++void bch2_fs_stop(struct bch_fs *c) ++{ ++ __bch2_fs_stop(c); ++ bch2_fs_free(c); ++} ++ + static const char *bch2_fs_online(struct bch_fs *c) + { + struct bch_dev *ca; +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +index 
3adab04d236a..2820ca110598 100644 +--- a/fs/bcachefs/super.h ++++ b/fs/bcachefs/super.h +@@ -230,6 +230,8 @@ static inline void bch2_fs_lazy_rw(struct bch_fs *c) + bch2_fs_read_write_early(c); + } + ++void __bch2_fs_stop(struct bch_fs *); ++void bch2_fs_free(struct bch_fs *); + void bch2_fs_stop(struct bch_fs *); + + int bch2_fs_start(struct bch_fs *); +-- +cgit v1.2.3 + + +From 0b42b6147f10905dae556188002a9384f5f950f5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 6 Oct 2020 22:18:21 -0400 +Subject: bcachefs: Fix journal_seq_copy() + +We also need to update the journal's bloom filter of inode numbers that +each journal write has upudates for - in case the inode gets evicted +before it gets fsynced. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 25 ++++++++++++++----------- + fs/bcachefs/journal.c | 15 +++++++++++++++ + fs/bcachefs/journal.h | 1 + + 3 files changed, 30 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 292329a1486b..f4a101a68fc5 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -40,7 +40,8 @@ static void bch2_vfs_inode_init(struct bch_fs *, + struct bch_inode_info *, + struct bch_inode_unpacked *); + +-static void journal_seq_copy(struct bch_inode_info *dst, ++static void journal_seq_copy(struct bch_fs *c, ++ struct bch_inode_info *dst, + u64 journal_seq) + { + u64 old, v = READ_ONCE(dst->ei_journal_seq); +@@ -51,6 +52,8 @@ static void journal_seq_copy(struct bch_inode_info *dst, + if (old >= journal_seq) + break; + } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); ++ ++ bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); + } + + static void __pagecache_lock_put(struct pagecache_lock *lock, long i) +@@ -294,12 +297,12 @@ err_before_quota: + if (!tmpfile) { + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); +- journal_seq_copy(dir, journal_seq); ++ journal_seq_copy(c, dir, journal_seq); + mutex_unlock(&dir->ei_update_lock); + } + + bch2_vfs_inode_init(c, inode, &inode_u); +- journal_seq_copy(inode, journal_seq); ++ journal_seq_copy(c, inode, journal_seq); + + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); + set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); +@@ -320,7 +323,7 @@ err_before_quota: + * We raced, another process pulled the new inode into cache + * before us: + */ +- journal_seq_copy(old, journal_seq); ++ journal_seq_copy(c, old, journal_seq); + make_bad_inode(&inode->v); + iput(&inode->v); + +@@ -416,7 +419,7 @@ static int __bch2_link(struct bch_fs *c, + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); + +- journal_seq_copy(inode, dir->ei_journal_seq); ++ journal_seq_copy(c, inode, dir->ei_journal_seq); + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); +@@ -473,7 +476,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); + +- journal_seq_copy(inode, dir->ei_journal_seq); ++ journal_seq_copy(c, inode, dir->ei_journal_seq); + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, +@@ -509,7 +512,7 @@ static int bch2_symlink(struct user_namespace *mnt_userns, + if (unlikely(ret)) + goto err; + +- journal_seq_copy(dir, inode->ei_journal_seq); ++ journal_seq_copy(c, dir, inode->ei_journal_seq); + + ret = __bch2_link(c, inode, dir, dentry); + if (unlikely(ret)) +@@ -609,22 
+612,22 @@ retry: + + bch2_inode_update_after_write(c, src_dir, &src_dir_u, + ATTR_MTIME|ATTR_CTIME); +- journal_seq_copy(src_dir, journal_seq); ++ journal_seq_copy(c, src_dir, journal_seq); + + if (src_dir != dst_dir) { + bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, + ATTR_MTIME|ATTR_CTIME); +- journal_seq_copy(dst_dir, journal_seq); ++ journal_seq_copy(c, dst_dir, journal_seq); + } + + bch2_inode_update_after_write(c, src_inode, &src_inode_u, + ATTR_CTIME); +- journal_seq_copy(src_inode, journal_seq); ++ journal_seq_copy(c, src_inode, journal_seq); + + if (dst_inode) { + bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, + ATTR_CTIME); +- journal_seq_copy(dst_inode, journal_seq); ++ journal_seq_copy(c, dst_inode, journal_seq); + } + err: + bch2_trans_exit(&trans); +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 210ad1b0c469..b8b719902c63 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -18,6 +18,8 @@ + + #include + ++static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64); ++ + static bool __journal_entry_is_open(union journal_res_state state) + { + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +@@ -305,6 +307,19 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode) + return seq; + } + ++void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq) ++{ ++ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); ++ struct journal_buf *buf; ++ ++ spin_lock(&j->lock); ++ ++ if ((buf = journal_seq_to_buf(j, seq))) ++ set_bit(h, buf->has_inode); ++ ++ spin_unlock(&j->lock); ++} ++ + static int __journal_res_get(struct journal *j, struct journal_res *res, + unsigned flags) + { +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 56438840efd7..f60bc964ee1f 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -147,6 +147,7 @@ static inline u64 journal_cur_seq(struct journal *j) + } + + u64 bch2_inode_journal_seq(struct journal *, u64); ++void bch2_journal_set_has_inum(struct journal *, u64, u64); + + static inline int journal_state_count(union journal_res_state s, int idx) + { +-- +cgit v1.2.3 + + +From 76f811c7eb6e31d1d55dd17cd705d8e501365cca Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 9 Oct 2020 00:09:20 -0400 +Subject: bcachefs: Fix __bch2_truncate_page() + +__bch2_truncate_page() will mark some of the blocks in a page as +unallocated. But, if the page is mmapped (and writable), every block in +the page needs to be marked dirty, else those blocks won't be written by +__bch2_writepage(). + +The solution is to change those userspace mappings to RO, so that we +force bch2_page_mkwrite() to be called again. 
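A toy user-space model of why the write-protect step matters; every name below is invented, and it only mimics the relationship between page_mkclean(), ->page_mkwrite() and per-block dirty state described above:

#include <stdbool.h>
#include <stdio.h>

#define BLOCKS_PER_PAGE 4

struct page_state {
	bool block_dirty[BLOCKS_PER_PAGE];
	bool pte_writable;		/* userspace holds a writable mapping */
};

/* models ->page_mkwrite(): the write fault re-dirties the whole page */
static void fault_mkwrite(struct page_state *p)
{
	for (int i = 0; i < BLOCKS_PER_PAGE; i++)
		p->block_dirty[i] = true;
	p->pte_writable = true;
}

/* models page_mkclean(): write-protect all mappings of the page */
static void mkclean(struct page_state *p)
{
	p->pte_writable = false;
}

/* models the truncate: some blocks become unallocated and clean */
static void truncate_tail(struct page_state *p, int first, bool use_mkclean)
{
	for (int i = first; i < BLOCKS_PER_PAGE; i++)
		p->block_dirty[i] = false;
	if (use_mkclean)
		mkclean(p);
}

/* a userspace store through the mapping */
static void mmap_store(struct page_state *p)
{
	if (!p->pte_writable)
		fault_mkwrite(p);	/* only a new fault re-dirties every block */
	/* a store through a still-writable PTE leaves block_dirty[] untouched */
}

int main(void)
{
	struct page_state p = { 0 };

	fault_mkwrite(&p);		/* initial mmapped write */
	truncate_tail(&p, 2, true);	/* with the fix: the next store refaults */
	mmap_store(&p);

	for (int i = 0; i < BLOCKS_PER_PAGE; i++)
		printf("block %d dirty=%d\n", i, p.block_dirty[i]);
	return 0;
}

Without the mkclean() step the final store would go through the still-writable mapping, no fault would run, and blocks 2..3 would stay clean and be skipped by writeback, which is the bug being fixed.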
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index c5aa63e4044c..e0e53ec2d2cf 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -2214,6 +2215,12 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, + ret = bch2_get_page_disk_reservation(c, inode, page, false); + BUG_ON(ret); + ++ /* ++ * This removes any writeable userspace mappings; we need to force ++ * .page_mkwrite to be called again before any mmapped writes, to ++ * redirty the full page: ++ */ ++ page_mkclean(page); + __set_page_dirty_nobuffers(page); + unlock: + unlock_page(page); +-- +cgit v1.2.3 + + +From 7bfdeea62f0cef9fe31c5cbb64b0f017a8ae6f3e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 11 Oct 2020 16:33:49 -0400 +Subject: bcachefs: Fix a lockdep splat + +We can't allocate memory with GFP_FS while holding the btree cache lock, +and vfree() can allocate memory. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 736671112861..3a43b8ecef27 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -348,11 +348,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) + { + struct btree_cache *bc = &c->btree_cache; + struct btree *b; +- unsigned i; ++ unsigned i, flags; + + if (bc->shrink.list.next) + unregister_shrinker(&bc->shrink); + ++ /* vfree() can allocate memory: */ ++ flags = memalloc_nofs_save(); + mutex_lock(&bc->lock); + + #ifdef CONFIG_BCACHEFS_DEBUG +@@ -388,6 +390,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) + } + + mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); + + if (bc->table_init_done) + rhashtable_destroy(&bc->table); +-- +cgit v1.2.3 + + +From e90999d635c54c3db9049d9860d6de84309d1d01 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 13 Oct 2020 00:06:36 -0400 +Subject: bcachefs: Fix off-by-one error in ptr gen check + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 97a8af31ded1..797114353aa2 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -949,7 +949,7 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, + return -EIO; + } + +- if (gen_cmp(bucket_gen, p.ptr.gen) >= 96U) { ++ if (gen_cmp(bucket_gen, p.ptr.gen) > 96U) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", +-- +cgit v1.2.3 + + +From 5f0870ce7d2836aa09bbacfc82aa7386788ffa19 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 13 Oct 2020 03:58:50 -0400 +Subject: bcachefs: Fix gc of stale ptr gens + +Awhile back, gcing of stale pointers was split out from full +mark-and-sweep gc - but, the bit to actually drop those stale pointers +wasn't implemnted. Whoops. 
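The gen_after()/gen_cmp() checks in the hunk below (and in the bucket-reference checks later in this series) compare 8-bit generation numbers that are allowed to wrap. A sketch of the usual idiom, which may differ in detail from the real bcachefs macros: the difference is reinterpreted as a signed 8-bit value, so "newer than" keeps working across the 255 to 0 wraparound.

#include <assert.h>
#include <stdint.h>

static inline int8_t gen_cmp(uint8_t a, uint8_t b)
{
	/* relies on the usual two's-complement narrowing, as kernel code does */
	return (int8_t)(a - b);		/* positive: a is newer, negative: a is older */
}

static inline int gen_after(uint8_t a, uint8_t b)
{
	return gen_cmp(a, b) > 0;
}

int main(void)
{
	assert(gen_after(1, 0));	/* trivially newer */
	assert(gen_after(2, 250));	/* still newer across the wraparound */
	assert(!gen_after(250, 2));
	assert(gen_cmp(100, 4) == 96);	/* the "96 gens stale" threshold used in this series */
	return 0;
}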
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 66 +++++++++++++++++++++++++++++++++++++++----------- + 1 file changed, 52 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 4f581130270c..2aa8140aec32 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -8,6 +8,7 @@ + #include "alloc_background.h" + #include "alloc_foreground.h" + #include "bkey_methods.h" ++#include "bkey_on_stack.h" + #include "btree_locking.h" + #include "btree_update_interior.h" + #include "btree_io.h" +@@ -888,40 +889,77 @@ out: + return ret; + } + ++static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ percpu_down_read(&c->mark_lock); ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ ++ if (gen_after(g->mark.gen, ptr->gen) > 16) { ++ percpu_up_read(&c->mark_lock); ++ return true; ++ } ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ ++ if (gen_after(g->gc_gen, ptr->gen)) ++ g->gc_gen = ptr->gen; ++ } ++ percpu_up_read(&c->mark_lock); ++ ++ return false; ++} ++ + /* + * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree + * node pointers currently never have cached pointers that can become stale: + */ +-static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id) ++static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- int ret; ++ struct bkey_on_stack sk; ++ int ret = 0; + ++ bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr; ++ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, ++ BTREE_ITER_PREFETCH); + +- percpu_down_read(&c->mark_lock); +- bkey_for_each_ptr(ptrs, ptr) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k))) { ++ if (gc_btree_gens_key(c, k)) { ++ bkey_on_stack_reassemble(&sk, c, k); ++ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + +- if (gen_after(g->gc_gen, ptr->gen)) +- g->gc_gen = ptr->gen; ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + +- if (gen_after(g->mark.gen, ptr->gen) > 32) { +- /* rewrite btree node */ ++ bch2_trans_update(&trans, iter, sk.k, 0); + ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++ if (ret == -EINTR) ++ continue; ++ if (ret) { ++ break; + } + } +- percpu_up_read(&c->mark_lock); ++ ++ bch2_btree_iter_next(iter); + } + + bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ + return ret; + } + +-- +cgit v1.2.3 + + +From 873ba1b32eea18c8f960f52058e0ab412270c889 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 10 Jul 2020 19:49:34 -0400 +Subject: bcachefs: Copy ptr->cached when migrating data + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 20 +++++++++++++------- + 1 file changed, 13 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 2f3be487ef65..7019f1132086 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -95,10 +95,6 @@ static 
int bch2_migrate_index_update(struct bch_write_op *op) + !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) + goto nomatch; + +- if (m->data_cmd == DATA_REWRITE && +- !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) +- goto nomatch; +- + bkey_reassemble(&_insert.k, k); + insert = &_insert.k; + +@@ -110,9 +106,19 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + bch2_cut_back(new->k.p, insert); + bch2_cut_back(insert->k.p, &new->k_i); + +- if (m->data_cmd == DATA_REWRITE) +- bch2_bkey_drop_device(bkey_i_to_s(insert), +- m->data_opts.rewrite_dev); ++ if (m->data_cmd == DATA_REWRITE) { ++ struct bch_extent_ptr *new_ptr, *old_ptr = (void *) ++ bch2_bkey_has_device(bkey_i_to_s_c(insert), ++ m->data_opts.rewrite_dev); ++ if (!old_ptr) ++ goto nomatch; ++ ++ if (old_ptr->cached) ++ extent_for_each_ptr(extent_i_to_s(new), new_ptr) ++ new_ptr->cached = true; ++ ++ bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); ++ } + + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { + if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { +-- +cgit v1.2.3 + + +From 1a5799a69acc2e5051146107913bd76e67ffe3fa Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 15 Oct 2020 15:58:36 -0400 +Subject: bcachefs: Fix errors early in the fs init process + +At some point bch2_fs_alloc() was changed to always call bch2_fs_free() +in the error path, which means we need c->cl to always be initialized. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super.c | 20 ++++++++------------ + 1 file changed, 8 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index ffa6ac8ef32b..3a2b2a050961 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -645,6 +645,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + __module_get(THIS_MODULE); + ++ closure_init(&c->cl, NULL); ++ ++ c->kobj.kset = bcachefs_kset; ++ kobject_init(&c->kobj, &bch2_fs_ktype); ++ kobject_init(&c->internal, &bch2_fs_internal_ktype); ++ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); ++ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); ++ + c->minor = -1; + c->disk_sb.fs_sb = true; + +@@ -775,18 +783,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_dev_alloc(c, i)) + goto err; + +- /* +- * Now that all allocations have succeeded, init various refcounty +- * things that let us shutdown: +- */ +- closure_init(&c->cl, NULL); +- +- c->kobj.kset = bcachefs_kset; +- kobject_init(&c->kobj, &bch2_fs_ktype); +- kobject_init(&c->internal, &bch2_fs_internal_ktype); +- kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); +- kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); +- + mutex_lock(&bch_fs_list_lock); + err = bch2_fs_online(c); + mutex_unlock(&bch_fs_list_lock); +-- +cgit v1.2.3 + + +From aae6984e75fc452208c29d1d71023fcb2f7534d9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 15 Oct 2020 21:48:58 -0400 +Subject: bcachefs: Fix another lockdep splat + +vfree() can allocate memory, so we need to call memalloc_nofs_save(). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 3a43b8ecef27..bb94fa2341ee 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -252,7 +252,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + unsigned long can_free; + unsigned long touched = 0; + unsigned long freed = 0; +- unsigned i; ++ unsigned i, flags; + + if (btree_shrinker_disabled(c)) + return SHRINK_STOP; +@@ -263,6 +263,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + else if (!mutex_trylock(&bc->lock)) + return -1; + ++ flags = memalloc_nofs_save(); ++ + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. The reserve is how we +@@ -326,6 +328,7 @@ restart: + clear_btree_node_accessed(b); + } + ++ memalloc_nofs_restore(flags); + mutex_unlock(&bc->lock); + out: + return (unsigned long) freed * btree_pages(c); +-- +cgit v1.2.3 + + +From 0c64689579a98e3b81526b915ee11e937d5f2b72 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 15 Oct 2020 22:23:02 -0400 +Subject: bcachefs: Fix copygc of compressed data + +The check for when we need to get a disk reservation was wrong. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 7019f1132086..1ffb14a22f94 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -297,14 +297,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + unsigned compressed_sectors = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- if (!p.ptr.cached && +- crc_is_compressed(p.crc) && +- bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) ++ if (p.ptr.dev == data_opts.rewrite_dev && ++ !p.ptr.cached && ++ crc_is_compressed(p.crc)) + compressed_sectors += p.crc.compressed_size; + + if (compressed_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, +- compressed_sectors, ++ k.k->size * m->op.nr_replicas, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; +-- +cgit v1.2.3 + + +From 209cbef9df47395fd33585db9546dc79b0f95d73 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 15 Oct 2020 22:50:48 -0400 +Subject: bcachefs: Fix copygc dying on startup + +The copygc threads errors out and makes the filesystem go RO if it ever +tries to run and discovers it has no reserve allocated - which is a +problem if it races with the allocator thread and its reserve hasn't +been filled yet. + +The allocator thread doesn't start filling the copygc reserve until +after BCH_FS_STARTED has been set, so make sure to wake up the allocator +threads after setting that and before starting copygc. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.h | 4 +++- + fs/bcachefs/super.c | 7 +++++++ + 2 files changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index f6b9f27f0713..4f462696b747 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -61,8 +61,10 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) + + rcu_read_lock(); + p = rcu_dereference(ca->alloc_thread); +- if (p) ++ if (p) { + wake_up_process(p); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ } + rcu_read_unlock(); + } + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 3a2b2a050961..1247b3a196ac 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -878,6 +878,13 @@ int bch2_fs_start(struct bch_fs *c) + + set_bit(BCH_FS_STARTED, &c->flags); + ++ /* ++ * Allocator threads don't start filling copygc reserve until after we ++ * set BCH_FS_STARTED - wake them now: ++ */ ++ for_each_online_member(ca, c, i) ++ bch2_wake_allocator(ca); ++ + if (c->opts.read_only || c->opts.nochanges) { + bch2_fs_read_only(c); + } else { +-- +cgit v1.2.3 + + +From e8ef0f1417f343303a6c95e3b58e8c3ecc795f29 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Oct 2020 21:49:13 -0400 +Subject: bcachefs: Disable preemption around write_seqcount() lock + +Not sure why the lock doesn't do this itself... + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 2 ++ + fs/bcachefs/buckets.c | 2 ++ + 2 files changed, 4 insertions(+) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 2aa8140aec32..c6f1acb97803 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -37,9 +37,11 @@ + + static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) + { ++ preempt_disable(); + write_seqcount_begin(&c->gc_pos_lock); + c->gc_pos = new_pos; + write_seqcount_end(&c->gc_pos_lock); ++ preempt_enable(); + } + + static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 797114353aa2..6c6ce0aaae2e 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -254,6 +254,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) + + BUG_ON(idx >= 2); + ++ preempt_disable(); + write_seqcount_begin(&c->usage_lock); + + acc_u64s_percpu((u64 *) c->usage_base, +@@ -261,6 +262,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) + percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + + write_seqcount_end(&c->usage_lock); ++ preempt_enable(); + } + + void bch2_fs_usage_to_text(struct printbuf *out, +-- +cgit v1.2.3 + + +From 1ac97c253785644e055f2e7ea70c50e4d680dd7b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Oct 2020 21:32:02 -0400 +Subject: bcachefs: Perf improvements for bch_alloc_read() + +On large filesystems reading in the alloc info takes a significant +amount of time. But we don't need to be calling into the fully general +bch2_mark_key() path, just open code what we need in +bch2_alloc_read_fn(). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 26 ++++++++++++++++++++++---- + fs/bcachefs/btree_types.h | 2 -- + fs/bcachefs/buckets.c | 4 ++-- + fs/bcachefs/ec.c | 1 - + 4 files changed, 24 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index d5f0607733fe..23f2c353f5ad 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -209,10 +209,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_s_c k) + { +- if (!level) +- bch2_mark_key(c, k, 0, 0, NULL, 0, +- BTREE_TRIGGER_ALLOC_READ| +- BTREE_TRIGGER_NOATOMIC); ++ struct bch_dev *ca; ++ struct bucket *g; ++ struct bkey_alloc_unpacked u; ++ ++ if (level || k.k->type != KEY_TYPE_alloc) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ g = __bucket(ca, k.k->p.offset, 0); ++ u = bch2_alloc_unpack(k); ++ ++ g->_mark.gen = u.gen; ++ g->_mark.data_type = u.data_type; ++ g->_mark.dirty_sectors = u.dirty_sectors; ++ g->_mark.cached_sectors = u.cached_sectors; ++ g->io_time[READ] = u.read_time; ++ g->io_time[WRITE] = u.write_time; ++ g->oldest_gen = u.oldest_gen; ++ g->gen_valid = 1; + + return 0; + } +@@ -223,8 +238,11 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + unsigned i; + int ret = 0; + ++ down_read(&c->gc_lock); + ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, + NULL, bch2_alloc_read_fn); ++ up_read(&c->gc_lock); ++ + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); + return ret; +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 683b416ef427..c1717b7c8c38 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -602,7 +602,6 @@ enum btree_trigger_flags { + + __BTREE_TRIGGER_GC, + __BTREE_TRIGGER_BUCKET_INVALIDATE, +- __BTREE_TRIGGER_ALLOC_READ, + __BTREE_TRIGGER_NOATOMIC, + }; + +@@ -614,7 +613,6 @@ enum btree_trigger_flags { + + #define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) + #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) +-#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) + #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + + static inline bool btree_node_type_needs_gc(enum btree_node_type type) +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 6c6ce0aaae2e..c3fc3abbc0dc 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -484,6 +484,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + bch2_wake_allocator(ca); + } + ++__flatten + void bch2_dev_usage_from_buckets(struct bch_fs *c) + { + struct bch_dev *ca; +@@ -757,8 +758,7 @@ static int bch2_mark_alloc(struct bch_fs *c, + } + })); + +- if (!(flags & BTREE_TRIGGER_ALLOC_READ)) +- bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); ++ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); + + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index abb714456060..39abee9f0fb2 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1497,7 +1497,6 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, + + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: + bch2_mark_key(c, k, 0, 0, NULL, 0, +- BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); + if (ret) + return ret; +-- +cgit v1.2.3 + + +From 
0f9b0a20078ac20cccd5172a8cce4f2c0058e60c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 16 Dec 2020 13:35:16 -0500 +Subject: bcachefs: Fix assertion popping in transaction commit path + +We can't be holding read locks on btree nodes when we go to take write +locks: this would deadlock if another thread is holding an intent lock +on the node we have a read lock on, and it tries to commit and upgrade +to a write lock. + +But instead of triggering an assertion, if this happens we can just +upgrade the read lock to an intent lock. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 16 +++++++++++++--- + 1 file changed, 13 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index cd699c257244..83c60b04bac1 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -503,6 +503,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + /* + * Can't be holding any read locks when we go to take write locks: ++ * another thread could be holding an intent lock on the same node we ++ * have a read lock on, and it'll block trying to take a write lock ++ * (because we hold a read lock) and it could be blocking us by holding ++ * its own read lock (while we're trying to to take write locks). + * + * note - this must be done after bch2_trans_journal_preres_get_cold() + * or anything else that might call bch2_trans_relock(), since that +@@ -510,9 +514,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + */ + trans_for_each_iter(trans, iter) { + if (iter->nodes_locked != iter->nodes_intent_locked) { +- EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); +- EBUG_ON(trans->iters_live & (1ULL << iter->idx)); +- bch2_btree_iter_unlock_noinline(iter); ++ if ((iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || ++ (trans->iters_live & (1ULL << iter->idx))) { ++ if (!bch2_btree_iter_upgrade(iter, 1)) { ++ trace_trans_restart_upgrade(trans->ip); ++ return -EINTR; ++ } ++ } else { ++ bch2_btree_iter_unlock_noinline(iter); ++ } + } + } + +-- +cgit v1.2.3 + + +From 48f77cd4983d3670bf3ff9345337c98b140e912c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Oct 2020 21:36:26 -0400 +Subject: bcachefs: Improvements to writing alloc info + +Now that we've got transactional alloc info updates (and have for +awhile), we don't need to write it out on shutdown, and we don't need to +write it out on startup except when GC found errors - this is a big +improvement to mount/unmount performance. + +This patch also fixes a few bugs where we weren't writing out alloc +info (on new filesystems, and new devices) and should have been. 
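A stand-alone sketch of the return convention the recovery changes below rely on: mark-and-sweep returns a negative error, 0 when everything matched, or a positive value when it had to repair in-memory counters, and only the repaired case forces alloc info to be rewritten. The function names here are invented stand-ins, not the real bcachefs entry points:

#include <stdbool.h>
#include <stdio.h>

/* stands in for gc: <0 error, 0 clean, >0 repaired something */
static int run_gc(bool counters_wrong)
{
	return counters_wrong ? 1 : 0;
}

static int recovery(bool counters_wrong)
{
	bool need_write_alloc = false;
	int ret = run_gc(counters_wrong);

	if (ret < 0)
		return ret;			/* hard error: abort recovery */
	if (ret)
		need_write_alloc = true;	/* repaired: must persist alloc info */

	if (need_write_alloc)
		printf("writing allocation info\n");
	else
		printf("alloc info unchanged, skipping the write\n");
	return 0;
}

int main(void)
{
	recovery(false);
	recovery(true);
	return 0;
}

Treating "repaired" as a distinct positive return lets the clean-shutdown case skip the alloc write entirely, which is where the mount/unmount speedup comes from.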
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 68 ++++++++++++++++++------------------------ + fs/bcachefs/alloc_background.h | 3 +- + fs/bcachefs/btree_gc.c | 5 +++- + fs/bcachefs/ec.c | 4 +-- + fs/bcachefs/ec.h | 2 +- + fs/bcachefs/recovery.c | 38 +++++++++++++++++------ + fs/bcachefs/super.c | 21 ++++--------- + 7 files changed, 71 insertions(+), 70 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 23f2c353f5ad..f03f4826e68b 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -271,12 +271,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + return 0; + } + +-enum alloc_write_ret { +- ALLOC_WROTE, +- ALLOC_NOWROTE, +- ALLOC_END, +-}; +- + static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + unsigned flags) +@@ -306,26 +300,17 @@ retry: + + old_u = bch2_alloc_unpack(k); + +- if (iter->pos.inode >= c->sb.nr_devices || +- !c->devs[iter->pos.inode]) +- return ALLOC_END; +- + percpu_down_read(&c->mark_lock); + ca = bch_dev_bkey_exists(c, iter->pos.inode); + ba = bucket_array(ca); + +- if (iter->pos.offset >= ba->nbuckets) { +- percpu_up_read(&c->mark_lock); +- return ALLOC_END; +- } +- + g = &ba->b[iter->pos.offset]; + m = READ_ONCE(g->mark); + new_u = alloc_mem_to_key(g, m); + percpu_up_read(&c->mark_lock); + + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) +- return ALLOC_NOWROTE; ++ return 0; + + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; +@@ -343,50 +328,55 @@ err: + return ret; + } + +-int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) ++int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) + { + struct btree_trans trans; + struct btree_iter *iter; +- struct bch_dev *ca; +- unsigned i; ++ u64 first_bucket, nbuckets; + int ret = 0; + ++ percpu_down_read(&c->mark_lock); ++ first_bucket = bucket_array(ca)->first_bucket; ++ nbuckets = bucket_array(ca)->nbuckets; ++ percpu_up_read(&c->mark_lock); ++ + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, first_bucket), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + +- for_each_rw_member(ca, c, i) { +- unsigned first_bucket; ++ while (iter->pos.offset < nbuckets) { ++ bch2_trans_cond_resched(&trans); + +- percpu_down_read(&c->mark_lock); +- first_bucket = bucket_array(ca)->first_bucket; +- percpu_up_read(&c->mark_lock); ++ ret = bch2_alloc_write_key(&trans, iter, flags); ++ if (ret) ++ break; ++ bch2_btree_iter_next_slot(iter); ++ } + +- bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); ++ bch2_trans_exit(&trans); + +- while (1) { +- bch2_trans_cond_resched(&trans); ++ return ret; ++} + +- ret = bch2_alloc_write_key(&trans, iter, flags); +- if (ret < 0 || ret == ALLOC_END) +- break; +- if (ret == ALLOC_WROTE) +- *wrote = true; +- bch2_btree_iter_next_slot(iter); +- } ++int bch2_alloc_write(struct bch_fs *c, unsigned flags) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; + +- if (ret < 0) { ++ for_each_rw_member(ca, c, i) { ++ bch2_dev_alloc_write(c, ca, flags); ++ if (ret) { + percpu_ref_put(&ca->io_ref); + break; + } + } + +- bch2_trans_exit(&trans); +- +- return ret < 0 ? 
ret : 0; ++ return ret; + } + + /* Bucket IO clocks: */ +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 4f462696b747..56a846fde8dd 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -93,7 +93,8 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); + void bch2_dev_allocator_stop(struct bch_dev *); + int bch2_dev_allocator_start(struct bch_dev *); + +-int bch2_alloc_write(struct bch_fs *, unsigned, bool *); ++int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned); ++int bch2_alloc_write(struct bch_fs *, unsigned); + void bch2_fs_allocator_background_init(struct bch_fs *); + + #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index c6f1acb97803..e8c1e752a25d 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -570,6 +570,7 @@ static int bch2_gc_done(struct bch_fs *c, + fsck_err(c, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f); \ + dst->_f = src->_f; \ ++ ret = 1; \ + } + #define copy_stripe_field(_f, _msg, ...) \ + if (dst->_f != src->_f) { \ +@@ -580,6 +581,7 @@ static int bch2_gc_done(struct bch_fs *c, + dst->_f, src->_f); \ + dst->_f = src->_f; \ + dst->dirty = true; \ ++ ret = 1; \ + } + #define copy_bucket_field(_f) \ + if (dst->b[b].mark._f != src->b[b].mark._f) { \ +@@ -590,6 +592,7 @@ static int bch2_gc_done(struct bch_fs *c, + bch2_data_types[dst->b[b].mark.data_type],\ + dst->b[b].mark._f, src->b[b].mark._f); \ + dst->b[b]._mark._f = src->b[b].mark._f; \ ++ ret = 1; \ + } + #define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) +@@ -1396,7 +1399,7 @@ static int bch2_gc_thread(void *arg) + #else + ret = bch2_gc_gens(c); + #endif +- if (ret) ++ if (ret < 0) + bch_err(c, "btree gc failed: %i", ret); + + debug_check_no_locks_held(); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 39abee9f0fb2..ad08abfde843 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1448,7 +1448,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, + return 0; + } + +-int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) ++int bch2_stripes_write(struct bch_fs *c, unsigned flags) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -1476,8 +1476,6 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) + + if (ret) + break; +- +- *wrote = true; + } + + bch2_trans_exit(&trans); +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index f8fc3d616cd7..6db16cf768da 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -156,7 +156,7 @@ void bch2_ec_flush_new_stripes(struct bch_fs *); + + struct journal_keys; + int bch2_stripes_read(struct bch_fs *, struct journal_keys *); +-int bch2_stripes_write(struct bch_fs *, unsigned, bool *); ++int bch2_stripes_write(struct bch_fs *, unsigned); + + int bch2_ec_mem_alloc(struct bch_fs *, bool); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 6e829bf0a31f..d70fa968db50 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -845,9 +845,11 @@ static int verify_superblock_clean(struct bch_fs *c, + } + + mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, +- "superblock read clock doesn't match journal after clean shutdown"); ++ "superblock read clock %u doesn't match journal %u after clean shutdown", ++ clean->read_clock, j->read_clock); + mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, +- "superblock 
read clock doesn't match journal after clean shutdown"); ++ "superblock write clock %u doesn't match journal %u after clean shutdown", ++ clean->write_clock, j->write_clock); + + for (i = 0; i < BTREE_ID_NR; i++) { + char buf1[200], buf2[200]; +@@ -961,7 +963,7 @@ int bch2_fs_recovery(struct bch_fs *c) + const char *err = "cannot allocate memory"; + struct bch_sb_field_clean *clean = NULL; + u64 journal_seq; +- bool wrote = false, write_sb = false; ++ bool write_sb = false, need_write_alloc = false; + int ret; + + if (c->sb.clean) +@@ -1090,8 +1092,10 @@ int bch2_fs_recovery(struct bch_fs *c) + bch_info(c, "starting metadata mark and sweep"); + err = "error in mark and sweep"; + ret = bch2_gc(c, &c->journal_keys, true, true); +- if (ret) ++ if (ret < 0) + goto err; ++ if (ret) ++ need_write_alloc = true; + bch_verbose(c, "mark and sweep done"); + } + +@@ -1101,8 +1105,10 @@ int bch2_fs_recovery(struct bch_fs *c) + bch_info(c, "starting mark and sweep"); + err = "error in mark and sweep"; + ret = bch2_gc(c, &c->journal_keys, true, false); +- if (ret) ++ if (ret < 0) + goto err; ++ if (ret) ++ need_write_alloc = true; + bch_verbose(c, "mark and sweep done"); + } + +@@ -1126,7 +1132,7 @@ int bch2_fs_recovery(struct bch_fs *c) + goto err; + bch_verbose(c, "journal replay done"); + +- if (!c->opts.nochanges) { ++ if (need_write_alloc && !c->opts.nochanges) { + /* + * note that even when filesystem was clean there might be work + * to do here, if we ran gc (because of fsck) which recalculated +@@ -1134,8 +1140,8 @@ int bch2_fs_recovery(struct bch_fs *c) + */ + bch_verbose(c, "writing allocation info"); + err = "error writing out alloc info"; +- ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: +- bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); ++ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?: ++ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); + if (ret) { + bch_err(c, "error writing alloc info"); + goto err; +@@ -1281,6 +1287,20 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_fs_journal_start(&c->journal, 1, &journal); + bch2_journal_set_replay_done(&c->journal); + ++ err = "error going read-write"; ++ ret = bch2_fs_read_write_early(c); ++ if (ret) ++ goto err; ++ ++ /* ++ * Write out the superblock and journal buckets, now that we can do ++ * btree updates ++ */ ++ err = "error writing alloc info"; ++ ret = bch2_alloc_write(c, 0); ++ if (ret) ++ goto err; ++ + bch2_inode_init(c, &root_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; +@@ -1289,7 +1309,7 @@ int bch2_fs_initialize(struct bch_fs *c) + err = "error creating root directory"; + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &packed_inode.inode.k_i, +- NULL, NULL, BTREE_INSERT_LAZY_RW); ++ NULL, NULL, 0); + if (ret) + goto err; + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 1247b3a196ac..bd3da41dd159 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -169,9 +169,7 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) + static void __bch2_fs_read_only(struct bch_fs *c) + { + struct bch_dev *ca; +- bool wrote = false; + unsigned i, clean_passes = 0; +- int ret; + + bch2_rebalance_stop(c); + bch2_copygc_stop(c); +@@ -190,20 +188,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) + if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) + goto nowrote_alloc; + +- bch_verbose(c, "writing alloc info"); +- /* +- * This should normally just be writing the bucket read/write clocks: +- */ +- ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) 
?: +- bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); +- bch_verbose(c, "writing alloc info complete"); +- +- if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) +- bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); +- +- if (ret) +- goto nowrote_alloc; +- + bch_verbose(c, "flushing journal and stopping allocators"); + + bch2_journal_flush_all_pins(&c->journal); +@@ -1658,6 +1642,11 @@ have_slot: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + ++ err = "alloc write failed"; ++ ret = bch2_dev_alloc_write(c, ca, 0); ++ if (ret) ++ goto err; ++ + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + err = __bch2_dev_read_write(c, ca); + if (err) +-- +cgit v1.2.3 + + +From aa48e858227291802857ac51e80cc61ae9a740df Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 17 Oct 2020 16:44:27 -0400 +Subject: bcachefs: Start/stop io clock hands in read/write paths + +This fixes a bug where the clock hands in the journal and superblock +didn't match, because we were still incrementing the read clock hand +while read-only. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 12 ------------ + fs/bcachefs/super.c | 6 ++++++ + 2 files changed, 6 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index f03f4826e68b..a4daaee74138 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1267,18 +1267,6 @@ void bch2_recalc_capacity(struct bch_fs *c) + + c->bucket_size_max = bucket_size_max; + +- if (c->capacity) { +- bch2_io_timer_add(&c->io_clock[READ], +- &c->bucket_clock[READ].rescale); +- bch2_io_timer_add(&c->io_clock[WRITE], +- &c->bucket_clock[WRITE].rescale); +- } else { +- bch2_io_timer_del(&c->io_clock[READ], +- &c->bucket_clock[READ].rescale); +- bch2_io_timer_del(&c->io_clock[WRITE], +- &c->bucket_clock[WRITE].rescale); +- } +- + /* Wake up case someone was waiting for buckets */ + closure_wake_up(&c->freelist_wait); + } +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index bd3da41dd159..8489b96e758f 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -175,6 +175,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) + bch2_copygc_stop(c); + bch2_gc_thread_stop(c); + ++ bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); ++ bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); ++ + /* + * Flush journal before stopping allocators, because flushing journal + * blacklist entries involves allocating new btree nodes: +@@ -400,6 +403,9 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + ++ bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); ++ bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); ++ + for_each_rw_member(ca, c, i) { + ret = bch2_dev_allocator_start(ca); + if (ret) { +-- +cgit v1.2.3 + + +From 75a4933648d31d052a7b6825cdfe4af0f2b7772a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 19 Oct 2020 22:36:24 -0400 +Subject: bcachefs: Fix for bad stripe pointers + +The allocator usually doesn't increment bucket gens right away on +buckets that it's about to hand out (for reasons that need to be +documented), instead deferring that to whatever extent update first +references that bucket. + +But stripe pointers reference buckets without changing bucket sector +counts, meaning we could end up with a pointer in a stripe with a gen +newer than the bucket it points to. 
+ +Fix this by adding a transactional trigger for KEY_TYPE_stripe that just +writes out the keys in the alloc btree for the buckets it points to. + +Also - consolidate the code that checks pointer validity. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 - + fs/bcachefs/alloc_background.h | 3 + + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update_leaf.c | 5 +- + fs/bcachefs/buckets.c | 283 ++++++++++++++++++++++++++-------------- + 5 files changed, 190 insertions(+), 104 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index a4daaee74138..1f6b2742efd9 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -497,8 +497,6 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) + * commands to the newly free buckets, then puts them on the various freelists. + */ + +-#define BUCKET_GC_GEN_MAX 96U +- + /** + * wait_buckets_available - wait on reclaimable buckets + * +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 56a846fde8dd..66ce54724e93 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -13,6 +13,9 @@ struct bkey_alloc_unpacked { + #undef x + }; + ++/* How out of date a pointer gen is allowed to be: */ ++#define BUCKET_GC_GEN_MAX 96U ++ + /* returns true if not equal */ + static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, + struct bkey_alloc_unpacked r) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index c1717b7c8c38..cc01baeec138 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -591,6 +591,7 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) + #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_EXTENTS)| \ + (1U << BKEY_TYPE_INODES)| \ ++ (1U << BKEY_TYPE_EC)| \ + (1U << BKEY_TYPE_REFLINK)) + + enum btree_trigger_flags { +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 83c60b04bac1..5dbb19ff11ae 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -337,8 +337,9 @@ static inline bool iter_has_trans_triggers(struct btree_iter *iter) + + static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) + { +- return (BTREE_NODE_TYPE_HAS_TRIGGERS & +- ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & ++ return (((BTREE_NODE_TYPE_HAS_TRIGGERS & ++ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) | ++ (1U << BTREE_ID_EC)) & + (1U << iter->btree_id); + } + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index c3fc3abbc0dc..2a3b95968a86 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -884,124 +884,140 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, + p.crc.uncompressed_size); + } + +-static void bucket_set_stripe(struct bch_fs *c, +- const struct bch_extent_ptr *ptr, +- struct bch_fs_usage *fs_usage, +- u64 journal_seq, +- unsigned flags, +- bool enabled) +-{ +- bool gc = flags & BTREE_TRIGGER_GC; +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, gc); +- struct bucket_mark new, old; +- +- old = bucket_cmpxchg(g, new, ({ +- new.stripe = enabled; +- if (journal_seq) { +- new.journal_seq_valid = 1; +- new.journal_seq = journal_seq; +- } +- })); +- +- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); +- +- /* +- * XXX write repair code for these, flag stripe as possibly bad +- */ +- if (old.gen != ptr->gen) +- bch2_fsck_err(c, 
FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "stripe with stale pointer"); +-#if 0 +- /* +- * We'd like to check for these, but these checks don't work +- * yet: +- */ +- if (old.stripe && enabled) +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "multiple stripes using same bucket"); +- +- if (!old.stripe && !enabled) +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "deleting stripe but bucket not marked as stripe bucket"); +-#endif +-} +- +-static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, +- struct extent_ptr_decoded p, +- s64 sectors, enum bch_data_type ptr_data_type, +- u8 bucket_gen, u8 *bucket_data_type, +- u16 *dirty_sectors, u16 *cached_sectors) +-{ +- u16 *dst_sectors = !p.ptr.cached ++static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 bucket_gen, u8 bucket_data_type, ++ u16 dirty_sectors, u16 cached_sectors) ++{ ++ size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); ++ u16 bucket_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; +- u16 orig_sectors = *dst_sectors; + char buf[200]; + +- if (gen_after(p.ptr.gen, bucket_gen)) { ++ if (gen_after(ptr->gen, bucket_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" + "while marking %s", +- p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), +- bucket_gen, +- bch2_data_types[*bucket_data_type ?: ptr_data_type], +- p.ptr.gen, ++ ptr->dev, bucket_nr, bucket_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + +- if (gen_cmp(bucket_gen, p.ptr.gen) > 96U) { ++ if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", +- p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), +- bucket_gen, +- bch2_data_types[*bucket_data_type ?: ptr_data_type], +- p.ptr.gen, ++ ptr->dev, bucket_nr, bucket_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + +- if (bucket_gen != p.ptr.gen && !p.ptr.cached) { ++ if (bucket_gen != ptr->gen && !ptr->cached) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" + "while marking %s", +- p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), +- bucket_gen, +- bch2_data_types[*bucket_data_type ?: ptr_data_type], +- p.ptr.gen, ++ ptr->dev, bucket_nr, bucket_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + +- if (bucket_gen != p.ptr.gen) ++ if (bucket_gen != ptr->gen) + return 1; + +- if (*bucket_data_type && *bucket_data_type != ptr_data_type) { ++ if (bucket_data_type && ptr_data_type && ++ bucket_data_type != ptr_data_type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", +- p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), +- bucket_gen, +- bch2_data_types[*bucket_data_type], ++ ptr->dev, bucket_nr, bucket_gen, ++ bch2_data_types[bucket_data_type], + bch2_data_types[ptr_data_type], + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + 
+- if (checked_add(*dst_sectors, sectors)) { ++ if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" + "while marking %s", +- p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), +- bucket_gen, +- bch2_data_types[*bucket_data_type ?: ptr_data_type], +- orig_sectors, sectors, ++ ptr->dev, bucket_nr, bucket_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ bucket_sectors, sectors, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + ++ return 0; ++} ++ ++static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, ++ unsigned flags, ++ bool enabled) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, gc); ++ struct bucket_mark new, old; ++ char buf[200]; ++ int ret; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, ++ new.dirty_sectors, new.cached_sectors); ++ if (ret) ++ return ret; ++ ++ if (new.stripe && enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ ++ if (!new.stripe && !enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u: deleting stripe but not marked\n%s", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ ++ new.stripe = enabled; ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; ++ } ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ return 0; ++} ++ ++static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 bucket_gen, u8 *bucket_data_type, ++ u16 *dirty_sectors, u16 *cached_sectors) ++{ ++ u16 *dst_sectors = !ptr->cached ++ ? dirty_sectors ++ : cached_sectors; ++ int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type, ++ bucket_gen, *bucket_data_type, ++ *dirty_sectors, *cached_sectors); ++ ++ if (ret) ++ return ret; ++ ++ *dst_sectors += sectors; + *bucket_data_type = *dirty_sectors || *cached_sectors + ? ptr_data_type : 0; + return 0; +@@ -1026,7 +1042,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, + new.v.counter = old.v.counter = v; + bucket_data_type = new.data_type; + +- ret = __mark_pointer(c, k, p, sectors, data_type, new.gen, ++ ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen, + &bucket_data_type, + &new.dirty_sectors, + &new.cached_sectors); +@@ -1190,6 +1206,7 @@ static int bch2_mark_stripe(struct bch_fs *c, + ? 
bkey_s_c_to_stripe(new).v : NULL; + struct stripe *m = genradix_ptr(&c->stripes[gc], idx); + unsigned i; ++ int ret; + + if (!m || (old_s && !m->alive)) { + bch_err_ratelimited(c, "error marking nonexistent stripe %zu", +@@ -1199,9 +1216,12 @@ static int bch2_mark_stripe(struct bch_fs *c, + + if (!new_s) { + /* Deleting: */ +- for (i = 0; i < old_s->nr_blocks; i++) +- bucket_set_stripe(c, old_s->ptrs + i, fs_usage, +- journal_seq, flags, false); ++ for (i = 0; i < old_s->nr_blocks; i++) { ++ ret = bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, ++ journal_seq, flags, false); ++ if (ret) ++ return ret; ++ } + + if (!gc && m->on_heap) { + spin_lock(&c->ec_stripes_heap_lock); +@@ -1220,11 +1240,16 @@ static int bch2_mark_stripe(struct bch_fs *c, + old_s->ptrs + i, + sizeof(struct bch_extent_ptr))) { + +- if (old_s) +- bucket_set_stripe(c, old_s->ptrs + i, fs_usage, ++ if (old_s) { ++ bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, + journal_seq, flags, false); +- bucket_set_stripe(c, new_s->ptrs + i, fs_usage, +- journal_seq, flags, true); ++ if (ret) ++ return ret; ++ } ++ ret = bucket_set_stripe(c, new, new_s->ptrs + i, fs_usage, ++ journal_seq, flags, true); ++ if (ret) ++ return ret; + } + } + +@@ -1550,23 +1575,21 @@ static int trans_get_key(struct btree_trans *trans, + return ret; + } + +-static int bch2_trans_mark_pointer(struct btree_trans *trans, +- struct bkey_s_c k, struct extent_ptr_decoded p, +- s64 sectors, enum bch_data_type data_type) ++static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, ++ const struct bch_extent_ptr *ptr, ++ struct bkey_alloc_unpacked *u) + { + struct bch_fs *c = trans->c; +- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); +- struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)); +- struct btree_iter *iter; +- struct bkey_s_c k_a; +- struct bkey_alloc_unpacked u; +- struct bkey_i_alloc *a; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); + struct bucket *g; ++ struct btree_iter *iter; ++ struct bkey_s_c k; + int ret; + +- iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a); ++ iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); + if (iter) { +- u = bch2_alloc_unpack(k_a); ++ *u = bch2_alloc_unpack(k); + } else { + iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, + BTREE_ITER_CACHED| +@@ -1576,16 +1599,36 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + return PTR_ERR(iter); + + ret = bch2_btree_iter_traverse(iter); +- if (ret) +- goto out; ++ if (ret) { ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++ } + + percpu_down_read(&c->mark_lock); + g = bucket(ca, pos.offset); +- u = alloc_mem_to_key(g, READ_ONCE(g->mark)); ++ *u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + percpu_up_read(&c->mark_lock); + } + +- ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, ++ *_iter = iter; ++ return 0; ++} ++ ++static int bch2_trans_mark_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_alloc_unpacked u; ++ struct bkey_i_alloc *a; ++ int ret; ++ ++ ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); ++ if (ret) ++ return ret; ++ ++ ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, + &u.dirty_sectors, &u.cached_sectors); + if (ret) + goto out; +@@ -1596,7 +1639,7 @@ static int 
bch2_trans_mark_pointer(struct btree_trans *trans, + goto out; + + bkey_alloc_init(&a->k_i); +- a->k.p = pos; ++ a->k.p = iter->pos; + bch2_alloc_pack(a, u); + bch2_trans_update(trans, iter, &a->k_i, 0); + out: +@@ -1717,6 +1760,44 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + return 0; + } + ++static int bch2_trans_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ struct bkey_alloc_unpacked u; ++ struct bkey_i_alloc *a; ++ struct btree_iter *iter; ++ unsigned i; ++ int ret = 0; ++ ++ /* ++ * The allocator code doesn't necessarily update bucket gens in the ++ * btree when incrementing them, right before handing out new buckets - ++ * we just need to persist those updates here along with the new stripe: ++ */ ++ ++ for (i = 0; i < s->nr_blocks && !ret; i++) { ++ ret = bch2_trans_start_alloc_update(trans, &iter, ++ &s->ptrs[i], &u); ++ if (ret) ++ break; ++ ++ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto put_iter; ++ ++ bkey_alloc_init(&a->k_i); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, u); ++ bch2_trans_update(trans, iter, &a->k_i, 0); ++put_iter: ++ bch2_trans_iter_put(trans, iter); ++ } ++ ++ return ret; ++} ++ + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, +@@ -1816,6 +1897,8 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + case KEY_TYPE_reflink_v: + return bch2_trans_mark_extent(trans, k, offset, sectors, + flags, BCH_DATA_user); ++ case KEY_TYPE_stripe: ++ return bch2_trans_mark_stripe(trans, k); + case KEY_TYPE_inode: + d = replicas_deltas_realloc(trans, 0); + +-- +cgit v1.2.3 + + +From 7e4ac2c2727050df8de80f6289c84386c7d81c9c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 9 Jul 2020 18:31:51 -0400 +Subject: bcachefs: Account for stripe parity sectors separately + +Instead of trying to charge EC parity to the data within the stripe +(which is subject to rounding errors), let's charge it to the stripe +itself. It should also make -ENOSPC issues easier to deal with if we +charge for parity blocks up front, and means we can also make more fine +grained accounting available to the user. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 3 +- + fs/bcachefs/buckets.c | 176 ++++++++++++++++++++++-------------------- + fs/bcachefs/ec.c | 31 +++++++- + fs/bcachefs/ec.h | 2 + + fs/bcachefs/replicas.c | 20 ++++- + 5 files changed, 144 insertions(+), 88 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index d5a2230e403c..45dc42865811 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1032,7 +1032,8 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); + x(journal, 2) \ + x(btree, 3) \ + x(user, 4) \ +- x(cached, 5) ++ x(cached, 5) \ ++ x(parity, 6) + + enum bch_data_type { + #define x(t, n) BCH_DATA_##t, +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 2a3b95968a86..ac9895f15b80 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -77,6 +77,26 @@ + #include + #include + ++static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, ++ enum bch_data_type data_type, ++ s64 sectors) ++{ ++ switch (data_type) { ++ case BCH_DATA_btree: ++ fs_usage->btree += sectors; ++ break; ++ case BCH_DATA_user: ++ case BCH_DATA_parity: ++ fs_usage->data += sectors; ++ break; ++ case BCH_DATA_cached: ++ fs_usage->cached += sectors; ++ break; ++ default: ++ break; ++ } ++} ++ + /* + * Clear journal_seq_valid for buckets for which it's not needed, to prevent + * wraparound: +@@ -132,17 +152,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c) + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + +- switch (e->data_type) { +- case BCH_DATA_btree: +- usage->btree += usage->replicas[i]; +- break; +- case BCH_DATA_user: +- usage->data += usage->replicas[i]; +- break; +- case BCH_DATA_cached: +- usage->cached += usage->replicas[i]; +- break; +- } ++ fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); + } + + percpu_up_write(&c->mark_lock); +@@ -376,9 +386,14 @@ static inline int is_fragmented_bucket(struct bucket_mark m, + return 0; + } + ++static inline int is_stripe_data_bucket(struct bucket_mark m) ++{ ++ return m.stripe && m.data_type != BCH_DATA_parity; ++} ++ + static inline int bucket_stripe_sectors(struct bucket_mark m) + { +- return m.stripe ? m.dirty_sectors : 0; ++ return is_stripe_data_bucket(m) ? m.dirty_sectors : 0; + } + + static inline enum bch_data_type bucket_type(struct bucket_mark m) +@@ -412,8 +427,8 @@ int bch2_fs_usage_apply(struct bch_fs *c, + */ + should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); + if (WARN_ONCE(should_not_have_added > 0, +- "disk usage increased by %lli without a reservation", +- should_not_have_added)) { ++ "disk usage increased by %lli more than reservation of %llu", ++ added, disk_res ? 
disk_res->sectors : 0)) { + atomic64_sub(should_not_have_added, &c->sectors_available); + added -= should_not_have_added; + ret = -1; +@@ -522,17 +537,7 @@ static inline int update_replicas(struct bch_fs *c, + if (!fs_usage) + return 0; + +- switch (r->data_type) { +- case BCH_DATA_btree: +- fs_usage->btree += sectors; +- break; +- case BCH_DATA_user: +- fs_usage->data += sectors; +- break; +- case BCH_DATA_cached: +- fs_usage->cached += sectors; +- break; +- } ++ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; + return 0; + } +@@ -959,12 +964,15 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, + } + + static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, +- const struct bch_extent_ptr *ptr, ++ unsigned ptr_idx, + struct bch_fs_usage *fs_usage, +- u64 journal_seq, +- unsigned flags, ++ u64 journal_seq, unsigned flags, + bool enabled) + { ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ unsigned nr_data = s->nr_blocks - s->nr_redundant; ++ bool parity = ptr_idx >= nr_data; ++ const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; + bool gc = flags & BTREE_TRIGGER_GC; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, gc); +@@ -991,6 +999,12 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + new.stripe = enabled; ++ ++ if ((flags & BTREE_TRIGGER_GC) && parity) { ++ new.data_type = enabled ? BCH_DATA_parity : 0; ++ new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0; ++ } ++ + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; +@@ -1075,12 +1089,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, + struct bch_extent_stripe_ptr p, + enum bch_data_type data_type, + struct bch_fs_usage *fs_usage, +- s64 sectors, unsigned flags, +- struct bch_replicas_padded *r, +- unsigned *nr_data, +- unsigned *nr_parity) ++ s64 sectors, unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; ++ struct bch_replicas_padded r; + struct stripe *m; + unsigned i, blocks_nonempty = 0; + +@@ -1095,14 +1107,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, + return -EIO; + } + +- BUG_ON(m->r.e.data_type != data_type); +- +- *nr_data = m->nr_blocks - m->nr_redundant; +- *nr_parity = m->nr_redundant; +- *r = m->r; +- + m->block_sectors[p.block] += sectors; + ++ r = m->r; ++ + for (i = 0; i < m->nr_blocks; i++) + blocks_nonempty += m->block_sectors[i] != 0; + +@@ -1114,6 +1122,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, + + spin_unlock(&c->ec_stripes_heap_lock); + ++ r.e.data_type = data_type; ++ update_replicas(c, fs_usage, &r.e, sectors); ++ + return 0; + } + +@@ -1159,25 +1170,11 @@ static int bch2_mark_extent(struct bch_fs *c, + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { +- struct bch_replicas_padded ec_r; +- unsigned nr_data, nr_parity; +- s64 parity_sectors; +- + ret = bch2_mark_stripe_ptr(c, p.ec, data_type, +- fs_usage, disk_sectors, flags, +- &ec_r, &nr_data, &nr_parity); ++ fs_usage, disk_sectors, flags); + if (ret) + return ret; + +- parity_sectors = +- __ptr_disk_sectors_delta(p.crc.live_size, +- offset, sectors, flags, +- p.crc.compressed_size * nr_parity, +- p.crc.uncompressed_size * nr_data); +- +- update_replicas(c, fs_usage, &ec_r.e, +- disk_sectors + parity_sectors); +- + /* + * There may be other dirty pointers in this extent, but + * if so they're not required for mounting if we have an +@@ -1217,7 
+1214,7 @@ static int bch2_mark_stripe(struct bch_fs *c, + if (!new_s) { + /* Deleting: */ + for (i = 0; i < old_s->nr_blocks; i++) { +- ret = bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, ++ ret = bucket_set_stripe(c, old, i, fs_usage, + journal_seq, flags, false); + if (ret) + return ret; +@@ -1229,6 +1226,10 @@ static int bch2_mark_stripe(struct bch_fs *c, + spin_unlock(&c->ec_stripes_heap_lock); + } + ++ if (gc) ++ update_replicas(c, fs_usage, &m->r.e, ++ -((s64) m->sectors * m->nr_redundant)); ++ + memset(m, 0, sizeof(*m)); + } else { + BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); +@@ -1241,12 +1242,12 @@ static int bch2_mark_stripe(struct bch_fs *c, + sizeof(struct bch_extent_ptr))) { + + if (old_s) { +- bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, ++ bucket_set_stripe(c, old, i, fs_usage, + journal_seq, flags, false); + if (ret) + return ret; + } +- ret = bucket_set_stripe(c, new, new_s->ptrs + i, fs_usage, ++ ret = bucket_set_stripe(c, new, i, fs_usage, + journal_seq, flags, true); + if (ret) + return ret; +@@ -1259,8 +1260,16 @@ static int bch2_mark_stripe(struct bch_fs *c, + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + ++ if (gc && old_s) ++ update_replicas(c, fs_usage, &m->r.e, ++ -((s64) m->sectors * m->nr_redundant)); ++ + bch2_bkey_to_replicas(&m->r.e, new); + ++ if (gc) ++ update_replicas(c, fs_usage, &m->r.e, ++ ((s64) m->sectors * m->nr_redundant)); ++ + /* gc recalculates these fields: */ + if (!(flags & BTREE_TRIGGER_GC)) { + m->blocks_nonempty = 0; +@@ -1649,15 +1658,13 @@ out: + + static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + struct bch_extent_stripe_ptr p, +- s64 sectors, enum bch_data_type data_type, +- struct bch_replicas_padded *r, +- unsigned *nr_data, +- unsigned *nr_parity) ++ s64 sectors, enum bch_data_type data_type) + { + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_i_stripe *s; ++ struct bch_replicas_padded r; + int ret = 0; + + ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); +@@ -1678,15 +1685,14 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + goto out; + + bkey_reassemble(&s->k_i, k); +- + stripe_blockcount_set(&s->v, p.block, + stripe_blockcount_get(&s->v, p.block) + + sectors); +- +- *nr_data = s->v.nr_blocks - s->v.nr_redundant; +- *nr_parity = s->v.nr_redundant; +- bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); + bch2_trans_update(trans, iter, &s->k_i, 0); ++ ++ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); ++ r.e.data_type = data_type; ++ update_replicas_list(trans, &r.e, sectors); + out: + bch2_trans_iter_put(trans, iter); + return ret; +@@ -1731,25 +1737,11 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { +- struct bch_replicas_padded ec_r; +- unsigned nr_data, nr_parity; +- s64 parity_sectors; +- + ret = bch2_trans_mark_stripe_ptr(trans, p.ec, +- disk_sectors, data_type, +- &ec_r, &nr_data, &nr_parity); ++ disk_sectors, data_type); + if (ret) + return ret; + +- parity_sectors = +- __ptr_disk_sectors_delta(p.crc.live_size, +- offset, sectors, flags, +- p.crc.compressed_size * nr_parity, +- p.crc.uncompressed_size * nr_data); +- +- update_replicas_list(trans, &ec_r.e, +- disk_sectors + parity_sectors); +- + r.e.nr_required = 0; + } + } +@@ -1761,15 +1753,26 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + } + + static int bch2_trans_mark_stripe(struct btree_trans 
*trans, +- struct bkey_s_c k) ++ struct bkey_s_c k, ++ unsigned flags) + { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ unsigned nr_data = s->nr_blocks - s->nr_redundant; ++ struct bch_replicas_padded r; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; + struct btree_iter *iter; ++ bool deleting = flags & BTREE_TRIGGER_OVERWRITE; ++ s64 sectors = le16_to_cpu(s->sectors); + unsigned i; + int ret = 0; + ++ if (deleting) ++ sectors = -sectors; ++ ++ bch2_bkey_to_replicas(&r.e, k); ++ update_replicas_list(trans, &r.e, sectors * s->nr_redundant); ++ + /* + * The allocator code doesn't necessarily update bucket gens in the + * btree when incrementing them, right before handing out new buckets - +@@ -1777,11 +1780,20 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + */ + + for (i = 0; i < s->nr_blocks && !ret; i++) { ++ bool parity = i >= nr_data; ++ + ret = bch2_trans_start_alloc_update(trans, &iter, + &s->ptrs[i], &u); + if (ret) + break; + ++ if (parity) { ++ u.dirty_sectors += sectors; ++ u.data_type = u.dirty_sectors ++ ? BCH_DATA_parity ++ : 0; ++ } ++ + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = PTR_ERR_OR_ZERO(a); + if (ret) +@@ -1898,7 +1910,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + return bch2_trans_mark_extent(trans, k, offset, sectors, + flags, BCH_DATA_user); + case KEY_TYPE_stripe: +- return bch2_trans_mark_stripe(trans, k); ++ return bch2_trans_mark_stripe(trans, k, flags); + case KEY_TYPE_inode: + d = replicas_deltas_realloc(trans, 0); + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index ad08abfde843..bc43a20fb3c4 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -343,12 +343,17 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + unsigned offset = 0, bytes = buf->size << 9; + struct bch_extent_ptr *ptr = &v->ptrs[idx]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant ++ ? BCH_DATA_user ++ : BCH_DATA_parity; + + if (!bch2_dev_get_ioref(ca, rw)) { + clear_bit(idx, buf->valid); + return; + } + ++ this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); ++ + while (offset < bytes) { + unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, + DIV_ROUND_UP(bytes, PAGE_SIZE)); +@@ -670,6 +675,7 @@ static void ec_stripe_delete_work(struct work_struct *work) + /* stripe creation: */ + + static int ec_stripe_bkey_insert(struct bch_fs *c, ++ struct ec_stripe_new *s, + struct bkey_i_stripe *stripe) + { + struct btree_trans trans; +@@ -711,7 +717,7 @@ found_slot: + + bch2_trans_update(&trans, iter, &stripe->k_i, 0); + +- ret = bch2_trans_commit(&trans, NULL, NULL, ++ ret = bch2_trans_commit(&trans, &s->res, NULL, + BTREE_INSERT_NOFAIL); + err: + bch2_trans_iter_put(&trans, iter); +@@ -858,8 +864,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) + + ret = s->existing_stripe + ? 
bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, +- NULL, NULL, BTREE_INSERT_NOFAIL) +- : ec_stripe_bkey_insert(c, &s->stripe.key); ++ &s->res, NULL, BTREE_INSERT_NOFAIL) ++ : ec_stripe_bkey_insert(c, s, &s->stripe.key); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_put_writes; +@@ -886,6 +892,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) + err_put_writes: + percpu_ref_put(&c->writes); + err: ++ bch2_disk_reservation_put(c, &s->res); ++ + open_bucket_for_each(c, &s->blocks, ob, i) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); +@@ -1325,6 +1333,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + struct open_bucket *ob; + unsigned i, data_idx = 0; + s64 idx; ++ int ret; + + closure_init_stack(&cl); + +@@ -1356,6 +1365,22 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + } + } + ++ if (!h->s->existing_stripe && ++ !h->s->res.sectors) { ++ ret = bch2_disk_reservation_get(c, &h->s->res, ++ h->blocksize, ++ h->s->nr_parity, 0); ++ if (ret) { ++ /* What should we do here? */ ++ bch_err(c, "unable to create new stripe: %i", ret); ++ bch2_ec_stripe_head_put(c, h); ++ h = NULL; ++ goto out; ++ ++ } ++ ++ } ++ + if (new_stripe_alloc_buckets(c, h)) { + bch2_ec_stripe_head_put(c, h); + h = NULL; +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 6db16cf768da..15f751fc2a35 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -3,6 +3,7 @@ + #define _BCACHEFS_EC_H + + #include "ec_types.h" ++#include "buckets_types.h" + #include "keylist_types.h" + + const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); +@@ -105,6 +106,7 @@ struct ec_stripe_new { + struct open_buckets blocks; + u8 data_block_idx[EC_STRIPE_MAX]; + struct open_buckets parity; ++ struct disk_reservation res; + + struct keylist keys; + u64 inline_keys[BKEY_U64s * 8]; +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 6b6506c68609..91518c0d6794 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, + extent_to_replicas(k, e); + break; + case KEY_TYPE_stripe: +- e->data_type = BCH_DATA_user; ++ e->data_type = BCH_DATA_parity; + stripe_to_replicas(k, e); + break; + } +@@ -446,7 +446,23 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, + + bch2_bkey_to_replicas(&search.e, k); + +- return __bch2_mark_replicas(c, &search.e, check); ++ ret = __bch2_mark_replicas(c, &search.e, check); ++ if (ret) ++ return ret; ++ ++ if (search.e.data_type == BCH_DATA_parity) { ++ search.e.data_type = BCH_DATA_cached; ++ ret = __bch2_mark_replicas(c, &search.e, check); ++ if (ret) ++ return ret; ++ ++ search.e.data_type = BCH_DATA_user; ++ ret = __bch2_mark_replicas(c, &search.e, check); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; + } + + bool bch2_bkey_replicas_marked(struct bch_fs *c, +-- +cgit v1.2.3 + + +From 4599c51b95c2faa50c799335cefac38ff8102a03 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 22 Jul 2020 23:11:48 -0400 +Subject: bcachefs: Don't drop replicas when copygcing ec data + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 6 +++++ + fs/bcachefs/buckets_types.h | 2 ++ + fs/bcachefs/io.c | 3 ++- + fs/bcachefs/move.c | 6 +++-- + fs/bcachefs/move.h | 3 ++- + fs/bcachefs/movinggc.c | 61 ++++++++++++++++++++++++--------------------- + fs/bcachefs/rebalance.c | 1 + + 7 files changed, 50 insertions(+), 32 deletions(-) + +diff --git a/fs/bcachefs/buckets.c 
b/fs/bcachefs/buckets.c +index ac9895f15b80..99aeba1deb0a 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -980,6 +980,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, + char buf[200]; + int ret; + ++ if (enabled) ++ g->ec_redundancy = s->nr_redundant; ++ + old = bucket_cmpxchg(g, new, ({ + ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, + new.dirty_sectors, new.cached_sectors); +@@ -1011,6 +1014,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, + } + })); + ++ if (!enabled) ++ g->ec_redundancy = 0; ++ + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + return 0; + } +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index d5215b14d7d9..d6057d22b18e 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -41,6 +41,7 @@ struct bucket { + u8 oldest_gen; + u8 gc_gen; + unsigned gen_valid:1; ++ u8 ec_redundancy; + }; + + struct bucket_array { +@@ -125,6 +126,7 @@ struct disk_reservation { + struct copygc_heap_entry { + u8 dev; + u8 gen; ++ u8 replicas; + u16 fragmentation; + u32 sectors; + u64 offset; +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 590e15dc776f..f1744ad886ef 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1473,7 +1473,8 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, + opts, + DATA_PROMOTE, + (struct data_opts) { +- .target = opts.promote_target ++ .target = opts.promote_target, ++ .nr_replicas = 1, + }, + btree_id, k); + BUG_ON(ret); +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 1ffb14a22f94..a8df9ad0e449 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -266,8 +266,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_FROM_INTERNAL; + +- m->op.nr_replicas = 1; +- m->op.nr_replicas_required = 1; ++ m->op.nr_replicas = data_opts.nr_replicas; ++ m->op.nr_replicas_required = data_opts.nr_replicas; + m->op.index_update_fn = bch2_migrate_index_update; + + switch (data_cmd) { +@@ -755,6 +755,7 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, + return DATA_SKIP; + + data_opts->target = 0; ++ data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; + } +@@ -770,6 +771,7 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, + return DATA_SKIP; + + data_opts->target = 0; ++ data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = 0; + data_opts->rewrite_dev = op->migrate.dev; + return DATA_REWRITE; +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +index 0acd1720d4f8..b04bc669226d 100644 +--- a/fs/bcachefs/move.h ++++ b/fs/bcachefs/move.h +@@ -20,7 +20,8 @@ enum data_cmd { + + struct data_opts { + u16 target; +- unsigned rewrite_dev; ++ u8 rewrite_dev; ++ u8 nr_replicas; + int btree_insert_flags; + }; + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index de0a7974ec9f..ddfda1ef8a79 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -53,17 +53,21 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) + cmp_int(l->offset, r->offset); + } + +-static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) ++static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) + { + copygc_heap *h = &c->copygc_heap; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr; ++ const union 
bch_extent_entry *entry; ++ struct extent_ptr_decoded p; + +- bkey_for_each_ptr(ptrs, ptr) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct copygc_heap_entry search = { +- .dev = ptr->dev, +- .offset = ptr->offset ++ .dev = p.ptr.dev, ++ .offset = p.ptr.offset, + }; + + ssize_t i = eytzinger0_find_le(h->data, h->used, +@@ -81,27 +85,24 @@ static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) + BUG_ON(i != j); + #endif + if (i >= 0 && +- ptr->offset < h->data[i].offset + ca->mi.bucket_size && +- ptr->gen == h->data[i].gen) +- return ptr->dev; +- } ++ p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && ++ p.ptr.gen == h->data[i].gen) { ++ data_opts->target = io_opts->background_target; ++ data_opts->nr_replicas = 1; ++ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; ++ data_opts->rewrite_dev = p.ptr.dev; + +- return -1; +-} ++ if (p.has_ec) { ++ struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx); + +-static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, +- struct bkey_s_c k, +- struct bch_io_opts *io_opts, +- struct data_opts *data_opts) +-{ +- int dev_idx = __copygc_pred(c, k); +- if (dev_idx < 0) +- return DATA_SKIP; +- +- data_opts->target = io_opts->background_target; +- data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; +- data_opts->rewrite_dev = dev_idx; +- return DATA_REWRITE; ++ data_opts->nr_replicas += m->nr_redundant; ++ } ++ ++ return DATA_REWRITE; ++ } ++ } ++ ++ return DATA_SKIP; + } + + static bool have_copygc_reserve(struct bch_dev *ca) +@@ -168,7 +169,8 @@ static int bch2_copygc(struct bch_fs *c) + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { +- struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ struct bucket *g = buckets->b + b; ++ struct bucket_mark m = READ_ONCE(g->mark); + struct copygc_heap_entry e; + + if (m.owned_by_allocator || +@@ -177,9 +179,12 @@ static int bch2_copygc(struct bch_fs *c) + bucket_sectors_used(m) >= ca->mi.bucket_size) + continue; + ++ WARN_ON(m.stripe && !g->ec_redundancy); ++ + e = (struct copygc_heap_entry) { + .dev = dev_idx, + .gen = m.gen, ++ .replicas = 1 + g->ec_redundancy, + .fragmentation = bucket_sectors_used(m) * (1U << 15) + / ca->mi.bucket_size, + .sectors = bucket_sectors_used(m), +@@ -196,11 +201,11 @@ static int bch2_copygc(struct bch_fs *c) + } + + for (i = h->data; i < h->data + h->used; i++) +- sectors_to_move += i->sectors; ++ sectors_to_move += i->sectors * i->replicas; + + while (sectors_to_move > sectors_reserved) { + BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); +- sectors_to_move -= e.sectors; ++ sectors_to_move -= e.sectors * e.replicas; + } + + buckets_to_move = h->used; +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index 56a1f761271f..44d2651be970 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -73,6 +73,7 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, + { + if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { + data_opts->target = io_opts->background_target; ++ data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; + } else { +-- +cgit v1.2.3 + + +From b0da1fe27682b223dd89f31f4d71d230ffe1882e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 23 Oct 2020 18:40:30 -0400 +Subject: bcachefs: Fix bch2_mark_stripe() + +There's no reason not to always recalculate these fields + 
+Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 18 +++++++----------- + 1 file changed, 7 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 99aeba1deb0a..c8e8e978d143 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1265,6 +1265,13 @@ static int bch2_mark_stripe(struct bch_fs *c, + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) { ++ m->block_sectors[i] = ++ stripe_blockcount_get(new_s, i); ++ m->blocks_nonempty += !!m->block_sectors[i]; ++ } + + if (gc && old_s) + update_replicas(c, fs_usage, &m->r.e, +@@ -1276,17 +1283,6 @@ static int bch2_mark_stripe(struct bch_fs *c, + update_replicas(c, fs_usage, &m->r.e, + ((s64) m->sectors * m->nr_redundant)); + +- /* gc recalculates these fields: */ +- if (!(flags & BTREE_TRIGGER_GC)) { +- m->blocks_nonempty = 0; +- +- for (i = 0; i < new_s->nr_blocks; i++) { +- m->block_sectors[i] = +- stripe_blockcount_get(new_s, i); +- m->blocks_nonempty += !!m->block_sectors[i]; +- } +- } +- + if (!gc) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, idx); +-- +cgit v1.2.3 + + +From 91c0d33e5b590008a11987e8095b1f36a26cbcc2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 23 Oct 2020 21:07:17 -0400 +Subject: bcachefs: Fix for passing target= opts as mount opts + +Some options can't be parsed until the filesystem initialized; +previously, passing these options to mount or remount would cause mount +to fail. + +This changes the mount path so that we parse the options passed in +twice, and just ignore any options that can't be parsed the first time. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 25 +++++++++++++++++++------ + fs/bcachefs/opts.c | 7 ++++--- + fs/bcachefs/opts.h | 2 +- + 3 files changed, 24 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index f4a101a68fc5..d137ddde6665 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1342,7 +1342,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) + + opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); + +- ret = bch2_parse_mount_opts(&opts, data); ++ ret = bch2_parse_mount_opts(c, &opts, data); + if (ret) + return ret; + +@@ -1483,7 +1483,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, + + opt_set(opts, read_only, (flags & SB_RDONLY) != 0); + +- ret = bch2_parse_mount_opts(&opts, data); ++ ret = bch2_parse_mount_opts(NULL, &opts, data); + if (ret) + return ERR_PTR(ret); + +@@ -1506,11 +1506,24 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, + goto got_sb; + + c = bch2_fs_open(devs, nr_devs, opts); +- +- if (!IS_ERR(c)) +- sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); +- else ++ if (IS_ERR(c)) { + sb = ERR_CAST(c); ++ goto got_sb; ++ } ++ ++ /* Some options can't be parsed until after the fs is started: */ ++ ret = bch2_parse_mount_opts(c, &opts, data); ++ if (ret) { ++ bch2_fs_stop(c); ++ sb = ERR_PTR(ret); ++ goto got_sb; ++ } ++ ++ bch2_opts_apply(&c->opts, opts); ++ ++ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); ++ if (IS_ERR(sb)) ++ bch2_fs_stop(c); + got_sb: + kfree(devs_to_fs); + kfree(devs[0]); +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index afe25cd26c06..97a36ac0beea 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -247,7 +247,7 @@ int bch2_opt_parse(struct bch_fs *c, 
const struct bch_option *opt, + break; + case BCH_OPT_FN: + if (!c) +- return -EINVAL; ++ return 0; + + return opt->parse(c, val, res); + } +@@ -325,7 +325,8 @@ int bch2_opts_check_may_set(struct bch_fs *c) + return 0; + } + +-int bch2_parse_mount_opts(struct bch_opts *opts, char *options) ++int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, ++ char *options) + { + char *opt, *name, *val; + int ret, id; +@@ -340,7 +341,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) + if (id < 0) + goto bad_opt; + +- ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); ++ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v); + if (ret < 0) + goto bad_val; + } else { +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 014c608ca0c6..1dd8d47c0972 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -418,7 +418,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *, + + int bch2_opt_check_may_set(struct bch_fs *, int, u64); + int bch2_opts_check_may_set(struct bch_fs *); +-int bch2_parse_mount_opts(struct bch_opts *, char *); ++int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); + + /* inode opts: */ + +-- +cgit v1.2.3 + + +From c1746167f5ae620099a8e8de637553443b997258 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Oct 2020 16:37:17 -0400 +Subject: bcachefs: Improve some error messages + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 2f5097218f9c..682f599cbef5 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -750,7 +750,9 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + + btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, b, i, +- "incorrect max key"); ++ "incorrect max key %llu:%llu", ++ bn->max_key.inode, ++ bn->max_key.offset); + + if (write) + compat_btree_node(b->c.level, b->c.btree_id, version, +@@ -930,7 +932,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), + BTREE_ERR_WANT_RETRY, c, b, i, +- "unknown checksum type"); ++ "unknown checksum type %llu", ++ BSET_CSUM_TYPE(i)); + + nonce = btree_nonce(i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); +@@ -957,7 +960,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), + BTREE_ERR_WANT_RETRY, c, b, i, +- "unknown checksum type"); ++ "unknown checksum type %llu", ++ BSET_CSUM_TYPE(i)); + + nonce = btree_nonce(i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); +-- +cgit v1.2.3 + + +From 2e45da8a23bb2d899bc7dfb68e33362f6b2e8990 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Oct 2020 20:56:47 -0400 +Subject: bcachefs: Fix rare use after free in read path + +If the bkey_on_stack_reassemble() call in __bch2_read_indirect_extent() +reallocates the buffer, k in bch2_read - which we pointed at the +bkey_on_stack buffer - will now point to a stale buffer. Whoops. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 7 ++++--- + fs/bcachefs/fs.c | 11 ++++++----- + fs/bcachefs/io.c | 6 ++++-- + 3 files changed, 14 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index e0e53ec2d2cf..a342ba102cdc 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -865,18 +865,19 @@ retry: + if (ret) + break; + +- bkey_on_stack_reassemble(&sk, c, k); +- k = bkey_i_to_s_c(sk.k); +- + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + ++ bkey_on_stack_reassemble(&sk, c, k); ++ + ret = bch2_read_indirect_extent(trans, + &offset_into_extent, &sk); + if (ret) + break; + ++ k = bkey_i_to_s_c(sk.k); ++ + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_trans_unlock(trans); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index d137ddde6665..f7107d9e0001 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -910,20 +910,21 @@ retry: + continue; + } + +- bkey_on_stack_realloc(&cur, c, k.k->u64s); +- bkey_on_stack_realloc(&prev, c, k.k->u64s); +- bkey_reassemble(cur.k, k); +- k = bkey_i_to_s_c(cur.k); +- + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + ++ bkey_on_stack_realloc(&cur, c, k.k->u64s); ++ bkey_on_stack_realloc(&prev, c, k.k->u64s); ++ bkey_reassemble(cur.k, k); ++ + ret = bch2_read_indirect_extent(&trans, + &offset_into_extent, &cur); + if (ret) + break; + ++ k = bkey_i_to_s_c(cur.k); ++ + sectors = min(sectors, k.k->size - offset_into_extent); + + if (offset_into_extent) +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index f1744ad886ef..57ec7f547f01 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1674,7 +1674,6 @@ retry: + unsigned bytes, sectors, offset_into_extent; + + bkey_on_stack_reassemble(&sk, c, k); +- k = bkey_i_to_s_c(sk.k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); +@@ -1685,6 +1684,8 @@ retry: + if (ret) + break; + ++ k = bkey_i_to_s_c(sk.k); ++ + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_trans_unlock(&trans); +@@ -2311,13 +2312,14 @@ retry: + sectors = k.k->size - offset_into_extent; + + bkey_on_stack_reassemble(&sk, c, k); +- k = bkey_i_to_s_c(sk.k); + + ret = bch2_read_indirect_extent(&trans, + &offset_into_extent, &sk); + if (ret) + goto err; + ++ k = bkey_i_to_s_c(sk.k); ++ + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: +-- +cgit v1.2.3 + + +From f16b93ccdd9e32e890c3e4f516f7c6973f7a3f41 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Oct 2020 19:51:34 -0400 +Subject: bcachefs: Indirect inline data extents + +When inline data extents were added, reflink was forgotten about - we +need indirect inline data extents for reflink + inline data to work +correctly. + +This patch adds them, and a new feature bit that's flipped when they're +used. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 12 +++++-- + fs/bcachefs/bkey.h | 1 + + fs/bcachefs/bkey_methods.c | 6 +++- + fs/bcachefs/buckets.c | 49 +++++++++++++++++----------- + fs/bcachefs/extents.c | 16 ++++++---- + fs/bcachefs/extents.h | 30 ++++++++++++++++-- + fs/bcachefs/io.c | 10 +++--- + fs/bcachefs/opts.h | 2 +- + fs/bcachefs/reflink.c | 74 ++++++++++++++++++++++++++++++++----------- + fs/bcachefs/reflink.h | 11 ++++++- + 10 files changed, 156 insertions(+), 55 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 45dc42865811..2926c648a17f 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -340,7 +340,8 @@ static inline void bkey_init(struct bkey *k) + x(reflink_p, 15) \ + x(reflink_v, 16) \ + x(inline_data, 17) \ +- x(btree_ptr_v2, 18) ++ x(btree_ptr_v2, 18) \ ++ x(indirect_inline_data, 19) + + enum bch_bkey_type { + #define x(name, nr) KEY_TYPE_##name = nr, +@@ -886,6 +887,12 @@ struct bch_reflink_v { + __u64 _data[0]; + }; + ++struct bch_indirect_inline_data { ++ struct bch_val v; ++ __le64 refcount; ++ u8 data[0]; ++}; ++ + /* Inline data */ + + struct bch_inline_data { +@@ -1322,7 +1329,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + x(incompressible, 10) \ + x(btree_ptr_v2, 11) \ + x(extents_above_btree_updates, 12) \ +- x(btree_updates_journalled, 13) ++ x(btree_updates_journalled, 13) \ ++ x(reflink_inline_data, 14) + + #define BCH_SB_FEATURES_ALL \ + ((1ULL << BCH_FEATURE_new_siphash)| \ +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index cbcfbd26bc58..80ea488d57b0 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -565,6 +565,7 @@ BKEY_VAL_ACCESSORS(reflink_p); + BKEY_VAL_ACCESSORS(reflink_v); + BKEY_VAL_ACCESSORS(inline_data); + BKEY_VAL_ACCESSORS(btree_ptr_v2); ++BKEY_VAL_ACCESSORS(indirect_inline_data); + + /* byte order helpers */ + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 36e0c5152b47..32849229801d 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -72,7 +72,11 @@ static const char *key_type_inline_data_invalid(const struct bch_fs *c, + static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) + { +- pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); ++ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); ++ unsigned datalen = bkey_inline_data_bytes(k.k); ++ ++ pr_buf(out, "datalen %u: %*phN", ++ datalen, min(datalen, 32U), d.v->data); + } + + #define bch2_bkey_ops_inline_data (struct bkey_ops) { \ +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index c8e8e978d143..82f1cc4ca693 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1812,6 +1812,18 @@ put_iter: + return ret; + } + ++static __le64 *bkey_refcount(struct bkey_i *k) ++{ ++ switch (k->k.type) { ++ case KEY_TYPE_reflink_v: ++ return &bkey_i_to_reflink_v(k)->v.refcount; ++ case KEY_TYPE_indirect_inline_data: ++ return &bkey_i_to_indirect_inline_data(k)->v.refcount; ++ default: ++ return NULL; ++ } ++} ++ + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, +@@ -1820,7 +1832,8 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_s_c k; +- struct bkey_i_reflink_v *r_v; ++ struct bkey_i *n; ++ __le64 *refcount; + s64 ret; + + ret = trans_get_key(trans, 
BTREE_ID_REFLINK, +@@ -1828,14 +1841,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + if (ret < 0) + return ret; + +- if (k.k->type != KEY_TYPE_reflink_v) { +- bch2_fs_inconsistent(c, +- "%llu:%llu len %u points to nonexistent indirect extent %llu", +- p.k->p.inode, p.k->p.offset, p.k->size, idx); +- ret = -EIO; +- goto err; +- } +- + if ((flags & BTREE_TRIGGER_OVERWRITE) && + (bkey_start_offset(k.k) < idx || + k.k->p.offset > idx + sectors)) +@@ -1843,25 +1848,33 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + + sectors = k.k->p.offset - idx; + +- r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); +- ret = PTR_ERR_OR_ZERO(r_v); ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + +- bkey_reassemble(&r_v->k_i, k); ++ bkey_reassemble(n, k); ++ ++ refcount = bkey_refcount(n); ++ if (!refcount) { ++ bch2_fs_inconsistent(c, ++ "%llu:%llu len %u points to nonexistent indirect extent %llu", ++ p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ ret = -EIO; ++ goto err; ++ } + +- le64_add_cpu(&r_v->v.refcount, +- !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); ++ le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); + +- if (!r_v->v.refcount) { +- r_v->k.type = KEY_TYPE_deleted; +- set_bkey_val_u64s(&r_v->k, 0); ++ if (!*refcount) { ++ n->k.type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(&n->k, 0); + } + + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + +- bch2_trans_update(trans, iter, &r_v->k_i, 0); ++ bch2_trans_update(trans, iter, n, 0); + out: + ret = sectors; + err: +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 568f039edcff..88297b30f622 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -1200,14 +1200,14 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) + le64_add_cpu(&p.v->idx, sub); + break; + } +- case KEY_TYPE_inline_data: { +- struct bkey_s_inline_data d = bkey_s_to_inline_data(k); ++ case KEY_TYPE_inline_data: ++ case KEY_TYPE_indirect_inline_data: { ++ void *p = bkey_inline_data_p(k); ++ unsigned bytes = bkey_inline_data_bytes(k.k); + +- sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); ++ sub = min_t(u64, sub << 9, bytes); + +- memmove(d.v->data, +- d.v->data + sub, +- bkey_val_bytes(d.k) - sub); ++ memmove(p, p + sub, bytes - sub); + + new_val_u64s -= sub >> 3; + break; +@@ -1245,7 +1245,9 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) + + switch (k.k->type) { + case KEY_TYPE_inline_data: +- new_val_u64s = min(new_val_u64s, k.k->size << 6); ++ case KEY_TYPE_indirect_inline_data: ++ new_val_u64s = (bkey_inline_data_offset(k.k) + ++ min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; + break; + } + +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 29b15365d19c..74c7bb8f9104 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -445,10 +445,35 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k) + } + } + ++static inline bool bkey_extent_is_inline_data(const struct bkey *k) ++{ ++ return k->type == KEY_TYPE_inline_data || ++ k->type == KEY_TYPE_indirect_inline_data; ++} ++ ++static inline unsigned bkey_inline_data_offset(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_inline_data: ++ return sizeof(struct bch_inline_data); ++ case KEY_TYPE_indirect_inline_data: ++ return sizeof(struct bch_indirect_inline_data); ++ default: ++ BUG(); ++ } ++} ++ ++static inline unsigned bkey_inline_data_bytes(const 
struct bkey *k) ++{ ++ return bkey_val_bytes(k) - bkey_inline_data_offset(k); ++} ++ ++#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) ++ + static inline bool bkey_extent_is_data(const struct bkey *k) + { +- return bkey_extent_is_direct_data(k) || +- k->type == KEY_TYPE_inline_data || ++ return bkey_extent_is_direct_data(k) || ++ bkey_extent_is_inline_data(k) || + k->type == KEY_TYPE_reflink_p; + } + +@@ -463,6 +488,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: + case KEY_TYPE_inline_data: ++ case KEY_TYPE_indirect_inline_data: + return true; + default: + return false; +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 57ec7f547f01..74393a21ecb5 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -2007,7 +2007,8 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + if (ret) + goto err; + +- if (k.k->type != KEY_TYPE_reflink_v) { ++ if (k.k->type != KEY_TYPE_reflink_v && ++ k.k->type != KEY_TYPE_indirect_inline_data) { + __bcache_io_error(trans->c, + "pointer to nonexistent indirect extent"); + ret = -EIO; +@@ -2034,13 +2035,12 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + struct bpos pos = bkey_start_pos(k.k); + int pick_ret; + +- if (k.k->type == KEY_TYPE_inline_data) { +- struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); ++ if (bkey_extent_is_inline_data(k.k)) { + unsigned bytes = min_t(unsigned, iter.bi_size, +- bkey_val_bytes(d.k)); ++ bkey_inline_data_bytes(k.k)); + + swap(iter.bi_size, bytes); +- memcpy_to_bio(&orig->bio, iter, d.v->data); ++ memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 1dd8d47c0972..710a7ee67039 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -185,7 +185,7 @@ enum opt_type { + x(inline_data, u8, \ + OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ NO_SB_OPT, true, \ + NULL, "Enable inline data extents") \ + x(acl, u8, \ + OPT_FORMAT|OPT_MOUNT, \ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 3c473f1380a6..8abcbfb3bd64 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -9,6 +9,18 @@ + + #include + ++static inline unsigned bkey_type_to_indirect(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_extent: ++ return KEY_TYPE_reflink_v; ++ case KEY_TYPE_inline_data: ++ return KEY_TYPE_indirect_inline_data; ++ default: ++ return 0; ++ } ++} ++ + /* reflink pointers */ + + const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) +@@ -71,17 +83,42 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + bch2_bkey_ptrs_to_text(out, c, k); + } + ++/* indirect inline data */ ++ ++const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) ++ return "incorrect value size"; ++ return NULL; ++} ++ ++void bch2_indirect_inline_data_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); ++ unsigned datalen = bkey_inline_data_bytes(k.k); ++ ++ pr_buf(out, "refcount %llu datalen %u: %*phN", ++ le64_to_cpu(d.v->refcount), datalen, ++ min(datalen, 32U), d.v->data); ++} ++ + static int 
bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, +- struct bkey_i_extent *e) ++ struct bkey_i *orig) + { + struct bch_fs *c = trans->c; + struct btree_iter *reflink_iter; + struct bkey_s_c k; +- struct bkey_i_reflink_v *r_v; ++ struct bkey_i *r_v; + struct bkey_i_reflink_p *r_p; ++ __le64 *refcount; + int ret; + ++ if (orig->k.type == KEY_TYPE_inline_data) ++ bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); ++ + for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, + POS(0, c->reflink_hint), + BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { +@@ -90,7 +127,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + continue; + } + +- if (bkey_deleted(k.k) && e->k.size <= k.k->size) ++ if (bkey_deleted(k.k) && orig->k.size <= k.k->size) + break; + } + +@@ -100,29 +137,31 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + /* rewind iter to start of hole, if necessary: */ + bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); + +- r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); ++ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_val_bytes(&orig->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) + goto err; + +- bkey_reflink_v_init(&r_v->k_i); ++ bkey_init(&r_v->k); ++ r_v->k.type = bkey_type_to_indirect(&orig->k); + r_v->k.p = reflink_iter->pos; +- bch2_key_resize(&r_v->k, e->k.size); +- r_v->k.version = e->k.version; ++ bch2_key_resize(&r_v->k, orig->k.size); ++ r_v->k.version = orig->k.version; ++ ++ set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); + +- set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + +- bkey_val_u64s(&e->k)); +- r_v->v.refcount = 0; +- memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); ++ refcount = (void *) &r_v->v; ++ *refcount = 0; ++ memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); + +- bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); ++ bch2_trans_update(trans, reflink_iter, r_v, 0); + + r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); + if (IS_ERR(r_p)) + return PTR_ERR(r_p); + +- e->k.type = KEY_TYPE_reflink_p; +- r_p = bkey_i_to_reflink_p(&e->k_i); ++ orig->k.type = KEY_TYPE_reflink_p; ++ r_p = bkey_i_to_reflink_p(orig); + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + +@@ -144,8 +183,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) + if (bkey_cmp(iter->pos, end) >= 0) + return bkey_s_c_null; + +- if (k.k->type == KEY_TYPE_extent || +- k.k->type == KEY_TYPE_reflink_p) ++ if (bkey_extent_is_data(k.k)) + break; + } + +@@ -218,7 +256,7 @@ s64 bch2_remap_range(struct bch_fs *c, + if (!bkey_cmp(dst_iter->pos, dst_end)) + break; + +- if (src_k.k->type == KEY_TYPE_extent) { ++ if (src_k.k->type != KEY_TYPE_reflink_p) { + bkey_on_stack_reassemble(&new_src, c, src_k); + src_k = bkey_i_to_s_c(new_src.k); + +@@ -226,7 +264,7 @@ s64 bch2_remap_range(struct bch_fs *c, + bch2_cut_back(src_end, new_src.k); + + ret = bch2_make_extent_indirect(&trans, src_iter, +- bkey_i_to_extent(new_src.k)); ++ new_src.k); + if (ret) + goto btree_err; + +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index 5445c1cf0797..9d5e7dc58f2b 100644 +--- a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -18,13 +18,22 @@ const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +- + #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ + 
.key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ + } + ++const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, ++ struct bkey_s_c); ++void bch2_indirect_inline_data_to_text(struct printbuf *, ++ struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ ++ .key_invalid = bch2_indirect_inline_data_invalid, \ ++ .val_to_text = bch2_indirect_inline_data_to_text, \ ++} ++ + s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, + u64, u64 *, u64, s64 *); + +-- +cgit v1.2.3 + + +From bb69c679b2cab1da072880be844cd7bbe6567efc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Oct 2020 21:20:16 -0400 +Subject: bcachefs: Drop alloc keys from journal when -o reconstruct_alloc + +This fixes a bug where we'd pop an assertion due to replaying a key for +an interior btree node when that node no longer exists. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index d70fa968db50..32fed6b81a52 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -25,6 +25,18 @@ + + #define QSTR(n) { { { .len = strlen(n) } }, .name = n } + ++/* for -o reconstruct_alloc: */ ++static void drop_alloc_keys(struct journal_keys *keys) ++{ ++ size_t src, dst; ++ ++ for (src = 0, dst = 0; src < keys->nr; src++) ++ if (keys->d[src].btree_id != BTREE_ID_ALLOC) ++ keys->d[dst++] = keys->d[src]; ++ ++ keys->nr = dst; ++} ++ + /* iterate over keys read from the journal: */ + + static struct journal_key *journal_key_search(struct journal_keys *journal_keys, +@@ -930,7 +942,6 @@ static int read_btree_roots(struct bch_fs *c) + continue; + } + +- + if (r->error) { + __fsck_err(c, i == BTREE_ID_ALLOC + ? FSCK_CAN_IGNORE : 0, +@@ -1027,6 +1038,11 @@ int bch2_fs_recovery(struct bch_fs *c) + goto err; + } + ++ if (c->opts.reconstruct_alloc) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ drop_alloc_keys(&c->journal_keys); ++ } ++ + ret = journal_replay_early(c, clean, &c->journal_entries); + if (ret) + goto err; +-- +cgit v1.2.3 + + +From 4f330eedfde1d4e8a432ac203d79b68d78857f7d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 25 Oct 2020 01:08:28 -0400 +Subject: bcachefs: Always write a journal entry when stopping journal + +This is to fix a (harmless) bug where the read clock hand in the +superblock doesn't match the journal. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index b8b719902c63..c2cafd3892a4 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -980,9 +980,11 @@ void bch2_fs_journal_stop(struct journal *j) + + wait_event(j->wait, journal_entry_close(j)); + +- /* do we need to write another journal entry? 
*/ +- if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) +- bch2_journal_meta(j); ++ /* ++ * Always write a new journal entry, to make sure the clock hands are up ++ * to date (and match the superblock) ++ */ ++ bch2_journal_meta(j); + + journal_quiesce(j); + +-- +cgit v1.2.3 + + +From 232097240cc7d8091b363815fb0d532db703724a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 26 Oct 2020 14:54:55 -0400 +Subject: bcachefs: Add mode to bch2_inode_to_text + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/inode.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 7d20f082ad45..28edc0834a92 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -271,6 +271,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, + return; + } + ++ pr_buf(out, "mode: %o ", unpacked.bi_mode); ++ + #define x(_name, _bits) \ + pr_buf(out, #_name ": %llu ", (u64) unpacked._name); + BCH_INODE_FIELDS() +-- +cgit v1.2.3 + + +From 570a74cb05e367d4642ff80ae6cb39e2f9890eae Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 26 Oct 2020 14:45:20 -0400 +Subject: bcachefs: Fix btree updates when mixing cached and non cached + iterators + +There was a bug where bch2_trans_update() would incorrectly delete a +pending update where the new update did not actually overwrite the +existing update, because we were incorrectly using BTREE_ITER_TYPE when +sorting pending btree updates. + +This affects the pending patch to use cached iterators for inode +updates. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 2 +- + fs/bcachefs/btree_iter.c | 2 +- + fs/bcachefs/btree_iter.h | 5 +++-- + fs/bcachefs/btree_update_leaf.c | 13 ++++++++++--- + 4 files changed, 15 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index bb94fa2341ee..45d44c8785bd 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -948,7 +948,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + * holding other locks that would cause us to deadlock: + */ + trans_for_each_iter(trans, linked) +- if (btree_iter_cmp(iter, linked) < 0) ++ if (btree_iter_lock_cmp(iter, linked) < 0) + __bch2_btree_iter_unlock(linked); + + if (sib == btree_prev_sib) +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 6fab76c3220c..ec831dcd6a12 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1104,7 +1104,7 @@ retry_all: + sorted[nr_sorted++] = iter->idx; + + #define btree_iter_cmp_by_idx(_l, _r) \ +- btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) ++ btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) + + bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); + #undef btree_iter_cmp_by_idx +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index bd9ec3ec9a92..f80e09255f68 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -177,8 +177,9 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); + void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); + void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + +-static inline int btree_iter_cmp(const struct btree_iter *l, +- const struct btree_iter *r) ++/* Sort order for locking btree iterators: */ ++static inline int btree_iter_lock_cmp(const struct btree_iter *l, ++ const struct btree_iter *r) + { + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: +diff --git 
a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5dbb19ff11ae..add85e3cae40 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -690,6 +690,13 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + ++static inline int btree_iter_pos_cmp(const struct btree_iter *l, ++ const struct btree_iter *r) ++{ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ bkey_cmp(l->pos, r->pos); ++} ++ + static void bch2_trans_update2(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) +@@ -707,12 +714,12 @@ static void bch2_trans_update2(struct btree_trans *trans, + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + trans_for_each_update2(trans, i) { +- if (btree_iter_cmp(n.iter, i->iter) == 0) { ++ if (btree_iter_pos_cmp(n.iter, i->iter) == 0) { + *i = n; + return; + } + +- if (btree_iter_cmp(n.iter, i->iter) <= 0) ++ if (btree_iter_pos_cmp(n.iter, i->iter) <= 0) + break; + } + +@@ -996,7 +1003,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + * Pending updates are kept sorted: first, find position of new update: + */ + trans_for_each_update(trans, i) +- if (btree_iter_cmp(iter, i->iter) <= 0) ++ if (btree_iter_pos_cmp(iter, i->iter) <= 0) + break; + + /* +-- +cgit v1.2.3 + + +From 12a6558aca472f44e495f86628889ad7671bbd53 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 26 Oct 2020 17:03:28 -0400 +Subject: bcachefs: fiemap fixes + + - fiemap didn't know about inline extents, fixed + - advancing to the next extent after we'd chased a pointer to the + reflink btree was wrong, fixed + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 23 +++++++++++++---------- + 1 file changed, 13 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index f7107d9e0001..46c208b39d8a 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -830,7 +830,7 @@ static int bch2_fill_extent(struct bch_fs *c, + struct fiemap_extent_info *info, + struct bkey_s_c k, unsigned flags) + { +- if (bkey_extent_is_data(k.k)) { ++ if (bkey_extent_is_direct_data(k.k)) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; +@@ -861,6 +861,12 @@ static int bch2_fill_extent(struct bch_fs *c, + } + + return 0; ++ } else if (bkey_extent_is_inline_data(k.k)) { ++ return fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ 0, k.k->size << 9, ++ flags| ++ FIEMAP_EXTENT_DATA_INLINE); + } else if (k.k->type == KEY_TYPE_reservation) { + return fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, +@@ -927,11 +933,10 @@ retry: + + sectors = min(sectors, k.k->size - offset_into_extent); + +- if (offset_into_extent) +- bch2_cut_front(POS(k.k->p.inode, +- bkey_start_offset(k.k) + +- offset_into_extent), +- cur.k); ++ bch2_cut_front(POS(k.k->p.inode, ++ bkey_start_offset(k.k) + ++ offset_into_extent), ++ cur.k); + bch2_key_resize(&cur.k->k, sectors); + cur.k->k.p = iter->pos; + cur.k->k.p.offset += cur.k->k.size; +@@ -946,10 +951,8 @@ retry: + bkey_copy(prev.k, cur.k); + have_extent = true; + +- if (k.k->type == KEY_TYPE_reflink_v) +- bch2_btree_iter_set_pos(iter, k.k->p); +- else +- bch2_btree_iter_next(iter); ++ bch2_btree_iter_set_pos(iter, ++ POS(iter->pos.inode, iter->pos.offset + sectors)); + } + + if (ret == -EINTR) +-- +cgit v1.2.3 + + +From 510443f1d2a44bf8ec2f9b62afb521b8e6eb1edc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 22 Sep 2019 19:10:21 -0400 +Subject: 
bcachefs: Use cached iterators for inode updates + +This switches inode updates to use cached btree iterators - which should +be a nice performance boost, since lock contention on the inodes btree +can be a bottleneck on multithreaded workloads. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 10 ++-- + fs/bcachefs/btree_key_cache.h | 3 ++ + fs/bcachefs/inode.c | 104 ++++++++++++++++++++++++++---------------- + 3 files changed, 72 insertions(+), 45 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 61662750dfc0..53fdbbef534d 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -29,8 +29,8 @@ static const struct rhashtable_params bch2_btree_key_cache_params = { + }; + + __flatten +-static inline struct bkey_cached * +-btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) ++inline struct bkey_cached * ++bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) + { + struct bkey_cached_key key = { + .btree_id = btree_id, +@@ -218,7 +218,7 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter) + goto fill; + } + retry: +- ck = btree_key_cache_find(c, iter->btree_id, iter->pos); ++ ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos); + if (!ck) { + if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { + iter->l[0].b = NULL; +@@ -415,7 +415,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, + struct bkey_cached_key key = { id, pos }; + + /* Fastpath - assume it won't be found: */ +- if (!btree_key_cache_find(c, id, pos)) ++ if (!bch2_btree_key_cache_find(c, id, pos)) + return 0; + + return btree_key_cache_flush_pos(trans, key, 0, true); +@@ -462,7 +462,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, + void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, + enum btree_id id, struct bpos pos) + { +- BUG_ON(btree_key_cache_find(trans->c, id, pos)); ++ BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); + } + #endif + +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index b1756c6c622c..d448264abcc8 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -1,6 +1,9 @@ + #ifndef _BCACHEFS_BTREE_KEY_CACHE_H + #define _BCACHEFS_BTREE_KEY_CACHE_H + ++struct bkey_cached * ++bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); ++ + int bch2_btree_iter_traverse_cached(struct btree_iter *); + + bool bch2_btree_insert_key_cached(struct btree_trans *, +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 28edc0834a92..a988f0ea4eff 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" ++#include "btree_key_cache.h" + #include "bkey_methods.h" + #include "btree_update.h" + #include "error.h" +@@ -189,11 +190,11 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), +- BTREE_ITER_SLOTS|flags); ++ BTREE_ITER_CACHED|flags); + if (IS_ERR(iter)) + return iter; + +- k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_cached(iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -390,7 +391,17 @@ again: + if (bkey_cmp(iter->pos, POS(0, max)) > 0) + break; + +- if (k.k->type != KEY_TYPE_inode) ++ /* ++ * There's a potential cache coherency issue with the btree key ++ * cache code here - we're iterating over the btree, skipping ++ * that 
cache. We should never see an empty slot that isn't ++ * actually empty due to a pending update in the key cache ++ * because the update that creates the inode isn't done with a ++ * cached iterator, but - better safe than sorry, check the ++ * cache before using a slot: ++ */ ++ if (k.k->type != KEY_TYPE_inode && ++ !bch2_btree_key_cache_find(trans->c, BTREE_ID_INODES, iter->pos)) + goto found_slot; + } + +@@ -424,6 +435,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) + struct bkey_i_inode_generation delete; + struct bpos start = POS(inode_nr, 0); + struct bpos end = POS(inode_nr + 1, 0); ++ struct bkey_s_c k; ++ u64 bi_generation; + int ret; + + /* +@@ -444,51 +457,62 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) + return ret; + + bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ bi_generation = 0; ++ ++ ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr)); ++ if (ret) { ++ if (ret != -EINTR) ++ bch_err(c, "error flushing btree key cache: %i", ret); ++ goto err; ++ } + + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- do { +- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); +- u32 bi_generation = 0; ++ k = bch2_btree_iter_peek_slot(iter); + +- ret = bkey_err(k); +- if (ret) +- break; ++ ret = bkey_err(k); ++ if (ret) ++ goto err; + +- bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, +- "inode %llu not found when deleting", +- inode_nr); ++ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, ++ "inode %llu not found when deleting", ++ inode_nr); + +- switch (k.k->type) { +- case KEY_TYPE_inode: { +- struct bch_inode_unpacked inode_u; ++ switch (k.k->type) { ++ case KEY_TYPE_inode: { ++ struct bch_inode_unpacked inode_u; + +- if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) +- bi_generation = inode_u.bi_generation + 1; +- break; +- } +- case KEY_TYPE_inode_generation: { +- struct bkey_s_c_inode_generation g = +- bkey_s_c_to_inode_generation(k); +- bi_generation = le32_to_cpu(g.v->bi_generation); +- break; +- } +- } ++ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) ++ bi_generation = inode_u.bi_generation + 1; ++ break; ++ } ++ case KEY_TYPE_inode_generation: { ++ struct bkey_s_c_inode_generation g = ++ bkey_s_c_to_inode_generation(k); ++ bi_generation = le32_to_cpu(g.v->bi_generation); ++ break; ++ } ++ } + +- if (!bi_generation) { +- bkey_init(&delete.k); +- delete.k.p.offset = inode_nr; +- } else { +- bkey_inode_generation_init(&delete.k_i); +- delete.k.p.offset = inode_nr; +- delete.v.bi_generation = cpu_to_le32(bi_generation); +- } ++ if (!bi_generation) { ++ bkey_init(&delete.k); ++ delete.k.p.offset = inode_nr; ++ } else { ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p.offset = inode_nr; ++ delete.v.bi_generation = cpu_to_le32(bi_generation); ++ } + +- bch2_trans_update(&trans, iter, &delete.k_i, 0); ++ bch2_trans_update(&trans, iter, &delete.k_i, 0); + +- ret = bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_NOFAIL); +- } while (ret == -EINTR); ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ if (ret == -EINTR) ++ goto retry; + + bch2_trans_exit(&trans); + return ret; +@@ -502,11 +526,11 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, +- POS(0, inode_nr), BTREE_ITER_SLOTS); ++ POS(0, inode_nr), BTREE_ITER_CACHED); + if (IS_ERR(iter)) + return PTR_ERR(iter); + +- k = 
bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_cached(iter); + ret = bkey_err(k); + if (ret) + goto err; +-- +cgit v1.2.3 + + +From 59dad571a6ecba965d7ce8c06a262eda4e0022c5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Oct 2020 14:10:52 -0400 +Subject: bcachefs: Fix stack corruption + +A bkey_on_stack_realloc() call was in the wrong place, and broken for +indirect extents + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 46c208b39d8a..ebe4fb25e896 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -920,9 +920,7 @@ retry: + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + +- bkey_on_stack_realloc(&cur, c, k.k->u64s); +- bkey_on_stack_realloc(&prev, c, k.k->u64s); +- bkey_reassemble(cur.k, k); ++ bkey_on_stack_reassemble(&cur, c, k); + + ret = bch2_read_indirect_extent(&trans, + &offset_into_extent, &cur); +@@ -930,6 +928,7 @@ retry: + break; + + k = bkey_i_to_s_c(cur.k); ++ bkey_on_stack_realloc(&prev, c, k.k->u64s); + + sectors = min(sectors, k.k->size - offset_into_extent); + +-- +cgit v1.2.3 + + +From a3e68792b7d3edcc57426911c298b2a1d4f5669a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 28 Oct 2020 14:17:46 -0400 +Subject: bcachefs: Improve tracing for transaction restarts + +We have a bug where we can get stuck with a process spinning in +transaction restarts - need more information. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 9 +++--- + fs/bcachefs/btree_cache.h | 2 +- + fs/bcachefs/btree_iter.c | 72 ++++++++++++++++++++++++++--------------- + fs/bcachefs/btree_key_cache.c | 2 +- + fs/bcachefs/btree_locking.h | 8 +++-- + include/trace/events/bcachefs.h | 43 ++++++++++++++++++++++-- + 6 files changed, 98 insertions(+), 38 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 45d44c8785bd..18fad71b8d94 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -705,7 +705,8 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) + */ + struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, + const struct bkey_i *k, unsigned level, +- enum six_lock_type lock_type) ++ enum six_lock_type lock_type, ++ unsigned long trace_ip) + { + struct btree_cache *bc = &c->btree_cache; + struct btree *b; +@@ -767,7 +768,7 @@ lock_node: + btree_node_unlock(iter, level + 1); + + if (!btree_node_lock(b, k->k.p, level, iter, lock_type, +- lock_node_check_fn, (void *) k)) { ++ lock_node_check_fn, (void *) k, trace_ip)) { + if (b->hash_val != btree_ptr_hash_val(k)) + goto retry; + return ERR_PTR(-EINTR); +@@ -935,7 +936,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + bch2_bkey_unpack(parent, &tmp.k, k); + + ret = bch2_btree_node_get(c, iter, &tmp.k, level, +- SIX_LOCK_intent); ++ SIX_LOCK_intent, _THIS_IP_); + + if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { + struct btree_iter *linked; +@@ -955,7 +956,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + btree_node_unlock(iter, level); + + ret = bch2_btree_node_get(c, iter, &tmp.k, level, +- SIX_LOCK_intent); ++ SIX_LOCK_intent, _THIS_IP_); + + /* + * before btree_iter_relock() calls btree_iter_verify_locks(): +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index d0d3a85bb8be..8a19e60e9258 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -23,7 +23,7 @@ struct 
btree *bch2_btree_node_mem_alloc(struct bch_fs *); + + struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, unsigned, +- enum six_lock_type); ++ enum six_lock_type, unsigned long); + + struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, + enum btree_id, unsigned); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ec831dcd6a12..ebed0103b03c 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -197,13 +197,13 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, + bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + unsigned level, struct btree_iter *iter, + enum six_lock_type type, +- six_lock_should_sleep_fn should_sleep_fn, +- void *p) ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) + { + struct btree_trans *trans = iter->trans; +- struct btree_iter *linked; ++ struct btree_iter *linked, *deadlock_iter = NULL; + u64 start_time = local_clock(); +- bool ret = true; ++ unsigned reason = 9; + + /* Check if it's safe to block: */ + trans_for_each_iter(trans, linked) { +@@ -228,10 +228,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + linked->locks_want = max_t(unsigned, + linked->locks_want, + __fls(linked->nodes_locked) + 1); +- if (!btree_iter_get_locks(linked, true, false)) +- ret = false; ++ if (!btree_iter_get_locks(linked, true, false)) { ++ deadlock_iter = linked; ++ reason = 1; ++ } + } else { +- ret = false; ++ deadlock_iter = linked; ++ reason = 2; + } + } + +@@ -247,23 +250,30 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + max(level + 1, max_t(unsigned, + linked->locks_want, + iter->locks_want)); +- if (!btree_iter_get_locks(linked, true, false)) +- ret = false; ++ if (!btree_iter_get_locks(linked, true, false)) { ++ deadlock_iter = linked; ++ reason = 3; ++ } + } else { +- ret = false; ++ deadlock_iter = linked; ++ reason = 4; + } + } + + /* Must lock btree nodes in key order: */ + if ((cmp_int(iter->btree_id, linked->btree_id) ?: +- -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) +- ret = false; ++ -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) { ++ deadlock_iter = linked; ++ reason = 5; ++ } + + if (iter->btree_id == linked->btree_id && + btree_node_locked(linked, level) && + bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, +- btree_iter_type(linked))) <= 0) +- ret = false; ++ btree_iter_type(linked))) <= 0) { ++ deadlock_iter = linked; ++ reason = 6; ++ } + + /* + * Recheck if this is a node we already have locked - since one +@@ -277,8 +287,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + } + } + +- if (unlikely(!ret)) { +- trace_trans_restart_would_deadlock(iter->trans->ip); ++ if (unlikely(deadlock_iter)) { ++ trace_trans_restart_would_deadlock(iter->trans->ip, ip, ++ reason, ++ deadlock_iter->btree_id, ++ btree_iter_type(deadlock_iter), ++ iter->btree_id, ++ btree_iter_type(iter)); + return false; + } + +@@ -945,7 +960,8 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) + } + + static inline int btree_iter_lock_root(struct btree_iter *iter, +- unsigned depth_want) ++ unsigned depth_want, ++ unsigned long trace_ip) + { + struct bch_fs *c = iter->trans->c; + struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; +@@ -974,7 +990,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, + lock_type = __btree_lock_want(iter, iter->level); + if (unlikely(!btree_node_lock(b, 
POS_MAX, iter->level, + iter, lock_type, +- lock_root_check_fn, rootp))) ++ lock_root_check_fn, rootp, ++ trace_ip))) + return -EINTR; + + if (likely(b == READ_ONCE(*rootp) && +@@ -1046,7 +1063,8 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, + btree_node_unlock(iter, plevel); + } + +-static __always_inline int btree_iter_down(struct btree_iter *iter) ++static __always_inline int btree_iter_down(struct btree_iter *iter, ++ unsigned long trace_ip) + { + struct bch_fs *c = iter->trans->c; + struct btree_iter_level *l = &iter->l[iter->level]; +@@ -1060,7 +1078,7 @@ static __always_inline int btree_iter_down(struct btree_iter *iter) + bch2_bkey_unpack(l->b, &tmp.k, + bch2_btree_node_iter_peek(&l->iter, l->b)); + +- b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); ++ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip); + if (unlikely(IS_ERR(b))) + return PTR_ERR(b); + +@@ -1084,7 +1102,7 @@ static void btree_iter_up(struct btree_iter *iter) + btree_node_unlock(iter, iter->level++); + } + +-static int btree_iter_traverse_one(struct btree_iter *); ++static int btree_iter_traverse_one(struct btree_iter *, unsigned long); + + static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) + { +@@ -1109,6 +1127,7 @@ retry_all: + bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); + #undef btree_iter_cmp_by_idx + bch2_trans_unlock(trans); ++ cond_resched(); + + if (unlikely(ret == -ENOMEM)) { + struct closure cl; +@@ -1139,7 +1158,7 @@ retry_all: + if (!(trans->iters_linked & (1ULL << idx))) + continue; + +- ret = btree_iter_traverse_one(&trans->iters[idx]); ++ ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_); + if (ret) + goto retry_all; + } +@@ -1202,7 +1221,8 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, + * On error, caller (peek_node()/peek_key()) must return NULL; the error is + * stashed in the iterator and returned from bch2_trans_exit(). + */ +-static int btree_iter_traverse_one(struct btree_iter *iter) ++static int btree_iter_traverse_one(struct btree_iter *iter, ++ unsigned long trace_ip) + { + unsigned depth_want = iter->level; + +@@ -1249,8 +1269,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter) + */ + while (iter->level > depth_want) { + int ret = btree_iter_node(iter, iter->level) +- ? btree_iter_down(iter) +- : btree_iter_lock_root(iter, depth_want); ++ ? 
btree_iter_down(iter, trace_ip) ++ : btree_iter_lock_root(iter, depth_want, trace_ip); + if (unlikely(ret)) { + if (ret == 1) + return 0; +@@ -1281,7 +1301,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) + int ret; + + ret = bch2_trans_cond_resched(trans) ?: +- btree_iter_traverse_one(iter); ++ btree_iter_traverse_one(iter, _RET_IP_); + if (unlikely(ret)) + ret = __btree_iter_traverse_all(trans, ret); + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 53fdbbef534d..9ff77d982211 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -242,7 +242,7 @@ retry: + enum six_lock_type lock_want = __btree_lock_want(iter, 0); + + if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, +- bkey_cached_check_fn, iter)) { ++ bkey_cached_check_fn, iter, _THIS_IP_)) { + if (ck->key.btree_id != iter->btree_id || + bkey_cmp(ck->key.pos, iter->pos)) { + goto retry; +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index 81fbf3e18647..38323e32731f 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -176,13 +176,15 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, + + bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, + struct btree_iter *, enum six_lock_type, +- six_lock_should_sleep_fn, void *); ++ six_lock_should_sleep_fn, void *, ++ unsigned long); + + static inline bool btree_node_lock(struct btree *b, + struct bpos pos, unsigned level, + struct btree_iter *iter, + enum six_lock_type type, +- six_lock_should_sleep_fn should_sleep_fn, void *p) ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) + { + struct btree_trans *trans = iter->trans; + bool ret; +@@ -200,7 +202,7 @@ static inline bool btree_node_lock(struct btree *b, + ret = likely(six_trylock_type(&b->c.lock, type)) || + btree_node_lock_increment(trans, b, level, type) || + __bch2_btree_node_lock(b, pos, level, iter, type, +- should_sleep_fn, p); ++ should_sleep_fn, p, ip); + + #ifdef CONFIG_BCACHEFS_DEBUG + trans->locking = NULL; +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 55d1eff1108b..235e9cfa6a64 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -536,9 +536,46 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, + TP_ARGS(ip) + ); + +-DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) ++TRACE_EVENT(trans_restart_would_deadlock, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ unsigned reason, ++ enum btree_id have_btree_id, ++ unsigned have_iter_type, ++ enum btree_id want_btree_id, ++ unsigned want_iter_type), ++ TP_ARGS(trans_ip, caller_ip, reason, ++ have_btree_id, have_iter_type, ++ want_btree_id, want_iter_type), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, trans_ip ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, reason ) ++ __field(u8, have_btree_id ) ++ __field(u8, have_iter_type ) ++ __field(u8, want_btree_id ) ++ __field(u8, want_iter_type ) ++ ), ++ ++ TP_fast_assign( ++ __entry->trans_ip = trans_ip; ++ __entry->caller_ip = caller_ip; ++ __entry->reason = reason; ++ __entry->have_btree_id = have_btree_id; ++ __entry->have_iter_type = have_iter_type; ++ __entry->want_btree_id = want_btree_id; ++ __entry->want_iter_type = want_iter_type; ++ ), ++ ++ TP_printk("%pF %pF because %u have %u:%u want %u:%u", ++ (void *) __entry->trans_ip, ++ (void 
*) __entry->caller_ip, ++ __entry->reason, ++ __entry->have_btree_id, ++ __entry->have_iter_type, ++ __entry->want_btree_id, ++ __entry->want_iter_type) + ); + + TRACE_EVENT(trans_restart_iters_realloced, +-- +cgit v1.2.3 + + +From 42c98bf5767c5fda3a0993d201daed5aed4272c3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 28 Oct 2020 14:18:18 -0400 +Subject: bcachefs: Fix spurious transaction restarts + +The check for whether locking a btree node would deadlock was wrong - we +have to check that interior nodes are locked before descendents, but +this check was wrong when consider cached vs. non cached iterators. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 1 + + fs/bcachefs/btree_types.h | 5 +++++ + 2 files changed, 6 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ebed0103b03c..357b8514e65c 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -244,6 +244,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + * we're about to lock, it must have the ancestors locked too: + */ + if (linked->btree_id == iter->btree_id && ++ btree_iter_is_cached(linked) == btree_iter_is_cached(iter) && + level > __fls(linked->nodes_locked)) { + if (!(trans->nounlock)) { + linked->locks_want = +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index cc01baeec138..844d853eacc3 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -283,6 +283,11 @@ btree_iter_type(const struct btree_iter *iter) + return iter->flags & BTREE_ITER_TYPE; + } + ++static inline bool btree_iter_is_cached(const struct btree_iter *iter) ++{ ++ return btree_iter_type(iter) == BTREE_ITER_CACHED; ++} ++ + static inline struct btree_iter_level *iter_l(struct btree_iter *iter) + { + return iter->l + iter->level; +-- +cgit v1.2.3 + + +From e251ae8dd05f7ca11208f5d1f5da0aeedd063c00 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 30 Oct 2020 17:29:38 -0400 +Subject: bcachefs: Improve check for when bios are physically contiguous + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/compress.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 595d76aa3956..27bbc265d550 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -70,7 +70,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + + BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); + +- if (!IS_ENABLED(CONFIG_HIGHMEM) && ++ if (!PageHighMem(bio_iter_page(bio, start)) && + bio_phys_contig(bio, start)) + return (struct bbuf) { + .b = page_address(bio_iter_page(bio, start)) + +-- +cgit v1.2.3 + + +From cee4b965beb2fb7c36da53a1c9402769a64cfedf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Oct 2020 18:56:21 -0400 +Subject: bcachefs: Inode create optimization + +On workloads that do a lot of multithreaded creates all at once, lock +contention on the inodes btree turns out to still be an issue. + +This patch adds a small buffer of inode numbers that are known to be +free, so that we can avoid touching the btree on every create. Also, +this changes inode creates to update via the btree key cache for the +initial create. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 4 ++ + fs/bcachefs/fs-common.c | 4 +- + fs/bcachefs/inode.c | 137 ++++++++++++++++++++++++++++++++---------------- + fs/bcachefs/inode.h | 4 +- + fs/bcachefs/super.c | 1 + + 5 files changed, 100 insertions(+), 50 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index a6178bdfd362..b94439d1df9b 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -802,6 +802,10 @@ struct bch_fs { + struct mutex verify_lock; + #endif + ++ struct mutex inode_create_lock; ++ unsigned unused_inodes_nr; ++ u64 unused_inodes[64]; ++ u32 unused_inodes_gens[64]; + u64 unused_inode_hint; + + /* +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 878419d40992..503ce1920f39 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -34,9 +34,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + if (!name) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + +- ret = bch2_inode_create(trans, new_inode, +- BLOCKDEV_INODE_MAX, 0, +- &c->unused_inode_hint); ++ ret = bch2_inode_create(trans, new_inode); + if (ret) + goto err; + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index a988f0ea4eff..9a0991adf550 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -361,71 +361,120 @@ static inline u32 bkey_generation(struct bkey_s_c k) + } + } + +-int bch2_inode_create(struct btree_trans *trans, +- struct bch_inode_unpacked *inode_u, +- u64 min, u64 max, u64 *hint) ++static int scan_free_inums(struct btree_trans *trans) + { +- struct bkey_inode_buf *inode_p; ++ struct bch_fs *c = trans->c; + struct btree_iter *iter = NULL; + struct bkey_s_c k; +- u64 start; +- int ret; +- +- if (!max) +- max = ULLONG_MAX; +- +- if (trans->c->opts.inodes_32bit) +- max = min_t(u64, max, U32_MAX); ++ u64 min = BLOCKDEV_INODE_MAX; ++ u64 max = c->opts.inodes_32bit ++ ? 
S32_MAX : S64_MAX; ++ u64 start = max(min, READ_ONCE(c->unused_inode_hint)); ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, start), ++ BTREE_ITER_SLOTS); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++again: ++ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { ++ if (bkey_cmp(iter->pos, POS(0, max)) > 0) ++ break; + +- start = READ_ONCE(*hint); ++ /* ++ * This doesn't check the btree key cache, but we don't care: ++ * we have to recheck with an intent lock held on the slot we're ++ * inserting to anyways: ++ */ ++ if (k.k->type != KEY_TYPE_inode) { ++ if (c->unused_inodes_nr < ARRAY_SIZE(c->unused_inodes)) { ++ c->unused_inodes[c->unused_inodes_nr] = k.k->p.offset; ++ c->unused_inodes_gens[c->unused_inodes_nr] = bkey_generation(k); ++ c->unused_inodes_nr++; ++ } ++ ++ if (c->unused_inodes_nr == ARRAY_SIZE(c->unused_inodes)) ++ goto out; ++ } ++ } + +- if (start >= max || start < min) ++ if (!ret && start != min) { ++ max = start; + start = min; ++ bch2_btree_iter_set_pos(iter, POS(0, start)); ++ goto again; ++ } ++out: ++ c->unused_inode_hint = iter->pos.offset; ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int bch2_inode_create(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_inode_buf *inode_p; ++ struct btree_iter *iter = NULL; ++ struct bkey_s_c k; ++ u64 inum; ++ u32 generation; ++ int ret = 0; + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); +-again: +- for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { +- if (bkey_cmp(iter->pos, POS(0, max)) > 0) +- break; + +- /* +- * There's a potential cache coherency issue with the btree key +- * cache code here - we're iterating over the btree, skipping +- * that cache. 
We should never see an empty slot that isn't +- * actually empty due to a pending update in the key cache +- * because the update that creates the inode isn't done with a +- * cached iterator, but - better safe than sorry, check the +- * cache before using a slot: +- */ +- if (k.k->type != KEY_TYPE_inode && +- !bch2_btree_key_cache_find(trans->c, BTREE_ID_INODES, iter->pos)) +- goto found_slot; ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS_MIN, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++retry: ++ if (!mutex_trylock(&c->inode_create_lock)) { ++ bch2_trans_unlock(trans); ++ mutex_lock(&c->inode_create_lock); ++ if (!bch2_trans_relock(trans)) { ++ mutex_unlock(&c->inode_create_lock); ++ ret = -EINTR; ++ goto err; ++ } + } + +- bch2_trans_iter_put(trans, iter); ++ if (!c->unused_inodes_nr) ++ ret = scan_free_inums(trans); ++ if (!ret && !c->unused_inodes_nr) ++ ret = -ENOSPC; ++ if (!ret) { ++ --c->unused_inodes_nr; ++ inum = c->unused_inodes[c->unused_inodes_nr]; ++ generation = c->unused_inodes_gens[c->unused_inodes_nr]; ++ } ++ ++ mutex_unlock(&c->inode_create_lock); + + if (ret) +- return ret; ++ goto err; + +- if (start != min) { +- /* Retry from start */ +- start = min; +- goto again; +- } ++ bch2_btree_iter_set_pos(iter, POS(0, inum)); ++ ++ /* Recheck that the slot is free with an intent lock held: */ ++ k = bch2_btree_iter_peek_cached(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type == KEY_TYPE_inode) ++ goto retry; + +- return -ENOSPC; +-found_slot: +- *hint = k.k->p.offset; +- inode_u->bi_inum = k.k->p.offset; +- inode_u->bi_generation = bkey_generation(k); ++ inode_u->bi_inum = inum; ++ inode_u->bi_generation = generation; + + bch2_inode_pack(inode_p, inode_u); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++err: + bch2_trans_iter_put(trans, iter); +- return 0; ++ return ret; + } + + int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index bb759a46dc41..5743be2307f3 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -60,9 +60,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); + +-int bch2_inode_create(struct btree_trans *, +- struct bch_inode_unpacked *, +- u64, u64, u64 *); ++int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); + + int bch2_inode_rm(struct bch_fs *, u64); + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 8489b96e758f..3ba3fa531b63 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -695,6 +695,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + seqcount_init(&c->usage_lock); + + sema_init(&c->io_in_flight, 64); ++ mutex_init(&c->inode_create_lock); + + c->copy_gc_enabled = 1; + c->rebalance.enabled = 1; +-- +cgit v1.2.3 + + +From 2547983b2492a7f6f497282c6b42a97deda00b0c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 2 Nov 2020 17:51:38 -0500 +Subject: bcachefs: Minor journal reclaim improvement + +With the btree key cache code, journal reclaim now has a lot more work +to do. It could be the case that after journal reclaim has finished one +iteration there's already more work to do, so put it in a loop to check +for that. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 82 ++++++++++++++++++++++++------------------- + fs/bcachefs/super.c | 4 +-- + 2 files changed, 48 insertions(+), 38 deletions(-) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 57591983eebd..18e45296e7de 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -465,34 +465,12 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, + return ret; + } + +-/** +- * bch2_journal_reclaim - free up journal buckets +- * +- * Background journal reclaim writes out btree nodes. It should be run +- * early enough so that we never completely run out of journal buckets. +- * +- * High watermarks for triggering background reclaim: +- * - FIFO has fewer than 512 entries left +- * - fewer than 25% journal buckets free +- * +- * Background reclaim runs until low watermarks are reached: +- * - FIFO has more than 1024 entries left +- * - more than 50% journal buckets free +- * +- * As long as a reclaim can complete in the time it takes to fill up +- * 512 journal entries or 25% of all journal buckets, then +- * journal_next_bucket() should not stall. +- */ +-void bch2_journal_reclaim(struct journal *j) ++static u64 journal_seq_to_flush(struct journal *j) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; +- unsigned iter, min_nr = 0; + u64 seq_to_flush = 0; +- +- lockdep_assert_held(&j->reclaim_lock); +- +- bch2_journal_do_discards(j); ++ unsigned iter; + + spin_lock(&j->lock); + +@@ -524,20 +502,52 @@ void bch2_journal_reclaim(struct journal *j) + (j->pin.size >> 1)); + spin_unlock(&j->lock); + +- /* +- * If it's been longer than j->reclaim_delay_ms since we last flushed, +- * make sure to flush at least one journal pin: +- */ +- if (time_after(jiffies, j->last_flushed + +- msecs_to_jiffies(j->reclaim_delay_ms))) +- min_nr = 1; ++ return seq_to_flush; ++} + +- if (j->prereserved.reserved * 2 > j->prereserved.remaining) { +- seq_to_flush = max(seq_to_flush, journal_last_seq(j)); +- min_nr = 1; +- } ++/** ++ * bch2_journal_reclaim - free up journal buckets ++ * ++ * Background journal reclaim writes out btree nodes. It should be run ++ * early enough so that we never completely run out of journal buckets. ++ * ++ * High watermarks for triggering background reclaim: ++ * - FIFO has fewer than 512 entries left ++ * - fewer than 25% journal buckets free ++ * ++ * Background reclaim runs until low watermarks are reached: ++ * - FIFO has more than 1024 entries left ++ * - more than 50% journal buckets free ++ * ++ * As long as a reclaim can complete in the time it takes to fill up ++ * 512 journal entries or 25% of all journal buckets, then ++ * journal_next_bucket() should not stall. 
++ */ ++void bch2_journal_reclaim(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ unsigned min_nr = 0; ++ u64 seq_to_flush = 0; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ do { ++ bch2_journal_do_discards(j); ++ ++ seq_to_flush = journal_seq_to_flush(j); ++ min_nr = 0; ++ ++ /* ++ * If it's been longer than j->reclaim_delay_ms since we last flushed, ++ * make sure to flush at least one journal pin: ++ */ ++ if (time_after(jiffies, j->last_flushed + ++ msecs_to_jiffies(j->reclaim_delay_ms))) ++ min_nr = 1; + +- journal_flush_pins(j, seq_to_flush, min_nr); ++ if (j->prereserved.reserved * 2 > j->prereserved.remaining) ++ min_nr = 1; ++ } while (journal_flush_pins(j, seq_to_flush, min_nr)); + + if (!bch2_journal_error(j)) + queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 3ba3fa531b63..211a17c7eeec 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -738,9 +738,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + if (!(c->wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || +- !(c->copygc_wq = alloc_workqueue("bcache_copygc", ++ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || +- !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", ++ !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || + percpu_ref_init(&c->writes, bch2_writes_disabled, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || +-- +cgit v1.2.3 + + +From 9d94b350c4f2d3dd68f117f3267ec7dc410341e1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 2 Nov 2020 18:20:44 -0500 +Subject: bcachefs: Drop sysfs interface to debug parameters + +It's not used much anymore, the module paramter interface is better. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.h | 2 +- + fs/bcachefs/bcachefs.h | 14 ++++++++++---- + fs/bcachefs/bkey_methods.c | 2 +- + fs/bcachefs/bset.c | 18 ++++++++---------- + fs/bcachefs/bset.h | 19 ++++--------------- + fs/bcachefs/btree_cache.c | 8 ++++---- + fs/bcachefs/btree_gc.c | 12 ++++++------ + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/btree_iter.c | 12 ++++++------ + fs/bcachefs/btree_types.h | 4 ---- + fs/bcachefs/btree_update_leaf.c | 6 +++--- + fs/bcachefs/debug.c | 2 +- + fs/bcachefs/debug.h | 33 ++------------------------------- + fs/bcachefs/extents.c | 4 ++-- + fs/bcachefs/sysfs.c | 19 ------------------- + 15 files changed, 49 insertions(+), 108 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 66ce54724e93..8e3abb89dfb7 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -74,7 +74,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) + static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, + size_t bucket) + { +- if (expensive_debug_checks(c)) { ++ if (bch2_expensive_debug_checks) { + size_t iter; + long i; + unsigned j; +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index b94439d1df9b..1bcc44915eb7 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -296,6 +296,16 @@ do { \ + #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() + #endif + ++#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#ifndef CONFIG_BCACHEFS_DEBUG ++#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++#endif ++ + #define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_node_split) \ +@@ -831,10 +841,6 @@ struct bch_fs { + unsigned copy_gc_enabled:1; + bool promote_whole_extents; + +-#define BCH_DEBUG_PARAM(name, description) bool name; +- BCH_DEBUG_PARAMS_ALL() +-#undef BCH_DEBUG_PARAM +- + struct time_stats times[BCH_TIME_STAT_NR]; + }; + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 32849229801d..99b7fce2bfd3 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -236,7 +236,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, + const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; + enum merge_result ret; + +- if (key_merging_disabled(c) || ++ if (bch2_key_merging_disabled || + !ops->key_merge || + l.k->type != r.k->type || + bversion_cmp(l.k->version, r.k->version) || +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index f7c2841ed8a7..6c7cc1035bfa 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -388,15 +388,13 @@ static void bset_aux_tree_verify(struct btree *b) + #endif + } + +-void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) ++void bch2_btree_keys_init(struct btree *b) + { + unsigned i; + + b->nsets = 0; + memset(&b->nr, 0, sizeof(b->nr)); +-#ifdef CONFIG_BCACHEFS_DEBUG +- b->expensive_debug_checks = expensive_debug_checks; +-#endif ++ + for (i = 0; i < MAX_BSETS; i++) + b->set[i].data_offset = U16_MAX; + +@@ -522,7 +520,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, + struct bkey_packed *k = btree_bkey_first(b, t); + unsigned j = 0; + +- if (!btree_keys_expensive_checks(b)) ++ if (!bch2_expensive_debug_checks) + return; + + BUG_ON(bset_has_ro_aux_tree(t)); +@@ -922,7 +920,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + 
k = p; + } + +- if (btree_keys_expensive_checks(b)) { ++ if (bch2_expensive_debug_checks) { + BUG_ON(ret >= orig_k); + + for (i = ret +@@ -1345,7 +1343,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, + bkey_iter_pos_cmp(b, m, search) < 0) + m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + +- if (btree_keys_expensive_checks(b)) { ++ if (bch2_expensive_debug_checks) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); + + BUG_ON(prev && +@@ -1601,7 +1599,7 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, + void bch2_btree_node_iter_advance(struct btree_node_iter *iter, + struct btree *b) + { +- if (btree_keys_expensive_checks(b)) { ++ if (bch2_expensive_debug_checks) { + bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_next_check(iter, b); + } +@@ -1620,7 +1618,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, + struct bset_tree *t; + unsigned end = 0; + +- if (btree_keys_expensive_checks(b)) ++ if (bch2_expensive_debug_checks) + bch2_btree_node_iter_verify(iter, b); + + for_each_bset(b, t) { +@@ -1656,7 +1654,7 @@ found: + iter->data[0].k = __btree_node_key_to_offset(b, prev); + iter->data[0].end = end; + +- if (btree_keys_expensive_checks(b)) ++ if (bch2_expensive_debug_checks) + bch2_btree_node_iter_verify(iter, b); + return prev; + } +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 5921cf689105..52d2c9f2d847 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -5,7 +5,7 @@ + #include + #include + +-#include "bcachefs_format.h" ++#include "bcachefs.h" + #include "bkey.h" + #include "bkey_methods.h" + #include "btree_types.h" +@@ -147,17 +147,6 @@ + * first key in that range of bytes again. + */ + +-extern bool bch2_expensive_debug_checks; +- +-static inline bool btree_keys_expensive_checks(const struct btree *b) +-{ +-#ifdef CONFIG_BCACHEFS_DEBUG +- return bch2_expensive_debug_checks || *b->expensive_debug_checks; +-#else +- return false; +-#endif +-} +- + enum bset_aux_tree_type { + BSET_NO_AUX_TREE, + BSET_RO_AUX_TREE, +@@ -228,7 +217,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, + compiled_unpack_fn unpack_fn = b->aux_data; + unpack_fn(dst, src); + +- if (btree_keys_expensive_checks(b)) { ++ if (bch2_expensive_debug_checks) { + struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + + BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); +@@ -366,7 +355,7 @@ static inline struct bset *bset_next_set(struct btree *b, + return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); + } + +-void bch2_btree_keys_init(struct btree *, bool *); ++void bch2_btree_keys_init(struct btree *); + + void bch2_bset_init_first(struct btree *, struct bset *); + void bch2_bset_init_next(struct bch_fs *, struct btree *, +@@ -654,7 +643,7 @@ static inline void bch2_verify_insert_pos(struct btree *b, + + static inline void bch2_verify_btree_nr_keys(struct btree *b) + { +- if (btree_keys_expensive_checks(b)) ++ if (bch2_expensive_debug_checks) + __bch2_verify_btree_nr_keys(b); + } + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 18fad71b8d94..325a16615a06 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -211,7 +211,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ +- if (verify_btree_ondisk(c)) ++ if (bch2_verify_btree_ondisk) + bch2_btree_node_write(c, b, SIX_LOCK_intent); + else + 
__bch2_btree_node_write(c, b, SIX_LOCK_read); +@@ -254,7 +254,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + unsigned long freed = 0; + unsigned i, flags; + +- if (btree_shrinker_disabled(c)) ++ if (bch2_btree_shrinker_disabled) + return SHRINK_STOP; + + /* Return -1 if we can't do anything right now */ +@@ -341,7 +341,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, + btree_cache.shrink); + struct btree_cache *bc = &c->btree_cache; + +- if (btree_shrinker_disabled(c)) ++ if (bch2_btree_shrinker_disabled) + return 0; + + return btree_cache_can_free(bc) * btree_pages(c); +@@ -590,7 +590,7 @@ out: + b->sib_u64s[0] = 0; + b->sib_u64s[1] = 0; + b->whiteout_u64s = 0; +- bch2_btree_keys_init(b, &c->expensive_debug_checks); ++ bch2_btree_keys_init(b); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + start_time); +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index e8c1e752a25d..ba4acc112ed3 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -101,7 +101,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + int ret = 0; + + if (initial) { +- BUG_ON(journal_seq_verify(c) && ++ BUG_ON(bch2_journal_seq_verify && + k.k->version.lo > journal_cur_seq(&c->journal)); + + /* XXX change to fsck check */ +@@ -209,7 +209,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + struct btree_iter *iter; + struct btree *b; + unsigned depth = metadata_only ? 1 +- : expensive_debug_checks(c) ? 0 ++ : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; + u8 max_stale = 0; +@@ -236,8 +236,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); +- else if (!btree_gc_rewrite_disabled(c) && +- (btree_gc_always_rewrite(c) || max_stale > 16)) ++ else if (!bch2_btree_gc_rewrite_disabled && ++ (bch2_btree_gc_always_rewrite || max_stale > 16)) + bch2_btree_node_rewrite(c, iter, + b->data->keys.seq, + BTREE_INSERT_NOWAIT| +@@ -328,7 +328,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, + { + struct btree *b; + unsigned target_depth = metadata_only ? 1 +- : expensive_debug_checks(c) ? 0 ++ : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 
1 + : 0; + u8 max_stale = 0; +@@ -835,7 +835,7 @@ again: + out: + if (!ret && + (test_bit(BCH_FS_FIXED_GENS, &c->flags) || +- (!iter && test_restart_gc(c)))) { ++ (!iter && bch2_test_restart_gc))) { + /* + * XXX: make sure gens we fixed got saved + */ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 682f599cbef5..d344d2ea51be 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1044,7 +1044,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + const char *invalid = bch2_bkey_val_invalid(c, u.s_c); + + if (invalid || +- (inject_invalid_keys(c) && ++ (bch2_inject_invalid_keys && + !bversion_cmp(u.k->version, MAX_VERSION))) { + char buf[160]; + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 357b8514e65c..9ada864536a6 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -487,7 +487,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, + char buf1[100], buf2[100]; + const char *msg; + +- if (!debug_check_iterators(iter->trans->c)) ++ if (!bch2_debug_check_iterators) + return; + + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { +@@ -583,7 +583,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) + { + struct btree_iter *iter; + +- if (!debug_check_iterators(trans->c)) ++ if (!bch2_debug_check_iterators) + return; + + trans_for_each_iter_with_node(trans, b, iter) +@@ -755,7 +755,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, + __bch2_btree_node_iter_fix(iter, b, node_iter, t, + where, clobber_u64s, new_u64s); + +- if (debug_check_iterators(iter->trans->c)) ++ if (bch2_debug_check_iterators) + bch2_btree_node_iter_verify(node_iter, b); + } + +@@ -785,7 +785,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, + + ret = bkey_disassemble(l->b, k, u); + +- if (debug_check_bkeys(iter->trans->c)) ++ if (bch2_debug_check_bkeys) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); + + return ret; +@@ -1566,13 +1566,13 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) + + ret.v = bkeyp_val(&l->b->format, _k); + +- if (debug_check_iterators(iter->trans->c)) { ++ if (bch2_debug_check_iterators) { + struct bkey k = bkey_unpack_key(l->b, _k); + + BUG_ON(memcmp(&k, &iter->k, sizeof(k))); + } + +- if (debug_check_bkeys(iter->trans->c)) ++ if (bch2_debug_check_bkeys) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); + } + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 844d853eacc3..7ba016d4ad30 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -130,10 +130,6 @@ struct btree { + + struct btree_write writes[2]; + +-#ifdef CONFIG_BCACHEFS_DEBUG +- bool *expensive_debug_checks; +-#endif +- + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + }; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index add85e3cae40..8a9048d55103 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -220,7 +220,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct bch_fs *c = trans->c; + + BUG_ON(bkey_cmp(insert->k.p, iter->pos)); +- BUG_ON(debug_check_bkeys(c) && ++ BUG_ON(bch2_debug_check_bkeys && + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + __btree_node_type(iter->level, iter->btree_id))); + } +@@ -440,10 +440,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + */ + + if (!(trans->flags & 
BTREE_INSERT_JOURNAL_REPLAY)) { +- if (journal_seq_verify(c)) ++ if (bch2_journal_seq_verify) + trans_for_each_update2(trans, i) + i->k->k.version.lo = trans->journal_res.seq; +- else if (inject_invalid_keys(c)) ++ else if (bch2_inject_invalid_keys) + trans_for_each_update2(trans, i) + i->k->k.version = MAX_VERSION; + } +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index aa10591a3b1a..bbe3fefa2651 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -54,7 +54,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + v->written = 0; + v->c.level = b->c.level; + v->c.btree_id = b->c.btree_id; +- bch2_btree_keys_init(v, &c->expensive_debug_checks); ++ bch2_btree_keys_init(v); + + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick) <= 0) +diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h +index 56c2d1ab5f63..7ac1615e9447 100644 +--- a/fs/bcachefs/debug.h ++++ b/fs/bcachefs/debug.h +@@ -8,44 +8,15 @@ struct bio; + struct btree; + struct bch_fs; + +-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; +-BCH_DEBUG_PARAMS() +-#undef BCH_DEBUG_PARAM +- +-#define BCH_DEBUG_PARAM(name, description) \ +- static inline bool name(struct bch_fs *c) \ +- { return bch2_##name || c->name; } +-BCH_DEBUG_PARAMS_ALWAYS() +-#undef BCH_DEBUG_PARAM +- + #ifdef CONFIG_BCACHEFS_DEBUG +- +-#define BCH_DEBUG_PARAM(name, description) \ +- static inline bool name(struct bch_fs *c) \ +- { return bch2_##name || c->name; } +-BCH_DEBUG_PARAMS_DEBUG() +-#undef BCH_DEBUG_PARAM +- + void __bch2_btree_verify(struct bch_fs *, struct btree *); +- +-#define bypass_torture_test(d) ((d)->bypass_torture_test) +- +-#else /* DEBUG */ +- +-#define BCH_DEBUG_PARAM(name, description) \ +- static inline bool name(struct bch_fs *c) { return false; } +-BCH_DEBUG_PARAMS_DEBUG() +-#undef BCH_DEBUG_PARAM +- ++#else + static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} +- +-#define bypass_torture_test(d) 0 +- + #endif + + static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) + { +- if (verify_btree_ondisk(c)) ++ if (bch2_verify_btree_ondisk) + __bch2_btree_verify(c, b); + } + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 88297b30f622..7fae6a4ba26f 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -89,7 +89,7 @@ static inline bool ptr_better(struct bch_fs *c, + return bch2_rand_range(l1 + l2) > l1; + } + +- if (force_reconstruct_read(c)) ++ if (bch2_force_reconstruct_read) + return p1.idx > p2.idx; + + return p1.idx < p2.idx; +@@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + !bch2_dev_is_readable(ca)) + p.idx++; + +- if (force_reconstruct_read(c) && ++ if (bch2_force_reconstruct_read && + !p.idx && p.has_ec) + p.idx++; + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 0cb29f43d99d..d7ad293aff4d 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -208,12 +208,6 @@ read_attribute(io_timers_write); + write_attribute(perf_test); + #endif /* CONFIG_BCACHEFS_TESTS */ + +-#define BCH_DEBUG_PARAM(name, description) \ +- rw_attribute(name); +- +- BCH_DEBUG_PARAMS() +-#undef BCH_DEBUG_PARAM +- + #define x(_name) \ + static struct attribute sysfs_time_stat_##_name = \ + { .name = #_name, .mode = S_IRUGO }; +@@ -414,10 +408,6 @@ SHOW(bch2_fs) + return out.pos - buf; + } + +-#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); +- BCH_DEBUG_PARAMS() +-#undef BCH_DEBUG_PARAM +- + return 0; + } + +@@ -462,10 +452,6 @@ 
STORE(bch2_fs) + + /* Debugging: */ + +-#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); +- BCH_DEBUG_PARAMS() +-#undef BCH_DEBUG_PARAM +- + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EPERM; + +@@ -590,11 +576,6 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_io_timers_write, + + &sysfs_internal_uuid, +- +-#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, +- BCH_DEBUG_PARAMS() +-#undef BCH_DEBUG_PARAM +- + NULL + }; + +-- +cgit v1.2.3 + + +From a3500e8c657be87621a0b920c7a51afc1feca0a1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 2 Nov 2020 18:36:08 -0500 +Subject: bcachefs: Split out debug_check_btree_accounting + +This check is very expensive + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 2 ++ + fs/bcachefs/bset.h | 2 +- + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 1bcc44915eb7..6d25a283770f 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -266,6 +266,8 @@ do { \ + BCH_DEBUG_PARAM(debug_check_bkeys, \ + "Run bkey_debugcheck (primarily checking GC/allocation "\ + "information) when iterating over keys") \ ++ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ ++ "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(verify_btree_ondisk, \ + "Reread btree nodes at various points to verify the " \ + "mergesort in the read path against modifications " \ +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 52d2c9f2d847..58b58a484845 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -643,7 +643,7 @@ static inline void bch2_verify_insert_pos(struct btree *b, + + static inline void bch2_verify_btree_nr_keys(struct btree *b) + { +- if (bch2_expensive_debug_checks) ++ if (bch2_debug_check_btree_accounting) + __bch2_verify_btree_nr_keys(b); + } + +-- +cgit v1.2.3 + + +From bd3334f53e96cf2f513a120c3bea160f44f22fa0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 2 Nov 2020 18:54:33 -0500 +Subject: bcachefs: Don't embed btree iters in btree_trans + +These haven't been in used since reallocing iterators has been disabled, +and saves us a lot of stack if we get rid of it. 
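+
+For a sense of scale, a simplified model of the stack cost of embedding the
+arrays versus holding pointers -- the sizes here are rough stand-ins for
+illustration, not the real bcachefs layouts:
+
+	struct fake_iter   { char pad[600]; };	/* stand-in for btree_iter */
+	struct fake_update { char pad[48];  };	/* stand-in for btree_insert_entry */
+
+	struct trans_embedded {			/* old: arrays live in the struct */
+		struct fake_iter   iters_onstack[2];
+		struct fake_update updates_onstack[2];
+		struct fake_update updates2_onstack[2];
+	};					/* ~1.4 KiB, and btree_trans is
+						 * typically a stack variable */
+
+	struct trans_pointers {			/* new: just three pointers */
+		struct fake_iter   *iters;
+		struct fake_update *updates;
+		struct fake_update *updates2;
+	};					/* 24 bytes on a 64-bit build */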
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 13 +++---------- + fs/bcachefs/btree_types.h | 4 ---- + 2 files changed, 3 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 9ada864536a6..9e1971e6f484 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2039,8 +2039,7 @@ success: + sizeof(struct btree_iter) * trans->nr_iters + + sizeof(struct btree_insert_entry) * trans->nr_iters); + +- if (trans->iters != trans->iters_onstack) +- kfree(trans->iters); ++ kfree(trans->iters); + + trans->iters = new_iters; + trans->updates = new_updates; +@@ -2330,21 +2329,15 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + unsigned expected_nr_iters, + size_t expected_mem_bytes) + { +- memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); +- + /* + * reallocating iterators currently completely breaks + * bch2_trans_iter_put(): + */ + expected_nr_iters = BTREE_ITER_MAX; + ++ memset(trans, 0, sizeof(*trans)); + trans->c = c; + trans->ip = _RET_IP_; +- trans->size = ARRAY_SIZE(trans->iters_onstack); +- trans->iters = trans->iters_onstack; +- trans->updates = trans->updates_onstack; +- trans->updates2 = trans->updates2_onstack; +- trans->fs_usage_deltas = NULL; + + if (expected_nr_iters > trans->size) + bch2_trans_realloc_iters(trans, expected_nr_iters); +@@ -2376,7 +2369,7 @@ int bch2_trans_exit(struct btree_trans *trans) + kfree(trans->mem); + if (trans->used_mempool) + mempool_free(trans->iters, &trans->c->btree_iters_pool); +- else if (trans->iters != trans->iters_onstack) ++ else + kfree(trans->iters); + trans->mem = (void *) 0x1; + trans->iters = (void *) 0x1; +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 7ba016d4ad30..93721fbc7794 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -381,10 +381,6 @@ struct btree_trans { + unsigned journal_u64s; + unsigned journal_preres_u64s; + struct replicas_delta_list *fs_usage_deltas; +- +- struct btree_iter iters_onstack[2]; +- struct btree_insert_entry updates_onstack[2]; +- struct btree_insert_entry updates2_onstack[2]; + }; + + #define BTREE_FLAG(flag) \ +-- +cgit v1.2.3 + + +From d409f946d79f8d125d4bbdfbe755a7e574cb57a8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 2 Nov 2020 19:15:18 -0500 +Subject: bcachefs: add const annotations to bset.c + +perhaps a bit silly, but some debug assertions we want to add need const +propagated a bit more. 
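+
+A minimal illustration (not bcachefs code) of why the const has to be
+propagated through the helpers first: a checker that takes a const pointer
+can only hand it to functions that also accept const, otherwise gcc warns
+about discarded qualifiers and a -Werror build breaks.
+
+	struct node { unsigned nr; };
+
+	static unsigned node_capacity(const struct node *n)
+	{
+		return n->nr * 2;	/* fine: only reads through n */
+	}
+
+	static void node_verify(const struct node *n)
+	{
+		/*
+		 * If node_capacity() took a plain 'struct node *', this call
+		 * would discard the const qualifier from n.
+		 */
+		BUG_ON(node_capacity(n) < n->nr);
+	}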
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 14 +++++++------- + fs/bcachefs/bset.h | 6 +++--- + 2 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 6c7cc1035bfa..26716657453f 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -369,10 +369,10 @@ static struct bkey_float *bkey_float(const struct btree *b, + return ro_aux_tree_base(b, t)->f + idx; + } + +-static void bset_aux_tree_verify(struct btree *b) ++static void bset_aux_tree_verify(const struct btree *b) + { + #ifdef CONFIG_BCACHEFS_DEBUG +- struct bset_tree *t; ++ const struct bset_tree *t; + + for_each_bset(b, t) { + if (t->aux_data_offset == U16_MAX) +@@ -708,20 +708,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + } + + /* bytes remaining - only valid for last bset: */ +-static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) ++static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) + { + bset_aux_tree_verify(b); + + return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); + } + +-static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) ++static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) + { + return __bset_tree_capacity(b, t) / + (sizeof(struct bkey_float) + sizeof(u8)); + } + +-static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) ++static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) + { + return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); + } +@@ -1225,8 +1225,8 @@ static inline bool bkey_mantissa_bits_dropped(const struct btree *b, + + __flatten + static struct bkey_packed *bset_search_tree(const struct btree *b, +- struct bset_tree *t, +- struct bpos *search, ++ const struct bset_tree *t, ++ const struct bpos *search, + const struct bkey_packed *packed_search) + { + struct ro_aux_tree *base = ro_aux_tree_base(b, t); +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 58b58a484845..60cfecc2a9bc 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -190,17 +190,17 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree + + #define BSET_CACHELINE 128 + +-static inline size_t btree_keys_cachelines(struct btree *b) ++static inline size_t btree_keys_cachelines(const struct btree *b) + { + return (1U << b->byte_order) / BSET_CACHELINE; + } + +-static inline size_t btree_aux_data_bytes(struct btree *b) ++static inline size_t btree_aux_data_bytes(const struct btree *b) + { + return btree_keys_cachelines(b) * 8; + } + +-static inline size_t btree_aux_data_u64s(struct btree *b) ++static inline size_t btree_aux_data_u64s(const struct btree *b) + { + return btree_aux_data_bytes(b) / sizeof(u64); + } +-- +cgit v1.2.3 + + +From ceb491ec838abca15314484c587ad525f4776708 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 2 Nov 2020 19:49:23 -0500 +Subject: bcachefs: Report inode counts via statfs + +Took awhile to figure out exactly what statfs wanted... 
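+
+The '<< 3' in the hunk below follows from the 64-bytes-per-inode assumption,
+given that usage.capacity and usage.used are in 512-byte sectors (which the
+'>> (s_blocksize_bits - 9)' conversion implies):
+
+	free_bytes       = free_sectors * 512
+	estimated_inodes = free_bytes / 64
+	                 = free_sectors * 512 / 64
+	                 = free_sectors * 8
+	                 = free_sectors << 3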
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index ebe4fb25e896..c7f16064116d 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1260,6 +1260,11 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) + struct bch_fs *c = sb->s_fs_info; + struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); + unsigned shift = sb->s_blocksize_bits - 9; ++ /* ++ * this assumes inodes take up 64 bytes, which is a decent average ++ * number: ++ */ ++ u64 avail_inodes = ((usage.capacity - usage.used) << 3); + u64 fsid; + + buf->f_type = BCACHEFS_STATFS_MAGIC; +@@ -1267,8 +1272,9 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) + buf->f_blocks = usage.capacity >> shift; + buf->f_bfree = (usage.capacity - usage.used) >> shift; + buf->f_bavail = buf->f_bfree; +- buf->f_files = 0; +- buf->f_ffree = 0; ++ ++ buf->f_files = usage.nr_inodes + avail_inodes; ++ buf->f_ffree = avail_inodes; + + fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ + le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); +-- +cgit v1.2.3 + + +From 5ea9fed1a161f1567f9525bed175773a19754018 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 2 Nov 2020 23:51:33 -0500 +Subject: bcachefs: Improved inode create optimization + +This shards new inodes into different btree nodes by using the processor +ID for the high bits of the new inode number. Much faster than the +previous inode create optimization - this also helps with sharding in +the other btrees that index by inode number. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 7 +-- + fs/bcachefs/inode.c | 139 +++++++++++++++++-------------------------------- + fs/bcachefs/super.c | 6 ++- + 3 files changed, 54 insertions(+), 98 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 6d25a283770f..7369269aef66 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -814,11 +814,8 @@ struct bch_fs { + struct mutex verify_lock; + #endif + +- struct mutex inode_create_lock; +- unsigned unused_inodes_nr; +- u64 unused_inodes[64]; +- u32 unused_inodes_gens[64]; +- u64 unused_inode_hint; ++ u64 *unused_inode_hints; ++ unsigned inode_shard_bits; + + /* + * A btree node on disk could have too many bsets for an iterator to fit +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 9a0991adf550..d7622049069e 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -361,55 +361,6 @@ static inline u32 bkey_generation(struct bkey_s_c k) + } + } + +-static int scan_free_inums(struct btree_trans *trans) +-{ +- struct bch_fs *c = trans->c; +- struct btree_iter *iter = NULL; +- struct bkey_s_c k; +- u64 min = BLOCKDEV_INODE_MAX; +- u64 max = c->opts.inodes_32bit +- ? 
S32_MAX : S64_MAX; +- u64 start = max(min, READ_ONCE(c->unused_inode_hint)); +- int ret = 0; +- +- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, start), +- BTREE_ITER_SLOTS); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); +-again: +- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { +- if (bkey_cmp(iter->pos, POS(0, max)) > 0) +- break; +- +- /* +- * This doesn't check the btree key cache, but we don't care: +- * we have to recheck with an intent lock held on the slot we're +- * inserting to anyways: +- */ +- if (k.k->type != KEY_TYPE_inode) { +- if (c->unused_inodes_nr < ARRAY_SIZE(c->unused_inodes)) { +- c->unused_inodes[c->unused_inodes_nr] = k.k->p.offset; +- c->unused_inodes_gens[c->unused_inodes_nr] = bkey_generation(k); +- c->unused_inodes_nr++; +- } +- +- if (c->unused_inodes_nr == ARRAY_SIZE(c->unused_inodes)) +- goto out; +- } +- } +- +- if (!ret && start != min) { +- max = start; +- start = min; +- bch2_btree_iter_set_pos(iter, POS(0, start)); +- goto again; +- } +-out: +- c->unused_inode_hint = iter->pos.offset; +- bch2_trans_iter_put(trans, iter); +- return ret; +-} +- + int bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u) + { +@@ -417,64 +368,68 @@ int bch2_inode_create(struct btree_trans *trans, + struct bkey_inode_buf *inode_p; + struct btree_iter *iter = NULL; + struct bkey_s_c k; +- u64 inum; +- u32 generation; +- int ret = 0; ++ u64 min, max, start, *hint; ++ int ret; ++ ++ unsigned cpu = raw_smp_processor_id(); ++ unsigned bits = (c->opts.inodes_32bit ++ ? 31 : 63) - c->inode_shard_bits; ++ ++ min = (cpu << bits); ++ max = (cpu << bits) | ~(ULLONG_MAX << bits); ++ ++ min = max_t(u64, min, BLOCKDEV_INODE_MAX); ++ hint = c->unused_inode_hints + cpu; ++ ++ start = READ_ONCE(*hint); ++ ++ if (start >= max || start < min) ++ start = min; + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); ++again: ++ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(iter->pos, POS(0, max)) > 0) ++ break; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS_MIN, +- BTREE_ITER_CACHED| +- BTREE_ITER_INTENT); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); +-retry: +- if (!mutex_trylock(&c->inode_create_lock)) { +- bch2_trans_unlock(trans); +- mutex_lock(&c->inode_create_lock); +- if (!bch2_trans_relock(trans)) { +- mutex_unlock(&c->inode_create_lock); +- ret = -EINTR; +- goto err; +- } +- } +- +- if (!c->unused_inodes_nr) +- ret = scan_free_inums(trans); +- if (!ret && !c->unused_inodes_nr) +- ret = -ENOSPC; +- if (!ret) { +- --c->unused_inodes_nr; +- inum = c->unused_inodes[c->unused_inodes_nr]; +- generation = c->unused_inodes_gens[c->unused_inodes_nr]; ++ /* ++ * There's a potential cache coherency issue with the btree key ++ * cache code here - we're iterating over the btree, skipping ++ * that cache. 
We should never see an empty slot that isn't ++ * actually empty due to a pending update in the key cache ++ * because the update that creates the inode isn't done with a ++ * cached iterator, but - better safe than sorry, check the ++ * cache before using a slot: ++ */ ++ if (k.k->type != KEY_TYPE_inode && ++ !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos)) ++ goto found_slot; + } + +- mutex_unlock(&c->inode_create_lock); +- +- if (ret) +- goto err; +- +- bch2_btree_iter_set_pos(iter, POS(0, inum)); ++ bch2_trans_iter_put(trans, iter); + +- /* Recheck that the slot is free with an intent lock held: */ +- k = bch2_btree_iter_peek_cached(iter); +- ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + +- if (k.k->type == KEY_TYPE_inode) +- goto retry; ++ if (start != min) { ++ /* Retry from start */ ++ start = min; ++ goto again; ++ } + +- inode_u->bi_inum = inum; +- inode_u->bi_generation = generation; ++ return -ENOSPC; ++found_slot: ++ *hint = k.k->p.offset; ++ inode_u->bi_inum = k.k->p.offset; ++ inode_u->bi_generation = bkey_generation(k); + + bch2_inode_pack(inode_p, inode_u); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); +-err: + bch2_trans_iter_put(trans, iter); +- return ret; ++ return 0; + } + + int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 211a17c7eeec..ac4fc611f8be 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -485,6 +485,7 @@ static void __bch2_fs_free(struct bch_fs *c) + kfree(c->replicas_gc.entries); + kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); ++ kfree(c->unused_inode_hints); + free_heap(&c->copygc_heap); + + if (c->journal_reclaim_wq) +@@ -695,7 +696,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + seqcount_init(&c->usage_lock); + + sema_init(&c->io_in_flight, 64); +- mutex_init(&c->inode_create_lock); + + c->copy_gc_enabled = 1; + c->rebalance.enabled = 1; +@@ -736,6 +736,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + (btree_blocks(c) + 1) * 2 * + sizeof(struct sort_iter_set); + ++ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); ++ + if (!(c->wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", +@@ -753,6 +755,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, + btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || ++ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, ++ sizeof(u64), GFP_KERNEL)) || + bch2_io_clock_init(&c->io_clock[READ]) || + bch2_io_clock_init(&c->io_clock[WRITE]) || + bch2_fs_journal_init(&c->journal) || +-- +cgit v1.2.3 + + +From 811e48edf04749593d90c905aa50a52222fee021 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 5 Nov 2020 12:02:10 -0500 +Subject: bcachefs: Delete memcpy() macro + +This checks for overlapping src and dst in debug mode (i.e. should have +used memmove) - kasan should be checking for this now. 
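+
+Restating the (now deleted) check in standalone form, since the macro's
+condition is easy to misread: two ranges of length len starting at dst and
+src do not overlap iff one ends at or before the point where the other
+begins.  A sketch, not the kernel code:
+
+	#include <stdbool.h>
+	#include <stddef.h>
+	#include <stdint.h>
+
+	static bool ranges_overlap(const void *dst, const void *src, size_t len)
+	{
+		uintptr_t d = (uintptr_t) dst, s = (uintptr_t) src;
+
+		return !(d + len <= s ||	/* dst entirely below src */
+			 s + len <= d);		/* src entirely below dst */
+	}
+
+	/* memcpy() is only safe when this is false; overlapping copies need memmove() */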
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/util.h | 11 ----------- + 1 file changed, 11 deletions(-) + +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 0f3be4d59e97..192e2fd94689 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -36,17 +36,6 @@ struct closure; + #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) + #define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) + +-#define memcpy(dst, src, len) \ +-({ \ +- void *_dst = (dst); \ +- const void *_src = (src); \ +- size_t _len = (len); \ +- \ +- BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ +- (void *) (_dst) + (_len) <= (void *) (_src))); \ +- memcpy(_dst, _src, _len); \ +-}) +- + #else /* DEBUG */ + + #define EBUG_ON(cond) +-- +cgit v1.2.3 + + +From 14ab09789cb84f41240430333ce360730dc5cd4a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 5 Nov 2020 12:16:05 -0500 +Subject: bcachefs: Build fixes for 32bit x86 + +PAGE_SIZE and size_t are not unsigned longs on 32 bit, annoying... + +also switch to atomic64_cmpxchg instead of cmpxchg() for +journal_seq_copy, as atomic64_cmpxchg has a fallback that uses spinlocks +for when it's not supported. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 2 +- + fs/bcachefs/fs.c | 7 ++++++- + fs/bcachefs/io.c | 2 +- + fs/bcachefs/util.c | 2 +- + 4 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index bc43a20fb3c4..23254864cfb1 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1586,7 +1586,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) + size_t i; + + spin_lock(&c->ec_stripes_heap_lock); +- for (i = 0; i < min(h->used, 20UL); i++) { ++ for (i = 0; i < min_t(size_t, h->used, 20); i++) { + m = genradix_ptr(&c->stripes[0], h->data[i].idx); + + pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index c7f16064116d..3db630028d06 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -44,6 +44,11 @@ static void journal_seq_copy(struct bch_fs *c, + struct bch_inode_info *dst, + u64 journal_seq) + { ++ /* ++ * atomic64_cmpxchg has a fallback for archs that don't support it, ++ * cmpxchg does not: ++ */ ++ atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; + u64 old, v = READ_ONCE(dst->ei_journal_seq); + + do { +@@ -51,7 +56,7 @@ static void journal_seq_copy(struct bch_fs *c, + + if (old >= journal_seq) + break; +- } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); ++ } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); + + bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); + } +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 74393a21ecb5..ba2944b071fe 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -170,7 +170,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + + while (size) { + struct page *page = __bio_alloc_page_pool(c, &using_mempool); +- unsigned len = min(PAGE_SIZE, size); ++ unsigned len = min_t(size_t, PAGE_SIZE, size); + + BUG_ON(!bio_add_page(bio, page, len, 0)); + size -= len; +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index fd4044a6a08f..2709163e02b5 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -520,7 +520,7 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) + { + while (size) { + struct page *page = alloc_page(gfp_mask); +- unsigned len = min(PAGE_SIZE, size); ++ unsigned len = min_t(size_t, PAGE_SIZE, size); + + if 
(!page) + return -ENOMEM; +-- +cgit v1.2.3 + + +From ba18d7e2ccc732a7714879d4d21894cea343369a Mon Sep 17 00:00:00 2001 +From: "Matthew Wilcox (Oracle)" +Date: Thu, 5 Nov 2020 15:58:37 +0000 +Subject: bcachefs: Remove page_state_init_for_read + +This is dead code; delete the function. + +Signed-off-by: Matthew Wilcox (Oracle) +--- + fs/bcachefs/fs-io.c | 6 ------ + 1 file changed, 6 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index a342ba102cdc..d08aba14abdf 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -673,12 +673,6 @@ static void bch2_readpages_end_io(struct bio *bio) + bio_put(bio); + } + +-static inline void page_state_init_for_read(struct page *page) +-{ +- SetPagePrivate(page); +- page->private = 0; +-} +- + struct readpages_iter { + struct address_space *mapping; + struct page **pages; +-- +cgit v1.2.3 + + +From f03c7b29134a4530c71110a6c0d4937e956e0dbb Mon Sep 17 00:00:00 2001 +From: "Matthew Wilcox (Oracle)" +Date: Thu, 5 Nov 2020 15:58:38 +0000 +Subject: bcachefs: Use attach_page_private and detach_page_private + +These recently added helpers simplify the code. + +Signed-off-by: Matthew Wilcox (Oracle) +--- + fs/bcachefs/fs-io.c | 39 ++++++--------------------------------- + 1 file changed, 6 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index d08aba14abdf..b41848de619a 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -291,28 +291,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page) + /* for newly allocated pages: */ + static void __bch2_page_state_release(struct page *page) + { +- struct bch_page_state *s = __bch2_page_state(page); +- +- if (!s) +- return; +- +- ClearPagePrivate(page); +- set_page_private(page, 0); +- put_page(page); +- kfree(s); ++ kfree(detach_page_private(page)); + } + + static void bch2_page_state_release(struct page *page) + { +- struct bch_page_state *s = bch2_page_state(page); +- +- if (!s) +- return; +- +- ClearPagePrivate(page); +- set_page_private(page, 0); +- put_page(page); +- kfree(s); ++ EBUG_ON(!PageLocked(page)); ++ __bch2_page_state_release(page); + } + + /* for newly allocated pages: */ +@@ -326,13 +311,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, + return NULL; + + spin_lock_init(&s->lock); +- /* +- * migrate_page_move_mapping() assumes that pages with private data +- * have their count elevated by 1. +- */ +- get_page(page); +- set_page_private(page, (unsigned long) s); +- SetPagePrivate(page); ++ attach_page_private(page, s); + return s; + } + +@@ -634,14 +613,8 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + +- if (PagePrivate(page)) { +- ClearPagePrivate(page); +- get_page(newpage); +- set_page_private(newpage, page_private(page)); +- set_page_private(page, 0); +- put_page(page); +- SetPagePrivate(newpage); +- } ++ if (PagePrivate(page)) ++ attach_page_private(newpage, detach_page_private(page)); + + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); +-- +cgit v1.2.3 + + +From 9004e01e95b7778c762be769291d6fbe2d465093 Mon Sep 17 00:00:00 2001 +From: "Matthew Wilcox (Oracle)" +Date: Thu, 5 Nov 2020 23:28:37 +0000 +Subject: bcachefs: Convert to readahead + +Use the new readahead method instead of readpages. 
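+
+For orientation, the rough shape of a ->readahead hook, with error handling
+and the actual I/O elided -- see the real bch2_readahead() below for how
+bcachefs batches the pages instead of walking them one at a time:
+
+	#include <linux/pagemap.h>
+
+	static void example_readahead(struct readahead_control *ractl)
+	{
+		struct page *page;
+
+		pr_debug("readahead: %u pages at index %lu\n",
+			 readahead_count(ractl), readahead_index(ractl));
+
+		/* pages are already locked and added to the page cache */
+		while ((page = readahead_page(ractl))) {
+			/* ...queue the read for this page... */
+			put_page(page);	/* drop the ref readahead_page() took */
+		}
+	}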
+ +Signed-off-by: Matthew Wilcox (Oracle) +--- + fs/bcachefs/fs-io.c | 85 ++++++++++------------------------------------------- + fs/bcachefs/fs-io.h | 3 +- + fs/bcachefs/fs.c | 2 +- + 3 files changed, 17 insertions(+), 73 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index b41848de619a..7bf9ca86f854 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -80,24 +80,6 @@ struct dio_read { + struct bch_read_bio rbio; + }; + +-/* stub version */ +-static int add_to_page_cache_lru_vec(struct address_space *mapping, +- struct page **pages, +- unsigned nr_pages, +- pgoff_t offset, gfp_t gfp_mask) +-{ +- int i, err = 0; +- +- for (i = 0; i < nr_pages; i++) { +- err = add_to_page_cache_lru(pages[i], mapping, +- offset + i, gfp_mask); +- if (err) +- break; +- } +- +- return i ?: err; +-} +- + /* pagecache_block must be held */ + static int write_invalidate_inode_pages_range(struct address_space *mapping, + loff_t start, loff_t end) +@@ -650,31 +632,29 @@ struct readpages_iter { + struct address_space *mapping; + struct page **pages; + unsigned nr_pages; +- unsigned nr_added; + unsigned idx; + pgoff_t offset; + }; + + static int readpages_iter_init(struct readpages_iter *iter, +- struct address_space *mapping, +- struct list_head *pages, unsigned nr_pages) ++ struct readahead_control *ractl) + { ++ unsigned i, nr_pages = readahead_count(ractl); ++ + memset(iter, 0, sizeof(*iter)); + +- iter->mapping = mapping; +- iter->offset = list_last_entry(pages, struct page, lru)->index; ++ iter->mapping = ractl->mapping; ++ iter->offset = readahead_index(ractl); ++ iter->nr_pages = nr_pages; + + iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!iter->pages) + return -ENOMEM; + +- while (!list_empty(pages)) { +- struct page *page = list_last_entry(pages, struct page, lru); +- +- __bch2_page_state_create(page, __GFP_NOFAIL); +- +- iter->pages[iter->nr_pages++] = page; +- list_del(&page->lru); ++ __readahead_batch(ractl, iter->pages, nr_pages); ++ for (i = 0; i < nr_pages; i++) { ++ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); ++ put_page(iter->pages[i]); + } + + return 0; +@@ -682,41 +662,9 @@ static int readpages_iter_init(struct readpages_iter *iter, + + static inline struct page *readpage_iter_next(struct readpages_iter *iter) + { +- struct page *page; +- unsigned i; +- int ret; +- +- BUG_ON(iter->idx > iter->nr_added); +- BUG_ON(iter->nr_added > iter->nr_pages); +- +- if (iter->idx < iter->nr_added) +- goto out; +- +- while (1) { +- if (iter->idx == iter->nr_pages) +- return NULL; +- +- ret = add_to_page_cache_lru_vec(iter->mapping, +- iter->pages + iter->nr_added, +- iter->nr_pages - iter->nr_added, +- iter->offset + iter->nr_added, +- GFP_NOFS); +- if (ret > 0) +- break; +- +- page = iter->pages[iter->nr_added]; +- iter->idx++; +- iter->nr_added++; +- +- __bch2_page_state_release(page); +- put_page(page); +- } +- +- iter->nr_added += ret; ++ if (iter->idx >= iter->nr_pages) ++ return NULL; + +- for (i = iter->idx; i < iter->nr_added; i++) +- put_page(iter->pages[i]); +-out: + EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); + + return iter->pages[iter->idx]; +@@ -882,10 +830,9 @@ retry: + bkey_on_stack_exit(&sk, c); + } + +-int bch2_readpages(struct file *file, struct address_space *mapping, +- struct list_head *pages, unsigned nr_pages) ++void bch2_readahead(struct readahead_control *ractl) + { +- struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_inode_info *inode = 
to_bch_ei(ractl->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); + struct btree_trans trans; +@@ -894,7 +841,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, + struct readpages_iter readpages_iter; + int ret; + +- ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); ++ ret = readpages_iter_init(&readpages_iter, ractl); + BUG_ON(ret); + + bch2_trans_init(&trans, c, 0, 0); +@@ -929,8 +876,6 @@ int bch2_readpages(struct file *file, struct address_space *mapping, + + bch2_trans_exit(&trans); + kfree(readpages_iter.pages); +- +- return 0; + } + + static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, +diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h +index 7063556d289b..2537a3d25ede 100644 +--- a/fs/bcachefs/fs-io.h ++++ b/fs/bcachefs/fs-io.h +@@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *); + int bch2_readpage(struct file *, struct page *); + + int bch2_writepages(struct address_space *, struct writeback_control *); +-int bch2_readpages(struct file *, struct address_space *, +- struct list_head *, unsigned); ++void bch2_readahead(struct readahead_control *); + + int bch2_write_begin(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page **, void **); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 3db630028d06..57a4b59c77d5 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1089,7 +1089,7 @@ static const struct address_space_operations bch_address_space_operations = { + .writepage = bch2_writepage, + .readpage = bch2_readpage, + .writepages = bch2_writepages, +- .readpages = bch2_readpages, ++ .readahead = bch2_readahead, + .set_page_dirty = __set_page_dirty_nobuffers, + .write_begin = bch2_write_begin, + .write_end = bch2_write_end, +-- +cgit v1.2.3 + + +From 7b91bd915c0fbc95e9d0c347bf057e4bb655195b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 5 Nov 2020 20:02:01 -0500 +Subject: bcachefs: Add a single slot percpu buf for btree iters + +Allocating our array of btree iters is a big enough allocation that it +hits the buddy allocator, and we're seeing lots of lock contention. +Sticking a single element buffer in front of it should help. 
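+
+The pattern, stripped of the bcachefs specifics (a sketch of the idea only --
+the code below parks the buffer in front of a mempool rather than kmalloc):
+
+	struct obj_buf { void *obj; };
+	static struct obj_buf __percpu *obj_bufs;	/* alloc_percpu()'d at init */
+
+	static void *obj_alloc(size_t bytes)
+	{
+		/* take the per-cpu cached object if present, else slow path */
+		return this_cpu_xchg(obj_bufs->obj, NULL) ?:
+			kmalloc(bytes, GFP_NOFS);
+	}
+
+	static void obj_free(void *obj)
+	{
+		/* park the object in the per-cpu slot, free the old occupant */
+		obj = this_cpu_xchg(obj_bufs->obj, obj);
+		kfree(obj);	/* kfree(NULL) is a no-op */
+	}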
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 5 ++++ + fs/bcachefs/btree_iter.c | 73 ++++++++++++++++++++++++++++++------------------ + fs/bcachefs/super.c | 8 ++++++ + 3 files changed, 59 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 7369269aef66..cf05ffa94af9 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -542,6 +542,10 @@ struct journal_keys { + u64 journal_seq_base; + }; + ++struct btree_iter_buf { ++ struct btree_iter *iter; ++}; ++ + struct bch_fs { + struct closure cl; + +@@ -637,6 +641,7 @@ struct bch_fs { + struct mutex btree_trans_lock; + struct list_head btree_trans_list; + mempool_t btree_iters_pool; ++ struct btree_iter_buf __percpu *btree_iters_bufs; + + struct btree_key_cache btree_key_cache; + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 9e1971e6f484..f8c0b68c77c7 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1991,6 +1991,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, + return bch2_trans_iter_put(trans, iter); + } + ++#if 0 + static int bch2_trans_realloc_iters(struct btree_trans *trans, + unsigned new_size) + { +@@ -2053,6 +2054,7 @@ success: + + return 0; + } ++#endif + + static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) + { +@@ -2062,28 +2064,27 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) + goto got_slot; + + if (trans->nr_iters == trans->size) { +- int ret; +- +- if (trans->nr_iters >= BTREE_ITER_MAX) { +- struct btree_iter *iter; +- +- trans_for_each_iter(trans, iter) { +- pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", +- bch2_btree_ids[iter->btree_id], +- iter->pos.inode, +- iter->pos.offset, +- (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", +- (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", +- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", +- (void *) iter->ip_allocated); +- } ++ struct btree_iter *iter; + +- panic("trans iter oveflow\n"); ++ BUG_ON(trans->size < BTREE_ITER_MAX); ++ ++ trans_for_each_iter(trans, iter) { ++ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", ++ bch2_btree_ids[iter->btree_id], ++ iter->pos.inode, ++ iter->pos.offset, ++ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", ++ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", ++ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", ++ (void *) iter->ip_allocated); + } + ++ panic("trans iter oveflow\n"); ++#if 0 + ret = bch2_trans_realloc_iters(trans, trans->size * 2); + if (ret) + return ERR_PTR(ret); ++#endif + } + + idx = trans->nr_iters++; +@@ -2325,22 +2326,37 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + bch2_btree_iter_traverse_all(trans); + } + ++static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) ++{ ++ unsigned new_size = BTREE_ITER_MAX; ++ size_t iters_bytes = sizeof(struct btree_iter) * new_size; ++ size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size; ++ void *p; ++ ++ BUG_ON(trans->used_mempool); ++ ++ p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?: ++ mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); ++ ++ trans->iters = p; p += iters_bytes; ++ trans->updates = p; p += updates_bytes; ++ trans->updates2 = p; p += updates_bytes; ++ trans->size = new_size; ++} ++ + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + unsigned expected_nr_iters, + size_t expected_mem_bytes) + { +- /* +- * reallocating iterators currently completely breaks +- * bch2_trans_iter_put(): +- */ +- expected_nr_iters = BTREE_ITER_MAX; +- + memset(trans, 0, sizeof(*trans)); + trans->c = c; + trans->ip = _RET_IP_; + +- if (expected_nr_iters > trans->size) +- bch2_trans_realloc_iters(trans, expected_nr_iters); ++ /* ++ * reallocating iterators currently completely breaks ++ * bch2_trans_iter_put(), we always allocate the max: ++ */ ++ bch2_trans_alloc_iters(trans, c); + + if (expected_mem_bytes) + bch2_trans_preload_mem(trans, expected_mem_bytes); +@@ -2355,6 +2371,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + + int bch2_trans_exit(struct btree_trans *trans) + { ++ struct bch_fs *c = trans->c; ++ + bch2_trans_unlock(trans); + + #ifdef CONFIG_BCACHEFS_DEBUG +@@ -2367,10 +2385,11 @@ int bch2_trans_exit(struct btree_trans *trans) + + kfree(trans->fs_usage_deltas); + kfree(trans->mem); +- if (trans->used_mempool) ++ ++ trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); ++ if (trans->iters) + mempool_free(trans->iters, &trans->c->btree_iters_pool); +- else +- kfree(trans->iters); ++ + trans->mem = (void *) 0x1; + trans->iters = (void *) 0x1; + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index ac4fc611f8be..d2ecc82b534c 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -451,6 +451,7 @@ int bch2_fs_read_write_early(struct bch_fs *c) + static void __bch2_fs_free(struct bch_fs *c) + { + unsigned i; ++ int cpu; + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_exit(&c->times[i]); +@@ -475,6 +476,12 @@ static void __bch2_fs_free(struct bch_fs *c) + free_percpu(c->usage[1]); + free_percpu(c->usage[0]); + kfree(c->usage_base); ++ ++ if (c->btree_iters_bufs) ++ for_each_possible_cpu(cpu) ++ kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); ++ ++ free_percpu(c->btree_iters_bufs); + free_percpu(c->pcpu); + mempool_exit(&c->large_bkey_pool); + mempool_exit(&c->btree_bounce_pool); +@@ -752,6 +759,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + offsetof(struct btree_write_bio, wbio.bio)), + BIOSET_NEED_BVECS) || + !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || ++ !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || + mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, + btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || +-- +cgit v1.2.3 + + +From 
ae30e70778fb63514b798661d408f0ff5286b3ca Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 5 Nov 2020 20:49:08 -0500 +Subject: bcachefs: Fix spurious transaction restarts + +The checks for lock ordering violations weren't quite right. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 39 +++++++++++++++++++++++++-------------- + fs/bcachefs/btree_iter.h | 2 +- + 2 files changed, 26 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f8c0b68c77c7..075926866e36 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -238,14 +238,32 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + } + } + ++ if (linked->btree_id != iter->btree_id) { ++ if (linked->btree_id > iter->btree_id) { ++ deadlock_iter = linked; ++ reason = 3; ++ } ++ continue; ++ } ++ ++ /* ++ * Within the same btree, cached iterators come before non ++ * cached iterators: ++ */ ++ if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) { ++ if (btree_iter_is_cached(iter)) { ++ deadlock_iter = linked; ++ reason = 4; ++ } ++ continue; ++ } ++ + /* + * Interior nodes must be locked before their descendants: if + * another iterator has possible descendants locked of the node + * we're about to lock, it must have the ancestors locked too: + */ +- if (linked->btree_id == iter->btree_id && +- btree_iter_is_cached(linked) == btree_iter_is_cached(iter) && +- level > __fls(linked->nodes_locked)) { ++ if (level > __fls(linked->nodes_locked)) { + if (!(trans->nounlock)) { + linked->locks_want = + max(level + 1, max_t(unsigned, +@@ -253,27 +271,20 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + iter->locks_want)); + if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; +- reason = 3; ++ reason = 5; + } + } else { + deadlock_iter = linked; +- reason = 4; ++ reason = 6; + } + } + + /* Must lock btree nodes in key order: */ +- if ((cmp_int(iter->btree_id, linked->btree_id) ?: +- -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) { +- deadlock_iter = linked; +- reason = 5; +- } +- +- if (iter->btree_id == linked->btree_id && +- btree_node_locked(linked, level) && ++ if (btree_node_locked(linked, level) && + bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, + btree_iter_type(linked))) <= 0) { + deadlock_iter = linked; +- reason = 6; ++ reason = 7; + } + + /* +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index f80e09255f68..f7a73619c85b 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -182,7 +182,7 @@ static inline int btree_iter_lock_cmp(const struct btree_iter *l, + const struct btree_iter *r) + { + return cmp_int(l->btree_id, r->btree_id) ?: +- -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: ++ -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: + bkey_cmp(l->pos, r->pos); + } + +-- +cgit v1.2.3 + + +From d896a84bf651231ab496cbb1a1a4e8e85de476be Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 6 Nov 2020 01:34:41 -0500 +Subject: bcachefs: More inlinining in the btree key cache code + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 9ff77d982211..0ee4f78ce67a 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -204,6 +204,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) + !bkey_cmp(ck->key.pos, 
iter->pos) ? 0 : -1; + } + ++__flatten + int bch2_btree_iter_traverse_cached(struct btree_iter *iter) + { + struct btree_trans *trans = iter->trans; +-- +cgit v1.2.3 + + +From e192fd86ba464b9a959a84aeb2872c61131ca4e5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 7 Nov 2020 12:31:20 -0500 +Subject: bcachefs: Drop typechecking from bkey_cmp_packed() + +This only did anything in two places, and those can just be replaced +wiht bkey_cmp_left_packed()). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.c | 8 +++---- + fs/bcachefs/bkey.h | 47 +++---------------------------------- + fs/bcachefs/bkey_sort.c | 10 ++++---- + fs/bcachefs/bset.h | 2 +- + fs/bcachefs/btree_io.c | 6 ++--- + fs/bcachefs/btree_update_interior.c | 2 +- + fs/bcachefs/btree_update_leaf.c | 2 +- + 7 files changed, 18 insertions(+), 59 deletions(-) + +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +index 4d0c9129cd4a..c06d0a965be1 100644 +--- a/fs/bcachefs/bkey.c ++++ b/fs/bcachefs/bkey.c +@@ -411,7 +411,7 @@ static bool bkey_packed_successor(struct bkey_packed *out, + + if ((*p & mask) != mask) { + *p += 1ULL << offset; +- EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); ++ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); + return true; + } + +@@ -1054,9 +1054,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, + } + + __pure __flatten +-int __bch2_bkey_cmp_packed(const struct bkey_packed *l, +- const struct bkey_packed *r, +- const struct btree *b) ++int bch2_bkey_cmp_packed(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) + { + struct bkey unpacked; + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 80ea488d57b0..2d2c640305e2 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -67,13 +67,6 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) + #define bkey_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) + +-#define bkey_packed_typecheck(_k) \ +-({ \ +- BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ +- !type_is(_k, struct bkey_packed *)); \ +- type_is(_k, struct bkey_packed *); \ +-}) +- + enum bkey_lr_packed { + BKEY_PACKED_BOTH, + BKEY_PACKED_RIGHT, +@@ -81,9 +74,6 @@ enum bkey_lr_packed { + BKEY_PACKED_NONE, + }; + +-#define bkey_lr_packed_typecheck(_l, _r) \ +- (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) +- + #define bkey_lr_packed(_l, _r) \ + ((_l)->format + ((_r)->format << 1)) + +@@ -132,9 +122,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, + const struct bpos *); + + __pure +-int __bch2_bkey_cmp_packed(const struct bkey_packed *, +- const struct bkey_packed *, +- const struct btree *); ++int bch2_bkey_cmp_packed(const struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); + + __pure + int __bch2_bkey_cmp_left_packed(const struct btree *, +@@ -160,37 +150,6 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, + return bkey_cmp_left_packed(b, l, &r); + } + +-/* +- * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to +- * skip dispatching on k->format: +- */ +-#define bkey_cmp_packed(_b, _l, _r) \ +-({ \ +- int _cmp; \ +- \ +- switch (bkey_lr_packed_typecheck(_l, _r)) { \ +- case BKEY_PACKED_NONE: \ +- _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ +- ((struct bkey *) (_r))->p); \ +- break; \ +- case BKEY_PACKED_LEFT: \ +- _cmp = bkey_cmp_left_packed((_b), \ +- (struct bkey_packed *) (_l), \ +- &((struct bkey *) (_r))->p); \ +- break; \ +- case 
BKEY_PACKED_RIGHT: \ +- _cmp = -bkey_cmp_left_packed((_b), \ +- (struct bkey_packed *) (_r), \ +- &((struct bkey *) (_l))->p); \ +- break; \ +- case BKEY_PACKED_BOTH: \ +- _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ +- (void *) (_r), (_b)); \ +- break; \ +- } \ +- _cmp; \ +-}) +- + #if 1 + static __always_inline int bkey_cmp(struct bpos l, struct bpos r) + { +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 839e78d1dc35..99e0a4011fae 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -86,7 +86,7 @@ static inline int key_sort_fix_overlapping_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) + { +- return bkey_cmp_packed(b, l, r) ?: ++ return bch2_bkey_cmp_packed(b, l, r) ?: + cmp_int((unsigned long) l, (unsigned long) r); + } + +@@ -98,7 +98,7 @@ static inline bool should_drop_next_key(struct sort_iter *iter) + * and should be dropped. + */ + return iter->used >= 2 && +- !bkey_cmp_packed(iter->b, ++ !bch2_bkey_cmp_packed(iter->b, + iter->data[0].k, + iter->data[1].k); + } +@@ -223,7 +223,7 @@ static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) + { +- return bkey_cmp_packed(b, l, r) ?: ++ return bch2_bkey_cmp_packed(b, l, r) ?: + (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; + } +@@ -245,7 +245,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, + continue; + + while ((next = sort_iter_peek(iter)) && +- !bkey_cmp_packed(iter->b, in, next)) { ++ !bch2_bkey_cmp_packed(iter->b, in, next)) { + BUG_ON(in->needs_whiteout && + next->needs_whiteout); + needs_whiteout |= in->needs_whiteout; +@@ -406,7 +406,7 @@ static inline int sort_extents_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) + { +- return bkey_cmp_packed(b, l, r) ?: ++ return bch2_bkey_cmp_packed(b, l, r) ?: + (int) bkey_deleted(l) - (int) bkey_deleted(r); + } + +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 60cfecc2a9bc..469294cc716c 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -466,7 +466,7 @@ static inline int bkey_iter_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) + { +- return bkey_cmp_packed(b, l, r) ++ return bch2_bkey_cmp_packed(b, l, r) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) + ?: cmp_int(l, r); + } +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index d344d2ea51be..10a00085cdd6 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -42,7 +42,7 @@ static void verify_no_dups(struct btree *b, + BUG_ON(extents + ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 + : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); +- //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); ++ //BUG_ON(bch2_bkey_cmp_packed(&b->format, p, k) >= 0); + } + #endif + } +@@ -102,14 +102,14 @@ static void sort_bkey_ptrs(const struct btree *bt, + break; + + for (b = a; c = 2 * b + 1, (d = c + 1) < n;) +- b = bkey_cmp_packed(bt, ++ b = bch2_bkey_cmp_packed(bt, + ptrs[c], + ptrs[d]) >= 0 ? 
c : d; + if (d == n) + b = c; + + while (b != a && +- bkey_cmp_packed(bt, ++ bch2_bkey_cmp_packed(bt, + ptrs[a], + ptrs[b]) >= 0) + b = (b - 1) / 2; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index a2604b0ce2d8..4ddd1697ffde 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1313,7 +1313,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + * the node the iterator points to: + */ + while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && +- (bkey_cmp_packed(b, k, &insert->k) >= 0)) ++ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) + ; + + for_each_keylist_key(keys, insert) +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 8a9048d55103..adf202f7989c 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -72,7 +72,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + + k = bch2_btree_node_iter_peek_all(node_iter, b); +- if (k && bkey_cmp_packed(b, k, &insert->k)) ++ if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) + k = NULL; + + /* @k is the key being overwritten/deleted, if any: */ +-- +cgit v1.2.3 + + +From 55c95f778b5aefc6ebdb3ce0c036492da7891511 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 7 Nov 2020 12:43:48 -0500 +Subject: bcachefs: Fix build warning when CONFIG_BCACHEFS_DEBUG=n + +this function is only used by debug code, but we'd like to always build +it so we know that it does build. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 075926866e36..58f1a3dd97d3 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2407,9 +2407,10 @@ int bch2_trans_exit(struct btree_trans *trans) + return trans->error ? -EIO : 0; + } + +-static void bch2_btree_iter_node_to_text(struct printbuf *out, +- struct btree_bkey_cached_common *_b, +- enum btree_iter_type type) ++static void __maybe_unused ++bch2_btree_iter_node_to_text(struct printbuf *out, ++ struct btree_bkey_cached_common *_b, ++ enum btree_iter_type type) + { + pr_buf(out, " %px l=%u %s:", + _b, _b->level, bch2_btree_ids[_b->btree_id]); +-- +cgit v1.2.3 + + +From 76c5ad7fb21c2f29f16304ca89242fa0e1e94fb7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 5 Nov 2020 23:39:33 -0500 +Subject: bcachefs: New varints + +Previous varint implementation used by the inode code was not nearly as +fast as it could have been; partly because it was attempting to encode +integers up to 96 bits (for timestamps) but this meant that encoding and +decoding the length required a table lookup. + +Instead, we'll just encode timestamps greater than 64 bits as two +separate varints; this will make decoding/encoding of inodes +significantly faster overall. 
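+
+For readers unfamiliar with varints, a minimal continuation-bit
+(LEB128-style) encoder/decoder is sketched below.  It is illustration only --
+the encoding actually implemented by bch2_varint_encode()/bch2_varint_decode()
+in the new varint.c may differ -- but it shows the basic idea, and why a
+field wider than 64 bits (the 96-bit timestamps) is simplest to store as two
+separate varints:
+
+	static int varint_encode(u8 *out, u64 v)
+	{
+		int n = 0;
+
+		do {
+			u8 byte = v & 0x7f;			/* low 7 bits */
+
+			v >>= 7;
+			out[n++] = byte | (v ? 0x80 : 0);	/* continuation bit */
+		} while (v);
+
+		return n;					/* 1..10 bytes for a u64 */
+	}
+
+	static int varint_decode(const u8 *in, const u8 *end, u64 *v)
+	{
+		unsigned shift = 0;
+		int n = 0;
+
+		*v = 0;
+		while (in + n < end && shift < 64) {
+			u8 byte = in[n++];
+
+			*v |= (u64) (byte & 0x7f) << shift;
+			if (!(byte & 0x80))
+				return n;			/* bytes consumed */
+			shift += 7;
+		}
+		return -1;					/* truncated or overlong */
+	}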
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/Makefile | 1 + + fs/bcachefs/bcachefs_format.h | 17 ++-- + fs/bcachefs/fsck.c | 6 +- + fs/bcachefs/inode.c | 187 ++++++++++++++++++++++++++++++------------ + fs/bcachefs/inode.h | 17 ++-- + fs/bcachefs/io.c | 2 +- + fs/bcachefs/recovery.c | 2 +- + fs/bcachefs/super.c | 1 - + fs/bcachefs/varint.c | 42 ++++++++++ + fs/bcachefs/varint.h | 8 ++ + 10 files changed, 209 insertions(+), 74 deletions(-) + create mode 100644 fs/bcachefs/varint.c + create mode 100644 fs/bcachefs/varint.h + +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index d85ced62c0dd..2fbf978424ed 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -56,4 +56,5 @@ bcachefs-y := \ + tests.o \ + trace.o \ + util.o \ ++ varint.o \ + xattr.o +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 2926c648a17f..94b5418587e3 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -669,10 +669,10 @@ struct bch_inode_generation { + } __attribute__((packed, aligned(8))); + + #define BCH_INODE_FIELDS() \ +- x(bi_atime, 64) \ +- x(bi_ctime, 64) \ +- x(bi_mtime, 64) \ +- x(bi_otime, 64) \ ++ x(bi_atime, 96) \ ++ x(bi_ctime, 96) \ ++ x(bi_mtime, 96) \ ++ x(bi_otime, 96) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ +@@ -739,7 +739,8 @@ enum { + #define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) + + LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); +-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); ++LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); ++LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); + + /* Dirents */ + +@@ -1330,13 +1331,15 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + x(btree_ptr_v2, 11) \ + x(extents_above_btree_updates, 12) \ + x(btree_updates_journalled, 13) \ +- x(reflink_inline_data, 14) ++ x(reflink_inline_data, 14) \ ++ x(new_varint, 15) + + #define BCH_SB_FEATURES_ALL \ + ((1ULL << BCH_FEATURE_new_siphash)| \ + (1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ +- (1ULL << BCH_FEATURE_extents_above_btree_updates)) ++ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ ++ (1ULL << BCH_FEATURE_new_varint))\ + + enum bch_sb_feature { + #define x(f, n) BCH_FEATURE_##f, +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 5a6df3d1973a..e3671b66c046 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -537,7 +537,7 @@ retry: + + bch2_trans_unlock(&trans); + +- bch2_inode_pack(&p, &w.inode); ++ bch2_inode_pack(c, &p, &w.inode); + + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &p.inode.k_i, NULL, NULL, +@@ -808,7 +808,7 @@ create_root: + 0, NULL); + root_inode->bi_inum = BCACHEFS_ROOT_INO; + +- bch2_inode_pack(&packed, root_inode); ++ bch2_inode_pack(c, &packed, root_inode); + + return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + NULL, NULL, +@@ -1326,7 +1326,7 @@ static int check_inode(struct btree_trans *trans, + if (do_update) { + struct bkey_inode_buf p; + +- bch2_inode_pack(&p, &u); ++ bch2_inode_pack(c, &p, &u); + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index d7622049069e..42371de7f72a 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -8,6 +8,7 @@ + #include "extents.h" + #include "inode.h" + #include "str_hash.h" ++#include "varint.h" + + #include + +@@ -89,22 +90,17 @@ static int inode_decode_field(const u8 
*in, const u8 *end, + return bytes; + } + +-void bch2_inode_pack(struct bkey_inode_buf *packed, +- const struct bch_inode_unpacked *inode) ++static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) + { +- u8 *out = packed->inode.v.fields; ++ struct bkey_i_inode *k = &packed->inode; ++ u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + unsigned bytes; + +- bkey_inode_init(&packed->inode.k_i); +- packed->inode.k.p.offset = inode->bi_inum; +- packed->inode.v.bi_hash_seed = inode->bi_hash_seed; +- packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); +- packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); +- +-#define x(_name, _bits) \ ++#define x(_name, _bits) \ + out += inode_encode_field(out, end, 0, inode->_name); \ + nr_fields++; \ + \ +@@ -123,7 +119,69 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + +- SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); ++ SET_INODE_NR_FIELDS(&k->v, nr_fields); ++} ++ ++static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ struct bkey_i_inode *k = &packed->inode; ++ u8 *out = k->v.fields; ++ u8 *end = (void *) &packed[1]; ++ u8 *last_nonzero_field = out; ++ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; ++ unsigned bytes; ++ int ret; ++ ++#define x(_name, _bits) \ ++ nr_fields++; \ ++ \ ++ if (inode->_name) { \ ++ ret = bch2_varint_encode(out, inode->_name); \ ++ out += ret; \ ++ \ ++ if (_bits > 64) \ ++ *out++ = 0; \ ++ \ ++ last_nonzero_field = out; \ ++ last_nonzero_fieldnr = nr_fields; \ ++ } else { \ ++ *out++ = 0; \ ++ \ ++ if (_bits > 64) \ ++ *out++ = 0; \ ++ } ++ ++ BCH_INODE_FIELDS() ++#undef x ++ BUG_ON(out > end); ++ ++ out = last_nonzero_field; ++ nr_fields = last_nonzero_fieldnr; ++ ++ bytes = out - (u8 *) &packed->inode.v; ++ set_bkey_val_bytes(&packed->inode.k, bytes); ++ memset_u64s_tail(&packed->inode.v, 0, bytes); ++ ++ SET_INODE_NR_FIELDS(&k->v, nr_fields); ++} ++ ++void bch2_inode_pack(struct bch_fs *c, ++ struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ bkey_inode_init(&packed->inode.k_i); ++ packed->inode.k.p.offset = inode->bi_inum; ++ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; ++ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); ++ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ ++ if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) { ++ SET_INODE_NEW_VARINT(&packed->inode.v, true); ++ bch2_inode_pack_v2(packed, inode); ++ } else { ++ bch2_inode_pack_v1(packed, inode); ++ } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bch_inode_unpacked unpacked; +@@ -135,26 +193,23 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, + BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); + BUG_ON(unpacked.bi_mode != inode->bi_mode); + +-#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); ++#define x(_name, _bits) if (unpacked._name != inode->_name) \ ++ panic("unpacked %llu should be %llu", \ ++ (u64) unpacked._name, (u64) inode->_name); + BCH_INODE_FIELDS() + #undef x + } + } + +-int bch2_inode_unpack(struct bkey_s_c_inode inode, +- struct bch_inode_unpacked *unpacked) ++static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) + { + const u8 *in = inode.v->fields; +- const u8 *end = (void *) 
inode.v + bkey_val_bytes(inode.k); ++ const u8 *end = bkey_val_end(inode); + u64 field[2]; + unsigned fieldnr = 0, field_bits; + int ret; + +- unpacked->bi_inum = inode.k->p.offset; +- unpacked->bi_hash_seed = inode.v->bi_hash_seed; +- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); +- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); +- + #define x(_name, _bits) \ + if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ + memset(&unpacked->_name, 0, \ +@@ -177,6 +232,62 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, + #undef x + + /* XXX: signal if there were more fields than expected? */ ++ return 0; ++} ++ ++static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) ++{ ++ const u8 *in = inode.v->fields; ++ const u8 *end = bkey_val_end(inode); ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v[2]; ++ ++#define x(_name, _bits) \ ++ if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ ++ ret = bch2_varint_decode(in, end, &v[0]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ \ ++ if (_bits > 64) { \ ++ ret = bch2_varint_decode(in, end, &v[1]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v[1] = 0; \ ++ } \ ++ } else { \ ++ v[0] = v[1] = 0; \ ++ } \ ++ \ ++ unpacked->_name = v[0]; \ ++ if (v[1] || v[0] != unpacked->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? */ ++ return 0; ++} ++ ++int bch2_inode_unpack(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) ++{ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++ if (INODE_NEW_VARINT(inode.v)) { ++ return bch2_inode_unpack_v2(inode, unpacked); ++ } else { ++ return bch2_inode_unpack_v1(inode, unpacked); ++ } + + return 0; + } +@@ -223,7 +334,7 @@ int bch2_inode_write(struct btree_trans *trans, + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + +- bch2_inode_pack(inode_p, inode); ++ bch2_inode_pack(trans->c, inode_p, inode); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return 0; + } +@@ -426,10 +537,7 @@ found_slot: + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); + +- bch2_inode_pack(inode_p, inode_u); +- bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); +- bch2_trans_iter_put(trans, iter); +- return 0; ++ return bch2_inode_write(trans, iter, inode_u); + } + + int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) +@@ -553,32 +661,3 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, + return bch2_trans_do(c, NULL, NULL, 0, + bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); + } +- +-#ifdef CONFIG_BCACHEFS_DEBUG +-void bch2_inode_pack_test(void) +-{ +- struct bch_inode_unpacked *u, test_inodes[] = { +- { +- .bi_atime = U64_MAX, +- .bi_ctime = U64_MAX, +- .bi_mtime = U64_MAX, +- .bi_otime = U64_MAX, +- .bi_size = U64_MAX, +- .bi_sectors = U64_MAX, +- .bi_uid = U32_MAX, +- .bi_gid = U32_MAX, +- .bi_nlink = U32_MAX, +- .bi_generation = U32_MAX, +- .bi_dev = U32_MAX, +- }, +- }; +- +- for (u = test_inodes; +- u < test_inodes + ARRAY_SIZE(test_inodes); +- u++) { +- struct bkey_inode_buf p; +- +- bch2_inode_pack(&p, u); +- } +-} +-#endif +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 5743be2307f3..ef7e885dce0c 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -24,6 +24,14 @@ void bch2_inode_generation_to_text(struct 
printbuf *, struct bch_fs *, + .val_to_text = bch2_inode_generation_to_text, \ + } + ++#if 0 ++typedef struct { ++ u64 lo; ++ u32 hi; ++} __packed __aligned(4) u96; ++#endif ++typedef u64 u96; ++ + struct bch_inode_unpacked { + u64 bi_inum; + __le64 bi_hash_seed; +@@ -43,7 +51,8 @@ struct bkey_inode_buf { + #undef x + } __attribute__((packed, aligned(8))); + +-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); ++void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, ++ const struct bch_inode_unpacked *); + int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); + + struct btree_iter *bch2_inode_peek(struct btree_trans *, +@@ -166,10 +175,4 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, + } + } + +-#ifdef CONFIG_BCACHEFS_DEBUG +-void bch2_inode_pack_test(void); +-#else +-static inline void bch2_inode_pack_test(void) {} +-#endif +- + #endif /* _BCACHEFS_INODE_H */ +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index ba2944b071fe..e12b5b5e0598 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -300,7 +300,7 @@ int bch2_extent_update(struct btree_trans *trans, + inode_u.bi_sectors += delta; + + if (delta || new_i_size) { +- bch2_inode_pack(&inode_p, &inode_u); ++ bch2_inode_pack(trans->c, &inode_p, &inode_u); + bch2_trans_update(trans, inode_iter, + &inode_p.inode.k_i, 0); + } +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 32fed6b81a52..1745cfac6b26 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1320,7 +1320,7 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_inode_init(c, &root_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; +- bch2_inode_pack(&packed_inode, &root_inode); ++ bch2_inode_pack(c, &packed_inode, &root_inode); + + err = "error creating root directory"; + ret = bch2_btree_insert(c, BTREE_ID_INODES, +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index d2ecc82b534c..432bece444c3 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -2021,7 +2021,6 @@ static void bcachefs_exit(void) + static int __init bcachefs_init(void) + { + bch2_bkey_pack_test(); +- bch2_inode_pack_test(); + + if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || + bch2_chardev_init() || +diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c +new file mode 100644 +index 000000000000..a3d252c741c8 +--- /dev/null ++++ b/fs/bcachefs/varint.c +@@ -0,0 +1,42 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++ ++#include "varint.h" ++ ++int bch2_varint_encode(u8 *out, u64 v) ++{ ++ unsigned bits = fls64(v|1); ++ unsigned bytes = DIV_ROUND_UP(bits, 7); ++ ++ if (likely(bytes < 9)) { ++ v <<= bytes; ++ v |= ~(~0 << (bytes - 1)); ++ } else { ++ *out++ = 255; ++ bytes = 9; ++ } ++ ++ put_unaligned_le64(v, out); ++ return bytes; ++} ++ ++int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) ++{ ++ u64 v = get_unaligned_le64(in); ++ unsigned bytes = ffz(v & 255) + 1; ++ ++ if (unlikely(in + bytes > end)) ++ return -1; ++ ++ if (likely(bytes < 9)) { ++ v >>= bytes; ++ v &= ~(~0ULL << (7 * bytes)); ++ } else { ++ v = get_unaligned_le64(++in); ++ } ++ ++ *out = v; ++ return bytes; ++} +diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h +new file mode 100644 +index 000000000000..8daf813576b7 +--- /dev/null ++++ b/fs/bcachefs/varint.h +@@ -0,0 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_VARINT_H ++#define _BCACHEFS_VARINT_H ++ ++int 
bch2_varint_encode(u8 *, u64); ++int bch2_varint_decode(const u8 *, const u8 *, u64 *); ++ ++#endif /* _BCACHEFS_VARINT_H */ +-- +cgit v1.2.3 + + +From 7973b9bb260ebfee20f7afa508d02bcaae6004f3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 7 Nov 2020 13:03:24 -0500 +Subject: bcachefs: use a radix tree for inum bitmap in fsck + +The change to use the cpu nr for the high bits of new inode numbers +means that inode numbers are very space - we see -ENOMEM during fsck +without this. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 39 +++++++++++++-------------------------- + 1 file changed, 13 insertions(+), 26 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index e3671b66c046..0c5035270846 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -866,36 +866,22 @@ create_lostfound: + return ret; + } + +-struct inode_bitmap { +- unsigned long *bits; +- size_t size; +-}; ++typedef GENRADIX(unsigned long) inode_bitmap; + +-static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) ++static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr) + { +- return nr < b->size ? test_bit(nr, b->bits) : false; ++ unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG); ++ return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false; + } + +-static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) ++static inline int inode_bitmap_set(inode_bitmap *b, size_t nr) + { +- if (nr >= b->size) { +- size_t new_size = max_t(size_t, max_t(size_t, +- PAGE_SIZE * 8, +- b->size * 2), +- nr + 1); +- void *n; +- +- new_size = roundup_pow_of_two(new_size); +- n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); +- if (!n) { +- return -ENOMEM; +- } ++ unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL); + +- b->bits = n; +- b->size = new_size; +- } ++ if (!w) ++ return -ENOMEM; + +- __set_bit(nr, b->bits); ++ *w |= 1UL << (nr & (BITS_PER_LONG - 1)); + return 0; + } + +@@ -934,7 +920,7 @@ noinline_for_stack + static int check_directory_structure(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode) + { +- struct inode_bitmap dirs_done = { NULL, 0 }; ++ inode_bitmap dirs_done; + struct pathbuf path = { 0, 0, NULL }; + struct pathbuf_entry *e; + struct btree_trans trans; +@@ -951,6 +937,7 @@ static int check_directory_structure(struct bch_fs *c, + + /* DFS: */ + restart_dfs: ++ genradix_init(&dirs_done); + had_unreachable = false; + + ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); +@@ -1057,7 +1044,7 @@ retry: + + if (had_unreachable) { + bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); +- kfree(dirs_done.bits); ++ genradix_free(&dirs_done); + kfree(path.entries); + memset(&dirs_done, 0, sizeof(dirs_done)); + memset(&path, 0, sizeof(path)); +@@ -1066,7 +1053,7 @@ retry: + err: + fsck_err: + ret = bch2_trans_exit(&trans) ?: ret; +- kfree(dirs_done.bits); ++ genradix_free(&dirs_done); + kfree(path.entries); + return ret; + } +-- +cgit v1.2.3 + + +From d4a4e87d3dd29675b45a4882538920b297bfa9de Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 7 Nov 2020 16:16:52 -0500 +Subject: bcachefs: Inline make_bfloat() into __build_ro_aux_tree() + +This is a fast path - also, lift out the checks/init for min/max key. 
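Returning for a moment to the fsck change above: the structure it switches to, a bitmap whose backing words live in a generic radix tree, is the usual answer when the index space is huge but sparsely populated, since only the radix pages that are actually touched get allocated. A condensed sketch of that pattern, using the same kernel genradix API the patch uses (the sparse_bitmap name is made up):

#include <linux/generic-radix-tree.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/gfp.h>

typedef GENRADIX(unsigned long) sparse_bitmap;

static int sparse_bitmap_set(sparse_bitmap *b, size_t nr)
{
	unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL);

	if (!w)
		return -ENOMEM;

	*w |= 1UL << (nr & (BITS_PER_LONG - 1));
	return 0;
}

static bool sparse_bitmap_test(sparse_bitmap *b, size_t nr)
{
	unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG);

	return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false;
}

/* lifecycle: genradix_init(&bitmap) before use, genradix_free(&bitmap) when done */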
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 94 ++++++++++++++++++++++++++---------------------------- + 1 file changed, 46 insertions(+), 48 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 26716657453f..1c7318c6e46f 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -604,53 +604,23 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, + return (u16) v; + } + +-static void make_bfloat(struct btree *b, struct bset_tree *t, +- unsigned j, +- struct bkey_packed *min_key, +- struct bkey_packed *max_key) ++__always_inline ++static inline void __make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) + { + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); +- struct bkey_packed *l, *r; ++ struct bkey_packed *l = is_power_of_2(j) ++ ? min_key ++ : tree_to_prev_bkey(b, t, j >> ffs(j)); ++ struct bkey_packed *r = is_power_of_2(j + 1) ++ ? max_key ++ : tree_to_bkey(b, t, j >> (ffz(j) + 1)); + unsigned mantissa; + int shift, exponent, high_bit; + +- if (is_power_of_2(j)) { +- l = min_key; +- +- if (!l->u64s) { +- if (!bkey_pack_pos(l, b->data->min_key, b)) { +- struct bkey_i tmp; +- +- bkey_init(&tmp.k); +- tmp.k.p = b->data->min_key; +- bkey_copy(l, &tmp); +- } +- } +- } else { +- l = tree_to_prev_bkey(b, t, j >> ffs(j)); +- +- EBUG_ON(m < l); +- } +- +- if (is_power_of_2(j + 1)) { +- r = max_key; +- +- if (!r->u64s) { +- if (!bkey_pack_pos(r, t->max_key, b)) { +- struct bkey_i tmp; +- +- bkey_init(&tmp.k); +- tmp.k.p = t->max_key; +- bkey_copy(r, &tmp); +- } +- } +- } else { +- r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); +- +- EBUG_ON(m > r); +- } +- + /* + * for failed bfloats, the lookup code falls back to comparing against + * the original key. 
+@@ -707,6 +677,30 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + f->mantissa = mantissa; + } + ++static void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) ++{ ++ struct bkey_i *k; ++ ++ if (is_power_of_2(j) && ++ !min_key->u64s) { ++ k = (void *) min_key; ++ bkey_init(&k->k); ++ k->k.p = b->data->min_key; ++ } ++ ++ if (is_power_of_2(j + 1) && ++ !max_key->u64s) { ++ k = (void *) max_key; ++ bkey_init(&k->k); ++ k->k.p = t->max_key; ++ } ++ ++ __make_bfloat(b, t, j, min_key, max_key); ++} ++ + /* bytes remaining - only valid for last bset: */ + static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) + { +@@ -726,7 +720,7 @@ static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_t + return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); + } + +-static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) ++static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) + { + struct bkey_packed *k; + +@@ -745,15 +739,12 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) + } + } + +-static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) ++static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) + { + struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); +- struct bkey_packed min_key, max_key; ++ struct bkey_i min_key, max_key; + unsigned j, cacheline = 1; + +- /* signal to make_bfloat() that they're uninitialized: */ +- min_key.u64s = max_key.u64s = 0; +- + t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), + bset_ro_tree_capacity(b, t)); + retry: +@@ -789,9 +780,16 @@ retry: + + t->max_key = bkey_unpack_pos(b, prev); + ++ bkey_init(&min_key.k); ++ min_key.k.p = b->data->min_key; ++ bkey_init(&max_key.k); ++ max_key.k.p = t->max_key; ++ + /* Then we build the tree */ + eytzinger1_for_each(j, t->size) +- make_bfloat(b, t, j, &min_key, &max_key); ++ __make_bfloat(b, t, j, ++ bkey_to_packed(&min_key), ++ bkey_to_packed(&max_key)); + } + + static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +-- +cgit v1.2.3 + + +From c3b058a58325683a27aa389f3e1a52cfc692a8b8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 7 Nov 2020 16:55:57 -0500 +Subject: bcachefs: Fix btree iterator leak + +this fixes an occasonial btree transaction iterators overflow. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/inode.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 42371de7f72a..c51b34077dde 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -537,7 +537,9 @@ found_slot: + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); + +- return bch2_inode_write(trans, iter, inode_u); ++ ret = bch2_inode_write(trans, iter, inode_u); ++ bch2_trans_iter_put(trans, iter); ++ return ret; + } + + int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) +-- +cgit v1.2.3 + + +From 8ab5867e3a7fd5eaadbb90b4a33609fa75f171c1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 9 Nov 2020 13:01:52 -0500 +Subject: bcachefs: Add accounting for dirty btree nodes/keys + +This lets us improve journal reclaim, so that it now tries to make sure +no more than 3/4s of the btree node cache and btree key cache are dirty +- ensuring the shrinkers can free memory. 
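Two details of the accounting below are easy to miss in the diff: the counters are only adjusted on real clean-to-dirty and dirty-to-clean transitions (hence the test_and_set_bit()/test_and_clear_bit() pairs), and the "no more than 3/4 dirty" rule is evaluated with integer arithmetic only. A tiny standalone illustration of that threshold check (the function name is made up):

#include <stdbool.h>
#include <stdio.h>

static bool over_dirty_threshold(unsigned long dirty, unsigned long used)
{
	/* dirty / used > 3/4, rewritten to avoid division: */
	return dirty * 4 > used * 3;
}

int main(void)
{
	printf("%d %d\n",
	       over_dirty_threshold(74, 100),	/* 0: under the threshold */
	       over_dirty_threshold(76, 100));	/* 1: reclaim should kick in */
	return 0;
}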
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 4 +++- + fs/bcachefs/btree_io.c | 2 ++ + fs/bcachefs/btree_io.h | 17 ++++++++++++++++ + fs/bcachefs/btree_key_cache.c | 39 +++++++++++++++++++++++++++++-------- + fs/bcachefs/btree_types.h | 6 +++++- + fs/bcachefs/btree_update_interior.c | 8 ++++---- + fs/bcachefs/btree_update_leaf.c | 2 +- + fs/bcachefs/journal_reclaim.c | 6 ++++++ + 8 files changed, 69 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 325a16615a06..912e90b89e8b 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -381,11 +381,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) + + if (btree_node_dirty(b)) + bch2_btree_complete_write(c, b, btree_current_write(b)); +- clear_btree_node_dirty(b); ++ clear_btree_node_dirty(c, b); + + btree_node_data_free(c, b); + } + ++ BUG_ON(atomic_read(&c->btree_cache.dirty)); ++ + while (!list_empty(&bc->freed)) { + b = list_first_entry(&bc->freed, struct btree, list); + list_del(&b->list); +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 10a00085cdd6..edca11d255eb 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1498,6 +1498,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + new ^= (1 << BTREE_NODE_write_idx); + } while (cmpxchg_acquire(&b->flags, old, new) != old); + ++ atomic_dec(&c->btree_cache.dirty); ++ + BUG_ON(btree_node_fake(b)); + BUG_ON((b->will_make_reachable != 0) != !b->written); + +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 626d0f071b70..1a4b11e99cc4 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -14,6 +14,23 @@ struct btree_write; + struct btree; + struct btree_iter; + ++static inline bool btree_node_dirty(struct btree *b) ++{ ++ return test_bit(BTREE_NODE_dirty, &b->flags); ++} ++ ++static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) ++{ ++ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) ++ atomic_inc(&c->btree_cache.dirty); ++} ++ ++static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) ++{ ++ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) ++ atomic_dec(&c->btree_cache.dirty); ++} ++ + struct btree_read_bio { + struct bch_fs *c; + u64 start_time; +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 0ee4f78ce67a..f2ad54d933e6 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -66,6 +66,8 @@ static void bkey_cached_evict(struct btree_key_cache *c, + BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, + bch2_btree_key_cache_params)); + memset(&ck->key, ~0, sizeof(ck->key)); ++ ++ c->nr_keys--; + } + + static void bkey_cached_free(struct btree_key_cache *c, +@@ -135,6 +137,8 @@ btree_key_cache_create(struct btree_key_cache *c, + return NULL; + } + ++ c->nr_keys++; ++ + list_move(&ck->list, &c->clean); + six_unlock_write(&ck->c.lock); + +@@ -355,10 +359,14 @@ err: + + bch2_journal_pin_drop(j, &ck->journal); + bch2_journal_preres_put(j, &ck->res); +- clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + + if (!evict) { + mutex_lock(&c->btree_key_cache.lock); ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ c->btree_key_cache.nr_dirty--; ++ } ++ + list_move_tail(&ck->list, &c->btree_key_cache.clean); + mutex_unlock(&c->btree_key_cache.lock); + } else { +@@ -371,6 +379,11 @@ evict: + six_lock_write(&ck->c.lock, NULL, NULL); + + 
mutex_lock(&c->btree_key_cache.lock); ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ c->btree_key_cache.nr_dirty--; ++ } ++ + bkey_cached_evict(&c->btree_key_cache, ck); + bkey_cached_free(&c->btree_key_cache, ck); + mutex_unlock(&c->btree_key_cache.lock); +@@ -448,9 +461,10 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + mutex_lock(&c->btree_key_cache.lock); +- list_del_init(&ck->list); ++ list_move(&ck->list, &c->btree_key_cache.dirty); + + set_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ c->btree_key_cache.nr_dirty++; + mutex_unlock(&c->btree_key_cache.lock); + } + +@@ -467,20 +481,28 @@ void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, + } + #endif + +-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) + { ++ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + struct bkey_cached *ck, *n; + +- mutex_lock(&c->lock); +- list_for_each_entry_safe(ck, n, &c->clean, list) { ++ mutex_lock(&bc->lock); ++ list_splice(&bc->dirty, &bc->clean); ++ ++ list_for_each_entry_safe(ck, n, &bc->clean, list) { + kfree(ck->k); + kfree(ck); ++ bc->nr_keys--; + } +- list_for_each_entry_safe(ck, n, &c->freed, list) ++ ++ BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal)); ++ BUG_ON(bc->nr_keys); ++ ++ list_for_each_entry_safe(ck, n, &bc->freed, list) + kfree(ck); +- mutex_unlock(&c->lock); ++ mutex_unlock(&bc->lock); + +- rhashtable_destroy(&c->table); ++ rhashtable_destroy(&bc->table); + } + + void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) +@@ -488,6 +510,7 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) + mutex_init(&c->lock); + INIT_LIST_HEAD(&c->freed); + INIT_LIST_HEAD(&c->clean); ++ INIT_LIST_HEAD(&c->dirty); + } + + int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 93721fbc7794..0ec782a69cb9 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -158,6 +158,7 @@ struct btree_cache { + /* Number of elements in live + freeable lists */ + unsigned used; + unsigned reserve; ++ atomic_t dirty; + struct shrinker shrink; + + /* +@@ -294,6 +295,10 @@ struct btree_key_cache { + struct rhashtable table; + struct list_head freed; + struct list_head clean; ++ struct list_head dirty; ++ ++ size_t nr_keys; ++ size_t nr_dirty; + }; + + struct bkey_cached_key { +@@ -411,7 +416,6 @@ enum btree_flags { + + BTREE_FLAG(read_in_flight); + BTREE_FLAG(read_error); +-BTREE_FLAG(dirty); + BTREE_FLAG(need_write); + BTREE_FLAG(noevict); + BTREE_FLAG(write_idx); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 4ddd1697ffde..c0ae76411c2e 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -149,7 +149,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) + + b->ob.nr = 0; + +- clear_btree_node_dirty(b); ++ clear_btree_node_dirty(c, b); + + btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); +@@ -264,7 +264,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + + set_btree_node_accessed(b); +- set_btree_node_dirty(b); ++ set_btree_node_dirty(c, b); + set_btree_node_need_write(b); + + bch2_bset_init_first(b, &b->data->keys); +@@ -827,7 +827,7 @@ 
void bch2_btree_interior_update_will_free_node(struct btree_update *as, + closure_wake_up(&c->btree_interior_update_wait); + } + +- clear_btree_node_dirty(b); ++ clear_btree_node_dirty(c, b); + clear_btree_node_need_write(b); + + /* +@@ -1034,7 +1034,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + bch2_btree_node_iter_advance(node_iter, b); + + bch2_btree_bset_insert_key(iter, b, node_iter, insert); +- set_btree_node_dirty(b); ++ set_btree_node_dirty(as->c, b); + set_btree_node_need_write(b); + } + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index adf202f7989c..f96a3571d9ee 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -191,7 +191,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); + + if (unlikely(!btree_node_dirty(b))) +- set_btree_node_dirty(b); ++ set_btree_node_dirty(c, b); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 18e45296e7de..e6b51e3b5335 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -547,6 +547,12 @@ void bch2_journal_reclaim(struct journal *j) + + if (j->prereserved.reserved * 2 > j->prereserved.remaining) + min_nr = 1; ++ ++ if ((atomic_read(&c->btree_cache.dirty) * 4 > ++ c->btree_cache.used * 3) || ++ (c->btree_key_cache.nr_dirty * 4 > ++ c->btree_key_cache.nr_keys)) ++ min_nr = 1; + } while (journal_flush_pins(j, seq_to_flush, min_nr)); + + if (!bch2_journal_error(j)) +-- +cgit v1.2.3 + + +From 868aab98a9e44d56707a9742e3650c45b4b20a1a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 11 Nov 2020 17:47:39 -0500 +Subject: bcachefs: Fix btree key cache shutdown + +On emergency shutdown, we might still have dirty keys in the btree key +cache that need to be cleaned up properly. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 3 +++ + fs/bcachefs/journal_reclaim.c | 1 + + 2 files changed, 4 insertions(+) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index f2ad54d933e6..6a3d909c6d6e 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -490,6 +490,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) + list_splice(&bc->dirty, &bc->clean); + + list_for_each_entry_safe(ck, n, &bc->clean, list) { ++ bch2_journal_pin_drop(&c->journal, &ck->journal); ++ bch2_journal_preres_put(&c->journal, &ck->res); ++ + kfree(ck->k); + kfree(ck); + bc->nr_keys--; +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index e6b51e3b5335..7a04d06bb342 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -263,6 +263,7 @@ static void bch2_journal_reclaim_fast(struct journal *j) + while (!fifo_empty(&j->pin) && + !atomic_read(&fifo_peek_front(&j->pin).count)) { + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); ++ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed)); + BUG_ON(!fifo_pop(&j->pin, temp)); + popped = true; + } +-- +cgit v1.2.3 + + +From 826342e2b9b4ad7af2d05183444631185efc5585 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 11 Nov 2020 18:59:41 -0500 +Subject: bcachefs: Fix missing memalloc_nofs_restore() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 912e90b89e8b..f8f6079c0199 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -328,9 +328,9 @@ restart: + clear_btree_node_accessed(b); + } + +- memalloc_nofs_restore(flags); + mutex_unlock(&bc->lock); + out: ++ memalloc_nofs_restore(flags); + return (unsigned long) freed * btree_pages(c); + } + +-- +cgit v1.2.3 + + +From 35d8bdc9fc7365d72934e84d7c6899d74723ec32 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 11 Nov 2020 12:42:54 -0500 +Subject: bcachefs: Hack around bch2_varint_decode invalid reads + +bch2_varint_decode can do reads up to 7 bytes past the end ptr, for the +sake of performance - these extra bytes are always masked off. + +This won't be a problem in practice if we make sure to burn 8 bytes in +any buffer that has bkeys in it. 
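To make the over-read concrete: the decode fast path added earlier in this patch performs one 8-byte unaligned load and only afterwards masks and bounds-checks, so a one-byte varint at the very end of a buffer pulls in up to 7 bytes beyond it. A simplified sketch of that fast path follows (the 9-byte encoding is omitted here), which is why buffers holding bkeys are now sized with 8 bytes of slack:

#include <linux/bitops.h>
#include <asm/unaligned.h>

static int varint_decode_fast_sketch(const u8 *in, const u8 *end, u64 *out)
{
	u64 v = get_unaligned_le64(in);		/* may touch in[1..7], past 'end' */
	unsigned bytes = ffz(v & 255) + 1;

	if (in + bytes > end)			/* the length check runs after the load */
		return -1;

	if (bytes > 8)				/* 9-byte form: handled separately */
		return -1;

	*out = (v >> bytes) & ~(~0ULL << (7 * bytes));
	return bytes;
}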
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 3 +++ + fs/bcachefs/btree_update_interior.h | 3 +++ + 2 files changed, 6 insertions(+) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index edca11d255eb..3c1575826b2d 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1532,6 +1532,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + seq = max(seq, le64_to_cpu(i->journal_seq)); + } + ++ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ ++ bytes += 8; ++ + data = btree_bounce_alloc(c, bytes, &used_mempool); + + if (!b->written) { +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 7668225e72c6..41854fc345d2 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -237,6 +237,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, + b->whiteout_u64s; + ssize_t total = c->opts.btree_node_size << 6; + ++ /* Always leave one extra u64 for bch2_varint_decode: */ ++ used++; ++ + return total - used; + } + +-- +cgit v1.2.3 + + +From 4101d85f3ae32f7d9b17e5b4ff8fcf8f8fa29e9b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 11 Nov 2020 12:33:12 -0500 +Subject: bcachefs: Deadlock prevention for ei_pagecache_lock + +In the dio write path, when get_user_pages() invokes the fault handler +we have a recursive locking situation - we have to handle the lock +ordering ourselves or we have a deadlock: this patch addresses that by +checking for locking ordering violations and doing the unlock/relock +dance if necessary. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++-- + fs/bcachefs/fs.c | 5 ++++ + fs/bcachefs/fs.h | 1 + + 3 files changed, 72 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 7bf9ca86f854..222e390acc64 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -44,6 +44,22 @@ static inline bool bio_full(struct bio *bio, unsigned len) + return false; + } + ++static inline struct address_space *faults_disabled_mapping(void) ++{ ++ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); ++} ++ ++static inline void set_fdm_dropped_locks(void) ++{ ++ current->faults_disabled_mapping = ++ (void *) (((unsigned long) current->faults_disabled_mapping)|1); ++} ++ ++static inline bool fdm_dropped_locks(void) ++{ ++ return ((unsigned long) current->faults_disabled_mapping) & 1; ++} ++ + struct quota_res { + u64 sectors; + }; +@@ -501,10 +517,35 @@ static void bch2_set_page_dirty(struct bch_fs *c, + vm_fault_t bch2_page_fault(struct vm_fault *vmf) + { + struct file *file = vmf->vma->vm_file; ++ struct address_space *mapping = file->f_mapping; ++ struct address_space *fdm = faults_disabled_mapping(); + struct bch_inode_info *inode = file_bch_inode(file); + int ret; + ++ if (fdm == mapping) ++ return VM_FAULT_SIGBUS; ++ ++ /* Lock ordering: */ ++ if (fdm > mapping) { ++ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); ++ ++ if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) ++ goto got_lock; ++ ++ bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); ++ ++ /* Signal that lock has been dropped: */ ++ set_fdm_dropped_locks(); ++ return VM_FAULT_SIGBUS; ++ } ++ + 
bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++got_lock: + ret = filemap_fault(vmf); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + +@@ -1748,14 +1789,16 @@ static long bch2_dio_write_loop(struct dio_write *dio) + struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; + struct bio_vec *bv; +- unsigned unaligned; +- bool sync = dio->sync; ++ unsigned unaligned, iter_count; ++ bool sync = dio->sync, dropped_locks; + long ret; + + if (dio->loop) + goto loop; + + while (1) { ++ iter_count = dio->iter.count; ++ + if (kthread) + kthread_use_mm(dio->mm); + BUG_ON(current->faults_disabled_mapping); +@@ -1763,13 +1806,34 @@ static long bch2_dio_write_loop(struct dio_write *dio) + + ret = bio_iov_iter_get_pages(bio, &dio->iter); + ++ dropped_locks = fdm_dropped_locks(); ++ + current->faults_disabled_mapping = NULL; + if (kthread) + kthread_unuse_mm(dio->mm); + ++ /* ++ * If the fault handler returned an error but also signalled ++ * that it dropped & retook ei_pagecache_lock, we just need to ++ * re-shoot down the page cache and retry: ++ */ ++ if (dropped_locks && ret) ++ ret = 0; ++ + if (unlikely(ret < 0)) + goto err; + ++ if (unlikely(dropped_locks)) { ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter_count - 1); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (!bio->bi_iter.bi_size) ++ continue; ++ } ++ + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); + bio->bi_iter.bi_size -= unaligned; + iov_iter_revert(&dio->iter, unaligned); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 57a4b59c77d5..55471c6434b7 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -93,6 +93,11 @@ void bch2_pagecache_add_put(struct pagecache_lock *lock) + __pagecache_lock_put(lock, 1); + } + ++bool bch2_pagecache_add_tryget(struct pagecache_lock *lock) ++{ ++ return __pagecache_lock_tryget(lock, 1); ++} ++ + void bch2_pagecache_add_get(struct pagecache_lock *lock) + { + __pagecache_lock_get(lock, 1); +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index eda903a45325..4ee1ac994420 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -26,6 +26,7 @@ static inline void pagecache_lock_init(struct pagecache_lock *lock) + } + + void bch2_pagecache_add_put(struct pagecache_lock *); ++bool bch2_pagecache_add_tryget(struct pagecache_lock *); + void bch2_pagecache_add_get(struct pagecache_lock *); + void bch2_pagecache_block_put(struct pagecache_lock *); + void bch2_pagecache_block_get(struct pagecache_lock *); +-- +cgit v1.2.3 + + +From 35ebf29ccdf62ae5671c30ba628d523cc47db5da Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 13 Nov 2020 14:39:43 -0500 +Subject: bcachefs: Improve journal entry validate code + +Previously, the journal entry read code was changed so that if we got a +journal entry that failed validation, we'd try to use it, preferring to +use a good version from another device if available. + +But this left a bug where if an earlier validation check (say, checksum) +failed, the later checks (for last_seq) wouldn't run and we'd end up +using a journal entry with a garbage last_seq field. This fixes that so +that the later validation checks run and if necessary change those +fields to something sensible. 
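The shape of the fix is easy to state: rather than returning on the first failed check, the validator records JOURNAL_ENTRY_BAD and keeps going, so the later sanity checks still get to clamp fields such as last_seq that subsequent code trusts. A condensed sketch of that control flow (checksum_bad() is a stand-in, not a real helper):

static int jset_validate_sketch(struct jset *jset)
{
	int ret = 0;

	if (checksum_bad(jset))
		ret = JOURNAL_ENTRY_BAD;	/* note the failure, keep validating */

	if (le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq)) {
		jset->last_seq = jset->seq;	/* never let a garbage last_seq escape */
		ret = JOURNAL_ENTRY_BAD;
	}

	return ret;
}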
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 33 ++++++++++++++++----------------- + 1 file changed, 16 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index bd0e6b371701..8f2b1e81e2af 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -432,46 +432,45 @@ static int jset_validate(struct bch_fs *c, + "%s sector %llu seq %llu: unknown journal entry version %u", + ca->name, sector, le64_to_cpu(jset->seq), + version)) { +- /* XXX: note we might have missing journal entries */ +- return JOURNAL_ENTRY_BAD; ++ /* don't try to continue: */ ++ return EINVAL; + } + ++ if (bytes > (sectors_read << 9) && ++ sectors_read < bucket_sectors_left) ++ return JOURNAL_ENTRY_REREAD; ++ + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca->name, sector, le64_to_cpu(jset->seq), bytes)) { +- /* XXX: note we might have missing journal entries */ +- return JOURNAL_ENTRY_BAD; ++ ret = JOURNAL_ENTRY_BAD; ++ le32_add_cpu(&jset->u64s, ++ -((bytes - (bucket_sectors_left << 9)) / 8)); + } + +- if (bytes > sectors_read << 9) +- return JOURNAL_ENTRY_REREAD; +- + if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca->name, sector, le64_to_cpu(jset->seq), +- JSET_CSUM_TYPE(jset))) +- return JOURNAL_ENTRY_BAD; ++ JSET_CSUM_TYPE(jset))) { ++ ret = JOURNAL_ENTRY_BAD; ++ goto bad_csum_type; ++ } + + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, + "%s sector %llu seq %llu: journal checksum bad", +- ca->name, sector, le64_to_cpu(jset->seq))) { +- /* XXX: retry IO, when we start retrying checksum errors */ +- /* XXX: note we might have missing journal entries */ +- return JOURNAL_ENTRY_BAD; +- } ++ ca->name, sector, le64_to_cpu(jset->seq))) ++ ret = JOURNAL_ENTRY_BAD; + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); +- ++bad_csum_type: + if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + "invalid journal entry: last_seq > seq")) { + jset->last_seq = jset->seq; + return JOURNAL_ENTRY_BAD; + } +- +- return 0; + fsck_err: + return ret; + } +-- +cgit v1.2.3 + + +From f850b7df9677ee26ce6fff66ceccca359756904b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 13 Nov 2020 14:49:57 -0500 +Subject: bcachefs: Fix a 64 bit divide + +this fixes builds on 32 bit. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 82f1cc4ca693..be65f2e78a62 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -323,7 +323,7 @@ static u64 reserve_factor(u64 r) + + static u64 avail_factor(u64 r) + { +- return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); ++ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); + } + + u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) +-- +cgit v1.2.3 + + +From f7fbea9b6d47e9bf5911e1f11e14ad0c8373af0c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 13 Nov 2020 18:30:53 -0500 +Subject: bcachefs: Fix a btree transaction iter overflow + +extent_replay_key dates from before putting iterators was required - +fixed. 
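An aside on the earlier "bcachefs: Fix a 64 bit divide" change: a bare '/' on u64 operands compiles to a libgcc division helper that 32-bit kernel builds do not link against, so the failure only shows up at link time; div_u64() is the portable spelling. For instance (illustrative only):

#include <linux/math64.h>

static u64 avail_factor_sketch(u64 r)
{
	return div_u64(r << 2, 5);	/* not "(r << 2) / 5": breaks 32-bit builds */
}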
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 1745cfac6b26..6750063663b5 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -456,6 +456,7 @@ retry: + __bch2_btree_iter_set_pos(split_iter, split->k.p, false); + bch2_trans_update(&trans, split_iter, split, + BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(&trans, split_iter); + + bch2_btree_iter_set_pos(iter, split->k.p); + +@@ -481,6 +482,8 @@ retry: + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY); + err: ++ bch2_trans_iter_put(&trans, iter); ++ + if (ret == -EINTR) + goto retry; + +-- +cgit v1.2.3 + + +From a00a2a898c69d0104e445716f20cf42e2aebf93a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 13 Nov 2020 16:51:02 -0500 +Subject: bcachefs: Inode delete doesn't need to flush key cache anymore + +Inode create checks to make sure the slot doesn't exist in the btree key +cache. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/inode.c | 11 ++--------- + 1 file changed, 2 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index c51b34077dde..823a1ddec5ac 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -576,16 +576,9 @@ retry: + + bi_generation = 0; + +- ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr)); +- if (ret) { +- if (ret != -EINTR) +- bch_err(c, "error flushing btree key cache: %i", ret); +- goto err; +- } +- + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_slot(iter); ++ BTREE_ITER_CACHED|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_cached(iter); + + ret = bkey_err(k); + if (ret) +-- +cgit v1.2.3 + + +From e42c8f3a61ca73cc1f506a4e9421961275801dce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 13 Nov 2020 15:03:34 -0500 +Subject: bcachefs: Be more careful in bch2_bkey_to_text() + +This is used to print keys that failed bch2_bkey_invalid(), so be more +careful with k->type. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 99b7fce2bfd3..f5779795a4b2 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -181,8 +181,12 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) + void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) + { + if (k) { +- pr_buf(out, "u64s %u type %s ", k->u64s, +- bch2_bkey_types[k->type]); ++ pr_buf(out, "u64s %u type ", k->u64s); ++ ++ if (k->type < KEY_TYPE_MAX) ++ pr_buf(out, "%s ", bch2_bkey_types[k->type]); ++ else ++ pr_buf(out, "%u ", k->type); + + bch2_bpos_to_text(out, k->p); + +@@ -196,10 +200,14 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) + void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) + { +- const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ if (k.k->type < KEY_TYPE_MAX) { ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + +- if (likely(ops->val_to_text)) +- ops->val_to_text(out, c, k); ++ if (likely(ops->val_to_text)) ++ ops->val_to_text(out, c, k); ++ } else { ++ pr_buf(out, "(invalid type %u)", k.k->type); ++ } + } + + void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, +-- +cgit v1.2.3 + + +From d1d8f0f8fc3ddc0f0f47aac73fd58dc54247a384 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 13 Nov 2020 16:19:24 -0500 +Subject: bcachefs: Improve journal error messages + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 21 ++++++++++++++------- + 1 file changed, 14 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 8f2b1e81e2af..b07b5eda67b9 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -173,7 +173,9 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + int ret = 0; + + if (journal_entry_err_on(!k->k.u64s, c, +- "invalid %s in journal: k->u64s 0", type)) { ++ "invalid %s in journal entry %llu offset %zi: k->u64s 0", ++ type, le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data)) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return 0; +@@ -181,16 +183,19 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), c, +- "invalid %s in journal: extends past end of journal entry", +- type)) { ++ "invalid %s in journal entry %llu offset %zi: extends past end of journal entry", ++ type, le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data)) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, +- "invalid %s in journal: bad format %u", +- type, k->k.format)) { ++ "invalid %s in journal entry %llu offset %zi: bad format %u", ++ type, le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data, ++ k->k.format)) { + le16_add_cpu(&entry->u64s, -k->k.u64s); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); +@@ -208,8 +213,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); +- mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", +- type, 
invalid, buf); ++ mustfix_fsck_err(c, "invalid %s in journal entry %llu offset %zi: %s\n%s", ++ type, le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data, ++ invalid, buf); + + le16_add_cpu(&entry->u64s, -k->k.u64s); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); +-- +cgit v1.2.3 + + +From 64349ce80f76a2e8399364f8faee682482124bb8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 14 Nov 2020 13:12:50 -0500 +Subject: bcachefs: Delete dead journalling code + +Usage of the journal has gotten somewhat simpler over time - neat. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 112 -------------------------------------------------- + fs/bcachefs/journal.h | 5 --- + fs/bcachefs/sysfs.c | 2 +- + 3 files changed, 1 insertion(+), 118 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index c2cafd3892a4..0a06caa95bac 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -503,74 +503,6 @@ out: + + /* journal flushing: */ + +-u64 bch2_journal_last_unwritten_seq(struct journal *j) +-{ +- u64 seq; +- +- spin_lock(&j->lock); +- seq = journal_cur_seq(j); +- if (j->reservations.prev_buf_unwritten) +- seq--; +- spin_unlock(&j->lock); +- +- return seq; +-} +- +-/** +- * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't +- * open yet, or wait if we cannot +- * +- * used by the btree interior update machinery, when it needs to write a new +- * btree root - every journal entry contains the roots of all the btrees, so it +- * doesn't need to bother with getting a journal reservation +- */ +-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) +-{ +- struct bch_fs *c = container_of(j, struct bch_fs, journal); +- int ret; +- +- spin_lock(&j->lock); +- +- /* +- * Can't try to open more than one sequence number ahead: +- */ +- BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); +- +- if (journal_cur_seq(j) > seq || +- journal_entry_is_open(j)) { +- spin_unlock(&j->lock); +- return 0; +- } +- +- if (journal_cur_seq(j) < seq && +- !__journal_entry_close(j)) { +- /* haven't finished writing out the previous one: */ +- trace_journal_entry_full(c); +- ret = -EAGAIN; +- } else { +- BUG_ON(journal_cur_seq(j) != seq); +- +- ret = journal_entry_open(j); +- } +- +- if ((ret == -EAGAIN || ret == -ENOSPC) && +- !j->res_get_blocked_start) +- j->res_get_blocked_start = local_clock() ?: 1; +- +- if (ret == -EAGAIN || ret == -ENOSPC) +- closure_wait(&j->async_wait, cl); +- +- spin_unlock(&j->lock); +- +- if (ret == -ENOSPC) { +- trace_journal_full(c); +- bch2_journal_reclaim_work(&j->reclaim_work.work); +- ret = -EAGAIN; +- } +- +- return ret; +-} +- + static int journal_seq_error(struct journal *j, u64 seq) + { + union journal_res_state state = READ_ONCE(j->reservations); +@@ -602,35 +534,6 @@ journal_seq_to_buf(struct journal *j, u64 seq) + return NULL; + } + +-/** +- * bch2_journal_wait_on_seq - wait for a journal entry to be written +- * +- * does _not_ cause @seq to be written immediately - if there is no other +- * activity to cause the relevant journal entry to be filled up or flushed it +- * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is +- * configurable). 
+- */ +-void bch2_journal_wait_on_seq(struct journal *j, u64 seq, +- struct closure *parent) +-{ +- struct journal_buf *buf; +- +- spin_lock(&j->lock); +- +- if ((buf = journal_seq_to_buf(j, seq))) { +- if (!closure_wait(&buf->wait, parent)) +- BUG(); +- +- if (seq == journal_cur_seq(j)) { +- smp_mb(); +- if (bch2_journal_error(j)) +- closure_wake_up(&buf->wait); +- } +- } +- +- spin_unlock(&j->lock); +-} +- + /** + * bch2_journal_flush_seq_async - wait for a journal entry to be written + * +@@ -680,21 +583,6 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) + return ret ?: ret2 < 0 ? ret2 : 0; + } + +-/** +- * bch2_journal_meta_async - force a journal entry to be written +- */ +-void bch2_journal_meta_async(struct journal *j, struct closure *parent) +-{ +- struct journal_res res; +- +- memset(&res, 0, sizeof(res)); +- +- bch2_journal_res_get(j, &res, jset_u64s(0), 0); +- bch2_journal_res_put(j, &res); +- +- bch2_journal_flush_seq_async(j, res.seq, parent); +-} +- + int bch2_journal_meta(struct journal *j) + { + struct journal_res res; +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index f60bc964ee1f..348a78a5c62c 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -464,13 +464,8 @@ void bch2_journal_entry_res_resize(struct journal *, + struct journal_entry_res *, + unsigned); + +-u64 bch2_journal_last_unwritten_seq(struct journal *); +-int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); +- +-void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); + void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); + void bch2_journal_flush_async(struct journal *, struct closure *); +-void bch2_journal_meta_async(struct journal *, struct closure *); + + int bch2_journal_flush_seq(struct journal *, u64); + int bch2_journal_flush(struct journal *); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index d7ad293aff4d..58c00e26ebe8 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -458,7 +458,7 @@ STORE(bch2_fs) + /* Debugging: */ + + if (attr == &sysfs_trigger_journal_flush) +- bch2_journal_meta_async(&c->journal, NULL); ++ bch2_journal_meta(&c->journal); + + if (attr == &sysfs_trigger_btree_coalesce) + bch2_coalesce(c); +-- +cgit v1.2.3 + + +From 554b75da36f2376752258449f50307af64bedace Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 14 Nov 2020 16:04:30 -0500 +Subject: bcachefs: Assorted journal refactoring + +Improved the way we track various state by adding j->err_seq, which +records the first journal sequence number that encountered an error +being written, and j->last_empty_seq, which records the most recent +journal entry that was completely empty. + +Also, use the low bits of the journal sequence number to index the +corresponding journal_buf. 
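The "low bits index the buffer" point is the heart of the refactoring: with at most two journal_bufs in flight, the buffer for any still-unwritten sequence number can be derived from the sequence number itself instead of being tracked separately. A trimmed-down sketch of the mapping introduced below:

static struct journal_buf *seq_to_buf_sketch(struct journal *j, u64 seq)
{
	/* everything below this sequence number has already hit disk: */
	u64 first_unwritten = journal_cur_seq(j) - j->reservations.prev_buf_unwritten;

	if (seq < first_unwritten)
		return NULL;			/* nothing left in memory to wait on */

	return j->buf + (seq & 1);		/* low bit picks one of the two bufs */
}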
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 105 +++++++++++++++++++++----------------------- + fs/bcachefs/journal.h | 2 +- + fs/bcachefs/journal_io.c | 25 ++++++----- + fs/bcachefs/journal_types.h | 3 +- + 4 files changed, 67 insertions(+), 68 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 0a06caa95bac..9a83b9568e33 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -18,7 +18,19 @@ + + #include + +-static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64); ++static u64 last_unwritten_seq(struct journal *j) ++{ ++ union journal_res_state s = READ_ONCE(j->reservations); ++ ++ lockdep_assert_held(&j->lock); ++ ++ return journal_cur_seq(j) - s.prev_buf_unwritten; ++} ++ ++static inline bool journal_seq_unwritten(struct journal *j, u64 seq) ++{ ++ return seq >= last_unwritten_seq(j); ++} + + static bool __journal_entry_is_open(union journal_res_state state) + { +@@ -30,6 +42,22 @@ static bool journal_entry_is_open(struct journal *j) + return __journal_entry_is_open(j->reservations); + } + ++static inline struct journal_buf * ++journal_seq_to_buf(struct journal *j, u64 seq) ++{ ++ struct journal_buf *buf = NULL; ++ ++ EBUG_ON(seq > journal_cur_seq(j)); ++ EBUG_ON(seq == journal_cur_seq(j) && ++ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); ++ ++ if (journal_seq_unwritten(j, seq)) { ++ buf = j->buf + (seq & 1); ++ EBUG_ON(le64_to_cpu(buf->data->seq) != seq); ++ } ++ return buf; ++} ++ + static void journal_pin_new_entry(struct journal *j, int count) + { + struct journal_entry_pin_list *p; +@@ -51,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j) + { + struct journal_buf *buf = journal_cur_buf(j); + ++ bkey_extent_init(&buf->key); ++ + memset(buf->has_inode, 0, sizeof(buf->has_inode)); + + memset(buf->data, 0, sizeof(*buf->data)); +@@ -72,6 +102,7 @@ void bch2_journal_halt(struct journal *j) + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + ++ j->err_seq = journal_cur_seq(j); + journal_wake(j); + closure_wake_up(&journal_cur_buf(j)->wait); + } +@@ -139,8 +170,6 @@ static bool __journal_entry_close(struct journal *j) + BUG_ON(sectors > buf->sectors); + buf->sectors = sectors; + +- bkey_extent_init(&buf->key); +- + /* + * We have to set last_seq here, _before_ opening a new journal entry: + * +@@ -162,11 +191,6 @@ static bool __journal_entry_close(struct journal *j) + */ + buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); + +- if (journal_entry_empty(buf->data)) +- clear_bit(JOURNAL_NOT_EMPTY, &j->flags); +- else +- set_bit(JOURNAL_NOT_EMPTY, &j->flags); +- + journal_pin_new_entry(j, 1); + + bch2_journal_buf_init(j); +@@ -503,49 +527,28 @@ out: + + /* journal flushing: */ + +-static int journal_seq_error(struct journal *j, u64 seq) +-{ +- union journal_res_state state = READ_ONCE(j->reservations); +- +- if (seq == journal_cur_seq(j)) +- return bch2_journal_error(j); +- +- if (seq + 1 == journal_cur_seq(j) && +- !state.prev_buf_unwritten && +- seq > j->seq_ondisk) +- return -EIO; +- +- return 0; +-} +- +-static inline struct journal_buf * +-journal_seq_to_buf(struct journal *j, u64 seq) +-{ +- /* seq should be for a journal entry that has been opened: */ +- BUG_ON(seq > journal_cur_seq(j)); +- BUG_ON(seq == journal_cur_seq(j) && +- j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); +- +- if (seq == journal_cur_seq(j)) +- return journal_cur_buf(j); +- if (seq + 1 == journal_cur_seq(j) && +- 
j->reservations.prev_buf_unwritten) +- return journal_prev_buf(j); +- return NULL; +-} +- + /** + * bch2_journal_flush_seq_async - wait for a journal entry to be written + * + * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * necessary + */ +-void bch2_journal_flush_seq_async(struct journal *j, u64 seq, ++int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct closure *parent) + { + struct journal_buf *buf; ++ int ret = 0; + + spin_lock(&j->lock); ++ if (seq <= j->err_seq) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (seq <= j->seq_ondisk) { ++ ret = 1; ++ goto out; ++ } + + if (parent && + (buf = journal_seq_to_buf(j, seq))) +@@ -554,20 +557,8 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, + + if (seq == journal_cur_seq(j)) + __journal_entry_close(j); ++out: + spin_unlock(&j->lock); +-} +- +-static int journal_seq_flushed(struct journal *j, u64 seq) +-{ +- int ret; +- +- spin_lock(&j->lock); +- ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); +- +- if (seq == journal_cur_seq(j)) +- __journal_entry_close(j); +- spin_unlock(&j->lock); +- + return ret; + } + +@@ -576,7 +567,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) + u64 start_time = local_clock(); + int ret, ret2; + +- ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); ++ ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + + bch2_time_stats_update(j->flush_seq_time, start_time); + +@@ -877,7 +868,8 @@ void bch2_fs_journal_stop(struct journal *j) + journal_quiesce(j); + + BUG_ON(!bch2_journal_error(j) && +- test_bit(JOURNAL_NOT_EMPTY, &j->flags)); ++ (journal_entry_is_open(j) || ++ j->last_empty_seq + 1 != journal_cur_seq(j))); + + cancel_delayed_work_sync(&j->write_work); + cancel_delayed_work_sync(&j->reclaim_work); +@@ -935,6 +927,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + set_bit(JOURNAL_STARTED, &j->flags); + + journal_pin_new_entry(j, 1); ++ ++ j->reservations.idx = journal_cur_seq(j); ++ + bch2_journal_buf_init(j); + + c->last_bucket_seq_cleanup = journal_cur_seq(j); +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 348a78a5c62c..25c6876765ac 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -464,7 +464,7 @@ void bch2_journal_entry_res_resize(struct journal *, + struct journal_entry_res *, + unsigned); + +-void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); ++int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); + void bch2_journal_flush_async(struct journal *, struct closure *); + + int bch2_journal_flush_seq(struct journal *, u64); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index b07b5eda67b9..df28ca118dec 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -945,24 +945,29 @@ static void journal_write_done(struct closure *cl) + struct bch_replicas_padded replicas; + u64 seq = le64_to_cpu(w->data->seq); + u64 last_seq = le64_to_cpu(w->data->last_seq); ++ int err = 0; + + bch2_time_stats_update(j->write_time, j->write_start_time); + + if (!devs.nr) { + bch_err(c, "unable to write journal to sufficient devices"); +- goto err; ++ err = -EIO; ++ } else { ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); ++ if (bch2_mark_replicas(c, &replicas.e)) ++ err = -EIO; + } + +- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); +- +- if (bch2_mark_replicas(c, &replicas.e)) +- goto err; ++ if (err) ++ 
bch2_fatal_error(c); + + spin_lock(&j->lock); + if (seq >= j->pin.front) + journal_seq_pin(j, seq)->devs = devs; + + j->seq_ondisk = seq; ++ if (err && (!j->err_seq || seq < j->err_seq)) ++ j->err_seq = seq; + j->last_seq_ondisk = last_seq; + bch2_journal_space_available(j); + +@@ -974,7 +979,7 @@ static void journal_write_done(struct closure *cl) + * bch2_fs_journal_stop(): + */ + mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); +-out: ++ + /* also must come before signalling write completion: */ + closure_debug_destroy(cl); + +@@ -988,11 +993,6 @@ out: + if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) + mod_delayed_work(system_freezable_wq, &j->write_work, 0); + spin_unlock(&j->lock); +- return; +-err: +- bch2_fatal_error(c); +- spin_lock(&j->lock); +- goto out; + } + + static void journal_write_endio(struct bio *bio) +@@ -1073,6 +1073,9 @@ void bch2_journal_write(struct closure *cl) + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + ++ if (journal_entry_empty(jset)) ++ j->last_empty_seq = le64_to_cpu(jset->seq); ++ + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) + validate_before_checksum = true; + +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 154b51b891d3..9757e3d55991 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -127,7 +127,6 @@ enum { + JOURNAL_STARTED, + JOURNAL_RECLAIM_STARTED, + JOURNAL_NEED_WRITE, +- JOURNAL_NOT_EMPTY, + JOURNAL_MAY_GET_UNRESERVED, + }; + +@@ -181,6 +180,8 @@ struct journal { + /* seq, last_seq from the most recent journal entry successfully written */ + u64 seq_ondisk; + u64 last_seq_ondisk; ++ u64 err_seq; ++ u64 last_empty_seq; + + /* + * FIFO of journal entries whose btree updates have not yet been +-- +cgit v1.2.3 + + +From bd105b28e221c3b66dfe9314b25e918e82153066 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 15 Nov 2020 16:31:58 -0500 +Subject: bcachefs: Check for errors from register_shrinker() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index f8f6079c0199..5bceff48078e 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -447,7 +447,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) + bc->shrink.scan_objects = bch2_btree_cache_scan; + bc->shrink.seeks = 4; + bc->shrink.batch = btree_pages(c) * 2; +- register_shrinker(&bc->shrink); ++ ret = register_shrinker(&bc->shrink); + out: + pr_verbose_init(c->opts, "ret %i", ret); + return ret; +-- +cgit v1.2.3 + + +From 56d53c9dd2042ac9ba2d464499473ff66b501763 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 15 Nov 2020 16:30:22 -0500 +Subject: bcachefs: Take a SRCU lock in btree transactions + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/Kconfig | 1 + + fs/bcachefs/bcachefs.h | 3 +++ + fs/bcachefs/btree_iter.c | 8 +++++++- + fs/bcachefs/btree_types.h | 1 + + 4 files changed, 12 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +index 5594af719b2a..57c5d58c2d87 100644 +--- a/fs/bcachefs/Kconfig ++++ b/fs/bcachefs/Kconfig +@@ -20,6 +20,7 @@ config BCACHEFS_FS + select SIXLOCKS + select RAID6_PQ + select XOR_BLOCKS ++ select SRCU + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. 
+diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index cf05ffa94af9..bd5b9207ee64 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -194,6 +194,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -643,6 +644,8 @@ struct bch_fs { + mempool_t btree_iters_pool; + struct btree_iter_buf __percpu *btree_iters_bufs; + ++ struct srcu_struct btree_trans_barrier; ++ + struct btree_key_cache btree_key_cache; + + struct workqueue_struct *wq; +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 58f1a3dd97d3..d3fcc049fa10 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2372,6 +2372,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + if (expected_mem_bytes) + bch2_trans_preload_mem(trans, expected_mem_bytes); + ++ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); ++ + #ifdef CONFIG_BCACHEFS_DEBUG + trans->pid = current->pid; + mutex_lock(&c->btree_trans_lock); +@@ -2392,6 +2394,8 @@ int bch2_trans_exit(struct btree_trans *trans) + mutex_unlock(&trans->c->btree_trans_lock); + #endif + ++ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); ++ + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + + kfree(trans->fs_usage_deltas); +@@ -2474,6 +2478,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + void bch2_fs_btree_iter_exit(struct bch_fs *c) + { + mempool_exit(&c->btree_iters_pool); ++ cleanup_srcu_struct(&c->btree_trans_barrier); + } + + int bch2_fs_btree_iter_init(struct bch_fs *c) +@@ -2483,7 +2488,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) + INIT_LIST_HEAD(&c->btree_trans_list); + mutex_init(&c->btree_trans_lock); + +- return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, ++ return init_srcu_struct(&c->btree_trans_barrier) ?: ++ mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * nr + + sizeof(struct btree_insert_entry) * nr + + sizeof(struct btree_insert_entry) * nr); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 0ec782a69cb9..8b13d843e0c3 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -350,6 +350,7 @@ struct btree_trans { + pid_t pid; + #endif + unsigned long ip; ++ int srcu_idx; + + u64 iters_linked; + u64 iters_live; +-- +cgit v1.2.3 + + +From f073b291fecbf1fe6b7410e7279fec3630701dc3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 12 Nov 2020 17:19:47 -0500 +Subject: bcachefs: Add a shrinker for the btree key cache + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 88 +++++++++++++++++++++++++++++++++++++++++-- + fs/bcachefs/btree_types.h | 2 + + 2 files changed, 86 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 6a3d909c6d6e..d605ff181d2e 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -9,6 +9,7 @@ + #include "journal.h" + #include "journal_reclaim.h" + ++#include + #include + + static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, +@@ -70,10 +71,15 @@ static void bkey_cached_evict(struct btree_key_cache *c, + c->nr_keys--; + } + +-static void bkey_cached_free(struct btree_key_cache *c, ++static void bkey_cached_free(struct btree_key_cache *bc, + struct bkey_cached *ck) + { +- list_move(&ck->list, &c->freed); ++ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); ++ ++ ck->btree_trans_barrier_seq = ++ 
start_poll_synchronize_srcu(&c->btree_trans_barrier); ++ ++ list_move(&ck->list, &bc->freed); + + kfree(ck->k); + ck->k = NULL; +@@ -404,19 +410,23 @@ static void btree_key_cache_journal_flush(struct journal *j, + struct bkey_cached_key key; + struct btree_trans trans; + ++ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); ++ + six_lock_read(&ck->c.lock, NULL, NULL); + key = ck->key; + + if (ck->journal.seq != seq || + !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + six_unlock_read(&ck->c.lock); +- return; ++ goto unlock; + } + six_unlock_read(&ck->c.lock); + + bch2_trans_init(&trans, c, 0, 0); + btree_key_cache_flush_pos(&trans, key, seq, false); + bch2_trans_exit(&trans); ++unlock: ++ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + } + + /* +@@ -481,11 +491,77 @@ void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, + } + #endif + ++static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_key_cache.shrink); ++ struct btree_key_cache *bc = &c->btree_key_cache; ++ struct bkey_cached *ck, *t; ++ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; ++ unsigned flags; ++ ++ /* Return -1 if we can't do anything right now */ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ return -1; ++ ++ flags = memalloc_nofs_save(); ++ ++ list_for_each_entry_safe(ck, t, &bc->freed, list) { ++ scanned++; ++ ++ if (poll_state_synchronize_srcu(&c->btree_trans_barrier, ++ ck->btree_trans_barrier_seq)) { ++ list_del(&ck->list); ++ kfree(ck); ++ freed++; ++ } ++ ++ if (scanned >= nr) ++ goto out; ++ } ++ ++ list_for_each_entry_safe(ck, t, &bc->clean, list) { ++ scanned++; ++ ++ if (bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(bc, ck); ++ bkey_cached_free(bc, ck); ++ } ++ ++ if (scanned >= nr) { ++ if (&t->list != &bc->clean) ++ list_move_tail(&bc->clean, &t->list); ++ goto out; ++ } ++ } ++out: ++ memalloc_nofs_restore(flags); ++ mutex_unlock(&bc->lock); ++ ++ return freed; ++} ++ ++static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_key_cache.shrink); ++ struct btree_key_cache *bc = &c->btree_key_cache; ++ ++ return bc->nr_keys; ++} ++ + void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) + { + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + struct bkey_cached *ck, *n; + ++ if (bc->shrink.list.next) ++ unregister_shrinker(&bc->shrink); ++ + mutex_lock(&bc->lock); + list_splice(&bc->dirty, &bc->clean); + +@@ -518,7 +594,11 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) + + int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) + { +- return rhashtable_init(&c->table, &bch2_btree_key_cache_params); ++ c->shrink.count_objects = bch2_btree_key_cache_count; ++ c->shrink.scan_objects = bch2_btree_key_cache_scan; ++ ++ return register_shrinker(&c->shrink) ?: ++ rhashtable_init(&c->table, &bch2_btree_key_cache_params); + } + + void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 8b13d843e0c3..6013c9164f69 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -296,6 +296,7 @@ struct btree_key_cache { + struct list_head freed; + struct list_head clean; + struct list_head dirty; ++ struct shrinker shrink; + + size_t 
nr_keys; + size_t nr_dirty; +@@ -314,6 +315,7 @@ struct bkey_cached { + unsigned long flags; + u8 u64s; + bool valid; ++ u32 btree_trans_barrier_seq; + struct bkey_cached_key key; + + struct rhash_head hash; +-- +cgit v1.2.3 + + +From bace7503ff9f93ddd36b19b93e06f6cd1b8255c7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Nov 2020 12:22:30 -0500 +Subject: bcachefs: Fix journal entry repair code + +When we detect bad keys in the journal that have to be dropped, the flow +control was wrong - we ended up not checking the next key in that entry. +Oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 48 ++++++++++++++++++++++++++++++++---------------- + 1 file changed, 32 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index df28ca118dec..7c157bc50268 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -161,6 +161,8 @@ static void journal_entry_null_range(void *start, void *end) + #define journal_entry_err_on(cond, c, msg, ...) \ + ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) + ++#define FSCK_DELETED_KEY 5 ++ + static int journal_validate_key(struct bch_fs *c, struct jset *jset, + struct jset_entry *entry, + unsigned level, enum btree_id btree_id, +@@ -173,33 +175,42 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + int ret = 0; + + if (journal_entry_err_on(!k->k.u64s, c, +- "invalid %s in journal entry %llu offset %zi: k->u64s 0", ++ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0", + type, le64_to_cpu(jset->seq), +- (u64 *) entry - jset->_data)) { ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s), ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s))) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); +- return 0; ++ return FSCK_DELETED_KEY; + } + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), c, +- "invalid %s in journal entry %llu offset %zi: extends past end of journal entry", ++ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry", + type, le64_to_cpu(jset->seq), +- (u64 *) entry - jset->_data)) { ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s), ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s))) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); +- return 0; ++ return FSCK_DELETED_KEY; + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, +- "invalid %s in journal entry %llu offset %zi: bad format %u", ++ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u", + type, le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s), ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s), + k->k.format)) { +- le16_add_cpu(&entry->u64s, -k->k.u64s); ++ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); +- return 0; ++ return FSCK_DELETED_KEY; + } + + if (!write) +@@ -213,15 +224,18 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); +- mustfix_fsck_err(c, "invalid %s in journal entry %llu offset %zi: %s\n%s", ++ mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s", + type, le64_to_cpu(jset->seq), + (u64 
*) entry - jset->_data, ++ le32_to_cpu(jset->u64s), ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s), + invalid, buf); + +- le16_add_cpu(&entry->u64s, -k->k.u64s); ++ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); +- return 0; ++ return FSCK_DELETED_KEY; + } + + if (write) +@@ -237,15 +251,17 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, + struct jset_entry *entry, + int write) + { +- struct bkey_i *k; ++ struct bkey_i *k = entry->start; + +- vstruct_for_each(entry, k) { ++ while (k != vstruct_last(entry)) { + int ret = journal_validate_key(c, jset, entry, + entry->level, + entry->btree_id, + k, "key", write); +- if (ret) +- return ret; ++ if (ret == FSCK_DELETED_KEY) ++ continue; ++ ++ k = bkey_next(k); + } + + return 0; +-- +cgit v1.2.3 + + +From 01c8b52dda3879237ead7a0c6b2863f687bc4fb9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Nov 2020 13:06:28 -0500 +Subject: bcachefs: Convert tracepoints to use %ps, not %pf + +Symbol decoding was changed from %pf to %ps + +Signed-off-by: Kent Overstreet +--- + include/trace/events/bcachefs.h | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 235e9cfa6a64..7a2b8844b998 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -513,7 +513,7 @@ TRACE_EVENT(transaction_restart_ip, + __entry->ip = ip; + ), + +- TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) ++ TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip) + ); + + DECLARE_EVENT_CLASS(transaction_restart, +@@ -528,7 +528,7 @@ DECLARE_EVENT_CLASS(transaction_restart, + __entry->ip = ip; + ), + +- TP_printk("%pf", (void *) __entry->ip) ++ TP_printk("%ps", (void *) __entry->ip) + ); + + DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, +@@ -568,7 +568,7 @@ TRACE_EVENT(trans_restart_would_deadlock, + __entry->want_iter_type = want_iter_type; + ), + +- TP_printk("%pF %pF because %u have %u:%u want %u:%u", ++ TP_printk("%ps %pS because %u have %u:%u want %u:%u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->reason, +@@ -592,7 +592,7 @@ TRACE_EVENT(trans_restart_iters_realloced, + __entry->nr = nr; + ), + +- TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) ++ TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr) + ); + + TRACE_EVENT(trans_restart_mem_realloced, +@@ -609,7 +609,7 @@ TRACE_EVENT(trans_restart_mem_realloced, + __entry->bytes = bytes; + ), + +- TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) ++ TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes) + ); + + DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, +-- +cgit v1.2.3 + + +From 76a7087d996f29abff764b5d133b351fe80b9553 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 15 Nov 2020 20:52:55 -0500 +Subject: bcachefs: Set preallocated transaction mem to avoid restarts + +this will reduce transaction restarts, from observation of tracepoints. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 6 ++++-- + fs/bcachefs/btree_update_interior.c | 19 +++++++++++-------- + fs/bcachefs/fs.c | 3 ++- + 3 files changed, 17 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index d3fcc049fa10..d560d1b99b64 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2369,8 +2369,10 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + */ + bch2_trans_alloc_iters(trans, c); + +- if (expected_mem_bytes) +- bch2_trans_preload_mem(trans, expected_mem_bytes); ++ if (expected_mem_bytes) { ++ trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); ++ trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); ++ } + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index c0ae76411c2e..c19205e06f36 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -523,6 +523,7 @@ static void btree_update_nodes_written(struct btree_update *as) + { + struct bch_fs *c = as->c; + struct btree *b = as->b; ++ struct btree_trans trans; + u64 journal_seq = 0; + unsigned i; + int ret; +@@ -540,14 +541,16 @@ static void btree_update_nodes_written(struct btree_update *as) + * journal reclaim does btree updates when flushing bkey_cached entries, + * which may require allocations as well. + */ +- ret = bch2_trans_do(c, &as->disk_res, &journal_seq, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE| +- BTREE_INSERT_USE_ALLOC_RESERVE| +- BTREE_INSERT_NOCHECK_RW| +- BTREE_INSERT_JOURNAL_RECLAIM| +- BTREE_INSERT_JOURNAL_RESERVED, +- btree_update_nodes_written_trans(&trans, as)); ++ bch2_trans_init(&trans, c, 0, 512); ++ ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_JOURNAL_RECLAIM| ++ BTREE_INSERT_JOURNAL_RESERVED, ++ btree_update_nodes_written_trans(&trans, as)); ++ bch2_trans_exit(&trans); + BUG_ON(ret && !bch2_journal_error(&c->journal)); + + if (b) { +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 55471c6434b7..e612bcb88564 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -278,7 +278,8 @@ __bch2_create(struct user_namespace *mnt_userns, + if (!tmpfile) + mutex_lock(&dir->ei_update_lock); + +- bch2_trans_init(&trans, c, 8, 1024); ++ bch2_trans_init(&trans, c, 8, ++ 2048 + (!tmpfile ? 
dentry->d_name.len : 0)); + retry: + bch2_trans_begin(&trans); + +-- +cgit v1.2.3 + + +From bdda274a56b39ee6a1a2700f4aa0dd26f1611c0c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Nov 2020 18:20:50 -0500 +Subject: bcachefs: Dont' use percpu btree_iter buf in userspace + +bcachefs-tools doesn't have a real percpu (per thread) implementation +yet + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 14 +++++++++++--- + 1 file changed, 11 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index d560d1b99b64..96cc5394295e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2342,12 +2342,15 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) + unsigned new_size = BTREE_ITER_MAX; + size_t iters_bytes = sizeof(struct btree_iter) * new_size; + size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size; +- void *p; ++ void *p = NULL; + + BUG_ON(trans->used_mempool); + +- p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?: +- mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); ++#ifdef __KERNEL__ ++ p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL); ++#endif ++ if (!p) ++ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + + trans->iters = p; p += iters_bytes; + trans->updates = p; p += updates_bytes; +@@ -2403,7 +2406,12 @@ int bch2_trans_exit(struct btree_trans *trans) + kfree(trans->fs_usage_deltas); + kfree(trans->mem); + ++#ifdef __KERNEL__ ++ /* ++ * Userspace doesn't have a real percpu implementation: ++ */ + trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); ++#endif + if (trans->iters) + mempool_free(trans->iters, &trans->c->btree_iters_pool); + +-- +cgit v1.2.3 + + +From 35740537fbad951f154cd841319951da84ad5652 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Nov 2020 18:21:55 -0500 +Subject: bcachefs: Dump journal state when the journal deadlocks + +Currently tracking down one of these bugs. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 9a83b9568e33..61c32d73d093 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -415,8 +415,17 @@ unlock: + goto retry; + + if (ret == -ENOSPC) { +- WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), +- "JOURNAL_RES_GET_RESERVED set but journal full"); ++ if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), ++ "JOURNAL_RES_GET_RESERVED set but journal full")) { ++ char *buf; ++ ++ buf = kmalloc(4096, GFP_NOFS); ++ if (buf) { ++ bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); ++ pr_err("\n%s", buf); ++ kfree(buf); ++ } ++ } + + /* + * Journal is full - can't rely on reclaim from work item due to +-- +cgit v1.2.3 + + +From 772efaca85b0e767e64ce6be8b6f6b96a134eb44 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 16 Nov 2020 14:16:42 -0500 +Subject: bcachefs: Add more debug checks + +tracking down a bug where we see a btree node pointer in the wrong node + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 4 +++- + fs/bcachefs/btree_update_interior.c | 17 ++++++++++++++++- + 2 files changed, 19 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 3c1575826b2d..2406745fb365 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1442,8 +1442,10 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + + ret = validate_bset(c, b, i, sectors, WRITE, false) ?: + validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); +- if (ret) ++ if (ret) { + bch2_inconsistent_error(c); ++ dump_stack(); ++ } + + return ret; + } +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index c19205e06f36..6445ca614757 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -11,6 +11,7 @@ + #include "btree_iter.h" + #include "btree_locking.h" + #include "buckets.h" ++#include "error.h" + #include "extents.h" + #include "journal.h" + #include "journal_reclaim.h" +@@ -1021,7 +1022,19 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + struct bkey_i *insert, + struct btree_node_iter *node_iter) + { ++ struct bch_fs *c = as->c; + struct bkey_packed *k; ++ const char *invalid; ++ ++ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); ++ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); ++ dump_stack(); ++ } + + BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > + ARRAY_SIZE(as->journal_entries)); +@@ -1037,7 +1050,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + bch2_btree_node_iter_advance(node_iter, b); + + bch2_btree_bset_insert_key(iter, b, node_iter, insert); +- set_btree_node_dirty(as->c, b); ++ set_btree_node_dirty(c, b); + set_btree_node_need_write(b); + } + +@@ -1366,6 +1379,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + goto split; + } + ++ btree_node_interior_verify(c, b); ++ + bch2_btree_insert_keys_interior(as, b, iter, keys); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; +-- +cgit v1.2.3 + + +From 6f9499f01c4a12d3ec457396e8e24e695499ae7e Mon Sep 17 00:00:00 2001 +From: 
Kent Overstreet +Date: Mon, 16 Nov 2020 14:23:06 -0500 +Subject: bcachefs: Add an ioctl for resizing journal on a device + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_ioctl.h | 14 ++++++++++++++ + fs/bcachefs/chardev.c | 23 +++++++++++++++++++++++ + fs/bcachefs/journal.c | 10 +++++++++- + 3 files changed, 46 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +index d71157a3e073..0e626b098d91 100644 +--- a/fs/bcachefs/bcachefs_ioctl.h ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -73,6 +73,7 @@ struct bch_ioctl_incremental { + #define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) + #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) + #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) ++#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) + + /* ioctl below act on a particular file, not the filesystem as a whole: */ + +@@ -329,4 +330,17 @@ struct bch_ioctl_disk_resize { + __u64 nbuckets; + }; + ++/* ++ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device ++ * ++ * @dev - member to resize ++ * @nbuckets - new number of buckets ++ */ ++struct bch_ioctl_disk_resize_journal { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 nbuckets; ++}; ++ + #endif /* _BCACHEFS_IOCTL_H */ +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index 0377f9018d27..4663784d2f28 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -5,6 +5,7 @@ + #include "bcachefs_ioctl.h" + #include "buckets.h" + #include "chardev.h" ++#include "journal.h" + #include "move.h" + #include "replicas.h" + #include "super.h" +@@ -563,6 +564,26 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, + return ret; + } + ++static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, ++ struct bch_ioctl_disk_resize_journal arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ + #define BCH_IOCTL(_name, _argtype) \ + do { \ + _argtype i; \ +@@ -619,6 +640,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) + BCH_IOCTL(data, struct bch_ioctl_data); + case BCH_IOCTL_DISK_RESIZE: + BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); ++ case BCH_IOCTL_DISK_RESIZE_JOURNAL: ++ BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); + + default: + return -ENOTTY; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 61c32d73d093..519e68b3c855 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -685,7 +685,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + goto err; + + journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, +- nr + sizeof(*journal_buckets) / sizeof(u64)); ++ nr + sizeof(*journal_buckets) / sizeof(u64)); + if (!journal_buckets) + goto err; + +@@ -731,6 +731,12 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + spin_lock(&c->journal.lock); + } + ++ /* ++ * XXX ++ * For resize at runtime, we should be writing the new ++ * superblock before inserting into the journal array ++ */ ++ + pos = ja->nr ? 
(ja->cur_idx + 1) % ja->nr : 0; + __array_insert_item(ja->buckets, ja->nr, pos); + __array_insert_item(ja->bucket_seq, ja->nr, pos); +@@ -766,6 +772,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + + ret = 0; + err: ++ bch2_sb_resize_journal(&ca->disk_sb, ++ ja->nr + sizeof(*journal_buckets) / sizeof(u64)); + kfree(new_bucket_seq); + kfree(new_buckets); + +-- +cgit v1.2.3 + + +From 2c9fa67dab5b22e7787d6c146d2889c41e38db47 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 19 Nov 2020 20:13:30 -0500 +Subject: bcachefs: Add btree cache stats to sysfs + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 6 ++++++ + fs/bcachefs/btree_cache.h | 1 + + fs/bcachefs/sysfs.c | 7 +++++++ + 3 files changed, 14 insertions(+) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 5bceff48078e..09774f56f11c 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -1064,3 +1064,9 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + stats.floats, + stats.failed); + } ++ ++void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used); ++ pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty)); ++} +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 8a19e60e9258..e766ef552ce7 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -100,5 +100,6 @@ static inline unsigned btree_blocks(struct bch_fs *c) + + void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, + struct btree *); ++void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); + + #endif /* _BCACHEFS_BTREE_CACHE_H */ +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 58c00e26ebe8..900eda88a5dc 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -165,6 +165,7 @@ read_attribute(journal_debug); + read_attribute(journal_pins); + read_attribute(btree_updates); + read_attribute(dirty_btree_nodes); ++read_attribute(btree_cache); + read_attribute(btree_key_cache); + read_attribute(btree_transactions); + read_attribute(stripes_heap); +@@ -374,6 +375,11 @@ SHOW(bch2_fs) + return out.pos - buf; + } + ++ if (attr == &sysfs_btree_cache) { ++ bch2_btree_cache_to_text(&out, c); ++ return out.pos - buf; ++ } ++ + if (attr == &sysfs_btree_key_cache) { + bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); + return out.pos - buf; +@@ -550,6 +556,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_journal_pins, + &sysfs_btree_updates, + &sysfs_dirty_btree_nodes, ++ &sysfs_btree_cache, + &sysfs_btree_key_cache, + &sysfs_btree_transactions, + &sysfs_stripes_heap, +-- +cgit v1.2.3 + + +From 971f096fb87d0abfa58314329da50f495949ee37 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 18 Nov 2020 13:21:59 -0500 +Subject: bcachefs: Be more precise with journal error reporting + +We were incorrectly detecting a journal deadlock - the journal filling +up - when only the journal pin fifo had filled up; if the journal pin +fifo is full that just means we need to wait on reclaim. + +This plumbs through better error reporting so we can better discriminate +in the journal_res_get path what's going on. 
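/*
 * Illustrative sketch, not code from this patch: how the new cur_entry_*
 * states are consumed.  The enum values are the ones the patch adds to
 * struct journal; the named enum type and the helper
 * journal_state_to_errno() are stand-ins for this sketch only.
 */
#include <linux/errno.h>

enum journal_cur_entry_state {
	cur_entry_ok,
	cur_entry_blocked,		/* j->blocked, or previous entry still in flight: retry */
	cur_entry_journal_full,		/* out of space on disk: discard and/or run reclaim */
	cur_entry_journal_pin_full,	/* pin FIFO full: only journal reclaim can help */
	cur_entry_insufficient_devices,	/* too few writable devices: hard error */
};

static int journal_state_to_errno(enum journal_cur_entry_state s)
{
	if (s == cur_entry_ok)
		return 0;
	/*
	 * Only the insufficient-devices case is fatal; the other states are
	 * reported as -EAGAIN, after reclaim has been kicked for the two
	 * journal-full cases.
	 */
	return s == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
}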
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 76 ++++++++++++++++++++++--------------------- + fs/bcachefs/journal_reclaim.c | 6 ++-- + fs/bcachefs/journal_types.h | 8 ++++- + 3 files changed, 49 insertions(+), 41 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 519e68b3c855..a4cc98f86a88 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -235,7 +235,7 @@ static int journal_entry_open(struct journal *j) + BUG_ON(journal_entry_is_open(j)); + + if (j->blocked) +- return -EAGAIN; ++ return cur_entry_blocked; + + if (j->cur_entry_error) + return j->cur_entry_error; +@@ -251,7 +251,7 @@ static int journal_entry_open(struct journal *j) + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + + if (u64s <= le32_to_cpu(buf->data->u64s)) +- return -ENOSPC; ++ return cur_entry_journal_full; + + /* + * Must be set before marking the journal entry as open: +@@ -263,7 +263,7 @@ static int journal_entry_open(struct journal *j) + old.v = new.v = v; + + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) +- return -EROFS; ++ return cur_entry_insufficient_devices; + + /* Handle any already added entries */ + new.cur_entry_offset = le32_to_cpu(buf->data->u64s); +@@ -376,7 +376,7 @@ retry: + * Don't want to close current journal entry, just need to + * invoke reclaim: + */ +- ret = -ENOSPC; ++ ret = cur_entry_journal_full; + goto unlock; + } + +@@ -399,14 +399,16 @@ retry: + * there's still a previous one in flight: + */ + trace_journal_entry_full(c); +- ret = -EAGAIN; ++ ret = cur_entry_blocked; + } else { + ret = journal_entry_open(j); + } + unlock: +- if ((ret == -EAGAIN || ret == -ENOSPC) && +- !j->res_get_blocked_start) ++ if ((ret && ret != cur_entry_insufficient_devices) && ++ !j->res_get_blocked_start) { + j->res_get_blocked_start = local_clock() ?: 1; ++ trace_journal_full(c); ++ } + + can_discard = j->can_discard; + spin_unlock(&j->lock); +@@ -414,41 +416,39 @@ unlock: + if (!ret) + goto retry; + +- if (ret == -ENOSPC) { +- if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), +- "JOURNAL_RES_GET_RESERVED set but journal full")) { +- char *buf; +- +- buf = kmalloc(4096, GFP_NOFS); +- if (buf) { +- bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); +- pr_err("\n%s", buf); +- kfree(buf); +- } ++ if (WARN_ONCE(ret == cur_entry_journal_full && ++ !can_discard && ++ (flags & JOURNAL_RES_GET_RESERVED), ++ "JOURNAL_RES_GET_RESERVED set but journal full")) { ++ char *buf; ++ ++ buf = kmalloc(4096, GFP_NOFS); ++ if (buf) { ++ bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); ++ pr_err("\n%s", buf); ++ kfree(buf); + } ++ } + +- /* +- * Journal is full - can't rely on reclaim from work item due to +- * freezing: +- */ +- trace_journal_full(c); +- +- if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { +- if (can_discard) { +- bch2_journal_do_discards(j); +- goto retry; +- } +- +- if (mutex_trylock(&j->reclaim_lock)) { +- bch2_journal_reclaim(j); +- mutex_unlock(&j->reclaim_lock); +- } ++ /* ++ * Journal is full - can't rely on reclaim from work item due to ++ * freezing: ++ */ ++ if ((ret == cur_entry_journal_full || ++ ret == cur_entry_journal_pin_full) && ++ !(flags & JOURNAL_RES_GET_NONBLOCK)) { ++ if (can_discard) { ++ bch2_journal_do_discards(j); ++ goto retry; + } + +- ret = -EAGAIN; ++ if (mutex_trylock(&j->reclaim_lock)) { ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++ } + } + +- return ret; ++ return ret == cur_entry_insufficient_devices ? 
-EROFS : -EAGAIN; + } + + /* +@@ -1070,6 +1070,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + "last_seq_ondisk:\t%llu\n" + "prereserved:\t\t%u/%u\n" + "current entry sectors:\t%u\n" ++ "current entry error:\t%u\n" + "current entry:\t\t", + fifo_used(&j->pin), + journal_cur_seq(j), +@@ -1077,7 +1078,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + j->last_seq_ondisk, + j->prereserved.reserved, + j->prereserved.remaining, +- j->cur_entry_sectors); ++ j->cur_entry_sectors, ++ j->cur_entry_error); + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 7a04d06bb342..62eda89b1047 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -164,12 +164,12 @@ void bch2_journal_space_available(struct journal *j) + j->can_discard = can_discard; + + if (nr_online < c->opts.metadata_replicas_required) { +- ret = -EROFS; ++ ret = cur_entry_insufficient_devices; + goto out; + } + + if (!fifo_free(&j->pin)) { +- ret = -ENOSPC; ++ ret = cur_entry_journal_pin_full; + goto out; + } + +@@ -180,7 +180,7 @@ void bch2_journal_space_available(struct journal *j) + clean = __journal_space_available(j, nr_devs_want, journal_space_clean); + + if (!discarded.next_entry) +- ret = -ENOSPC; ++ ret = cur_entry_journal_full; + + overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * + journal_entry_overhead(j); +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 9757e3d55991..8a05bb991c2f 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -146,7 +146,13 @@ struct journal { + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ +- int cur_entry_error; ++ enum { ++ cur_entry_ok, ++ cur_entry_blocked, ++ cur_entry_journal_full, ++ cur_entry_journal_pin_full, ++ cur_entry_insufficient_devices, ++ } cur_entry_error; + + union journal_preres_state prereserved; + +-- +cgit v1.2.3 + + +From ef01c0671512ee93c6b168ce885a31742eb13464 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 18 Nov 2020 14:09:33 -0500 +Subject: bcachefs: Add a kmem_cache for btree_key_cache objects + +We allocate a lot of these, and we're seeing sporading OOMs - this will +help with tracking those down. 
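/*
 * Sketch of the dedicated-slab pattern this commit applies to struct
 * bkey_cached (struct my_obj and my_obj_cache are stand-in names, not
 * names from the patch): a dedicated kmem_cache gives these objects
 * their own slab accounting (subject to SLUB cache merging), which is
 * what makes the sporadic OOMs mentioned above easier to attribute.
 */
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

struct my_obj {
	struct list_head	list;
	u64			key;
};

static struct kmem_cache *my_obj_cache;

static int __init my_obj_cache_init(void)
{
	my_obj_cache = KMEM_CACHE(my_obj, 0);	/* size/alignment taken from the struct */
	return my_obj_cache ? 0 : -ENOMEM;
}

static void my_obj_cache_exit(void)
{
	if (my_obj_cache)
		kmem_cache_destroy(my_obj_cache);
}

static struct my_obj *my_obj_alloc(void)
{
	/* same flags the patch uses for bkey_cached allocations */
	return kmem_cache_alloc(my_obj_cache, GFP_NOFS | __GFP_ZERO);
}

static void my_obj_free(struct my_obj *o)
{
	kmem_cache_free(my_obj_cache, o);
}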
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 30 +++++++++++++++++++++++++----- + fs/bcachefs/btree_key_cache.h | 3 +++ + fs/bcachefs/super.c | 2 ++ + 3 files changed, 30 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index d605ff181d2e..9ecd63b50c44 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -12,6 +12,8 @@ + #include + #include + ++static struct kmem_cache *bch2_key_cache; ++ + static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) + { +@@ -104,7 +106,7 @@ bkey_cached_alloc(struct btree_key_cache *c) + return ck; + } + +- ck = kzalloc(sizeof(*ck), GFP_NOFS); ++ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); + if (!ck) + return NULL; + +@@ -515,7 +517,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + if (poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) { + list_del(&ck->list); +- kfree(ck); ++ kmem_cache_free(bch2_key_cache, ck); + freed++; + } + +@@ -570,15 +572,18 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) + bch2_journal_preres_put(&c->journal, &ck->res); + + kfree(ck->k); +- kfree(ck); ++ list_del(&ck->list); ++ kmem_cache_free(bch2_key_cache, ck); + bc->nr_keys--; + } + + BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal)); + BUG_ON(bc->nr_keys); + +- list_for_each_entry_safe(ck, n, &bc->freed, list) +- kfree(ck); ++ list_for_each_entry_safe(ck, n, &bc->freed, list) { ++ list_del(&ck->list); ++ kmem_cache_free(bch2_key_cache, ck); ++ } + mutex_unlock(&bc->lock); + + rhashtable_destroy(&bc->table); +@@ -624,3 +629,18 @@ void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache * + } + mutex_unlock(&c->lock); + } ++ ++void bch2_btree_key_cache_exit(void) ++{ ++ if (bch2_key_cache) ++ kmem_cache_destroy(bch2_key_cache); ++} ++ ++int __init bch2_btree_key_cache_init(void) ++{ ++ bch2_key_cache = KMEM_CACHE(bkey_cached, 0); ++ if (!bch2_key_cache) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index d448264abcc8..e64a8e9c726f 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -25,4 +25,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *); + + void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); + ++void bch2_btree_key_cache_exit(void); ++int __init bch2_btree_key_cache_init(void); ++ + #endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 432bece444c3..0d4416d8ea29 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -2014,6 +2014,7 @@ static void bcachefs_exit(void) + bch2_debug_exit(); + bch2_vfs_exit(); + bch2_chardev_exit(); ++ bch2_btree_key_cache_exit(); + if (bcachefs_kset) + kset_unregister(bcachefs_kset); + } +@@ -2023,6 +2024,7 @@ static int __init bcachefs_init(void) + bch2_bkey_pack_test(); + + if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || ++ bch2_btree_key_cache_init() || + bch2_chardev_init() || + bch2_vfs_init() || + bch2_debug_init()) +-- +cgit v1.2.3 + + +From e7850e553c4bd69c503571f3211ef904658558c2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 19 Nov 2020 11:53:38 -0500 +Subject: bcachefs: More debug code improvements + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 19 +++++++++++++++++-- + 1 file changed, 17 
insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 6445ca614757..27e32262da11 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -49,12 +49,27 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + break; + bp = bkey_s_c_to_btree_ptr_v2(k); + +- BUG_ON(bkey_cmp(next_node, bp.v->min_key)); ++ if (bkey_cmp(next_node, bp.v->min_key)) { ++ bch2_dump_btree_node(c, b); ++ panic("expected next min_key %llu:%llu got %llu:%llu\n", ++ next_node.inode, ++ next_node.offset, ++ bp.v->min_key.inode, ++ bp.v->min_key.offset); ++ } + + bch2_btree_node_iter_advance(&iter, b); + + if (bch2_btree_node_iter_end(&iter)) { +- BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); ++ ++ if (bkey_cmp(k.k->p, b->key.k.p)) { ++ bch2_dump_btree_node(c, b); ++ panic("expected end %llu:%llu got %llu:%llu\n", ++ b->key.k.p.inode, ++ b->key.k.p.offset, ++ k.k->p.inode, ++ k.k->p.offset); ++ } + break; + } + +-- +cgit v1.2.3 + + +From 00cf2e2227bdfab1c6b147624ab631f268bfd480 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 19 Nov 2020 15:38:27 -0500 +Subject: bcachefs: Improve btree key cache shrinker + +The shrinker should start scanning for entries that can be freed oldest +to newest - this way, we can avoid scanning a lot of entries that are +too new to be freed. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 93 +++++++++++++++++++++---------------------- + fs/bcachefs/btree_types.h | 4 +- + 2 files changed, 48 insertions(+), 49 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 9ecd63b50c44..aeeb36e07f9a 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -78,10 +78,13 @@ static void bkey_cached_free(struct btree_key_cache *bc, + { + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + ++ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); ++ + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + +- list_move(&ck->list, &bc->freed); ++ list_move_tail(&ck->list, &bc->freed); ++ bc->nr_freed++; + + kfree(ck->k); + ck->k = NULL; +@@ -96,9 +99,20 @@ bkey_cached_alloc(struct btree_key_cache *c) + { + struct bkey_cached *ck; + +- list_for_each_entry(ck, &c->freed, list) +- if (bkey_cached_lock_for_evict(ck)) ++ list_for_each_entry_reverse(ck, &c->freed, list) ++ if (bkey_cached_lock_for_evict(ck)) { ++ c->nr_freed--; + return ck; ++ } ++ ++ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); ++ if (likely(ck)) { ++ INIT_LIST_HEAD(&ck->list); ++ six_lock_init(&ck->c.lock); ++ BUG_ON(!six_trylock_intent(&ck->c.lock)); ++ BUG_ON(!six_trylock_write(&ck->c.lock)); ++ return ck; ++ } + + list_for_each_entry(ck, &c->clean, list) + if (bkey_cached_lock_for_evict(ck)) { +@@ -106,16 +120,7 @@ bkey_cached_alloc(struct btree_key_cache *c) + return ck; + } + +- ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); +- if (!ck) +- return NULL; +- +- INIT_LIST_HEAD(&ck->list); +- six_lock_init(&ck->c.lock); +- BUG_ON(!six_trylock_intent(&ck->c.lock)); +- BUG_ON(!six_trylock_write(&ck->c.lock)); +- +- return ck; ++ return NULL; + } + + static struct bkey_cached * +@@ -134,8 +139,7 @@ btree_key_cache_create(struct btree_key_cache *c, + ck->key.btree_id = btree_id; + ck->key.pos = pos; + ck->valid = false; +- +- BUG_ON(ck->flags); ++ ck->flags = 1U << BKEY_CACHED_ACCESSED; + + if (rhashtable_lookup_insert_fast(&c->table, + &ck->hash, +@@ 
-292,6 +296,9 @@ fill: + goto err; + } + ++ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) ++ set_bit(BKEY_CACHED_ACCESSED, &ck->flags); ++ + iter->uptodate = BTREE_ITER_NEED_PEEK; + bch2_btree_iter_downgrade(iter); + return ret; +@@ -511,28 +518,34 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + + flags = memalloc_nofs_save(); + ++ /* ++ * Newest freed entries are at the end of the list - once we hit one ++ * that's too new to be freed, we can bail out: ++ */ + list_for_each_entry_safe(ck, t, &bc->freed, list) { +- scanned++; +- +- if (poll_state_synchronize_srcu(&c->btree_trans_barrier, +- ck->btree_trans_barrier_seq)) { +- list_del(&ck->list); +- kmem_cache_free(bch2_key_cache, ck); +- freed++; +- } ++ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ++ ck->btree_trans_barrier_seq)) ++ break; + +- if (scanned >= nr) +- goto out; ++ list_del(&ck->list); ++ kmem_cache_free(bch2_key_cache, ck); ++ bc->nr_freed--; ++ scanned++; ++ freed++; + } + +- list_for_each_entry_safe(ck, t, &bc->clean, list) { +- scanned++; ++ if (scanned >= nr) ++ goto out; + +- if (bkey_cached_lock_for_evict(ck)) { ++ list_for_each_entry_safe(ck, t, &bc->clean, list) { ++ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) ++ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); ++ else if (bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(bc, ck); + bkey_cached_free(bc, ck); + } + ++ scanned++; + if (scanned >= nr) { + if (&t->list != &bc->clean) + list_move_tail(&bc->clean, &t->list); +@@ -599,6 +612,7 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) + + int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) + { ++ c->shrink.seeks = 1; + c->shrink.count_objects = bch2_btree_key_cache_count; + c->shrink.scan_objects = bch2_btree_key_cache_scan; + +@@ -608,26 +622,9 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) + + void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) + { +- struct bucket_table *tbl; +- struct bkey_cached *ck; +- struct rhash_head *pos; +- size_t i; +- +- mutex_lock(&c->lock); +- tbl = rht_dereference_rcu(c->table.tbl, &c->table); +- +- for (i = 0; i < tbl->size; i++) { +- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { +- pr_buf(out, "%s:", +- bch2_btree_ids[ck->key.btree_id]); +- bch2_bpos_to_text(out, ck->key.pos); +- +- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) +- pr_buf(out, " journal seq %llu", ck->journal.seq); +- pr_buf(out, "\n"); +- } +- } +- mutex_unlock(&c->lock); ++ pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); ++ pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys); ++ pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty); + } + + void bch2_btree_key_cache_exit(void) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 6013c9164f69..5fde3ce4090c 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -298,6 +298,7 @@ struct btree_key_cache { + struct list_head dirty; + struct shrinker shrink; + ++ size_t nr_freed; + size_t nr_keys; + size_t nr_dirty; + }; +@@ -307,7 +308,8 @@ struct bkey_cached_key { + struct bpos pos; + } __attribute__((packed, aligned(4))); + +-#define BKEY_CACHED_DIRTY 0 ++#define BKEY_CACHED_ACCESSED 0 ++#define BKEY_CACHED_DIRTY 1 + + struct bkey_cached { + struct btree_bkey_cached_common c; +-- +cgit v1.2.3 + + +From 0a16417b4d285afb2840fe83754acd86c283758a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 19 Nov 2020 19:54:40 -0500 +Subject: bcachefs: Ensure journal reclaim runs when btree key cache is too + dirty + 
+Ensuring the key cache isn't too dirty is critical for ensuring that the +shrinker can reclaim memory. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 8 ++++++ + fs/bcachefs/btree_key_cache.h | 9 +++++++ + fs/bcachefs/journal_reclaim.c | 54 ++++++++++++++++++++++++++----------- + include/trace/events/bcachefs.h | 59 +++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 114 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index aeeb36e07f9a..aa7767c3f044 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -460,6 +460,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) iter->l[0].b; ++ bool kick_reclaim = false; + + BUG_ON(insert->u64s > ck->u64s); + +@@ -484,11 +485,18 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, + + set_bit(BKEY_CACHED_DIRTY, &ck->flags); + c->btree_key_cache.nr_dirty++; ++ ++ if (bch2_nr_btree_keys_need_flush(c)) ++ kick_reclaim = true; ++ + mutex_unlock(&c->btree_key_cache.lock); + } + + bch2_journal_pin_update(&c->journal, trans->journal_res.seq, + &ck->journal, btree_key_cache_journal_flush); ++ ++ if (kick_reclaim) ++ mod_delayed_work(c->journal_reclaim_wq, &c->journal.reclaim_work, 0); + return true; + } + +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index e64a8e9c726f..7723a2178430 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -1,6 +1,15 @@ + #ifndef _BCACHEFS_BTREE_KEY_CACHE_H + #define _BCACHEFS_BTREE_KEY_CACHE_H + ++static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) ++{ ++ size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); ++ size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_dirty); ++ size_t max_dirty = 1024 + (nr_keys * 3) / 4; ++ ++ return max_t(ssize_t, 0, nr_dirty - max_dirty); ++} ++ + struct bkey_cached * + bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 62eda89b1047..a626df18461d 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -1,12 +1,15 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" ++#include "btree_key_cache.h" + #include "journal.h" + #include "journal_io.h" + #include "journal_reclaim.h" + #include "replicas.h" + #include "super.h" + ++#include ++ + /* Free space calculations: */ + + static unsigned journal_space_from(struct journal_device *ja, +@@ -432,7 +435,6 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) + list_move(&ret->list, &pin_list->flushed); + BUG_ON(j->flush_in_progress); + j->flush_in_progress = ret; +- j->last_flushed = jiffies; + } + + spin_unlock(&j->lock); +@@ -441,17 +443,24 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) + } + + /* returns true if we did work */ +-static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, +- unsigned min_nr) ++static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, ++ unsigned min_nr) + { + struct journal_entry_pin *pin; +- bool ret = false; +- u64 seq; ++ u64 seq, ret = 0; + + lockdep_assert_held(&j->reclaim_lock); + +- while ((pin = journal_get_next_pin(j, min_nr +- ? U64_MAX : seq_to_flush, &seq))) { ++ while (1) { ++ cond_resched(); ++ ++ j->last_flushed = jiffies; ++ ++ pin = journal_get_next_pin(j, min_nr ++ ? 
U64_MAX : seq_to_flush, &seq); ++ if (!pin) ++ break; ++ + if (min_nr) + min_nr--; + +@@ -460,7 +469,7 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, + BUG_ON(j->flush_in_progress != pin); + j->flush_in_progress = NULL; + wake_up(&j->pin_flush_wait); +- ret = true; ++ ret++; + } + + return ret; +@@ -527,8 +536,8 @@ static u64 journal_seq_to_flush(struct journal *j) + void bch2_journal_reclaim(struct journal *j) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); +- unsigned min_nr = 0; +- u64 seq_to_flush = 0; ++ u64 seq_to_flush, nr_flushed = 0; ++ size_t min_nr; + + lockdep_assert_held(&j->reclaim_lock); + +@@ -549,12 +558,25 @@ void bch2_journal_reclaim(struct journal *j) + if (j->prereserved.reserved * 2 > j->prereserved.remaining) + min_nr = 1; + +- if ((atomic_read(&c->btree_cache.dirty) * 4 > +- c->btree_cache.used * 3) || +- (c->btree_key_cache.nr_dirty * 4 > +- c->btree_key_cache.nr_keys)) ++ if (atomic_read(&c->btree_cache.dirty) * 4 > ++ c->btree_cache.used * 3) + min_nr = 1; +- } while (journal_flush_pins(j, seq_to_flush, min_nr)); ++ ++ min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c)); ++ ++ trace_journal_reclaim_start(c, ++ min_nr, ++ j->prereserved.reserved, ++ j->prereserved.remaining, ++ atomic_read(&c->btree_cache.dirty), ++ c->btree_cache.used, ++ c->btree_key_cache.nr_dirty, ++ c->btree_key_cache.nr_keys); ++ ++ nr_flushed += journal_flush_pins(j, seq_to_flush, min_nr); ++ } while (min_nr); ++ ++ trace_journal_reclaim_finish(c, nr_flushed); + + if (!bch2_journal_error(j)) + queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, +@@ -582,7 +604,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, + + mutex_lock(&j->reclaim_lock); + +- *did_work = journal_flush_pins(j, seq_to_flush, 0); ++ *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0; + + spin_lock(&j->lock); + /* +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 7a2b8844b998..b0fe4d6bea2e 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -121,6 +121,65 @@ DEFINE_EVENT(bio, journal_write, + TP_ARGS(bio) + ); + ++TRACE_EVENT(journal_reclaim_start, ++ TP_PROTO(struct bch_fs *c, u64 min_nr, ++ u64 prereserved, u64 prereserved_total, ++ u64 btree_cache_dirty, u64 btree_cache_total, ++ u64 btree_key_cache_dirty, u64 btree_key_cache_total), ++ TP_ARGS(c, min_nr, prereserved, prereserved_total, ++ btree_cache_dirty, btree_cache_total, ++ btree_key_cache_dirty, btree_key_cache_total), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, min_nr ) ++ __field(u64, prereserved ) ++ __field(u64, prereserved_total ) ++ __field(u64, btree_cache_dirty ) ++ __field(u64, btree_cache_total ) ++ __field(u64, btree_key_cache_dirty ) ++ __field(u64, btree_key_cache_total ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->min_nr = min_nr; ++ __entry->prereserved = prereserved; ++ __entry->prereserved_total = prereserved_total; ++ __entry->btree_cache_dirty = btree_cache_dirty; ++ __entry->btree_cache_total = btree_cache_total; ++ __entry->btree_key_cache_dirty = btree_key_cache_dirty; ++ __entry->btree_key_cache_total = btree_key_cache_total; ++ ), ++ ++ TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", ++ __entry->uuid, ++ __entry->min_nr, ++ __entry->prereserved, ++ __entry->prereserved_total, ++ __entry->btree_cache_dirty, ++ __entry->btree_cache_total, ++ __entry->btree_key_cache_dirty, ++ 
__entry->btree_key_cache_total) ++); ++ ++TRACE_EVENT(journal_reclaim_finish, ++ TP_PROTO(struct bch_fs *c, u64 nr_flushed), ++ TP_ARGS(c, nr_flushed), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, nr_flushed ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->nr_flushed = nr_flushed; ++ ), ++ ++ TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed) ++); ++ + /* bset.c: */ + + DEFINE_EVENT(bpos, bkey_pack_pos_fail, +-- +cgit v1.2.3 + + +From 8936b66d024152a3347362b5c374e7e02b1da09a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 20 Nov 2020 13:24:51 -0500 +Subject: bcachefs: Simplify transaction commit error path + +The transaction restart path traverses all iterators, we don't need to +do it here. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 11 ----------- + include/trace/events/bcachefs.h | 5 ----- + 2 files changed, 16 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f96a3571d9ee..a47bba452308 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -657,17 +657,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, + break; + } + +- if (ret == -EINTR) { +- int ret2 = bch2_btree_iter_traverse_all(trans); +- +- if (ret2) { +- trace_trans_restart_traverse(trans->ip); +- return ret2; +- } +- +- trace_trans_restart_atomic(trans->ip); +- } +- + return ret; + } + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index b0fe4d6bea2e..a3ecc1a23003 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -716,11 +716,6 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse, + TP_ARGS(ip) + ); + +-DEFINE_EVENT(transaction_restart, trans_restart_atomic, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- + DECLARE_EVENT_CLASS(node_lock_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq), +-- +cgit v1.2.3 + + +From 2233cfd5c2eb12bb5855a3198ea0a448c32a40ca Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 19 Nov 2020 21:15:39 -0500 +Subject: bcachefs: Journal reclaim requires memalloc_noreclaim_save() + +Memory reclaim requires journal reclaim to make forward progress - it's +what cleans our caches - thus, while we're in journal reclaim or holding +the journal reclaim lock we can't recurse into memory reclaim. 
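/*
 * Minimal sketch of the scoping pattern the patch below adds to
 * bch2_journal_reclaim() (my_cache_cleaner() is a stand-in name):
 * memalloc_noreclaim_save() sets PF_MEMALLOC on the current task, so any
 * allocation made while cleaning the cache skips direct reclaim - which
 * would otherwise wait on the very work this task is doing.
 */
#include <linux/sched/mm.h>

static void my_cache_cleaner(void)
{
	unsigned int flags = memalloc_noreclaim_save();

	/* flush/evict cached entries; allocations here won't recurse into reclaim */

	memalloc_noreclaim_restore(flags);
}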
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index a626df18461d..3af085dee387 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -8,6 +8,7 @@ + #include "replicas.h" + #include "super.h" + ++#include + #include + + /* Free space calculations: */ +@@ -538,8 +539,16 @@ void bch2_journal_reclaim(struct journal *j) + struct bch_fs *c = container_of(j, struct bch_fs, journal); + u64 seq_to_flush, nr_flushed = 0; + size_t min_nr; ++ unsigned flags; + ++ /* ++ * We can't invoke memory reclaim while holding the reclaim_lock - ++ * journal reclaim is required to make progress for memory reclaim ++ * (cleaning the caches), so we can't get stuck in memory reclaim while ++ * we're holding the reclaim lock: ++ */ + lockdep_assert_held(&j->reclaim_lock); ++ flags = memalloc_noreclaim_save(); + + do { + bch2_journal_do_discards(j); +@@ -576,6 +585,8 @@ void bch2_journal_reclaim(struct journal *j) + nr_flushed += journal_flush_pins(j, seq_to_flush, min_nr); + } while (min_nr); + ++ memalloc_noreclaim_restore(flags); ++ + trace_journal_reclaim_finish(c, nr_flushed); + + if (!bch2_journal_error(j)) +-- +cgit v1.2.3 + + +From ecb2e7f72b5866df82a07351bf128e5a645100ec Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 19 Nov 2020 21:40:03 -0500 +Subject: bcachefs: Throttle updates when btree key cache is too dirty + +This is needed to ensure we don't deadlock because journal reclaim and +thus memory reclaim isn't making forward progress. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.h | 11 ++++++++++- + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update_leaf.c | 19 +++++++++++++++++++ + include/trace/events/bcachefs.h | 5 +++++ + 4 files changed, 35 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index 7723a2178430..d7d31a0662c3 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -5,11 +5,20 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) + { + size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); + size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_dirty); +- size_t max_dirty = 1024 + (nr_keys * 3) / 4; ++ size_t max_dirty = 4096 + nr_keys / 2; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); + } + ++static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) ++{ ++ size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); ++ size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_dirty); ++ size_t max_dirty = 4096 + (nr_keys * 3) / 4; ++ ++ return nr_dirty > max_dirty; ++} ++ + struct bkey_cached * + bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 5fde3ce4090c..2d142ef601e1 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -649,6 +649,7 @@ enum btree_insert_ret { + BTREE_INSERT_ENOSPC, + BTREE_INSERT_NEED_MARK_REPLICAS, + BTREE_INSERT_NEED_JOURNAL_RES, ++ BTREE_INSERT_NEED_JOURNAL_RECLAIM, + }; + + enum btree_gc_coalesce_fail_reason { +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index a47bba452308..3c9251483313 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -286,6 +286,10 @@ btree_key_can_insert_cached(struct btree_trans *trans, + + BUG_ON(iter->level); + ++ 
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && ++ bch2_btree_key_cache_must_wait(trans->c)) ++ return BTREE_INSERT_NEED_JOURNAL_RECLAIM; ++ + if (u64s <= ck->u64s) + return BTREE_INSERT_OK; + +@@ -652,6 +656,21 @@ int bch2_trans_commit_error(struct btree_trans *trans, + trace_trans_restart_journal_res_get(trans->ip); + ret = -EINTR; + break; ++ case BTREE_INSERT_NEED_JOURNAL_RECLAIM: ++ bch2_trans_unlock(trans); ++ ++ while (bch2_btree_key_cache_must_wait(c)) { ++ mutex_lock(&c->journal.reclaim_lock); ++ bch2_journal_reclaim(&c->journal); ++ mutex_unlock(&c->journal.reclaim_lock); ++ } ++ ++ if (bch2_trans_relock(trans)) ++ return 0; ++ ++ trace_trans_restart_journal_reclaim(trans->ip); ++ ret = -EINTR; ++ break; + default: + BUG_ON(ret >= 0); + break; +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index a3ecc1a23003..cb22db36fc03 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -681,6 +681,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, + TP_ARGS(ip) + ); + ++DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ + DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +-- +cgit v1.2.3 + + +From 84516ca8ec24b1a8090969684d0b71955e17796d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 19 Nov 2020 20:55:33 -0500 +Subject: bcachefs: Move journal reclaim to a kthread + +This is to make tracing easier. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 +- + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/btree_key_cache.c | 2 +- + fs/bcachefs/chardev.c | 3 +- + fs/bcachefs/journal.c | 16 ++++++-- + fs/bcachefs/journal_io.c | 4 +- + fs/bcachefs/journal_reclaim.c | 86 ++++++++++++++++++++++++++++++++++++------ + fs/bcachefs/journal_reclaim.h | 15 +++++++- + fs/bcachefs/journal_types.h | 6 ++- + fs/bcachefs/movinggc.c | 2 +- + fs/bcachefs/rebalance.c | 2 +- + fs/bcachefs/super.c | 16 ++++---- + 13 files changed, 122 insertions(+), 35 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 1f6b2742efd9..067631f51ddc 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1409,7 +1409,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca) + return 0; + + p = kthread_create(bch2_allocator_thread, ca, +- "bch_alloc[%s]", ca->name); ++ "bch-alloc/%s", ca->name); + if (IS_ERR(p)) + return PTR_ERR(p); + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index bd5b9207ee64..91f5844a1d36 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -651,7 +651,6 @@ struct bch_fs { + struct workqueue_struct *wq; + /* copygc needs its own workqueue for index updates.. 
*/ + struct workqueue_struct *copygc_wq; +- struct workqueue_struct *journal_reclaim_wq; + + /* ALLOCATION */ + struct delayed_work pd_controllers_update; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index ba4acc112ed3..ac81c9b9a06a 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1427,7 +1427,7 @@ int bch2_gc_thread_start(struct bch_fs *c) + + BUG_ON(c->gc_thread); + +- p = kthread_create(bch2_gc_thread, c, "bch_gc"); ++ p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); + if (IS_ERR(p)) + return PTR_ERR(p); + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index aa7767c3f044..a8d05b4739b7 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -496,7 +496,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, + &ck->journal, btree_key_cache_journal_flush); + + if (kick_reclaim) +- mod_delayed_work(c->journal_reclaim_wq, &c->journal.reclaim_work, 0); ++ journal_reclaim_kick(&c->journal); + return true; + } + +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index 4663784d2f28..e7c8969aaad1 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -341,7 +341,8 @@ static long bch2_ioctl_data(struct bch_fs *c, + ctx->c = c; + ctx->arg = arg; + +- ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); ++ ctx->thread = kthread_create(bch2_data_thread, ctx, ++ "bch-data/%s", c->name); + if (IS_ERR(ctx->thread)) { + ret = PTR_ERR(ctx->thread); + goto err; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index a4cc98f86a88..0dfd95094cdf 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -226,11 +226,14 @@ static bool journal_entry_close(struct journal *j) + */ + static int journal_entry_open(struct journal *j) + { ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf = journal_cur_buf(j); + union journal_res_state old, new; + int u64s; + u64 v; + ++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); ++ + lockdep_assert_held(&j->lock); + BUG_ON(journal_entry_is_open(j)); + +@@ -481,8 +484,10 @@ static bool journal_preres_available(struct journal *j, + { + bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); + +- if (!ret) +- bch2_journal_reclaim_work(&j->reclaim_work.work); ++ if (!ret && mutex_trylock(&j->reclaim_lock)) { ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++ } + + return ret; + } +@@ -889,7 +894,7 @@ void bch2_fs_journal_stop(struct journal *j) + j->last_empty_seq + 1 != journal_cur_seq(j))); + + cancel_delayed_work_sync(&j->write_work); +- cancel_delayed_work_sync(&j->reclaim_work); ++ bch2_journal_reclaim_stop(j); + } + + int bch2_fs_journal_start(struct journal *j, u64 cur_seq, +@@ -1017,7 +1022,6 @@ int bch2_fs_journal_init(struct journal *j) + spin_lock_init(&j->err_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); +- INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); + init_waitqueue_head(&j->pin_flush_wait); + mutex_init(&j->reclaim_lock); + mutex_init(&j->discard_lock); +@@ -1069,6 +1073,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + "last_seq:\t\t%llu\n" + "last_seq_ondisk:\t%llu\n" + "prereserved:\t\t%u/%u\n" ++ "nr direct reclaim:\t%llu\n" ++ "nr background reclaim:\t%llu\n" + "current entry sectors:\t%u\n" + "current entry error:\t%u\n" + "current entry:\t\t", +@@ -1078,6 +1084,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal 
*j) + j->last_seq_ondisk, + j->prereserved.reserved, + j->prereserved.remaining, ++ j->nr_direct_reclaim, ++ j->nr_background_reclaim, + j->cur_entry_sectors, + j->cur_entry_error); + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 7c157bc50268..d1367cf067d3 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -994,7 +994,7 @@ static void journal_write_done(struct closure *cl) + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ +- mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); ++ journal_reclaim_kick(&c->journal); + + /* also must come before signalling write completion: */ + closure_debug_destroy(cl); +@@ -1045,6 +1045,8 @@ void bch2_journal_write(struct closure *cl) + unsigned i, sectors, bytes, u64s; + int ret; + ++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); ++ + bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); + + journal_buf_realloc(j, w); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 3af085dee387..2fa87c7dab7a 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -8,6 +8,7 @@ + #include "replicas.h" + #include "super.h" + ++#include + #include + #include + +@@ -534,9 +535,10 @@ static u64 journal_seq_to_flush(struct journal *j) + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. + */ +-void bch2_journal_reclaim(struct journal *j) ++static void __bch2_journal_reclaim(struct journal *j, bool direct) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ bool kthread = (current->flags & PF_KTHREAD) != 0; + u64 seq_to_flush, nr_flushed = 0; + size_t min_nr; + unsigned flags; +@@ -551,6 +553,9 @@ void bch2_journal_reclaim(struct journal *j) + flags = memalloc_noreclaim_save(); + + do { ++ if (kthread && kthread_should_stop()) ++ break; ++ + bch2_journal_do_discards(j); + + seq_to_flush = journal_seq_to_flush(j); +@@ -582,26 +587,83 @@ void bch2_journal_reclaim(struct journal *j) + c->btree_key_cache.nr_dirty, + c->btree_key_cache.nr_keys); + +- nr_flushed += journal_flush_pins(j, seq_to_flush, min_nr); ++ nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); ++ ++ if (direct) ++ j->nr_direct_reclaim += nr_flushed; ++ else ++ j->nr_background_reclaim += nr_flushed; ++ trace_journal_reclaim_finish(c, nr_flushed); + } while (min_nr); + + memalloc_noreclaim_restore(flags); ++} ++ ++void bch2_journal_reclaim(struct journal *j) ++{ ++ __bch2_journal_reclaim(j, true); ++} ++ ++static int bch2_journal_reclaim_thread(void *arg) ++{ ++ struct journal *j = arg; ++ unsigned long next; ++ ++ while (!kthread_should_stop()) { ++ j->reclaim_kicked = false; ++ ++ mutex_lock(&j->reclaim_lock); ++ __bch2_journal_reclaim(j, false); ++ mutex_unlock(&j->reclaim_lock); ++ ++ next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); + +- trace_journal_reclaim_finish(c, nr_flushed); ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread_should_stop()) ++ break; ++ if (j->reclaim_kicked) ++ break; ++ if (time_after_eq(jiffies, next)) ++ break; ++ schedule_timeout(next - jiffies); + +- if (!bch2_journal_error(j)) +- queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, +- msecs_to_jiffies(j->reclaim_delay_ms)); ++ } ++ __set_current_state(TASK_RUNNING); ++ } ++ ++ return 0; + } + +-void bch2_journal_reclaim_work(struct work_struct *work) ++void bch2_journal_reclaim_stop(struct journal *j) + { +- struct journal *j = container_of(to_delayed_work(work), +- struct 
journal, reclaim_work); ++ struct task_struct *p = j->reclaim_thread; + +- mutex_lock(&j->reclaim_lock); +- bch2_journal_reclaim(j); +- mutex_unlock(&j->reclaim_lock); ++ j->reclaim_thread = NULL; ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_journal_reclaim_start(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct task_struct *p; ++ ++ if (j->reclaim_thread) ++ return 0; ++ ++ p = kthread_create(bch2_journal_reclaim_thread, j, ++ "bch-reclaim/%s", c->name); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ j->reclaim_thread = p; ++ wake_up_process(p); ++ return 0; + } + + static int journal_flush_done(struct journal *j, u64 seq_to_flush, +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index 8128907a7623..bae2c9210db8 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -10,6 +10,17 @@ enum journal_space_from { + journal_space_clean, + }; + ++static inline void journal_reclaim_kick(struct journal *j) ++{ ++ struct task_struct *p = READ_ONCE(j->reclaim_thread); ++ ++ if (p && !j->reclaim_kicked) { ++ j->reclaim_kicked = true; ++ if (p) ++ wake_up_process(p); ++ } ++} ++ + unsigned bch2_journal_dev_buckets_available(struct journal *, + struct journal_device *, + enum journal_space_from); +@@ -55,7 +66,9 @@ void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); + + void bch2_journal_do_discards(struct journal *); + void bch2_journal_reclaim(struct journal *); +-void bch2_journal_reclaim_work(struct work_struct *); ++ ++void bch2_journal_reclaim_stop(struct journal *); ++int bch2_journal_reclaim_start(struct journal *); + + bool bch2_journal_flush_pins(struct journal *, u64); + +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 8a05bb991c2f..4640bb8687cc 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -216,8 +216,12 @@ struct journal { + struct write_point wp; + spinlock_t err_lock; + +- struct delayed_work reclaim_work; + struct mutex reclaim_lock; ++ struct task_struct *reclaim_thread; ++ bool reclaim_kicked; ++ u64 nr_direct_reclaim; ++ u64 nr_background_reclaim; ++ + unsigned long last_flushed; + struct journal_entry_pin *flush_in_progress; + wait_queue_head_t pin_flush_wait; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index ddfda1ef8a79..4834f41f48ed 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -345,7 +345,7 @@ int bch2_copygc_start(struct bch_fs *c) + if (bch2_fs_init_fault("copygc_start")) + return -ENOMEM; + +- t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); ++ t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); + if (IS_ERR(t)) + return PTR_ERR(t); + +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index 44d2651be970..c3373c48fa81 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -314,7 +314,7 @@ int bch2_rebalance_start(struct bch_fs *c) + if (c->opts.nochanges) + return 0; + +- p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); ++ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); + if (IS_ERR(p)) + return PTR_ERR(p); + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 0d4416d8ea29..5dc594192bc0 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -49,7 +49,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -259,7 +258,7 @@ static void 
bch2_writes_disabled(struct percpu_ref *writes) + void bch2_fs_read_only(struct bch_fs *c) + { + if (!test_bit(BCH_FS_RW, &c->flags)) { +- cancel_delayed_work_sync(&c->journal.reclaim_work); ++ BUG_ON(c->journal.reclaim_thread); + return; + } + +@@ -417,6 +416,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + + set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + ++ ret = bch2_journal_reclaim_start(&c->journal); ++ if (ret) { ++ bch_err(c, "error starting journal reclaim: %i", ret); ++ return ret; ++ } ++ + if (!early) { + ret = bch2_fs_read_write_late(c); + if (ret) +@@ -425,9 +430,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + + percpu_ref_reinit(&c->writes); + set_bit(BCH_FS_RW, &c->flags); +- +- queue_delayed_work(c->journal_reclaim_wq, +- &c->journal.reclaim_work, 0); + return 0; + err: + __bch2_fs_read_only(c); +@@ -495,8 +497,6 @@ static void __bch2_fs_free(struct bch_fs *c) + kfree(c->unused_inode_hints); + free_heap(&c->copygc_heap); + +- if (c->journal_reclaim_wq) +- destroy_workqueue(c->journal_reclaim_wq); + if (c->copygc_wq) + destroy_workqueue(c->copygc_wq); + if (c->wq) +@@ -749,8 +749,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || +- !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim", +- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || + percpu_ref_init(&c->writes, bch2_writes_disabled, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || +-- +cgit v1.2.3 + + +From 35d1245f5a01a31e5ea500a25de3bae26106832c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 20 Nov 2020 16:12:39 -0500 +Subject: bcachefs: Fix an rcu splat + +bch2_bucket_alloc() requires rcu_read_lock() to be held. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 0dfd95094cdf..1c4dce654aa0 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -683,16 +683,19 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + if (nr <= ja->nr) + return 0; + +- ret = -ENOMEM; + new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); +- if (!new_buckets || !new_bucket_seq) ++ if (!new_buckets || !new_bucket_seq) { ++ ret = -ENOMEM; + goto err; ++ } + + journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, + nr + sizeof(*journal_buckets) / sizeof(u64)); +- if (!journal_buckets) ++ if (!journal_buckets) { ++ ret = -ENOSPC; + goto err; ++ } + + /* + * We may be called from the device add path, before the new device has +@@ -721,8 +724,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + goto err; + } + } else { ++ rcu_read_lock(); + ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, + false, cl); ++ rcu_read_unlock(); + if (IS_ERR(ob)) { + ret = cl ? 
-EAGAIN : -ENOSPC; + goto err; +@@ -774,8 +779,6 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + if (!new_fs) + bch2_open_bucket_put(c, ob); + } +- +- ret = 0; + err: + bch2_sb_resize_journal(&ca->disk_sb, + ja->nr + sizeof(*journal_buckets) / sizeof(u64)); +-- +cgit v1.2.3 + + +From 0012394b25f2347e13535ff280a41f1867b46971 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 20 Nov 2020 21:21:28 -0500 +Subject: bcachefs: Don't use bkey cache for inode update in fsck + +fsck doesn't know about the btree key cache, and non-cached iterators +aren't cache coherent (yet?) + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 2 +- + fs/bcachefs/fsck.c | 2 +- + fs/bcachefs/inode.c | 14 ++++++++++---- + fs/bcachefs/inode.h | 2 +- + 4 files changed, 13 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index e612bcb88564..7ee34771a867 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1261,7 +1261,7 @@ static void bch2_evict_inode(struct inode *vinode) + KEY_TYPE_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +- bch2_inode_rm(c, inode->v.i_ino); ++ bch2_inode_rm(c, inode->v.i_ino, true); + } + } + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 0c5035270846..09ce6c29b88c 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1254,7 +1254,7 @@ static int check_inode(struct btree_trans *trans, + + bch2_fs_lazy_rw(c); + +- ret = bch2_inode_rm(c, u.bi_inum); ++ ret = bch2_inode_rm(c, u.bi_inum, false); + if (ret) + bch_err(c, "error in fsck: error %i while deleting inode", ret); + return ret; +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 823a1ddec5ac..76f62f1e3969 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -542,7 +542,7 @@ found_slot: + return ret; + } + +-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) ++int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -576,9 +576,15 @@ retry: + + bi_generation = 0; + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), +- BTREE_ITER_CACHED|BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_cached(iter); ++ if (cached) { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), ++ BTREE_ITER_CACHED|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_cached(iter); ++ } else { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ } + + ret = bkey_err(k); + if (ret) +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index ef7e885dce0c..dbdfcf63d079 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -71,7 +71,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + + int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); + +-int bch2_inode_rm(struct bch_fs *, u64); ++int bch2_inode_rm(struct bch_fs *, u64, bool); + + int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *); +-- +cgit v1.2.3 + + +From 34194f3b645e457ccad270faf5f751b2b83b9dad Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 20 Nov 2020 21:28:55 -0500 +Subject: bcachefs: bch2_btree_delete_range_trans() + +This helps reduce stack usage by avoiding multiple btree_trans on the +stack. 
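
The stack-usage win is on the caller side: a single btree_trans is initialized once and handed to each range delete, instead of every bch2_btree_delete_range() call building its own transaction internally. Condensed from the bch2_inode_rm() hunk below (error handling and the inode update that follows are elided):

    struct btree_trans trans;

    bch2_trans_init(&trans, c, 0, 0);

    /* one transaction on the stack, reused for all three ranges: */
    ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS, start, end, NULL) ?:
          bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS,  start, end, NULL) ?:
          bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS, start, end, NULL);

    bch2_trans_exit(&trans);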
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update.h | 4 +-- + fs/bcachefs/btree_update_leaf.c | 66 +++++++++++++++++++---------------------- + fs/bcachefs/inode.c | 20 ++++++------- + 3 files changed, 42 insertions(+), 48 deletions(-) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index e0b1bde37484..adb07043cbb3 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -67,8 +67,8 @@ int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); + int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, + struct disk_reservation *, u64 *, int flags); + +-int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, +- struct bpos, u64 *); ++int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, ++ struct bpos, struct bpos, u64 *); + int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + struct bpos, struct bpos, u64 *); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 3c9251483313..d1196ce70058 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1094,13 +1094,32 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, + __bch2_btree_insert(&trans, id, k)); + } + +-int bch2_btree_delete_at_range(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bpos end, +- u64 *journal_seq) ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned flags) ++{ ++ struct bkey_i k; ++ ++ bkey_init(&k.k); ++ k.k.p = iter->pos; ++ ++ bch2_trans_update(trans, iter, &k, 0); ++ return bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE|flags); ++} ++ ++int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, ++ struct bpos start, struct bpos end, ++ u64 *journal_seq) + { ++ struct btree_iter *iter; + struct bkey_s_c k; + int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret; + retry: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && +@@ -1111,6 +1130,10 @@ retry: + + bkey_init(&delete.k); + ++ /* ++ * This could probably be more efficient for extents: ++ */ ++ + /* + * For extents, iter.pos won't necessarily be the same as + * bkey_start_pos(k.k) (for non extents they always will be the +@@ -1150,22 +1173,8 @@ retry: + goto retry; + } + ++ bch2_trans_iter_put(trans, iter); + return ret; +- +-} +- +-int bch2_btree_delete_at(struct btree_trans *trans, +- struct btree_iter *iter, unsigned flags) +-{ +- struct bkey_i k; +- +- bkey_init(&k.k); +- k.k.p = iter->pos; +- +- bch2_trans_update(trans, iter, &k, 0); +- return bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE|flags); + } + + /* +@@ -1177,21 +1186,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, + u64 *journal_seq) + { +- struct btree_trans trans; +- struct btree_iter *iter; +- int ret = 0; +- +- /* +- * XXX: whether we need mem/more iters depends on whether this btree id +- * has triggers +- */ +- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); +- +- iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); +- +- ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); +- ret = bch2_trans_exit(&trans) ?: ret; +- +- BUG_ON(ret == -EINTR); +- return ret; ++ return bch2_trans_do(c, NULL, journal_seq, 0, ++ 
bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); + } +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 76f62f1e3969..82099e5a48d8 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -553,6 +553,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + u64 bi_generation; + int ret; + ++ bch2_trans_init(&trans, c, 0, 0); ++ + /* + * If this was a directory, there shouldn't be any real dirents left - + * but there could be whiteouts (from hash collisions) that we should +@@ -561,16 +563,14 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + * XXX: the dirent could ideally would delete whiteouts when they're no + * longer needed + */ +- ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, +- start, end, NULL) ?: +- bch2_btree_delete_range(c, BTREE_ID_XATTRS, +- start, end, NULL) ?: +- bch2_btree_delete_range(c, BTREE_ID_DIRENTS, +- start, end, NULL); ++ ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS, ++ start, end, NULL); + if (ret) +- return ret; +- +- bch2_trans_init(&trans, c, 0, 0); ++ goto err; + retry: + bch2_trans_begin(&trans); + +@@ -590,7 +590,7 @@ retry: + if (ret) + goto err; + +- bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, ++ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, trans.c, + "inode %llu not found when deleting", + inode_nr); + +-- +cgit v1.2.3 + + +From ae4492ab8745dbb85d3c40c61e8352eb46d584a1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 20 Nov 2020 22:51:04 -0500 +Subject: bcachefs: Delete dead code + +The interior btree node update path has changed, this is no longer +needed. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 3 --- + fs/bcachefs/btree_update_interior.h | 1 - + 2 files changed, 4 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 27e32262da11..5143896e1b29 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1384,9 +1384,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + +- if (as->must_rewrite) +- goto split; +- + bch2_btree_node_lock_for_insert(c, b, iter); + + if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 41854fc345d2..45d212730fd7 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -47,7 +47,6 @@ struct btree_update { + BTREE_INTERIOR_UPDATING_AS, + } mode; + +- unsigned must_rewrite:1; + unsigned nodes_written:1; + + enum btree_id btree_id; +-- +cgit v1.2.3 + + +From d169bccbd0aced82734a10ad8e9704118cf0d93b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 20 Nov 2020 19:27:57 -0500 +Subject: bcachefs: Optimize bch2_journal_flush_seq_async() + +Avoid taking the journal lock if we don't have to. 
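
The optimization has the familiar shape of an unlocked fast path followed by a recheck once the lock is held; a condensed sketch of the resulting function, with the existing slow path elided:

    int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
                                     struct closure *parent)
    {
            int ret = 0;

            /*
             * Unlocked fast path: neither err_seq nor seq_ondisk ever moves
             * backwards, so a hit here is already a final answer.
             */
            if (seq <= j->err_seq)
                    return -EIO;
            if (seq <= j->seq_ondisk)
                    return 1;

            spin_lock(&j->lock);

            /* recheck under the lock before doing any real work */
            if (seq <= j->err_seq)
                    ret = -EIO;

            /* ... existing slow path: attach parent to the right journal buf ... */

            spin_unlock(&j->lock);
            return ret;
    }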
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 1c4dce654aa0..5874a9ff2204 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -548,12 +548,20 @@ out: + * necessary + */ + int bch2_journal_flush_seq_async(struct journal *j, u64 seq, +- struct closure *parent) ++ struct closure *parent) + { + struct journal_buf *buf; + int ret = 0; + ++ if (seq <= j->err_seq) ++ return -EIO; ++ ++ if (seq <= j->seq_ondisk) ++ return 1; ++ + spin_lock(&j->lock); ++ ++ /* Recheck under lock: */ + if (seq <= j->err_seq) { + ret = -EIO; + goto out; +-- +cgit v1.2.3 + + +From 2725c03cfc21d719d6edd0d36ba0bdd8fc33934f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 29 Nov 2020 16:00:47 -0500 +Subject: bcachefs: Fix for __readahead_batch getting partial batch + +We were incorrectly ignoring the return value of __readahead_batch, +leading to a null ptr deref in __bch2_page_state_create(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 222e390acc64..86236e851ce9 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -692,7 +692,7 @@ static int readpages_iter_init(struct readpages_iter *iter, + if (!iter->pages) + return -ENOMEM; + +- __readahead_batch(ractl, iter->pages, nr_pages); ++ nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); + put_page(iter->pages[i]); +-- +cgit v1.2.3 + + +From 6a914550fcd32de909725ff471d6b44a8e9d96e5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 29 Nov 2020 17:09:13 -0500 +Subject: bcachefs: Fix journal reclaim spinning in recovery + +We can't run journal reclaim until we've finished replaying updates to +interior btree nodes - the check for this was in the wrong place though, +leading to journal reclaim spinning before it was allowed to proceed. 
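
In sketch form, the fix makes the gate explicit: the reclaim kthread parks until recovery raises JOURNAL_RECLAIM_STARTED, and recovery kicks the thread when it sets the flag. Condensed from the two hunks below:

    /* journal_reclaim.c: park (freezably) until recovery says go */
    static int bch2_journal_reclaim_thread(void *arg)
    {
            struct journal *j = arg;

            set_freezable();
            kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));

            while (!kthread_should_stop()) {
                    /* ... normal reclaim loop ... */
            }
            return 0;
    }

    /* recovery.c: once journal reclaim is allowed to run */
    set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
    journal_reclaim_kick(j);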
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 5 +++++ + fs/bcachefs/recovery.c | 1 + + 2 files changed, 6 insertions(+) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 2fa87c7dab7a..66f5dcce8889 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -609,6 +609,10 @@ static int bch2_journal_reclaim_thread(void *arg) + struct journal *j = arg; + unsigned long next; + ++ set_freezable(); ++ ++ kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); ++ + while (!kthread_should_stop()) { + j->reclaim_kicked = false; + +@@ -627,6 +631,7 @@ static int bch2_journal_reclaim_thread(void *arg) + if (time_after_eq(jiffies, next)) + break; + schedule_timeout(next - jiffies); ++ try_to_freeze(); + + } + __set_current_state(TASK_RUNNING); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 6750063663b5..0b3521c9cc19 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -616,6 +616,7 @@ static int bch2_journal_replay(struct bch_fs *c, + */ + set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); + set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); ++ journal_reclaim_kick(j); + + j->replay_journal_seq = seq; + +-- +cgit v1.2.3 + + +From ee435745d6e959c09ce1bf3ac5f88912908df95b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 29 Nov 2020 23:48:20 -0500 +Subject: bcachefs: Fix error in filesystem initialization + +The rhashtable code doesn't like when we destroy an rhashtable that was +never initialized + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 17 ++++++++++++++--- + fs/bcachefs/btree_types.h | 1 + + 2 files changed, 15 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index a8d05b4739b7..a21dc485c677 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -607,7 +607,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) + } + mutex_unlock(&bc->lock); + +- rhashtable_destroy(&bc->table); ++ if (bc->table_init_done) ++ rhashtable_destroy(&bc->table); + } + + void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) +@@ -620,12 +621,22 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) + + int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) + { ++ int ret; ++ + c->shrink.seeks = 1; + c->shrink.count_objects = bch2_btree_key_cache_count; + c->shrink.scan_objects = bch2_btree_key_cache_scan; + +- return register_shrinker(&c->shrink) ?: +- rhashtable_init(&c->table, &bch2_btree_key_cache_params); ++ ret = register_shrinker(&c->shrink); ++ if (ret) ++ return ret; ++ ++ ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params); ++ if (ret) ++ return ret; ++ ++ c->table_init_done = true; ++ return 0; + } + + void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 2d142ef601e1..cf59f1224741 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -293,6 +293,7 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter) + struct btree_key_cache { + struct mutex lock; + struct rhashtable table; ++ bool table_init_done; + struct list_head freed; + struct list_head clean; + struct list_head dirty; +-- +cgit v1.2.3 + + +From 0b2c7a3d6130ddeabceae71b2991826fdb64b2a2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Nov 2020 02:07:38 -0500 +Subject: bcachefs: 
Change a BUG_ON() to a fatal error + +In the btree key cache code, failing to flush a dirty key is a serious +error, but it doesn't need to be a BUG_ON(), we can stop the filesystem +instead. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index a21dc485c677..3ed7d3778aca 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -367,10 +367,11 @@ err: + if (ret == -EINTR) + goto retry; + +- BUG_ON(ret && !bch2_journal_error(j)); +- +- if (ret) ++ if (ret) { ++ bch2_fs_fatal_err_on(!bch2_journal_error(j), c, ++ "error flushing key cache: %i", ret); + goto out; ++ } + + bch2_journal_pin_drop(j, &ck->journal); + bch2_journal_preres_put(j, &ck->res); +-- +cgit v1.2.3 + + +From 387ec8bcd1315855dffe7295ad73170c2b627bdd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Nov 2020 02:08:14 -0500 +Subject: bcachefs: Ensure we always have a journal pin in interior update path + +For the new nodes an interior btree update makes reachable, updates to +those nodes may be journalled after the btree update starts but before +the transactional part - where we make those nodes reachable. Those +updates need to be kept in the journal until after the btree update +completes, hence we should always get a journal pin at the start of the +interior update. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 16 ++++++---------- + 1 file changed, 6 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 5143896e1b29..dc7b1342410e 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -544,6 +544,8 @@ static void btree_update_nodes_written(struct btree_update *as) + unsigned i; + int ret; + ++ BUG_ON(!journal_pin_active(&as->journal)); ++ + /* + * We did an update to a parent node where the pointers we added pointed + * to child nodes that weren't written yet: now, the child nodes have +@@ -699,17 +701,7 @@ static void btree_update_reparent(struct btree_update *as, + child->b = NULL; + child->mode = BTREE_INTERIOR_UPDATING_AS; + +- /* +- * When we write a new btree root, we have to drop our journal pin +- * _before_ the new nodes are technically reachable; see +- * btree_update_nodes_written(). +- * +- * This goes for journal pins that are recursively blocked on us - so, +- * just transfer the journal pin to the new interior update so +- * btree_update_nodes_written() can drop it. 
+- */ + bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); +- bch2_journal_pin_drop(&c->journal, &child->journal); + } + + static void btree_update_updated_root(struct btree_update *as, struct btree *b) +@@ -956,6 +948,10 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + if (ret) + goto err; + ++ bch2_journal_pin_add(&c->journal, ++ atomic64_read(&c->journal.seq), ++ &as->journal, NULL); ++ + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); +-- +cgit v1.2.3 + + +From 3fabb6e97e52ad2b8ea4b358b2c7d38d4844c5e4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 1 Dec 2020 11:40:59 -0500 +Subject: bcachefs: Use BTREE_ITER_PREFETCH in journal+btree iter + +Introducing the journal+btree iter introduced a regression where we +stopped using BTREE_ITER_PREFETCH - this is a performance regression on +rotating disks. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 0b3521c9cc19..a837d9eb0f6d 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -187,7 +187,7 @@ void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, + { + memset(iter, 0, sizeof(*iter)); + +- iter->btree = bch2_trans_get_iter(trans, id, pos, 0); ++ iter->btree = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_PREFETCH); + bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); + } + +-- +cgit v1.2.3 + + +From 3bc85b89b50197501d0a07c95403be0cc5fd223d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 1 Dec 2020 11:42:23 -0500 +Subject: bcachefs: Fix for fsck spuriously finding duplicate extents + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 09ce6c29b88c..7449819d8eac 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -485,7 +485,11 @@ static int check_extents(struct bch_fs *c) + BTREE_ITER_INTENT); + retry: + for_each_btree_key_continue(iter, 0, k, ret) { +- if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ /* ++ * due to retry errors we might see the same extent twice: ++ */ ++ if (bkey_cmp(prev.k->k.p, k.k->p) && ++ bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + char buf1[200]; + char buf2[200]; + +-- +cgit v1.2.3 + + +From 626ca1d830675d9f9390dd671aeee6ae1a191bc4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 1 Dec 2020 11:48:08 -0500 +Subject: bcachefs: Journal pin refactoring + +This deletes some duplicated code. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 64 +++++-------------------------------------- + fs/bcachefs/journal_reclaim.h | 28 ++++++++++++------- + 2 files changed, 25 insertions(+), 67 deletions(-) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 66f5dcce8889..beaa39f7bf5e 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -320,11 +320,14 @@ void bch2_journal_pin_drop(struct journal *j, + spin_unlock(&j->lock); + } + +-static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, +- struct journal_entry_pin *pin, +- journal_pin_flush_fn flush_fn) ++void bch2_journal_pin_set(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) + { +- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ struct journal_entry_pin_list *pin_list; ++ ++ spin_lock(&j->lock); ++ pin_list = journal_seq_pin(j, seq); + + __journal_pin_drop(j, pin); + +@@ -335,45 +338,6 @@ static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, + pin->flush = flush_fn; + + list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); +-} +- +-void __bch2_journal_pin_add(struct journal *j, u64 seq, +- struct journal_entry_pin *pin, +- journal_pin_flush_fn flush_fn) +-{ +- spin_lock(&j->lock); +- bch2_journal_pin_add_locked(j, seq, pin, flush_fn); +- spin_unlock(&j->lock); +- +- /* +- * If the journal is currently full, we might want to call flush_fn +- * immediately: +- */ +- journal_wake(j); +-} +- +-void bch2_journal_pin_update(struct journal *j, u64 seq, +- struct journal_entry_pin *pin, +- journal_pin_flush_fn flush_fn) +-{ +- if (journal_pin_active(pin) && pin->seq < seq) +- return; +- +- spin_lock(&j->lock); +- +- if (pin->seq != seq) { +- bch2_journal_pin_add_locked(j, seq, pin, flush_fn); +- } else { +- struct journal_entry_pin_list *pin_list = +- journal_seq_pin(j, seq); +- +- /* +- * If the pin is already pinning the right sequence number, it +- * still might've already been flushed: +- */ +- list_move(&pin->list, &pin_list->list); +- } +- + spin_unlock(&j->lock); + + /* +@@ -383,20 +347,6 @@ void bch2_journal_pin_update(struct journal *j, u64 seq, + journal_wake(j); + } + +-void bch2_journal_pin_copy(struct journal *j, +- struct journal_entry_pin *dst, +- struct journal_entry_pin *src, +- journal_pin_flush_fn flush_fn) +-{ +- spin_lock(&j->lock); +- +- if (journal_pin_active(src) && +- (!journal_pin_active(dst) || src->seq < dst->seq)) +- bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); +- +- spin_unlock(&j->lock); +-} +- + /** + * bch2_journal_pin_flush: ensure journal pin callback is no longer running + */ +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index bae2c9210db8..e25355042e6e 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -42,25 +42,33 @@ journal_seq_pin(struct journal *j, u64 seq) + void bch2_journal_pin_put(struct journal *, u64); + void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); + +-void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, +- journal_pin_flush_fn); ++void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, ++ journal_pin_flush_fn); + + static inline void bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) + { + if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) +- __bch2_journal_pin_add(j, seq, pin, 
flush_fn); ++ bch2_journal_pin_set(j, seq, pin, flush_fn); + } + +-void bch2_journal_pin_update(struct journal *, u64, +- struct journal_entry_pin *, +- journal_pin_flush_fn); ++static inline void bch2_journal_pin_copy(struct journal *j, ++ struct journal_entry_pin *dst, ++ struct journal_entry_pin *src, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (journal_pin_active(src)) ++ bch2_journal_pin_add(j, src->seq, dst, flush_fn); ++} + +-void bch2_journal_pin_copy(struct journal *, +- struct journal_entry_pin *, +- struct journal_entry_pin *, +- journal_pin_flush_fn); ++static inline void bch2_journal_pin_update(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) ++ bch2_journal_pin_set(j, seq, pin, flush_fn); ++} + + void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); + +-- +cgit v1.2.3 + + +From 51cfd03abceaa8d4f69af1077d76e2170b308656 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 1 Dec 2020 12:23:55 -0500 +Subject: bcachefs: Add error handling to unit & perf tests + +This way, these tests can be used with tests that inject IO errors and +shut down the filesystem. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/sysfs.c | 7 +- + fs/bcachefs/tests.c | 249 ++++++++++++++++++++++++++++++++++------------------ + fs/bcachefs/tests.h | 2 +- + 3 files changed, 170 insertions(+), 88 deletions(-) + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 900eda88a5dc..cc13fc258115 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -503,10 +503,11 @@ STORE(bch2_fs) + if (threads_str && + !(ret = kstrtouint(threads_str, 10, &threads)) && + !(ret = bch2_strtoull_h(nr_str, &nr))) +- bch2_btree_perf_test(c, test, nr, threads); +- else +- size = ret; ++ ret = bch2_btree_perf_test(c, test, nr, threads); + kfree(tmp); ++ ++ if (ret) ++ size = ret; + } + #endif + return size; +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 4dcace650416..5f40b048dd0d 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -26,7 +26,7 @@ static void delete_test_keys(struct bch_fs *c) + + /* unit tests */ + +-static void test_delete(struct bch_fs *c, u64 nr) ++static int test_delete(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -41,24 +41,37 @@ static void test_delete(struct bch_fs *c, u64 nr) + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(iter); +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "lookup error in test_delete: %i", ret); ++ goto err; ++ } + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &k.k_i, 0)); +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "update error in test_delete: %i", ret); ++ goto err; ++ } + + pr_info("deleting once"); + ret = bch2_btree_delete_at(&trans, iter, 0); +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "delete error (first) in test_delete: %i", ret); ++ goto err; ++ } + + pr_info("deleting twice"); + ret = bch2_btree_delete_at(&trans, iter, 0); +- BUG_ON(ret); +- ++ if (ret) { ++ bch_err(c, "delete error (second) in test_delete: %i", ret); ++ goto err; ++ } ++err: + bch2_trans_exit(&trans); ++ return ret; + } + +-static void test_delete_written(struct bch_fs *c, u64 nr) ++static int test_delete_written(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -73,27 +86,37 @@ static void test_delete_written(struct bch_fs *c, u64 nr) + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(iter); +- 
BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "lookup error in test_delete_written: %i", ret); ++ goto err; ++ } + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &k.k_i, 0)); +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "update error in test_delete_written: %i", ret); ++ goto err; ++ } + + bch2_journal_flush_all_pins(&c->journal); + + ret = bch2_btree_delete_at(&trans, iter, 0); +- BUG_ON(ret); +- ++ if (ret) { ++ bch_err(c, "delete error in test_delete_written: %i", ret); ++ goto err; ++ } ++err: + bch2_trans_exit(&trans); ++ return ret; + } + +-static void test_iterate(struct bch_fs *c, u64 nr) ++static int test_iterate(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u64 i; +- int ret; ++ int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + +@@ -109,7 +132,10 @@ static void test_iterate(struct bch_fs *c, u64 nr) + + ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, + NULL, NULL, 0); +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate: %i", ret); ++ goto err; ++ } + } + + pr_info("iterating forwards"); +@@ -132,17 +158,18 @@ static void test_iterate(struct bch_fs *c, u64 nr) + BUG_ON(k.k->p.offset != --i); + + BUG_ON(i); +- ++err: + bch2_trans_exit(&trans); ++ return ret; + } + +-static void test_iterate_extents(struct bch_fs *c, u64 nr) ++static int test_iterate_extents(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u64 i; +- int ret; ++ int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + +@@ -159,7 +186,10 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) + + ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + NULL, NULL, 0); +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate_extents: %i", ret); ++ goto err; ++ } + } + + pr_info("iterating forwards"); +@@ -182,17 +212,18 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) + } + + BUG_ON(i); +- ++err: + bch2_trans_exit(&trans); ++ return ret; + } + +-static void test_iterate_slots(struct bch_fs *c, u64 nr) ++static int test_iterate_slots(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u64 i; +- int ret; ++ int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + +@@ -208,7 +239,10 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) + + ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, + NULL, NULL, 0); +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate_slots: %i", ret); ++ goto err; ++ } + } + + pr_info("iterating forwards"); +@@ -240,17 +274,18 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) + if (i == nr * 2) + break; + } +- ++err: + bch2_trans_exit(&trans); ++ return ret; + } + +-static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) ++static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u64 i; +- int ret; ++ int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + +@@ -267,7 +302,10 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) + + ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + NULL, NULL, 0); +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate_slots_extents: %i", ret); ++ goto err; ++ } + } + + pr_info("iterating forwards"); +@@ -299,15 +337,16 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) + if (i == nr) + break; + } +- ++err: + 
bch2_trans_exit(&trans); ++ return 0; + } + + /* + * XXX: we really want to make sure we've got a btree with depth > 0 for these + * tests + */ +-static void test_peek_end(struct bch_fs *c, u64 nr) ++static int test_peek_end(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -324,9 +363,10 @@ static void test_peek_end(struct bch_fs *c, u64 nr) + BUG_ON(k.k); + + bch2_trans_exit(&trans); ++ return 0; + } + +-static void test_peek_end_extents(struct bch_fs *c, u64 nr) ++static int test_peek_end_extents(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -343,14 +383,15 @@ static void test_peek_end_extents(struct bch_fs *c, u64 nr) + BUG_ON(k.k); + + bch2_trans_exit(&trans); ++ return 0; + } + + /* extent unit tests */ + + u64 test_version; + +-static void insert_test_extent(struct bch_fs *c, +- u64 start, u64 end) ++static int insert_test_extent(struct bch_fs *c, ++ u64 start, u64 end) + { + struct bkey_i_cookie k; + int ret; +@@ -364,42 +405,47 @@ static void insert_test_extent(struct bch_fs *c, + + ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + NULL, NULL, 0); +- BUG_ON(ret); ++ if (ret) ++ bch_err(c, "insert error in insert_test_extent: %i", ret); ++ return ret; + } + +-static void __test_extent_overwrite(struct bch_fs *c, ++static int __test_extent_overwrite(struct bch_fs *c, + u64 e1_start, u64 e1_end, + u64 e2_start, u64 e2_end) + { +- insert_test_extent(c, e1_start, e1_end); +- insert_test_extent(c, e2_start, e2_end); ++ int ret; ++ ++ ret = insert_test_extent(c, e1_start, e1_end) ?: ++ insert_test_extent(c, e2_start, e2_end); + + delete_test_keys(c); ++ return ret; + } + +-static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) ++static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) + { +- __test_extent_overwrite(c, 0, 64, 0, 32); +- __test_extent_overwrite(c, 8, 64, 0, 32); ++ return __test_extent_overwrite(c, 0, 64, 0, 32) ?: ++ __test_extent_overwrite(c, 8, 64, 0, 32); + } + +-static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) ++static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) + { +- __test_extent_overwrite(c, 0, 64, 32, 64); +- __test_extent_overwrite(c, 0, 64, 32, 72); ++ return __test_extent_overwrite(c, 0, 64, 32, 64) ?: ++ __test_extent_overwrite(c, 0, 64, 32, 72); + } + +-static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) ++static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) + { +- __test_extent_overwrite(c, 0, 64, 32, 40); ++ return __test_extent_overwrite(c, 0, 64, 32, 40); + } + +-static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) ++static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) + { +- __test_extent_overwrite(c, 32, 64, 0, 64); +- __test_extent_overwrite(c, 32, 64, 0, 128); +- __test_extent_overwrite(c, 32, 64, 32, 64); +- __test_extent_overwrite(c, 32, 64, 32, 128); ++ return __test_extent_overwrite(c, 32, 64, 0, 64) ?: ++ __test_extent_overwrite(c, 32, 64, 0, 128) ?: ++ __test_extent_overwrite(c, 32, 64, 32, 64) ?: ++ __test_extent_overwrite(c, 32, 64, 32, 128); + } + + /* perf tests */ +@@ -415,11 +461,11 @@ static u64 test_rand(void) + return v; + } + +-static void rand_insert(struct bch_fs *c, u64 nr) ++static int rand_insert(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct bkey_i_cookie k; +- int ret; ++ int ret = 0; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); +@@ -430,48 +476,63 @@ static void rand_insert(struct bch_fs *c, u64 nr) + + ret = 
__bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); +- +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "error in rand_insert: %i", ret); ++ break; ++ } + } + + bch2_trans_exit(&trans); ++ return ret; + } + +-static void rand_lookup(struct bch_fs *c, u64 nr) ++static int rand_lookup(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; ++ int ret = 0; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); + + for (i = 0; i < nr; i++) { +- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, +- POS(0, test_rand()), 0); ++ bch2_btree_iter_set_pos(iter, POS(0, test_rand())); + + k = bch2_btree_iter_peek(iter); +- bch2_trans_iter_free(&trans, iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch_err(c, "error in rand_lookup: %i", ret); ++ break; ++ } + } + ++ bch2_trans_iter_free(&trans, iter); + bch2_trans_exit(&trans); ++ return ret; + } + +-static void rand_mixed(struct bch_fs *c, u64 nr) ++static int rand_mixed(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- int ret; ++ int ret = 0; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); + + for (i = 0; i < nr; i++) { +- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, +- POS(0, test_rand()), 0); ++ bch2_btree_iter_set_pos(iter, POS(0, test_rand())); + + k = bch2_btree_iter_peek(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch_err(c, "lookup error in rand_mixed: %i", ret); ++ break; ++ } + + if (!(i & 3) && k.k) { + struct bkey_i_cookie k; +@@ -481,14 +542,16 @@ static void rand_mixed(struct bch_fs *c, u64 nr) + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &k.k_i, 0)); +- +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "update error in rand_mixed: %i", ret); ++ break; ++ } + } +- +- bch2_trans_iter_free(&trans, iter); + } + ++ bch2_trans_iter_free(&trans, iter); + bch2_trans_exit(&trans); ++ return ret; + } + + static int __do_delete(struct btree_trans *trans, struct bpos pos) +@@ -518,10 +581,10 @@ err: + return ret; + } + +-static void rand_delete(struct bch_fs *c, u64 nr) ++static int rand_delete(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- int ret; ++ int ret = 0; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); +@@ -531,19 +594,23 @@ static void rand_delete(struct bch_fs *c, u64 nr) + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __do_delete(&trans, pos)); +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "error in rand_delete: %i", ret); ++ break; ++ } + } + + bch2_trans_exit(&trans); ++ return ret; + } + +-static void seq_insert(struct bch_fs *c, u64 nr) ++static int seq_insert(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_i_cookie insert; +- int ret; ++ int ret = 0; + u64 i = 0; + + bkey_cookie_init(&insert.k_i); +@@ -556,35 +623,39 @@ static void seq_insert(struct bch_fs *c, u64 nr) + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &insert.k_i, 0)); +- +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "error in seq_insert: %i", ret); ++ break; ++ } + + if (++i == nr) + break; + } + bch2_trans_exit(&trans); ++ return ret; + } + +-static void seq_lookup(struct bch_fs *c, u64 nr) ++static int seq_lookup(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- int ret; ++ int 
ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) + ; + bch2_trans_exit(&trans); ++ return ret; + } + +-static void seq_overwrite(struct bch_fs *c, u64 nr) ++static int seq_overwrite(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- int ret; ++ int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + +@@ -596,23 +667,28 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &u.k_i, 0)); +- +- BUG_ON(ret); ++ if (ret) { ++ bch_err(c, "error in seq_overwrite: %i", ret); ++ break; ++ } + } + bch2_trans_exit(&trans); ++ return ret; + } + +-static void seq_delete(struct bch_fs *c, u64 nr) ++static int seq_delete(struct bch_fs *c, u64 nr) + { + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, + POS(0, 0), POS(0, U64_MAX), + NULL); +- BUG_ON(ret); ++ if (ret) ++ bch_err(c, "error in seq_delete: %i", ret); ++ return ret; + } + +-typedef void (*perf_test_fn)(struct bch_fs *, u64); ++typedef int (*perf_test_fn)(struct bch_fs *, u64); + + struct test_job { + struct bch_fs *c; +@@ -628,11 +704,13 @@ struct test_job { + + u64 start; + u64 finish; ++ int ret; + }; + + static int btree_perf_test_thread(void *data) + { + struct test_job *j = data; ++ int ret; + + if (atomic_dec_and_test(&j->ready)) { + wake_up(&j->ready_wait); +@@ -641,7 +719,9 @@ static int btree_perf_test_thread(void *data) + wait_event(j->ready_wait, !atomic_read(&j->ready)); + } + +- j->fn(j->c, j->nr / j->nr_threads); ++ ret = j->fn(j->c, j->nr / j->nr_threads); ++ if (ret) ++ j->ret = ret; + + if (atomic_dec_and_test(&j->done)) { + j->finish = sched_clock(); +@@ -651,8 +731,8 @@ static int btree_perf_test_thread(void *data) + return 0; + } + +-void bch2_btree_perf_test(struct bch_fs *c, const char *testname, +- u64 nr, unsigned nr_threads) ++int bch2_btree_perf_test(struct bch_fs *c, const char *testname, ++ u64 nr, unsigned nr_threads) + { + struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; + char name_buf[20], nr_buf[20], per_sec_buf[20]; +@@ -695,7 +775,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, + + if (!j.fn) { + pr_err("unknown test %s", testname); +- return; ++ return -EINVAL; + } + + //pr_info("running test %s:", testname); +@@ -720,6 +800,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, + time / NSEC_PER_SEC, + time * nr_threads / nr, + per_sec_buf); ++ return j.ret; + } + + #endif /* CONFIG_BCACHEFS_TESTS */ +diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h +index 551d0764225e..c73b18aea7e0 100644 +--- a/fs/bcachefs/tests.h ++++ b/fs/bcachefs/tests.h +@@ -6,7 +6,7 @@ struct bch_fs; + + #ifdef CONFIG_BCACHEFS_TESTS + +-void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); ++int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); + + #else + +-- +cgit v1.2.3 + + +From c9572fdcd766543883a5211315f4b4a8d58e5472 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 1 Dec 2020 23:11:53 -0500 +Subject: bcachefs: bch2_trans_get_iter() no longer returns errors + +Since we now always preallocate the maximum number of iterators when we +initialize a btree transaction, getting an iterator never fails - we can +delete a fair amount of error path code. + +This patch also simplifies the iterator allocation code a bit. 
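A minimal userspace sketch of the scheme this commit relies on (not the kernel code itself: the table size and helper names here are illustrative, standing in for BTREE_ITER_MAX and btree_trans_iter_alloc()): when the iterator table is preallocated at transaction init, handing out a slot can only fail by exhausting the table, which the patch treats as a bug rather than an error, so callers can drop the IS_ERR()/PTR_ERR() handling entirely.

/* Sketch: preallocated iterator slots tracked in a 64-bit bitmap. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ITER_MAX 64                       /* illustrative; one bit per slot */

struct btree_iter { unsigned idx; };

struct btree_trans {
	uint64_t iters_linked;            /* bitmap of slots in use */
	struct btree_iter iters[ITER_MAX];
};

/* Always returns a valid pointer: running out of slots is treated as a
 * bug (the real code panics), not an error handed back to the caller. */
static struct btree_iter *trans_get_iter(struct btree_trans *trans)
{
	assert(trans->iters_linked != UINT64_MAX);
	unsigned idx = (unsigned) __builtin_ctzll(~trans->iters_linked);

	trans->iters_linked |= 1ULL << idx;
	trans->iters[idx].idx = idx;
	return &trans->iters[idx];
}

int main(void)
{
	struct btree_trans trans = { 0 };

	/* Old callers: iter = get(...); if (IS_ERR(iter)) return PTR_ERR(iter);
	 * New callers: the pointer is always usable directly. */
	struct btree_iter *iter = trans_get_iter(&trans);
	printf("allocated iterator slot %u\n", iter->idx);
	return 0;
}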
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 141 ++++++++-------------------------------- + fs/bcachefs/btree_iter.h | 32 ++++----- + fs/bcachefs/btree_key_cache.c | 10 --- + fs/bcachefs/btree_types.h | 11 ++-- + fs/bcachefs/btree_update_leaf.c | 27 +------- + fs/bcachefs/buckets.c | 6 -- + fs/bcachefs/fs-io.c | 3 - + fs/bcachefs/fsck.c | 8 +-- + fs/bcachefs/inode.c | 6 -- + fs/bcachefs/io.c | 9 --- + fs/bcachefs/recovery.c | 8 +-- + fs/bcachefs/str_hash.h | 7 +- + 12 files changed, 46 insertions(+), 222 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 96cc5394295e..7a95fcc0b244 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -346,7 +346,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) + { + struct btree_iter *iter; + +- trans_for_each_iter_all(trans, iter) ++ trans_for_each_iter(trans, iter) + bch2_btree_iter_verify_locks(iter); + } + #else +@@ -2002,110 +2002,37 @@ int bch2_trans_iter_free(struct btree_trans *trans, + return bch2_trans_iter_put(trans, iter); + } + +-#if 0 +-static int bch2_trans_realloc_iters(struct btree_trans *trans, +- unsigned new_size) ++noinline __cold ++static void btree_trans_iter_alloc_fail(struct btree_trans *trans) + { +- void *p, *new_iters, *new_updates, *new_updates2; +- size_t iters_bytes; +- size_t updates_bytes; +- +- new_size = roundup_pow_of_two(new_size); +- +- BUG_ON(new_size > BTREE_ITER_MAX); +- +- if (new_size <= trans->size) +- return 0; +- +- BUG_ON(trans->used_mempool); +- +- bch2_trans_unlock(trans); + +- iters_bytes = sizeof(struct btree_iter) * new_size; +- updates_bytes = sizeof(struct btree_insert_entry) * new_size; +- +- p = kmalloc(iters_bytes + +- updates_bytes + +- updates_bytes, GFP_NOFS); +- if (p) +- goto success; +- +- p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); +- new_size = BTREE_ITER_MAX; +- +- trans->used_mempool = true; +-success: +- new_iters = p; p += iters_bytes; +- new_updates = p; p += updates_bytes; +- new_updates2 = p; p += updates_bytes; +- +- memcpy(new_iters, trans->iters, +- sizeof(struct btree_iter) * trans->nr_iters); +- memcpy(new_updates, trans->updates, +- sizeof(struct btree_insert_entry) * trans->nr_updates); +- memcpy(new_updates2, trans->updates2, +- sizeof(struct btree_insert_entry) * trans->nr_updates2); +- +- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) +- memset(trans->iters, POISON_FREE, +- sizeof(struct btree_iter) * trans->nr_iters + +- sizeof(struct btree_insert_entry) * trans->nr_iters); +- +- kfree(trans->iters); +- +- trans->iters = new_iters; +- trans->updates = new_updates; +- trans->updates2 = new_updates2; +- trans->size = new_size; +- +- if (trans->iters_live) { +- trace_trans_restart_iters_realloced(trans->ip, trans->size); +- return -EINTR; +- } ++ struct btree_iter *iter; + +- return 0; ++ trans_for_each_iter(trans, iter) ++ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", ++ bch2_btree_ids[iter->btree_id], ++ iter->pos.inode, ++ iter->pos.offset, ++ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", ++ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", ++ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", ++ (void *) iter->ip_allocated); ++ panic("trans iter oveflow\n"); + } +-#endif + + static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) + { +- unsigned idx = __ffs64(~trans->iters_linked); +- +- if (idx < trans->nr_iters) +- goto got_slot; +- +- if (trans->nr_iters == trans->size) { +- struct btree_iter *iter; +- +- BUG_ON(trans->size < BTREE_ITER_MAX); +- +- trans_for_each_iter(trans, iter) { +- pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", +- bch2_btree_ids[iter->btree_id], +- iter->pos.inode, +- iter->pos.offset, +- (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", +- (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", +- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", +- (void *) iter->ip_allocated); +- } ++ unsigned idx; + +- panic("trans iter oveflow\n"); +-#if 0 +- ret = bch2_trans_realloc_iters(trans, trans->size * 2); +- if (ret) +- return ERR_PTR(ret); +-#endif +- } ++ if (unlikely(trans->iters_linked == ++ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) ++ btree_trans_iter_alloc_fail(trans); + +- idx = trans->nr_iters++; +- BUG_ON(trans->nr_iters > trans->size); ++ idx = __ffs64(~trans->iters_linked); + +- trans->iters[idx].idx = idx; +-got_slot: +- BUG_ON(trans->iters_linked & (1ULL << idx)); +- trans->iters_linked |= 1ULL << idx; +- trans->iters[idx].flags = 0; ++ trans->iters_linked |= 1ULL << idx; ++ trans->iters[idx].idx = idx; ++ trans->iters[idx].flags = 0; + return &trans->iters[idx]; + } + +@@ -2141,8 +2068,6 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + { + struct btree_iter *iter, *best = NULL; + +- BUG_ON(trans->nr_iters > BTREE_ITER_MAX); +- + trans_for_each_iter(trans, iter) { + if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) + continue; +@@ -2160,16 +2085,10 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + + if (!best) { + iter = btree_trans_iter_alloc(trans); +- if (IS_ERR(iter)) +- return iter; +- + bch2_btree_iter_init(trans, iter, btree_id, pos, flags); + } else if ((trans->iters_live & (1ULL << best->idx)) || + (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { + iter = btree_trans_iter_alloc(trans); +- if (IS_ERR(iter)) +- return iter; +- + btree_iter_copy(iter, best); + } else { + iter = best; +@@ -2203,9 +2122,8 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + struct btree_iter *iter = + __btree_trans_get_iter(trans, btree_id, pos, flags); + +- if (!IS_ERR(iter)) +- __bch2_btree_iter_set_pos(iter, pos, +- btree_node_type_is_extents(btree_id)); ++ __bch2_btree_iter_set_pos(iter, pos, ++ btree_node_type_is_extents(btree_id)); + return iter; + } + +@@ -2221,7 +2139,6 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + flags|BTREE_ITER_NODES); + unsigned i; + +- BUG_ON(IS_ERR(iter)); + BUG_ON(bkey_cmp(iter->pos, pos)); + + iter->locks_want = locks_want; +@@ -2241,9 +2158,6 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, + struct btree_iter *iter; + + iter = btree_trans_iter_alloc(trans); +- if (IS_ERR(iter)) +- return iter; +- + btree_iter_copy(iter, src); + + trans->iters_live |= 1ULL << iter->idx; +@@ -2318,7 +2232,6 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + + trans->iters_touched &= trans->iters_live; + +- trans->need_reset = 0; + trans->nr_updates = 0; + trans->nr_updates2 = 0; + trans->mem_top = 0; +@@ -2339,9 +2252,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + + static void 
bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) + { +- unsigned new_size = BTREE_ITER_MAX; +- size_t iters_bytes = sizeof(struct btree_iter) * new_size; +- size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size; ++ size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX; ++ size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; + void *p = NULL; + + BUG_ON(trans->used_mempool); +@@ -2355,7 +2267,6 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) + trans->iters = p; p += iters_bytes; + trans->updates = p; p += updates_bytes; + trans->updates2 = p; p += updates_bytes; +- trans->size = new_size; + } + + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index f7a73619c85b..ee8c4346aadb 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -48,21 +48,16 @@ static inline int btree_iter_err(const struct btree_iter *iter) + + /* Iterate over iters within a transaction: */ + +-#define trans_for_each_iter_all(_trans, _iter) \ +- for (_iter = (_trans)->iters; \ +- _iter < (_trans)->iters + (_trans)->nr_iters; \ +- _iter++) +- + static inline struct btree_iter * + __trans_next_iter(struct btree_trans *trans, unsigned idx) + { +- EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); +- +- for (; idx < trans->nr_iters; idx++) +- if (trans->iters_linked & (1ULL << idx)) +- return &trans->iters[idx]; ++ u64 l = trans->iters_linked >> idx; ++ if (!l) ++ return NULL; + +- return NULL; ++ idx += __ffs64(l); ++ EBUG_ON(trans->iters[idx].idx != idx); ++ return &trans->iters[idx]; + } + + #define trans_for_each_iter(_trans, _iter) \ +@@ -240,10 +235,9 @@ static inline int bkey_err(struct bkey_s_c k) + + #define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ +- for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ +- bch2_trans_get_iter((_trans), (_btree_id), \ +- (_start), (_flags))) ?: \ +- PTR_ERR_OR_ZERO(((_k) = \ ++ for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \ ++ (_start), (_flags)), \ ++ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ + __bch2_btree_iter_peek(_iter, _flags)).k); \ + !_ret && (_k).k; \ + (_ret) = PTR_ERR_OR_ZERO(((_k) = \ +@@ -270,9 +264,7 @@ bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, + { + struct btree_iter *iter = + __bch2_trans_get_iter(trans, btree_id, pos, flags); +- +- if (!IS_ERR(iter)) +- iter->ip_allocated = _THIS_IP_; ++ iter->ip_allocated = _THIS_IP_; + return iter; + } + +@@ -284,10 +276,8 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) + struct btree_iter *iter = + __bch2_trans_copy_iter(trans, src); + +- if (!IS_ERR(iter)) +- iter->ip_allocated = _THIS_IP_; ++ iter->ip_allocated = _THIS_IP_; + return iter; +- + } + + struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 3ed7d3778aca..c3fead003a23 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -169,9 +169,6 @@ static int btree_key_cache_fill(struct btree_trans *trans, + + iter = bch2_trans_get_iter(trans, ck->key.btree_id, + ck->key.pos, BTREE_ITER_SLOTS); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); +- + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { +@@ -325,18 +322,11 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + b_iter = bch2_trans_get_iter(trans, key.btree_id, 
key.pos, + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(b_iter); +- if (ret) +- goto out; +- + c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_CACHED_NOCREATE| + BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(c_iter); +- if (ret) +- goto out; + retry: + ret = bch2_btree_iter_traverse(c_iter); + if (ret) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index cf59f1224741..15af60e92820 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -357,20 +357,17 @@ struct btree_trans { + unsigned long ip; + int srcu_idx; + +- u64 iters_linked; +- u64 iters_live; +- u64 iters_touched; +- +- u8 nr_iters; + u8 nr_updates; + u8 nr_updates2; +- u8 size; + unsigned used_mempool:1; + unsigned error:1; + unsigned nounlock:1; +- unsigned need_reset:1; + unsigned in_traverse_all:1; + ++ u64 iters_linked; ++ u64 iters_live; ++ u64 iters_touched; ++ + unsigned mem_top; + unsigned mem_bytes; + void *mem; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index d1196ce70058..f35cdbfb43c5 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -717,7 +717,7 @@ static void bch2_trans_update2(struct btree_trans *trans, + + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + +- EBUG_ON(trans->nr_updates2 >= trans->nr_iters); ++ EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + +@@ -750,8 +750,6 @@ static int extent_update_to_keys(struct btree_trans *trans, + return 0; + + iter = bch2_trans_copy_iter(trans, orig_iter); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); + + iter->flags |= BTREE_ITER_INTENT; + __bch2_btree_iter_set_pos(iter, insert->k.p, false); +@@ -770,10 +768,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, + int ret = 0; + + iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(iter); +- if (ret) +- return ret; +- + k = bch2_btree_iter_peek_with_updates(iter); + + while (k.k && !(ret = bkey_err(k))) { +@@ -782,8 +776,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, + + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + update_iter = bch2_trans_copy_iter(trans, iter); +- if ((ret = PTR_ERR_OR_ZERO(update_iter))) +- goto err; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) +@@ -799,8 +791,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, + + if (bkey_cmp(k.k->p, end) > 0) { + update_iter = bch2_trans_copy_iter(trans, iter); +- if ((ret = PTR_ERR_OR_ZERO(update_iter))) +- goto err; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) +@@ -814,8 +804,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bch2_trans_iter_put(trans, update_iter); + } else { + update_iter = bch2_trans_copy_iter(trans, iter); +- if ((ret = PTR_ERR_OR_ZERO(update_iter))) +- goto err; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + if ((ret = PTR_ERR_OR_ZERO(update))) +@@ -847,8 +835,6 @@ int __bch2_trans_commit(struct btree_trans *trans) + unsigned u64s; + int ret = 0; + +- BUG_ON(trans->need_reset); +- + if (!trans->nr_updates) + goto out_noupdates; + +@@ -1041,10 +1027,6 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + */ + if (trans->iters_live & (1ULL << i->iter->idx)) { + i->iter = bch2_trans_copy_iter(trans, i->iter); +- if (IS_ERR(i->iter)) { 
+- trans->need_reset = true; +- return PTR_ERR(i->iter); +- } + + i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + bch2_trans_iter_put(trans, i->iter); +@@ -1054,7 +1036,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + bch2_btree_iter_set_pos(i->iter, n.k->k.p); + } + +- EBUG_ON(trans->nr_updates >= trans->nr_iters); ++ EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); +@@ -1069,8 +1051,6 @@ int __bch2_btree_insert(struct btree_trans *trans, + + iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); + + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, k, 0); +@@ -1117,9 +1097,6 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + int ret = 0; + + iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(iter); +- if (ret) +- return ret; + retry: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index be65f2e78a62..1b1200c55134 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1576,9 +1576,6 @@ static int trans_get_key(struct btree_trans *trans, + + *iter = bch2_trans_get_iter(trans, btree_id, pos, + flags|BTREE_ITER_INTENT); +- if (IS_ERR(*iter)) +- return PTR_ERR(*iter); +- + *k = __bch2_btree_iter_peek(*iter, flags); + ret = bkey_err(*k); + if (ret) +@@ -1606,9 +1603,6 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); +- + ret = bch2_btree_iter_traverse(iter); + if (ret) { + bch2_trans_iter_put(trans, iter); +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 86236e851ce9..806c37499c9b 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2482,10 +2482,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, src_start >> 9), + BTREE_ITER_INTENT); +- BUG_ON(IS_ERR_OR_NULL(src)); +- + dst = bch2_trans_copy_iter(&trans, src); +- BUG_ON(IS_ERR_OR_NULL(dst)); + + while (1) { + struct disk_reservation disk_res = +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 7449819d8eac..39f872de0c18 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -230,7 +230,6 @@ static int hash_check_duplicates(struct btree_trans *trans, + return 0; + + iter = bch2_trans_copy_iter(trans, h->chain); +- BUG_ON(IS_ERR(iter)); + + for_each_btree_key_continue(iter, 0, k2, ret) { + if (bkey_cmp(k2.k->p, k.k->p) >= 0) +@@ -265,10 +264,8 @@ static void hash_set_chain_start(struct btree_trans *trans, + hash_stop_chain(trans, h); + + if (!hole) { +- if (!h->chain) { ++ if (!h->chain) + h->chain = bch2_trans_copy_iter(trans, k_iter); +- BUG_ON(IS_ERR(h->chain)); +- } + + h->chain_end = k.k->p.offset; + } +@@ -440,9 +437,6 @@ static int bch2_fix_overlapping_extent(struct btree_trans *trans, + bch2_cut_front(cut_at, u); + + u_iter = bch2_trans_copy_iter(trans, iter); +- ret = PTR_ERR_OR_ZERO(u_iter); +- if (ret) +- return ret; + + /* + * We don't want to go through the +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 82099e5a48d8..bf1c7319669c 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -302,9 +302,6 @@ struct btree_iter *bch2_inode_peek(struct 
btree_trans *trans, + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), + BTREE_ITER_CACHED|flags); +- if (IS_ERR(iter)) +- return iter; +- + k = bch2_btree_iter_peek_cached(iter); + ret = bkey_err(k); + if (ret) +@@ -640,9 +637,6 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, + POS(0, inode_nr), BTREE_ITER_CACHED); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); +- + k = bch2_btree_iter_peek_cached(iter); + ret = bkey_err(k); + if (ret) +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index e12b5b5e0598..70b3a2e6707f 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -197,8 +197,6 @@ static int sum_sector_overwrites(struct btree_trans *trans, + *delta = 0; + + iter = bch2_trans_copy_iter(trans, extent_iter); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + if (!may_allocate && +@@ -1788,9 +1786,6 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + + iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- if ((ret = PTR_ERR_OR_ZERO(iter))) +- goto out; +- + k = bch2_btree_iter_peek_slot(iter); + if ((ret = bkey_err(k))) + goto out; +@@ -1998,10 +1993,6 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, + POS(0, reflink_offset), + BTREE_ITER_SLOTS); +- ret = PTR_ERR_OR_ZERO(iter); +- if (ret) +- return ret; +- + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index a837d9eb0f6d..d24cef2bf1aa 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -443,9 +443,6 @@ retry: + bch2_cut_back(atomic_end, split); + + split_iter = bch2_trans_copy_iter(&trans, iter); +- ret = PTR_ERR_OR_ZERO(split_iter); +- if (ret) +- goto err; + + /* + * It's important that we don't go through the +@@ -502,8 +499,6 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + iter = bch2_trans_get_node_iter(trans, id, k->k.p, + BTREE_MAX_DEPTH, level, + BTREE_ITER_INTENT); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); + + /* + * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run +@@ -538,8 +533,7 @@ static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(iter) ?: +- bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(trans, iter); + return ret; + } +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 9c9549d0a8f6..f6b694b9346b 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -205,8 +205,6 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, + int ret; + + iter = bch2_trans_copy_iter(trans, start); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); + + bch2_btree_iter_next_slot(iter); + +@@ -253,11 +251,8 @@ int bch2_hash_set(struct btree_trans *trans, + } + + if (!slot && +- !(flags & BCH_HASH_SET_MUST_REPLACE)) { ++ !(flags & BCH_HASH_SET_MUST_REPLACE)) + slot = bch2_trans_copy_iter(trans, iter); +- if (IS_ERR(slot)) +- return PTR_ERR(slot); +- } + + if (k.k->type != KEY_TYPE_whiteout) + goto not_found; +-- +cgit v1.2.3 + + +From bd70c2bb0d4aeb865f95589202a8bd4351b74edc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 2 Dec 2020 
15:33:12 -0500 +Subject: bcachefs: Fix journal_flush_seq() + +The error check was inverted - leading fsyncs to get stuck and hang, +oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 5874a9ff2204..dd8db8c0c980 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -553,16 +553,13 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct journal_buf *buf; + int ret = 0; + +- if (seq <= j->err_seq) +- return -EIO; +- + if (seq <= j->seq_ondisk) + return 1; + + spin_lock(&j->lock); + + /* Recheck under lock: */ +- if (seq <= j->err_seq) { ++ if (j->err_seq && seq >= j->err_seq) { + ret = -EIO; + goto out; + } +-- +cgit v1.2.3 + + +From 752af2ef729c6e9f4a17cf3bb9226ad46ea03729 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 3 Dec 2020 13:09:08 -0500 +Subject: bcachefs: Fix some spurious gcc warnings + +These only come up when building in userspace, for some reason. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 2 +- + fs/bcachefs/movinggc.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index c3fead003a23..244c5dbcd3e9 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -316,7 +316,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree_iter *c_iter = NULL, *b_iter = NULL; +- struct bkey_cached *ck; ++ struct bkey_cached *ck = NULL; + int ret; + + b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 4834f41f48ed..2c5daed58aca 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -61,7 +61,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + copygc_heap *h = &c->copygc_heap; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; ++ struct extent_ptr_decoded p = { 0 }; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); +-- +cgit v1.2.3 + + +From ecec4ae29f2583ede66c9db6320fb09bf1f03c79 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 2 Dec 2020 18:30:06 -0500 +Subject: bcachefs: Fix spurious alloc errors on forced shutdown + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index dc7b1342410e..edc11c22308c 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -544,6 +544,17 @@ static void btree_update_nodes_written(struct btree_update *as) + unsigned i; + int ret; + ++ /* ++ * If we're already in an error state, it might be because a btree node ++ * was never written, and we might be trying to free that same btree ++ * node here, but it won't have been marked as allocated and we'll see ++ * spurious disk usage inconsistencies in the transactional part below ++ * if we don't skip it: ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ + BUG_ON(!journal_pin_active(&as->journal)); + + /* +@@ -569,8 +580,10 @@ static void btree_update_nodes_written(struct btree_update *as) + 
BTREE_INSERT_JOURNAL_RESERVED, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_exit(&trans); +- BUG_ON(ret && !bch2_journal_error(&c->journal)); + ++ bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, ++ "error %i in btree_update_nodes_written()", ret); ++err: + if (b) { + /* + * @b is the node we did the final insert into: +-- +cgit v1.2.3 + + +From b08630d14b49e1ad2094d9126f56b64b5f5d77b7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 13 Nov 2020 18:36:33 -0500 +Subject: bcachefs: Refactor filesystem usage accounting + +Various filesystem usage counters are kept in percpu counters, with one +set per in flight journal buffer. Right now all the code that deals with +it assumes that there's only two buffers/sets of counters, but the +number of journal bufs is getting increased to 4 in the next patch - so +refactor that code to not assume a constant. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 2 +- + fs/bcachefs/btree_gc.c | 5 ++--- + fs/bcachefs/buckets.c | 23 +++++++++++---------- + fs/bcachefs/journal_types.h | 4 ++++ + fs/bcachefs/replicas.c | 50 ++++++++++++++++++++++----------------------- + fs/bcachefs/super-io.c | 6 +++--- + fs/bcachefs/super.c | 4 ++-- + 7 files changed, 48 insertions(+), 46 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 91f5844a1d36..d216163f62e2 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -676,7 +676,7 @@ struct bch_fs { + + seqcount_t usage_lock; + struct bch_fs_usage *usage_base; +- struct bch_fs_usage __percpu *usage[2]; ++ struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_fs_usage __percpu *usage_gc; + + /* single element mempool: */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index ac81c9b9a06a..6268ea637d19 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -603,7 +603,6 @@ static int bch2_gc_done(struct bch_fs *c, + struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); + struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); + struct stripe *dst, *src; +- unsigned i; + + c->ec_stripes_heap.used = 0; + +@@ -651,8 +650,8 @@ static int bch2_gc_done(struct bch_fs *c, + } + }; + +- bch2_fs_usage_acc_to_base(c, 0); +- bch2_fs_usage_acc_to_base(c, 1); ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); + + bch2_dev_usage_from_buckets(c); + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 1b1200c55134..e297101af3a1 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -142,8 +142,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c) + percpu_down_write(&c->mark_lock); + usage = c->usage_base; + +- bch2_fs_usage_acc_to_base(c, 0); +- bch2_fs_usage_acc_to_base(c, 1); ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + usage->reserved += usage->persistent_reserved[i]; +@@ -207,13 +207,13 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + { + return this_cpu_ptr(gc + ? 
c->usage_gc +- : c->usage[journal_seq & 1]); ++ : c->usage[journal_seq & JOURNAL_BUF_MASK]); + } + + u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) + { + ssize_t offset = v - (u64 *) c->usage_base; +- unsigned seq; ++ unsigned i, seq; + u64 ret; + + BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); +@@ -221,9 +221,10 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) + + do { + seq = read_seqcount_begin(&c->usage_lock); +- ret = *v + +- percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + +- percpu_u64_get((u64 __percpu *) c->usage[1] + offset); ++ ret = *v; ++ ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; +@@ -232,7 +233,7 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) + struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) + { + struct bch_fs_usage *ret; +- unsigned seq, v, u64s = fs_usage_u64s(c); ++ unsigned seq, i, v, u64s = fs_usage_u64s(c); + retry: + ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); + if (unlikely(!ret)) +@@ -251,8 +252,8 @@ retry: + do { + seq = read_seqcount_begin(&c->usage_lock); + memcpy(ret, c->usage_base, u64s * sizeof(u64)); +- acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); +- acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; +@@ -262,7 +263,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) + { + unsigned u64s = fs_usage_u64s(c); + +- BUG_ON(idx >= 2); ++ BUG_ON(idx >= ARRAY_SIZE(c->usage)); + + preempt_disable(); + write_seqcount_begin(&c->usage_lock); +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 4640bb8687cc..00c3de77e823 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -11,6 +11,10 @@ + + struct journal_res; + ++#define JOURNAL_BUF_BITS 1 ++#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) ++#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) ++ + /* + * We put two of these in struct journal; we used them for writes to the + * journal that are being staged or in flight. 
+diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 91518c0d6794..00a197b65e0b 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -275,53 +275,55 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, + static int replicas_table_update(struct bch_fs *c, + struct bch_replicas_cpu *new_r) + { +- struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; ++ struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; + struct bch_fs_usage *new_scratch = NULL; + struct bch_fs_usage __percpu *new_gc = NULL; + struct bch_fs_usage *new_base = NULL; +- unsigned bytes = sizeof(struct bch_fs_usage) + ++ unsigned i, bytes = sizeof(struct bch_fs_usage) + + sizeof(u64) * new_r->nr; +- int ret = -ENOMEM; ++ int ret = 0; ++ ++ memset(new_usage, 0, sizeof(new_usage)); ++ ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ if (!(new_usage[i] = __alloc_percpu_gfp(bytes, ++ sizeof(u64), GFP_NOIO))) ++ goto err; + + if (!(new_base = kzalloc(bytes, GFP_NOIO)) || +- !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), +- GFP_NOIO)) || +- !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), +- GFP_NOIO)) || + !(new_scratch = kmalloc(bytes, GFP_NOIO)) || + (c->usage_gc && +- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { +- bch_err(c, "error updating replicas table: memory allocation failure"); ++ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) + goto err; +- } + ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ if (c->usage[i]) ++ __replicas_table_update_pcpu(new_usage[i], new_r, ++ c->usage[i], &c->replicas); + if (c->usage_base) + __replicas_table_update(new_base, new_r, + c->usage_base, &c->replicas); +- if (c->usage[0]) +- __replicas_table_update_pcpu(new_usage[0], new_r, +- c->usage[0], &c->replicas); +- if (c->usage[1]) +- __replicas_table_update_pcpu(new_usage[1], new_r, +- c->usage[1], &c->replicas); + if (c->usage_gc) + __replicas_table_update_pcpu(new_gc, new_r, + c->usage_gc, &c->replicas); + ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ swap(c->usage[i], new_usage[i]); + swap(c->usage_base, new_base); +- swap(c->usage[0], new_usage[0]); +- swap(c->usage[1], new_usage[1]); + swap(c->usage_scratch, new_scratch); + swap(c->usage_gc, new_gc); + swap(c->replicas, *new_r); +- ret = 0; +-err: ++out: + free_percpu(new_gc); + kfree(new_scratch); + free_percpu(new_usage[1]); + free_percpu(new_usage[0]); + kfree(new_base); + return ret; ++err: ++ bch_err(c, "error updating replicas table: memory allocation failure"); ++ ret = -ENOMEM; ++ goto out; + } + + static unsigned reserve_journal_replicas(struct bch_fs *c, +@@ -496,9 +498,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) + struct bch_replicas_cpu n; + + if (!__replicas_has_entry(&c->replicas_gc, e) && +- (c->usage_base->replicas[i] || +- percpu_u64_get(&c->usage[0]->replicas[i]) || +- percpu_u64_get(&c->usage[1]->replicas[i]))) { ++ bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { + n = cpu_replicas_add_entry(&c->replicas_gc, e); + if (!n.entries) { + ret = -ENOSPC; +@@ -603,9 +603,7 @@ retry: + cpu_replicas_entry(&c->replicas, i); + + if (e->data_type == BCH_DATA_journal || +- c->usage_base->replicas[i] || +- percpu_u64_get(&c->usage[0]->replicas[i]) || +- percpu_u64_get(&c->usage[1]->replicas[i])) ++ bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) + memcpy(cpu_replicas_entry(&new, new.nr++), + e, new.entry_size); + } +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index f969b5df0b23..ffd219091ea6 100644 +--- 
a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -992,10 +992,10 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, + percpu_down_write(&c->mark_lock); + + if (!journal_seq) { +- bch2_fs_usage_acc_to_base(c, 0); +- bch2_fs_usage_acc_to_base(c, 1); ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); + } else { +- bch2_fs_usage_acc_to_base(c, journal_seq & 1); ++ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); + } + + { +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 5dc594192bc0..741a4c225fa9 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -475,8 +475,8 @@ static void __bch2_fs_free(struct bch_fs *c) + bch2_journal_entries_free(&c->journal_entries); + percpu_free_rwsem(&c->mark_lock); + kfree(c->usage_scratch); +- free_percpu(c->usage[1]); +- free_percpu(c->usage[0]); ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ free_percpu(c->usage[i]); + kfree(c->usage_base); + + if (c->btree_iters_bufs) +-- +cgit v1.2.3 + + +From 5b36bd7dfbbb9846bda9d0bd8f17a4d868179ac4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 3 Dec 2020 13:57:22 -0500 +Subject: bcachefs: Improve some IO error messages + +it's useful to know whether an error was for a read or a write - this +also standardizes error messages a bit more. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 9 +++++++-- + fs/bcachefs/btree_io.c | 31 +++++++++++++++++++++---------- + fs/bcachefs/ec.c | 12 ++++++------ + fs/bcachefs/error.h | 29 ++++++++++++++++------------- + fs/bcachefs/fs-io.c | 4 +++- + fs/bcachefs/io.c | 47 ++++++++++++++++++++++++++++++----------------- + fs/bcachefs/journal_io.c | 4 ++-- + fs/bcachefs/super-io.c | 2 +- + 8 files changed, 86 insertions(+), 52 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index d216163f62e2..0c4e7f47ff79 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -215,9 +215,11 @@ + dynamic_fault("bcachefs:meta:write:" name) + + #ifdef __KERNEL__ +-#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) ++#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) ++#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) + #else +-#define bch2_fmt(_c, fmt) fmt "\n" ++#define bch2_fmt(_c, fmt) fmt "\n" ++#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) + #endif + + #define bch_info(c, fmt, ...) \ +@@ -230,8 +232,11 @@ + printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + #define bch_err(c, fmt, ...) \ + printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++ + #define bch_err_ratelimited(c, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) + + #define bch_verbose(c, fmt, ...) 
\ + do { \ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 2406745fb365..893ffe193479 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -597,18 +597,25 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, + bch2_btree_iter_reinit_node(iter, b); + } + ++static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ pr_buf(out, "%s level %u/%u\n ", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level, ++ c->btree_roots[b->c.btree_id].level); ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++} ++ + static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct btree *b, struct bset *i, + unsigned offset, int write) + { +- pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" +- "pos ", +- write ? "before write " : "", +- b->c.btree_id, b->c.level, +- c->btree_roots[b->c.btree_id].level); +- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ pr_buf(out, "error validating btree node %sat btree ", ++ write ? "before write " : ""); ++ btree_pos_to_text(out, c, b); + +- pr_buf(out, " node offset %u", b->written); ++ pr_buf(out, "\n node offset %u", b->written); + if (i) + pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + } +@@ -1104,6 +1111,8 @@ static void btree_node_read_work(struct work_struct *work) + struct btree *b = rb->bio.bi_private; + struct bio *bio = &rb->bio; + struct bch_io_failures failed = { .nr = 0 }; ++ char buf[200]; ++ struct printbuf out; + bool can_retry; + + goto start; +@@ -1123,8 +1132,10 @@ static void btree_node_read_work(struct work_struct *work) + bio->bi_status = BLK_STS_REMOVED; + } + start: +- bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", +- bch2_blk_status_to_str(bio->bi_status)); ++ out = PBUF(buf); ++ btree_pos_to_text(&out, c, b); ++ bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", ++ bch2_blk_status_to_str(bio->bi_status), buf); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; +@@ -1408,7 +1419,7 @@ static void btree_node_write_endio(struct bio *bio) + if (wbio->have_ioref) + bch2_latency_acct(ca, wbio->submit_time, WRITE); + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("btree")) { + spin_lock_irqsave(&c->btree_write_error_lock, flags); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 23254864cfb1..138df875f750 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -264,7 +264,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) + len << 9); + + if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { +- __bcache_io_error(c, ++ bch_err_ratelimited(c, + "checksum error while doing reconstruct read (%u:%u)", + i, j); + clear_bit(i, buf->valid); +@@ -305,7 +305,7 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) + unsigned bytes = buf->size << 9; + + if (ec_nr_failed(buf) > v->nr_redundant) { +- __bcache_io_error(c, ++ bch_err_ratelimited(c, + "error doing reconstruct read: unable to read enough blocks"); + return -1; + } +@@ -326,7 +326,7 @@ static void ec_block_endio(struct bio *bio) + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", + bio_data_dir(bio) ? 
"write" : "read", + bch2_blk_status_to_str(bio->bi_status))) + clear_bit(ec_bio->idx, ec_bio->buf->valid); +@@ -420,7 +420,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { +- __bcache_io_error(c, ++ bch_err_ratelimited(c, + "error doing reconstruct read: stripe not found"); + kfree(buf); + return bch2_trans_exit(&trans) ?: -EIO; +@@ -462,7 +462,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ptr_stale(ca, ptr)) { +- __bcache_io_error(c, ++ bch_err_ratelimited(c, + "error doing reconstruct read: stale pointer"); + clear_bit(i, buf->valid); + continue; +@@ -474,7 +474,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) + closure_sync(&cl); + + if (ec_nr_failed(buf) > v->nr_redundant) { +- __bcache_io_error(c, ++ bch_err_ratelimited(c, + "error doing reconstruct read: unable to read enough blocks"); + ret = -EIO; + goto err; +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index 94b53312fbbd..0e49fd728e44 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -181,12 +181,18 @@ void bch2_io_error(struct bch_dev *); + /* Logs message and handles the error: */ + #define bch2_dev_io_error(ca, fmt, ...) \ + do { \ +- printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ +- "IO error on %s for " fmt), \ ++ printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \ + (ca)->name, ##__VA_ARGS__); \ + bch2_io_error(ca); \ + } while (0) + ++#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \ ++do { \ ++ printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\ ++ (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \ ++ bch2_io_error(ca); \ ++} while (0) ++ + #define bch2_dev_io_err_on(cond, ca, ...) \ + ({ \ + bool _ret = (cond); \ +@@ -196,16 +202,13 @@ do { \ + _ret; \ + }) + +-/* kill? */ +- +-#define __bcache_io_error(c, fmt, ...) \ +- printk_ratelimited(KERN_ERR bch2_fmt(c, \ +- "IO error: " fmt), ##__VA_ARGS__) +- +-#define bcache_io_error(c, bio, fmt, ...) \ +-do { \ +- __bcache_io_error(c, fmt, ##__VA_ARGS__); \ +- (bio)->bi_status = BLK_STS_IOERR; \ +-} while (0) ++#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) 
\ ++({ \ ++ bool _ret = (cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\ ++ _ret; \ ++}) + + #endif /* _BCACHEFS_ERROR_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 806c37499c9b..21fc524d8dd6 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -864,7 +864,9 @@ retry: + goto retry; + + if (ret) { +- bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); ++ bch_err_inum_ratelimited(c, inum, ++ "read error %i from btree lookup", ret); ++ rbio->bio.bi_status = BLK_STS_IOERR; + bio_endio(&rbio->bio); + } + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 70b3a2e6707f..a1c17512bd86 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -575,7 +575,8 @@ static void __bch2_write_index(struct bch_write_op *op) + op->written += sectors_start - keylist_sectors(keys); + + if (ret) { +- __bcache_io_error(c, "btree IO error %i", ret); ++ bch_err_inum_ratelimited(c, op->pos.inode, ++ "write error %i from btree update", ret); + op->error = ret; + } + } +@@ -620,7 +621,10 @@ static void bch2_write_endio(struct bio *bio) + struct bch_fs *c = wbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", ++ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, ++ op->pos.inode, ++ op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ ++ "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) + set_bit(wbio->dev, op->failed.d); + +@@ -1277,15 +1281,14 @@ void bch2_write(struct closure *cl) + wbio_init(bio)->put_bio = false; + + if (bio_sectors(bio) & (c->opts.block_size - 1)) { +- __bcache_io_error(c, "misaligned write"); ++ bch_err_inum_ratelimited(c, op->pos.inode, ++ "misaligned write"); + op->error = -EIO; + goto err; + } + + if (c->opts.nochanges || + !percpu_ref_tryget(&c->writes)) { +- if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) +- __bcache_io_error(c, "read only"); + op->error = -EROFS; + goto err; + } +@@ -1714,7 +1717,8 @@ retry: + * reading a btree node + */ + BUG_ON(!ret); +- __bcache_io_error(c, "btree IO error: %i", ret); ++ bch_err_inum_ratelimited(c, inode, ++ "read error %i from btree lookup", ret); + err: + rbio->bio.bi_status = BLK_STS_IOERR; + out: +@@ -1918,17 +1922,15 @@ csum_err: + return; + } + +- bch2_dev_io_error(ca, +- "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", +- rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, ++ bch2_dev_inum_io_error(ca, rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, ++ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)", + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, crc.csum_type); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + return; + decompression_err: +- __bcache_io_error(c, "decompression error, inode %llu offset %llu", +- rbio->pos.inode, +- (u64) rbio->bvec_iter.bi_sector); ++ bch_err_inum_ratelimited(c, rbio->pos.inode, ++ "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + return; + } +@@ -1950,7 +1952,14 @@ static void bch2_read_endio(struct bio *bio) + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", ++ /* ++ * XXX: rbio->pos is not what we want here when reading from indirect ++ * extents ++ */ ++ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, ++ rbio->pos.inode, ++ rbio->pos.offset, ++ "data read error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + 
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + return; +@@ -2000,7 +2009,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { +- __bcache_io_error(trans->c, ++ bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, + "pointer to nonexistent indirect extent"); + ret = -EIO; + goto err; +@@ -2045,7 +2054,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + goto hole; + + if (pick_ret < 0) { +- __bcache_io_error(c, "no device to read from"); ++ bch_err_inum_ratelimited(c, k.k->p.inode, ++ "no device to read from"); + goto err; + } + +@@ -2195,7 +2205,8 @@ get_bio: + + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { +- __bcache_io_error(c, "no device to read from"); ++ bch_err_inum_ratelimited(c, k.k->p.inode, ++ "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } +@@ -2345,7 +2356,9 @@ err: + if (ret == -EINTR) + goto retry; + +- bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); ++ bch_err_inum_ratelimited(c, inode, ++ "read error %i from btree lookup", ret); ++ rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + goto out; + } +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index d1367cf067d3..97c98ab96c29 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -558,7 +558,7 @@ reread: + bio_put(bio); + + if (bch2_dev_io_err_on(ret, ca, +- "journal read from sector %llu", ++ "journal read error: sector %llu", + offset) || + bch2_meta_read_fault("journal")) + return -EIO; +@@ -1016,7 +1016,7 @@ static void journal_write_endio(struct bio *bio) + struct bch_dev *ca = bio->bi_private; + struct journal *j = &ca->fs->journal; + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s", + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("journal")) { + struct journal_buf *w = journal_prev_buf(j); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index ffd219091ea6..78835bd2d6bc 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -633,7 +633,7 @@ static void write_super_endio(struct bio *bio) + + /* XXX: return errors directly */ + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", + bch2_blk_status_to_str(bio->bi_status))) + ca->sb_write_error = 1; + +-- +cgit v1.2.3 + + +From d6ed976211a757ab01a35fd824aaec087675a8d6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 3 Dec 2020 14:17:33 -0500 +Subject: bcachefs: Avoid write lock on mark_lock + +mark_lock is a frequently taken lock, and there's also potential for +deadlocks since currently bch2_clear_page_bits which is called from +memory reclaim has to take it to drop disk reservations. + +The disk reservation get path takes it when it recalculates the number +of sectors known to be available, but it's not really needed for +consistency. We just want to make sure we only have one thread updating +the sectors_available count, which we can do with a dedicated mutex. 
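A rough userspace analogue of the locking shape described above (pthreads stand in for the kernel's percpu rwsem and mutex, and the recalculation itself is stubbed out): the hot path only ever takes the shared lock for reading, and the slow recalculation is serialized on its own dedicated mutex rather than by upgrading to the write side.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t mark_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t sectors_available_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic long sectors_available = 64;

/* Slow path: recompute the global count.  The dedicated mutex ensures a
 * single updater without blocking readers of mark_lock. */
static long recalc_sectors_available(void)
{
	long avail;

	pthread_mutex_lock(&sectors_available_lock);
	avail = 4096;                  /* stand-in for the real recalculation */
	atomic_store(&sectors_available, avail);
	pthread_mutex_unlock(&sectors_available_lock);
	return avail;
}

static int disk_reservation_add(long sectors)
{
	int ret = 0;

	pthread_rwlock_rdlock(&mark_lock);     /* read side only, never write */
	if (atomic_load(&sectors_available) < sectors &&
	    recalc_sectors_available() < sectors)
		ret = -1;                      /* -ENOSPC in the real code */
	else
		atomic_fetch_sub(&sectors_available, sectors);
	pthread_rwlock_unlock(&mark_lock);
	return ret;
}

int main(void)
{
	printf("reservation: %d\n", disk_reservation_add(100));
	return 0;
}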
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/buckets.c | 16 +++++----------- + fs/bcachefs/super.c | 2 ++ + 3 files changed, 8 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 0c4e7f47ff79..d54413bec18f 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -674,6 +674,7 @@ struct bch_fs { + unsigned bucket_size_max; + + atomic64_t sectors_available; ++ struct mutex sectors_available_lock; + + struct bch_fs_pcpu __percpu *pcpu; + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index e297101af3a1..0000fc76d2d9 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -2032,13 +2032,6 @@ int bch2_trans_mark_update(struct btree_trans *trans, + + /* Disk reservations: */ + +-static u64 bch2_recalc_sectors_available(struct bch_fs *c) +-{ +- percpu_u64_set(&c->pcpu->sectors_available, 0); +- +- return avail_factor(__bch2_fs_usage_read_short(c).free); +-} +- + void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) + { + percpu_down_read(&c->mark_lock); +@@ -2073,7 +2066,6 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + + if (get < sectors) { + preempt_enable(); +- percpu_up_read(&c->mark_lock); + goto recalculate; + } + } while ((v = atomic64_cmpxchg(&c->sectors_available, +@@ -2091,9 +2083,10 @@ out: + return 0; + + recalculate: +- percpu_down_write(&c->mark_lock); ++ mutex_lock(&c->sectors_available_lock); + +- sectors_available = bch2_recalc_sectors_available(c); ++ percpu_u64_set(&c->pcpu->sectors_available, 0); ++ sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); + + if (sectors <= sectors_available || + (flags & BCH_DISK_RESERVATION_NOFAIL)) { +@@ -2107,7 +2100,8 @@ recalculate: + ret = -ENOSPC; + } + +- percpu_up_write(&c->mark_lock); ++ mutex_unlock(&c->sectors_available_lock); ++ percpu_up_read(&c->mark_lock); + + return ret; + } +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 741a4c225fa9..8442605537b1 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -715,6 +715,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + bch2_fs_btree_cache_init_early(&c->btree_cache); + ++ mutex_init(&c->sectors_available_lock); ++ + if (percpu_init_rwsem(&c->mark_lock)) + goto err; + +-- +cgit v1.2.3 + + +From 1ad33274a51262932bb266aaa6417bd2180e253e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 3 Dec 2020 14:27:20 -0500 +Subject: bcachefs: Flag inodes that had btree update errors + +On write error, the vfs inode's i_size may be inconsistent with the +btree inode's i_size - flag this so we don't have spurious assertions. 
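The pattern reads roughly like the following userspace sketch (EI_INODE_ERROR and the i_size comparison mirror the patch; the structures are simplified): a failed write sets a sticky per-inode error bit, and the size-consistency assertion is downgraded to a warning that is suppressed once that bit is set.

#include <stdio.h>

#define EI_INODE_ERROR 0                  /* flag bit number, as in the patch */

struct inode_info {
	unsigned long ei_flags;
	long long vfs_i_size;             /* what the VFS inode reports */
	long long btree_i_size;           /* what the btree inode records */
};

static void write_endio(struct inode_info *inode, int err)
{
	if (err)
		inode->ei_flags |= 1UL << EI_INODE_ERROR;   /* sticky */
}

static void check_i_size(const struct inode_info *inode)
{
	/* After a write error the two sizes may legitimately disagree,
	 * so only warn when no error has been recorded. */
	if (!(inode->ei_flags & (1UL << EI_INODE_ERROR)) &&
	    inode->vfs_i_size < inode->btree_i_size)
		fprintf(stderr, "WARN: i_size inconsistency\n");
}

int main(void)
{
	struct inode_info inode = { .vfs_i_size = 100, .btree_i_size = 200 };

	write_endio(&inode, -5 /* EIO */);
	check_i_size(&inode);             /* silent: the error bit is set */
	return 0;
}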
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 13 +++++++++++-- + fs/bcachefs/fs.c | 1 + + fs/bcachefs/fs.h | 7 +++++++ + 3 files changed, 19 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 21fc524d8dd6..39282d78cc51 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1023,6 +1023,8 @@ static void bch2_writepage_io_done(struct closure *cl) + unsigned i; + + if (io->op.error) { ++ set_bit(EI_INODE_ERROR, &io->inode->ei_flags); ++ + bio_for_each_segment_all(bvec, bio, iter) { + struct bch_page_state *s; + +@@ -1910,7 +1912,13 @@ loop: + + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); +- if (!dio->iter.count || dio->op.error) ++ ++ if (dio->op.error) { ++ set_bit(EI_INODE_ERROR, &inode->ei_flags); ++ break; ++ } ++ ++ if (!dio->iter.count) + break; + + bio_reset(bio); +@@ -2299,7 +2307,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) + if (ret) + goto err; + +- BUG_ON(inode->v.i_size < inode_u.bi_size); ++ WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && ++ inode->v.i_size < inode_u.bi_size); + + if (iattr->ia_size > inode->v.i_size) { + ret = bch2_extend(inode, &inode_u, iattr); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 7ee34771a867..983c1555622d 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1160,6 +1160,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, + inode->v.i_generation = bi->bi_generation; + inode->v.i_size = bi->bi_size; + ++ inode->ei_flags = 0; + inode->ei_journal_seq = 0; + inode->ei_quota_reserved = 0; + inode->ei_str_hash = bch2_hash_info_init(c, bi); +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index 4ee1ac994420..3df85ffb450c 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -33,6 +33,7 @@ void bch2_pagecache_block_get(struct pagecache_lock *); + + struct bch_inode_info { + struct inode v; ++ unsigned long ei_flags; + + struct mutex ei_update_lock; + u64 ei_journal_seq; +@@ -50,6 +51,12 @@ struct bch_inode_info { + struct bch_inode_unpacked ei_inode; + }; + ++/* ++ * Set if we've gotten a btree error for this inode, and thus the vfs inode and ++ * btree inode may be inconsistent: ++ */ ++#define EI_INODE_ERROR 0 ++ + #define to_bch_ei(_inode) \ + container_of_or_null(_inode, struct bch_inode_info, v) + +-- +cgit v1.2.3 + + +From 18941b4083150046704a5095a43fceb61ff990b4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 3 Dec 2020 13:23:58 -0500 +Subject: bcachefs: Check for errors in bch2_journal_reclaim() + +If the journal is halted, journal reclaim won't necessarily be able to +make any forward progress, and won't accomplish anything anyways - we +should bail out so that we don't get stuck looping in reclaim when the +caches are too dirty and we should be shutting down. 
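As a hedged sketch (stub state instead of the real journal structures), the control flow becomes: reclaim reports -EIO once the journal is in an error state, and the commit path's wait loop exits on that return value instead of spinning while the key cache stays too dirty.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool journal_halted = true;        /* pretend the journal hit an error */
static bool key_cache_must_wait = true;   /* caches still too dirty */

/* Returns 0 when reclaim made progress, -EIO when the journal is dead and
 * no forward progress is possible. */
static int journal_reclaim(void)
{
	if (journal_halted)
		return -EIO;
	key_cache_must_wait = false;      /* stand-in for real reclaim work */
	return 0;
}

static int wait_for_journal_reclaim(void)
{
	int ret;

	do {
		ret = journal_reclaim();
	} while (!ret && key_cache_must_wait);   /* previously looped on must_wait alone */

	return ret;
}

int main(void)
{
	printf("wait_for_journal_reclaim() = %d\n", wait_for_journal_reclaim());
	return 0;
}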
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 8 ++++---- + fs/bcachefs/journal_reclaim.c | 19 ++++++++++++++----- + fs/bcachefs/journal_reclaim.h | 2 +- + 3 files changed, 19 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f35cdbfb43c5..64734f9158c3 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -659,13 +659,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); + +- while (bch2_btree_key_cache_must_wait(c)) { ++ do { + mutex_lock(&c->journal.reclaim_lock); +- bch2_journal_reclaim(&c->journal); ++ ret = bch2_journal_reclaim(&c->journal); + mutex_unlock(&c->journal.reclaim_lock); +- } ++ } while (!ret && bch2_btree_key_cache_must_wait(c)); + +- if (bch2_trans_relock(trans)) ++ if (!ret && bch2_trans_relock(trans)) + return 0; + + trace_trans_restart_journal_reclaim(trans->ip); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index beaa39f7bf5e..d59a1795d57b 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -485,13 +485,14 @@ static u64 journal_seq_to_flush(struct journal *j) + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. + */ +-static void __bch2_journal_reclaim(struct journal *j, bool direct) ++static int __bch2_journal_reclaim(struct journal *j, bool direct) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool kthread = (current->flags & PF_KTHREAD) != 0; + u64 seq_to_flush, nr_flushed = 0; + size_t min_nr; + unsigned flags; ++ int ret = 0; + + /* + * We can't invoke memory reclaim while holding the reclaim_lock - +@@ -506,6 +507,11 @@ static void __bch2_journal_reclaim(struct journal *j, bool direct) + if (kthread && kthread_should_stop()) + break; + ++ if (bch2_journal_error(j)) { ++ ret = -EIO; ++ break; ++ } ++ + bch2_journal_do_discards(j); + + seq_to_flush = journal_seq_to_flush(j); +@@ -547,27 +553,30 @@ static void __bch2_journal_reclaim(struct journal *j, bool direct) + } while (min_nr); + + memalloc_noreclaim_restore(flags); ++ ++ return ret; + } + +-void bch2_journal_reclaim(struct journal *j) ++int bch2_journal_reclaim(struct journal *j) + { +- __bch2_journal_reclaim(j, true); ++ return __bch2_journal_reclaim(j, true); + } + + static int bch2_journal_reclaim_thread(void *arg) + { + struct journal *j = arg; + unsigned long next; ++ int ret = 0; + + set_freezable(); + + kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); + +- while (!kthread_should_stop()) { ++ while (!ret && !kthread_should_stop()) { + j->reclaim_kicked = false; + + mutex_lock(&j->reclaim_lock); +- __bch2_journal_reclaim(j, false); ++ ret = __bch2_journal_reclaim(j, false); + mutex_unlock(&j->reclaim_lock); + + next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index e25355042e6e..3404fef241ea 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -73,7 +73,7 @@ static inline void bch2_journal_pin_update(struct journal *j, u64 seq, + void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); + + void bch2_journal_do_discards(struct journal *); +-void bch2_journal_reclaim(struct journal *); ++int bch2_journal_reclaim(struct journal *); + + void bch2_journal_reclaim_stop(struct journal *); + int 
bch2_journal_reclaim_start(struct journal *); +-- +cgit v1.2.3 + + +From aa7921a9814ba684bce7679983b9f796c0b337fd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 3 Dec 2020 16:20:18 -0500 +Subject: bcachefs: Don't issue btree writes that weren't journalled + +If we have an error in the btree interior update path that prevents us +from journalling the update, we can't issue the corresponding btree node +write - we didn't get a journal sequence number that would cause it to +be ignored in recovery. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 5 +++++ + fs/bcachefs/btree_types.h | 2 ++ + fs/bcachefs/btree_update_interior.c | 25 +++++++++++++++++++------ + 3 files changed, 26 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 893ffe193479..9b19432ae7a5 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1499,6 +1499,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + if (!btree_node_may_write(b)) + return; + ++ if (old & (1 << BTREE_NODE_never_write)) ++ return; ++ + if (old & (1 << BTREE_NODE_write_in_flight)) { + btree_node_wait_on_io(b); + continue; +@@ -1545,6 +1548,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + seq = max(seq, le64_to_cpu(i->journal_seq)); + } + ++ BUG_ON(b->written && !seq); ++ + /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ + bytes += 8; + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 15af60e92820..dc7de27112c6 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -415,6 +415,7 @@ enum btree_flags { + BTREE_NODE_fake, + BTREE_NODE_old_extent_overwrite, + BTREE_NODE_need_rewrite, ++ BTREE_NODE_never_write, + }; + + BTREE_FLAG(read_in_flight); +@@ -429,6 +430,7 @@ BTREE_FLAG(dying); + BTREE_FLAG(fake); + BTREE_FLAG(old_extent_overwrite); + BTREE_FLAG(need_rewrite); ++BTREE_FLAG(never_write); + + static inline struct btree_write *btree_current_write(struct btree *b) + { +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index edc11c22308c..4a169d366538 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -603,17 +603,30 @@ err: + + list_del(&as->write_blocked_list); + +- if (!ret && as->b == b) { ++ /* ++ * Node might have been freed, recheck under ++ * btree_interior_update_lock: ++ */ ++ if (as->b == b) { + struct bset *i = btree_bset_last(b); + + BUG_ON(!b->c.level); + BUG_ON(!btree_node_dirty(b)); + +- i->journal_seq = cpu_to_le64( +- max(journal_seq, +- le64_to_cpu(i->journal_seq))); +- +- bch2_btree_add_journal_pin(c, b, journal_seq); ++ if (!ret) { ++ i->journal_seq = cpu_to_le64( ++ max(journal_seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, journal_seq); ++ } else { ++ /* ++ * If we didn't get a journal sequence number we ++ * can't write this btree node, because recovery ++ * won't know to ignore this write: ++ */ ++ set_btree_node_never_write(b); ++ } + } + + mutex_unlock(&c->btree_interior_update_lock); +-- +cgit v1.2.3 + + +From b1e18b04d1a1a2df65d50cc17d8147267353995e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 13 Nov 2020 18:36:33 -0500 +Subject: bcachefs: Increase journal pipelining + +This patch increases the maximum journal buffers in flight from 2 to 4 - +this will be particularly helpful when in the future we stop requiring +flush+fua for every journal write. 
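+
+A standalone sketch of the ring-index arithmetic the new unwritten_idx
+field relies on (the constants mirror journal_types.h below; everything
+else is illustrative):
+
+  #include <stdio.h>
+
+  #define JOURNAL_BUF_BITS 2
+  #define JOURNAL_BUF_NR   (1U << JOURNAL_BUF_BITS)
+  #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
+
+  /* idx: currently open buffer; unwritten_idx: oldest not yet written */
+  static unsigned nr_unwritten(unsigned idx, unsigned unwritten_idx)
+  {
+      return (idx - unwritten_idx) & JOURNAL_BUF_MASK;
+  }
+
+  int main(void)
+  {
+      unsigned idx = 1, unwritten_idx = 3;  /* wrapped around the ring */
+
+      printf("%u journal writes in flight\n",
+             nr_unwritten(idx, unwritten_idx));
+
+      /* opening another entry requires the ring not to be full: */
+      printf("can open another entry: %s\n",
+             ((idx + 1) & JOURNAL_BUF_MASK) != unwritten_idx ? "yes" : "no");
+      return 0;
+  }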
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 143 ++++++++++++++++++++++++++---------------- + fs/bcachefs/journal.h | 47 +++++++++----- + fs/bcachefs/journal_io.c | 30 ++++++--- + fs/bcachefs/journal_reclaim.c | 46 ++++++++++---- + fs/bcachefs/journal_reclaim.h | 1 + + fs/bcachefs/journal_types.h | 18 +++--- + fs/bcachefs/recovery.c | 4 +- + 7 files changed, 188 insertions(+), 101 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index dd8db8c0c980..ee8643a4be0f 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -24,7 +24,7 @@ static u64 last_unwritten_seq(struct journal *j) + + lockdep_assert_held(&j->lock); + +- return journal_cur_seq(j) - s.prev_buf_unwritten; ++ return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK); + } + + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +@@ -52,7 +52,7 @@ journal_seq_to_buf(struct journal *j, u64 seq) + j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); + + if (journal_seq_unwritten(j, seq)) { +- buf = j->buf + (seq & 1); ++ buf = j->buf + (seq & JOURNAL_BUF_MASK); + EBUG_ON(le64_to_cpu(buf->data->seq) != seq); + } + return buf; +@@ -109,15 +109,8 @@ void bch2_journal_halt(struct journal *j) + + /* journal entry close/open: */ + +-void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) ++void __bch2_journal_buf_put(struct journal *j) + { +- if (!need_write_just_set && +- test_bit(JOURNAL_NEED_WRITE, &j->flags)) +- bch2_time_stats_update(j->delay_time, +- j->need_write_time); +- +- clear_bit(JOURNAL_NEED_WRITE, &j->flags); +- + closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); + } + +@@ -130,7 +123,6 @@ static bool __journal_entry_close(struct journal *j) + struct journal_buf *buf = journal_cur_buf(j); + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); +- bool set_need_write = false; + unsigned sectors; + + lockdep_assert_held(&j->lock); +@@ -149,15 +141,13 @@ static bool __journal_entry_close(struct journal *j) + if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { + set_bit(JOURNAL_NEED_WRITE, &j->flags); + j->need_write_time = local_clock(); +- set_need_write = true; + } + +- if (new.prev_buf_unwritten) +- return false; +- + new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; + new.idx++; +- new.prev_buf_unwritten = 1; ++ ++ if (new.idx == new.unwritten_idx) ++ return false; + + BUG_ON(journal_state_count(new, new.idx)); + } while ((v = atomic64_cmpxchg(&j->reservations.counter, +@@ -191,24 +181,44 @@ static bool __journal_entry_close(struct journal *j) + */ + buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); + ++ __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); ++ + journal_pin_new_entry(j, 1); + + bch2_journal_buf_init(j); + + cancel_delayed_work(&j->write_work); ++ clear_bit(JOURNAL_NEED_WRITE, &j->flags); + + bch2_journal_space_available(j); + +- bch2_journal_buf_put(j, old.idx, set_need_write); ++ bch2_journal_buf_put(j, old.idx); + return true; + } + ++static bool journal_entry_want_write(struct journal *j) ++{ ++ union journal_res_state s = READ_ONCE(j->reservations); ++ bool ret = false; ++ ++ /* ++ * Don't close it yet if we already have a write in flight, but do set ++ * NEED_WRITE: ++ */ ++ if (s.idx != s.unwritten_idx) ++ set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ else ++ ret = __journal_entry_close(j); ++ ++ return ret; ++} ++ + static bool journal_entry_close(struct journal *j) + { + bool ret; + + spin_lock(&j->lock); +- ret = 
__journal_entry_close(j); ++ ret = journal_entry_want_write(j); + spin_unlock(&j->lock); + + return ret; +@@ -290,8 +300,8 @@ static int journal_entry_open(struct journal *j) + + static bool journal_quiesced(struct journal *j) + { +- union journal_res_state state = READ_ONCE(j->reservations); +- bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); ++ union journal_res_state s = READ_ONCE(j->reservations); ++ bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s); + + if (!ret) + journal_entry_close(j); +@@ -318,17 +328,29 @@ static void journal_write_work(struct work_struct *work) + u64 bch2_inode_journal_seq(struct journal *j, u64 inode) + { + size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); +- u64 seq = 0; ++ union journal_res_state s; ++ unsigned i; ++ u64 seq; + +- if (!test_bit(h, j->buf[0].has_inode) && +- !test_bit(h, j->buf[1].has_inode)) +- return 0; + + spin_lock(&j->lock); +- if (test_bit(h, journal_cur_buf(j)->has_inode)) +- seq = journal_cur_seq(j); +- else if (test_bit(h, journal_prev_buf(j)->has_inode)) +- seq = journal_cur_seq(j) - 1; ++ seq = journal_cur_seq(j); ++ s = READ_ONCE(j->reservations); ++ i = s.idx; ++ ++ while (1) { ++ if (test_bit(h, j->buf[i].has_inode)) ++ goto out; ++ ++ if (i == s.unwritten_idx) ++ break; ++ ++ i = (i - 1) & JOURNAL_BUF_MASK; ++ seq--; ++ } ++ ++ seq = 0; ++out: + spin_unlock(&j->lock); + + return seq; +@@ -575,7 +597,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + BUG(); + + if (seq == journal_cur_seq(j)) +- __journal_entry_close(j); ++ journal_entry_want_write(j); + out: + spin_unlock(&j->lock); + return ret; +@@ -864,15 +886,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) + static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) + { + union journal_res_state state; +- struct journal_buf *w; +- bool ret; ++ bool ret = false; ++ unsigned i; + + spin_lock(&j->lock); + state = READ_ONCE(j->reservations); +- w = j->buf + !state.idx; ++ i = state.idx; + +- ret = state.prev_buf_unwritten && +- bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); ++ while (i != state.unwritten_idx) { ++ i = (i - 1) & JOURNAL_BUF_MASK; ++ if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx)) ++ ret = true; ++ } + spin_unlock(&j->lock); + + return ret; +@@ -958,7 +983,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + + journal_pin_new_entry(j, 1); + +- j->reservations.idx = journal_cur_seq(j); ++ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); + + bch2_journal_buf_init(j); + +@@ -1013,8 +1038,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) + + void bch2_fs_journal_exit(struct journal *j) + { +- kvpfree(j->buf[1].data, j->buf[1].buf_size); +- kvpfree(j->buf[0].data, j->buf[0].buf_size); ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(j->buf); i++) ++ kvpfree(j->buf[i].data, j->buf[i].buf_size); + free_fifo(&j->pin); + } + +@@ -1022,6 +1049,7 @@ int bch2_fs_journal_init(struct journal *j) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + static struct lock_class_key res_key; ++ unsigned i; + int ret = 0; + + pr_verbose_init(c->opts, ""); +@@ -1036,8 +1064,6 @@ int bch2_fs_journal_init(struct journal *j) + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + +- j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; +- j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; + j->write_delay_ms = 1000; + j->reclaim_delay_ms = 100; + +@@ -1049,13 +1075,20 @@ int 
bch2_fs_journal_init(struct journal *j) + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); + +- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || +- !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || +- !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) { + ret = -ENOMEM; + goto out; + } + ++ for (i = 0; i < ARRAY_SIZE(j->buf); i++) { ++ j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); ++ if (!j->buf[i].data) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ + j->pin.front = j->pin.back = 1; + out: + pr_verbose_init(c->opts, "ret %i", ret); +@@ -1069,7 +1102,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union journal_res_state s; + struct bch_dev *ca; +- unsigned iter; ++ unsigned i; + + rcu_read_lock(); + spin_lock(&j->lock); +@@ -1112,16 +1145,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + } + + pr_buf(out, +- "current entry refs:\t%u\n" +- "prev entry unwritten:\t", +- journal_state_count(s, s.idx)); +- +- if (s.prev_buf_unwritten) +- pr_buf(out, "yes, ref %u sectors %u\n", +- journal_state_count(s, !s.idx), +- journal_prev_buf(j)->sectors); +- else +- pr_buf(out, "no\n"); ++ "current entry:\tidx %u refcount %u\n", ++ s.idx, journal_state_count(s, s.idx)); ++ ++ i = s.idx; ++ while (i != s.unwritten_idx) { ++ i = (i - 1) & JOURNAL_BUF_MASK; ++ ++ pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", ++ i, journal_state_count(s, i), j->buf[i].sectors); ++ } + + pr_buf(out, + "need write:\t\t%i\n" +@@ -1129,7 +1162,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + test_bit(JOURNAL_NEED_WRITE, &j->flags), + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + +- for_each_member_device_rcu(ca, c, iter, ++ for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; + +@@ -1144,7 +1177,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + "\tdirty_idx_ondisk\t%u (seq %llu)\n" + "\tdirty_idx\t\t%u (seq %llu)\n" + "\tcur_idx\t\t%u (seq %llu)\n", +- iter, ja->nr, ++ i, ja->nr, + bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), + ja->sectors_free, + ja->discard_idx, +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 25c6876765ac..5b92e8cd569d 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j) + return j->buf + j->reservations.idx; + } + +-static inline struct journal_buf *journal_prev_buf(struct journal *j) +-{ +- return j->buf + !j->reservations.idx; +-} +- + /* Sequence number of oldest dirty journal entry */ + + static inline u64 journal_last_seq(struct journal *j) +@@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64); + + static inline int journal_state_count(union journal_res_state s, int idx) + { +- return idx == 0 ? 
s.buf0_count : s.buf1_count; ++ switch (idx) { ++ case 0: return s.buf0_count; ++ case 1: return s.buf1_count; ++ case 2: return s.buf2_count; ++ case 3: return s.buf3_count; ++ } ++ BUG(); + } + + static inline void journal_state_inc(union journal_res_state *s) + { + s->buf0_count += s->idx == 0; + s->buf1_count += s->idx == 1; ++ s->buf2_count += s->idx == 2; ++ s->buf3_count += s->idx == 3; + } + + static inline void bch2_journal_set_has_inode(struct journal *j, +@@ -255,21 +258,24 @@ static inline bool journal_entry_empty(struct jset *j) + return true; + } + +-void __bch2_journal_buf_put(struct journal *, bool); ++void __bch2_journal_buf_put(struct journal *); + +-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, +- bool need_write_just_set) ++static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) + { + union journal_res_state s; + + s.v = atomic64_sub_return(((union journal_res_state) { + .buf0_count = idx == 0, + .buf1_count = idx == 1, ++ .buf2_count = idx == 2, ++ .buf3_count = idx == 3, + }).v, &j->reservations.counter); +- if (!journal_state_count(s, idx)) { +- EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); +- __bch2_journal_buf_put(j, need_write_just_set); +- } ++ ++ EBUG_ON(((s.idx - idx) & 3) > ++ ((s.idx - s.unwritten_idx) & 3)); ++ ++ if (!journal_state_count(s, idx) && idx == s.unwritten_idx) ++ __bch2_journal_buf_put(j); + } + + /* +@@ -289,7 +295,7 @@ static inline void bch2_journal_res_put(struct journal *j, + BCH_JSET_ENTRY_btree_keys, + 0, 0, NULL, 0); + +- bch2_journal_buf_put(j, res->idx, false); ++ bch2_journal_buf_put(j, res->idx); + + res->ref = 0; + } +@@ -325,11 +331,18 @@ static inline int journal_res_get_fast(struct journal *j, + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) + return 0; + +- if (flags & JOURNAL_RES_GET_CHECK) +- return 1; +- + new.cur_entry_offset += res->u64s; + journal_state_inc(&new); ++ ++ /* ++ * If the refcount would overflow, we have to wait: ++ * XXX - tracepoint this: ++ */ ++ if (!journal_state_count(new, new.idx)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_CHECK) ++ return 1; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 97c98ab96c29..2fea568a04c3 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -951,16 +951,23 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) + buf->buf_size = new_size; + } + ++static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) ++{ ++ return j->buf + j->reservations.unwritten_idx; ++} ++ + static void journal_write_done(struct closure *cl) + { + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); +- struct journal_buf *w = journal_prev_buf(j); ++ struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_devs_list devs = + bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + struct bch_replicas_padded replicas; ++ union journal_res_state old, new; + u64 seq = le64_to_cpu(w->data->seq); + u64 last_seq = le64_to_cpu(w->data->last_seq); ++ u64 v; + int err = 0; + + bch2_time_stats_update(j->write_time, j->write_start_time); +@@ -999,9 +1006,14 @@ static void journal_write_done(struct closure *cl) + /* also must come before signalling write completion: */ + closure_debug_destroy(cl); + +- BUG_ON(!j->reservations.prev_buf_unwritten); +- atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, +- 
&j->reservations.counter); ++ v = atomic64_read(&j->reservations.counter); ++ do { ++ old.v = new.v = v; ++ BUG_ON(new.idx == new.unwritten_idx); ++ ++ new.unwritten_idx++; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); + + closure_wake_up(&w->wait); + journal_wake(j); +@@ -1009,6 +1021,10 @@ static void journal_write_done(struct closure *cl) + if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) + mod_delayed_work(system_freezable_wq, &j->write_work, 0); + spin_unlock(&j->lock); ++ ++ if (new.unwritten_idx != new.idx && ++ !journal_state_count(new, new.unwritten_idx)) ++ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); + } + + static void journal_write_endio(struct bio *bio) +@@ -1019,7 +1035,7 @@ static void journal_write_endio(struct bio *bio) + if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s", + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("journal")) { +- struct journal_buf *w = journal_prev_buf(j); ++ struct journal_buf *w = journal_last_unwritten_buf(j); + unsigned long flags; + + spin_lock_irqsave(&j->err_lock, flags); +@@ -1036,7 +1052,7 @@ void bch2_journal_write(struct closure *cl) + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; +- struct journal_buf *w = journal_prev_buf(j); ++ struct journal_buf *w = journal_last_unwritten_buf(j); + struct jset_entry *start, *end; + struct jset *jset; + struct bio *bio; +@@ -1047,8 +1063,6 @@ void bch2_journal_write(struct closure *cl) + + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + +- bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); +- + journal_buf_realloc(j, w); + jset = w->data; + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index d59a1795d57b..e356295ea660 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -58,6 +58,19 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) + old.v, new.v)) != old.v); + } + ++static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) ++{ ++ unsigned sectors = 0; ++ ++ while (!sectors && *idx != j->reservations.idx) { ++ sectors = j->buf[*idx].sectors; ++ ++ *idx = (*idx + 1) & JOURNAL_BUF_MASK; ++ } ++ ++ return sectors; ++} ++ + static struct journal_space { + unsigned next_entry; + unsigned remaining; +@@ -69,15 +82,14 @@ static struct journal_space { + unsigned sectors_next_entry = UINT_MAX; + unsigned sectors_total = UINT_MAX; + unsigned i, nr_devs = 0; +- unsigned unwritten_sectors = j->reservations.prev_buf_unwritten +- ? 
journal_prev_buf(j)->sectors +- : 0; ++ unsigned unwritten_sectors; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; + unsigned buckets_this_device, sectors_this_device; ++ unsigned idx = j->reservations.unwritten_idx; + + if (!ja->nr) + continue; +@@ -89,16 +101,20 @@ static struct journal_space { + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ +- if (unwritten_sectors >= sectors_this_device) { +- if (!buckets_this_device) +- continue; +- +- buckets_this_device--; +- sectors_this_device = ca->mi.bucket_size; ++ while ((unwritten_sectors = get_unwritten_sectors(j, &idx))) { ++ if (unwritten_sectors >= sectors_this_device) { ++ if (!buckets_this_device) { ++ sectors_this_device = 0; ++ break; ++ } ++ ++ buckets_this_device--; ++ sectors_this_device = ca->mi.bucket_size; ++ } ++ ++ sectors_this_device -= unwritten_sectors; + } + +- sectors_this_device -= unwritten_sectors; +- + if (sectors_this_device < ca->mi.bucket_size && + buckets_this_device) { + buckets_this_device--; +@@ -277,6 +293,14 @@ static void bch2_journal_reclaim_fast(struct journal *j) + bch2_journal_space_available(j); + } + ++void __bch2_journal_pin_put(struct journal *j, u64 seq) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ if (atomic_dec_and_test(&pin_list->count)) ++ bch2_journal_reclaim_fast(j); ++} ++ + void bch2_journal_pin_put(struct journal *j, u64 seq) + { + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index 3404fef241ea..b0f05839396d 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -39,6 +39,7 @@ journal_seq_pin(struct journal *j, u64 seq) + return &j->pin.data[seq & j->pin.mask]; + } + ++void __bch2_journal_pin_put(struct journal *, u64); + void bch2_journal_pin_put(struct journal *, u64); + void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); + +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 00c3de77e823..d4d95280dc0e 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -11,13 +11,13 @@ + + struct journal_res; + +-#define JOURNAL_BUF_BITS 1 ++#define JOURNAL_BUF_BITS 2 + #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) + #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) + + /* +- * We put two of these in struct journal; we used them for writes to the +- * journal that are being staged or in flight. ++ * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to ++ * the journal that are being staged or in flight. + */ + struct journal_buf { + struct jset *data; +@@ -85,10 +85,12 @@ union journal_res_state { + + struct { + u64 cur_entry_offset:20, +- idx:1, +- prev_buf_unwritten:1, +- buf0_count:21, +- buf1_count:21; ++ idx:2, ++ unwritten_idx:2, ++ buf0_count:10, ++ buf1_count:10, ++ buf2_count:10, ++ buf3_count:10; + }; + }; + +@@ -169,7 +171,7 @@ struct journal { + * Two journal entries -- one is currently open for new entries, the + * other is possibly being written out. 
+ */ +- struct journal_buf buf[2]; ++ struct journal_buf buf[JOURNAL_BUF_NR]; + + spinlock_t lock; + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index d24cef2bf1aa..7ad5b8234747 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1048,13 +1048,13 @@ int bch2_fs_recovery(struct bch_fs *c) + if (!c->sb.clean) { + ret = bch2_journal_seq_blacklist_add(c, + journal_seq, +- journal_seq + 4); ++ journal_seq + 8); + if (ret) { + bch_err(c, "error creating new journal seq blacklist entry"); + goto err; + } + +- journal_seq += 4; ++ journal_seq += 8; + + /* + * The superblock needs to be written before we do any btree +-- +cgit v1.2.3 + + +From 672459b1b725605215f3d23d4869ff824109b018 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 14 Nov 2020 12:29:21 -0500 +Subject: bcachefs: Improve journal free space calculations + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 25 ++++++-- + fs/bcachefs/journal_reclaim.c | 136 ++++++++++++++++++++++-------------------- + fs/bcachefs/journal_reclaim.h | 6 -- + fs/bcachefs/journal_types.h | 18 +++++- + 4 files changed, 108 insertions(+), 77 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index ee8643a4be0f..811c456b04d2 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1145,7 +1145,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + } + + pr_buf(out, +- "current entry:\tidx %u refcount %u\n", ++ "current entry:\t\tidx %u refcount %u\n", + s.idx, journal_state_count(s, s.idx)); + + i = s.idx; +@@ -1162,6 +1162,20 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + test_bit(JOURNAL_NEED_WRITE, &j->flags), + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + ++ pr_buf(out, "space:\n"); ++ pr_buf(out, "\tdiscarded\t%u:%u\n", ++ j->space[journal_space_discarded].next_entry, ++ j->space[journal_space_discarded].total); ++ pr_buf(out, "\tclean ondisk\t%u:%u\n", ++ j->space[journal_space_clean_ondisk].next_entry, ++ j->space[journal_space_clean_ondisk].total); ++ pr_buf(out, "\tclean\t\t%u:%u\n", ++ j->space[journal_space_clean].next_entry, ++ j->space[journal_space_clean].total); ++ pr_buf(out, "\ttotal\t\t%u:%u\n", ++ j->space[journal_space_total].next_entry, ++ j->space[journal_space_total].total); ++ + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; +@@ -1172,12 +1186,13 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + pr_buf(out, + "dev %u:\n" + "\tnr\t\t%u\n" ++ "\tbucket size\t%u\n" + "\tavailable\t%u:%u\n" +- "\tdiscard_idx\t\t%u\n" +- "\tdirty_idx_ondisk\t%u (seq %llu)\n" +- "\tdirty_idx\t\t%u (seq %llu)\n" ++ "\tdiscard_idx\t%u\n" ++ "\tdirty_ondisk\t%u (seq %llu)\n" ++ "\tdirty_idx\t%u (seq %llu)\n" + "\tcur_idx\t\t%u (seq %llu)\n", +- i, ja->nr, ++ i, ja->nr, ca->mi.bucket_size, + bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), + ja->sectors_free, + ja->discard_idx, +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index e356295ea660..a209159a475d 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -71,84 +71,94 @@ static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) + return sectors; + } + +-static struct journal_space { +- unsigned next_entry; +- unsigned remaining; +-} __journal_space_available(struct journal *j, unsigned nr_devs_want, ++static struct journal_space 
++journal_dev_space_available(struct journal *j, struct bch_dev *ca, + enum journal_space_from from) + { +- struct bch_fs *c = container_of(j, struct bch_fs, journal); +- struct bch_dev *ca; +- unsigned sectors_next_entry = UINT_MAX; +- unsigned sectors_total = UINT_MAX; +- unsigned i, nr_devs = 0; +- unsigned unwritten_sectors; ++ struct journal_device *ja = &ca->journal; ++ unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx; + +- rcu_read_lock(); +- for_each_member_device_rcu(ca, c, i, +- &c->rw_devs[BCH_DATA_journal]) { +- struct journal_device *ja = &ca->journal; +- unsigned buckets_this_device, sectors_this_device; +- unsigned idx = j->reservations.unwritten_idx; ++ if (from == journal_space_total) ++ return (struct journal_space) { ++ .next_entry = ca->mi.bucket_size, ++ .total = ca->mi.bucket_size * ja->nr, ++ }; + +- if (!ja->nr) +- continue; +- +- buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); +- sectors_this_device = ja->sectors_free; ++ buckets = bch2_journal_dev_buckets_available(j, ja, from); ++ sectors = ja->sectors_free; + +- /* +- * We that we don't allocate the space for a journal entry +- * until we write it out - thus, account for it here: +- */ +- while ((unwritten_sectors = get_unwritten_sectors(j, &idx))) { +- if (unwritten_sectors >= sectors_this_device) { +- if (!buckets_this_device) { +- sectors_this_device = 0; +- break; +- } +- +- buckets_this_device--; +- sectors_this_device = ca->mi.bucket_size; ++ /* ++ * We that we don't allocate the space for a journal entry ++ * until we write it out - thus, account for it here: ++ */ ++ while ((unwritten = get_unwritten_sectors(j, &idx))) { ++ if (unwritten >= sectors) { ++ if (!buckets) { ++ sectors = 0; ++ break; + } + +- sectors_this_device -= unwritten_sectors; ++ buckets--; ++ sectors = ca->mi.bucket_size; + } + +- if (sectors_this_device < ca->mi.bucket_size && +- buckets_this_device) { +- buckets_this_device--; +- sectors_this_device = ca->mi.bucket_size; +- } ++ sectors -= unwritten; ++ } ++ ++ if (sectors < ca->mi.bucket_size && buckets) { ++ buckets--; ++ sectors = ca->mi.bucket_size; ++ } ++ ++ return (struct journal_space) { ++ .next_entry = sectors, ++ .total = sectors + buckets * ca->mi.bucket_size, ++ }; ++} + +- if (!sectors_this_device) ++static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, ++ enum journal_space_from from) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned i, pos, nr_devs = 0; ++ struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; ++ ++ BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ if (!ca->journal.nr) + continue; + +- sectors_next_entry = min(sectors_next_entry, +- sectors_this_device); ++ space = journal_dev_space_available(j, ca, from); ++ if (!space.next_entry) ++ continue; + +- sectors_total = min(sectors_total, +- buckets_this_device * ca->mi.bucket_size + +- sectors_this_device); ++ for (pos = 0; pos < nr_devs; pos++) ++ if (space.total > dev_space[pos].total) ++ break; + +- nr_devs++; ++ array_insert_item(dev_space, nr_devs, pos, space); + } + rcu_read_unlock(); + + if (nr_devs < nr_devs_want) + return (struct journal_space) { 0, 0 }; + +- return (struct journal_space) { +- .next_entry = sectors_next_entry, +- .remaining = max_t(int, 0, sectors_total - sectors_next_entry), +- }; ++ /* ++ * We sorted largest to smallest, and we want the 
smallest out of the ++ * @nr_devs_want largest devices: ++ */ ++ return dev_space[nr_devs_want - 1]; + } + + void bch2_journal_space_available(struct journal *j) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; +- struct journal_space discarded, clean_ondisk, clean; ++ unsigned clean; + unsigned overhead, u64s_remaining = 0; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); +@@ -189,27 +199,25 @@ void bch2_journal_space_available(struct journal *j) + goto out; + } + +- if (!fifo_free(&j->pin)) { +- ret = cur_entry_journal_pin_full; +- goto out; +- } +- + nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); + +- discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); +- clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); +- clean = __journal_space_available(j, nr_devs_want, journal_space_clean); ++ for (i = 0; i < journal_space_nr; i++) ++ j->space[i] = __journal_space_available(j, nr_devs_want, i); + +- if (!discarded.next_entry) ++ clean = j->space[journal_space_clean].total; ++ ++ if (!j->space[journal_space_discarded].next_entry) + ret = cur_entry_journal_full; ++ else if (!fifo_free(&j->pin)) ++ ret = cur_entry_journal_pin_full; + +- overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * ++ overhead = DIV_ROUND_UP(clean, max_entry_size) * + journal_entry_overhead(j); +- u64s_remaining = clean.remaining << 6; ++ u64s_remaining = clean << 6; + u64s_remaining = max_t(int, 0, u64s_remaining - overhead); + u64s_remaining /= 4; + out: +- j->cur_entry_sectors = !ret ? discarded.next_entry : 0; ++ j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_error = ret; + journal_set_remaining(j, u64s_remaining); + journal_check_may_get_unreserved(j); +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index b0f05839396d..f02caa3d49ea 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -4,12 +4,6 @@ + + #define JOURNAL_PIN (32 * 1024) + +-enum journal_space_from { +- journal_space_discarded, +- journal_space_clean_ondisk, +- journal_space_clean, +-}; +- + static inline void journal_reclaim_kick(struct journal *j) + { + struct task_struct *p = READ_ONCE(j->reclaim_thread); +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index d4d95280dc0e..da420d227244 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -9,8 +9,6 @@ + #include "super_types.h" + #include "fifo.h" + +-struct journal_res; +- + #define JOURNAL_BUF_BITS 2 + #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) + #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) +@@ -122,6 +120,20 @@ union journal_preres_state { + #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) + #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) + ++struct journal_space { ++ /* Units of 512 bytes sectors: */ ++ unsigned next_entry; /* How big the next journal entry can be */ ++ unsigned total; ++}; ++ ++enum journal_space_from { ++ journal_space_discarded, ++ journal_space_clean_ondisk, ++ journal_space_clean, ++ journal_space_total, ++ journal_space_nr, ++}; ++ + /* + * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, + * either because something's waiting on the write to complete or because it's +@@ -216,6 +228,8 @@ struct journal { + struct journal_entry_pin_list *data; + } pin; + ++ struct journal_space 
space[journal_space_nr]; ++ + u64 replay_journal_seq; + u64 replay_journal_seq_end; + +-- +cgit v1.2.3 + + +From 0637da9592cd9c17ca30e47768c53b4513794185 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 14 Nov 2020 09:59:58 -0500 +Subject: bcachefs: Don't require flush/fua on every journal write + +This patch adds a flag to journal entries which, if set, indicates that +they weren't done as flush/fua writes. + + - non flush/fua journal writes don't update last_seq (i.e. they don't + free up space in the journal), thus the journal free space + calculations now check whether nonflush journal writes are currently + allowed (i.e. are we low on free space, or would doing a flush write + free up a lot of space in the journal) + + - write_delay_ms, the user configurable option for when open journal + entries are automatically written, is now interpreted as the max + delay between flush journal writes (default 1 second). + + - bch2_journal_flush_seq_async is changed to ensure a flush write >= + the requested sequence number has happened + + - journal read/replay must now ignore, and blacklist, any journal + entries newer than the most recent flush entry in the journal. Also, + the way the read_entire_journal option is handled has been improved; + struct journal_replay now has an entry, 'ignore', for entries that + were read but should not be used. + + - assorted refactoring and improvements related to journal read in + journal_io.c and recovery.c + +Previously, we'd have to issue a flush/fua write every time we +accumulated a full journal entry - typically the bucket size. Now we +need to issue them much less frequently: when an fsync is requested, or +it's been more than write_delay_ms since the last flush, or when we need +to free up space in the journal. This is a significant performance +improvement on many write heavy workloads. 
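+
+The gist of the decision made at write time, as a simplified sketch (a
+condensed restatement of the check added to bch2_journal_write(); the
+function name and millisecond timestamps here are illustrative, the real
+code uses jiffies and the journal flags):
+
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  static bool can_skip_flush(bool feature_enabled, bool must_flush,
+                             bool may_skip_flush, unsigned long now_ms,
+                             unsigned long last_flush_ms,
+                             unsigned write_delay_ms)
+  {
+      return feature_enabled &&  /* BCH_FEATURE_journal_no_flush set */
+             !must_flush &&      /* nobody is waiting on an fsync */
+             may_skip_flush &&   /* journal isn't low on free space */
+             now_ms - last_flush_ms < write_delay_ms;
+  }
+
+  int main(void)
+  {
+      /* 300ms after the last flush with a 1000ms budget: noflush write */
+      printf("skip flush: %d\n",
+             can_skip_flush(true, false, true, 1300, 1000, 1000));
+
+      /* past the budget: this write goes out with FLUSH/FUA and
+         updates last_seq, freeing space in the journal */
+      printf("skip flush: %d\n",
+             can_skip_flush(true, false, true, 2300, 1000, 1000));
+      return 0;
+  }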
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 7 +- + fs/bcachefs/journal.c | 54 ++++++++- + fs/bcachefs/journal.h | 2 +- + fs/bcachefs/journal_io.c | 212 ++++++++++++++++++++++++++++-------- + fs/bcachefs/journal_io.h | 3 +- + fs/bcachefs/journal_reclaim.c | 10 +- + fs/bcachefs/journal_seq_blacklist.c | 5 +- + fs/bcachefs/journal_types.h | 8 ++ + fs/bcachefs/recovery.c | 166 ++++++++++++---------------- + 9 files changed, 314 insertions(+), 153 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 94b5418587e3..02a76c3d3acb 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1332,14 +1332,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + x(extents_above_btree_updates, 12) \ + x(btree_updates_journalled, 13) \ + x(reflink_inline_data, 14) \ +- x(new_varint, 15) ++ x(new_varint, 15) \ ++ x(journal_no_flush, 16) + + #define BCH_SB_FEATURES_ALL \ + ((1ULL << BCH_FEATURE_new_siphash)| \ + (1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ +- (1ULL << BCH_FEATURE_new_varint))\ ++ (1ULL << BCH_FEATURE_new_varint)| \ ++ (1ULL << BCH_FEATURE_journal_no_flush)) + + enum bch_sb_feature { + #define x(f, n) BCH_FEATURE_##f, +@@ -1575,6 +1577,7 @@ struct jset { + + LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); + LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); ++LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); + + #define BCH_JOURNAL_BUCKETS_MIN 8 + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 811c456b04d2..701521030c3d 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -80,6 +80,8 @@ static void bch2_journal_buf_init(struct journal *j) + struct journal_buf *buf = journal_cur_buf(j); + + bkey_extent_init(&buf->key); ++ buf->noflush = false; ++ buf->must_flush = false; + + memset(buf->has_inode, 0, sizeof(buf->has_inode)); + +@@ -575,7 +577,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct journal_buf *buf; + int ret = 0; + +- if (seq <= j->seq_ondisk) ++ if (seq <= j->flushed_seq_ondisk) + return 1; + + spin_lock(&j->lock); +@@ -586,16 +588,53 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + goto out; + } + +- if (seq <= j->seq_ondisk) { ++ if (seq <= j->flushed_seq_ondisk) { + ret = 1; + goto out; + } + +- if (parent && +- (buf = journal_seq_to_buf(j, seq))) +- if (!closure_wait(&buf->wait, parent)) ++ /* if seq was written, but not flushed - flush a newer one instead */ ++ seq = max(seq, last_unwritten_seq(j)); ++ ++recheck_need_open: ++ if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { ++ struct journal_res res = { 0 }; ++ ++ spin_unlock(&j->lock); ++ ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ if (ret) ++ return ret; ++ ++ seq = res.seq; ++ buf = j->buf + (seq & JOURNAL_BUF_MASK); ++ buf->must_flush = true; ++ set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ ++ if (parent && !closure_wait(&buf->wait, parent)) + BUG(); + ++ bch2_journal_res_put(j, &res); ++ ++ spin_lock(&j->lock); ++ goto want_write; ++ } ++ ++ /* ++ * if write was kicked off without a flush, flush the next sequence ++ * number instead ++ */ ++ buf = journal_seq_to_buf(j, seq); ++ if (buf->noflush) { ++ seq++; ++ goto recheck_need_open; ++ } ++ ++ buf->must_flush = true; ++ ++ if (parent && !closure_wait(&buf->wait, parent)) ++ BUG(); ++want_write: + if (seq == journal_cur_seq(j)) + 
journal_entry_want_write(j); + out: +@@ -980,6 +1019,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + spin_lock(&j->lock); + + set_bit(JOURNAL_STARTED, &j->flags); ++ j->last_flush_write = jiffies; + + journal_pin_new_entry(j, 1); + +@@ -1114,6 +1154,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + "last_seq:\t\t%llu\n" + "last_seq_ondisk:\t%llu\n" + "prereserved:\t\t%u/%u\n" ++ "nr flush writes:\t%llu\n" ++ "nr noflush writes:\t%llu\n" + "nr direct reclaim:\t%llu\n" + "nr background reclaim:\t%llu\n" + "current entry sectors:\t%u\n" +@@ -1125,6 +1167,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + j->last_seq_ondisk, + j->prereserved.reserved, + j->prereserved.remaining, ++ j->nr_flush_writes, ++ j->nr_noflush_writes, + j->nr_direct_reclaim, + j->nr_background_reclaim, + j->cur_entry_sectors, +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 5b92e8cd569d..a6ce03a724cb 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -136,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j) + + static inline u64 journal_cur_seq(struct journal *j) + { +- BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); ++ EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + + return j->pin.back - 1; + } +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 2fea568a04c3..bb9a1936c24c 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -10,10 +10,27 @@ + #include "journal.h" + #include "journal_io.h" + #include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" + #include "replicas.h" + + #include + ++static void __journal_replay_free(struct journal_replay *i) ++{ ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ ++} ++ ++static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) ++{ ++ i->ignore = true; ++ ++ if (!c->opts.read_entire_journal) ++ __journal_replay_free(i); ++} ++ + struct journal_list { + struct closure cl; + struct mutex lock; +@@ -36,28 +53,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct bch_devs_list devs = { .nr = 0 }; + struct list_head *where; + size_t bytes = vstruct_bytes(j); +- __le64 last_seq; ++ u64 last_seq = 0; + int ret; + +- last_seq = !list_empty(jlist->head) +- ? list_last_entry(jlist->head, struct journal_replay, +- list)->j.last_seq +- : 0; +- +- if (!c->opts.read_entire_journal) { +- /* Is this entry older than the range we need? */ +- if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { +- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; +- goto out; ++ list_for_each_entry_reverse(i, jlist->head, list) { ++ if (!JSET_NO_FLUSH(&i->j)) { ++ last_seq = le64_to_cpu(i->j.last_seq); ++ break; + } ++ } + +- /* Drop entries we don't need anymore */ ++ /* Is this entry older than the range we need? 
*/ ++ if (!c->opts.read_entire_journal && ++ le64_to_cpu(j->seq) < last_seq) { ++ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; ++ goto out; ++ } ++ ++ /* Drop entries we don't need anymore */ ++ if (!JSET_NO_FLUSH(j)) { + list_for_each_entry_safe(i, pos, jlist->head, list) { + if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) + break; +- list_del(&i->list); +- kvpfree(i, offsetof(struct journal_replay, j) + +- vstruct_bytes(&i->j)); ++ journal_replay_free(c, i); + } + } + +@@ -81,9 +99,7 @@ add: + if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + if (i->bad) { + devs = i->devs; +- list_del(&i->list); +- kvpfree(i, offsetof(struct journal_replay, j) + +- vstruct_bytes(&i->j)); ++ __journal_replay_free(i); + } else if (bad) { + goto found; + } else { +@@ -105,6 +121,7 @@ add: + list_add(&i->list, where); + i->devs = devs; + i->bad = bad; ++ i->ignore = false; + memcpy(&i->j, j, bytes); + found: + if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) +@@ -699,14 +716,16 @@ err: + goto out; + } + +-int bch2_journal_read(struct bch_fs *c, struct list_head *list) ++int bch2_journal_read(struct bch_fs *c, struct list_head *list, ++ u64 *blacklist_seq, u64 *start_seq) + { + struct journal_list jlist; +- struct journal_replay *i; ++ struct journal_replay *i, *t; + struct bch_dev *ca; + unsigned iter; + size_t keys = 0, entries = 0; + bool degraded = false; ++ u64 seq, last_seq = 0; + int ret = 0; + + closure_init_stack(&jlist.cl); +@@ -735,12 +754,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) + if (jlist.ret) + return jlist.ret; + ++ if (list_empty(list)) { ++ bch_info(c, "journal read done, but no entries found"); ++ return 0; ++ } ++ ++ i = list_last_entry(list, struct journal_replay, list); ++ *start_seq = le64_to_cpu(i->j.seq) + 1; ++ ++ /* ++ * Find most recent flush entry, and ignore newer non flush entries - ++ * those entries will be blacklisted: ++ */ ++ list_for_each_entry_safe_reverse(i, t, list, list) { ++ if (i->ignore) ++ continue; ++ ++ if (!JSET_NO_FLUSH(&i->j)) { ++ last_seq = le64_to_cpu(i->j.last_seq); ++ *blacklist_seq = le64_to_cpu(i->j.seq) + 1; ++ break; ++ } ++ ++ journal_replay_free(c, i); ++ } ++ ++ if (!last_seq) { ++ fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); ++ return -1; ++ } ++ ++ /* Drop blacklisted entries and entries older than last_seq: */ ++ list_for_each_entry_safe(i, t, list, list) { ++ if (i->ignore) ++ continue; ++ ++ seq = le64_to_cpu(i->j.seq); ++ if (seq < last_seq) { ++ journal_replay_free(c, i); ++ continue; ++ } ++ ++ if (bch2_journal_seq_is_blacklisted(c, seq, true)) { ++ fsck_err_on(!JSET_NO_FLUSH(&i->j), c, ++ "found blacklisted journal entry %llu", seq); ++ ++ journal_replay_free(c, i); ++ } ++ } ++ ++ /* Check for missing entries: */ ++ seq = last_seq; ++ list_for_each_entry(i, list, list) { ++ if (i->ignore) ++ continue; ++ ++ BUG_ON(seq > le64_to_cpu(i->j.seq)); ++ ++ while (seq < le64_to_cpu(i->j.seq)) { ++ u64 missing_start, missing_end; ++ ++ while (seq < le64_to_cpu(i->j.seq) && ++ bch2_journal_seq_is_blacklisted(c, seq, false)) ++ seq++; ++ ++ if (seq == le64_to_cpu(i->j.seq)) ++ break; ++ ++ missing_start = seq; ++ ++ while (seq < le64_to_cpu(i->j.seq) && ++ !bch2_journal_seq_is_blacklisted(c, seq, false)) ++ seq++; ++ ++ missing_end = seq - 1; ++ fsck_err(c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", ++ missing_start, missing_end, ++ last_seq, *blacklist_seq - 1); ++ } ++ ++ seq++; ++ } ++ + list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; + struct bch_replicas_padded replicas; + char buf[80]; + ++ if (i->ignore) ++ continue; ++ + ret = jset_validate_entries(c, &i->j, READ); + if (ret) + goto fsck_err; +@@ -768,12 +872,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) + entries++; + } + +- if (!list_empty(list)) { +- i = list_last_entry(list, struct journal_replay, list); ++ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", ++ keys, entries, *start_seq); + +- bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", +- keys, entries, le64_to_cpu(i->j.seq)); +- } ++ if (*start_seq != *blacklist_seq) ++ bch_info(c, "dropped unflushed entries %llu-%llu", ++ *blacklist_seq, *start_seq - 1); + fsck_err: + return ret; + } +@@ -991,8 +1095,12 @@ static void journal_write_done(struct closure *cl) + j->seq_ondisk = seq; + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; +- j->last_seq_ondisk = last_seq; +- bch2_journal_space_available(j); ++ ++ if (!w->noflush) { ++ j->flushed_seq_ondisk = seq; ++ j->last_seq_ondisk = last_seq; ++ bch2_journal_space_available(j); ++ } + + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard +@@ -1068,6 +1176,22 @@ void bch2_journal_write(struct closure *cl) + + j->write_start_time = local_clock(); + ++ spin_lock(&j->lock); ++ if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && ++ !w->must_flush && ++ (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) && ++ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { ++ w->noflush = true; ++ SET_JSET_NO_FLUSH(jset, true); ++ jset->last_seq = cpu_to_le64(j->last_seq_ondisk); ++ ++ j->nr_noflush_writes++; ++ } else { ++ j->last_flush_write = jiffies; ++ j->nr_flush_writes++; ++ } ++ spin_unlock(&j->lock); ++ + /* + * New btree roots are set by journalling them; when the journal entry + * gets written we have to propagate them to c->btree_roots +@@ -1189,8 +1313,9 @@ retry_alloc: + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; +- bio_set_op_attrs(bio, REQ_OP_WRITE, +- REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); ++ bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; ++ if (!JSET_NO_FLUSH(jset)) ++ bio->bi_opf |= REQ_PREFLUSH|REQ_FUA; + bch2_bio_map(bio, jset, sectors << 9); + + trace_journal_write(bio); +@@ -1199,20 +1324,21 @@ retry_alloc: + ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); + } + +- for_each_rw_member(ca, c, i) +- if (journal_flushes_device(ca) && +- !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { +- percpu_ref_get(&ca->io_ref); +- +- bio = ca->journal.bio; +- bio_reset(bio); +- bio_set_dev(bio, ca->disk_sb.bdev); +- bio->bi_opf = REQ_OP_FLUSH; +- bio->bi_end_io = journal_write_endio; +- bio->bi_private = ca; +- closure_bio_submit(bio, cl); +- } +- ++ if (!JSET_NO_FLUSH(jset)) { ++ for_each_rw_member(ca, c, i) ++ if (journal_flushes_device(ca) && ++ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { ++ percpu_ref_get(&ca->io_ref); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_opf = REQ_OP_FLUSH; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ closure_bio_submit(bio, cl); ++ } ++ } + no_io: + bch2_bucket_seq_cleanup(c); + +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h 
+index 6958ee0f8cf2..6b4c80968f52 100644 +--- a/fs/bcachefs/journal_io.h ++++ b/fs/bcachefs/journal_io.h +@@ -11,6 +11,7 @@ struct journal_replay { + struct bch_devs_list devs; + /* checksum error, but we may want to try using it anyways: */ + bool bad; ++ bool ignore; + /* must be last: */ + struct jset j; + }; +@@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ + vstruct_for_each_safe(entry, k, _n) + +-int bch2_journal_read(struct bch_fs *, struct list_head *); ++int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); + + void bch2_journal_write(struct closure *); + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index a209159a475d..9d778306efc5 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -158,7 +158,7 @@ void bch2_journal_space_available(struct journal *j) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; +- unsigned clean; ++ unsigned clean, clean_ondisk, total; + unsigned overhead, u64s_remaining = 0; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); +@@ -204,13 +204,21 @@ void bch2_journal_space_available(struct journal *j) + for (i = 0; i < journal_space_nr; i++) + j->space[i] = __journal_space_available(j, nr_devs_want, i); + ++ clean_ondisk = j->space[journal_space_clean_ondisk].total; + clean = j->space[journal_space_clean].total; ++ total = j->space[journal_space_total].total; + + if (!j->space[journal_space_discarded].next_entry) + ret = cur_entry_journal_full; + else if (!fifo_free(&j->pin)) + ret = cur_entry_journal_pin_full; + ++ if ((clean - clean_ondisk <= total / 8) && ++ (clean_ondisk * 2 > clean )) ++ set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); ++ else ++ clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); ++ + overhead = DIV_ROUND_UP(clean, max_entry_size) * + journal_entry_overhead(j); + u64s_remaining = clean << 6; +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index d0f1bbf8f6a7..e1b63f3879f4 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -118,7 +118,7 @@ out_write_sb: + out: + mutex_unlock(&c->sb_lock); + +- return ret; ++ return ret ?: bch2_blacklist_table_initialize(c); + } + + static int journal_seq_blacklist_table_cmp(const void *_l, +@@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) + struct journal_seq_blacklist_table *t; + unsigned i, nr = blacklist_nr_entries(bl); + +- BUG_ON(c->journal_seq_blacklist_table); +- + if (!bl) + return 0; + +@@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) + journal_seq_blacklist_table_cmp, + NULL); + ++ kfree(c->journal_seq_blacklist_table); + c->journal_seq_blacklist_table = t; + return 0; + } +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index da420d227244..308b899b4214 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -29,6 +29,8 @@ struct journal_buf { + unsigned disk_sectors; /* maximum size entry could have been, if + buf_size was bigger */ + unsigned u64s_reserved; ++ bool noflush; /* write has already been kicked off, and was noflush */ ++ bool must_flush; /* something wants a flush */ + /* bloom filter: */ + unsigned long has_inode[1024 / sizeof(unsigned long)]; + }; +@@ -146,6 +148,7 @@ enum { + JOURNAL_RECLAIM_STARTED, + JOURNAL_NEED_WRITE, + 
JOURNAL_MAY_GET_UNRESERVED, ++ JOURNAL_MAY_SKIP_FLUSH, + }; + + /* Embedded in struct bch_fs */ +@@ -203,6 +206,7 @@ struct journal { + + /* seq, last_seq from the most recent journal entry successfully written */ + u64 seq_ondisk; ++ u64 flushed_seq_ondisk; + u64 last_seq_ondisk; + u64 err_seq; + u64 last_empty_seq; +@@ -252,11 +256,15 @@ struct journal { + + unsigned write_delay_ms; + unsigned reclaim_delay_ms; ++ unsigned long last_flush_write; + + u64 res_get_blocked_start; + u64 need_write_time; + u64 write_start_time; + ++ u64 nr_flush_writes; ++ u64 nr_noflush_writes; ++ + struct time_stats *write_time; + struct time_stats *delay_time; + struct time_stats *blocked_time; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 7ad5b8234747..ecd51d45743a 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys) + + static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + { +- struct journal_replay *p; ++ struct journal_replay *i; + struct jset_entry *entry; + struct bkey_i *k, *_n; + struct journal_keys keys = { NULL }; +@@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + if (list_empty(journal_entries)) + return keys; + +- keys.journal_seq_base = +- le64_to_cpu(list_last_entry(journal_entries, +- struct journal_replay, list)->j.last_seq); +- +- list_for_each_entry(p, journal_entries, list) { +- if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) ++ list_for_each_entry(i, journal_entries, list) { ++ if (i->ignore) + continue; + +- for_each_jset_key(k, _n, entry, &p->j) ++ if (!keys.journal_seq_base) ++ keys.journal_seq_base = le64_to_cpu(i->j.seq); ++ ++ for_each_jset_key(k, _n, entry, &i->j) + nr_keys++; + } + +- + keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); + if (!keys.d) + goto err; + +- list_for_each_entry(p, journal_entries, list) { +- if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) ++ list_for_each_entry(i, journal_entries, list) { ++ if (i->ignore) + continue; + +- for_each_jset_key(k, _n, entry, &p->j) ++ BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX); ++ ++ for_each_jset_key(k, _n, entry, &i->j) + keys.d[keys.nr++] = (struct journal_key) { + .btree_id = entry->btree_id, + .level = entry->level, + .k = k, +- .journal_seq = le64_to_cpu(p->j.seq) - ++ .journal_seq = le64_to_cpu(i->j.seq) - + keys.journal_seq_base, +- .journal_offset = k->_data - p->j._data, ++ .journal_offset = k->_data - i->j._data, + }; + } + +@@ -643,46 +643,6 @@ err: + return ret; + } + +-static bool journal_empty(struct list_head *journal) +-{ +- return list_empty(journal) || +- journal_entry_empty(&list_last_entry(journal, +- struct journal_replay, list)->j); +-} +- +-static int +-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, +- struct list_head *journal) +-{ +- struct journal_replay *i = +- list_last_entry(journal, struct journal_replay, list); +- u64 start_seq = le64_to_cpu(i->j.last_seq); +- u64 end_seq = le64_to_cpu(i->j.seq); +- u64 seq = start_seq; +- int ret = 0; +- +- list_for_each_entry(i, journal, list) { +- if (le64_to_cpu(i->j.seq) < start_seq) +- continue; +- +- fsck_err_on(seq != le64_to_cpu(i->j.seq), c, +- "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", +- seq, le64_to_cpu(i->j.seq) - 1, +- start_seq, end_seq); +- +- seq = le64_to_cpu(i->j.seq); +- +- fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, +- "found blacklisted journal entry %llu", seq); +- +- do { +- seq++; +- } while (bch2_journal_seq_is_blacklisted(c, seq, false)); +- } +-fsck_err: +- return ret; +-} +- + /* journal replay early: */ + + static int journal_replay_entry_early(struct bch_fs *c, +@@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct list_head *journal) + { ++ struct journal_replay *i; + struct jset_entry *entry; + int ret; + +@@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c, + return ret; + } + } else { +- struct journal_replay *i = +- list_last_entry(journal, struct journal_replay, list); ++ list_for_each_entry(i, journal, list) { ++ if (i->ignore) ++ continue; + +- c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); +- c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); ++ c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); ++ c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); + +- list_for_each_entry(i, journal, list) + vstruct_for_each(&i->j, entry) { + ret = journal_replay_entry_early(c, entry); + if (ret) + return ret; + } ++ } + } + + bch2_fs_usage_initialize(c); +@@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean *clean = *cleanp; + int ret = 0; + +- if (!c->sb.clean || !j) +- return 0; +- + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), +@@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c) + { + const char *err = "cannot allocate memory"; + struct bch_sb_field_clean *clean = NULL; +- u64 journal_seq; ++ struct jset *last_journal_entry = NULL; ++ u64 blacklist_seq, journal_seq; + bool write_sb = false, need_write_alloc = false; + int ret; + +@@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c) + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + ++ ret = bch2_blacklist_table_initialize(c); ++ if (ret) { ++ bch_err(c, "error initializing blacklist table"); ++ goto err; ++ } ++ + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { +- struct jset *j; ++ struct journal_replay *i; + +- ret = bch2_journal_read(c, &c->journal_entries); ++ ret = bch2_journal_read(c, &c->journal_entries, ++ &blacklist_seq, &journal_seq); + if (ret) + goto err; + +- if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, ++ list_for_each_entry_reverse(i, &c->journal_entries, list) ++ if (!i->ignore) { ++ last_journal_entry = &i->j; ++ break; ++ } ++ ++ if (mustfix_fsck_err_on(c->sb.clean && ++ last_journal_entry && ++ !journal_entry_empty(last_journal_entry), c, + "filesystem marked clean but journal not empty")) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + } + +- if (!c->sb.clean && list_empty(&c->journal_entries)) { +- bch_err(c, "no journal entries found"); +- ret = BCH_FSCK_REPAIR_IMPOSSIBLE; +- goto err; ++ if (!last_journal_entry) { ++ fsck_err_on(!c->sb.clean, c, "no journal entries found"); ++ goto use_clean; + } + + c->journal_keys = journal_keys_sort(&c->journal_entries); +@@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c) + goto err; + } + +- j = &list_last_entry(&c->journal_entries, +- struct journal_replay, 
list)->j; +- +- ret = verify_superblock_clean(c, &clean, j); +- if (ret) ++ if (c->sb.clean && last_journal_entry) { ++ ret = verify_superblock_clean(c, &clean, ++ last_journal_entry); ++ if (ret) ++ goto err; ++ } ++ } else { ++use_clean: ++ if (!clean) { ++ bch_err(c, "no superblock clean section found"); ++ ret = BCH_FSCK_REPAIR_IMPOSSIBLE; + goto err; + +- journal_seq = le64_to_cpu(j->seq) + 1; +- } else { +- journal_seq = le64_to_cpu(clean->journal_seq) + 1; ++ } ++ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + + if (!c->sb.clean && +@@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c) + if (ret) + goto err; + +- if (!c->sb.clean) { ++ /* ++ * After an unclean shutdown, skip then next few journal sequence ++ * numbers as they may have been referenced by btree writes that ++ * happened before their corresponding journal writes - those btree ++ * writes need to be ignored, by skipping and blacklisting the next few ++ * journal sequence numbers: ++ */ ++ if (!c->sb.clean) ++ journal_seq += 8; ++ ++ if (blacklist_seq != journal_seq) { + ret = bch2_journal_seq_blacklist_add(c, +- journal_seq, +- journal_seq + 8); ++ blacklist_seq, journal_seq); + if (ret) { + bch_err(c, "error creating new journal seq blacklist entry"); + goto err; + } +- +- journal_seq += 8; +- +- /* +- * The superblock needs to be written before we do any btree +- * node writes: it will be in the read_write() path +- */ +- } +- +- ret = bch2_blacklist_table_initialize(c); +- +- if (!list_empty(&c->journal_entries)) { +- ret = verify_journal_entries_not_blacklisted_or_missing(c, +- &c->journal_entries); +- if (ret) +- goto err; + } + + ret = bch2_fs_journal_start(&c->journal, journal_seq, +-- +cgit v1.2.3 + + +From 0857ef56b4b082f8adf8355037740ff071f49ad7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 5 Dec 2020 16:25:05 -0500 +Subject: bcachefs: Be more conservation about journal pre-reservations + + - Try to always keep 1/8th of the journal free, on top of + pre-reservations + - Move the check for whether the journal is stuck to + bch2_journal_space_available, and make it only fire when there aren't + any journal writes in flight (that might free up space by updating + last_seq) + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 25 ++++++++----------------- + fs/bcachefs/journal.h | 3 ++- + fs/bcachefs/journal_io.c | 3 ++- + fs/bcachefs/journal_reclaim.c | 35 +++++++++++++++++++++++++++-------- + fs/bcachefs/journal_types.h | 1 + + 5 files changed, 40 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 701521030c3d..d54424829378 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -443,20 +443,6 @@ unlock: + if (!ret) + goto retry; + +- if (WARN_ONCE(ret == cur_entry_journal_full && +- !can_discard && +- (flags & JOURNAL_RES_GET_RESERVED), +- "JOURNAL_RES_GET_RESERVED set but journal full")) { +- char *buf; +- +- buf = kmalloc(4096, GFP_NOFS); +- if (buf) { +- bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); +- pr_err("\n%s", buf); +- kfree(buf); +- } +- } +- + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: +@@ -1137,7 +1123,7 @@ out: + + /* debug: */ + +-void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ++void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union journal_res_state s; +@@ -1145,7 +1131,6 @@ void bch2_journal_debug_to_text(struct 
printbuf *out, struct journal *j) + unsigned i; + + rcu_read_lock(); +- spin_lock(&j->lock); + s = READ_ONCE(j->reservations); + + pr_buf(out, +@@ -1245,10 +1230,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + } + +- spin_unlock(&j->lock); + rcu_read_unlock(); + } + ++void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ++{ ++ spin_lock(&j->lock); ++ __bch2_journal_debug_to_text(out, j); ++ spin_unlock(&j->lock); ++} ++ + void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) + { + struct journal_entry_pin_list *pin_list; +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index a6ce03a724cb..1db1f190a168 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -384,7 +384,7 @@ out: + static inline bool journal_check_may_get_unreserved(struct journal *j) + { + union journal_preres_state s = READ_ONCE(j->prereserved); +- bool ret = s.reserved <= s.remaining && ++ bool ret = s.reserved < s.remaining && + fifo_free(&j->pin) > 8; + + lockdep_assert_held(&j->lock); +@@ -508,6 +508,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) + void bch2_journal_unblock(struct journal *); + void bch2_journal_block(struct journal *); + ++void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); + void bch2_journal_debug_to_text(struct printbuf *, struct journal *); + void bch2_journal_pins_to_text(struct printbuf *, struct journal *); + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index bb9a1936c24c..0e6fbe2f6a75 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1099,7 +1099,6 @@ static void journal_write_done(struct closure *cl) + if (!w->noflush) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = last_seq; +- bch2_journal_space_available(j); + } + + /* +@@ -1123,6 +1122,8 @@ static void journal_write_done(struct closure *cl) + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + ++ bch2_journal_space_available(j); ++ + closure_wake_up(&w->wait); + journal_wake(j); + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 9d778306efc5..0655acfd1b3e 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -2,6 +2,7 @@ + + #include "bcachefs.h" + #include "btree_key_cache.h" ++#include "error.h" + #include "journal.h" + #include "journal_io.h" + #include "journal_reclaim.h" +@@ -159,7 +160,7 @@ void bch2_journal_space_available(struct journal *j) + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned clean, clean_ondisk, total; +- unsigned overhead, u64s_remaining = 0; ++ s64 u64s_remaining = 0; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); + unsigned i, nr_online = 0, nr_devs_want; +@@ -208,22 +209,37 @@ void bch2_journal_space_available(struct journal *j) + clean = j->space[journal_space_clean].total; + total = j->space[journal_space_total].total; + +- if (!j->space[journal_space_discarded].next_entry) ++ if (!clean_ondisk && ++ j->reservations.idx == ++ j->reservations.unwritten_idx) { ++ char *buf = kmalloc(4096, GFP_ATOMIC); ++ ++ bch_err(c, "journal stuck"); ++ if (buf) { ++ __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); ++ pr_err("\n%s", buf); ++ kfree(buf); ++ } ++ ++ bch2_fatal_error(c); ++ ret = cur_entry_journal_stuck; ++ } else if (!j->space[journal_space_discarded].next_entry) + ret = 
cur_entry_journal_full; + else if (!fifo_free(&j->pin)) + ret = cur_entry_journal_pin_full; + +- if ((clean - clean_ondisk <= total / 8) && ++ if ((j->space[journal_space_clean_ondisk].next_entry < ++ j->space[journal_space_clean_ondisk].total) && ++ (clean - clean_ondisk <= total / 8) && + (clean_ondisk * 2 > clean )) + set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + else + clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + +- overhead = DIV_ROUND_UP(clean, max_entry_size) * +- journal_entry_overhead(j); +- u64s_remaining = clean << 6; +- u64s_remaining = max_t(int, 0, u64s_remaining - overhead); +- u64s_remaining /= 4; ++ u64s_remaining = (u64) clean << 6; ++ u64s_remaining -= (u64) total << 3; ++ u64s_remaining = max(0LL, u64s_remaining); ++ u64s_remaining /= 2; + out: + j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_error = ret; +@@ -572,6 +588,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + c->btree_cache.used * 3) + min_nr = 1; + ++ if (fifo_free(&j->pin) <= 32) ++ min_nr = 1; ++ + min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c)); + + trace_journal_reclaim_start(c, +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 308b899b4214..67ee47eb17a7 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -172,6 +172,7 @@ struct journal { + cur_entry_blocked, + cur_entry_journal_full, + cur_entry_journal_pin_full, ++ cur_entry_journal_stuck, + cur_entry_insufficient_devices, + } cur_entry_error; + +-- +cgit v1.2.3 + + +From 81c98237df5bfcb0b8f3d4ec6e3e6b2e15040ae8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 5 Dec 2020 21:03:57 -0500 +Subject: bcachefs: Fix btree key cache dirty checks + +Had a type that meant we were triggering journal reclaim _much_ more +aggressively than needed. Also, fix a potential integer overflow. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.h | 6 +++--- + fs/bcachefs/journal_reclaim.c | 1 + + 2 files changed, 4 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index d7d31a0662c3..dad3e344dcf9 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -4,8 +4,8 @@ + static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) + { + size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); +- size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_dirty); +- size_t max_dirty = 4096 + nr_keys / 2; ++ size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); ++ size_t max_dirty = 1024 + nr_keys / 2; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); + } +@@ -13,7 +13,7 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) + static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) + { + size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); +- size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_dirty); ++ size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + + return nr_dirty > max_dirty; +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 0655acfd1b3e..2319a2b6528e 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -240,6 +240,7 @@ void bch2_journal_space_available(struct journal *j) + u64s_remaining -= (u64) total << 3; + u64s_remaining = max(0LL, u64s_remaining); + u64s_remaining /= 2; ++ u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); + out: + j->cur_entry_sectors = !ret ? 
j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_error = ret; +-- +cgit v1.2.3 + + +From 1033b2a21fdaea15a25a3c3b76622a3641bf463c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Dec 2020 16:29:13 -0500 +Subject: bcachefs: Prevent journal reclaim from spinning + +Without checking if we actually flushed anything, journal reclaim could +still go into an infinite loop while trying ot shut down. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 2319a2b6528e..b77d4e7f42d6 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -610,7 +610,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + else + j->nr_background_reclaim += nr_flushed; + trace_journal_reclaim_finish(c, nr_flushed); +- } while (min_nr); ++ } while (min_nr && nr_flushed); + + memalloc_noreclaim_restore(flags); + +-- +cgit v1.2.3 + + +From 1e3337b9ebd7871c293581a1cbf2de6ef0f054ae Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Dec 2020 16:30:02 -0500 +Subject: bcachefs: Try to print full btree error message + +Metadata corruption bugs are hard to debug if we can't see exactly what +went wrong - try to allocate a bigger buffer so we can print out +everything we have. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 15 +++++++++++---- + fs/bcachefs/btree_iter.c | 14 ++++++++++++-- + 2 files changed, 23 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 9b19432ae7a5..4dde972d353a 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -635,21 +635,26 @@ enum btree_validate_ret { + ({ \ + __label__ out; \ + char _buf[300]; \ ++ char *buf2 = _buf; \ + struct printbuf out = PBUF(_buf); \ + \ ++ buf2 = kmalloc(4096, GFP_ATOMIC); \ ++ if (buf2) \ ++ out = _PBUF(buf2, 4986); \ ++ \ + btree_err_msg(&out, c, b, i, b->written, write); \ + pr_buf(&out, ": " msg, ##__VA_ARGS__); \ + \ + if (type == BTREE_ERR_FIXABLE && \ + write == READ && \ + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ +- mustfix_fsck_err(c, "%s", _buf); \ ++ mustfix_fsck_err(c, "%s", buf2); \ + goto out; \ + } \ + \ + switch (write) { \ + case READ: \ +- bch_err(c, "%s", _buf); \ ++ bch_err(c, "%s", buf2); \ + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ +@@ -670,7 +675,7 @@ enum btree_validate_ret { + } \ + break; \ + case WRITE: \ +- bch_err(c, "corrupt metadata before write: %s", _buf); \ ++ bch_err(c, "corrupt metadata before write: %s", buf2); \ + \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ +@@ -679,6 +684,8 @@ enum btree_validate_ret { + break; \ + } \ + out: \ ++ if (buf2 != _buf) \ ++ kfree(buf2); \ + true; \ + }) + +@@ -844,7 +851,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + + bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, b, i, +- "invalid bkey:\n%s\n%s", invalid, buf); ++ "invalid bkey: %s\n%s", invalid, buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 7a95fcc0b244..bf9ba9d2daed 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -875,9 +875,19 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) + char buf[100]; + struct bkey uk = bkey_unpack_key(b, k); + ++ 
bch2_dump_btree_node(iter->trans->c, l->b); + bch2_bkey_to_text(&PBUF(buf), &uk); +- panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", +- buf, b->key.k.p.inode, b->key.k.p.offset); ++ panic("parent iter doesn't point to new node:\n" ++ "iter pos %s %llu:%llu\n" ++ "iter key %s\n" ++ "new node %llu:%llu-%llu:%llu\n", ++ bch2_btree_ids[iter->btree_id], ++ iter->pos.inode, ++ iter->pos.offset, ++ buf, ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ b->key.k.p.inode, b->key.k.p.offset); + } + + if (!parent_locked) +-- +cgit v1.2.3 + + +From db794bd734a17805eefb563c031dcfc8905cc325 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 7 Dec 2020 11:44:12 -0500 +Subject: bcachefs: Fix rand_delete() test + +When we didn't find a key to delete we were getting a null ptr deref. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/tests.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 5f40b048dd0d..f1d09e3ada09 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -563,15 +563,14 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) + + iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, + BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(iter); +- if (ret) +- goto err; +- + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) + goto err; + ++ if (!k.k) ++ goto err; ++ + bkey_init(&delete.k); + delete.k.p = k.k->p; + +-- +cgit v1.2.3 + + +From e15f6fcc966b7c7d91949ca957df262e809369c9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 9 Dec 2020 13:34:42 -0500 +Subject: bcachefs: Fix __btree_iter_next() when all iters are in use_next() + when all iters are in use + +Also, print out more information on btree transaction iterator overflow. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 11 ++++++++++- + fs/bcachefs/btree_iter.h | 8 +++++++- + 2 files changed, 17 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index bf9ba9d2daed..21253be5aab6 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2017,9 +2017,10 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) + { + + struct btree_iter *iter; ++ struct btree_insert_entry *i; + + trans_for_each_iter(trans, iter) +- pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", ++ printk(KERN_ERR "iter: btree %s pos %llu:%llu%s%s%s %ps\n", + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, +@@ -2027,6 +2028,14 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", + (void *) iter->ip_allocated); ++ ++ trans_for_each_update(trans, i) { ++ char buf[300]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)); ++ printk(KERN_ERR "update: btree %s %s\n", ++ bch2_btree_ids[i->iter->btree_id], buf); ++ } + panic("trans iter oveflow\n"); + } + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index ee8c4346aadb..9a7f8d0197ec 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -51,11 +51,17 @@ static inline int btree_iter_err(const struct btree_iter *iter) + static inline struct btree_iter * + __trans_next_iter(struct btree_trans *trans, unsigned idx) + { +- u64 l = trans->iters_linked >> idx; ++ u64 l; ++ ++ if (idx == BTREE_ITER_MAX) ++ return NULL; ++ ++ l = trans->iters_linked >> idx; + if (!l) + return NULL; + + idx += __ffs64(l); ++ EBUG_ON(idx >= BTREE_ITER_MAX); + EBUG_ON(trans->iters[idx].idx != idx); + return &trans->iters[idx]; + } +-- +cgit v1.2.3 + + +From 48ba6280477b3677945faade2c7355582f4ef15d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 9 Dec 2020 13:39:30 -0500 +Subject: bcachefs: Only try to get existing stripe once in stripe create path + +The stripe creation path was too state-machiney: it would always run the +full state machine until it had succesfully created a new stripe. + +But if we tried to get and reuse an existing stripe after we'd already +allocated some buckets, the buckets we'd allocated might have conflicted +with the blocks in the existing stripe we need to keep - oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 138df875f750..2a6749682dc5 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -874,7 +874,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) + for_each_keylist_key(&s->keys, k) { + ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); + if (ret) { +- bch_err(c, "error creating stripe: error updating pointers"); ++ bch_err(c, "error creating stripe: error %i updating pointers", ret); + break; + } + } +@@ -1341,16 +1341,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + if (!h) + return NULL; + +- if (!h->s && ec_new_stripe_alloc(c, h)) { +- bch2_ec_stripe_head_put(c, h); +- return NULL; +- } +- +- if (!h->s->allocated) { +- if (!h->s->existing_stripe && +- (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { +- //pr_info("got existing stripe %llu", idx); ++ if (!h->s) { ++ if (ec_new_stripe_alloc(c, h)) { ++ bch2_ec_stripe_head_put(c, h); ++ return NULL; ++ } + ++ idx = get_existing_stripe(c, target, algo, redundancy); ++ if (idx >= 0) { + h->s->existing_stripe = true; + h->s->existing_stripe_idx = idx; + if (get_stripe_key(c, idx, &h->s->stripe)) { +@@ -1364,7 +1362,9 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + ec_block_io(c, &h->s->stripe, READ, i, &cl); + } + } ++ } + ++ if (!h->s->allocated) { + if (!h->s->existing_stripe && + !h->s->res.sectors) { + ret = bch2_disk_reservation_get(c, &h->s->res, +-- +cgit v1.2.3 + + +From 554aaa19e8a04d74bb87f4584e98d40d7bfff294 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Dec 2020 13:13:56 -0500 +Subject: bcachefs: Update transactional triggers interface to pass old & new + keys + +This is needed to fix a bug where we're overflowing iterators within a +btree transaction, because we're updating the stripes btree (to update +block counts) and the stripes 
btree trigger is unnecessarily updating +the alloc btree - it doesn't need to update the alloc btree when the +pointers within a stripe aren't changing. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 8 +- + fs/bcachefs/buckets.c | 256 ++++++++++++++++++++++-------------- + fs/bcachefs/buckets.h | 2 +- + fs/bcachefs/recovery.c | 8 +- + 4 files changed, 172 insertions(+), 102 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 4a169d366538..8f96756ba648 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -519,14 +519,18 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, + trans->journal_pin = &as->journal; + + for_each_keylist_key(&as->new_keys, k) { +- ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), ++ ret = bch2_trans_mark_key(trans, ++ bkey_s_c_null, ++ bkey_i_to_s_c(k), + 0, 0, BTREE_TRIGGER_INSERT); + if (ret) + return ret; + } + + for_each_keylist_key(&as->old_keys, k) { +- ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), ++ ret = bch2_trans_mark_key(trans, ++ bkey_i_to_s_c(k), ++ bkey_s_c_null, + 0, 0, BTREE_TRIGGER_OVERWRITE); + if (ret) + return ret; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 0000fc76d2d9..1934b845ea15 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1334,10 +1334,8 @@ static int bch2_mark_key_locked(struct bch_fs *c, + ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_inode: +- if (!(flags & BTREE_TRIGGER_OVERWRITE)) +- fs_usage->nr_inodes++; +- else +- fs_usage->nr_inodes--; ++ fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; ++ fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; + break; + case KEY_TYPE_reservation: { + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; +@@ -1401,10 +1399,10 @@ int bch2_mark_update(struct btree_trans *trans, + old = (struct bkey_s_c) { &unpacked, NULL }; + + if (!btree_node_type_is_extents(iter->btree_id)) { ++ /* iterators should be uptodate, shouldn't get errors here: */ + if (btree_iter_type(iter) != BTREE_ITER_CACHED) { +- _old = bch2_btree_node_iter_peek(&node_iter, b); +- if (_old) +- old = bkey_disassemble(b, _old, &unpacked); ++ old = bch2_btree_iter_peek_slot(iter); ++ BUG_ON(bkey_err(old)); + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; + +@@ -1749,59 +1747,92 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + return 0; + } + ++static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, ++ const struct bch_extent_ptr *ptr, ++ s64 sectors, bool parity) ++{ ++ struct bkey_i_alloc *a; ++ struct btree_iter *iter; ++ struct bkey_alloc_unpacked u; ++ int ret; ++ ++ ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); ++ if (ret) ++ return ret; ++ ++ if (parity) { ++ u.dirty_sectors += sectors; ++ u.data_type = u.dirty_sectors ++ ? 
BCH_DATA_parity ++ : 0; ++ } ++ ++ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto err; ++ ++ bkey_alloc_init(&a->k_i); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, u); ++ bch2_trans_update(trans, iter, &a->k_i, 0); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ + static int bch2_trans_mark_stripe(struct btree_trans *trans, +- struct bkey_s_c k, ++ struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) + { +- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; +- unsigned nr_data = s->nr_blocks - s->nr_redundant; ++ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ++ ? bkey_s_c_to_stripe(old).v : NULL; ++ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ++ ? bkey_s_c_to_stripe(new).v : NULL; + struct bch_replicas_padded r; +- struct bkey_alloc_unpacked u; +- struct bkey_i_alloc *a; +- struct btree_iter *iter; +- bool deleting = flags & BTREE_TRIGGER_OVERWRITE; +- s64 sectors = le16_to_cpu(s->sectors); + unsigned i; + int ret = 0; + +- if (deleting) +- sectors = -sectors; +- +- bch2_bkey_to_replicas(&r.e, k); +- update_replicas_list(trans, &r.e, sectors * s->nr_redundant); +- + /* +- * The allocator code doesn't necessarily update bucket gens in the +- * btree when incrementing them, right before handing out new buckets - +- * we just need to persist those updates here along with the new stripe: ++ * If the pointers aren't changing, we don't need to do anything: + */ ++ if (new_s && old_s && ++ !memcmp(old_s->ptrs, new_s->ptrs, ++ new_s->nr_blocks * sizeof(struct bch_extent_ptr))) ++ return 0; + +- for (i = 0; i < s->nr_blocks && !ret; i++) { +- bool parity = i >= nr_data; ++ if (new_s) { ++ unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant; ++ s64 sectors = le16_to_cpu(new_s->sectors); + +- ret = bch2_trans_start_alloc_update(trans, &iter, +- &s->ptrs[i], &u); +- if (ret) +- break; ++ bch2_bkey_to_replicas(&r.e, new); ++ update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + +- if (parity) { +- u.dirty_sectors += sectors; +- u.data_type = u.dirty_sectors +- ? 
BCH_DATA_parity +- : 0; ++ for (i = 0; i < new_s->nr_blocks; i++) { ++ bool parity = i >= nr_data; ++ ++ ret = bch2_trans_mark_stripe_alloc_ref(trans, ++ &new_s->ptrs[i], sectors, parity); ++ if (ret) ++ return ret; + } ++ } + +- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); +- ret = PTR_ERR_OR_ZERO(a); +- if (ret) +- goto put_iter; +- +- bkey_alloc_init(&a->k_i); +- a->k.p = iter->pos; +- bch2_alloc_pack(a, u); +- bch2_trans_update(trans, iter, &a->k_i, 0); +-put_iter: +- bch2_trans_iter_put(trans, iter); ++ if (old_s) { ++ unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant; ++ s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); ++ ++ bch2_bkey_to_replicas(&r.e, old); ++ update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); ++ ++ for (i = 0; i < old_s->nr_blocks; i++) { ++ bool parity = i >= nr_data; ++ ++ ret = bch2_trans_mark_stripe_alloc_ref(trans, ++ &old_s->ptrs[i], sectors, parity); ++ if (ret) ++ return ret; ++ } + } + + return ret; +@@ -1900,11 +1931,16 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + return ret; + } + +-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, ++int bch2_trans_mark_key(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_s_c new, + unsigned offset, s64 sectors, unsigned flags) + { +- struct replicas_delta_list *d; + struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; ++ struct replicas_delta_list *d; ++ ++ BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: +@@ -1920,15 +1956,18 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + return bch2_trans_mark_extent(trans, k, offset, sectors, + flags, BCH_DATA_user); + case KEY_TYPE_stripe: +- return bch2_trans_mark_stripe(trans, k, flags); +- case KEY_TYPE_inode: +- d = replicas_deltas_realloc(trans, 0); ++ return bch2_trans_mark_stripe(trans, old, new, flags); ++ case KEY_TYPE_inode: { ++ int nr = (new.k->type == KEY_TYPE_inode) - ++ (old.k->type == KEY_TYPE_inode); ++ ++ if (nr) { ++ d = replicas_deltas_realloc(trans, 0); ++ d->nr_inodes += nr; ++ } + +- if (!(flags & BTREE_TRIGGER_OVERWRITE)) +- d->nr_inodes++; +- else +- d->nr_inodes--; + return 0; ++ } + case KEY_TYPE_reservation: { + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + +@@ -1952,12 +1991,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + + int bch2_trans_mark_update(struct btree_trans *trans, + struct btree_iter *iter, +- struct bkey_i *insert, ++ struct bkey_i *new, + unsigned flags) + { +- struct btree *b = iter_l(iter)->b; +- struct btree_node_iter node_iter = iter_l(iter)->iter; +- struct bkey_packed *_k; ++ struct bkey_s_c old; + int ret; + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) +@@ -1966,68 +2003,93 @@ int bch2_trans_mark_update(struct btree_trans *trans, + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + +- ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), +- 0, insert->k.size, BTREE_TRIGGER_INSERT); +- if (ret) +- return ret; +- +- if (btree_iter_type(iter) == BTREE_ITER_CACHED) { +- struct bkey_cached *ck = (void *) iter->l[0].b; ++ if (!btree_node_type_is_extents(iter->btree_id)) { ++ /* iterators should be uptodate, shouldn't get errors here: */ ++ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { ++ old = bch2_btree_iter_peek_slot(iter); ++ BUG_ON(bkey_err(old)); ++ } else { ++ struct bkey_cached *ck = (void *) iter->l[0].b; + +- return 
bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), +- 0, 0, BTREE_TRIGGER_OVERWRITE); +- } ++ BUG_ON(!ck->valid); ++ old = bkey_i_to_s_c(ck->k); ++ } + +- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ if (old.k->type == new->k.type) { ++ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ } else { ++ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, ++ BTREE_TRIGGER_INSERT|flags) ?: ++ bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ } else { ++ struct btree *b = iter_l(iter)->b; ++ struct btree_node_iter node_iter = iter_l(iter)->iter; ++ struct bkey_packed *_old; + struct bkey unpacked; +- struct bkey_s_c k; +- unsigned offset = 0; +- s64 sectors = 0; +- unsigned flags = BTREE_TRIGGER_OVERWRITE; + +- k = bkey_disassemble(b, _k, &unpacked); ++ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + +- if (btree_node_is_extents(b) +- ? bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 +- : bkey_cmp(insert->k.p, k.k->p)) +- break; ++ bkey_init(&unpacked); ++ old = (struct bkey_s_c) { &unpacked, NULL }; ++ ++ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), ++ 0, new->k.size, ++ BTREE_TRIGGER_INSERT); ++ if (ret) ++ return ret; ++ ++ while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { ++ unsigned flags = BTREE_TRIGGER_OVERWRITE; ++ unsigned offset = 0; ++ s64 sectors; ++ ++ old = bkey_disassemble(b, _old, &unpacked); ++ sectors = -((s64) old.k->size); ++ ++ flags |= BTREE_TRIGGER_OVERWRITE; ++ ++ if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) ++ return 0; + +- if (btree_node_is_extents(b)) { +- switch (bch2_extent_overlap(&insert->k, k.k)) { ++ switch (bch2_extent_overlap(&new->k, old.k)) { + case BCH_EXTENT_OVERLAP_ALL: + offset = 0; +- sectors = -((s64) k.k->size); ++ sectors = -((s64) old.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: +- offset = bkey_start_offset(&insert->k) - +- bkey_start_offset(k.k); +- sectors = bkey_start_offset(&insert->k) - +- k.k->p.offset; ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = bkey_start_offset(&new->k) - ++ old.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + offset = 0; +- sectors = bkey_start_offset(k.k) - +- insert->k.p.offset; ++ sectors = bkey_start_offset(old.k) - ++ new->k.p.offset; + break; + case BCH_EXTENT_OVERLAP_MIDDLE: +- offset = bkey_start_offset(&insert->k) - +- bkey_start_offset(k.k); +- sectors = -((s64) insert->k.size); ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = -((s64) new->k.size); + flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; + break; + } + + BUG_ON(sectors >= 0); +- } + +- ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); +- if (ret) +- return ret; ++ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), ++ offset, sectors, flags); ++ if (ret) ++ return ret; + +- bch2_btree_node_iter_advance(&node_iter, b); ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } + } + +- return 0; ++ return ret; + } + + /* Disk reservations: */ +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 653f6761862e..8436d9610c86 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -270,7 +270,7 @@ int bch2_mark_update(struct btree_trans *, struct btree_iter *, + int bch2_replicas_delta_list_apply(struct bch_fs *, + struct bch_fs_usage *, + struct replicas_delta_list *); +-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, ++int 
bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, + unsigned, s64, unsigned); + int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, + struct bkey_i *insert, unsigned); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index ecd51d45743a..1883a1faf380 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -458,7 +458,9 @@ retry: + bch2_btree_iter_set_pos(iter, split->k.p); + + if (remark) { +- ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), ++ ret = bch2_trans_mark_key(&trans, ++ bkey_s_c_null, ++ bkey_i_to_s_c(split), + 0, split->k.size, + BTREE_TRIGGER_INSERT); + if (ret) +@@ -467,7 +469,9 @@ retry: + } while (bkey_cmp(iter->pos, k->k.p) < 0); + + if (remark) { +- ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), ++ ret = bch2_trans_mark_key(&trans, ++ bkey_i_to_s_c(k), ++ bkey_s_c_null, + 0, -((s64) k->k.size), + BTREE_TRIGGER_OVERWRITE); + if (ret) +-- +cgit v1.2.3 + + +From f83cfe13255d77d4a0b2c42a5a2ca3a99b9bfc55 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Dec 2020 13:38:54 -0500 +Subject: bcachefs: Always check if we need disk res in extent update path + +With erasure coding, we now have processes in the background that +compact data, causing it to take up less space on disk than when it was +written, or potentially when it was read. + +This means that we can't trust the page cache when it says "we have data +on disk taking up x amount of space here" - there's always the potential +to race with background compaction. + +To fix this, just check if we need to add to our disk reservation in the +bch2_extent_update() path, in the transaction that will do the btree +update. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 59 +++++++++++++++++++++++++++++++++----------------------- + 1 file changed, 35 insertions(+), 24 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index a1c17512bd86..106e6e56a7ba 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -185,34 +185,33 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + static int sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, +- bool may_allocate, + bool *maybe_extending, +- s64 *delta) ++ s64 *i_sectors_delta, ++ s64 *disk_sectors_delta) + { + struct btree_iter *iter; + struct bkey_s_c old; + int ret = 0; + +- *maybe_extending = true; +- *delta = 0; ++ *maybe_extending = true; ++ *i_sectors_delta = 0; ++ *disk_sectors_delta = 0; + + iter = bch2_trans_copy_iter(trans, extent_iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { +- if (!may_allocate && +- bch2_bkey_nr_ptrs_fully_allocated(old) < +- bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { +- ret = -ENOSPC; +- break; +- } ++ s64 sectors = min(new->k.p.offset, old.k->p.offset) - ++ max(bkey_start_offset(&new->k), ++ bkey_start_offset(old.k)); + +- *delta += (min(new->k.p.offset, +- old.k->p.offset) - +- max(bkey_start_offset(&new->k), +- bkey_start_offset(old.k))) * ++ *i_sectors_delta += sectors * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + ++ *disk_sectors_delta += sectors * ++ (int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) - ++ bch2_bkey_nr_ptrs_fully_allocated(old)); ++ + if (bkey_cmp(old.k->p, new->k.p) >= 0) { + /* + * Check if there's already data above where we're +@@ -246,12 +245,12 @@ int bch2_extent_update(struct btree_trans *trans, + struct disk_reservation *disk_res, + u64 *journal_seq, + u64 
new_i_size, +- s64 *i_sectors_delta) ++ s64 *i_sectors_delta_total) + { + /* this must live until after bch2_trans_commit(): */ + struct bkey_inode_buf inode_p; + bool extending = false; +- s64 delta = 0; ++ s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + + ret = bch2_extent_trim_atomic(k, iter); +@@ -259,16 +258,26 @@ int bch2_extent_update(struct btree_trans *trans, + return ret; + + ret = sum_sector_overwrites(trans, iter, k, +- disk_res && disk_res->sectors != 0, +- &extending, &delta); ++ &extending, ++ &i_sectors_delta, ++ &disk_sectors_delta); + if (ret) + return ret; + ++ if (disk_res && ++ disk_sectors_delta > (s64) disk_res->sectors) { ++ ret = bch2_disk_reservation_add(trans->c, disk_res, ++ disk_sectors_delta - disk_res->sectors, ++ 0); ++ if (ret) ++ return ret; ++ } ++ + new_i_size = extending + ? min(k->k.p.offset << 9, new_i_size) + : 0; + +- if (delta || new_i_size) { ++ if (i_sectors_delta || new_i_size) { + struct btree_iter *inode_iter; + struct bch_inode_unpacked inode_u; + +@@ -295,9 +304,9 @@ int bch2_extent_update(struct btree_trans *trans, + else + new_i_size = 0; + +- inode_u.bi_sectors += delta; ++ inode_u.bi_sectors += i_sectors_delta; + +- if (delta || new_i_size) { ++ if (i_sectors_delta || new_i_size) { + bch2_inode_pack(trans->c, &inode_p, &inode_u); + bch2_trans_update(trans, inode_iter, + &inode_p.inode.k_i, 0); +@@ -312,10 +321,12 @@ int bch2_extent_update(struct btree_trans *trans, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE); +- if (!ret && i_sectors_delta) +- *i_sectors_delta += delta; ++ if (ret) ++ return ret; + +- return ret; ++ if (i_sectors_delta_total) ++ *i_sectors_delta_total += i_sectors_delta; ++ return 0; + } + + int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, +-- +cgit v1.2.3 + + +From cfa4bf3a912bff5535f797e640f49ef79719351b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 11 Dec 2020 12:02:48 -0500 +Subject: bcachefs: Fix btree node merge -> split operations + +If a btree node merger is followed by a split or compact of the parent +node, we could end up with the parent btree node iterator pointing to +the whiteout inserted by the btree node merge operation - the fix is to +ensure that interior btree node iterators always point to the first non +whiteout. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 21253be5aab6..8c35e39ea97f 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -902,6 +902,13 @@ static inline void __btree_iter_init(struct btree_iter *iter, + + bch2_btree_node_iter_init(&l->iter, l->b, &pos); + ++ /* ++ * Iterators to interior nodes should always be pointed at the first non ++ * whiteout: ++ */ ++ if (level) ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } + +-- +cgit v1.2.3 + + +From 574f9f4a87d194acfdb008e3de3ec619d5269e9e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Dec 2020 16:12:04 -0500 +Subject: bcachefs: Add some cond_rescheds() in shutdown path + +Particularly on emergency shutdown we can end up having to clean up a +lot of dirty cached btree keys here. 
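+
+As an illustrative sketch (not part of the hunk below), the pattern is just
+to yield inside each potentially long list walk, so tearing down a huge key
+cache can't hog the CPU:
+
+  list_for_each_entry_safe(ck, n, &bc->clean, list) {
+          cond_resched();       /* list may hold a huge number of keys */
+          /* ...drop journal pin/preres and free ck as before... */
+  }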
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 244c5dbcd3e9..1a557b753bc1 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -580,6 +580,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) + list_splice(&bc->dirty, &bc->clean); + + list_for_each_entry_safe(ck, n, &bc->clean, list) { ++ cond_resched(); ++ + bch2_journal_pin_drop(&c->journal, &ck->journal); + bch2_journal_preres_put(&c->journal, &ck->res); + +@@ -593,6 +595,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) + BUG_ON(bc->nr_keys); + + list_for_each_entry_safe(ck, n, &bc->freed, list) { ++ cond_resched(); ++ + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); + } +-- +cgit v1.2.3 + + +From ef56c54a93f0ae34ae138bc4c883649e6f4d9fc9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 16 Dec 2020 14:18:33 -0500 +Subject: bcachefs: Check for duplicate device ptrs in bch2_bkey_ptrs_invalid() + +This is something we clearly should be checking for, but weren't - +oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 8 ++++++++ + fs/bcachefs/replicas.c | 5 ----- + fs/bcachefs/util.h | 5 +++++ + 3 files changed, 13 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 7fae6a4ba26f..828ccf07da61 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -1046,11 +1046,13 @@ static const char *extent_ptr_invalid(const struct bch_fs *c, + const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_devs_list devs; + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + unsigned size_ondisk = k.k->size; + const char *reason; + unsigned nonce = UINT_MAX; ++ unsigned i; + + if (k.k->type == KEY_TYPE_btree_ptr) + size_ondisk = c->opts.btree_node_size; +@@ -1101,6 +1103,12 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) + } + } + ++ devs = bch2_bkey_devs(k); ++ bubble_sort(devs.devs, devs.nr, u8_cmp); ++ for (i = 0; i + 1 < devs.nr; i++) ++ if (devs.devs[i] == devs.devs[i + 1]) ++ return "multiple ptrs to same device"; ++ + return NULL; + } + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 00a197b65e0b..d37d173f3ba6 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -11,11 +11,6 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, + + /* Replicas tracking - in memory: */ + +-static inline int u8_cmp(u8 l, u8 r) +-{ +- return cmp_int(l, r); +-} +- + static void verify_replicas_entry(struct bch_replicas_entry *e) + { + #ifdef CONFIG_BCACHEFS_DEBUG +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 192e2fd94689..2cf8568e630b 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -746,4 +746,9 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); + + #define cmp_int(l, r) ((l > r) - (l < r)) + ++static inline int u8_cmp(u8 l, u8 r) ++{ ++ return cmp_int(l, r); ++} ++ + #endif /* _BCACHEFS_UTIL_H */ +-- +cgit v1.2.3 + + +From 3ee7c67da03992ecf22dcd199e6b355eb2a8d55b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 16 Dec 2020 14:23:27 -0500 +Subject: bcachefs: Add BCH_BKEY_PTRS_MAX + +This now means "the maximum number of pointers within a bkey" - and +bch_devs_list is updated to use it instead of BCH_REPLICAS_MAX, since +stripes 
can contain more than BCH_REPLICAS_MAX pointers. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 2 ++ + fs/bcachefs/ec.c | 8 ++++---- + fs/bcachefs/ec.h | 8 ++++---- + fs/bcachefs/ec_types.h | 6 ++---- + fs/bcachefs/super_types.h | 2 +- + 5 files changed, 13 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 02a76c3d3acb..9f59c6b3a25e 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1359,6 +1359,8 @@ enum bch_sb_compat { + + #define BCH_REPLICAS_MAX 4U + ++#define BCH_BKEY_PTRS_MAX 16U ++ + enum bch_error_actions { + BCH_ON_ERROR_CONTINUE = 0, + BCH_ON_ERROR_RO = 1, +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 2a6749682dc5..eea9660971d4 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -300,7 +300,7 @@ static unsigned ec_nr_failed(struct ec_stripe_buf *buf) + static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) + { + struct bch_stripe *v = &buf->key.v; +- unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; ++ unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = buf->size << 9; + +@@ -1101,7 +1101,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) + s->c = c; + s->h = h; + s->nr_data = min_t(unsigned, h->nr_active_devs, +- EC_STRIPE_MAX) - h->redundancy; ++ BCH_BKEY_PTRS_MAX) - h->redundancy; + s->nr_parity = h->redundancy; + + bch2_keylist_init(&s->keys, s->inline_keys); +@@ -1211,13 +1211,13 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) + struct open_bucket *ob; + unsigned i, nr_have, nr_data = + min_t(unsigned, h->nr_active_devs, +- EC_STRIPE_MAX) - h->redundancy; ++ BCH_BKEY_PTRS_MAX) - h->redundancy; + bool have_cache = true; + int ret = 0; + + devs = h->devs; + +- for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { ++ for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) { + __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); + --nr_data; + } +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 15f751fc2a35..450bb1a113a3 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -71,9 +71,9 @@ struct ec_stripe_buf { + /* might not be buffering the entire stripe: */ + unsigned offset; + unsigned size; +- unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; ++ unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + +- void *data[EC_STRIPE_MAX]; ++ void *data[BCH_BKEY_PTRS_MAX]; + + union { + struct bkey_i_stripe key; +@@ -101,10 +101,10 @@ struct ec_stripe_new { + bool existing_stripe; + u64 existing_stripe_idx; + +- unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; ++ unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + + struct open_buckets blocks; +- u8 data_block_idx[EC_STRIPE_MAX]; ++ u8 data_block_idx[BCH_BKEY_PTRS_MAX]; + struct open_buckets parity; + struct disk_reservation res; + +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +index e4d633fca5bf..5b688b4394f7 100644 +--- a/fs/bcachefs/ec_types.h ++++ b/fs/bcachefs/ec_types.h +@@ -4,11 +4,9 @@ + + #include + +-#define EC_STRIPE_MAX 16 +- + struct bch_replicas_padded { + struct bch_replicas_entry e; +- u8 pad[EC_STRIPE_MAX]; ++ u8 pad[BCH_BKEY_PTRS_MAX]; + }; + + struct stripe { +@@ -24,7 +22,7 @@ struct stripe { + unsigned dirty:1; + unsigned on_heap:1; + u8 blocks_nonempty; +- u16 block_sectors[EC_STRIPE_MAX]; ++ u16 block_sectors[BCH_BKEY_PTRS_MAX]; + + struct 
bch_replicas_padded r; + }; +diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h +index 20406ebd6f5b..069973a38f12 100644 +--- a/fs/bcachefs/super_types.h ++++ b/fs/bcachefs/super_types.h +@@ -20,7 +20,7 @@ struct bch_devs_mask { + + struct bch_devs_list { + u8 nr; +- u8 devs[BCH_REPLICAS_MAX + 1]; ++ u8 devs[BCH_BKEY_PTRS_MAX]; + }; + + struct bch_member_cpu { +-- +cgit v1.2.3 + + +From 3c1252f15911e84daf2be0dbaac30d4928d4294c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Oct 2020 21:39:16 -0400 +Subject: bcachefs: Don't write bucket IO time lazily + +With the btree key cache code, we don't need to update the alloc btree +lazily - and this will mean we can remove the bch2_alloc_write() call in +the shutdown path. + +Future work: we really need to expend the bucket IO clocks from 16 to 64 +bits, so that we don't have to rescale them. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 48 ++++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/alloc_background.h | 2 ++ + fs/bcachefs/alloc_foreground.c | 2 -- + fs/bcachefs/buckets.h | 6 ------ + fs/bcachefs/fs-io.c | 2 +- + fs/bcachefs/io.c | 16 ++++++++------ + fs/bcachefs/io.h | 6 +++--- + fs/bcachefs/move.c | 7 +++--- + 8 files changed, 67 insertions(+), 22 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 067631f51ddc..4d0dc10e05eb 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -489,6 +489,54 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) + mutex_init(&clock->lock); + } + ++int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, ++ size_t bucket_nr, int rw) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, dev); ++ struct btree_iter *iter; ++ struct bucket *g; ++ struct bkey_i_alloc *a; ++ struct bkey_alloc_unpacked u; ++ u16 *time; ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto out; ++ ++ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, bucket_nr); ++ u = alloc_mem_to_key(g, READ_ONCE(g->mark)); ++ percpu_up_read(&c->mark_lock); ++ ++ bkey_alloc_init(&a->k_i); ++ a->k.p = iter->pos; ++ ++ time = rw == READ ? 
&u.read_time : &u.write_time; ++ if (*time == c->bucket_clock[rw].hand) ++ goto out; ++ ++ *time = c->bucket_clock[rw].hand; ++ ++ bch2_alloc_pack(a, u); ++ ++ ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ + /* Background allocator thread: */ + + /* +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 8e3abb89dfb7..d10ff56e4de1 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -31,6 +31,8 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); + void bch2_alloc_pack(struct bkey_i_alloc *, + const struct bkey_alloc_unpacked); + ++int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); ++ + static inline struct bkey_alloc_unpacked + alloc_mem_to_key(struct bucket *g, struct bucket_mark m) + { +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 4a048828869b..7a92e3d53254 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -309,8 +309,6 @@ out: + .dev = ca->dev_idx, + }; + +- bucket_io_clock_reset(c, ca, bucket, READ); +- bucket_io_clock_reset(c, ca, bucket, WRITE); + spin_unlock(&ob->lock); + + if (c->blocked_allocate_open_bucket) { +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 8436d9610c86..3a5ed1fcaf78 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -58,12 +58,6 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) + return __bucket(ca, b, false); + } + +-static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, +- size_t b, int rw) +-{ +- bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand; +-} +- + static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) + { + return c->bucket_clock[rw].hand - g->io_time[rw]; +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 39282d78cc51..71ec97e7650b 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -851,7 +851,7 @@ retry: + if (bkey_extent_is_allocation(k.k)) + bch2_add_page_sectors(&rbio->bio, k); + +- bch2_read_extent(c, rbio, k, offset_into_extent, flags); ++ bch2_read_extent(trans, rbio, k, offset_into_extent, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) + break; +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 106e6e56a7ba..77eb82f40697 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -7,6 +7,7 @@ + */ + + #include "bcachefs.h" ++#include "alloc_background.h" + #include "alloc_foreground.h" + #include "bkey_on_stack.h" + #include "bset.h" +@@ -1647,7 +1648,7 @@ retry: + goto out; + } + +- ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) +@@ -1705,7 +1706,7 @@ retry: + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + +- ret = __bch2_read_extent(c, rbio, bvec_iter, k, ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, + offset_into_extent, failed, flags); + switch (ret) { + case READ_RETRY: +@@ -2033,11 +2034,12 @@ err: + return ret; + } + +-int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, ++int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bkey_s_c k, + unsigned offset_into_extent, + struct bch_io_failures *failed, unsigned flags) + { ++ struct bch_fs *c = 
trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca; +@@ -2205,9 +2207,9 @@ get_bio: + + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + +- rcu_read_lock(); +- bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); +- rcu_read_unlock(); ++ if (pick.ptr.cached) ++ bch2_bucket_io_time_reset(trans, pick.ptr.dev, ++ PTR_BUCKET_NR(ca, &pick.ptr), READ); + + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + bio_inc_remaining(&orig->bio); +@@ -2351,7 +2353,7 @@ retry: + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + +- bch2_read_extent(c, rbio, k, offset_into_extent, flags); ++ bch2_read_extent(&trans, rbio, k, offset_into_extent, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) + break; +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index ded468d70f09..e6aac594f3e6 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -136,17 +136,17 @@ enum bch_read_flags { + BCH_READ_IN_RETRY = 1 << 7, + }; + +-int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, ++int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, + struct bvec_iter, struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + +-static inline void bch2_read_extent(struct bch_fs *c, ++static inline void bch2_read_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bkey_s_c k, + unsigned offset_into_extent, + unsigned flags) + { +- __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, ++ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k, + offset_into_extent, NULL, flags); + } + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index a8df9ad0e449..6633d21f604a 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -415,7 +415,7 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) + atomic_read(&ctxt->write_sectors) != sectors_pending); + } + +-static int bch2_move_extent(struct bch_fs *c, ++static int bch2_move_extent(struct btree_trans *trans, + struct moving_context *ctxt, + struct write_point_specifier wp, + struct bch_io_opts io_opts, +@@ -424,6 +424,7 @@ static int bch2_move_extent(struct bch_fs *c, + enum data_cmd data_cmd, + struct data_opts data_opts) + { ++ struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct moving_io *io; + const union bch_extent_entry *entry; +@@ -490,7 +491,7 @@ static int bch2_move_extent(struct bch_fs *c, + * ctxt when doing wakeup + */ + closure_get(&ctxt->cl); +- bch2_read_extent(c, &io->rbio, k, 0, ++ bch2_read_extent(trans, &io->rbio, k, 0, + BCH_READ_NODECODE| + BCH_READ_LAST_FRAGMENT); + return 0; +@@ -608,7 +609,7 @@ peek: + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + +- ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, ++ ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, + data_cmd, data_opts); + if (ret2) { + if (ret2 == -ENOMEM) { +-- +cgit v1.2.3 + + +From d9c3f7b8a01c3736e30dc3fee14dc8f03e3d86d1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 16 Dec 2020 15:41:29 -0500 +Subject: bcachefs: Fix race between journal_seq_copy() and journal_seq_drop() + +In bch2_btree_interior_update_will_free_node, we copy the journal pins +from outstanding writes on the btree node we're about to free. But, this +can race with the writes completing, and dropping their journal pins. + +To guard against this, just use READ_ONCE() in bch2_journal_pin_copy(). 
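+
+Illustrative sketch of the race being closed (assumes journal_pin_active()
+just tests src->seq; this is not the actual hunk):
+
+  /* racy: src->seq can be cleared between the check and the use */
+  if (journal_pin_active(src))
+          bch2_journal_pin_add(j, src->seq, dst, flush_fn);
+
+  /* fixed: snapshot the racy field exactly once, use only the snapshot */
+  u64 seq = READ_ONCE(src->seq);
+
+  if (seq)
+          bch2_journal_pin_add(j, seq, dst, flush_fn);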
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 14 ++++++++++++-- + fs/bcachefs/journal_reclaim.h | 7 +++++-- + 2 files changed, 17 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index b77d4e7f42d6..4e3cf219fb91 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -384,12 +384,22 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin_list *pin_list; + + spin_lock(&j->lock); ++ ++ if (seq < journal_last_seq(j)) { ++ /* ++ * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on ++ * the src pin - with the pin dropped, the entry to pin might no ++ * longer to exist, but that means there's no longer anything to ++ * copy and we can bail out here: ++ */ ++ spin_unlock(&j->lock); ++ return; ++ } ++ + pin_list = journal_seq_pin(j, seq); + + __journal_pin_drop(j, pin); + +- BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); +- + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index f02caa3d49ea..adf1f5c981cd 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -53,8 +53,11 @@ static inline void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) + { +- if (journal_pin_active(src)) +- bch2_journal_pin_add(j, src->seq, dst, flush_fn); ++ /* Guard against racing with journal_pin_drop(src): */ ++ u64 seq = READ_ONCE(src->seq); ++ ++ if (seq) ++ bch2_journal_pin_add(j, seq, dst, flush_fn); + } + + static inline void bch2_journal_pin_update(struct journal *j, u64 seq, +-- +cgit v1.2.3 + + +From f2d042568f9aaf75b1d5f22b5f0ac472210c7058 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 19 Dec 2020 15:39:10 -0500 +Subject: bcachefs: Fix for spinning in journal reclaim on startup + +We normally avoid having too many dirty keys in the btree key cache, to +ensure that we can always shrink our caches to reclaim memory if needed. + +But this check was causing us to go into an infinite loop on startup, in +the btree insert path before journal reclaim was started. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index dad3e344dcf9..2f8b5521718a 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -16,7 +16,8 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) + size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + +- return nr_dirty > max_dirty; ++ return nr_dirty > max_dirty && ++ test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); + } + + struct bkey_cached * +-- +cgit v1.2.3 + + +From 5d20d4718896b925bfb4621320c326fcbc644bcc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 19 Dec 2020 21:31:05 -0500 +Subject: bcachefs: Fix btree lock being incorrectly dropped + +__btree_trans_get_iter() was using bch2_btree_iter_upgrade, but it +shouldn't have been because on failure bch2_btree_iter_upgrade may drop +locks in other iterators, expecting the transaction to be restarted. But +__btree_trans_get_iter can't return an error to indicate that we need to +restart thet transaction - oops. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 9 ++++++--- + fs/bcachefs/btree_update_leaf.c | 8 ++++---- + 2 files changed, 10 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8c35e39ea97f..f7c6f7bfca67 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2124,9 +2124,12 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + iter->flags &= ~BTREE_ITER_USER_FLAGS; + iter->flags |= flags & BTREE_ITER_USER_FLAGS; + +- if (iter->flags & BTREE_ITER_INTENT) +- bch2_btree_iter_upgrade(iter, 1); +- else ++ if (iter->flags & BTREE_ITER_INTENT) { ++ if (!iter->locks_want) { ++ __bch2_btree_iter_unlock(iter); ++ iter->locks_want = 1; ++ } ++ } else + bch2_btree_iter_downgrade(iter); + + BUG_ON(iter->btree_id != btree_id); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 64734f9158c3..c77e243b2a04 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -869,8 +869,8 @@ int __bch2_trans_commit(struct btree_trans *trans) + trans_trigger_run = false; + + trans_for_each_update(trans, i) { +- if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && +- (ret = bch2_btree_iter_traverse(i->iter)))) { ++ ret = bch2_btree_iter_traverse(i->iter); ++ if (unlikely(ret)) { + trace_trans_restart_traverse(trans->ip); + goto out; + } +@@ -879,8 +879,8 @@ int __bch2_trans_commit(struct btree_trans *trans) + * We're not using bch2_btree_iter_upgrade here because + * we know trans->nounlock can't be set: + */ +- if (unlikely(i->iter->locks_want < 1 && +- !__bch2_btree_iter_upgrade(i->iter, 1))) { ++ if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) && ++ !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) { + trace_trans_restart_upgrade(trans->ip); + ret = -EINTR; + goto out; +-- +cgit v1.2.3 + + +From 50c4ccf4bcf6a64962a0ddc5eb08bce3880af1b6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 20 Dec 2020 21:42:19 -0500 +Subject: bcachefs: Fix iterator overflow in move path + +The move path was calling bch2_bucket_io_time_reset() for cached +pointers (which it shouldn't have been), and then not calling +bch2_trans_reset() when it got -EINTR (indicating transaction restart). +Oops. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 6 +++++- + fs/bcachefs/move.c | 6 ++++++ + 2 files changed, 11 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 77eb82f40697..452c7820b3b4 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -2207,7 +2207,11 @@ get_bio: + + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + +- if (pick.ptr.cached) ++ /* ++ * If it's being moved internally, we don't want to flag it as a cache ++ * hit: ++ */ ++ if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 6633d21f604a..6242b7d91a00 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -612,6 +612,12 @@ peek: + ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, + data_cmd, data_opts); + if (ret2) { ++ if (ret2 == -EINTR) { ++ bch2_trans_reset(&trans, 0); ++ bch2_trans_cond_resched(&trans); ++ continue; ++ } ++ + if (ret2 == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); +-- +cgit v1.2.3 + + +From 4e0780d60178899d02e4bc7c0f3d2e5e68111bdb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 21 Dec 2020 17:17:18 -0500 +Subject: bcachefs: Don't use BTREE_INSERT_USE_RESERVE so much + +Previously, we were using BTREE_INSERT_RESERVE in a lot of places where +it no longer makes sense. + + - we now have more open_buckets than we used to, and the reserves work + better, so we shouldn't need to use BTREE_INSERT_RESERVE just because + we're holding open_buckets pinned anymore. + + - We have the btree key cache for updates to the alloc btree, meaning + we no longer need the btree reserve to ensure the allocator can make + forward progress. + +This means that we should only need a reserve for btree updates to +ensure that copygc can make forward progress. + +Since it's now just for copygc, we can also fold RESERVE_BTREE into +RESERVE_MOVINGGC (the allocator's freelist reserve). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 10 +++------- + fs/bcachefs/alloc_foreground.c | 14 +------------- + fs/bcachefs/alloc_types.h | 8 +++----- + fs/bcachefs/btree_gc.c | 1 - + fs/bcachefs/btree_key_cache.c | 2 -- + fs/bcachefs/btree_update.h | 2 -- + fs/bcachefs/btree_update_interior.c | 23 +++-------------------- + fs/bcachefs/btree_update_leaf.c | 3 +-- + fs/bcachefs/buckets.c | 3 +-- + fs/bcachefs/ec.c | 3 +-- + fs/bcachefs/io.c | 3 +-- + fs/bcachefs/journal.c | 2 +- + fs/bcachefs/move.c | 1 - + fs/bcachefs/movinggc.c | 5 +++++ + fs/bcachefs/sysfs.c | 2 -- + 15 files changed, 20 insertions(+), 62 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 4d0dc10e05eb..6c3fdc41aec5 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -319,9 +319,7 @@ retry: + bch2_trans_update(trans, iter, &a->k_i, + BTREE_TRIGGER_NORUN); + ret = bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE| +- flags); ++ BTREE_INSERT_NOFAIL|flags); + err: + if (ret == -EINTR) + goto retry; +@@ -575,8 +573,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + + if (available > fifo_free(&ca->free_inc) || + (available && +- (!fifo_full(&ca->free[RESERVE_BTREE]) || +- !fifo_full(&ca->free[RESERVE_MOVINGGC])))) ++ !fifo_full(&ca->free[RESERVE_MOVINGGC]))) + break; + + up_read(&c->gc_lock); +@@ -977,8 +974,7 @@ retry: + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE| +- BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_JOURNAL_RESERVED| + flags); + if (ret == -EINTR) + goto retry; +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 7a92e3d53254..5432a8dae719 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -204,10 +204,8 @@ success: + static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) + { + switch (reserve) { +- case RESERVE_ALLOC: ++ case RESERVE_MOVINGGC: + return 0; +- case RESERVE_BTREE: +- return OPEN_BUCKETS_COUNT / 4; + default: + return OPEN_BUCKETS_COUNT / 2; + } +@@ -263,16 +261,6 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + goto out; + + switch (reserve) { +- case RESERVE_ALLOC: +- if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) +- goto out; +- break; +- case RESERVE_BTREE: +- if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= +- ca->free[RESERVE_BTREE].size && +- fifo_pop(&ca->free[RESERVE_BTREE], bucket)) +- goto out; +- break; + case RESERVE_MOVINGGC: + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) + goto out; +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index 20705460bb0a..a510ca9a295b 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -37,11 +37,9 @@ struct bucket_clock { + /* There is one reserve for each type of btree, one for prios and gens + * and one for moving GC */ + enum alloc_reserve { +- RESERVE_ALLOC = -1, +- RESERVE_BTREE = 0, +- RESERVE_MOVINGGC = 1, +- RESERVE_NONE = 2, +- RESERVE_NR = 3, ++ RESERVE_MOVINGGC = 0, ++ RESERVE_NONE = 1, ++ RESERVE_NR = 2, + }; + + typedef FIFO(long) alloc_fifo; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 6268ea637d19..d0a856ec60e3 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -233,7 +233,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + if (max_stale > 64) + bch2_btree_node_rewrite(c, 
iter, + b->data->keys.seq, +- BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + else if (!bch2_btree_gc_rewrite_disabled && +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 1a557b753bc1..4357aefdb668 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -349,8 +349,6 @@ retry: + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE| +- BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED| + BTREE_INSERT_JOURNAL_RECLAIM); + err: +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index adb07043cbb3..a25138080169 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -20,7 +20,6 @@ enum btree_insert_flags { + __BTREE_INSERT_NOCHECK_RW, + __BTREE_INSERT_LAZY_RW, + __BTREE_INSERT_USE_RESERVE, +- __BTREE_INSERT_USE_ALLOC_RESERVE, + __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_JOURNAL_RESERVED, + __BTREE_INSERT_JOURNAL_RECLAIM, +@@ -43,7 +42,6 @@ enum btree_insert_flags { + + /* for copygc, or when merging btree nodes */ + #define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) +-#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) + + /* Insert is for journal replay - don't get journal reservations: */ + #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 8f96756ba648..8c7f7a8b5375 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -201,12 +201,9 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + unsigned nr_reserve; + enum alloc_reserve alloc_reserve; + +- if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { ++ if (flags & BTREE_INSERT_USE_RESERVE) { + nr_reserve = 0; +- alloc_reserve = RESERVE_ALLOC; +- } else if (flags & BTREE_INSERT_USE_RESERVE) { +- nr_reserve = BTREE_NODE_RESERVE / 2; +- alloc_reserve = RESERVE_BTREE; ++ alloc_reserve = RESERVE_MOVINGGC; + } else { + nr_reserve = BTREE_NODE_RESERVE; + alloc_reserve = RESERVE_NONE; +@@ -577,8 +574,6 @@ static void btree_update_nodes_written(struct btree_update *as) + bch2_trans_init(&trans, c, 0, 512); + ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE| +- BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_JOURNAL_RESERVED, +@@ -1457,15 +1452,6 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + struct btree_update *as; + struct closure cl; + int ret = 0; +- struct btree_insert_entry *i; +- +- /* +- * We already have a disk reservation and open buckets pinned; this +- * allocation must not block: +- */ +- trans_for_each_update(trans, i) +- if (btree_node_type_needs_gc(i->iter->btree_id)) +- flags |= BTREE_INSERT_USE_RESERVE; + + closure_init_stack(&cl); + +@@ -1926,10 +1912,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, + retry: + as = bch2_btree_update_start(iter->trans, iter->btree_id, + parent ? 
btree_update_reserve_required(c, parent) : 0, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE| +- BTREE_INSERT_USE_ALLOC_RESERVE, +- &cl); ++ BTREE_INSERT_NOFAIL, &cl); + + if (IS_ERR(as)) { + ret = PTR_ERR(as); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index c77e243b2a04..c490df4709ba 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1084,8 +1084,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, + + bch2_trans_update(trans, iter, &k, 0); + return bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE|flags); ++ BTREE_INSERT_NOFAIL|flags); + } + + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 1934b845ea15..8bbf958d64e4 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -2192,7 +2192,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + ca->mi.bucket_size / c->opts.btree_node_size); + /* XXX: these should be tunable */ + size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); +- size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); ++ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); + size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), + btree_reserve * 2); + bool resize = ca->buckets[0] != NULL; +@@ -2209,7 +2209,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)) || +- !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || + !init_fifo(&free[RESERVE_MOVINGGC], + copygc_reserve, GFP_KERNEL) || + !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index eea9660971d4..0a36eb5a1f75 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -800,8 +800,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + bch2_trans_update(&trans, iter, sk.k, 0); + + ret = bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE); ++ BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + ret = 0; + if (ret) +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 452c7820b3b4..0499ec218c9b 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -320,8 +320,7 @@ int bch2_extent_update(struct btree_trans *trans, + + ret = bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE); ++ BTREE_INSERT_NOFAIL); + if (ret) + return ret; + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index d54424829378..56b750c50fbd 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -777,7 +777,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + } + } else { + rcu_read_lock(); +- ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, ++ ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, + false, cl); + rcu_read_unlock(); + if (IS_ERR(ob)) { +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 6242b7d91a00..48b47857acc4 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -168,7 +168,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + ret = bch2_trans_commit(&trans, &op->res, + op_journal_seq(op), + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE| + m->data_opts.btree_insert_flags); + if (!ret) + atomic_long_inc(&c->extent_migrate_done); +diff --git 
a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 2c5daed58aca..efa7f38ecec6 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -200,6 +200,11 @@ static int bch2_copygc(struct bch_fs *c) + return -1; + } + ++ /* ++ * Our btree node allocations also come out of RESERVE_MOVINGGC: ++ */ ++ sectors_to_move = (sectors_to_move * 3) / 4; ++ + for (i = h->data; i < h->data + h->used; i++) + sectors_to_move += i->sectors * i->replicas; + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index cc13fc258115..bfae0d7142e0 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -798,7 +798,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + + pr_buf(out, + "free_inc: %zu/%zu\n" +- "free[RESERVE_BTREE]: %zu/%zu\n" + "free[RESERVE_MOVINGGC]: %zu/%zu\n" + "free[RESERVE_NONE]: %zu/%zu\n" + "buckets:\n" +@@ -827,7 +826,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + "open_buckets_user: %u\n" + "btree reserve cache: %u\n", + fifo_used(&ca->free_inc), ca->free_inc.size, +- fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, + fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, + fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + ca->mi.nbuckets - ca->mi.first_bucket, +-- +cgit v1.2.3 + + +From 1e54ebc21cb5b54c3caf94bbfbfc28b5fbaa0469 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Dec 2020 21:59:33 -0500 +Subject: bcachefs: Change when we allow overwrites + +Originally, we'd check for -ENOSPC when getting a disk reservation +whenever the new extent took up more space on disk than the old extent. + +Erasure coding screwed this up, because with erasure coding writes are +initially replicated, and then in the background the extra replicas are +dropped when the stripe is created. This means that with erasure coding +enabled, writes will always take up more space on disk than the data +they're overwriting - but, according to posix, overwrites aren't +supposed to return ENOSPC. + +So, in this patch we fudge things: if the new extent has more replicas +than the _effective_ replicas of the old extent, or if the old extent is +compressed and the new one isn't, we check for ENOSPC when getting the +disk reservation - otherwise, we don't. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 32 ++++++++++++++++++++++++++++++-- + fs/bcachefs/extents.h | 4 +++- + fs/bcachefs/fs-io.c | 4 +++- + fs/bcachefs/io.c | 30 +++++++++++++++++++++--------- + fs/bcachefs/io.h | 2 ++ + fs/bcachefs/move.c | 35 ++++++++++++++++------------------- + 6 files changed, 75 insertions(+), 32 deletions(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 828ccf07da61..c0ae31238b48 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -665,7 +665,7 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) + } + + bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, +- unsigned nr_replicas) ++ unsigned nr_replicas, bool compressed) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -683,7 +683,8 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + +- if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { ++ if (nr_replicas > bch2_bkey_replicas(c, k) || ++ (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } +@@ -693,6 +694,33 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + return ret; + } + ++unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned replicas = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ continue; ++ ++ if (p.has_ec) { ++ struct stripe *s = ++ genradix_ptr(&c->stripes[0], p.ec.idx); ++ ++ WARN_ON(!s); ++ if (s) ++ replicas += s->nr_redundant; ++ } ++ ++ replicas++; ++ ++ } ++ ++ return replicas; ++} ++ + static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + struct extent_ptr_decoded p) + { +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 74c7bb8f9104..ebe0a04c7850 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -538,7 +538,9 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); + unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); + bool bch2_bkey_is_incompressible(struct bkey_s_c); + unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); +-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); ++bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool); ++ ++unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); + unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); + + void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 71ec97e7650b..ae059fddf44c 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1869,7 +1869,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && + !bch2_check_range_allocated(c, dio->op.pos, +- bio_sectors(bio), dio->op.opts.data_replicas)) ++ bio_sectors(bio), ++ dio->op.opts.data_replicas, ++ dio->op.opts.compression != 0)) + goto err; + + task_io_account_write(bio->bi_iter.bi_size); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 0499ec218c9b..69341b5becef 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -183,18 +183,23 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + + /* Extent update path: */ + +-static int sum_sector_overwrites(struct btree_trans *trans, +- struct btree_iter 
*extent_iter, +- struct bkey_i *new, +- bool *maybe_extending, +- s64 *i_sectors_delta, +- s64 *disk_sectors_delta) ++int bch2_sum_sector_overwrites(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct bkey_i *new, ++ bool *maybe_extending, ++ bool *should_check_enospc, ++ s64 *i_sectors_delta, ++ s64 *disk_sectors_delta) + { ++ struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_s_c old; ++ unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); ++ bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); + int ret = 0; + + *maybe_extending = true; ++ *should_check_enospc = false; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; + +@@ -213,6 +218,11 @@ static int sum_sector_overwrites(struct btree_trans *trans, + (int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) - + bch2_bkey_nr_ptrs_fully_allocated(old)); + ++ if (!*should_check_enospc && ++ (new_replicas > bch2_bkey_replicas(c, old) || ++ (!new_compressed && bch2_bkey_sectors_compressed(old)))) ++ *should_check_enospc = true; ++ + if (bkey_cmp(old.k->p, new->k.p) >= 0) { + /* + * Check if there's already data above where we're +@@ -250,7 +260,7 @@ int bch2_extent_update(struct btree_trans *trans, + { + /* this must live until after bch2_trans_commit(): */ + struct bkey_inode_buf inode_p; +- bool extending = false; ++ bool extending = false, should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + +@@ -258,8 +268,9 @@ int bch2_extent_update(struct btree_trans *trans, + if (ret) + return ret; + +- ret = sum_sector_overwrites(trans, iter, k, ++ ret = bch2_sum_sector_overwrites(trans, iter, k, + &extending, ++ &should_check_enospc, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) +@@ -269,7 +280,8 @@ int bch2_extent_update(struct btree_trans *trans, + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, +- 0); ++ !should_check_enospc ++ ? 
BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + } +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index e6aac594f3e6..55ccc923614c 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -60,6 +60,8 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) + : op->c->wq; + } + ++int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, bool *, bool *, s64 *, s64 *); + int bch2_extent_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct disk_reservation *, + u64 *, u64, s64 *); +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 48b47857acc4..44f3c6eec375 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -77,17 +77,15 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bool did_work = false; +- int nr; ++ bool extending = false, should_check_enospc; ++ s64 i_sectors_delta = 0, disk_sectors_delta = 0; + + bch2_trans_reset(&trans, 0); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); +- if (ret) { +- if (ret == -EINTR) +- continue; +- break; +- } ++ if (ret) ++ goto err; + + new = bkey_i_to_extent(bch2_keylist_front(keys)); + +@@ -144,23 +142,21 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + op->opts.background_target, + op->opts.data_replicas); + +- /* +- * If we're not fully overwriting @k, and it's compressed, we +- * need a reservation for all the pointers in @insert +- */ +- nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - +- m->nr_ptrs_reserved; ++ ret = bch2_sum_sector_overwrites(&trans, iter, insert, ++ &extending, ++ &should_check_enospc, ++ &i_sectors_delta, ++ &disk_sectors_delta); ++ if (ret) ++ goto err; + +- if (insert->k.size < k.k->size && +- bch2_bkey_sectors_compressed(k) && +- nr > 0) { ++ if (disk_sectors_delta > (s64) op->res.sectors) { + ret = bch2_disk_reservation_add(c, &op->res, +- keylist_sectors(keys) * nr, 0); ++ disk_sectors_delta - op->res.sectors, ++ !should_check_enospc ++ ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto out; +- +- m->nr_ptrs_reserved += nr; +- goto next; + } + + bch2_trans_update(&trans, iter, insert, 0); +@@ -169,6 +165,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + op_journal_seq(op), + BTREE_INSERT_NOFAIL| + m->data_opts.btree_insert_flags); ++err: + if (!ret) + atomic_long_inc(&c->extent_migrate_done); + if (ret == -EINTR) +-- +cgit v1.2.3 + + +From 02a7f594d294e832968bc7e40883e740e5ad6d4b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Dec 2020 19:41:03 -0500 +Subject: bcachefs: Don't read existing stripes synchronously in write path + +Previously, in the stripe creation path, when reusing an existing stripe +we'd read the existing stripe synchronously - ouch. + +Now, we allocate two stripe bufs if we're using an existing stripe, so +that we can do the read asynchronously - and, we read the full stripe so +that we can run recovery, if necessary. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 179 ++++++++++++++++++++++++++++++++++--------------------- + fs/bcachefs/ec.h | 7 ++- + 2 files changed, 116 insertions(+), 70 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 0a36eb5a1f75..de82165a676f 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -200,6 +200,36 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) + return false; + } + ++/* Stripe bufs: */ ++ ++static void ec_stripe_buf_free(struct ec_stripe_buf *stripe) ++{ ++ unsigned i; ++ ++ for (i = 0; i < stripe->key.v.nr_blocks; i++) { ++ kvpfree(stripe->data[i], stripe->size << 9); ++ stripe->data[i] = NULL; ++ } ++} ++ ++static int ec_stripe_buf_alloc(struct ec_stripe_buf *stripe) ++{ ++ unsigned i; ++ ++ memset(stripe->valid, 0xFF, sizeof(stripe->valid)); ++ ++ for (i = 0; i < stripe->key.v.nr_blocks; i++) { ++ stripe->data[i] = kvpmalloc(stripe->size << 9, GFP_KERNEL); ++ if (!stripe->data[i]) ++ goto err; ++ } ++ ++ return 0; ++err: ++ ec_stripe_buf_free(stripe); ++ return -ENOMEM; ++} ++ + /* Checksumming: */ + + static void ec_generate_checksums(struct ec_stripe_buf *buf) +@@ -287,14 +317,10 @@ static void ec_generate_ec(struct ec_stripe_buf *buf) + raid_gen(nr_data, v->nr_redundant, bytes, buf->data); + } + +-static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) +-{ +- return nr - bitmap_weight(buf->valid, nr); +-} +- + static unsigned ec_nr_failed(struct ec_stripe_buf *buf) + { +- return __ec_nr_failed(buf, buf->key.v.nr_blocks); ++ return buf->key.v.nr_blocks - ++ bitmap_weight(buf->valid, buf->key.v.nr_blocks); + } + + static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) +@@ -822,14 +848,13 @@ static void ec_stripe_create(struct ec_stripe_new *s) + struct open_bucket *ob; + struct bkey_i *k; + struct stripe *m; +- struct bch_stripe *v = &s->stripe.key.v; ++ struct bch_stripe *v = &s->new_stripe.key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; +- struct closure cl; + int ret; + + BUG_ON(s->h->s == s); + +- closure_init_stack(&cl); ++ closure_sync(&s->iodone); + + if (s->err) { + if (s->err != -EROFS) +@@ -837,6 +862,22 @@ static void ec_stripe_create(struct ec_stripe_new *s) + goto err; + } + ++ if (s->have_existing_stripe) { ++ ec_validate_checksums(c, &s->existing_stripe); ++ ++ if (ec_do_recov(c, &s->existing_stripe)) { ++ bch_err(c, "error creating stripe: error reading existing stripe"); ++ goto err; ++ } ++ ++ for (i = 0; i < nr_data; i++) ++ if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) ++ swap(s->new_stripe.data[i], ++ s->existing_stripe.data[i]); ++ ++ ec_stripe_buf_free(&s->existing_stripe); ++ } ++ + BUG_ON(!s->allocated); + + if (!percpu_ref_tryget(&c->writes)) +@@ -845,33 +886,31 @@ static void ec_stripe_create(struct ec_stripe_new *s) + BUG_ON(bitmap_weight(s->blocks_allocated, + s->blocks.nr) != s->blocks.nr); + +- ec_generate_ec(&s->stripe); ++ ec_generate_ec(&s->new_stripe); + +- ec_generate_checksums(&s->stripe); ++ ec_generate_checksums(&s->new_stripe); + + /* write p/q: */ + for (i = nr_data; i < v->nr_blocks; i++) +- ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); +- +- closure_sync(&cl); ++ ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); ++ closure_sync(&s->iodone); + +- for (i = nr_data; i < v->nr_blocks; i++) +- if (!test_bit(i, s->stripe.valid)) { +- bch_err(c, "error creating stripe: error writing redundancy buckets"); +- goto err_put_writes; +- } ++ if (ec_nr_failed(&s->new_stripe)) { ++ bch_err(c, "error creating 
stripe: error writing redundancy buckets"); ++ goto err_put_writes; ++ } + +- ret = s->existing_stripe +- ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, ++ ret = s->have_existing_stripe ++ ? bch2_btree_insert(c, BTREE_ID_EC, &s->new_stripe.key.k_i, + &s->res, NULL, BTREE_INSERT_NOFAIL) +- : ec_stripe_bkey_insert(c, s, &s->stripe.key); ++ : ec_stripe_bkey_insert(c, s, &s->new_stripe.key); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_put_writes; + } + + for_each_keylist_key(&s->keys, k) { +- ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); ++ ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k); + if (ret) { + bch_err(c, "error creating stripe: error %i updating pointers", ret); + break; +@@ -879,14 +918,14 @@ static void ec_stripe_create(struct ec_stripe_new *s) + } + + spin_lock(&c->ec_stripes_heap_lock); +- m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); ++ m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); + #if 0 + pr_info("created a %s stripe %llu", +- s->existing_stripe ? "existing" : "new", ++ s->have_existing_stripe ? "existing" : "new", + s->stripe.key.k.p.offset); + #endif + BUG_ON(m->on_heap); +- bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); ++ bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); + spin_unlock(&c->ec_stripes_heap_lock); + err_put_writes: + percpu_ref_put(&c->writes); +@@ -902,8 +941,9 @@ err: + + bch2_keylist_free(&s->keys, s->inline_keys); + +- for (i = 0; i < s->stripe.key.v.nr_blocks; i++) +- kvpfree(s->stripe.data[i], s->stripe.size << 9); ++ ec_stripe_buf_free(&s->existing_stripe); ++ ec_stripe_buf_free(&s->new_stripe); ++ closure_debug_destroy(&s->iodone); + kfree(s); + } + +@@ -980,7 +1020,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + offset = ca->mi.bucket_size - ob->sectors_free; + +- return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); ++ return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); + } + + void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, +@@ -1087,7 +1127,6 @@ static void ec_stripe_key_init(struct bch_fs *c, + static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) + { + struct ec_stripe_new *s; +- unsigned i; + + lockdep_assert_held(&h->lock); + +@@ -1096,6 +1135,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) + return -ENOMEM; + + mutex_init(&s->lock); ++ closure_init(&s->iodone, NULL); + atomic_set(&s->pin, 1); + s->c = c; + s->h = h; +@@ -1105,27 +1145,14 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) + + bch2_keylist_init(&s->keys, s->inline_keys); + +- s->stripe.offset = 0; +- s->stripe.size = h->blocksize; +- memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); ++ s->new_stripe.offset = 0; ++ s->new_stripe.size = h->blocksize; + +- ec_stripe_key_init(c, &s->stripe.key, s->nr_data, ++ ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, + s->nr_parity, h->blocksize); + +- for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { +- s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); +- if (!s->stripe.data[i]) +- goto err; +- } +- + h->s = s; +- + return 0; +-err: +- for (i = 0; i < s->stripe.key.v.nr_blocks; i++) +- kvpfree(s->stripe.data[i], s->stripe.size << 9); +- kfree(s); +- return -ENOMEM; + } + + static struct ec_stripe_head * +@@ -1217,7 +1244,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) + devs = 
h->devs; + + for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) { +- __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); ++ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); + --nr_data; + } + +@@ -1327,51 +1354,70 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned algo, + unsigned redundancy) + { +- struct closure cl; + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i, data_idx = 0; + s64 idx; + int ret; + +- closure_init_stack(&cl); +- + h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); +- if (!h) ++ if (!h) { ++ bch_err(c, "no stripe head"); + return NULL; ++ } + + if (!h->s) { + if (ec_new_stripe_alloc(c, h)) { + bch2_ec_stripe_head_put(c, h); ++ bch_err(c, "failed to allocate new stripe"); + return NULL; + } + + idx = get_existing_stripe(c, target, algo, redundancy); + if (idx >= 0) { +- h->s->existing_stripe = true; +- h->s->existing_stripe_idx = idx; +- if (get_stripe_key(c, idx, &h->s->stripe)) { +- /* btree error */ ++ h->s->have_existing_stripe = true; ++ ret = get_stripe_key(c, idx, &h->s->existing_stripe); ++ if (ret) { ++ bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); ++ bch2_ec_stripe_head_put(c, h); ++ return NULL; ++ } ++ ++ if (ec_stripe_buf_alloc(&h->s->existing_stripe)) { ++ /* ++ * this is a problem: we have deleted from the ++ * stripes heap already ++ */ + BUG(); + } + +- for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) +- if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { ++ for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { ++ if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) + __set_bit(i, h->s->blocks_allocated); +- ec_block_io(c, &h->s->stripe, READ, i, &cl); +- } ++ ++ ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); ++ } ++ ++ bkey_copy(&h->s->new_stripe.key.k_i, ++ &h->s->existing_stripe.key.k_i); ++ } ++ ++ if (ec_stripe_buf_alloc(&h->s->new_stripe)) { ++ BUG(); + } + } + + if (!h->s->allocated) { +- if (!h->s->existing_stripe && ++ if (!h->s->have_existing_stripe && + !h->s->res.sectors) { + ret = bch2_disk_reservation_get(c, &h->s->res, +- h->blocksize, +- h->s->nr_parity, 0); ++ h->blocksize, ++ h->s->nr_parity, 0); + if (ret) { +- /* What should we do here? 
*/ +- bch_err(c, "unable to create new stripe: %i", ret); ++ /* ++ * This means we need to wait for copygc to ++ * empty out buckets from existing stripes: ++ */ + bch2_ec_stripe_head_put(c, h); + h = NULL; + goto out; +@@ -1391,19 +1437,18 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + h->s->nr_data, data_idx); + BUG_ON(data_idx >= h->s->nr_data); + +- h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; ++ h->s->new_stripe.key.v.ptrs[data_idx] = ob->ptr; + h->s->data_block_idx[i] = data_idx; + data_idx++; + } + + open_bucket_for_each(c, &h->s->parity, ob, i) +- h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; ++ h->s->new_stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; + + //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); + h->s->allocated = true; + } + out: +- closure_sync(&cl); + return h; + } + +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 450bb1a113a3..1d4aad50db4d 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -88,6 +88,7 @@ struct ec_stripe_new { + struct ec_stripe_head *h; + struct mutex lock; + struct list_head list; ++ struct closure iodone; + + /* counts in flight writes, stripe is created when pin == 0 */ + atomic_t pin; +@@ -98,8 +99,7 @@ struct ec_stripe_new { + u8 nr_parity; + bool allocated; + bool pending; +- bool existing_stripe; +- u64 existing_stripe_idx; ++ bool have_existing_stripe; + + unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + +@@ -111,7 +111,8 @@ struct ec_stripe_new { + struct keylist keys; + u64 inline_keys[BKEY_U64s * 8]; + +- struct ec_stripe_buf stripe; ++ struct ec_stripe_buf new_stripe; ++ struct ec_stripe_buf existing_stripe; + }; + + struct ec_stripe_head { +-- +cgit v1.2.3 + + +From 7cd27816edd4c12ea6ea62f4d67520fceacb7ade Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 15 Dec 2020 12:38:17 -0500 +Subject: bcachefs: Change allocations for ec stripes to blocking + +We don't want writes to not get erasure coded just because the allocator +temporarily wasn't keeping up. + +However, it's not guaranteed that these allocations will ever succeed, +we can currently get stuck - especially if devices are different sizes - +we still have work to do in this area. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 42 +++++++++++++++++++++++++----------------- + fs/bcachefs/ec.c | 23 +++++++++++------------ + fs/bcachefs/ec.h | 4 ++-- + 3 files changed, 38 insertions(+), 31 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 5432a8dae719..eef69e69f63e 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -446,16 +446,18 @@ bch2_bucket_alloc_set(struct bch_fs *c, + * it's to a device we don't want: + */ + +-static void bucket_alloc_from_stripe(struct bch_fs *c, +- struct open_buckets *ptrs, +- struct write_point *wp, +- struct bch_devs_mask *devs_may_alloc, +- u16 target, +- unsigned erasure_code, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, +- unsigned flags) ++static enum bucket_alloc_ret ++bucket_alloc_from_stripe(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ struct closure *cl) + { + struct dev_alloc_list devs_sorted; + struct ec_stripe_head *h; +@@ -464,17 +466,19 @@ static void bucket_alloc_from_stripe(struct bch_fs *c, + unsigned i, ec_idx; + + if (!erasure_code) +- return; ++ return 0; + + if (nr_replicas < 2) +- return; ++ return 0; + + if (ec_open_bucket(c, ptrs)) +- return; ++ return 0; + +- h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); ++ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, cl); ++ if (IS_ERR(h)) ++ return -PTR_ERR(h); + if (!h) +- return; ++ return 0; + + devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + +@@ -496,6 +500,7 @@ got_bucket: + atomic_inc(&h->s->pin); + out_put_head: + bch2_ec_stripe_head_put(c, h); ++ return 0; + } + + /* Sector allocator */ +@@ -573,10 +578,13 @@ open_bucket_add_buckets(struct bch_fs *c, + } + + if (!ec_open_bucket(c, ptrs)) { +- bucket_alloc_from_stripe(c, ptrs, wp, &devs, ++ ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs, + target, erasure_code, + nr_replicas, nr_effective, +- have_cache, flags); ++ have_cache, flags, _cl); ++ if (ret == FREELIST_EMPTY || ++ ret == OPEN_BUCKETS_EMPTY) ++ return ret; + if (*nr_effective >= nr_replicas) + return 0; + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index de82165a676f..c1b0b1c63672 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1228,10 +1228,9 @@ found: + return h; + } + +-/* +- * XXX: use a higher watermark for allocating open buckets here: +- */ +-static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) ++static enum bucket_alloc_ret ++new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, ++ struct closure *cl) + { + struct bch_devs_mask devs; + struct open_bucket *ob; +@@ -1239,7 +1238,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) + min_t(unsigned, h->nr_active_devs, + BCH_BKEY_PTRS_MAX) - h->redundancy; + bool have_cache = true; +- int ret = 0; ++ enum bucket_alloc_ret ret = ALLOC_SUCCESS; + + devs = h->devs; + +@@ -1270,7 +1269,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) + &have_cache, + RESERVE_NONE, + 0, +- NULL); ++ cl); + if (ret) + goto err; + } +@@ -1286,7 +1285,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) + &have_cache, + RESERVE_NONE, + 0, +- NULL); ++ cl); + if (ret) + goto err; + } 
+@@ -1352,7 +1351,8 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip + struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, +- unsigned redundancy) ++ unsigned redundancy, ++ struct closure *cl) + { + struct ec_stripe_head *h; + struct open_bucket *ob; +@@ -1421,14 +1421,13 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + bch2_ec_stripe_head_put(c, h); + h = NULL; + goto out; +- + } +- + } + +- if (new_stripe_alloc_buckets(c, h)) { ++ ret = new_stripe_alloc_buckets(c, h, cl); ++ if (ret) { + bch2_ec_stripe_head_put(c, h); +- h = NULL; ++ h = ERR_PTR(-ret); + goto out; + } + +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 1d4aad50db4d..3f1999bae6d4 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -146,8 +146,8 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); + int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); + + void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); +-struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, +- unsigned, unsigned); ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, ++ unsigned, unsigned, unsigned, struct closure *); + + void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); + void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); +-- +cgit v1.2.3 + + +From 4169655434ed1304508a5888939d84fdcff37dbe Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 15 Dec 2020 12:53:30 -0500 +Subject: bcachefs: Use separate new stripes for copygc and non-copygc + +Allocations for copygc have to be kept separate from everything else, +so that copygc doesn't get starved. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 4 +++- + fs/bcachefs/alloc_types.h | 1 - + fs/bcachefs/ec.c | 27 ++++++++++++++++++--------- + fs/bcachefs/ec.h | 3 ++- + 4 files changed, 23 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index eef69e69f63e..cb7f1fc16daa 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -474,7 +474,9 @@ bucket_alloc_from_stripe(struct bch_fs *c, + if (ec_open_bucket(c, ptrs)) + return 0; + +- h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, cl); ++ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, ++ wp == &c->copygc_write_point, ++ cl); + if (IS_ERR(h)) + return -PTR_ERR(h); + if (!h) +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index a510ca9a295b..0cfb026a02e5 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -87,7 +87,6 @@ struct write_point { + u64 last_used; + unsigned long write_point; + enum bch_data_type type; +- bool is_ec; + + /* calculated based on how many pointers we're actually going to use: */ + unsigned sectors_free; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index c1b0b1c63672..c75d86071226 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1157,7 +1157,8 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) + + static struct ec_stripe_head * + ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, +- unsigned algo, unsigned redundancy) ++ unsigned algo, unsigned redundancy, ++ bool copygc) + { + struct ec_stripe_head *h; + struct bch_dev *ca; +@@ -1173,6 +1174,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + h->target = target; + h->algo = algo; + 
h->redundancy = redundancy; ++ h->copygc = copygc; + + rcu_read_lock(); + h->devs = target_rw_devs(c, BCH_DATA_user, target); +@@ -1204,9 +1206,10 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) + } + + struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, +- unsigned target, +- unsigned algo, +- unsigned redundancy) ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy, ++ bool copygc) + { + struct ec_stripe_head *h; + +@@ -1217,12 +1220,13 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, + list_for_each_entry(h, &c->ec_stripe_head_list, list) + if (h->target == target && + h->algo == algo && +- h->redundancy == redundancy) { ++ h->redundancy == redundancy && ++ h->copygc == copygc) { + mutex_lock(&h->lock); + goto found; + } + +- h = ec_new_stripe_head_alloc(c, target, algo, redundancy); ++ h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc); + found: + mutex_unlock(&c->ec_stripe_head_lock); + return h; +@@ -1267,7 +1271,9 @@ new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + h->redundancy, + &nr_have, + &have_cache, +- RESERVE_NONE, ++ h->copygc ++ ? RESERVE_MOVINGGC ++ : RESERVE_NONE, + 0, + cl); + if (ret) +@@ -1283,7 +1289,9 @@ new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + nr_data, + &nr_have, + &have_cache, +- RESERVE_NONE, ++ h->copygc ++ ? RESERVE_MOVINGGC ++ : RESERVE_NONE, + 0, + cl); + if (ret) +@@ -1352,6 +1360,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy, ++ bool copygc, + struct closure *cl) + { + struct ec_stripe_head *h; +@@ -1360,7 +1369,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + s64 idx; + int ret; + +- h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); ++ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc); + if (!h) { + bch_err(c, "no stripe head"); + return NULL; +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 3f1999bae6d4..97a263cf9c87 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -122,6 +122,7 @@ struct ec_stripe_head { + unsigned target; + unsigned algo; + unsigned redundancy; ++ bool copygc; + + struct bch_devs_mask devs; + unsigned nr_active_devs; +@@ -147,7 +148,7 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); + + void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); + struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, +- unsigned, unsigned, unsigned, struct closure *); ++ unsigned, unsigned, unsigned, bool, struct closure *); + + void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); + void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); +-- +cgit v1.2.3 + + +From e682316525f3880d34f8d621017787ac6b050e2d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 17 Dec 2020 15:08:58 -0500 +Subject: bcachefs: Reduce/kill BKEY_PADDED use + +With various newer key types - stripe keys, inline data extents - the +old approach of calculating the maximum size of the value is becoming +more and more error prone. Better to switch to bkey_on_stack, which can +dynamically allocate if necessary to handle any size bkey. + +In particular we also want to get rid of BKEY_EXTENT_VAL_U64s_MAX. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 2 -- + fs/bcachefs/bkey_buf.h | 60 +++++++++++++++++++++++++++++++++++++ + fs/bcachefs/bkey_on_stack.h | 43 -------------------------- + fs/bcachefs/bkey_sort.c | 18 +++++------ + fs/bcachefs/btree_cache.c | 13 +++++--- + fs/bcachefs/btree_gc.c | 22 +++++++------- + fs/bcachefs/btree_io.c | 23 ++++++++------ + fs/bcachefs/btree_iter.c | 35 ++++++++++++++-------- + fs/bcachefs/btree_types.h | 2 +- + fs/bcachefs/btree_update_interior.c | 2 +- + fs/bcachefs/ec.c | 10 +++---- + fs/bcachefs/extent_update.c | 1 - + fs/bcachefs/fs-io.c | 18 +++++------ + fs/bcachefs/fs.c | 16 +++++----- + fs/bcachefs/fsck.c | 10 +++---- + fs/bcachefs/io.c | 59 ++++++++++++++++++------------------ + fs/bcachefs/io.h | 6 ++-- + fs/bcachefs/journal.c | 2 +- + fs/bcachefs/journal_io.c | 2 ++ + fs/bcachefs/journal_types.h | 2 +- + fs/bcachefs/migrate.c | 20 +++++++------ + fs/bcachefs/move.c | 26 +++++++++------- + fs/bcachefs/recovery.c | 34 +++++++++++---------- + fs/bcachefs/reflink.c | 21 ++++++------- + 24 files changed, 247 insertions(+), 200 deletions(-) + create mode 100644 fs/bcachefs/bkey_buf.h + delete mode 100644 fs/bcachefs/bkey_on_stack.h + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 9f59c6b3a25e..307d5523a52d 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -634,8 +634,6 @@ struct bch_reservation { + #define BKEY_EXTENT_VAL_U64s_MAX \ + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + +-#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) +- + /* * Maximum possible size of an entire extent, key + value: */ + #define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h +new file mode 100644 +index 000000000000..0d7c67a959af +--- /dev/null ++++ b/fs/bcachefs/bkey_buf.h +@@ -0,0 +1,60 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_BUF_H ++#define _BCACHEFS_BKEY_BUF_H ++ ++#include "bcachefs.h" ++ ++struct bkey_buf { ++ struct bkey_i *k; ++ u64 onstack[12]; ++}; ++ ++static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, ++ struct bch_fs *c, unsigned u64s) ++{ ++ if (s->k == (void *) s->onstack && ++ u64s > ARRAY_SIZE(s->onstack)) { ++ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); ++ memcpy(s->k, s->onstack, sizeof(s->onstack)); ++ } ++} ++ ++static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, ++ struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_buf_realloc(s, c, k.k->u64s); ++ bkey_reassemble(s->k, k); ++} ++ ++static inline void bch2_bkey_buf_copy(struct bkey_buf *s, ++ struct bch_fs *c, ++ struct bkey_i *src) ++{ ++ bch2_bkey_buf_realloc(s, c, src->k.u64s); ++ bkey_copy(s->k, src); ++} ++ ++static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, ++ struct bch_fs *c, ++ struct btree *b, ++ struct bkey_packed *src) ++{ ++ bch2_bkey_buf_realloc(s, c, BKEY_U64s + ++ bkeyp_val_u64s(&b->format, src)); ++ bch2_bkey_unpack(b, s->k, src); ++} ++ ++static inline void bch2_bkey_buf_init(struct bkey_buf *s) ++{ ++ s->k = (void *) s->onstack; ++} ++ ++static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) ++{ ++ if (s->k != (void *) s->onstack) ++ mempool_free(s->k, &c->large_bkey_pool); ++ s->k = NULL; ++} ++ ++#endif /* _BCACHEFS_BKEY_BUF_H */ +diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h +deleted file mode 100644 +index f607a0cb37ed..000000000000 +--- 
a/fs/bcachefs/bkey_on_stack.h ++++ /dev/null +@@ -1,43 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _BCACHEFS_BKEY_ON_STACK_H +-#define _BCACHEFS_BKEY_ON_STACK_H +- +-#include "bcachefs.h" +- +-struct bkey_on_stack { +- struct bkey_i *k; +- u64 onstack[12]; +-}; +- +-static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, +- struct bch_fs *c, unsigned u64s) +-{ +- if (s->k == (void *) s->onstack && +- u64s > ARRAY_SIZE(s->onstack)) { +- s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); +- memcpy(s->k, s->onstack, sizeof(s->onstack)); +- } +-} +- +-static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, +- struct bch_fs *c, +- struct bkey_s_c k) +-{ +- bkey_on_stack_realloc(s, c, k.k->u64s); +- bkey_reassemble(s->k, k); +-} +- +-static inline void bkey_on_stack_init(struct bkey_on_stack *s) +-{ +- s->k = (void *) s->onstack; +-} +- +-static inline void bkey_on_stack_exit(struct bkey_on_stack *s, +- struct bch_fs *c) +-{ +- if (s->k != (void *) s->onstack) +- mempool_free(s->k, &c->large_bkey_pool); +- s->k = NULL; +-} +- +-#endif /* _BCACHEFS_BKEY_ON_STACK_H */ +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 99e0a4011fae..2e1d9cd65f43 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + #include "bcachefs.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "bkey_sort.h" + #include "bset.h" + #include "extents.h" +@@ -187,11 +187,11 @@ bch2_sort_repack_merge(struct bch_fs *c, + bool filter_whiteouts) + { + struct bkey_packed *out = vstruct_last(dst), *k_packed; +- struct bkey_on_stack k; ++ struct bkey_buf k; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); +- bkey_on_stack_init(&k); ++ bch2_bkey_buf_init(&k); + + while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { + if (filter_whiteouts && bkey_whiteout(k_packed)) +@@ -204,7 +204,7 @@ bch2_sort_repack_merge(struct bch_fs *c, + * node; we have to make a copy of the entire key before calling + * normalize + */ +- bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); ++ bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s); + bch2_bkey_unpack(src, k.k, k_packed); + + if (filter_whiteouts && +@@ -215,7 +215,7 @@ bch2_sort_repack_merge(struct bch_fs *c, + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); +- bkey_on_stack_exit(&k, c); ++ bch2_bkey_buf_exit(&k, c); + return nr; + } + +@@ -315,11 +315,11 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct bkey l_unpacked, r_unpacked; + struct bkey_s l, r; + struct btree_nr_keys nr; +- struct bkey_on_stack split; ++ struct bkey_buf split; + unsigned i; + + memset(&nr, 0, sizeof(nr)); +- bkey_on_stack_init(&split); ++ bch2_bkey_buf_init(&split); + + sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); + for (i = 0; i < iter->used;) { +@@ -379,7 +379,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + /* + * r wins, but it overlaps in the middle of l - split l: + */ +- bkey_on_stack_reassemble(&split, c, l.s_c); ++ bch2_bkey_buf_reassemble(&split, c, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), split.k); + + bch2_cut_front_s(r.k->p, l); +@@ -398,7 +398,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + +- bkey_on_stack_exit(&split, c); ++ bch2_bkey_buf_exit(&split, c); + return nr; + } + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 
09774f56f11c..fda6540be035 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" ++#include "bkey_buf.h" + #include "btree_cache.h" + #include "btree_io.h" + #include "btree_iter.h" +@@ -898,10 +899,12 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + struct btree *parent; + struct btree_node_iter node_iter; + struct bkey_packed *k; +- BKEY_PADDED(k) tmp; ++ struct bkey_buf tmp; + struct btree *ret = NULL; + unsigned level = b->c.level; + ++ bch2_bkey_buf_init(&tmp); ++ + parent = btree_iter_node(iter, level + 1); + if (!parent) + return NULL; +@@ -935,9 +938,9 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + if (!k) + goto out; + +- bch2_bkey_unpack(parent, &tmp.k, k); ++ bch2_bkey_buf_unpack(&tmp, c, parent, k); + +- ret = bch2_btree_node_get(c, iter, &tmp.k, level, ++ ret = bch2_btree_node_get(c, iter, tmp.k, level, + SIX_LOCK_intent, _THIS_IP_); + + if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { +@@ -957,7 +960,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + if (sib == btree_prev_sib) + btree_node_unlock(iter, level); + +- ret = bch2_btree_node_get(c, iter, &tmp.k, level, ++ ret = bch2_btree_node_get(c, iter, tmp.k, level, + SIX_LOCK_intent, _THIS_IP_); + + /* +@@ -998,6 +1001,8 @@ out: + + bch2_btree_trans_verify_locks(trans); + ++ bch2_bkey_buf_exit(&tmp, c); ++ + return ret; + } + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index d0a856ec60e3..7554d7d4662c 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -8,7 +8,7 @@ + #include "alloc_background.h" + #include "alloc_foreground.h" + #include "bkey_methods.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "btree_locking.h" + #include "btree_update_interior.h" + #include "btree_io.h" +@@ -267,10 +267,12 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bpos next_node_start = b->data->min_key; ++ struct bkey_buf tmp; + u8 max_stale = 0; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ bch2_bkey_buf_init(&tmp); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_debugcheck(c, b, k); +@@ -284,10 +286,9 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + + if (b->c.level) { + struct btree *child; +- BKEY_PADDED(k) tmp; + +- bkey_reassemble(&tmp.k, k); +- k = bkey_i_to_s_c(&tmp.k); ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ k = bkey_i_to_s_c(tmp.k); + + bch2_btree_and_journal_iter_advance(&iter); + +@@ -299,7 +300,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + break; + + if (b->c.level > target_depth) { +- child = bch2_btree_node_get_noiter(c, &tmp.k, ++ child = bch2_btree_node_get_noiter(c, tmp.k, + b->c.btree_id, b->c.level - 1); + ret = PTR_ERR_OR_ZERO(child); + if (ret) +@@ -317,6 +318,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + } + } + ++ bch2_bkey_buf_exit(&tmp, c); + return ret; + } + +@@ -929,10 +931,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- struct bkey_on_stack sk; ++ struct bkey_buf sk; + int ret = 0; + +- bkey_on_stack_init(&sk); ++ bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, +@@ 
-941,7 +943,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { + if (gc_btree_gens_key(c, k)) { +- bkey_on_stack_reassemble(&sk, c, k); ++ bch2_bkey_buf_reassemble(&sk, c, k); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); +@@ -961,7 +963,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + } + + bch2_trans_exit(&trans); +- bkey_on_stack_exit(&sk, c); ++ bch2_bkey_buf_exit(&sk, c); + + return ret; + } +@@ -1073,7 +1075,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + } + + if (bch2_keylist_realloc(&keylist, NULL, 0, +- (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { ++ BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) { + trace_btree_gc_coalesce_fail(c, + BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); + return; +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 4dde972d353a..768fc85eaa4e 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1320,12 +1320,13 @@ static void bch2_btree_node_write_error(struct bch_fs *c, + struct btree_write_bio *wbio) + { + struct btree *b = wbio->wbio.bio.bi_private; +- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ struct bkey_buf k; + struct bch_extent_ptr *ptr; + struct btree_trans trans; + struct btree_iter *iter; + int ret; + ++ bch2_bkey_buf_init(&k); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, +@@ -1344,21 +1345,22 @@ retry: + + BUG_ON(!btree_node_hashed(b)); + +- bkey_copy(&tmp.k, &b->key); ++ bch2_bkey_buf_copy(&k, c, &b->key); + +- bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, ++ bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr, + bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); + +- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k))) + goto err; + +- ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); ++ ret = bch2_btree_node_update_key(c, iter, b, k.k); + if (ret == -EINTR) + goto retry; + if (ret) + goto err; + out: + bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&k, c); + bio_put(&wbio->wbio.bio); + btree_node_write_done(c, b); + return; +@@ -1476,7 +1478,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + struct bset *i; + struct btree_node *bn = NULL; + struct btree_node_entry *bne = NULL; +- BKEY_PADDED(key) k; ++ struct bkey_buf k; + struct bch_extent_ptr *ptr; + struct sort_iter sort_iter; + struct nonce nonce; +@@ -1487,6 +1489,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + bool validate_before_checksum = false; + void *data; + ++ bch2_bkey_buf_init(&k); ++ + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + return; + +@@ -1695,15 +1699,16 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + * just make all btree node writes FUA to keep things sane. 
+ */ + +- bkey_copy(&k.key, &b->key); ++ bch2_bkey_buf_copy(&k, c, &b->key); + +- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr) + ptr->offset += b->written; + + b->written += sectors_to_write; + + /* XXX: submitting IO with btree locks held: */ +- bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); ++ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); ++ bch2_bkey_buf_exit(&k, c); + return; + err: + set_btree_node_noevict(b); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f7c6f7bfca67..4d825cac22ce 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2,6 +2,7 @@ + + #include "bcachefs.h" + #include "bkey_methods.h" ++#include "bkey_buf.h" + #include "btree_cache.h" + #include "btree_iter.h" + #include "btree_key_cache.h" +@@ -1048,27 +1049,31 @@ static void btree_iter_prefetch(struct btree_iter *iter) + struct btree_iter_level *l = &iter->l[iter->level]; + struct btree_node_iter node_iter = l->iter; + struct bkey_packed *k; +- BKEY_PADDED(k) tmp; ++ struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + ? (iter->level > 1 ? 0 : 2) + : (iter->level > 1 ? 1 : 16); + bool was_locked = btree_node_locked(iter, iter->level); + ++ bch2_bkey_buf_init(&tmp); ++ + while (nr) { + if (!bch2_btree_node_relock(iter, iter->level)) +- return; ++ break; + + bch2_btree_node_iter_advance(&node_iter, l->b); + k = bch2_btree_node_iter_peek(&node_iter, l->b); + if (!k) + break; + +- bch2_bkey_unpack(l->b, &tmp.k, k); +- bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); ++ bch2_bkey_buf_unpack(&tmp, c, l->b, k); ++ bch2_btree_node_prefetch(c, iter, tmp.k, iter->level - 1); + } + + if (!was_locked) + btree_node_unlock(iter, iter->level); ++ ++ bch2_bkey_buf_exit(&tmp, c); + } + + static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, +@@ -1100,30 +1105,34 @@ static __always_inline int btree_iter_down(struct btree_iter *iter, + struct btree *b; + unsigned level = iter->level - 1; + enum six_lock_type lock_type = __btree_lock_want(iter, level); +- BKEY_PADDED(k) tmp; ++ struct bkey_buf tmp; ++ int ret; + + EBUG_ON(!btree_node_locked(iter, iter->level)); + +- bch2_bkey_unpack(l->b, &tmp.k, ++ bch2_bkey_buf_init(&tmp); ++ bch2_bkey_buf_unpack(&tmp, c, l->b, + bch2_btree_node_iter_peek(&l->iter, l->b)); + +- b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip); +- if (unlikely(IS_ERR(b))) +- return PTR_ERR(b); ++ b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (unlikely(ret)) ++ goto err; + + mark_btree_node_locked(iter, level, lock_type); + btree_iter_node_set(iter, b); + +- if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && +- unlikely(b != btree_node_mem_ptr(&tmp.k))) ++ if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && ++ unlikely(b != btree_node_mem_ptr(tmp.k))) + btree_node_mem_ptr_set(iter, level + 1, b); + + if (iter->flags & BTREE_ITER_PREFETCH) + btree_iter_prefetch(iter); + + iter->level = level; +- +- return 0; ++err: ++ bch2_bkey_buf_exit(&tmp, c); ++ return ret; + } + + static void btree_iter_up(struct btree_iter *iter) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index dc7de27112c6..631bf4694f4d 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -57,7 +57,7 @@ struct btree_write { + + struct btree_alloc { + struct open_buckets ob; +- BKEY_PADDED(k); ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); + 
}; + + struct btree_bkey_cached_common { +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 8c7f7a8b5375..39d5206b8eae 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -195,7 +195,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + { + struct write_point *wp; + struct btree *b; +- BKEY_PADDED(k) tmp; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + struct open_buckets ob = { .nr = 0 }; + struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; + unsigned nr_reserve; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index c75d86071226..eb37b79ade7f 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -4,7 +4,7 @@ + + #include "bcachefs.h" + #include "alloc_foreground.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "bset.h" + #include "btree_gc.h" + #include "btree_update.h" +@@ -783,10 +783,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_s_extent e; +- struct bkey_on_stack sk; ++ struct bkey_buf sk; + int ret = 0, dev, idx; + +- bkey_on_stack_init(&sk); ++ bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + /* XXX this doesn't support the reflink btree */ +@@ -813,7 +813,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + + dev = s->key.v.ptrs[idx].dev; + +- bkey_on_stack_reassemble(&sk, c, k); ++ bch2_bkey_buf_reassemble(&sk, c, k); + e = bkey_i_to_s_extent(sk.k); + + bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); +@@ -834,7 +834,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + } + + bch2_trans_exit(&trans); +- bkey_on_stack_exit(&sk, c); ++ bch2_bkey_buf_exit(&sk, c); + + return ret; + } +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index fd011df3cb99..1faca4bc1825 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -1,6 +1,5 @@ + // SPDX-License-Identifier: GPL-2.0 + #include "bcachefs.h" +-#include "bkey_on_stack.h" + #include "btree_update.h" + #include "btree_update_interior.h" + #include "buckets.h" +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index ae059fddf44c..dbb20d0f9092 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -3,7 +3,7 @@ + + #include "bcachefs.h" + #include "alloc_foreground.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "btree_update.h" + #include "buckets.h" + #include "clock.h" +@@ -799,7 +799,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, + struct readpages_iter *readpages_iter) + { + struct bch_fs *c = trans->c; +- struct bkey_on_stack sk; ++ struct bkey_buf sk; + int flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE; + int ret = 0; +@@ -807,7 +807,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, + rbio->c = c; + rbio->start_time = local_clock(); + +- bkey_on_stack_init(&sk); ++ bch2_bkey_buf_init(&sk); + retry: + while (1) { + struct bkey_s_c k; +@@ -825,7 +825,7 @@ retry: + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + +- bkey_on_stack_reassemble(&sk, c, k); ++ bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(trans, + &offset_into_extent, &sk); +@@ -870,7 +870,7 @@ retry: + bio_endio(&rbio->bio); + } + +- bkey_on_stack_exit(&sk, c); ++ bch2_bkey_buf_exit(&sk, c); + } + + void bch2_readahead(struct readahead_control *ractl) +@@ -2424,7 +2424,7 @@ static long 
bchfs_fcollapse_finsert(struct bch_inode_info *inode, + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; +- struct bkey_on_stack copy; ++ struct bkey_buf copy; + struct btree_trans trans; + struct btree_iter *src, *dst; + loff_t shift, new_size; +@@ -2434,7 +2434,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + if ((offset | len) & (block_bytes(c) - 1)) + return -EINVAL; + +- bkey_on_stack_init(©); ++ bch2_bkey_buf_init(©); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); + + /* +@@ -2522,7 +2522,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) + break; + reassemble: +- bkey_on_stack_reassemble(©, c, k); ++ bch2_bkey_buf_reassemble(©, c, k); + + if (insert && + bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) +@@ -2599,7 +2599,7 @@ bkey_err: + } + err: + bch2_trans_exit(&trans); +- bkey_on_stack_exit(©, c); ++ bch2_bkey_buf_exit(©, c); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); + return ret; +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 983c1555622d..2735aeba7d15 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -3,7 +3,7 @@ + + #include "bcachefs.h" + #include "acl.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "btree_update.h" + #include "buckets.h" + #include "chardev.h" +@@ -898,7 +898,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- struct bkey_on_stack cur, prev; ++ struct bkey_buf cur, prev; + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); + unsigned offset_into_extent, sectors; + bool have_extent = false; +@@ -911,8 +911,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + if (start + len < start) + return -EINVAL; + +- bkey_on_stack_init(&cur); +- bkey_on_stack_init(&prev); ++ bch2_bkey_buf_init(&cur); ++ bch2_bkey_buf_init(&prev); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, +@@ -931,7 +931,7 @@ retry: + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + +- bkey_on_stack_reassemble(&cur, c, k); ++ bch2_bkey_buf_reassemble(&cur, c, k); + + ret = bch2_read_indirect_extent(&trans, + &offset_into_extent, &cur); +@@ -939,7 +939,7 @@ retry: + break; + + k = bkey_i_to_s_c(cur.k); +- bkey_on_stack_realloc(&prev, c, k.k->u64s); ++ bch2_bkey_buf_realloc(&prev, c, k.k->u64s); + + sectors = min(sectors, k.k->size - offset_into_extent); + +@@ -973,8 +973,8 @@ retry: + FIEMAP_EXTENT_LAST); + + ret = bch2_trans_exit(&trans) ?: ret; +- bkey_on_stack_exit(&cur, c); +- bkey_on_stack_exit(&prev, c); ++ bch2_bkey_buf_exit(&cur, c); ++ bch2_bkey_buf_exit(&prev, c); + return ret < 0 ? 
ret : 0; + } + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 39f872de0c18..df0f00f10bd7 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1,7 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "btree_update.h" + #include "dirent.h" + #include "error.h" +@@ -464,11 +464,11 @@ static int check_extents(struct bch_fs *c) + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- struct bkey_on_stack prev; ++ struct bkey_buf prev; + u64 i_sectors; + int ret = 0; + +- bkey_on_stack_init(&prev); ++ bch2_bkey_buf_init(&prev); + prev.k->k = KEY(0, 0, 0); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +@@ -500,7 +500,7 @@ retry: + goto err; + } + } +- bkey_on_stack_reassemble(&prev, c, k); ++ bch2_bkey_buf_reassemble(&prev, c, k); + + ret = walk_inode(&trans, &w, k.k->p.inode); + if (ret) +@@ -569,7 +569,7 @@ err: + fsck_err: + if (ret == -EINTR) + goto retry; +- bkey_on_stack_exit(&prev, c); ++ bch2_bkey_buf_exit(&prev, c); + return bch2_trans_exit(&trans) ?: ret; + } + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 69341b5becef..5d884f7c137d 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -9,7 +9,7 @@ + #include "bcachefs.h" + #include "alloc_background.h" + #include "alloc_foreground.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "bset.h" + #include "btree_update.h" + #include "buckets.h" +@@ -415,14 +415,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, + int bch2_write_index_default(struct bch_write_op *op) + { + struct bch_fs *c = op->c; +- struct bkey_on_stack sk; ++ struct bkey_buf sk; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; + struct btree_iter *iter; + int ret; + +- bkey_on_stack_init(&sk); ++ bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, +@@ -434,7 +434,7 @@ int bch2_write_index_default(struct bch_write_op *op) + + k = bch2_keylist_front(keys); + +- bkey_on_stack_realloc(&sk, c, k->k.u64s); ++ bch2_bkey_buf_realloc(&sk, c, k->k.u64s); + bkey_copy(sk.k, k); + bch2_cut_front(iter->pos, sk.k); + +@@ -451,7 +451,7 @@ int bch2_write_index_default(struct bch_write_op *op) + } while (!bch2_keylist_empty(keys)); + + bch2_trans_exit(&trans); +- bkey_on_stack_exit(&sk, c); ++ bch2_bkey_buf_exit(&sk, c); + + return ret; + } +@@ -1627,14 +1627,14 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio + { + struct btree_trans trans; + struct btree_iter *iter; +- struct bkey_on_stack sk; ++ struct bkey_buf sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + +- bkey_on_stack_init(&sk); ++ bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, +@@ -1646,7 +1646,7 @@ retry: + if (bkey_err(k)) + goto err; + +- bkey_on_stack_reassemble(&sk, c, k); ++ bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + +@@ -1667,7 +1667,7 @@ retry: + out: + bch2_rbio_done(rbio); + bch2_trans_exit(&trans); +- bkey_on_stack_exit(&sk, c); ++ bch2_bkey_buf_exit(&sk, c); + return; + err: + rbio->bio.bi_status = BLK_STS_IOERR; +@@ -1680,14 +1680,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, + { + struct btree_trans trans; + struct btree_iter *iter; +- 
struct bkey_on_stack sk; ++ struct bkey_buf sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + +- bkey_on_stack_init(&sk); ++ bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + retry: + bch2_trans_begin(&trans); +@@ -1697,7 +1697,7 @@ retry: + BTREE_ITER_SLOTS, k, ret) { + unsigned bytes, sectors, offset_into_extent; + +- bkey_on_stack_reassemble(&sk, c, k); ++ bch2_bkey_buf_reassemble(&sk, c, k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); +@@ -1746,7 +1746,7 @@ err: + rbio->bio.bi_status = BLK_STS_IOERR; + out: + bch2_trans_exit(&trans); +- bkey_on_stack_exit(&sk, c); ++ bch2_bkey_buf_exit(&sk, c); + bch2_rbio_done(rbio); + } + +@@ -1817,17 +1817,6 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + if ((ret = bkey_err(k))) + goto out; + +- /* +- * going to be temporarily appending another checksum entry: +- */ +- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + +- BKEY_EXTENT_U64s_MAX * 8); +- if ((ret = PTR_ERR_OR_ZERO(new))) +- goto out; +- +- bkey_reassemble(new, k); +- k = bkey_i_to_s_c(new); +- + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) + goto out; +@@ -1846,6 +1835,16 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + goto out; + } + ++ /* ++ * going to be temporarily appending another checksum entry: ++ */ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + ++ sizeof(struct bch_extent_crc128)); ++ if ((ret = PTR_ERR_OR_ZERO(new))) ++ goto out; ++ ++ bkey_reassemble(new, k); ++ + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + +@@ -2012,7 +2011,7 @@ static void bch2_read_endio(struct bio *bio) + + int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, +- struct bkey_on_stack *orig_k) ++ struct bkey_buf *orig_k) + { + struct btree_iter *iter; + struct bkey_s_c k; +@@ -2039,7 +2038,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + } + + *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); +- bkey_on_stack_reassemble(orig_k, trans->c, k); ++ bch2_bkey_buf_reassemble(orig_k, trans->c, k); + err: + bch2_trans_iter_put(trans, iter); + return ret; +@@ -2304,7 +2303,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) + { + struct btree_trans trans; + struct btree_iter *iter; +- struct bkey_on_stack sk; ++ struct bkey_buf sk; + struct bkey_s_c k; + unsigned flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| +@@ -2318,7 +2317,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) + rbio->c = c; + rbio->start_time = local_clock(); + +- bkey_on_stack_init(&sk); ++ bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + retry: + bch2_trans_begin(&trans); +@@ -2341,7 +2340,7 @@ retry: + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + +- bkey_on_stack_reassemble(&sk, c, k); ++ bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(&trans, + &offset_into_extent, &sk); +@@ -2378,7 +2377,7 @@ retry: + } + out: + bch2_trans_exit(&trans); +- bkey_on_stack_exit(&sk, c); ++ bch2_bkey_buf_exit(&sk, c); + return; + err: + if (ret == -EINTR) +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 55ccc923614c..04f6baa1daf7 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -3,7 +3,7 @@ + #define _BCACHEFS_IO_H + + #include "checksum.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "io_types.h" + + #define to_wbio(_bio) 
\ +@@ -114,11 +114,11 @@ struct cache_promote_op; + struct extent_ptr_decoded; + + int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, +- struct bkey_on_stack *); ++ struct bkey_buf *); + + static inline int bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, +- struct bkey_on_stack *k) ++ struct bkey_buf *k) + { + return k->k->k.type == KEY_TYPE_reflink_p + ? __bch2_read_indirect_extent(trans, offset_into_extent, k) +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 56b750c50fbd..69e487bc29ff 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1095,7 +1095,7 @@ int bch2_fs_journal_init(struct journal *j) + + /* Btree roots: */ + j->entry_u64s_reserved += +- BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); ++ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); + + atomic64_set(&j->reservations.counter, + ((union journal_res_state) +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 0e6fbe2f6a75..173b483a9771 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -990,6 +990,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, + done: + rcu_read_unlock(); + ++ BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); ++ + return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; + } + +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 67ee47eb17a7..9953663e3a63 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -20,7 +20,7 @@ + struct journal_buf { + struct jset *data; + +- BKEY_PADDED(key); ++ __BKEY_PADDED(key, BCH_REPLICAS_MAX); + + struct closure_waitlist wait; + +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 96c8690adc5b..6241ff0c129f 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -4,7 +4,7 @@ + */ + + #include "bcachefs.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "btree_update.h" + #include "btree_update_interior.h" + #include "buckets.h" +@@ -41,10 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- struct bkey_on_stack sk; ++ struct bkey_buf sk; + int ret = 0; + +- bkey_on_stack_init(&sk); ++ bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, +@@ -57,7 +57,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + continue; + } + +- bkey_on_stack_reassemble(&sk, c, k); ++ bch2_bkey_buf_reassemble(&sk, c, k); + + ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), + dev_idx, flags, false); +@@ -90,7 +90,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + } + + ret = bch2_trans_exit(&trans) ?: ret; +- bkey_on_stack_exit(&sk, c); ++ bch2_bkey_buf_exit(&sk, c); + + BUG_ON(ret == -EINTR); + +@@ -109,6 +109,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + struct btree_iter *iter; + struct closure cl; + struct btree *b; ++ struct bkey_buf k; + unsigned id; + int ret; + +@@ -116,28 +117,28 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + if (flags & BCH_FORCE_IF_METADATA_LOST) + return -EINVAL; + ++ bch2_bkey_buf_init(&k); + bch2_trans_init(&trans, c, 0, 0); + closure_init_stack(&cl); + + for (id = 0; id < BTREE_ID_NR; id++) { + for_each_btree_node(&trans, iter, id, POS_MIN, + 
BTREE_ITER_PREFETCH, b) { +- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + retry: + if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), + dev_idx)) + continue; + +- bkey_copy(&tmp.k, &b->key); ++ bch2_bkey_buf_copy(&k, c, &b->key); + +- ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), ++ ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), + dev_idx, flags, true); + if (ret) { + bch_err(c, "Cannot drop device without losing data"); + goto err; + } + +- ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); ++ ret = bch2_btree_node_update_key(c, iter, b, k.k); + if (ret == -EINTR) { + b = bch2_btree_iter_peek_node(iter); + goto retry; +@@ -157,6 +158,7 @@ retry: + ret = 0; + err: + ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_bkey_buf_exit(&k, c); + + BUG_ON(ret == -EINTR); + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 44f3c6eec375..b4c315cf68df 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -2,7 +2,7 @@ + + #include "bcachefs.h" + #include "alloc_foreground.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "btree_gc.h" + #include "btree_update.h" + #include "btree_update_interior.h" +@@ -61,8 +61,13 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + struct migrate_write *m = + container_of(op, struct migrate_write, op); + struct keylist *keys = &op->insert_keys; ++ struct bkey_buf _new, _insert; + int ret = 0; + ++ bch2_bkey_buf_init(&_new); ++ bch2_bkey_buf_init(&_insert); ++ bch2_bkey_buf_realloc(&_insert, c, U8_MAX); ++ + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, m->btree_id, +@@ -73,7 +78,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + struct bkey_s_c k; + struct bkey_i *insert; + struct bkey_i_extent *new; +- BKEY_PADDED(k) _new, _insert; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bool did_work = false; +@@ -93,11 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) + goto nomatch; + +- bkey_reassemble(&_insert.k, k); +- insert = &_insert.k; ++ bkey_reassemble(_insert.k, k); ++ insert = _insert.k; + +- bkey_copy(&_new.k, bch2_keylist_front(keys)); +- new = bkey_i_to_extent(&_new.k); ++ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); ++ new = bkey_i_to_extent(_new.k); + bch2_cut_front(iter->pos, &new->k_i); + + bch2_cut_front(iter->pos, insert); +@@ -193,6 +197,8 @@ nomatch: + } + out: + bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&_insert, c); ++ bch2_bkey_buf_exit(&_new, c); + BUG_ON(ret == -EINTR); + return ret; + } +@@ -512,7 +518,7 @@ static int __bch2_move_data(struct bch_fs *c, + { + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); +- struct bkey_on_stack sk; ++ struct bkey_buf sk; + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +@@ -521,7 +527,7 @@ static int __bch2_move_data(struct bch_fs *c, + u64 delay, cur_inum = U64_MAX; + int ret = 0, ret2; + +- bkey_on_stack_init(&sk); ++ bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + stats->data_type = BCH_DATA_user; +@@ -601,7 +607,7 @@ peek: + } + + /* unlock before doing IO: */ +- bkey_on_stack_reassemble(&sk, c, k); ++ bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + +@@ -635,7 +641,7 @@ next_nondata: + } + out: + ret = bch2_trans_exit(&trans) ?: ret; +- bkey_on_stack_exit(&sk, c); ++ bch2_bkey_buf_exit(&sk, c); + + return ret; + } +diff --git 
a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 1883a1faf380..c5da1be46444 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" ++#include "bkey_buf.h" + #include "alloc_background.h" + #include "btree_gc.h" + #include "btree_update.h" +@@ -224,28 +225,29 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + + if (b->c.level) { + struct btree *child; +- BKEY_PADDED(k) tmp; ++ struct bkey_buf tmp; + +- bkey_reassemble(&tmp.k, k); +- k = bkey_i_to_s_c(&tmp.k); ++ bch2_bkey_buf_init(&tmp); ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ k = bkey_i_to_s_c(tmp.k); + + bch2_btree_and_journal_iter_advance(&iter); + +- if (b->c.level > 0) { +- child = bch2_btree_node_get_noiter(c, &tmp.k, +- b->c.btree_id, b->c.level - 1); +- ret = PTR_ERR_OR_ZERO(child); +- if (ret) +- break; ++ child = bch2_btree_node_get_noiter(c, tmp.k, ++ b->c.btree_id, b->c.level - 1); ++ bch2_bkey_buf_exit(&tmp, c); + +- ret = (node_fn ? node_fn(c, b) : 0) ?: +- bch2_btree_and_journal_walk_recurse(c, child, +- journal_keys, btree_id, node_fn, key_fn); +- six_unlock_read(&child->c.lock); ++ ret = PTR_ERR_OR_ZERO(child); ++ if (ret) ++ break; + +- if (ret) +- break; +- } ++ ret = (node_fn ? node_fn(c, b) : 0) ?: ++ bch2_btree_and_journal_walk_recurse(c, child, ++ journal_keys, btree_id, node_fn, key_fn); ++ six_unlock_read(&child->c.lock); ++ ++ if (ret) ++ break; + } else { + bch2_btree_and_journal_iter_advance(&iter); + } +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 8abcbfb3bd64..930547de3309 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + #include "bcachefs.h" +-#include "bkey_on_stack.h" ++#include "bkey_buf.h" + #include "btree_update.h" + #include "extents.h" + #include "inode.h" +@@ -198,8 +198,7 @@ s64 bch2_remap_range(struct bch_fs *c, + struct btree_trans trans; + struct btree_iter *dst_iter, *src_iter; + struct bkey_s_c src_k; +- BKEY_PADDED(k) new_dst; +- struct bkey_on_stack new_src; ++ struct bkey_buf new_dst, new_src; + struct bpos dst_end = dst_start, src_end = src_start; + struct bpos dst_want, src_want; + u64 src_done, dst_done; +@@ -216,7 +215,8 @@ s64 bch2_remap_range(struct bch_fs *c, + dst_end.offset += remap_sectors; + src_end.offset += remap_sectors; + +- bkey_on_stack_init(&new_src); ++ bch2_bkey_buf_init(&new_dst); ++ bch2_bkey_buf_init(&new_src); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + + src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, +@@ -257,7 +257,7 @@ s64 bch2_remap_range(struct bch_fs *c, + break; + + if (src_k.k->type != KEY_TYPE_reflink_p) { +- bkey_on_stack_reassemble(&new_src, c, src_k); ++ bch2_bkey_buf_reassemble(&new_src, c, src_k); + src_k = bkey_i_to_s_c(new_src.k); + + bch2_cut_front(src_iter->pos, new_src.k); +@@ -275,7 +275,7 @@ s64 bch2_remap_range(struct bch_fs *c, + struct bkey_s_c_reflink_p src_p = + bkey_s_c_to_reflink_p(src_k); + struct bkey_i_reflink_p *dst_p = +- bkey_reflink_p_init(&new_dst.k); ++ bkey_reflink_p_init(new_dst.k); + + u64 offset = le64_to_cpu(src_p.v->idx) + + (src_iter->pos.offset - +@@ -286,12 +286,12 @@ s64 bch2_remap_range(struct bch_fs *c, + BUG(); + } + +- new_dst.k.k.p = dst_iter->pos; +- bch2_key_resize(&new_dst.k.k, ++ new_dst.k->k.p = dst_iter->pos; ++ bch2_key_resize(&new_dst.k->k, + min(src_k.k->p.offset - src_iter->pos.offset, + dst_end.offset - dst_iter->pos.offset)); + +- ret = 
bch2_extent_update(&trans, dst_iter, &new_dst.k, ++ ret = bch2_extent_update(&trans, dst_iter, new_dst.k, + NULL, journal_seq, + new_i_size, i_sectors_delta); + if (ret) +@@ -333,7 +333,8 @@ err: + } while (ret2 == -EINTR); + + ret = bch2_trans_exit(&trans) ?: ret; +- bkey_on_stack_exit(&new_src, c); ++ bch2_bkey_buf_exit(&new_src, c); ++ bch2_bkey_buf_exit(&new_dst, c); + + percpu_ref_put(&c->writes); + +-- +cgit v1.2.3 + + +From a284bd2353b17117f75f77676383419de8ea9737 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 4 Jan 2021 15:46:57 -0500 +Subject: bcachefs: Fix journal_buf_realloc() + +It used to be safe to reallocate a buf that the write path owns without +holding the journal lock, but now this can trigger an assertion in +journal_seq_to_buf(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 173b483a9771..53bb77e28a43 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1052,9 +1052,13 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) + return; + + memcpy(new_buf, buf->data, buf->buf_size); +- kvpfree(buf->data, buf->buf_size); +- buf->data = new_buf; +- buf->buf_size = new_size; ++ ++ spin_lock(&j->lock); ++ swap(buf->data, new_buf); ++ swap(buf->buf_size, new_size); ++ spin_unlock(&j->lock); ++ ++ kvpfree(new_buf, new_size); + } + + static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) +-- +cgit v1.2.3 + + +From 23bf38b6d4cd891b902c6253fdc730f46a40ceb2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 6 Jan 2021 18:49:35 -0500 +Subject: bcachefs: Don't error out of recovery process on journal read error + +We don't want to fail the recovery/mount because of a single error +reading from the journal - the relevant journal entry may still be found +on other devices, and missing or no journal entries found is already +handled later in the recovery process. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 53bb77e28a43..2a344a04de87 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -577,8 +577,15 @@ reread: + if (bch2_dev_io_err_on(ret, ca, + "journal read error: sector %llu", + offset) || +- bch2_meta_read_fault("journal")) +- return -EIO; ++ bch2_meta_read_fault("journal")) { ++ /* ++ * We don't error out of the recovery process ++ * here, since the relevant journal entry may be ++ * found on a different device, and missing or ++ * no journal entries will be handled later ++ */ ++ return 0; ++ } + + j = buf->data; + } +-- +cgit v1.2.3 + + +From 99cc9eb395e9e580b24bafc19fe9cb6702428d66 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Jan 2021 17:06:22 -0500 +Subject: bcachefs: Work around a zstd bug + +The zstd compression code seems to have a bug where it will write just +past the end of the destination buffer - probably only when the +compressed output isn't going to fit in the destination buffer, which +will never happen if you're always allocating a bigger buffer than the +source buffer which would explain other users not hitting it. But, we +size the buffer according to how much contiguous space on disk we have, +so... 
+ +generally, bugs like this don't write more than a word past the end of +the buffer, so an easy workaround is to subtract a fudge factor from the +buffer size. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/compress.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 27bbc265d550..78757dcede36 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -336,8 +336,19 @@ static int attempt_compress(struct bch_fs *c, + ZSTD_CCtx *ctx = zstd_init_cctx(workspace, + zstd_cctx_workspace_bound(&c->zstd_params.cParams)); + ++ /* ++ * ZSTD requires that when we decompress we pass in the exact ++ * compressed size - rounding it up to the nearest sector ++ * doesn't work, so we use the first 4 bytes of the buffer for ++ * that. ++ * ++ * Additionally, the ZSTD code seems to have a bug where it will ++ * write just past the end of the buffer - so subtract a fudge ++ * factor (7 bytes) from the dst buffer size to account for ++ * that. ++ */ + size_t len = zstd_compress_cctx(ctx, +- dst + 4, dst_len - 4, ++ dst + 4, dst_len - 4 - 7, + src, src_len, + &c->zstd_params); + if (zstd_is_error(len)) +-- +cgit v1.2.3 + + +From 97dc401a0c556943be22124b93b719895cb44181 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Jan 2021 17:18:14 -0500 +Subject: bcachefs: Reserve some open buckets for btree allocations + +This reverts part of the change from "bcachefs: Don't use +BTREE_INSERT_USE_RESERVE so much" - it turns out we still should be +reserving open buckets for btree node allocations, because otherwise +data bucket allocations (especially with erasure coding enabled) can use +up all our open buckets and we won't be able to do the metadata update +that lets us release those open bucket references. Oops. 
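+
+A rough sketch of the idea (hypothetical names, not the actual bcachefs
+allocator API): data allocations have to leave part of the open-bucket
+pool free, while btree allocations are allowed to dip into that reserve,
+mirroring the open_buckets_reserved() change below.
+
+    #include <stdbool.h>
+
+    enum alloc_kind { ALLOC_BTREE, ALLOC_DATA };
+
+    /* Data writes refuse to consume the slice kept for metadata: */
+    static bool may_alloc_open_bucket(unsigned free, unsigned total,
+                                      enum alloc_kind kind)
+    {
+        unsigned reserved = kind == ALLOC_BTREE ? 0 : total / 2;
+
+        return free > reserved;
+    }
+
+That way the metadata update which releases open bucket references can
+always allocate, even when data writes have used up their share.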
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 6 +++++- + fs/bcachefs/alloc_types.h | 4 ++-- + fs/bcachefs/btree_update_interior.c | 4 ++-- + 3 files changed, 9 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index cb7f1fc16daa..dcbe04040a39 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -204,8 +204,11 @@ success: + static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) + { + switch (reserve) { +- case RESERVE_MOVINGGC: ++ case RESERVE_BTREE: ++ case RESERVE_BTREE_MOVINGGC: + return 0; ++ case RESERVE_MOVINGGC: ++ return OPEN_BUCKETS_COUNT / 4; + default: + return OPEN_BUCKETS_COUNT / 2; + } +@@ -261,6 +264,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + goto out; + + switch (reserve) { ++ case RESERVE_BTREE_MOVINGGC: + case RESERVE_MOVINGGC: + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) + goto out; +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index 0cfb026a02e5..1abfff5290bc 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -34,9 +34,9 @@ struct bucket_clock { + struct mutex lock; + }; + +-/* There is one reserve for each type of btree, one for prios and gens +- * and one for moving GC */ + enum alloc_reserve { ++ RESERVE_BTREE_MOVINGGC = -2, ++ RESERVE_BTREE = -1, + RESERVE_MOVINGGC = 0, + RESERVE_NONE = 1, + RESERVE_NR = 2, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 39d5206b8eae..36bb1d8d164e 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -203,10 +203,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + + if (flags & BTREE_INSERT_USE_RESERVE) { + nr_reserve = 0; +- alloc_reserve = RESERVE_MOVINGGC; ++ alloc_reserve = RESERVE_BTREE_MOVINGGC; + } else { + nr_reserve = BTREE_NODE_RESERVE; +- alloc_reserve = RESERVE_NONE; ++ alloc_reserve = RESERVE_BTREE; + } + + mutex_lock(&c->btree_reserve_cache_lock); +-- +cgit v1.2.3 + + +From 82dc196d34bc14697164d52ab996d706ebf1eeaf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 8 Jan 2021 10:56:39 -0500 +Subject: bcachefs: Fix btree node split after merge operations + +A btree node merge operation deletes a key in the parent node; if when +inserting into the parent node we split the parent node, we can end up +with a whiteout in the parent node that we don't want. + +The existing code drops them before doing the split, because they can +screw up picking the pivot, but we forgot about the unwritten writeouts +area - that needs to be cleared out too. 
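+
+Illustrative sketch (the helper name is made up, only the assignment is
+the actual fix): when splitting an interior node after a merge, whiteouts
+have to be discarded from both places they can live before the pivot is
+chosen:
+
+    /* sketch of preparing an interior node for a split */
+    static void prep_interior_node_for_split(struct btree *b)
+    {
+        drop_whiteouts_from_bsets(b);   /* already done by the existing code */
+        b->whiteout_u64s = 0;           /* unwritten whiteouts area - the fix below */
+    }
+
+A whiteout that survives in the unwritten area would otherwise be carried
+into one of the new nodes produced by the split.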
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 36bb1d8d164e..5bb653298c6c 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1227,6 +1227,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + src = n; + } + ++ /* Also clear out the unwritten whiteouts area: */ ++ b->whiteout_u64s = 0; ++ + i->u64s = cpu_to_le16((u64 *) dst - i->_data); + set_btree_bset_end(b, b->set); + +-- +cgit v1.2.3 + + +From c08db5c9275b4bcd79a8ad999892bc03145abdaa Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 8 Jan 2021 21:20:58 -0500 +Subject: bcachefs: bch2_alloc_write() should be writing for all devices + +Alloc info isn't stored on a particular device, it makes no sense to +only be writing it out for rw members - this was causing fsck to not fix +alloc info errors, oops. + +Also, make sure we write out alloc info in other repair paths. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 +- + fs/bcachefs/bcachefs.h | 2 +- + fs/bcachefs/btree_gc.c | 8 +++++--- + fs/bcachefs/recovery.c | 15 +++++---------- + 4 files changed, 12 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 6c3fdc41aec5..922b24aaf367 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -366,7 +366,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) + unsigned i; + int ret = 0; + +- for_each_rw_member(ca, c, i) { ++ for_each_member_device(ca, c, i) { + bch2_dev_alloc_write(c, ca, flags); + if (ret) { + percpu_ref_put(&ca->io_ref); +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index d54413bec18f..06d68e97ae4d 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -511,7 +511,7 @@ enum { + + /* misc: */ + BCH_FS_FIXED_GENS, +- BCH_FS_ALLOC_WRITTEN, ++ BCH_FS_NEED_ALLOC_WRITE, + BCH_FS_REBUILD_REPLICAS, + BCH_FS_HOLD_BTREE_WRITES, + }; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 7554d7d4662c..6b06f6079908 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -132,6 +132,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + ptr->gen)) { + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->gen_valid = g->gen_valid = true; ++ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } + + if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, +@@ -145,6 +146,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + g2->_mark.dirty_sectors = 0; + g2->_mark.cached_sectors = 0; + set_bit(BCH_FS_FIXED_GENS, &c->flags); ++ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } + } + } +@@ -571,7 +573,7 @@ static int bch2_gc_done(struct bch_fs *c, + fsck_err(c, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f); \ + dst->_f = src->_f; \ +- ret = 1; \ ++ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } + #define copy_stripe_field(_f, _msg, ...) 
\ + if (dst->_f != src->_f) { \ +@@ -582,7 +584,7 @@ static int bch2_gc_done(struct bch_fs *c, + dst->_f, src->_f); \ + dst->_f = src->_f; \ + dst->dirty = true; \ +- ret = 1; \ ++ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } + #define copy_bucket_field(_f) \ + if (dst->b[b].mark._f != src->b[b].mark._f) { \ +@@ -593,7 +595,7 @@ static int bch2_gc_done(struct bch_fs *c, + bch2_data_types[dst->b[b].mark.data_type],\ + dst->b[b].mark._f, src->b[b].mark._f); \ + dst->b[b]._mark._f = src->b[b].mark._f; \ +- ret = 1; \ ++ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } + #define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index c5da1be46444..5a43682c26ef 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -938,7 +938,7 @@ int bch2_fs_recovery(struct bch_fs *c) + struct bch_sb_field_clean *clean = NULL; + struct jset *last_journal_entry = NULL; + u64 blacklist_seq, journal_seq; +- bool write_sb = false, need_write_alloc = false; ++ bool write_sb = false; + int ret; + + if (c->sb.clean) +@@ -1084,10 +1084,8 @@ use_clean: + bch_info(c, "starting metadata mark and sweep"); + err = "error in mark and sweep"; + ret = bch2_gc(c, &c->journal_keys, true, true); +- if (ret < 0) +- goto err; + if (ret) +- need_write_alloc = true; ++ goto err; + bch_verbose(c, "mark and sweep done"); + } + +@@ -1097,10 +1095,8 @@ use_clean: + bch_info(c, "starting mark and sweep"); + err = "error in mark and sweep"; + ret = bch2_gc(c, &c->journal_keys, true, false); +- if (ret < 0) +- goto err; + if (ret) +- need_write_alloc = true; ++ goto err; + bch_verbose(c, "mark and sweep done"); + } + +@@ -1124,7 +1120,8 @@ use_clean: + goto err; + bch_verbose(c, "journal replay done"); + +- if (need_write_alloc && !c->opts.nochanges) { ++ if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && ++ !c->opts.nochanges) { + /* + * note that even when filesystem was clean there might be work + * to do here, if we ran gc (because of fsck) which recalculated +@@ -1139,8 +1136,6 @@ use_clean: + goto err; + } + bch_verbose(c, "alloc write done"); +- +- set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); + } + + if (!c->sb.clean) { +-- +cgit v1.2.3 + + +From 4e359264fb33ce8f1caf5cd83086edd004c556f4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 10 Jan 2021 13:38:09 -0500 +Subject: bcachefs: Fix bch2_replicas_gc2 + +This fixes a regression introduced by "bcachefs: Refactor filesystem +usage accounting". We have to include all the replicas entries that have +any of the entries for different journal entries nonzero, we can't skip +them if they sum to zero. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/replicas.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index d37d173f3ba6..b1d8db677c1c 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -598,7 +598,11 @@ retry: + cpu_replicas_entry(&c->replicas, i); + + if (e->data_type == BCH_DATA_journal || +- bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) ++ c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i]) || ++ percpu_u64_get(&c->usage[2]->replicas[i]) || ++ percpu_u64_get(&c->usage[3]->replicas[i])) + memcpy(cpu_replicas_entry(&new, new.nr++), + e, new.entry_size); + } +-- +cgit v1.2.3 + + +From 86effbb72eaf7ad1f6ec27665b6c801e4fc281fc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 11 Jan 2021 13:31:15 -0500 +Subject: bcachefs: Disable splice_write + +This is currently busted - not sure why. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 2735aeba7d15..444f8f279742 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1019,7 +1019,10 @@ static const struct file_operations bch_file_operations = { + .open = generic_file_open, + .fsync = bch2_fsync, + .splice_read = generic_file_splice_read, ++#if 0 ++ /* Busted: */ + .splice_write = iter_file_splice_write, ++#endif + .fallocate = bch2_fallocate_dispatch, + .unlocked_ioctl = bch2_fs_file_ioctl, + #ifdef CONFIG_COMPAT +-- +cgit v1.2.3 + + +From fa53c6e9e10c8332b41f285e216e9dfc9ecd6a59 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 11 Jan 2021 13:37:35 -0500 +Subject: bcachefs: Add cannibalize lock to btree_cache_to_text() + +More debugging info is always a good thing. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index fda6540be035..3a692828b80d 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -1072,6 +1072,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + + void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) + { +- pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used); +- pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty)); ++ pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); ++ pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); ++ pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); + } +-- +cgit v1.2.3 + + +From 01899926132eb47793cf34617b5686e86ce71884 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 11 Jan 2021 13:51:23 -0500 +Subject: bcachefs: Erasure coding fixes & refactoring + + - Originally bch_extent_stripe_ptr didn't contain the block index, + instead we'd have to search through the stripe pointers to figure out + which pointer matched. When the block field was added to + bch_extent_stripe_ptr, not all of the code was updated to use it. + This patch fixes that, and we also now verify that field where it + makes sense. + + - The ec_stripe_buf_init/exit() functions have been improved, and are + now used by the bch2_ec_read_extent() (recovery read) path. + + - get_stripe_key() is now used by bch2_ec_read_extent(). 
+ + - We now have a getter and setter for checksums within a stripe, like + we had previously for block sector counts, and ec_generate_checksums + and ec_validate_checksums are now quite a bit smaller and cleaner. + +ec.c still needs a lot of work, but this patch is slowly moving things +in the right direction. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 20 +++- + fs/bcachefs/ec.c | 311 ++++++++++++++++++++++---------------------------- + fs/bcachefs/ec.h | 46 +++++++- + 3 files changed, 194 insertions(+), 183 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 8bbf958d64e4..ed07dfee0ae3 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1652,7 +1652,7 @@ out: + } + + static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, +- struct bch_extent_stripe_ptr p, ++ struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) + { + struct bch_fs *c = trans->c; +@@ -1662,14 +1662,22 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + struct bch_replicas_padded r; + int ret = 0; + +- ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); ++ ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.ec.idx), &iter, &k); + if (ret < 0) + return ret; + + if (k.k->type != KEY_TYPE_stripe) { + bch2_fs_inconsistent(c, + "pointer to nonexistent stripe %llu", +- (u64) p.idx); ++ (u64) p.ec.idx); ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { ++ bch2_fs_inconsistent(c, ++ "stripe pointer doesn't match stripe %llu", ++ (u64) p.ec.idx); + ret = -EIO; + goto out; + } +@@ -1680,8 +1688,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + goto out; + + bkey_reassemble(&s->k_i, k); +- stripe_blockcount_set(&s->v, p.block, +- stripe_blockcount_get(&s->v, p.block) + ++ stripe_blockcount_set(&s->v, p.ec.block, ++ stripe_blockcount_get(&s->v, p.ec.block) + + sectors); + bch2_trans_update(trans, iter, &s->k_i, 0); + +@@ -1732,7 +1740,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { +- ret = bch2_trans_mark_stripe_ptr(trans, p.ec, ++ ret = bch2_trans_mark_stripe_ptr(trans, p, + disk_sectors, data_type); + if (ret) + return ret; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index eb37b79ade7f..de266af446a6 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -138,44 +138,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, + stripe_blockcount_get(s, i)); + } + +-static int ptr_matches_stripe(struct bch_fs *c, +- struct bch_stripe *v, +- const struct bch_extent_ptr *ptr) ++/* returns blocknr in stripe that we matched: */ ++static int bkey_matches_stripe(struct bch_stripe *s, ++ struct bkey_s_c k) + { +- unsigned i; +- +- for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { +- const struct bch_extent_ptr *ptr2 = v->ptrs + i; +- +- if (ptr->dev == ptr2->dev && +- ptr->gen == ptr2->gen && +- ptr->offset >= ptr2->offset && +- ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) +- return i; +- } ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ unsigned i, nr_data = s->nr_blocks - s->nr_redundant; + +- return -1; +-} +- +-static int extent_matches_stripe(struct bch_fs *c, +- struct bch_stripe *v, +- struct bkey_s_c k) +-{ +- +- switch (k.k->type) { +- case KEY_TYPE_extent: { +- struct bkey_s_c_extent e = bkey_s_c_to_extent(k); +- const struct bch_extent_ptr *ptr; +- int idx; 
+- +- extent_for_each_ptr(e, ptr) { +- idx = ptr_matches_stripe(c, v, ptr); +- if (idx >= 0) +- return idx; +- } +- break; +- } +- } ++ bkey_for_each_ptr(ptrs, ptr) ++ for (i = 0; i < nr_data; i++) ++ if (__bch2_ptr_matches_stripe(s, ptr, i)) ++ return i; + + return -1; + } +@@ -202,74 +176,93 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) + + /* Stripe bufs: */ + +-static void ec_stripe_buf_free(struct ec_stripe_buf *stripe) ++static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) + { + unsigned i; + +- for (i = 0; i < stripe->key.v.nr_blocks; i++) { +- kvpfree(stripe->data[i], stripe->size << 9); +- stripe->data[i] = NULL; ++ for (i = 0; i < buf->key.v.nr_blocks; i++) { ++ kvpfree(buf->data[i], buf->size << 9); ++ buf->data[i] = NULL; + } + } + +-static int ec_stripe_buf_alloc(struct ec_stripe_buf *stripe) ++static int ec_stripe_buf_init(struct ec_stripe_buf *buf, ++ unsigned offset, unsigned size) + { ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1U << v->csum_granularity_bits; ++ unsigned end = offset + size; + unsigned i; + +- memset(stripe->valid, 0xFF, sizeof(stripe->valid)); ++ BUG_ON(end > le16_to_cpu(v->sectors)); ++ ++ offset = round_down(offset, csum_granularity); ++ end = min_t(unsigned, le16_to_cpu(v->sectors), ++ round_up(end, csum_granularity)); + +- for (i = 0; i < stripe->key.v.nr_blocks; i++) { +- stripe->data[i] = kvpmalloc(stripe->size << 9, GFP_KERNEL); +- if (!stripe->data[i]) ++ buf->offset = offset; ++ buf->size = end - offset; ++ ++ memset(buf->valid, 0xFF, sizeof(buf->valid)); ++ ++ for (i = 0; i < buf->key.v.nr_blocks; i++) { ++ buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); ++ if (!buf->data[i]) + goto err; + } + + return 0; + err: +- ec_stripe_buf_free(stripe); ++ ec_stripe_buf_exit(buf); + return -ENOMEM; + } + + /* Checksumming: */ + +-static void ec_generate_checksums(struct ec_stripe_buf *buf) ++static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, ++ unsigned block, unsigned offset) + { + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; +- unsigned csums_per_device = stripe_csums_per_device(v); +- unsigned csum_bytes = bch_crc_bytes[v->csum_type]; +- unsigned i, j; ++ unsigned end = buf->offset + buf->size; ++ unsigned len = min(csum_granularity, end - offset); ++ ++ BUG_ON(offset >= end); ++ BUG_ON(offset < buf->offset); ++ BUG_ON(offset & (csum_granularity - 1)); ++ BUG_ON(offset + len != le16_to_cpu(v->sectors) && ++ (len & (csum_granularity - 1))); ++ ++ return bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[block] + ((offset - buf->offset) << 9), ++ len << 9); ++} ++ ++static void ec_generate_checksums(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned i, j, csums_per_device = stripe_csums_per_device(v); + +- if (!csum_bytes) ++ if (!v->csum_type) + return; + + BUG_ON(buf->offset); + BUG_ON(buf->size != le16_to_cpu(v->sectors)); + +- for (i = 0; i < v->nr_blocks; i++) { +- for (j = 0; j < csums_per_device; j++) { +- unsigned offset = j << v->csum_granularity_bits; +- unsigned len = min(csum_granularity, buf->size - offset); +- +- struct bch_csum csum = +- bch2_checksum(NULL, v->csum_type, +- null_nonce(), +- buf->data[i] + (offset << 9), +- len << 9); +- +- memcpy(stripe_csum(v, i, j), &csum, csum_bytes); +- } +- } ++ for (i = 0; i < v->nr_blocks; i++) ++ for (j = 0; j < csums_per_device; j++) ++ stripe_csum_set(v, i, j, ++ ec_block_checksum(buf, i, j << v->csum_granularity_bits)); + } + + 
static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) + { + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; +- unsigned csum_bytes = bch_crc_bytes[v->csum_type]; + unsigned i; + +- if (!csum_bytes) ++ if (!v->csum_type) + return; + + for (i = 0; i < v->nr_blocks; i++) { +@@ -282,21 +275,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) + while (offset < end) { + unsigned j = offset >> v->csum_granularity_bits; + unsigned len = min(csum_granularity, end - offset); +- struct bch_csum csum; +- +- BUG_ON(offset & (csum_granularity - 1)); +- BUG_ON(offset + len != le16_to_cpu(v->sectors) && +- ((offset + len) & (csum_granularity - 1))); +- +- csum = bch2_checksum(NULL, v->csum_type, +- null_nonce(), +- buf->data[i] + ((offset - buf->offset) << 9), +- len << 9); ++ struct bch_csum want = stripe_csum_get(v, i, j); ++ struct bch_csum got = ec_block_checksum(buf, i, offset); + +- if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { ++ if (bch2_crc_cmp(want, got)) { + bch_err_ratelimited(c, +- "checksum error while doing reconstruct read (%u:%u)", +- i, j); ++ "stripe checksum error at %u:%u: csum type %u, expected %llx got %llx", ++ i, j, v->csum_type, ++ want.lo, got.lo); + clear_bit(i, buf->valid); + break; + } +@@ -373,6 +359,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + ? BCH_DATA_user + : BCH_DATA_parity; + ++ if (ptr_stale(ca, ptr)) { ++ bch_err_ratelimited(c, ++ "error %s stripe: stale pointer", ++ rw == READ ? "reading from" : "writing to"); ++ clear_bit(idx, buf->valid); ++ return; ++ } ++ + if (!bch2_dev_get_ioref(ca, rw)) { + clear_bit(idx, buf->valid); + return; +@@ -415,87 +409,77 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + percpu_ref_put(&ca->io_ref); + } + +-/* recovery read path: */ +-int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) ++static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) + { + struct btree_trans trans; + struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ if (k.k->type != KEY_TYPE_stripe) { ++ ret = -ENOENT; ++ goto err; ++ } ++ bkey_reassemble(&stripe->key.k_i, k); ++err: ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++/* recovery read path: */ ++int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) ++{ + struct ec_stripe_buf *buf; + struct closure cl; +- struct bkey_s_c k; + struct bch_stripe *v; +- unsigned stripe_idx; +- unsigned offset, end; +- unsigned i, nr_data, csum_granularity; +- int ret = 0, idx; ++ unsigned i, offset; ++ int ret = 0; + + closure_init_stack(&cl); + + BUG_ON(!rbio->pick.has_ec); + +- stripe_idx = rbio->pick.ec.idx; +- + buf = kzalloc(sizeof(*buf), GFP_NOIO); + if (!buf) + return -ENOMEM; + +- bch2_trans_init(&trans, c, 0, 0); +- +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, +- POS(0, stripe_idx), +- BTREE_ITER_SLOTS); +- k = bch2_btree_iter_peek_slot(iter); +- if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { ++ ret = get_stripe_key(c, rbio->pick.ec.idx, buf); ++ if (ret) { + bch_err_ratelimited(c, +- "error doing reconstruct read: stripe not found"); ++ "error doing reconstruct read: error %i looking up stripe", ret); + kfree(buf); +- return bch2_trans_exit(&trans) ?: -EIO; ++ 
return -EIO; + } + +- bkey_reassemble(&buf->key.k_i, k); +- bch2_trans_exit(&trans); +- + v = &buf->key.v; + +- nr_data = v->nr_blocks - v->nr_redundant; +- +- idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); +- BUG_ON(idx < 0); +- +- csum_granularity = 1U << v->csum_granularity_bits; +- +- offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; +- end = offset + bio_sectors(&rbio->bio); +- +- BUG_ON(end > le16_to_cpu(v->sectors)); +- +- buf->offset = round_down(offset, csum_granularity); +- buf->size = min_t(unsigned, le16_to_cpu(v->sectors), +- round_up(end, csum_granularity)) - buf->offset; +- +- for (i = 0; i < v->nr_blocks; i++) { +- buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); +- if (!buf->data[i]) { +- ret = -ENOMEM; +- goto err; +- } ++ if (!bch2_ptr_matches_stripe(v, rbio->pick)) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: pointer doesn't match stripe"); ++ ret = -EIO; ++ goto err; + } + +- memset(buf->valid, 0xFF, sizeof(buf->valid)); +- +- for (i = 0; i < v->nr_blocks; i++) { +- struct bch_extent_ptr *ptr = v->ptrs + i; +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; ++ if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: read is bigger than stripe"); ++ ret = -EIO; ++ goto err; ++ } + +- if (ptr_stale(ca, ptr)) { +- bch_err_ratelimited(c, +- "error doing reconstruct read: stale pointer"); +- clear_bit(i, buf->valid); +- continue; +- } ++ ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); ++ if (ret) ++ goto err; + ++ for (i = 0; i < v->nr_blocks; i++) + ec_block_io(c, buf, REQ_OP_READ, i, &cl); +- } + + closure_sync(&cl); + +@@ -513,10 +497,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) + goto err; + + memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, +- buf->data[idx] + ((offset - buf->offset) << 9)); ++ buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); + err: +- for (i = 0; i < v->nr_blocks; i++) +- kfree(buf->data[i]); ++ ec_stripe_buf_exit(buf); + kfree(buf); + return ret; + } +@@ -784,7 +767,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + struct bkey_s_c k; + struct bkey_s_extent e; + struct bkey_buf sk; +- int ret = 0, dev, idx; ++ int ret = 0, dev, block; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); +@@ -805,13 +788,13 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + continue; + } + +- idx = extent_matches_stripe(c, &s->key.v, k); +- if (idx < 0) { ++ block = bkey_matches_stripe(&s->key.v, k); ++ if (block < 0) { + bch2_btree_iter_next(iter); + continue; + } + +- dev = s->key.v.ptrs[idx].dev; ++ dev = s->key.v.ptrs[block].dev; + + bch2_bkey_buf_reassemble(&sk, c, k); + e = bkey_i_to_s_extent(sk.k); +@@ -820,7 +803,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); + BUG_ON(!ec_ptr); + +- extent_stripe_ptr_add(e, s, ec_ptr, idx); ++ extent_stripe_ptr_add(e, s, ec_ptr, block); + + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + bch2_trans_update(&trans, iter, sk.k, 0); +@@ -875,7 +858,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) + swap(s->new_stripe.data[i], + s->existing_stripe.data[i]); + +- ec_stripe_buf_free(&s->existing_stripe); ++ ec_stripe_buf_exit(&s->existing_stripe); + } + + BUG_ON(!s->allocated); +@@ -941,8 +924,8 @@ err: + + bch2_keylist_free(&s->keys, s->inline_keys); + +- 
ec_stripe_buf_free(&s->existing_stripe); +- ec_stripe_buf_free(&s->new_stripe); ++ ec_stripe_buf_exit(&s->existing_stripe); ++ ec_stripe_buf_exit(&s->new_stripe); + closure_debug_destroy(&s->iodone); + kfree(s); + } +@@ -1145,9 +1128,6 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) + + bch2_keylist_init(&s->keys, s->inline_keys); + +- s->new_stripe.offset = 0; +- s->new_stripe.size = h->blocksize; +- + ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, + s->nr_parity, h->blocksize); + +@@ -1305,9 +1285,7 @@ err: + + /* XXX: doesn't obey target: */ + static s64 get_existing_stripe(struct bch_fs *c, +- unsigned target, +- unsigned algo, +- unsigned redundancy) ++ struct ec_stripe_head *head) + { + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m; +@@ -1325,8 +1303,9 @@ static s64 get_existing_stripe(struct bch_fs *c, + stripe_idx = h->data[heap_idx].idx; + m = genradix_ptr(&c->stripes[0], stripe_idx); + +- if (m->algorithm == algo && +- m->nr_redundant == redundancy && ++ if (m->algorithm == head->algo && ++ m->nr_redundant == head->redundancy && ++ m->sectors == head->blocksize && + m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { + bch2_stripes_heap_del(c, m, stripe_idx); + spin_unlock(&c->ec_stripes_heap_lock); +@@ -1338,24 +1317,6 @@ static s64 get_existing_stripe(struct bch_fs *c, + return -1; + } + +-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +-{ +- struct btree_trans trans; +- struct btree_iter *iter; +- struct bkey_s_c k; +- int ret; +- +- bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); +- k = bch2_btree_iter_peek_slot(iter); +- ret = bkey_err(k); +- if (!ret) +- bkey_reassemble(&stripe->key.k_i, k); +- bch2_trans_exit(&trans); +- +- return ret; +-} +- + struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, +@@ -1382,7 +1343,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + return NULL; + } + +- idx = get_existing_stripe(c, target, algo, redundancy); ++ idx = get_existing_stripe(c, h); + if (idx >= 0) { + h->s->have_existing_stripe = true; + ret = get_stripe_key(c, idx, &h->s->existing_stripe); +@@ -1392,7 +1353,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + return NULL; + } + +- if (ec_stripe_buf_alloc(&h->s->existing_stripe)) { ++ if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { + /* + * this is a problem: we have deleted from the + * stripes heap already +@@ -1411,7 +1372,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + &h->s->existing_stripe.key.k_i); + } + +- if (ec_stripe_buf_alloc(&h->s->new_stripe)) { ++ if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) { + BUG(); + } + } +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 97a263cf9c87..c3959af46833 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -60,9 +60,51 @@ static inline unsigned stripe_val_u64s(const struct bch_stripe *s) + } + + static inline void *stripe_csum(struct bch_stripe *s, +- unsigned dev, unsigned csum_idx) ++ unsigned block, unsigned csum_idx) + { +- return (void *) s + stripe_csum_offset(s, dev, csum_idx); ++ EBUG_ON(block >= s->nr_blocks); ++ EBUG_ON(csum_idx >= stripe_csums_per_device(s)); ++ ++ return (void *) s + stripe_csum_offset(s, block, csum_idx); ++} ++ ++static inline struct bch_csum stripe_csum_get(struct bch_stripe *s, ++ unsigned block, unsigned csum_idx) ++{ ++ struct bch_csum 
csum = { 0 }; ++ ++ memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); ++ return csum; ++} ++ ++static inline void stripe_csum_set(struct bch_stripe *s, ++ unsigned block, unsigned csum_idx, ++ struct bch_csum csum) ++{ ++ memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); ++} ++ ++static inline bool __bch2_ptr_matches_stripe(const struct bch_stripe *s, ++ const struct bch_extent_ptr *ptr, ++ unsigned block) ++{ ++ unsigned nr_data = s->nr_blocks - s->nr_redundant; ++ ++ if (block >= nr_data) ++ return false; ++ ++ return ptr->dev == s->ptrs[block].dev && ++ ptr->gen == s->ptrs[block].gen && ++ ptr->offset >= s->ptrs[block].offset && ++ ptr->offset < s->ptrs[block].offset + le16_to_cpu(s->sectors); ++} ++ ++static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, ++ struct extent_ptr_decoded p) ++{ ++ BUG_ON(!p.has_ec); ++ ++ return __bch2_ptr_matches_stripe(s, &p.ptr, p.ec.block); + } + + struct bch_read_bio; +-- +cgit v1.2.3 + + +From 64bcbeab40803552a4d20b87df68cb3ccf137f29 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 11 Jan 2021 16:11:02 -0500 +Subject: bcachefs: Add btree node prefetching to bch2_btree_and_journal_walk() + +bch2_btree_and_journal_walk() walks the btree overlaying keys from the +journal; it was introduced so that we could read in the alloc btree +prior to journal replay being done, when journalling of updates to +interior btree nodes was introduced. + +But it didn't have btree node prefetching, which introduced a severe +regression with mount times, particularly on spinning rust. This patch +implements btree node prefetching for the btree + journal walk, +hopefully fixing that. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 8 ++++---- + fs/bcachefs/btree_cache.h | 2 +- + fs/bcachefs/btree_iter.c | 3 ++- + fs/bcachefs/recovery.c | 37 +++++++++++++++++++++++++++++++------ + 4 files changed, 38 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 3a692828b80d..bebf9fb01fe1 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -1007,20 +1007,20 @@ out: + } + + void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, +- const struct bkey_i *k, unsigned level) ++ const struct bkey_i *k, ++ enum btree_id btree_id, unsigned level) + { + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + +- BUG_ON(!btree_node_locked(iter, level + 1)); ++ BUG_ON(iter && !btree_node_locked(iter, level + 1)); + BUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_cache_find(bc, k); + if (b) + return; + +- bch2_btree_node_fill(c, iter, k, iter->btree_id, +- level, SIX_LOCK_read, false); ++ bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); + } + + void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index e766ef552ce7..0eeca0bcc48e 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -32,7 +32,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, + struct btree *, enum btree_node_sibling); + + void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, +- const struct bkey_i *, unsigned); ++ const struct bkey_i *, enum btree_id, unsigned); + + void bch2_fs_btree_cache_exit(struct bch_fs *); + int bch2_fs_btree_cache_init(struct bch_fs *); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 
4d825cac22ce..401dfd2c450a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1067,7 +1067,8 @@ static void btree_iter_prefetch(struct btree_iter *iter) + break; + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); +- bch2_btree_node_prefetch(c, iter, tmp.k, iter->level - 1); ++ bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, ++ iter->level - 1); + } + + if (!was_locked) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 5a43682c26ef..c700b12b2ac0 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -206,6 +206,31 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i + + /* Walk btree, overlaying keys from the journal: */ + ++static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, ++ struct btree_and_journal_iter iter) ++{ ++ unsigned i = 0, nr = b->c.level > 1 ? 2 : 16; ++ struct bkey_s_c k; ++ struct bkey_buf tmp; ++ ++ BUG_ON(!b->c.level); ++ ++ bch2_bkey_buf_init(&tmp); ++ ++ while (i < nr && ++ (k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ ++ bch2_btree_node_prefetch(c, NULL, tmp.k, ++ b->c.btree_id, b->c.level - 1); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ i++; ++ } ++ ++ bch2_bkey_buf_exit(&tmp, c); ++} ++ + static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, + struct journal_keys *journal_keys, + enum btree_id btree_id, +@@ -214,8 +239,11 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + { + struct btree_and_journal_iter iter; + struct bkey_s_c k; ++ struct bkey_buf tmp; ++ struct btree *child; + int ret = 0; + ++ bch2_bkey_buf_init(&tmp); + bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { +@@ -224,23 +252,19 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + break; + + if (b->c.level) { +- struct btree *child; +- struct bkey_buf tmp; +- +- bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, k); +- k = bkey_i_to_s_c(tmp.k); + + bch2_btree_and_journal_iter_advance(&iter); + + child = bch2_btree_node_get_noiter(c, tmp.k, + b->c.btree_id, b->c.level - 1); +- bch2_bkey_buf_exit(&tmp, c); + + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; + ++ btree_and_journal_iter_prefetch(c, b, iter); ++ + ret = (node_fn ? node_fn(c, b) : 0) ?: + bch2_btree_and_journal_walk_recurse(c, child, + journal_keys, btree_id, node_fn, key_fn); +@@ -253,6 +277,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + } + } + ++ bch2_bkey_buf_exit(&tmp, c); + return ret; + } + +-- +cgit v1.2.3 + + +From 223c28720c4e4df18fe6dded6f0547b06a7647f0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 14 Jan 2021 16:19:23 -0500 +Subject: bcachefs: Factor out bch2_ec_stripes_heap_start() + +This fixes a bug where mark and sweep gc incorrectly was clearing out +the stripes heap and causing assertions to fire later - simpler to just +create the stripes heap after gc has finished. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 8 -------- + fs/bcachefs/ec.c | 17 ++++++++++------- + fs/bcachefs/ec.h | 2 ++ + fs/bcachefs/recovery.c | 2 ++ + 4 files changed, 14 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 6b06f6079908..d0635a08d68f 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -607,8 +607,6 @@ static int bch2_gc_done(struct bch_fs *c, + struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); + struct stripe *dst, *src; + +- c->ec_stripes_heap.used = 0; +- + while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && + (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { + BUG_ON(src_iter.pos != dst_iter.pos); +@@ -625,12 +623,6 @@ static int bch2_gc_done(struct bch_fs *c, + copy_stripe_field(block_sectors[i], + "block_sectors[%u]", i); + +- if (dst->alive) { +- spin_lock(&c->ec_stripes_heap_lock); +- bch2_stripes_heap_insert(c, dst, dst_iter.pos); +- spin_unlock(&c->ec_stripes_heap_lock); +- } +- + genradix_iter_advance(&dst_iter, &c->stripes[0]); + genradix_iter_advance(&src_iter, &c->stripes[1]); + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index de266af446a6..8119eba38b47 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1450,6 +1450,16 @@ unlock: + mutex_unlock(&c->ec_stripe_head_lock); + } + ++void bch2_stripes_heap_start(struct bch_fs *c) ++{ ++ struct genradix_iter iter; ++ struct stripe *m; ++ ++ genradix_for_each(&c->stripes[0], iter, m) ++ if (m->alive) ++ bch2_stripes_heap_insert(c, m, iter.pos); ++} ++ + static int __bch2_stripe_write_key(struct btree_trans *trans, + struct btree_iter *iter, + struct stripe *m, +@@ -1529,18 +1539,11 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, + int ret = 0; + + if (k.k->type == KEY_TYPE_stripe) { +- struct stripe *m; +- + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: + bch2_mark_key(c, k, 0, 0, NULL, 0, + BTREE_TRIGGER_NOATOMIC); + if (ret) + return ret; +- +- spin_lock(&c->ec_stripes_heap_lock); +- m = genradix_ptr(&c->stripes[0], k.k->p.offset); +- bch2_stripes_heap_insert(c, m, k.k->p.offset); +- spin_unlock(&c->ec_stripes_heap_lock); + } + + return ret; +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index c3959af46833..f124582fdc5f 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -200,6 +200,8 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); + + void bch2_ec_flush_new_stripes(struct bch_fs *); + ++void bch2_stripes_heap_start(struct bch_fs *); ++ + struct journal_keys; + int bch2_stripes_read(struct bch_fs *, struct journal_keys *); + int bch2_stripes_write(struct bch_fs *, unsigned); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index c700b12b2ac0..8c67f1468945 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1125,6 +1125,8 @@ use_clean: + bch_verbose(c, "mark and sweep done"); + } + ++ bch2_stripes_heap_start(c); ++ + clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + +-- +cgit v1.2.3 + + +From c0b08f9afc280b1a4b1f210f7761b092b9cb8ac9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 14 Jan 2021 16:21:22 -0500 +Subject: bcachefs: Run jset_validate in write path as well + +This is because we had a bug where we were writing out journal entries +with garbage last_seq, and not catching it. 
+ +Also, completely ignore jset->last_seq when JSET_NO_FLUSH is true, +because of aforementioned bug, but change the write path to set last_seq +to 0 when JSET_NO_FLUSH is true. + +Minor other cleanups and comments. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/journal.c | 5 +++++ + fs/bcachefs/journal_io.c | 54 +++++++++++++++++++++++++++++++++--------------- + 3 files changed, 43 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 768fc85eaa4e..b94f0807d615 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1624,7 +1624,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + validate_before_checksum = true; + + /* validate_bset will be modifying: */ +- if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) ++ if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change) + validate_before_checksum = true; + + /* if we're going to be encrypting, check metadata validity first: */ +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 69e487bc29ff..04c94e579ad9 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -118,6 +118,9 @@ void __bch2_journal_buf_put(struct journal *j) + + /* + * Returns true if journal entry is now closed: ++ * ++ * We don't close a journal_buf until the next journal_buf is finished writing, ++ * and can be opened again - this also initializes the next journal_buf: + */ + static bool __journal_entry_close(struct journal *j) + { +@@ -155,6 +158,7 @@ static bool __journal_entry_close(struct journal *j) + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + ++ /* Close out old buffer: */ + buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + + sectors = vstruct_blocks_plus(buf->data, c->block_bits, +@@ -185,6 +189,7 @@ static bool __journal_entry_close(struct journal *j) + + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + ++ /* Initialize new buffer: */ + journal_pin_new_entry(j, 1); + + bch2_journal_buf_init(j); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 2a344a04de87..385cb4d519e3 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -470,7 +470,8 @@ static int jset_validate(struct bch_fs *c, + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, c, + "%s sector %llu seq %llu: unknown journal entry version %u", +- ca->name, sector, le64_to_cpu(jset->seq), ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), + version)) { + /* don't try to continue: */ + return EINVAL; +@@ -482,32 +483,42 @@ static int jset_validate(struct bch_fs *c, + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", +- ca->name, sector, le64_to_cpu(jset->seq), bytes)) { ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), bytes)) { + ret = JOURNAL_ENTRY_BAD; + le32_add_cpu(&jset->u64s, + -((bytes - (bucket_sectors_left << 9)) / 8)); + } + +- if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, ++ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", +- ca->name, sector, le64_to_cpu(jset->seq), ++ ca ? 
ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), + JSET_CSUM_TYPE(jset))) { + ret = JOURNAL_ENTRY_BAD; +- goto bad_csum_type; ++ goto csum_done; + } + ++ if (write) ++ goto csum_done; ++ + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, + "%s sector %llu seq %llu: journal checksum bad", +- ca->name, sector, le64_to_cpu(jset->seq))) ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq))) + ret = JOURNAL_ENTRY_BAD; + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); +-bad_csum_type: +- if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, +- "invalid journal entry: last_seq > seq")) { ++csum_done: ++ /* last_seq is ignored when JSET_NO_FLUSH is true */ ++ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && ++ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, ++ "invalid journal entry: last_seq > seq (%llu > %llu)", ++ le64_to_cpu(jset->last_seq), ++ le64_to_cpu(jset->seq))) { + jset->last_seq = jset->seq; + return JOURNAL_ENTRY_BAD; + } +@@ -515,6 +526,14 @@ fsck_err: + return ret; + } + ++static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) ++{ ++ unsigned sectors = vstruct_sectors(jset, c->block_bits); ++ ++ return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: ++ jset_validate_entries(c, jset, WRITE); ++} ++ + struct journal_read_buf { + void *data; + size_t size; +@@ -1082,9 +1101,7 @@ static void journal_write_done(struct closure *cl) + bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + struct bch_replicas_padded replicas; + union journal_res_state old, new; +- u64 seq = le64_to_cpu(w->data->seq); +- u64 last_seq = le64_to_cpu(w->data->last_seq); +- u64 v; ++ u64 v, seq, last_seq; + int err = 0; + + bch2_time_stats_update(j->write_time, j->write_start_time); +@@ -1102,6 +1119,9 @@ static void journal_write_done(struct closure *cl) + bch2_fatal_error(c); + + spin_lock(&j->lock); ++ seq = le64_to_cpu(w->data->seq); ++ last_seq = le64_to_cpu(w->data->last_seq); ++ + if (seq >= j->pin.front) + journal_seq_pin(j, seq)->devs = devs; + +@@ -1109,7 +1129,7 @@ static void journal_write_done(struct closure *cl) + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; + +- if (!w->noflush) { ++ if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = last_seq; + } +@@ -1197,7 +1217,7 @@ void bch2_journal_write(struct closure *cl) + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); +- jset->last_seq = cpu_to_le64(j->last_seq_ondisk); ++ jset->last_seq = 0; + + j->nr_noflush_writes++; + } else { +@@ -1249,11 +1269,11 @@ void bch2_journal_write(struct closure *cl) + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) + validate_before_checksum = true; + +- if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) ++ if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change) + validate_before_checksum = true; + + if (validate_before_checksum && +- jset_validate_entries(c, jset, WRITE)) ++ jset_validate_for_write(c, jset)) + goto err; + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), +@@ -1264,7 +1284,7 @@ void bch2_journal_write(struct closure *cl) + journal_nonce(jset), jset); + + if (!validate_before_checksum && +- jset_validate_entries(c, jset, WRITE)) ++ jset_validate_for_write(c, jset)) + goto err; + + sectors = 
vstruct_sectors(jset, c->block_bits); +-- +cgit v1.2.3 + + +From 6ac994d6d2080d3420d0b3a03678deb44835ddb0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 16 Jan 2021 15:40:33 -0500 +Subject: bcachefs: Correctly order flushes and journal writes on multi device + filesystems + +All writes prior to a journal write need to be flushed before the +journal write itself happens. On single device filesystems, it suffices +to mark the write with REQ_PREFLUSH|REQ_FUA, but on multi device +filesystems we need to issue flushes to every device - and wait for them +to complete - before issuing the journal writes. Previously, we were +issuing flushes to every device, but we weren't waiting for them to +complete before issuing the journal writes. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 3 -- + fs/bcachefs/journal.c | 1 + + fs/bcachefs/journal.h | 5 --- + fs/bcachefs/journal_io.c | 107 +++++++++++++++++++++++++++----------------- + fs/bcachefs/journal_types.h | 1 + + 5 files changed, 69 insertions(+), 48 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 5d884f7c137d..03ce492b4f81 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -499,9 +499,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + n->submit_time = local_clock(); + n->bio.bi_iter.bi_sector = ptr->offset; + +- if (!journal_flushes_device(ca)) +- n->bio.bi_opf |= REQ_FUA; +- + if (likely(n->have_ioref)) { + this_cpu_add(ca->io_done->sectors[WRITE][type], + bio_sectors(&n->bio)); +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 04c94e579ad9..b257c2900ac5 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -82,6 +82,7 @@ static void bch2_journal_buf_init(struct journal *j) + bkey_extent_init(&buf->key); + buf->noflush = false; + buf->must_flush = false; ++ buf->separate_flush = false; + + memset(buf->has_inode, 0, sizeof(buf->has_inode)); + +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 1db1f190a168..bda8cb97d321 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -494,11 +494,6 @@ static inline int bch2_journal_error(struct journal *j) + + struct bch_dev; + +-static inline bool journal_flushes_device(struct bch_dev *ca) +-{ +- return true; +-} +- + static inline void bch2_journal_set_replay_done(struct journal *j) + { + BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 385cb4d519e3..750f6fab2e63 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1189,6 +1189,53 @@ static void journal_write_endio(struct bio *bio) + percpu_ref_put(&ca->io_ref); + } + ++static void do_journal_write(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ struct bch_extent_ptr *ptr; ++ struct bio *bio; ++ unsigned sectors = vstruct_sectors(w->data, c->block_bits); ++ ++ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ /* XXX: fix this */ ++ bch_err(c, "missing device for journal write\n"); ++ continue; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], ++ sectors); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = ptr->offset; ++ bio->bi_end_io = journal_write_endio; 
++ bio->bi_private = ca; ++ bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; ++ ++ if (!JSET_NO_FLUSH(w->data)) ++ bio->bi_opf |= REQ_FUA; ++ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) ++ bio->bi_opf |= REQ_PREFLUSH; ++ ++ bch2_bio_map(bio, w->data, sectors << 9); ++ ++ trace_journal_write(bio); ++ closure_bio_submit(bio, cl); ++ ++ ca->journal.bucket_seq[ca->journal.cur_idx] = ++ le64_to_cpu(w->data->seq); ++ } ++ ++ continue_at(cl, journal_write_done, system_highpri_wq); ++ return; ++} ++ + void bch2_journal_write(struct closure *cl) + { + struct journal *j = container_of(cl, struct journal, io); +@@ -1198,9 +1245,8 @@ void bch2_journal_write(struct closure *cl) + struct jset_entry *start, *end; + struct jset *jset; + struct bio *bio; +- struct bch_extent_ptr *ptr; + bool validate_before_checksum = false; +- unsigned i, sectors, bytes, u64s; ++ unsigned i, sectors, bytes, u64s, nr_rw_members = 0; + int ret; + + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); +@@ -1330,49 +1376,30 @@ retry_alloc: + if (c->opts.nochanges) + goto no_io; + +- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { +- ca = bch_dev_bkey_exists(c, ptr->dev); +- if (!percpu_ref_tryget(&ca->io_ref)) { +- /* XXX: fix this */ +- bch_err(c, "missing device for journal write\n"); +- continue; +- } +- +- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], +- sectors); ++ for_each_rw_member(ca, c, i) ++ nr_rw_members++; + +- bio = ca->journal.bio; +- bio_reset(bio); +- bio_set_dev(bio, ca->disk_sb.bdev); +- bio->bi_iter.bi_sector = ptr->offset; +- bio->bi_end_io = journal_write_endio; +- bio->bi_private = ca; +- bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; +- if (!JSET_NO_FLUSH(jset)) +- bio->bi_opf |= REQ_PREFLUSH|REQ_FUA; +- bch2_bio_map(bio, jset, sectors << 9); ++ if (nr_rw_members > 1) ++ w->separate_flush = true; + +- trace_journal_write(bio); +- closure_bio_submit(bio, cl); ++ if (!JSET_NO_FLUSH(jset) && w->separate_flush) { ++ for_each_rw_member(ca, c, i) { ++ percpu_ref_get(&ca->io_ref); + +- ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_opf = REQ_OP_FLUSH; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ closure_bio_submit(bio, cl); ++ } + } + +- if (!JSET_NO_FLUSH(jset)) { +- for_each_rw_member(ca, c, i) +- if (journal_flushes_device(ca) && +- !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { +- percpu_ref_get(&ca->io_ref); +- +- bio = ca->journal.bio; +- bio_reset(bio); +- bio_set_dev(bio, ca->disk_sb.bdev); +- bio->bi_opf = REQ_OP_FLUSH; +- bio->bi_end_io = journal_write_endio; +- bio->bi_private = ca; +- closure_bio_submit(bio, cl); +- } +- } ++ bch2_bucket_seq_cleanup(c); ++ ++ continue_at(cl, do_journal_write, system_highpri_wq); ++ return; + no_io: + bch2_bucket_seq_cleanup(c); + +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 9953663e3a63..d17a1ff82a18 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -31,6 +31,7 @@ struct journal_buf { + unsigned u64s_reserved; + bool noflush; /* write has already been kicked off, and was noflush */ + bool must_flush; /* something wants a flush */ ++ bool separate_flush; + /* bloom filter: */ + unsigned long has_inode[1024 / sizeof(unsigned long)]; + }; +-- +cgit v1.2.3 + + +From 2e99f65b36c810cf9f2d56b8573c51cf64910127 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 17 Jan 2021 13:19:16 -0500 +Subject: bcachefs: Fix integer overflow in 
bch2_disk_reservation_get() + +The sectors argument shouldn't have been a u32 - it can be up to U32_MAX +(i.e. fallocate creating persistent reservations), and if replication is +enabled we'll overflow when we calculate the real number of sectors to +reserve. Oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 2 +- + fs/bcachefs/buckets.h | 7 +++---- + 2 files changed, 4 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index ed07dfee0ae3..00b2eadc8024 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -2115,7 +2115,7 @@ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) + #define SECTORS_CACHE 1024 + + int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, +- unsigned sectors, int flags) ++ u64 sectors, int flags) + { + struct bch_fs_pcpu *pcpu; + u64 old, v, get; +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 3a5ed1fcaf78..d5b685196858 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -284,8 +284,8 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c, + #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) + + int bch2_disk_reservation_add(struct bch_fs *, +- struct disk_reservation *, +- unsigned, int); ++ struct disk_reservation *, ++ u64, int); + + static inline struct disk_reservation + bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) +@@ -302,8 +302,7 @@ bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) + + static inline int bch2_disk_reservation_get(struct bch_fs *c, + struct disk_reservation *res, +- unsigned sectors, +- unsigned nr_replicas, ++ u64 sectors, unsigned nr_replicas, + int flags) + { + *res = bch2_disk_reservation_init(c, nr_replicas); +-- +cgit v1.2.3 + + +From fb08141a2e50cc79eb948e361ea0d2fdfca0fafb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 17 Jan 2021 15:18:11 -0500 +Subject: bcachefs: Fix double counting of stripe block counts by GC + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 00b2eadc8024..ddbc0041dc21 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1269,9 +1269,15 @@ static int bch2_mark_stripe(struct bch_fs *c, + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { +- m->block_sectors[i] = +- stripe_blockcount_get(new_s, i); +- m->blocks_nonempty += !!m->block_sectors[i]; ++ unsigned s = stripe_blockcount_get(new_s, i); ++ ++ /* ++ * gc recalculates this field from stripe ptr ++ * references: ++ */ ++ if (!gc) ++ m->block_sectors[i] = s; ++ m->blocks_nonempty += !!s; + } + + if (gc && old_s) +-- +cgit v1.2.3 + + +From ab6d3120e048f008074c47f2ece852931a96a855 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 17 Jan 2021 16:16:37 -0500 +Subject: bcachefs: Fix gc updating stripes info + +The primary stripes radix tree can be sparse, which was causing an +assertion to pop because the one use for gc isn't. Fix this by changing +the algorithm to copy between the two radix trees. 
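A hedged aside on the radix-tree API this relies on (the helper name below is made up; the calls are the same generic-radix-tree helpers this series already uses): genradix_ptr() returns NULL for a slot that was never allocated, while genradix_ptr_alloc() materialises the slot on demand. Walking the densely populated gc copy and allocating slots in the primary tree as needed is what lets the primary tree stay sparse, roughly:

	static int gc_stripes_copy_sketch(struct bch_fs *c)
	{
		struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
		struct stripe *src, *dst;

		while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) {
			/* materialise the slot in the (possibly sparse) primary tree: */
			dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL);
			if (!dst)
				return -ENOMEM;

			/* ...compare and copy fields here, as the bch2_gc_done() hunk below does... */

			genradix_iter_advance(&iter, &c->stripes[1]);
		}

		return 0;
	}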
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 13 +++++-------- + 1 file changed, 5 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index d0635a08d68f..1e06f77a455f 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -580,7 +580,7 @@ static int bch2_gc_done(struct bch_fs *c, + if (verify) \ + fsck_err(c, "stripe %zu has wrong "_msg \ + ": got %u, should be %u", \ +- dst_iter.pos, ##__VA_ARGS__, \ ++ iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f); \ + dst->_f = src->_f; \ + dst->dirty = true; \ +@@ -603,13 +603,11 @@ static int bch2_gc_done(struct bch_fs *c, + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + + if (!metadata_only) { +- struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); +- struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); ++ struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); + struct stripe *dst, *src; + +- while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && +- (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { +- BUG_ON(src_iter.pos != dst_iter.pos); ++ while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { ++ dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); + + copy_stripe_field(alive, "alive"); + copy_stripe_field(sectors, "sectors"); +@@ -623,8 +621,7 @@ static int bch2_gc_done(struct bch_fs *c, + copy_stripe_field(block_sectors[i], + "block_sectors[%u]", i); + +- genradix_iter_advance(&dst_iter, &c->stripes[0]); +- genradix_iter_advance(&src_iter, &c->stripes[1]); ++ genradix_iter_advance(&iter, &c->stripes[1]); + } + } + +-- +cgit v1.2.3 + + +From ae752f2daec8fc2122426c40975a825b217e951b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 17 Jan 2021 16:45:19 -0500 +Subject: bcachefs: Kill stripe->dirty + +This makes bch2_stripes_write() work more like bch2_alloc_write(). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 21 +++++++++++++-------- + fs/bcachefs/ec.c | 15 ++++++++------- + fs/bcachefs/ec_types.h | 3 +-- + 3 files changed, 22 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 1e06f77a455f..e8adae71f563 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -583,7 +583,6 @@ static int bch2_gc_done(struct bch_fs *c, + iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f); \ + dst->_f = src->_f; \ +- dst->dirty = true; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } + #define copy_bucket_field(_f) \ +@@ -609,18 +608,24 @@ static int bch2_gc_done(struct bch_fs *c, + while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { + dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); + +- copy_stripe_field(alive, "alive"); +- copy_stripe_field(sectors, "sectors"); +- copy_stripe_field(algorithm, "algorithm"); +- copy_stripe_field(nr_blocks, "nr_blocks"); +- copy_stripe_field(nr_redundant, "nr_redundant"); +- copy_stripe_field(blocks_nonempty, +- "blocks_nonempty"); ++ if (dst->alive != src->alive || ++ dst->sectors != src->sectors || ++ dst->algorithm != src->algorithm || ++ dst->nr_blocks != src->nr_blocks || ++ dst->nr_redundant != src->nr_redundant) { ++ bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused"); ++ ret = -EINVAL; ++ goto fsck_err; ++ } + + for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) + copy_stripe_field(block_sectors[i], + "block_sectors[%u]", i); + ++ dst->blocks_nonempty = 0; ++ for (i = 0; i < dst->nr_blocks; i++) ++ dst->blocks_nonempty += dst->block_sectors[i] != 0; ++ + genradix_iter_advance(&iter, &c->stripes[1]); + } + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 8119eba38b47..ad6fdaad413f 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1466,7 +1466,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, + size_t idx, + struct bkey_i_stripe *new_key) + { +- struct bch_fs *c = trans->c; ++ const struct bch_stripe *v; + struct bkey_s_c k; + unsigned i; + int ret; +@@ -1481,16 +1481,17 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, + if (k.k->type != KEY_TYPE_stripe) + return -EIO; + ++ v = bkey_s_c_to_stripe(k).v; ++ for (i = 0; i < v->nr_blocks; i++) ++ if (m->block_sectors[i] != stripe_blockcount_get(v, i)) ++ goto write; ++ return 0; ++write: + bkey_reassemble(&new_key->k_i, k); + +- spin_lock(&c->ec_stripes_heap_lock); +- + for (i = 0; i < new_key->v.nr_blocks; i++) + stripe_blockcount_set(&new_key->v, i, + m->block_sectors[i]); +- m->dirty = false; +- +- spin_unlock(&c->ec_stripes_heap_lock); + + bch2_trans_update(trans, iter, &new_key->k_i, 0); + return 0; +@@ -1514,7 +1515,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + genradix_for_each(&c->stripes[0], giter, m) { +- if (!m->dirty) ++ if (!m->alive) + continue; + + ret = __bch2_trans_do(&trans, NULL, NULL, +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +index 5b688b4394f7..847770166223 100644 +--- a/fs/bcachefs/ec_types.h ++++ b/fs/bcachefs/ec_types.h +@@ -18,8 +18,7 @@ struct stripe { + u8 nr_blocks; + u8 nr_redundant; + +- unsigned alive:1; +- unsigned dirty:1; ++ unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ + unsigned on_heap:1; + u8 blocks_nonempty; + u16 block_sectors[BCH_BKEY_PTRS_MAX]; +-- +cgit v1.2.3 + + +From 6b91d6f19c7b553c95a6094c90f4a04cd5c8a883 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 17 Jan 2021 17:43:49 -0500 +Subject: bcachefs: Preserve stripe blockcounts on existing stripes + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 59 +++++++++++++++++++++++++++++++++++++++++++++----------- + 1 file changed, 48 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index ad6fdaad413f..20050d08467b 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -684,8 +684,8 @@ static void ec_stripe_delete_work(struct work_struct *work) + /* stripe creation: */ + + static int ec_stripe_bkey_insert(struct bch_fs *c, +- struct ec_stripe_new *s, +- struct bkey_i_stripe *stripe) ++ struct bkey_i_stripe *stripe, ++ struct disk_reservation *res) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -726,7 +726,7 @@ found_slot: + + bch2_trans_update(&trans, iter, &stripe->k_i, 0); + +- ret = bch2_trans_commit(&trans, &s->res, NULL, ++ ret = bch2_trans_commit(&trans, res, NULL, + BTREE_INSERT_NOFAIL); + err: + bch2_trans_iter_put(&trans, iter); +@@ -740,6 +740,47 @@ err: + return ret; + } + ++static int ec_stripe_bkey_update(struct btree_trans *trans, ++ struct bkey_i_stripe *new) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ const struct bch_stripe *existing; ++ unsigned i; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_EC, ++ new->k.p, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || k.k->type != KEY_TYPE_stripe) { ++ bch_err(c, "error updating stripe: not found"); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ existing = bkey_s_c_to_stripe(k).v; ++ ++ if (existing->nr_blocks != new->v.nr_blocks) { ++ bch_err(c, "error updating stripe: nr_blocks does not match"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, ++ stripe_blockcount_get(existing, i)); ++ ++ bch2_trans_update(trans, iter, &new->k_i, 0); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ + static void extent_stripe_ptr_add(struct bkey_s_extent e, + struct ec_stripe_buf *s, + struct bch_extent_ptr *ptr, +@@ -884,9 +925,9 @@ static void ec_stripe_create(struct ec_stripe_new *s) + } + + ret = s->have_existing_stripe +- ? bch2_btree_insert(c, BTREE_ID_EC, &s->new_stripe.key.k_i, +- &s->res, NULL, BTREE_INSERT_NOFAIL) +- : ec_stripe_bkey_insert(c, s, &s->new_stripe.key); ++ ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, ++ ec_stripe_bkey_update(&trans, &s->new_stripe.key)) ++ : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_put_writes; +@@ -902,11 +943,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) + + spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); +-#if 0 +- pr_info("created a %s stripe %llu", +- s->have_existing_stripe ? 
"existing" : "new", +- s->stripe.key.k.p.offset); +-#endif ++ + BUG_ON(m->on_heap); + bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); + spin_unlock(&c->ec_stripes_heap_lock); +-- +cgit v1.2.3 + + +From 1a19b969fb70b921d79354a24f9faa8d5b21c25a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 18 Jan 2021 19:59:03 -0500 +Subject: bcachefs: Verify transaction updates are sorted + +A user reported a bug that implies they might not be correctly sorted, +this should help track that down. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 20 ++++++++++++++++---- + 1 file changed, 16 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index c490df4709ba..42969a846617 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -981,10 +981,22 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + .trigger_flags = flags, .iter = iter, .k = k + }; + +- EBUG_ON(bkey_cmp(iter->pos, +- (iter->flags & BTREE_ITER_IS_EXTENTS) +- ? bkey_start_pos(&k->k) +- : k->k.p)); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ BUG_ON(bkey_cmp(iter->pos, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? bkey_start_pos(&k->k) ++ : k->k.p)); ++ ++ trans_for_each_update(trans, i) { ++ BUG_ON(bkey_cmp(i->iter->pos, ++ (i->iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? bkey_start_pos(&i->k->k) ++ : i->k->k.p)); ++ ++ BUG_ON(i != trans->updates && ++ btree_iter_pos_cmp(i[-1].iter, i[0].iter) >= 0); ++ } ++#endif + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + +-- +cgit v1.2.3 + + +From 4f178ba1500d37084e0e6deb8d3067d82ec740f5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 18 Jan 2021 23:26:42 -0500 +Subject: bcachefs: Rework allocating buckets for stripes + +Allocating buckets for existing stripes was busted, in part because the +data structures were too contorted. This reworks new stripes so that we +have an array of open buckets that matches blocks in the stripe, and +it's sparse if we're reusing an existing stripe. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 21 ++++-- + fs/bcachefs/ec.c | 142 ++++++++++++++++++++++------------------- + fs/bcachefs/ec.h | 6 +- + 3 files changed, 92 insertions(+), 77 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index dcbe04040a39..b65c483e6c4f 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -489,16 +489,20 @@ bucket_alloc_from_stripe(struct bch_fs *c, + devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + + for (i = 0; i < devs_sorted.nr; i++) +- open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) ++ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { ++ if (!h->s->blocks[ec_idx]) ++ continue; ++ ++ ob = c->open_buckets + h->s->blocks[ec_idx]; + if (ob->ptr.dev == devs_sorted.devs[i] && +- !test_and_set_bit(h->s->data_block_idx[ec_idx], +- h->s->blocks_allocated)) ++ !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + goto got_bucket; ++ } + goto out_put_head; + got_bucket: + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + +- ob->ec_idx = h->s->data_block_idx[ec_idx]; ++ ob->ec_idx = ec_idx; + ob->ec = h->s; + + add_new_bucket(c, ptrs, devs_may_alloc, +@@ -636,10 +640,13 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, + + if (!drop && ob->ec) { + mutex_lock(&ob->ec->lock); +- open_bucket_for_each(c, &ob->ec->blocks, ob2, j) +- drop |= ob2->ptr.dev == ca->dev_idx; +- open_bucket_for_each(c, &ob->ec->parity, ob2, j) ++ for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) { ++ if (!ob->ec->blocks[j]) ++ continue; ++ ++ ob2 = c->open_buckets + ob->ec->blocks[j]; + drop |= ob2->ptr.dev == ca->dev_idx; ++ } + mutex_unlock(&ob->ec->lock); + } + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 20050d08467b..3b363e7f895b 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -907,9 +907,6 @@ static void ec_stripe_create(struct ec_stripe_new *s) + if (!percpu_ref_tryget(&c->writes)) + goto err; + +- BUG_ON(bitmap_weight(s->blocks_allocated, +- s->blocks.nr) != s->blocks.nr); +- + ec_generate_ec(&s->new_stripe); + + ec_generate_checksums(&s->new_stripe); +@@ -952,12 +949,17 @@ err_put_writes: + err: + bch2_disk_reservation_put(c, &s->res); + +- open_bucket_for_each(c, &s->blocks, ob, i) { +- ob->ec = NULL; +- __bch2_open_bucket_put(c, ob); +- } +- +- bch2_open_buckets_put(c, &s->parity); ++ for (i = 0; i < v->nr_blocks; i++) ++ if (s->blocks[i]) { ++ ob = c->open_buckets + s->blocks[i]; ++ ++ if (i < nr_data) { ++ ob->ec = NULL; ++ __bch2_open_bucket_put(c, ob); ++ } else { ++ bch2_open_bucket_put(c, ob); ++ } ++ } + + bch2_keylist_free(&s->keys, s->inline_keys); + +@@ -1216,7 +1218,7 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) + if (h->s && + h->s->allocated && + bitmap_weight(h->s->blocks_allocated, +- h->s->blocks.nr) == h->s->blocks.nr) ++ h->s->nr_data) == h->s->nr_data) + ec_stripe_set_pending(c, h); + + mutex_unlock(&h->lock); +@@ -1253,64 +1255,82 @@ static enum bucket_alloc_ret + new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + struct closure *cl) + { +- struct bch_devs_mask devs; ++ struct bch_devs_mask devs = h->devs; + struct open_bucket *ob; +- unsigned i, nr_have, nr_data = +- min_t(unsigned, h->nr_active_devs, +- BCH_BKEY_PTRS_MAX) - h->redundancy; ++ struct open_buckets buckets; ++ unsigned i, j, nr_have_parity = 0, nr_have_data = 0; + bool have_cache = true; + enum bucket_alloc_ret ret = ALLOC_SUCCESS; + +- devs = h->devs; +- +- 
for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) { +- __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); +- --nr_data; ++ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { ++ if (test_bit(i, h->s->blocks_gotten)) { ++ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); ++ if (i < h->s->nr_data) ++ nr_have_data++; ++ else ++ nr_have_parity++; ++ } + } + +- BUG_ON(h->s->blocks.nr > nr_data); +- BUG_ON(h->s->parity.nr > h->redundancy); +- +- open_bucket_for_each(c, &h->s->parity, ob, i) +- __clear_bit(ob->ptr.dev, devs.d); +- open_bucket_for_each(c, &h->s->blocks, ob, i) +- __clear_bit(ob->ptr.dev, devs.d); ++ BUG_ON(nr_have_data > h->s->nr_data); ++ BUG_ON(nr_have_parity > h->s->nr_parity); + + percpu_down_read(&c->mark_lock); + rcu_read_lock(); + +- if (h->s->parity.nr < h->redundancy) { +- nr_have = h->s->parity.nr; +- +- ret = bch2_bucket_alloc_set(c, &h->s->parity, ++ buckets.nr = 0; ++ if (nr_have_parity < h->s->nr_parity) { ++ ret = bch2_bucket_alloc_set(c, &buckets, + &h->parity_stripe, + &devs, +- h->redundancy, +- &nr_have, ++ h->s->nr_parity, ++ &nr_have_parity, + &have_cache, + h->copygc + ? RESERVE_MOVINGGC + : RESERVE_NONE, + 0, + cl); ++ ++ open_bucket_for_each(c, &buckets, ob, i) { ++ j = find_next_zero_bit(h->s->blocks_gotten, ++ h->s->nr_data + h->s->nr_parity, ++ h->s->nr_data); ++ BUG_ON(j >= h->s->nr_data + h->s->nr_parity); ++ ++ h->s->blocks[j] = buckets.v[i]; ++ h->s->new_stripe.key.v.ptrs[j] = ob->ptr; ++ __set_bit(j, h->s->blocks_gotten); ++ } ++ + if (ret) + goto err; + } + +- if (h->s->blocks.nr < nr_data) { +- nr_have = h->s->blocks.nr; +- +- ret = bch2_bucket_alloc_set(c, &h->s->blocks, ++ buckets.nr = 0; ++ if (nr_have_data < h->s->nr_data) { ++ ret = bch2_bucket_alloc_set(c, &buckets, + &h->block_stripe, + &devs, +- nr_data, +- &nr_have, ++ h->s->nr_data, ++ &nr_have_data, + &have_cache, + h->copygc + ? 
RESERVE_MOVINGGC + : RESERVE_NONE, + 0, + cl); ++ ++ open_bucket_for_each(c, &buckets, ob, i) { ++ j = find_next_zero_bit(h->s->blocks_gotten, ++ h->s->nr_data, 0); ++ BUG_ON(j >= h->s->nr_data); ++ ++ h->s->blocks[j] = buckets.v[i]; ++ h->s->new_stripe.key.v.ptrs[j] = ob->ptr; ++ __set_bit(j, h->s->blocks_gotten); ++ } ++ + if (ret) + goto err; + } +@@ -1362,8 +1382,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + struct closure *cl) + { + struct ec_stripe_head *h; +- struct open_bucket *ob; +- unsigned i, data_idx = 0; ++ unsigned i; + s64 idx; + int ret; + +@@ -1398,9 +1417,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + BUG(); + } + ++ BUG_ON(h->s->existing_stripe.size != h->blocksize); ++ BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); ++ + for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { +- if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) ++ if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { ++ __set_bit(i, h->s->blocks_gotten); + __set_bit(i, h->s->blocks_allocated); ++ } + + ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + } +@@ -1438,20 +1462,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + goto out; + } + +- open_bucket_for_each(c, &h->s->blocks, ob, i) { +- data_idx = find_next_zero_bit(h->s->blocks_allocated, +- h->s->nr_data, data_idx); +- BUG_ON(data_idx >= h->s->nr_data); +- +- h->s->new_stripe.key.v.ptrs[data_idx] = ob->ptr; +- h->s->data_block_idx[i] = data_idx; +- data_idx++; +- } +- +- open_bucket_for_each(c, &h->s->parity, ob, i) +- h->s->new_stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; +- +- //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); + h->s->allocated = true; + } + out: +@@ -1471,12 +1481,14 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) + if (!h->s) + goto unlock; + +- open_bucket_for_each(c, &h->s->blocks, ob, i) +- if (ob->ptr.dev == ca->dev_idx) +- goto found; +- open_bucket_for_each(c, &h->s->parity, ob, i) ++ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { ++ if (!h->s->blocks[i]) ++ continue; ++ ++ ob = c->open_buckets + h->s->blocks[i]; + if (ob->ptr.dev == ca->dev_idx) + goto found; ++ } + goto unlock; + found: + h->s->err = -EROFS; +@@ -1662,19 +1674,17 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) + h->target, h->algo, h->redundancy); + + if (h->s) +- pr_buf(out, "\tpending: blocks %u allocated %u\n", +- h->s->blocks.nr, ++ pr_buf(out, "\tpending: blocks %u+%u allocated %u\n", ++ h->s->nr_data, h->s->nr_parity, + bitmap_weight(h->s->blocks_allocated, +- h->s->blocks.nr)); ++ h->s->nr_data)); + } + mutex_unlock(&c->ec_stripe_head_lock); + + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry(s, &c->ec_stripe_new_list, list) { +- pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", +- s->blocks.nr, +- bitmap_weight(s->blocks_allocated, +- s->blocks.nr), ++ pr_buf(out, "\tin flight: blocks %u+%u pin %u\n", ++ s->nr_data, s->nr_parity, + atomic_read(&s->pin)); + } + mutex_unlock(&c->ec_stripe_new_lock); +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index f124582fdc5f..765baa9d9264 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -143,11 +143,9 @@ struct ec_stripe_new { + bool pending; + bool have_existing_stripe; + ++ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; +- +- struct open_buckets blocks; +- u8 
data_block_idx[BCH_BKEY_PTRS_MAX]; +- struct open_buckets parity; ++ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; + struct disk_reservation res; + + struct keylist keys; +-- +cgit v1.2.3 + + +From 74c89cb1868778a3ecc7ae4e5362e873a80d08ef Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 18 Jan 2021 20:20:24 -0500 +Subject: bcachefs: Don't allocate stripes at POS_MIN + +In the future, stripe index 0 will be a sentinal value. This patch +doesn't disallow stripes at POS_MIN yet, leaving that for when we do the +on disk format changes. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.h | 5 +++++ + fs/bcachefs/ec.c | 5 +++-- + 2 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 2d2c640305e2..2c3b73a6fea3 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -170,6 +170,11 @@ static inline struct bpos bpos_min(struct bpos l, struct bpos r) + return bkey_cmp(l, r) < 0 ? l : r; + } + ++static inline struct bpos bpos_max(struct bpos l, struct bpos r) ++{ ++ return bkey_cmp(l, r) > 0 ? l : r; ++} ++ + void bch2_bpos_swab(struct bpos *); + void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 3b363e7f895b..7a3b9cd3c8ba 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -690,7 +690,8 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- struct bpos start_pos = POS(0, c->ec_stripe_hint); ++ struct bpos min_pos = POS(0, 1); ++ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + + bch2_trans_init(&trans, c, 0, 0); +@@ -701,7 +702,7 @@ retry: + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (start_pos.offset) { +- start_pos = POS_MIN; ++ start_pos = min_pos; + bch2_btree_iter_set_pos(iter, start_pos); + continue; + } +-- +cgit v1.2.3 + + +From 0bbd5e24417b3130f2162f95de2ec3cb4b08a63d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 20 Jan 2021 17:31:31 -0500 +Subject: bcachefs: Fix an assertion pop + +There was a race: btree node writes drop their reference on journal pins +before clearing the btree_node_write_in_flight flag. 
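+
+Spelled out, the window looks like this (illustrative ordering only - the
+helper names sketch the btree write completion path, they are not copied
+verbatim from it):
+
+    /* btree node write completion, simplified: */
+    bch2_journal_pin_drop(&c->journal, &w->journal); /* journal can now quiesce */
+    /* ...window: shutdown sees the pin gone but the flag still set... */
+    clear_btree_node_write_in_flight(b);
+
+So asserting "no writes in flight" right after quiescing the journal can pop
+even though nothing is wrong; shutdown now simply flushes outstanding btree
+writes unconditionally instead of asserting on the flags.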
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 17 ----------------- + fs/bcachefs/btree_io.h | 1 - + fs/bcachefs/super.c | 5 +---- + 3 files changed, 1 insertion(+), 22 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index b94f0807d615..65f7e36677b7 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1828,23 +1828,6 @@ void bch2_btree_flush_all_writes(struct bch_fs *c) + __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); + } + +-void bch2_btree_verify_flushed(struct bch_fs *c) +-{ +- struct bucket_table *tbl; +- struct rhash_head *pos; +- struct btree *b; +- unsigned i; +- +- rcu_read_lock(); +- for_each_cached_btree(b, c, tbl, i, pos) { +- unsigned long flags = READ_ONCE(b->flags); +- +- BUG_ON((flags & (1 << BTREE_NODE_dirty)) || +- (flags & (1 << BTREE_NODE_write_in_flight))); +- } +- rcu_read_unlock(); +-} +- + void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) + { + struct bucket_table *tbl; +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 1a4b11e99cc4..3b61555ef906 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -185,7 +185,6 @@ do { \ + + void bch2_btree_flush_all_reads(struct bch_fs *); + void bch2_btree_flush_all_writes(struct bch_fs *); +-void bch2_btree_verify_flushed(struct bch_fs *); + void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); + + static inline void compat_bformat(unsigned level, enum btree_id btree_id, +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 8442605537b1..84c589959220 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -235,10 +235,7 @@ nowrote_alloc: + * the journal kicks off btree writes via reclaim - wait for in flight + * writes after stopping journal: + */ +- if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) +- bch2_btree_flush_all_writes(c); +- else +- bch2_btree_verify_flushed(c); ++ bch2_btree_flush_all_writes(c); + + /* + * After stopping journal: +-- +cgit v1.2.3 + + +From 70d378b9a50e989053c8284d78701fca7ec5a369 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 20 Jan 2021 19:42:09 -0500 +Subject: bcachefs: Clean up bch2_extent_can_insert + +It was using an internal btree node iterator interface, when +bch2_btree_iter_peek_slot() sufficed. We were hitting a null ptr deref +that looked like it was from the iterator not being uptodate - this will +also fix that. 
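+
+The replacement is the ordinary slot-peek idiom, which reports problems through
+the key instead of assuming the node iterator is already uptodate (condensed
+from the hunk below):
+
+    k = bch2_btree_iter_peek_slot(iter);
+    ret = bkey_err(k);
+    if (ret)
+        return ret;
+    /* k now covers iter->pos and can be checked for a compressed extent */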
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extent_update.c | 15 +++++---------- + 1 file changed, 5 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 1faca4bc1825..5c43678e94a3 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -192,18 +192,13 @@ bch2_extent_can_insert(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) + { +- struct btree_iter_level *l = &iter->l[0]; +- struct btree_node_iter node_iter = l->iter; +- struct bkey_packed *_k; + struct bkey_s_c k; +- struct bkey unpacked; +- int sectors; ++ int ret, sectors; + +- _k = bch2_btree_node_iter_peek(&node_iter, l->b); +- if (!_k) +- return BTREE_INSERT_OK; +- +- k = bkey_disassemble(l->b, _k, &unpacked); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; + + /* Check if we're splitting a compressed extent: */ + +-- +cgit v1.2.3 + + +From 9393fa7a813285464e13c8c53bef894c888d6dca Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Jan 2021 14:42:23 -0500 +Subject: bcachefs: Fix loopback in dio mode + +We had a deadlock on page_lock, because buffered reads signal completion +by unlocking the page, but the dio read path normally dirties the pages +it's reading to with set_page_dirty_lock. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 30 ++++++++++++++++++++++++++---- + 1 file changed, 26 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index dbb20d0f9092..718bf60f1857 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -93,6 +93,7 @@ struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; ++ bool should_dirty; + struct bch_read_bio rbio; + }; + +@@ -1625,12 +1626,22 @@ again: + + /* O_DIRECT reads */ + ++static void bio_check_or_release(struct bio *bio, bool check_dirty) ++{ ++ if (check_dirty) { ++ bio_check_pages_dirty(bio); ++ } else { ++ bio_release_pages(bio, false); ++ bio_put(bio); ++ } ++} ++ + static void bch2_dio_read_complete(struct closure *cl) + { + struct dio_read *dio = container_of(cl, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret); +- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ ++ bio_check_or_release(&dio->rbio.bio, dio->should_dirty); + } + + static void bch2_direct_IO_read_endio(struct bio *bio) +@@ -1645,8 +1656,11 @@ static void bch2_direct_IO_read_endio(struct bio *bio) + + static void bch2_direct_IO_read_split_endio(struct bio *bio) + { ++ struct dio_read *dio = bio->bi_private; ++ bool should_dirty = dio->should_dirty; ++ + bch2_direct_IO_read_endio(bio); +- bio_check_pages_dirty(bio); /* transfers ownership */ ++ bio_check_or_release(bio, should_dirty); + } + + static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) +@@ -1700,6 +1714,12 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + + dio->req = req; + dio->ret = ret; ++ /* ++ * This is one of the sketchier things I've encountered: we have to skip ++ * the dirtying of requests that are internal from the kernel (i.e. from ++ * loopback), because we'll deadlock on page_lock. 
++ */ ++ dio->should_dirty = iter_is_iovec(iter); + + goto start; + while (iter->count) { +@@ -1721,7 +1741,9 @@ start: + } + + offset += bio->bi_iter.bi_size; +- bio_set_pages_dirty(bio); ++ ++ if (dio->should_dirty) ++ bio_set_pages_dirty(bio); + + if (iter->count) + closure_get(&dio->cl); +@@ -1735,7 +1757,7 @@ start: + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; +- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ ++ bio_check_or_release(&dio->rbio.bio, dio->should_dirty); + return ret; + } else { + return -EIOCBQUEUED; +-- +cgit v1.2.3 + + +From ff1aff5a2d4d5c2d093b32387efd128de2c8a2f0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Jan 2021 19:14:37 -0500 +Subject: bcachefs: Switch replicas.c allocations to GFP_KERNEL + +We're transitioning to memalloc_nofs_save/restore instead of GFP flags +with the rest of the kernel, and GFP_NOIO was excessively strict and +causing unnnecessary allocation failures - these allocations are done +with btree locks dropped. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/replicas.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index b1d8db677c1c..ce8b7355b349 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -159,7 +159,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, + BUG_ON(!new_entry->data_type); + verify_replicas_entry(new_entry); + +- new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); ++ new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); + if (!new.entries) + return new; + +@@ -282,13 +282,13 @@ static int replicas_table_update(struct bch_fs *c, + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (!(new_usage[i] = __alloc_percpu_gfp(bytes, +- sizeof(u64), GFP_NOIO))) ++ sizeof(u64), GFP_KERNEL))) + goto err; + +- if (!(new_base = kzalloc(bytes, GFP_NOIO)) || +- !(new_scratch = kmalloc(bytes, GFP_NOIO)) || ++ if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || ++ !(new_scratch = kmalloc(bytes, GFP_KERNEL)) || + (c->usage_gc && +- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) ++ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) + goto err; + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) +@@ -548,7 +548,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) + + c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, + c->replicas_gc.entry_size, +- GFP_NOIO); ++ GFP_KERNEL); + if (!c->replicas_gc.entries) { + mutex_unlock(&c->sb_lock); + bch_err(c, "error allocating c->replicas_gc"); +@@ -671,7 +671,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, + nr++; + } + +- cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); + if (!cpu_r->entries) + return -ENOMEM; + +@@ -703,7 +703,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, + entry_size += sizeof(struct bch_replicas_entry) - + sizeof(struct bch_replicas_entry_v0); + +- cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); + if (!cpu_r->entries) + return -ENOMEM; + +-- +cgit v1.2.3 + + +From 7c8fea6755b8b49038ce66b6b990f02288588781 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Jan 2021 19:15:49 -0500 +Subject: bcachefs: Fix a faulty assertion + +If journal replay hasn't finished, the journal can't be empty - oops. 
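+
+Put differently, "the journal is closed and the last sequence is empty" is only
+an invariant once replay has finished, so the check is now gated on
+JOURNAL_REPLAY_DONE. The combined assertion ends up as (same content as the
+hunk below, shown merged for readability):
+
+    BUG_ON(!bch2_journal_error(j) &&
+           test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
+           (journal_entry_is_open(j) ||
+            j->last_empty_seq + 1 != journal_cur_seq(j)));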
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index b257c2900ac5..be019c4729e5 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -954,6 +954,7 @@ void bch2_fs_journal_stop(struct journal *j) + journal_quiesce(j); + + BUG_ON(!bch2_journal_error(j) && ++ test_bit(JOURNAL_REPLAY_DONE, &j->flags) && + (journal_entry_is_open(j) || + j->last_empty_seq + 1 != journal_cur_seq(j))); + +-- +cgit v1.2.3 + + +From 80106fbc2984bea1afe03770e24fd563f020d2ff Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Jan 2021 19:30:35 -0500 +Subject: bcachefs: Ensure __bch2_trans_commit() always calls + bch2_trans_reset() + +This was leading to a very strange bug in bch2_bucket_io_time_reset(), +where we'd retry without clearing out the list of updates. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 42969a846617..967e1e4d9620 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -836,7 +836,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + int ret = 0; + + if (!trans->nr_updates) +- goto out_noupdates; ++ goto out_reset; + + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&trans->c->gc_lock); +@@ -850,7 +850,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + unlikely(!percpu_ref_tryget(&trans->c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); + if (ret) +- return ret; ++ goto out_reset; + } + + #ifdef CONFIG_BCACHEFS_DEBUG +@@ -962,7 +962,7 @@ out: + + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); +-out_noupdates: ++out_reset: + bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); + + return ret; +-- +cgit v1.2.3 + + +From cb26d3777f9cd7769063be061e6361e38346adfa Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Jan 2021 21:51:42 -0500 +Subject: bcachefs: Kill metadata only gc + +This was useful before we had transactional updates to interior btree +nodes - but now, it's just extra unneeded complexity. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 68 ++++++++++++++++++-------------------------------- + fs/bcachefs/btree_gc.h | 2 +- + fs/bcachefs/recovery.c | 18 ++----------- + 3 files changed, 27 insertions(+), 61 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index e8adae71f563..efeaec3d9c03 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -205,13 +205,12 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + } + + static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, +- bool initial, bool metadata_only) ++ bool initial) + { + struct btree_trans trans; + struct btree_iter *iter; + struct btree *b; +- unsigned depth = metadata_only ? 1 +- : bch2_expensive_debug_checks ? 0 ++ unsigned depth = bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; + u8 max_stale = 0; +@@ -326,13 +325,11 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + + static int bch2_gc_btree_init(struct bch_fs *c, + struct journal_keys *journal_keys, +- enum btree_id btree_id, +- bool metadata_only) ++ enum btree_id btree_id) + { + struct btree *b; +- unsigned target_depth = metadata_only ? 
1 +- : bch2_expensive_debug_checks ? 0 +- : !btree_node_type_needs_gc(btree_id) ? 1 ++ unsigned target_depth = bch2_expensive_debug_checks ? 0 ++ : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; + u8 max_stale = 0; + int ret = 0; +@@ -377,7 +374,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) + } + + static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, +- bool initial, bool metadata_only) ++ bool initial) + { + enum btree_id ids[BTREE_ID_NR]; + unsigned i; +@@ -390,8 +387,8 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, + enum btree_id id = ids[i]; + int ret = initial + ? bch2_gc_btree_init(c, journal_keys, +- id, metadata_only) +- : bch2_gc_btree(c, id, initial, metadata_only); ++ id) ++ : bch2_gc_btree(c, id, initial); + if (ret) + return ret; + } +@@ -558,12 +555,11 @@ static void bch2_gc_free(struct bch_fs *c) + } + + static int bch2_gc_done(struct bch_fs *c, +- bool initial, bool metadata_only) ++ bool initial) + { + struct bch_dev *ca; +- bool verify = !metadata_only && +- (!initial || +- (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); ++ bool verify = (!initial || ++ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); + unsigned i; + int ret = 0; + +@@ -601,7 +597,7 @@ static int bch2_gc_done(struct bch_fs *c, + #define copy_fs_field(_f, _msg, ...) \ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + +- if (!metadata_only) { ++ { + struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); + struct stripe *dst, *src; + +@@ -660,28 +656,20 @@ static int bch2_gc_done(struct bch_fs *c, + + copy_fs_field(hidden, "hidden"); + copy_fs_field(btree, "btree"); ++ copy_fs_field(data, "data"); ++ copy_fs_field(cached, "cached"); ++ copy_fs_field(reserved, "reserved"); ++ copy_fs_field(nr_inodes,"nr_inodes"); + +- if (!metadata_only) { +- copy_fs_field(data, "data"); +- copy_fs_field(cached, "cached"); +- copy_fs_field(reserved, "reserved"); +- copy_fs_field(nr_inodes,"nr_inodes"); +- +- for (i = 0; i < BCH_REPLICAS_MAX; i++) +- copy_fs_field(persistent_reserved[i], +- "persistent_reserved[%i]", i); +- } ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ copy_fs_field(persistent_reserved[i], ++ "persistent_reserved[%i]", i); + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + char buf[80]; + +- if (metadata_only && +- (e->data_type == BCH_DATA_user || +- e->data_type == BCH_DATA_cached)) +- continue; +- + bch2_replicas_entry_to_text(&PBUF(buf), e); + + copy_fs_field(replicas[i], "%s", buf); +@@ -697,8 +685,7 @@ fsck_err: + return ret; + } + +-static int bch2_gc_start(struct bch_fs *c, +- bool metadata_only) ++static int bch2_gc_start(struct bch_fs *c) + { + struct bch_dev *ca; + unsigned i; +@@ -762,13 +749,6 @@ static int bch2_gc_start(struct bch_fs *c, + + d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; + d->gen_valid = s->gen_valid; +- +- if (metadata_only && +- (s->mark.data_type == BCH_DATA_user || +- s->mark.data_type == BCH_DATA_cached)) { +- d->_mark = s->mark; +- d->_mark.owned_by_allocator = 0; +- } + } + }; + +@@ -796,7 +776,7 @@ static int bch2_gc_start(struct bch_fs *c, + * uses, GC could skip past them + */ + int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, +- bool initial, bool metadata_only) ++ bool initial) + { + struct bch_dev *ca; + u64 start_time = local_clock(); +@@ -812,13 +792,13 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, + 
closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + again: +- ret = bch2_gc_start(c, metadata_only); ++ ret = bch2_gc_start(c); + if (ret) + goto out; + + bch2_mark_superblocks(c); + +- ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); ++ ret = bch2_gc_btrees(c, journal_keys, initial); + if (ret) + goto out; + +@@ -857,7 +837,7 @@ out: + bch2_journal_block(&c->journal); + + percpu_down_write(&c->mark_lock); +- ret = bch2_gc_done(c, initial, metadata_only); ++ ret = bch2_gc_done(c, initial); + + bch2_journal_unblock(&c->journal); + } else { +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index 3694a3df62a8..f0435a58793b 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -7,7 +7,7 @@ + void bch2_coalesce(struct bch_fs *); + + struct journal_keys; +-int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); ++int bch2_gc(struct bch_fs *, struct journal_keys *, bool); + int bch2_gc_gens(struct bch_fs *); + void bch2_gc_thread_stop(struct bch_fs *); + int bch2_gc_thread_start(struct bch_fs *); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 8c67f1468945..422f2fbe6dfb 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1099,27 +1099,13 @@ use_clean: + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + +- if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && +- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { +- /* +- * interior btree node updates aren't consistent with the +- * journal; after an unclean shutdown we have to walk all +- * pointers to metadata: +- */ +- bch_info(c, "starting metadata mark and sweep"); +- err = "error in mark and sweep"; +- ret = bch2_gc(c, &c->journal_keys, true, true); +- if (ret) +- goto err; +- bch_verbose(c, "mark and sweep done"); +- } +- + if (c->opts.fsck || + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA)) || + test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + bch_info(c, "starting mark and sweep"); + err = "error in mark and sweep"; +- ret = bch2_gc(c, &c->journal_keys, true, false); ++ ret = bch2_gc(c, &c->journal_keys, true); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); +-- +cgit v1.2.3 + + +From 8260b434bc31386dd1021f4f479d63568bbb090c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Jan 2021 20:51:51 -0500 +Subject: bcachefs: Refactor dev usage + +This is to make it more amenable for serialization. 
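+
+Concretely, the per-type bucket counts, per-type sector counts and the single
+fragmented total become one array of per-data-type structs, so anything that
+wants to serialize device usage only has to walk d[]. A rough sketch of the
+shape this enables (usage_emit() and emit_u64() are hypothetical placeholders,
+not bcachefs interfaces):
+
+    static void usage_emit(struct printbuf *out, struct bch_dev_usage *u)
+    {
+        unsigned i;
+
+        for (i = 0; i < BCH_DATA_NR; i++) {
+            emit_u64(out, u->d[i].buckets);
+            emit_u64(out, u->d[i].sectors);    /* _compressed_ sectors */
+            emit_u64(out, u->d[i].fragmented);
+        }
+    }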
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 42 +++++++++---------- + fs/bcachefs/alloc_foreground.c | 19 +++++---- + fs/bcachefs/buckets.c | 39 +++++++----------- + fs/bcachefs/buckets.h | 11 +---- + fs/bcachefs/buckets_types.h | 13 +++--- + fs/bcachefs/chardev.c | 6 +-- + fs/bcachefs/movinggc.c | 2 +- + fs/bcachefs/super.c | 2 +- + fs/bcachefs/sysfs.c | 91 +++++++++++++++++------------------------- + 9 files changed, 94 insertions(+), 131 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 922b24aaf367..1a670f6bd298 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -54,10 +54,10 @@ static void pd_controllers_update(struct work_struct *work) + * reclaimed by copy GC + */ + fragmented += max_t(s64, 0, (bucket_to_sector(ca, +- stats.buckets[BCH_DATA_user] + +- stats.buckets[BCH_DATA_cached]) - +- (stats.sectors[BCH_DATA_user] + +- stats.sectors[BCH_DATA_cached])) << 9); ++ stats.d[BCH_DATA_user].buckets + ++ stats.d[BCH_DATA_cached].buckets) - ++ (stats.d[BCH_DATA_user].sectors + ++ stats.d[BCH_DATA_cached].sectors)) << 9); + } + + bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); +@@ -217,7 +217,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, + return 0; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); +- g = __bucket(ca, k.k->p.offset, 0); ++ g = bucket(ca, k.k->p.offset); + u = bch2_alloc_unpack(k); + + g->_mark.gen = u.gen; +@@ -278,7 +278,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bch_dev *ca; +- struct bucket_array *ba; + struct bucket *g; + struct bucket_mark m; + struct bkey_alloc_unpacked old_u, new_u; +@@ -302,9 +301,7 @@ retry: + + percpu_down_read(&c->mark_lock); + ca = bch_dev_bkey_exists(c, iter->pos.inode); +- ba = bucket_array(ca); +- +- g = &ba->b[iter->pos.offset]; ++ g = bucket(ca, iter->pos.offset); + m = READ_ONCE(g->mark); + new_u = alloc_mem_to_key(g, m); + percpu_up_read(&c->mark_lock); +@@ -330,16 +327,10 @@ int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) + { + struct btree_trans trans; + struct btree_iter *iter; +- u64 first_bucket, nbuckets; ++ u64 first_bucket = ca->mi.first_bucket; ++ u64 nbuckets = ca->mi.nbuckets; + int ret = 0; + +- percpu_down_read(&c->mark_lock); +- first_bucket = bucket_array(ca)->first_bucket; +- nbuckets = bucket_array(ca)->nbuckets; +- percpu_up_read(&c->mark_lock); +- +- BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); +- + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, +@@ -552,7 +543,8 @@ out: + static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + { + unsigned long gc_count = c->gc_count; +- u64 available; ++ s64 available; ++ unsigned i; + int ret = 0; + + ca->allocator_state = ALLOCATOR_BLOCKED; +@@ -568,8 +560,15 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + if (gc_count != c->gc_count) + ca->inc_gen_really_needs_gc = 0; + +- available = max_t(s64, 0, dev_buckets_available(ca) - +- ca->inc_gen_really_needs_gc); ++ available = dev_buckets_available(ca); ++ available -= ca->inc_gen_really_needs_gc; ++ ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) ++ available -= fifo_used(&ca->free[i]); ++ spin_unlock(&c->freelist_lock); ++ ++ available = max(available, 0LL); + + if (available > fifo_free(&ca->free_inc) || + (available && +@@ -598,6 +597,9 @@ static bool 
bch2_can_invalidate_bucket(struct bch_dev *ca, + if (!is_available_bucket(mark)) + return false; + ++ if (mark.owned_by_allocator) ++ return false; ++ + if (ca->buckets_nouse && + test_bit(bucket, ca->buckets_nouse)) + return false; +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index b65c483e6c4f..8f0b94f591be 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -192,8 +192,9 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) + rcu_read_lock(); + buckets = bucket_array(ca); + +- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) +- if (is_available_bucket(buckets->b[b].mark)) ++ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) ++ if (is_available_bucket(buckets->b[b].mark) && ++ !buckets->b[b].mark.owned_by_allocator) + goto success; + b = -1; + success: +@@ -224,9 +225,8 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + bool may_alloc_partial, + struct closure *cl) + { +- struct bucket_array *buckets; + struct open_bucket *ob; +- long bucket = 0; ++ long b = 0; + + spin_lock(&c->freelist_lock); + +@@ -260,13 +260,13 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + return ERR_PTR(-OPEN_BUCKETS_EMPTY); + } + +- if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) ++ if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) + goto out; + + switch (reserve) { + case RESERVE_BTREE_MOVINGGC: + case RESERVE_MOVINGGC: +- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) ++ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) + goto out; + break; + default: +@@ -284,20 +284,19 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + trace_bucket_alloc_fail(ca, reserve); + return ERR_PTR(-FREELIST_EMPTY); + out: +- verify_not_on_freelist(c, ca, bucket); ++ verify_not_on_freelist(c, ca, b); + + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); +- buckets = bucket_array(ca); + + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->alloc_reserve = reserve; + ob->ptr = (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, +- .gen = buckets->b[bucket].mark.gen, +- .offset = bucket_to_sector(ca, bucket), ++ .gen = bucket(ca, b)->mark.gen, ++ .offset = bucket_to_sector(ca, b), + .dev = ca->dev_idx, + }; + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index ddbc0041dc21..172456e07513 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -376,15 +376,12 @@ static inline int is_unavailable_bucket(struct bucket_mark m) + return !is_available_bucket(m); + } + +-static inline int is_fragmented_bucket(struct bucket_mark m, +- struct bch_dev *ca) +-{ +- if (!m.owned_by_allocator && +- m.data_type == BCH_DATA_user && +- bucket_sectors_used(m)) +- return max_t(int, 0, (int) ca->mi.bucket_size - +- bucket_sectors_used(m)); +- return 0; ++static inline int bucket_sectors_fragmented(struct bch_dev *ca, ++ struct bucket_mark m) ++{ ++ return bucket_sectors_used(m) ++ ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) ++ : 0; + } + + static inline int is_stripe_data_bucket(struct bucket_mark m) +@@ -392,11 +389,6 @@ static inline int is_stripe_data_bucket(struct bucket_mark m) + return m.stripe && m.data_type != BCH_DATA_parity; + } + +-static inline int bucket_stripe_sectors(struct bucket_mark m) +-{ +- return is_stripe_data_bucket(m) ? 
m.dirty_sectors : 0; +-} +- + static inline enum bch_data_type bucket_type(struct bucket_mark m) + { + return m.cached_sectors && !m.dirty_sectors +@@ -456,7 +448,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, + if (type == BCH_DATA_sb || type == BCH_DATA_journal) + fs_usage->hidden += size; + +- dev_usage->buckets[type] += nr; ++ dev_usage->d[type].buckets += nr; + } + + static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, +@@ -481,19 +473,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + + u->buckets_alloc += + (int) new.owned_by_allocator - (int) old.owned_by_allocator; ++ u->buckets_ec += (int) new.stripe - (int) old.stripe; + u->buckets_unavailable += + is_unavailable_bucket(new) - is_unavailable_bucket(old); + +- u->buckets_ec += (int) new.stripe - (int) old.stripe; +- u->sectors_ec += bucket_stripe_sectors(new) - +- bucket_stripe_sectors(old); +- +- u->sectors[old.data_type] -= old.dirty_sectors; +- u->sectors[new.data_type] += new.dirty_sectors; +- u->sectors[BCH_DATA_cached] += ++ u->d[old.data_type].sectors -= old.dirty_sectors; ++ u->d[new.data_type].sectors += new.dirty_sectors; ++ u->d[BCH_DATA_cached].sectors += + (int) new.cached_sectors - (int) old.cached_sectors; +- u->sectors_fragmented += +- is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); ++ ++ u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); ++ u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); ++ + preempt_enable(); + + if (!is_available_bucket(old) && is_available_bucket(new)) +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index d5b685196858..25d6785c0fe6 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -153,18 +153,9 @@ static inline unsigned bucket_sectors_used(struct bucket_mark mark) + return mark.dirty_sectors + mark.cached_sectors; + } + +-static inline bool bucket_unused(struct bucket_mark mark) +-{ +- return !mark.owned_by_allocator && +- !mark.data_type && +- !bucket_sectors_used(mark); +-} +- + static inline bool is_available_bucket(struct bucket_mark mark) + { +- return (!mark.owned_by_allocator && +- !mark.dirty_sectors && +- !mark.stripe); ++ return !mark.dirty_sectors && !mark.stripe; + } + + static inline bool bucket_needs_journal_commit(struct bucket_mark m, +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index d6057d22b18e..5fbe940a5f6f 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -52,16 +52,15 @@ struct bucket_array { + }; + + struct bch_dev_usage { +- u64 buckets[BCH_DATA_NR]; + u64 buckets_alloc; ++ u64 buckets_ec; + u64 buckets_unavailable; + +- /* _compressed_ sectors: */ +- u64 sectors[BCH_DATA_NR]; +- u64 sectors_fragmented; +- +- u64 buckets_ec; +- u64 sectors_ec; ++ struct { ++ u64 buckets; ++ u64 sectors; /* _compressed_ sectors: */ ++ u64 fragmented; ++ } d[BCH_DATA_NR]; + }; + + struct bch_fs_usage { +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index e7c8969aaad1..49842ec88390 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -477,11 +477,11 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; + arg.ec_buckets = src.buckets_ec; +- arg.ec_sectors = src.sectors_ec; ++ arg.ec_sectors = 0; + + for (i = 0; i < BCH_DATA_NR; i++) { +- arg.buckets[i] = src.buckets[i]; +- arg.sectors[i] = src.sectors[i]; ++ arg.buckets[i] 
= src.d[i].buckets; ++ arg.sectors[i] = src.d[i].sectors; + } + + percpu_ref_put(&ca->ref); +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index efa7f38ecec6..d0acc1ee5cfe 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -291,7 +291,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) + + fragmented_allowed += ((__dev_buckets_available(ca, usage) * + ca->mi.bucket_size) >> 1); +- fragmented += usage.sectors_fragmented; ++ fragmented += usage.d[BCH_DATA_user].fragmented; + } + + return max_t(s64, 0, fragmented_allowed - fragmented); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 84c589959220..4043950ac4cd 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1215,7 +1215,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) + return ret; + + if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && +- !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { ++ !percpu_u64_get(&ca->usage[0]->d[BCH_DATA_sb].buckets)) { + mutex_lock(&c->sb_lock); + bch2_mark_dev_superblock(ca->fs, ca, 0); + mutex_unlock(&c->sb_lock); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index bfae0d7142e0..4fc5777ecfb0 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -797,61 +797,42 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + nr[c->open_buckets[i].type]++; + + pr_buf(out, +- "free_inc: %zu/%zu\n" +- "free[RESERVE_MOVINGGC]: %zu/%zu\n" +- "free[RESERVE_NONE]: %zu/%zu\n" +- "buckets:\n" +- " capacity: %llu\n" +- " alloc: %llu\n" +- " sb: %llu\n" +- " journal: %llu\n" +- " meta: %llu\n" +- " user: %llu\n" +- " cached: %llu\n" +- " erasure coded: %llu\n" +- " available: %lli\n" +- "sectors:\n" +- " sb: %llu\n" +- " journal: %llu\n" +- " meta: %llu\n" +- " user: %llu\n" +- " cached: %llu\n" +- " erasure coded: %llu\n" +- " fragmented: %llu\n" +- " copygc threshold: %llu\n" +- "freelist_wait: %s\n" +- "open buckets: %u/%u (reserved %u)\n" +- "open_buckets_wait: %s\n" +- "open_buckets_btree: %u\n" +- "open_buckets_user: %u\n" +- "btree reserve cache: %u\n", +- fifo_used(&ca->free_inc), ca->free_inc.size, +- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, +- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, +- ca->mi.nbuckets - ca->mi.first_bucket, +- stats.buckets_alloc, +- stats.buckets[BCH_DATA_sb], +- stats.buckets[BCH_DATA_journal], +- stats.buckets[BCH_DATA_btree], +- stats.buckets[BCH_DATA_user], +- stats.buckets[BCH_DATA_cached], +- stats.buckets_ec, +- __dev_buckets_available(ca, stats), +- stats.sectors[BCH_DATA_sb], +- stats.sectors[BCH_DATA_journal], +- stats.sectors[BCH_DATA_btree], +- stats.sectors[BCH_DATA_user], +- stats.sectors[BCH_DATA_cached], +- stats.sectors_ec, +- stats.sectors_fragmented, +- c->copygc_threshold, +- c->freelist_wait.list.first ? "waiting" : "empty", +- c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, +- BTREE_NODE_OPEN_BUCKET_RESERVE, +- c->open_buckets_wait.list.first ? 
"waiting" : "empty", +- nr[BCH_DATA_btree], +- nr[BCH_DATA_user], +- c->btree_reserve_cache_nr); ++ "\t\t buckets\t sectors fragmented\n" ++ "capacity%16llu\n", ++ ca->mi.nbuckets - ca->mi.first_bucket); ++ ++ for (i = 1; i < BCH_DATA_NR; i++) ++ pr_buf(out, "%-8s%16llu%16llu%16llu\n", ++ bch2_data_types[i], stats.d[i].buckets, ++ stats.d[i].sectors, stats.d[i].fragmented); ++ ++ pr_buf(out, ++ "ec\t%16llu\n" ++ "available%15llu\n" ++ "alloc\t%16llu\n" ++ "\n" ++ "free_inc\t\t%zu/%zu\n" ++ "free[RESERVE_MOVINGGC]\t%zu/%zu\n" ++ "free[RESERVE_NONE]\t%zu/%zu\n" ++ "freelist_wait\t\t%s\n" ++ "open buckets\t\t%u/%u (reserved %u)\n" ++ "open_buckets_wait\t%s\n" ++ "open_buckets_btree\t%u\n" ++ "open_buckets_user\t%u\n" ++ "btree reserve cache\t%u\n", ++ stats.buckets_ec, ++ __dev_buckets_available(ca, stats), ++ stats.buckets_alloc, ++ fifo_used(&ca->free_inc), ca->free_inc.size, ++ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, ++ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ++ c->freelist_wait.list.first ? "waiting" : "empty", ++ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, ++ BTREE_NODE_OPEN_BUCKET_RESERVE, ++ c->open_buckets_wait.list.first ? "waiting" : "empty", ++ nr[BCH_DATA_btree], ++ nr[BCH_DATA_user], ++ c->btree_reserve_cache_nr); + } + + static const char * const bch2_rw[] = { +-- +cgit v1.2.3 + + +From 9252fcd34b92af106efeb2a7ffa8c8c717b0002e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 22 Jan 2021 18:19:15 -0500 +Subject: bcachefs: Kill bch2_invalidate_bucket() + +This patch is working towards eventually getting rid of the in memory +struct bucket, and relying only on the btree representation. + +Since bch2_invalidate_bucket() was only used for incrementing gens, not +invalidating cached data, no other counters were being changed as a side +effect - meaning it's safe for the allocator code to increment the +bucket gen directly. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 30 ++++++++++++++---------------- + fs/bcachefs/buckets.c | 40 ---------------------------------------- + fs/bcachefs/buckets.h | 2 -- + 3 files changed, 14 insertions(+), 58 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 1a670f6bd298..e7df246ba6cf 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -896,34 +896,32 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + + /* first, put on free_inc and mark as owned by allocator: */ + percpu_down_read(&c->mark_lock); +- spin_lock(&c->freelist_lock); +- +- verify_not_on_freelist(c, ca, b); +- +- BUG_ON(!fifo_push(&ca->free_inc, b)); +- + g = bucket(ca, b); + m = READ_ONCE(g->mark); + +- invalidating_cached_data = m.cached_sectors != 0; ++ BUG_ON(m.data_type || m.dirty_sectors); ++ ++ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); ++ ++ spin_lock(&c->freelist_lock); ++ verify_not_on_freelist(c, ca, b); ++ BUG_ON(!fifo_push(&ca->free_inc, b)); ++ spin_unlock(&c->freelist_lock); + + /* + * If we're not invalidating cached data, we only increment the bucket + * gen in memory here, the incremented gen will be updated in the btree + * by bch2_trans_mark_pointer(): + */ ++ if (!m.cached_sectors && ++ !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { ++ bucket_cmpxchg(g, m, m.gen++); ++ percpu_up_read(&c->mark_lock); ++ goto out; ++ } + +- if (!invalidating_cached_data) +- bch2_invalidate_bucket(c, ca, b, &m); +- else +- bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); +- +- spin_unlock(&c->freelist_lock); + percpu_up_read(&c->mark_lock); + +- if (!invalidating_cached_data) +- goto out; +- + /* + * If the read-only path is trying to shut down, we can't be generating + * new btree updates: +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 172456e07513..eafaf3faf959 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -641,46 +641,6 @@ unwind: + ret; \ + }) + +-static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, +- size_t b, struct bucket_mark *ret, +- bool gc) +-{ +- struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); +- struct bucket *g = __bucket(ca, b, gc); +- struct bucket_mark old, new; +- +- old = bucket_cmpxchg(g, new, ({ +- BUG_ON(!is_available_bucket(new)); +- +- new.owned_by_allocator = true; +- new.data_type = 0; +- new.cached_sectors = 0; +- new.dirty_sectors = 0; +- new.gen++; +- })); +- +- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); +- +- if (old.cached_sectors) +- update_cached_sectors(c, fs_usage, ca->dev_idx, +- -((s64) old.cached_sectors)); +- +- if (!gc) +- *ret = old; +- return 0; +-} +- +-void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, +- size_t b, struct bucket_mark *old) +-{ +- do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, +- ca, b, old); +- +- if (!old->owned_by_allocator && old->cached_sectors) +- trace_invalidate(ca, bucket_to_sector(ca, b), +- old->cached_sectors); +-} +- + static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + bool gc) +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 25d6785c0fe6..443d1dd6d91d 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -236,8 +236,6 @@ bch2_fs_usage_read_short(struct bch_fs *); + void bch2_bucket_seq_cleanup(struct bch_fs *); + void 
bch2_fs_usage_initialize(struct bch_fs *); + +-void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, +- size_t, struct bucket_mark *); + void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, + size_t, bool, struct gc_pos, unsigned); + void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, +-- +cgit v1.2.3 + + +From 9057362e2741a3e8322a9822d30e4c66fe124e7f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 22 Jan 2021 17:56:34 -0500 +Subject: bcachefs: Mark superblocks transactionally + +More work towards getting rid of the in memory struct bucket: this path +adds code for marking superblock and journal buckets via the btree, and +uses it in the device add and journal resize paths. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 46 +++++------- + fs/bcachefs/alloc_background.h | 1 - + fs/bcachefs/buckets.c | 162 +++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/buckets.h | 6 ++ + fs/bcachefs/journal.c | 19 ++++- + fs/bcachefs/super.c | 24 +++--- + 6 files changed, 211 insertions(+), 47 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index e7df246ba6cf..f9b5a7271c3d 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -323,48 +323,36 @@ err: + return ret; + } + +-int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) ++int bch2_alloc_write(struct bch_fs *c, unsigned flags) + { + struct btree_trans trans; + struct btree_iter *iter; +- u64 first_bucket = ca->mi.first_bucket; +- u64 nbuckets = ca->mi.nbuckets; ++ struct bch_dev *ca; ++ unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, +- POS(ca->dev_idx, first_bucket), ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + +- while (iter->pos.offset < nbuckets) { +- bch2_trans_cond_resched(&trans); +- +- ret = bch2_alloc_write_key(&trans, iter, flags); +- if (ret) +- break; +- bch2_btree_iter_next_slot(iter); +- } +- +- bch2_trans_exit(&trans); +- +- return ret; +-} ++ for_each_member_device(ca, c, i) { ++ bch2_btree_iter_set_pos(iter, ++ POS(ca->dev_idx, ca->mi.first_bucket)); + +-int bch2_alloc_write(struct bch_fs *c, unsigned flags) +-{ +- struct bch_dev *ca; +- unsigned i; +- int ret = 0; ++ while (iter->pos.offset < ca->mi.nbuckets) { ++ bch2_trans_cond_resched(&trans); + +- for_each_member_device(ca, c, i) { +- bch2_dev_alloc_write(c, ca, flags); +- if (ret) { +- percpu_ref_put(&ca->io_ref); +- break; ++ ret = bch2_alloc_write_key(&trans, iter, flags); ++ if (ret) { ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ bch2_btree_iter_next_slot(iter); + } + } +- ++err: ++ bch2_trans_exit(&trans); + return ret; + } + +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index d10ff56e4de1..f60fcebff2ce 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -98,7 +98,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); + void bch2_dev_allocator_stop(struct bch_dev *); + int bch2_dev_allocator_start(struct bch_dev *); + +-int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned); + int bch2_alloc_write(struct bch_fs *, unsigned); + void bch2_fs_allocator_background_init(struct bch_fs *); + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index eafaf3faf959..cb0f0e09a2c1 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ 
-2057,6 +2057,168 @@ int bch2_trans_mark_update(struct btree_trans *trans, + return ret; + } + ++static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ++ struct bch_dev *ca, size_t b, ++ enum bch_data_type type, ++ unsigned sectors) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_alloc_unpacked u; ++ struct bkey_i_alloc *a; ++ struct bch_extent_ptr ptr = { ++ .dev = ca->dev_idx, ++ .offset = bucket_to_sector(ca, b), ++ }; ++ int ret = 0; ++ ++ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); ++ if (ret) ++ return ret; ++ ++ if (u.data_type && u.data_type != type) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ iter->pos.inode, iter->pos.offset, u.gen, ++ bch2_data_types[u.data_type], ++ bch2_data_types[type], ++ bch2_data_types[type]); ++ ret = -EIO; ++ goto out; ++ } ++ ++ if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n" ++ "while marking %s", ++ iter->pos.inode, iter->pos.offset, u.gen, ++ bch2_data_types[u.data_type ?: type], ++ u.dirty_sectors, sectors, ca->mi.bucket_size, ++ bch2_data_types[type]); ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (u.data_type == type && ++ u.dirty_sectors == sectors) ++ goto out; ++ ++ u.data_type = type; ++ u.dirty_sectors = sectors; ++ ++ bkey_alloc_init(&a->k_i); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, u); ++ bch2_trans_update(trans, iter, &a->k_i, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ++ struct disk_reservation *res, ++ struct bch_dev *ca, size_t b, ++ enum bch_data_type type, ++ unsigned sectors) ++{ ++ return __bch2_trans_do(trans, res, NULL, 0, ++ __bch2_trans_mark_metadata_bucket(trans, ca, b, BCH_DATA_journal, ++ ca->mi.bucket_size)); ++ ++} ++ ++static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, ++ struct disk_reservation *res, ++ struct bch_dev *ca, ++ u64 start, u64 end, ++ enum bch_data_type type, ++ u64 *bucket, unsigned *bucket_sectors) ++{ ++ int ret; ++ ++ do { ++ u64 b = sector_to_bucket(ca, start); ++ unsigned sectors = ++ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; ++ ++ if (b != *bucket) { ++ if (*bucket_sectors) { ++ ret = bch2_trans_mark_metadata_bucket(trans, res, ca, ++ *bucket, type, *bucket_sectors); ++ if (ret) ++ return ret; ++ } ++ ++ *bucket = b; ++ *bucket_sectors = 0; ++ } ++ ++ *bucket_sectors += sectors; ++ start += sectors; ++ } while (!ret && start < end); ++ ++ return 0; ++} ++ ++static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, ++ struct disk_reservation *res, ++ struct bch_dev *ca) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 bucket = 0; ++ unsigned i, bucket_sectors = 0; ++ int ret; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset == BCH_SB_SECTOR) { ++ ret = bch2_trans_mark_metadata_sectors(trans, res, ca, ++ 0, BCH_SB_SECTOR, ++ BCH_DATA_sb, &bucket, &bucket_sectors); ++ if (ret) ++ return ret; ++ } ++ ++ ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset, ++ offset + (1 << layout->sb_max_size_bits), ++ BCH_DATA_sb, &bucket, 
&bucket_sectors); ++ if (ret) ++ return ret; ++ } ++ ++ if (bucket_sectors) { ++ ret = bch2_trans_mark_metadata_bucket(trans, res, ca, ++ bucket, BCH_DATA_sb, bucket_sectors); ++ if (ret) ++ return ret; ++ } ++ ++ for (i = 0; i < ca->journal.nr; i++) { ++ ret = bch2_trans_mark_metadata_bucket(trans, res, ca, ++ ca->journal.buckets[i], ++ BCH_DATA_journal, ca->mi.bucket_size); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_trans_mark_dev_sb(struct bch_fs *c, ++ struct disk_reservation *res, ++ struct bch_dev *ca) ++{ ++ return bch2_trans_do(c, res, NULL, 0, ++ __bch2_trans_mark_dev_sb(&trans, res, ca)); ++} ++ + /* Disk reservations: */ + + void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 443d1dd6d91d..37346240cb7b 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -259,6 +259,12 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, + struct bkey_i *insert, unsigned); + void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); + ++int bch2_trans_mark_metadata_bucket(struct btree_trans *, ++ struct disk_reservation *, struct bch_dev *, ++ size_t, enum bch_data_type, unsigned); ++int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *, ++ struct bch_dev *); ++ + /* disk reservations: */ + + void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index be019c4729e5..d6273c8d7d0c 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -9,6 +9,7 @@ + #include "alloc_foreground.h" + #include "bkey_methods.h" + #include "btree_gc.h" ++#include "btree_update.h" + #include "buckets.h" + #include "journal.h" + #include "journal_io.h" +@@ -824,18 +825,28 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + +- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, +- ca->mi.bucket_size, +- gc_phase(GC_PHASE_SB), +- 0); ++ if (!c || new_fs) ++ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), ++ 0); + + if (c) { + spin_unlock(&c->journal.lock); + percpu_up_read(&c->mark_lock); + } + ++ if (c && !new_fs) ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_trans_mark_metadata_bucket(&trans, NULL, ca, ++ bucket, BCH_DATA_journal, ++ ca->mi.bucket_size)); ++ + if (!new_fs) + bch2_open_bucket_put(c, ob); ++ ++ if (ret) ++ goto err; + } + err: + bch2_sb_resize_journal(&ca->disk_sb, +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 4043950ac4cd..682f9febc705 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1214,13 +1214,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) + if (ret) + return ret; + +- if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && +- !percpu_u64_get(&ca->usage[0]->d[BCH_DATA_sb].buckets)) { +- mutex_lock(&c->sb_lock); +- bch2_mark_dev_superblock(ca->fs, ca, 0); +- mutex_unlock(&c->sb_lock); +- } +- + bch2_dev_sysfs_online(c, ca); + + if (c->sb.nr_devices == 1) +@@ -1594,7 +1587,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + * allocate the journal, reset all the marks, then remark after we + * attach... 
+ */ +- bch2_mark_dev_superblock(ca->fs, ca, 0); ++ bch2_mark_dev_superblock(NULL, ca, 0); + + err = "journal alloc failed"; + ret = bch2_dev_journal_alloc(ca); +@@ -1653,15 +1646,13 @@ have_slot: + ca->disk_sb.sb->dev_idx = dev_idx; + bch2_dev_attach(c, ca, dev_idx); + +- bch2_mark_dev_superblock(c, ca, 0); +- + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +- err = "alloc write failed"; +- ret = bch2_dev_alloc_write(c, ca, 0); ++ err = "error marking superblock"; ++ ret = bch2_trans_mark_dev_sb(c, NULL, ca); + if (ret) +- goto err; ++ goto err_late; + + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + err = __bch2_dev_read_write(c, ca); +@@ -1682,6 +1673,7 @@ err: + bch_err(c, "Unable to add device: %s", err); + return ret; + err_late: ++ up_write(&c->state_lock); + bch_err(c, "Error going rw after adding device: %s", err); + return -EINVAL; + } +@@ -1717,6 +1709,12 @@ int bch2_dev_online(struct bch_fs *c, const char *path) + } + + ca = bch_dev_locked(c, dev_idx); ++ ++ if (bch2_trans_mark_dev_sb(c, NULL, ca)) { ++ err = "bch2_trans_mark_dev_sb() error"; ++ goto err; ++ } ++ + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + err = __bch2_dev_read_write(c, ca); + if (err) +-- +cgit v1.2.3 + + +From a910087a287e0711fd21eea4351636de520dc684 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 25 Jan 2021 14:04:31 -0500 +Subject: bcachefs: Fix an assertion + +If we're invalidating a bucket that has cached data in it, data_type +won't be 0 - oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index f9b5a7271c3d..a198c64f3412 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -887,7 +887,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + g = bucket(ca, b); + m = READ_ONCE(g->mark); + +- BUG_ON(m.data_type || m.dirty_sectors); ++ BUG_ON(m.dirty_sectors); + + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + +@@ -903,6 +903,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + */ + if (!m.cached_sectors && + !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { ++ BUG_ON(m.data_type); + bucket_cmpxchg(g, m, m.gen++); + percpu_up_read(&c->mark_lock); + goto out; +-- +cgit v1.2.3 + + +From f0185b5408546656c74243b2ea2d6bf7abab41cc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 27 Jan 2021 19:36:09 -0500 +Subject: bcachefs: Fix build in userspace + +The userspace bch_err() macro doesn't use the filesystem argument. Could +also be fixed with a better macro. 
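+
+For reference, the "better macro" alternative would be a userspace bch_err()
+that always evaluates its filesystem argument, roughly along these lines
+(purely illustrative - not the macro bcachefs-tools actually ships):
+
+    #define bch_err(c, fmt, ...)                                  \
+    do {                                                          \
+        (void) (c);  /* keep 'c' used even when it isn't printed */ \
+        fprintf(stderr, fmt "\n", ##__VA_ARGS__);                 \
+    } while (0)
+
+Passing trans->c directly, as done here, sidesteps the problem without touching
+the tools side.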
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 7a3b9cd3c8ba..4e26ef6f5813 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -744,7 +744,6 @@ err: + static int ec_stripe_bkey_update(struct btree_trans *trans, + struct bkey_i_stripe *new) + { +- struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_s_c k; + const struct bch_stripe *existing; +@@ -759,7 +758,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, + goto err; + + if (!k.k || k.k->type != KEY_TYPE_stripe) { +- bch_err(c, "error updating stripe: not found"); ++ bch_err(trans->c, "error updating stripe: not found"); + ret = -ENOENT; + goto err; + } +@@ -767,7 +766,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, + existing = bkey_s_c_to_stripe(k).v; + + if (existing->nr_blocks != new->v.nr_blocks) { +- bch_err(c, "error updating stripe: nr_blocks does not match"); ++ bch_err(trans->c, "error updating stripe: nr_blocks does not match"); + ret = -EINVAL; + goto err; + } +-- +cgit v1.2.3 + + +From a60663b3960afc32d521398f3935bc4e31d1b517 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Jan 2021 21:22:19 -0500 +Subject: bcachefs: Fix BCH_REPLICAS_MAX check + +Ideally, this limit will be going away in the future. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 78835bd2d6bc..751efd28b672 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -276,19 +276,19 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) + return "Bad number of member devices"; + + if (!BCH_SB_META_REPLICAS_WANT(sb) || +- BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) ++ BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) + return "Invalid number of metadata replicas"; + + if (!BCH_SB_META_REPLICAS_REQ(sb) || +- BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) ++ BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) + return "Invalid number of metadata replicas"; + + if (!BCH_SB_DATA_REPLICAS_WANT(sb) || +- BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) ++ BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) + return "Invalid number of data replicas"; + + if (!BCH_SB_DATA_REPLICAS_REQ(sb) || +- BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) ++ BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) + return "Invalid number of data replicas"; + + if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) +-- +cgit v1.2.3 + + +From aeaf2eba7183349764b6b936c82489d9dec34721 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Jan 2021 16:04:12 -0500 +Subject: bcachefs: Improve diagnostics when journal entries are missing + +There's an outstanding bug with journal entries being missing in journal +replay. This patch adds code to print out where the journal entries were +physically located that were around the entry(ies) being missing, which +should make debugging easier. 
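+
+With the physical pointers kept per entry, the "missing entries" fsck error can
+say where the surrounding entries actually live. With entirely made-up numbers
+it comes out roughly as:
+
+    journal entries 1000-1003 missing! (replaying 990-1100)
+      prev at 0:8200 (offset 8) size 8
+      next at 1:13320 (offset 8)
+
+i.e. device:sector for each copy, the offset within that device's bucket, and
+the size in sectors of the entry preceding the gap.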
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 8 +++- + fs/bcachefs/journal_io.c | 112 ++++++++++++++++++++++++++++++++++++----------- + fs/bcachefs/journal_io.h | 4 +- + 3 files changed, 96 insertions(+), 28 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index d6273c8d7d0c..a7c5f5fddedb 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1011,13 +1011,19 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + } + + list_for_each_entry(i, journal_entries, list) { ++ unsigned ptr; ++ + seq = le64_to_cpu(i->j.seq); + BUG_ON(seq >= cur_seq); + + if (seq < last_seq) + continue; + +- journal_seq_pin(j, seq)->devs = i->devs; ++ p = journal_seq_pin(j, seq); ++ ++ p->devs.nr = 0; ++ for (ptr = 0; ptr < i->nr_ptrs; ptr++) ++ bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + } + + spin_lock(&j->lock); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 750f6fab2e63..eacc9b2c362f 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -46,15 +46,16 @@ struct journal_list { + * be replayed: + */ + static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, ++ struct bch_extent_ptr entry_ptr, + struct journal_list *jlist, struct jset *j, + bool bad) + { +- struct journal_replay *i, *pos; +- struct bch_devs_list devs = { .nr = 0 }; ++ struct journal_replay *i, *pos, *dup = NULL; ++ struct bch_extent_ptr *ptr; + struct list_head *where; + size_t bytes = vstruct_bytes(j); + u64 last_seq = 0; +- int ret; ++ int ret = JOURNAL_ENTRY_ADD_OK; + + list_for_each_entry_reverse(i, jlist->head, list) { + if (!JSET_NO_FLUSH(&i->j)) { +@@ -88,28 +89,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + + where = jlist->head; + add: +- i = where->next != jlist->head ++ dup = where->next != jlist->head + ? container_of(where->next, struct journal_replay, list) + : NULL; + ++ if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq)) ++ dup = NULL; ++ + /* + * Duplicate journal entries? 
If so we want the one that didn't have a + * checksum error: + */ +- if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { +- if (i->bad) { +- devs = i->devs; +- __journal_replay_free(i); ++ if (dup) { ++ if (dup->bad) { ++ /* we'll replace @dup: */ + } else if (bad) { ++ i = dup; + goto found; + } else { +- fsck_err_on(bytes != vstruct_bytes(&i->j) || +- memcmp(j, &i->j, bytes), c, ++ fsck_err_on(bytes != vstruct_bytes(&dup->j) || ++ memcmp(j, &dup->j, bytes), c, + "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); ++ i = dup; + goto found; + } +- + } + + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); +@@ -118,17 +122,34 @@ add: + goto out; + } + +- list_add(&i->list, where); +- i->devs = devs; +- i->bad = bad; +- i->ignore = false; ++ i->nr_ptrs = 0; ++ i->bad = bad; ++ i->ignore = false; + memcpy(&i->j, j, bytes); ++ ++ if (dup) { ++ i->nr_ptrs = dup->nr_ptrs; ++ memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); ++ __journal_replay_free(dup); ++ } ++ ++ list_add(&i->list, where); + found: +- if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) +- bch2_dev_list_add_dev(&i->devs, ca->dev_idx); +- else +- fsck_err_on(1, c, "duplicate journal entries on same device"); +- ret = JOURNAL_ENTRY_ADD_OK; ++ for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { ++ if (ptr->dev == ca->dev_idx) { ++ bch_err(c, "duplicate journal entry %llu on same device", ++ le64_to_cpu(i->j.seq)); ++ goto out; ++ } ++ } ++ ++ if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { ++ bch_err(c, "found too many copies of journal entry %llu", ++ le64_to_cpu(i->j.seq)); ++ goto out; ++ } ++ ++ i->ptrs[i->nr_ptrs++] = entry_ptr; + out: + fsck_err: + return ret; +@@ -654,7 +675,10 @@ reread: + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + mutex_lock(&jlist->lock); +- ret = journal_entry_add(c, ca, jlist, j, ret != 0); ++ ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { ++ .dev = ca->dev_idx, ++ .offset = offset, ++ }, jlist, j, ret != 0); + mutex_unlock(&jlist->lock); + + switch (ret) { +@@ -742,6 +766,23 @@ err: + goto out; + } + ++static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct journal_replay *j) ++{ ++ unsigned i; ++ ++ for (i = 0; i < j->nr_ptrs; i++) { ++ struct bch_dev *ca = c->devs[j->ptrs[i].dev]; ++ ++ if (i) ++ pr_buf(out, " "); ++ pr_buf(out, "%u:%llu (offset %llu)", ++ j->ptrs[i].dev, ++ (u64) j->ptrs[i].offset, ++ (u64) j->ptrs[i].offset % ca->mi.bucket_size); ++ } ++} ++ + int bch2_journal_read(struct bch_fs *c, struct list_head *list, + u64 *blacklist_seq, u64 *start_seq) + { +@@ -839,6 +880,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + + while (seq < le64_to_cpu(i->j.seq)) { + u64 missing_start, missing_end; ++ char buf1[200], buf2[200]; + + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) +@@ -853,10 +895,23 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + ++ if (i->list.prev != list) { ++ struct printbuf out = PBUF(buf1); ++ struct journal_replay *p = list_prev_entry(i, list); ++ ++ bch2_journal_ptrs_to_text(&out, c, p); ++ pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); ++ } else ++ sprintf(buf1, "(none)"); ++ bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); ++ + missing_end = seq - 1; +- fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)", ++ fsck_err(c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" ++ " prev at %s\n" ++ " next at %s", + missing_start, missing_end, +- last_seq, *blacklist_seq - 1); ++ last_seq, *blacklist_seq - 1, ++ buf1, buf2); + } + + seq++; +@@ -865,7 +920,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; +- struct bch_replicas_padded replicas; ++ struct bch_replicas_padded replicas = { ++ .e.data_type = BCH_DATA_journal, ++ .e.nr_required = 1, ++ }; ++ unsigned ptr; + char buf[80]; + + if (i->ignore) +@@ -875,13 +934,14 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + if (ret) + goto fsck_err; + ++ for (ptr = 0; ptr < i->nr_ptrs; ptr++) ++ replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; ++ + /* + * If we're mounting in degraded mode - if we didn't read all + * the devices - this is wrong: + */ + +- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); +- + if (!degraded && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +index 6b4c80968f52..a4931ab93a68 100644 +--- a/fs/bcachefs/journal_io.h ++++ b/fs/bcachefs/journal_io.h +@@ -8,7 +8,9 @@ + */ + struct journal_replay { + struct list_head list; +- struct bch_devs_list devs; ++ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; ++ unsigned nr_ptrs; ++ + /* checksum error, but we may want to try using it anyways: */ + bool bad; + bool ignore; +-- +cgit v1.2.3 + + +From b958d4ce95be14518fd1975f095e20f22b19d28c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Jan 2021 16:04:38 -0500 +Subject: bcachefs: Refactor checking of btree topology + +Still a lot of work to be done here: we can't yet repair btree topology +issues, but this patch refactors things so that we have better access to +what we need in the topology checks. Next up will be figuring out a way +to do btree updates during gc, before journal replay is done. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 83 +++++++++++++++++++++++++++++--------------------- + 1 file changed, 48 insertions(+), 35 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index efeaec3d9c03..d31add9bb86b 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -51,39 +51,46 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) + } + + static int bch2_gc_check_topology(struct bch_fs *c, +- struct bkey_s_c k, +- struct bpos *expected_start, +- struct bpos expected_end, ++ struct btree *b, ++ struct bkey_buf *prev, ++ struct bkey_buf cur, + bool is_last) + { ++ struct bpos node_start = b->data->min_key; ++ struct bpos node_end = b->data->max_key; ++ struct bpos expected_start = bkey_deleted(&prev->k->k) ++ ? 
node_start ++ : bkey_successor(prev->k->k.p); ++ char buf1[200], buf2[200]; + int ret = 0; + +- if (k.k->type == KEY_TYPE_btree_ptr_v2) { +- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); + +- if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, +- "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", +- bp.v->min_key.inode, +- bp.v->min_key.offset, +- expected_start->inode, +- expected_start->offset)) { ++ if (bkey_deleted(&prev->k->k)) ++ scnprintf(buf1, sizeof(buf1), "start of node: %llu:%llu", ++ node_start.inode, ++ node_start.offset); ++ else ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); ++ ++ if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, ++ "btree node with incorrect min_key:\n prev %s\n cur %s", ++ buf1, ++ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { + BUG(); + } + } + +- *expected_start = bkey_cmp(k.k->p, POS_MAX) +- ? bkey_successor(k.k->p) +- : k.k->p; +- + if (fsck_err_on(is_last && +- bkey_cmp(k.k->p, expected_end), c, +- "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", +- k.k->p.inode, +- k.k->p.offset, +- expected_end.inode, +- expected_end.offset)) { ++ bkey_cmp(cur.k->k.p, node_end), c, ++ "btree node with incorrect max_key:\n %s\n expected %s", ++ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), ++ (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { + BUG(); + } ++ ++ bch2_bkey_buf_copy(prev, c, cur.k); + fsck_err: + return ret; + } +@@ -169,10 +176,10 @@ fsck_err: + static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + bool initial) + { +- struct bpos next_node_start = b->data->min_key; + struct btree_node_iter iter; + struct bkey unpacked; + struct bkey_s_c k; ++ struct bkey_buf prev, cur; + int ret = 0; + + *max_stale = 0; +@@ -181,6 +188,9 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + return 0; + + bch2_btree_node_iter_init_from_start(&iter, b); ++ bch2_bkey_buf_init(&prev); ++ bch2_bkey_buf_init(&cur); ++ bkey_init(&prev.k->k); + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { + bch2_bkey_debugcheck(c, b, k); +@@ -192,15 +202,17 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + bch2_btree_node_iter_advance(&iter, b); + + if (b->c.level) { +- ret = bch2_gc_check_topology(c, k, +- &next_node_start, +- b->data->max_key, ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ ++ ret = bch2_gc_check_topology(c, b, &prev, cur, + bch2_btree_node_iter_end(&iter)); + if (ret) + break; + } + } + ++ bch2_bkey_buf_exit(&cur, c); ++ bch2_bkey_buf_exit(&prev, c); + return ret; + } + +@@ -267,13 +279,14 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + { + struct btree_and_journal_iter iter; + struct bkey_s_c k; +- struct bpos next_node_start = b->data->min_key; +- struct bkey_buf tmp; ++ struct bkey_buf cur, prev; + u8 max_stale = 0; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); +- bch2_bkey_buf_init(&tmp); ++ bch2_bkey_buf_init(&prev); ++ bch2_bkey_buf_init(&cur); ++ bkey_init(&prev.k->k); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_debugcheck(c, b, k); +@@ -288,20 +301,19 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + if (b->c.level) { + struct btree *child; + +- 
bch2_bkey_buf_reassemble(&tmp, c, k); +- k = bkey_i_to_s_c(tmp.k); ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ k = bkey_i_to_s_c(cur.k); + + bch2_btree_and_journal_iter_advance(&iter); + +- ret = bch2_gc_check_topology(c, k, +- &next_node_start, +- b->data->max_key, ++ ret = bch2_gc_check_topology(c, b, ++ &prev, cur, + !bch2_btree_and_journal_iter_peek(&iter).k); + if (ret) + break; + + if (b->c.level > target_depth) { +- child = bch2_btree_node_get_noiter(c, tmp.k, ++ child = bch2_btree_node_get_noiter(c, cur.k, + b->c.btree_id, b->c.level - 1); + ret = PTR_ERR_OR_ZERO(child); + if (ret) +@@ -319,7 +331,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + } + } + +- bch2_bkey_buf_exit(&tmp, c); ++ bch2_bkey_buf_exit(&cur, c); ++ bch2_bkey_buf_exit(&prev, c); + return ret; + } + +-- +cgit v1.2.3 + + +From a69a607748b0f13268720ea91560fb94c883242d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Jan 2021 20:13:54 -0500 +Subject: bcachefs: Add BTREE_PTR_RANGE_UPDATED + +This is so that when we discover btree topology issues, we can just +update the pointer to a btree node and signal btree read path that the +min/max keys in the node header should be updated from the node pointer. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 5 +++-- + fs/bcachefs/btree_io.c | 5 +++++ + fs/bcachefs/btree_update_interior.c | 1 - + fs/bcachefs/extents.c | 8 +++----- + 4 files changed, 11 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 307d5523a52d..6dc150cbf2af 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -603,13 +603,14 @@ struct bch_btree_ptr_v2 { + __u64 mem_ptr; + __le64 seq; + __le16 sectors_written; +- /* In case we ever decide to do variable size btree nodes: */ +- __le16 sectors; ++ __le16 flags; + struct bpos min_key; + struct bch_extent_ptr start[0]; + __u64 _data[0]; + } __attribute__((packed, aligned(8))); + ++LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); ++ + struct bch_extent { + struct bch_val v; + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 65f7e36677b7..91e578b2d8c0 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -753,6 +753,11 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + ++ if (BTREE_PTR_RANGE_UPDATED(bp)) { ++ b->data->min_key = bp->min_key; ++ b->data->max_key = b->key.k.p; ++ } ++ + btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, b, NULL, + "incorrect min_key: got %llu:%llu should be %llu:%llu", +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 5bb653298c6c..8919ea628138 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -297,7 +297,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + bp->v.mem_ptr = 0; + bp->v.seq = b->data->keys.seq; + bp->v.sectors_written = 0; +- bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); + } + + if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index c0ae31238b48..67ba2c21627e 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -215,9 +215,8 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + { + struct bkey_s_c_btree_ptr_v2 bp = 
bkey_s_c_to_btree_ptr_v2(k); + +- pr_buf(out, "seq %llx sectors %u written %u min_key ", ++ pr_buf(out, "seq %llx written %u min_key ", + le64_to_cpu(bp.v->seq), +- le16_to_cpu(bp.v->sectors), + le16_to_cpu(bp.v->sectors_written)); + + bch2_bpos_to_text(out, bp.v->min_key); +@@ -1082,10 +1081,9 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) + unsigned nonce = UINT_MAX; + unsigned i; + +- if (k.k->type == KEY_TYPE_btree_ptr) ++ if (k.k->type == KEY_TYPE_btree_ptr || ++ k.k->type == KEY_TYPE_btree_ptr_v2) + size_ondisk = c->opts.btree_node_size; +- if (k.k->type == KEY_TYPE_btree_ptr_v2) +- size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); + + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) +-- +cgit v1.2.3 + + +From 53033939b1430894cc03df85f964e73642ceab1a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Jan 2021 20:15:46 -0500 +Subject: bcachefs: Add support for doing btree updates prior to journal replay + +Some errors may need to be fixed in order for GC to successfully run - +walk and mark all metadata. But we can't start the allocators and do +normal btree updates until after GC has completed, and allocation +information is known to be consistent, so we need a different method of +doing btree updates. + +Fortunately, we already have code for walking the btree while overlaying +keys from the journal to be replayed. This patch adds an update path +that adds keys to the list of keys to be replayed by journal replay, and +also fixes up iterators. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 + + fs/bcachefs/btree_gc.c | 23 +++--- + fs/bcachefs/btree_gc.h | 3 +- + fs/bcachefs/recovery.c | 208 ++++++++++++++++++++++++++++++++++++------------- + fs/bcachefs/recovery.h | 17 ++-- + fs/bcachefs/super.c | 1 + + fs/bcachefs/sysfs.c | 2 +- + 7 files changed, 176 insertions(+), 81 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 06d68e97ae4d..a6a2af547c95 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -540,11 +540,13 @@ struct journal_keys { + struct journal_key { + enum btree_id btree_id:8; + unsigned level:8; ++ bool allocated; + struct bkey_i *k; + u32 journal_seq; + u32 journal_offset; + } *d; + size_t nr; ++ size_t size; + u64 journal_seq_base; + }; + +@@ -841,6 +843,7 @@ struct bch_fs { + struct journal journal; + struct list_head journal_entries; + struct journal_keys journal_keys; ++ struct list_head journal_iters; + + u64 last_bucket_seq_cleanup; + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index d31add9bb86b..69529f68e355 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -274,7 +274,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + } + + static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, +- struct journal_keys *journal_keys, + unsigned target_depth) + { + struct btree_and_journal_iter iter; +@@ -283,7 +282,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + u8 max_stale = 0; + int ret = 0; + +- bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); +@@ -320,7 +319,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + break; + + ret = bch2_gc_btree_init_recurse(c, child, +- journal_keys, target_depth); ++ 
target_depth); + six_unlock_read(&child->c.lock); + + if (ret) +@@ -333,11 +332,11 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); ++ bch2_btree_and_journal_iter_exit(&iter); + return ret; + } + + static int bch2_gc_btree_init(struct bch_fs *c, +- struct journal_keys *journal_keys, + enum btree_id btree_id) + { + struct btree *b; +@@ -368,8 +367,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, + } + + if (b->c.level >= target_depth) +- ret = bch2_gc_btree_init_recurse(c, b, +- journal_keys, target_depth); ++ ret = bch2_gc_btree_init_recurse(c, b, target_depth); + + if (!ret) + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), +@@ -386,8 +384,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) + (int) btree_id_to_gc_phase(r); + } + +-static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, +- bool initial) ++static int bch2_gc_btrees(struct bch_fs *c, bool initial) + { + enum btree_id ids[BTREE_ID_NR]; + unsigned i; +@@ -399,8 +396,7 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, + for (i = 0; i < BTREE_ID_NR; i++) { + enum btree_id id = ids[i]; + int ret = initial +- ? bch2_gc_btree_init(c, journal_keys, +- id) ++ ? bch2_gc_btree_init(c, id) + : bch2_gc_btree(c, id, initial); + if (ret) + return ret; +@@ -788,8 +784,7 @@ static int bch2_gc_start(struct bch_fs *c) + * move around - if references move backwards in the ordering GC + * uses, GC could skip past them + */ +-int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, +- bool initial) ++int bch2_gc(struct bch_fs *c, bool initial) + { + struct bch_dev *ca; + u64 start_time = local_clock(); +@@ -811,7 +806,7 @@ again: + + bch2_mark_superblocks(c); + +- ret = bch2_gc_btrees(c, journal_keys, initial); ++ ret = bch2_gc_btrees(c, initial); + if (ret) + goto out; + +@@ -1384,7 +1379,7 @@ static int bch2_gc_thread(void *arg) + * Full gc is currently incompatible with btree key cache: + */ + #if 0 +- ret = bch2_gc(c, NULL, false, false); ++ ret = bch2_gc(c, false, false); + #else + ret = bch2_gc_gens(c); + #endif +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index f0435a58793b..fa604efc70cc 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -6,8 +6,7 @@ + + void bch2_coalesce(struct bch_fs *); + +-struct journal_keys; +-int bch2_gc(struct bch_fs *, struct journal_keys *, bool); ++int bch2_gc(struct bch_fs *, bool); + int bch2_gc_gens(struct bch_fs *); + void bch2_gc_thread_stop(struct bch_fs *); + int bch2_gc_thread_start(struct bch_fs *); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 422f2fbe6dfb..88a1d47e6e4b 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -40,78 +40,169 @@ static void drop_alloc_keys(struct journal_keys *keys) + + /* iterate over keys read from the journal: */ + +-static struct journal_key *journal_key_search(struct journal_keys *journal_keys, +- enum btree_id id, unsigned level, +- struct bpos pos) ++static int __journal_key_cmp(enum btree_id l_btree_id, ++ unsigned l_level, ++ struct bpos l_pos, ++ struct journal_key *r) ++{ ++ return (cmp_int(l_btree_id, r->btree_id) ?: ++ cmp_int(l_level, r->level) ?: ++ bkey_cmp(l_pos, r->k->k.p)); ++} ++ ++static int journal_key_cmp(struct journal_key *l, struct journal_key *r) ++{ ++ return (cmp_int(l->btree_id, r->btree_id) ?: ++ cmp_int(l->level, r->level) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p)); ++} ++ ++static 
size_t journal_key_search(struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) + { + size_t l = 0, r = journal_keys->nr, m; + + while (l < r) { + m = l + ((r - l) >> 1); +- if ((cmp_int(id, journal_keys->d[m].btree_id) ?: +- cmp_int(level, journal_keys->d[m].level) ?: +- bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) ++ if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0) + l = m + 1; + else + r = m; + } + + BUG_ON(l < journal_keys->nr && +- (cmp_int(id, journal_keys->d[l].btree_id) ?: +- cmp_int(level, journal_keys->d[l].level) ?: +- bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); ++ __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0); + + BUG_ON(l && +- (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: +- cmp_int(level, journal_keys->d[l - 1].level) ?: +- bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); ++ __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0); + +- return l < journal_keys->nr ? journal_keys->d + l : NULL; ++ return l; ++} ++ ++static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) ++{ ++ struct bkey_i *n = iter->keys->d[idx].k; ++ struct btree_and_journal_iter *biter = ++ container_of(iter, struct btree_and_journal_iter, journal); ++ ++ if (iter->idx > idx || ++ (iter->idx == idx && ++ biter->last && ++ bkey_cmp(n->k.p, biter->unpacked.p) <= 0)) ++ iter->idx++; ++} ++ ++int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct journal_key n = { ++ .btree_id = id, ++ .level = level, ++ .k = k, ++ .allocated = true ++ }; ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_iter *iter; ++ unsigned idx = journal_key_search(keys, id, level, k->k.p); ++ ++ if (idx < keys->nr && ++ journal_key_cmp(&n, &keys->d[idx]) == 0) { ++ if (keys->d[idx].allocated) ++ kfree(keys->d[idx].k); ++ keys->d[idx] = n; ++ return 0; ++ } ++ ++ if (keys->nr == keys->size) { ++ struct journal_keys new_keys = { ++ .nr = keys->nr, ++ .size = keys->size * 2, ++ .journal_seq_base = keys->journal_seq_base, ++ }; ++ ++ new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); ++ if (!new_keys.d) ++ return -ENOMEM; ++ ++ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); ++ kvfree(keys->d); ++ *keys = new_keys; ++ } ++ ++ array_insert_item(keys->d, keys->nr, idx, n); ++ ++ list_for_each_entry(iter, &c->journal_iters, list) ++ journal_iter_fix(c, iter, idx); ++ ++ return 0; ++} ++ ++int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bpos pos) ++{ ++ struct bkey_i *whiteout = ++ kmalloc(sizeof(struct bkey), GFP_KERNEL); ++ int ret; ++ ++ if (!whiteout) ++ return -ENOMEM; ++ ++ bkey_init(&whiteout->k); ++ whiteout->k.p = pos; ++ ++ ret = bch2_journal_key_insert(c, id, level, whiteout); ++ if (ret) ++ kfree(whiteout); ++ return ret; + } + + static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) + { +- if (iter->k && +- iter->k < iter->keys->d + iter->keys->nr && +- iter->k->btree_id == iter->btree_id && +- iter->k->level == iter->level) +- return iter->k->k; ++ struct journal_key *k = iter->idx - iter->keys->nr ++ ? 
iter->keys->d + iter->idx : NULL; ++ ++ if (k && ++ k->btree_id == iter->btree_id && ++ k->level == iter->level) ++ return k->k; + +- iter->k = NULL; ++ iter->idx = iter->keys->nr; + return NULL; + } + + static void bch2_journal_iter_advance(struct journal_iter *iter) + { +- if (iter->k) +- iter->k++; ++ if (iter->idx < iter->keys->nr) ++ iter->idx++; ++} ++ ++static void bch2_journal_iter_exit(struct journal_iter *iter) ++{ ++ list_del(&iter->list); + } + +-static void bch2_journal_iter_init(struct journal_iter *iter, +- struct journal_keys *journal_keys, ++static void bch2_journal_iter_init(struct bch_fs *c, ++ struct journal_iter *iter, + enum btree_id id, unsigned level, + struct bpos pos) + { + iter->btree_id = id; + iter->level = level; +- iter->keys = journal_keys; +- iter->k = journal_key_search(journal_keys, id, level, pos); ++ iter->keys = &c->journal_keys; ++ iter->idx = journal_key_search(&c->journal_keys, id, level, pos); ++ list_add(&iter->list, &c->journal_iters); + } + + static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) + { +- return iter->btree +- ? bch2_btree_iter_peek(iter->btree) +- : bch2_btree_node_iter_peek_unpack(&iter->node_iter, +- iter->b, &iter->unpacked); ++ return bch2_btree_node_iter_peek_unpack(&iter->node_iter, ++ iter->b, &iter->unpacked); + } + + static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) + { +- if (iter->btree) +- bch2_btree_iter_next(iter->btree); +- else +- bch2_btree_node_iter_advance(&iter->node_iter, iter->b); ++ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); + } + + void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) +@@ -160,7 +251,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * + + if (iter->b && + bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { +- iter->journal.k = NULL; ++ iter->journal.idx = iter->journal.keys->nr; + iter->last = none; + return bkey_s_c_null; + } +@@ -181,26 +272,20 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter * + return bch2_btree_and_journal_iter_peek(iter); + } + +-void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, +- struct btree_trans *trans, +- struct journal_keys *journal_keys, +- enum btree_id id, struct bpos pos) ++void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) + { +- memset(iter, 0, sizeof(*iter)); +- +- iter->btree = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_PREFETCH); +- bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); ++ bch2_journal_iter_exit(&iter->journal); + } + + void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, +- struct journal_keys *journal_keys, ++ struct bch_fs *c, + struct btree *b) + { + memset(iter, 0, sizeof(*iter)); + + iter->b = b; + bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); +- bch2_journal_iter_init(&iter->journal, journal_keys, ++ bch2_journal_iter_init(c, &iter->journal, + b->c.btree_id, b->c.level, b->data->min_key); + } + +@@ -244,7 +329,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + int ret = 0; + + bch2_bkey_buf_init(&tmp); +- bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + ret = key_fn(c, btree_id, b->c.level, k); +@@ -277,6 +362,7 @@ static int bch2_btree_and_journal_walk_recurse(struct 
bch_fs *c, struct btree *b + } + } + ++ bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&tmp, c); + return ret; + } +@@ -333,6 +419,12 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) + + void bch2_journal_keys_free(struct journal_keys *keys) + { ++ struct journal_key *i; ++ ++ for (i = keys->d; i < keys->d + keys->nr; i++) ++ if (i->allocated) ++ kfree(i->k); ++ + kvfree(keys->d); + keys->d = NULL; + keys->nr = 0; +@@ -361,7 +453,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + nr_keys++; + } + +- keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); ++ keys.size = roundup_pow_of_two(nr_keys); ++ ++ keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL); + if (!keys.d) + goto err; + +@@ -545,14 +639,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + return ret; + } + +-static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, +- unsigned level, struct bkey_i *k) ++static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) + { +- return bch2_trans_do(c, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_JOURNAL_REPLAY, +- __bch2_journal_replay_key(&trans, id, level, k)); ++ unsigned commit_flags = BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW; ++ ++ if (!k->allocated) ++ commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; ++ ++ return bch2_trans_do(c, NULL, NULL, commit_flags, ++ __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k)); + } + + static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) +@@ -628,7 +724,7 @@ static int bch2_journal_replay(struct bch_fs *c, + + if (i->level) { + j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; +- ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ ret = bch2_journal_replay_key(c, i); + if (ret) + goto err; + } +@@ -658,7 +754,7 @@ static int bch2_journal_replay(struct bch_fs *c, + + ret = i->k->k.size + ? 
bch2_extent_replay_key(c, i->btree_id, i->k) +- : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ : bch2_journal_replay_key(c, i); + if (ret) + goto err; + } +@@ -1105,7 +1201,7 @@ use_clean: + test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + bch_info(c, "starting mark and sweep"); + err = "error in mark and sweep"; +- ret = bch2_gc(c, &c->journal_keys, true); ++ ret = bch2_gc(c, true); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index a66827c9addf..fa91851b9ed7 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -6,10 +6,11 @@ + for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) + + struct journal_iter { ++ struct list_head list; + enum btree_id btree_id; + unsigned level; ++ size_t idx; + struct journal_keys *keys; +- struct journal_key *k; + }; + + /* +@@ -17,8 +18,6 @@ struct journal_iter { + */ + + struct btree_and_journal_iter { +- struct btree_iter *btree; +- + struct btree *b; + struct btree_node_iter node_iter; + struct bkey unpacked; +@@ -32,16 +31,18 @@ struct btree_and_journal_iter { + } last; + }; + ++int bch2_journal_key_insert(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); ++int bch2_journal_key_delete(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++ + void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); + struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); + struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); + +-void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, +- struct btree_trans *, +- struct journal_keys *, +- enum btree_id, struct bpos); ++void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); + void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, +- struct journal_keys *, ++ struct bch_fs *, + struct btree *); + + typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 682f9febc705..054e995ae7bd 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -683,6 +683,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_blacklist_entries_gc); + + INIT_LIST_HEAD(&c->journal_entries); ++ INIT_LIST_HEAD(&c->journal_iters); + + INIT_LIST_HEAD(&c->fsck_errors); + mutex_init(&c->fsck_error_lock); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 4fc5777ecfb0..80964bdf6237 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -475,7 +475,7 @@ STORE(bch2_fs) + */ + #if 0 + down_read(&c->state_lock); +- bch2_gc(c, NULL, false, false); ++ bch2_gc(c, false, false); + up_read(&c->state_lock); + #else + bch2_gc_gens(c); +-- +cgit v1.2.3 + + +From fa9771d707c29f662738678759b1da04afe4e69f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Jan 2021 20:59:00 -0500 +Subject: bcachefs: Add (partial) support for fixing btree topology + +When we walk the btrees during recovery, part of that is checking that +btree topology is correct: for every interior btree node, its child +nodes should exactly span the range the parent node covers. + +Previously, we had checks for this, but not repair code. Now that we +have the ability to do btree updates during initial GC, this patch adds +that repair code. 
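+
+The invariant being checked is simple to state with plain integer keys: the
+first child must start at the parent's min_key, each later child must start
+at the successor of the previous child's max_key, and the last child must
+end exactly at the parent's max_key. The sketch below is an editor's
+illustration under those simplified assumptions (u64 keys, an in-memory
+array of children, report-only instead of repair); node and check_topology
+are made-up names, not the bcachefs types.
+
+	#include <stdio.h>
+	#include <stdint.h>
+	#include <stdbool.h>
+	#include <stddef.h>
+
+	struct node {
+		uint64_t min_key, max_key; /* inclusive key range covered by the node */
+	};
+
+	/* Children must tile [parent->min_key, parent->max_key]: no gaps, no overlaps. */
+	static bool check_topology(const struct node *parent,
+				   const struct node *child, size_t nr)
+	{
+		uint64_t expected_start = parent->min_key;
+		bool good = true;
+
+		for (size_t i = 0; i < nr; i++) {
+			if (child[i].min_key != expected_start) {
+				printf("child %zu starts at %llu, expected %llu\n",
+				       i, (unsigned long long) child[i].min_key,
+				       (unsigned long long) expected_start);
+				good = false;
+			}
+			expected_start = child[i].max_key + 1; /* successor of child's max */
+		}
+
+		if (nr && child[nr - 1].max_key != parent->max_key) {
+			printf("last child ends at %llu, expected %llu\n",
+			       (unsigned long long) child[nr - 1].max_key,
+			       (unsigned long long) parent->max_key);
+			good = false;
+		}
+
+		return good;
+	}
+
+	int main(void)
+	{
+		struct node parent = { 0, 99 };
+		struct node kids[] = { { 0, 39 }, { 41, 99 } }; /* key 40 belongs to no child */
+
+		return check_topology(&parent, kids, 2) ? 0 : 1;
+	}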
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 +- + fs/bcachefs/btree_cache.c | 43 +++++++++++--- + fs/bcachefs/btree_cache.h | 2 +- + fs/bcachefs/btree_gc.c | 146 ++++++++++++++++++++++++++++++++++++---------- + fs/bcachefs/recovery.c | 6 +- + 5 files changed, 156 insertions(+), 44 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index a6a2af547c95..0aab736c4494 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -510,7 +510,8 @@ enum { + BCH_FS_ERRORS_FIXED, + + /* misc: */ +- BCH_FS_FIXED_GENS, ++ BCH_FS_NEED_ANOTHER_GC, ++ BCH_FS_DELETED_NODES, + BCH_FS_NEED_ALLOC_WRITE, + BCH_FS_REBUILD_REPLICAS, + BCH_FS_HOLD_BTREE_WRITES, +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index bebf9fb01fe1..4fa3f80a805e 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -7,6 +7,7 @@ + #include "btree_iter.h" + #include "btree_locking.h" + #include "debug.h" ++#include "error.h" + + #include + #include +@@ -812,9 +813,12 @@ lock_node: + return ERR_PTR(-EIO); + } + +- EBUG_ON(b->c.btree_id != iter->btree_id || +- BTREE_NODE_LEVEL(b->data) != level || +- bkey_cmp(b->data->max_key, k->k.p)); ++ EBUG_ON(b->c.btree_id != iter->btree_id); ++ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); ++ EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); ++ EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ bkey_cmp(b->data->min_key, ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); + + return b; + } +@@ -822,7 +826,8 @@ lock_node: + struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, + const struct bkey_i *k, + enum btree_id btree_id, +- unsigned level) ++ unsigned level, ++ bool nofill) + { + struct btree_cache *bc = &c->btree_cache; + struct btree *b; +@@ -837,6 +842,9 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, + retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { ++ if (nofill) ++ return NULL; ++ + b = bch2_btree_node_fill(c, NULL, k, btree_id, + level, SIX_LOCK_read, true); + +@@ -883,9 +891,12 @@ lock_node: + return ERR_PTR(-EIO); + } + +- EBUG_ON(b->c.btree_id != btree_id || +- BTREE_NODE_LEVEL(b->data) != level || +- bkey_cmp(b->data->max_key, k->k.p)); ++ EBUG_ON(b->c.btree_id != btree_id); ++ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); ++ EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); ++ EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ bkey_cmp(b->data->min_key, ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); + + return b; + } +@@ -995,8 +1006,22 @@ out: + if (sib != btree_prev_sib) + swap(n1, n2); + +- BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), +- n2->data->min_key)); ++ if (bkey_cmp(bkey_successor(n1->key.k.p), ++ n2->data->min_key)) { ++ char buf1[200], buf2[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key)); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key)); ++ ++ bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n" ++ "prev: %s\n" ++ "next: %s\n", ++ bch2_btree_ids[iter->btree_id], level, ++ buf1, buf2); ++ ++ six_unlock_intent(&ret->c.lock); ++ ret = NULL; ++ } + } + + bch2_btree_trans_verify_locks(trans); +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 0eeca0bcc48e..5fffae92effb 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, + enum six_lock_type, unsigned long); + + struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, +- enum 
btree_id, unsigned); ++ enum btree_id, unsigned, bool); + + struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, + struct btree *, enum btree_node_sibling); +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 69529f68e355..0c4e07e8cf51 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -50,6 +50,10 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) + __gc_pos_set(c, new_pos); + } + ++/* ++ * Missing: if an interior btree node is empty, we need to do something - ++ * perhaps just kill it ++ */ + static int bch2_gc_check_topology(struct bch_fs *c, + struct btree *b, + struct bkey_buf *prev, +@@ -62,6 +66,8 @@ static int bch2_gc_check_topology(struct bch_fs *c, + ? node_start + : bkey_successor(prev->k->k.p); + char buf1[200], buf2[200]; ++ bool update_min = false; ++ bool update_max = false; + int ret = 0; + + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { +@@ -75,22 +81,79 @@ static int bch2_gc_check_topology(struct bch_fs *c, + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); + + if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, +- "btree node with incorrect min_key:\n prev %s\n cur %s", ++ "btree node with incorrect min_key at btree %s level %u:\n" ++ " prev %s\n" ++ " cur %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, +- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { +- BUG(); +- } ++ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) ++ update_min = true; + } + + if (fsck_err_on(is_last && + bkey_cmp(cur.k->k.p, node_end), c, +- "btree node with incorrect max_key:\n %s\n expected %s", ++ "btree node with incorrect max_key at btree %s level %u:\n" ++ " %s\n" ++ " expected %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, + (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), +- (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { +- BUG(); +- } ++ (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) ++ update_max = true; + + bch2_bkey_buf_copy(prev, c, cur.k); ++ ++ if (update_min || update_max) { ++ struct bkey_i *new; ++ struct bkey_i_btree_ptr_v2 *bp = NULL; ++ struct btree *n; ++ ++ if (update_max) { ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, cur.k->k.p); ++ if (ret) ++ return ret; ++ } ++ ++ new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ bkey_copy(new, cur.k); ++ ++ if (new->k.type == KEY_TYPE_btree_ptr_v2) ++ bp = bkey_i_to_btree_ptr_v2(new); ++ ++ if (update_min) ++ bp->v.min_key = expected_start; ++ if (update_max) ++ new->k.p = node_end; ++ if (bp) ++ SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true); ++ ++ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new); ++ if (ret) { ++ kfree(new); ++ return ret; ++ } ++ ++ n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id, ++ b->c.level - 1, true); ++ if (n) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, n); ++ ++ bkey_copy(&n->key, new); ++ if (update_min) ++ n->data->min_key = expected_start; ++ if (update_max) ++ n->data->max_key = node_end; ++ ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, n); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ six_unlock_read(&n->c.lock); ++ } ++ } + fsck_err: + return ret; + } +@@ -147,12 +210,13 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k.k, ptr)], + ptr->gen, g->mark.gen)) { ++ /* XXX if it's a cached 
ptr, drop it */ + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->gen_valid = g->gen_valid = true; + g2->_mark.data_type = 0; + g2->_mark.dirty_sectors = 0; + g2->_mark.cached_sectors = 0; +- set_bit(BCH_FS_FIXED_GENS, &c->flags); ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } + } +@@ -298,8 +362,6 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + break; + + if (b->c.level) { +- struct btree *child; +- + bch2_bkey_buf_reassemble(&cur, c, k); + k = bkey_i_to_s_c(cur.k); + +@@ -310,26 +372,49 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + !bch2_btree_and_journal_iter_peek(&iter).k); + if (ret) + break; ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ } + +- if (b->c.level > target_depth) { +- child = bch2_btree_node_get_noiter(c, cur.k, +- b->c.btree_id, b->c.level - 1); +- ret = PTR_ERR_OR_ZERO(child); +- if (ret) +- break; ++ if (b->c.level > target_depth) { ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + +- ret = bch2_gc_btree_init_recurse(c, child, +- target_depth); +- six_unlock_read(&child->c.lock); ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ struct btree *child; ++ ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ bch2_btree_and_journal_iter_advance(&iter); + ++ child = bch2_btree_node_get_noiter(c, cur.k, ++ b->c.btree_id, b->c.level - 1, ++ false); ++ ret = PTR_ERR_OR_ZERO(child); ++ ++ if (fsck_err_on(ret == -EIO, c, ++ "unreadable btree node")) { ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, cur.k->k.p); + if (ret) +- break; ++ return ret; ++ ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ continue; + } +- } else { +- bch2_btree_and_journal_iter_advance(&iter); ++ ++ if (ret) ++ break; ++ ++ ret = bch2_gc_btree_init_recurse(c, child, ++ target_depth); ++ six_unlock_read(&child->c.lock); ++ ++ if (ret) ++ break; + } + } +- ++fsck_err: + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + bch2_btree_and_journal_iter_exit(&iter); +@@ -816,16 +901,15 @@ again: + bch2_mark_allocator_buckets(c); + + c->gc_count++; +-out: +- if (!ret && +- (test_bit(BCH_FS_FIXED_GENS, &c->flags) || +- (!iter && bch2_test_restart_gc))) { ++ ++ if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || ++ (!iter && bch2_test_restart_gc)) { + /* + * XXX: make sure gens we fixed got saved + */ + if (iter++ <= 2) { +- bch_info(c, "Fixed gens, restarting mark and sweep:"); +- clear_bit(BCH_FS_FIXED_GENS, &c->flags); ++ bch_info(c, "Second GC pass needed, restarting:"); ++ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + + percpu_down_write(&c->mark_lock); +@@ -840,7 +924,7 @@ out: + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; + } +- ++out: + if (!ret) { + bch2_journal_block(&c->journal); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 88a1d47e6e4b..f470e0e233ce 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -342,7 +342,8 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + bch2_btree_and_journal_iter_advance(&iter); + + child = bch2_btree_node_get_noiter(c, tmp.k, +- b->c.btree_id, b->c.level - 1); ++ b->c.btree_id, b->c.level - 1, ++ false); + + ret = PTR_ERR_OR_ZERO(child); + if (ret) +@@ -766,7 +767,8 @@ static int bch2_journal_replay(struct bch_fs *c, + bch2_journal_flush_all_pins(j); + return bch2_journal_error(j); + err: +- 
bch_err(c, "journal replay: error %d while replaying key", ret); ++ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", ++ ret, bch2_btree_ids[i->btree_id], i->level); + return ret; + } + +-- +cgit v1.2.3 + + +From 47d9ae6e44435da6bd575bc041d966b417e0d07b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 27 Jan 2021 19:08:54 -0500 +Subject: bcachefs: Repair bad data pointers + +Now that we can repair metadata during GC, we can handle bad pointers +that would trigger errors being marked, when they need to just be +dropped. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 138 ++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 102 insertions(+), 36 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 0c4e07e8cf51..bab5ebd37f04 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -158,9 +158,101 @@ fsck_err: + return ret; + } + ++static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, bool is_root, ++ struct bkey_s_c *k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); ++ const struct bch_extent_ptr *ptr; ++ bool do_update = false; ++ int ret = 0; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g2 = PTR_BUCKET(ca, ptr, false); ++ ++ if (fsck_err_on(!g->gen_valid, c, ++ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k->k, ptr)], ++ ptr->gen)) { ++ if (!ptr->cached) { ++ g2->_mark.gen = g->_mark.gen = ptr->gen; ++ g2->gen_valid = g->gen_valid = true; ++ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ } else { ++ do_update = true; ++ } ++ } ++ ++ if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, ++ "bucket %u:%zu data type %s ptr gen in the future: %u > %u", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k->k, ptr)], ++ ptr->gen, g->mark.gen)) { ++ if (!ptr->cached) { ++ g2->_mark.gen = g->_mark.gen = ptr->gen; ++ g2->gen_valid = g->gen_valid = true; ++ g2->_mark.data_type = 0; ++ g2->_mark.dirty_sectors = 0; ++ g2->_mark.cached_sectors = 0; ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ } else { ++ do_update = true; ++ } ++ } ++ ++ if (fsck_err_on(!ptr->cached && ++ gen_cmp(ptr->gen, g->mark.gen) < 0, c, ++ "bucket %u:%zu data type %s stale dirty ptr: %u < %u", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k->k, ptr)], ++ ptr->gen, g->mark.gen)) ++ do_update = true; ++ } ++ ++ if (do_update) { ++ struct bch_extent_ptr *ptr; ++ struct bkey_i *new; ++ ++ if (is_root) { ++ bch_err(c, "cannot update btree roots yet"); ++ return -EINVAL; ++ } ++ ++ new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ bkey_reassemble(new, *k); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ ++ (ptr->cached && ++ (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || ++ (!ptr->cached && ++ gen_cmp(ptr->gen, g->mark.gen) < 0); ++ })); ++ ++ ret = bch2_journal_key_insert(c, btree_id, level, new); ++ if (ret) ++ kfree(new); ++ else ++ *k = bkey_i_to_s_c(new); ++ } ++fsck_err: ++ return ret; ++} ++ + /* marking of btree keys/nodes: */ + +-static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, ++static int 
bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, bool is_root, ++ struct bkey_s_c k, + u8 *max_stale, bool initial) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +@@ -174,7 +266,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + BUG_ON(bch2_journal_seq_verify && + k.k->version.lo > journal_cur_seq(&c->journal)); + +- /* XXX change to fsck check */ + if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, + "key version number higher than recorded: %llu > %llu", + k.k->version.lo, +@@ -190,36 +281,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + return ret; + } + +- bkey_for_each_ptr(ptrs, ptr) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, true); +- struct bucket *g2 = PTR_BUCKET(ca, ptr, false); +- +- if (mustfix_fsck_err_on(!g->gen_valid, c, +- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", +- ptr->dev, PTR_BUCKET_NR(ca, ptr), +- bch2_data_types[ptr_data_type(k.k, ptr)], +- ptr->gen)) { +- g2->_mark.gen = g->_mark.gen = ptr->gen; +- g2->gen_valid = g->gen_valid = true; +- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); +- } +- +- if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, +- "bucket %u:%zu data type %s ptr gen in the future: %u > %u", +- ptr->dev, PTR_BUCKET_NR(ca, ptr), +- bch2_data_types[ptr_data_type(k.k, ptr)], +- ptr->gen, g->mark.gen)) { +- /* XXX if it's a cached ptr, drop it */ +- g2->_mark.gen = g->_mark.gen = ptr->gen; +- g2->gen_valid = g->gen_valid = true; +- g2->_mark.data_type = 0; +- g2->_mark.dirty_sectors = 0; +- g2->_mark.cached_sectors = 0; +- set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); +- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); +- } +- } ++ ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k); + } + + bkey_for_each_ptr(ptrs, ptr) { +@@ -259,7 +321,8 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { + bch2_bkey_debugcheck(c, b, k); + +- ret = bch2_gc_mark_key(c, k, max_stale, initial); ++ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, ++ k, max_stale, initial); + if (ret) + break; + +@@ -329,7 +392,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + mutex_lock(&c->btree_root_lock); + b = c->btree_roots[btree_id].b; + if (!btree_node_fake(b)) +- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), ++ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, ++ bkey_i_to_s_c(&b->key), + &max_stale, initial); + gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); + mutex_unlock(&c->btree_root_lock); +@@ -357,7 +421,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); + +- ret = bch2_gc_mark_key(c, k, &max_stale, true); ++ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, ++ k, &max_stale, true); + if (ret) + break; + +@@ -455,7 +520,8 @@ static int bch2_gc_btree_init(struct bch_fs *c, + ret = bch2_gc_btree_init_recurse(c, b, target_depth); + + if (!ret) +- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), ++ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, ++ bkey_i_to_s_c(&b->key), + &max_stale, true); + fsck_err: + six_unlock_read(&b->c.lock); +-- +cgit v1.2.3 + + +From ae6cc33010a8bde86e0d5fd584be39ce4e583cc3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 29 Jan 2021 15:37:28 -0500 +Subject: 
bcachefs: Add an option for metadata_target + +Also, make journal writes obey foreground_target and metadata_target. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 1 + + fs/bcachefs/btree_update_interior.c | 5 ++++- + fs/bcachefs/journal_io.c | 15 +++++++++++++-- + fs/bcachefs/opts.h | 5 +++++ + 4 files changed, 23 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 6dc150cbf2af..a8fc9f8d3702 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1305,6 +1305,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, + LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); + + LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); ++LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); + + /* + * Features: +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 8919ea628138..dd1b8f6ef9b0 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -222,7 +222,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + mutex_unlock(&c->btree_reserve_cache_lock); + + retry: +- wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, ++ wp = bch2_alloc_sectors_start(c, ++ c->opts.metadata_target ?: ++ c->opts.foreground_target, ++ 0, + writepoint_ptr(&c->btree_write_point), + &devs_have, + res->nr_replicas, +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index eacc9b2c362f..85d418a311f6 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -5,6 +5,7 @@ + #include "btree_update_interior.h" + #include "buckets.h" + #include "checksum.h" ++#include "disk_groups.h" + #include "error.h" + #include "io.h" + #include "journal.h" +@@ -1032,16 +1033,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned sectors) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_devs_mask devs; + struct journal_device *ja; + struct bch_dev *ca; + struct dev_alloc_list devs_sorted; ++ unsigned target = c->opts.metadata_target ?: ++ c->opts.foreground_target; + unsigned i, replicas = 0, replicas_want = + READ_ONCE(c->opts.metadata_replicas); + + rcu_read_lock(); ++retry: ++ devs = target_rw_devs(c, BCH_DATA_journal, target); + +- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, +- &c->rw_devs[BCH_DATA_journal]); ++ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); + + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); +@@ -1073,6 +1078,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, + + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); ++ ++ if (replicas < replicas_want && target) { ++ /* Retry from all devices: */ ++ target = 0; ++ goto retry; ++ } + done: + rcu_read_unlock(); + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 710a7ee67039..d835a85338c6 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -136,6 +136,11 @@ enum opt_type { + OPT_STR(bch2_str_hash_types), \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ + NULL, "Hash function for directory entries and xattrs")\ ++ x(metadata_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_METADATA_TARGET, 0, \ ++ "(target)", "Device or disk group for metadata writes") \ + x(foreground_target, u16, \ + 
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ +-- +cgit v1.2.3 + + +From 5dc9f9c276d9f49b4b7c2d3ff24c5122c3585112 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 29 Jan 2021 13:58:10 -0500 +Subject: bcachefs: Add an assertion to check for journal writes to same + location + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/journal_io.c | 3 +++ + 2 files changed, 4 insertions(+) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 0aab736c4494..a65fade3a6c7 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -474,6 +474,7 @@ struct bch_dev { + atomic64_t rebalance_work; + + struct journal_device journal; ++ u64 prev_journal_sector; + + struct work_struct io_error_work; + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 85d418a311f6..a4e9417e6e64 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1289,6 +1289,9 @@ static void do_journal_write(struct closure *cl) + bio->bi_private = ca; + bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; + ++ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); ++ ca->prev_journal_sector = bio->bi_iter.bi_sector; ++ + if (!JSET_NO_FLUSH(w->data)) + bio->bi_opf |= REQ_FUA; + if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) +-- +cgit v1.2.3 + + +From d43a1add007c53700a2ddce5e232c51c18f42099 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 2 Feb 2021 15:56:44 -0500 +Subject: bcachefs: Add missing call to bch2_replicas_entry_sort() + +This fixes a bug introduced by "bcachefs: Improve diagnostics when +journal entries are missing" - devices in a replicas entry are supposed +to be sorted. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 2 ++ + fs/bcachefs/replicas.c | 12 ++++++------ + fs/bcachefs/replicas.h | 1 + + 3 files changed, 9 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index a4e9417e6e64..3c9ad5e98ebd 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -938,6 +938,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + ++ bch2_replicas_entry_sort(&replicas.e); ++ + /* + * If we're mounting in degraded mode - if we didn't read all + * the devices - this is wrong: +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index ce8b7355b349..3970c442f199 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e) + #endif + } + +-static void replicas_entry_sort(struct bch_replicas_entry *e) ++void bch2_replicas_entry_sort(struct bch_replicas_entry *e) + { + bubble_sort(e->devs, e->nr_devs, u8_cmp); + } +@@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, + break; + } + +- replicas_entry_sort(e); ++ bch2_replicas_entry_sort(e); + } + + void bch2_devlist_to_replicas(struct bch_replicas_entry *e, +@@ -142,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, + for (i = 0; i < devs.nr; i++) + e->devs[e->nr_devs++] = devs.devs[i]; + +- replicas_entry_sort(e); ++ bch2_replicas_entry_sort(e); + } + + static struct bch_replicas_cpu +@@ -197,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, + int bch2_replicas_entry_idx(struct bch_fs *c, + struct bch_replicas_entry *search) + { +- replicas_entry_sort(search); ++ 
bch2_replicas_entry_sort(search); + + return __replicas_entry_idx(&c->replicas, search); + } +@@ -681,7 +681,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, + for_each_replicas_entry(sb_r, e) { + dst = cpu_replicas_entry(cpu_r, idx++); + memcpy(dst, e, replicas_entry_bytes(e)); +- replicas_entry_sort(dst); ++ bch2_replicas_entry_sort(dst); + } + + return 0; +@@ -718,7 +718,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, + dst->nr_devs = e->nr_devs; + dst->nr_required = 1; + memcpy(dst->devs, e->devs, e->nr_devs); +- replicas_entry_sort(dst); ++ bch2_replicas_entry_sort(dst); + } + + return 0; +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +index 8b95164fbb56..a16ef23bde8a 100644 +--- a/fs/bcachefs/replicas.h ++++ b/fs/bcachefs/replicas.h +@@ -5,6 +5,7 @@ + #include "eytzinger.h" + #include "replicas_types.h" + ++void bch2_replicas_entry_sort(struct bch_replicas_entry *); + void bch2_replicas_entry_to_text(struct printbuf *, + struct bch_replicas_entry *); + void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); +-- +cgit v1.2.3 + + +From 9350d3ee2d2e8dfdabcafd4a42e30a73af082648 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 22 Jan 2021 18:01:07 -0500 +Subject: bcachefs: KEY_TYPE_alloc_v2 + +This introduces a new version of KEY_TYPE_alloc, which uses the new +varint encoding introduced for inodes. This means we'll eventually be +able to support much larger bucket sizes (for SMR devices), and the +read/write time fields are expanded to 64 bits - which will be used in +the next patch to get rid of the periodic rescaling of those fields. + +Also, for buckets that are members of erasure coded stripes, this adds +persistent fields for the index of the stripe they're members of and the +stripe redundancy. This is part of work to get rid of having to scan and +read into memory the alloc and stripes btrees at mount time. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 247 ++++++++++++++++++++++++------------ + fs/bcachefs/alloc_background.h | 48 +++++-- + fs/bcachefs/bcachefs_format.h | 56 +++++---- + fs/bcachefs/bkey.h | 1 + + fs/bcachefs/buckets.c | 280 +++++++++++++++++++---------------------- + fs/bcachefs/buckets_types.h | 3 +- + fs/bcachefs/ec.c | 35 ++++-- + fs/bcachefs/extents.c | 21 +--- + fs/bcachefs/movinggc.c | 11 +- + 9 files changed, 403 insertions(+), 299 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index a198c64f3412..1501af285c18 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -14,6 +14,7 @@ + #include "ec.h" + #include "error.h" + #include "recovery.h" ++#include "varint.h" + + #include + #include +@@ -24,11 +25,10 @@ + #include + #include + +-static const char * const bch2_alloc_field_names[] = { +-#define x(name, bytes) #name, +- BCH_ALLOC_FIELDS() ++static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { ++#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, ++ BCH_ALLOC_FIELDS_V1() + #undef x +- NULL + }; + + static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); +@@ -67,10 +67,10 @@ static void pd_controllers_update(struct work_struct *work) + + /* Persistent alloc info: */ + +-static inline u64 get_alloc_field(const struct bch_alloc *a, +- const void **p, unsigned field) ++static inline u64 alloc_field_v1_get(const struct bch_alloc *a, ++ const void **p, unsigned field) + { +- unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; ++ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; + u64 v; + + if (!(a->fields & (1 << field))) +@@ -97,10 +97,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a, + return v; + } + +-static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, +- unsigned field, u64 v) ++static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, ++ unsigned field, u64 v) + { +- unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; ++ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; + + if (!v) + return; +@@ -127,55 +127,149 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, + *p += bytes; + } + +-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, ++ struct bkey_s_c k) + { +- struct bkey_alloc_unpacked ret = { .gen = 0 }; ++ const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; ++ const void *d = in->data; ++ unsigned idx = 0; + +- if (k.k->type == KEY_TYPE_alloc) { +- const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; +- const void *d = a->data; +- unsigned idx = 0; ++ out->gen = in->gen; ++ ++#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); ++ BCH_ALLOC_FIELDS_V1() ++#undef x ++} ++ ++static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst, ++ const struct bkey_alloc_unpacked src) ++{ ++ struct bkey_i_alloc *a = bkey_alloc_init(&dst->k); ++ void *d = a->v.data; ++ unsigned bytes, idx = 0; + +- ret.gen = a->gen; ++ a->k.p = POS(src.dev, src.bucket); ++ a->v.fields = 0; ++ a->v.gen = src.gen; + +-#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); +- BCH_ALLOC_FIELDS() ++#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name); ++ BCH_ALLOC_FIELDS_V1() + #undef x +- } +- return ret; ++ bytes = (void *) d - (void *) &a->v; ++ set_bkey_val_bytes(&a->k, bytes); ++ memset_u64s_tail(&a->v, 0, bytes); + } + +-void bch2_alloc_pack(struct bkey_i_alloc *dst, +- const 
struct bkey_alloc_unpacked src) ++static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, ++ struct bkey_s_c k) + { +- unsigned idx = 0; +- void *d = dst->v.data; ++ struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); ++ const u8 *in = a.v->data; ++ const u8 *end = bkey_val_end(a); ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v; ++ ++ out->gen = a.v->gen; ++ out->oldest_gen = a.v->oldest_gen; ++ out->data_type = a.v->data_type; ++ ++#define x(_name, _bits) \ ++ if (fieldnr < a.v->nr_fields) { \ ++ ret = bch2_varint_decode(in, end, &v); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v = 0; \ ++ } \ ++ out->_name = v; \ ++ if (v != out->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++ return 0; ++} ++ ++static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, ++ const struct bkey_alloc_unpacked src) ++{ ++ struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k); ++ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; ++ u8 *out = a->v.data; ++ u8 *end = (void *) &dst[1]; ++ u8 *last_nonzero_field = out; + unsigned bytes; + +- dst->v.fields = 0; +- dst->v.gen = src.gen; ++ a->k.p = POS(src.dev, src.bucket); ++ a->v.gen = src.gen; ++ a->v.oldest_gen = src.oldest_gen; ++ a->v.data_type = src.data_type; ++ ++#define x(_name, _bits) \ ++ nr_fields++; \ ++ \ ++ if (src._name) { \ ++ out += bch2_varint_encode(out, src._name); \ ++ \ ++ last_nonzero_field = out; \ ++ last_nonzero_fieldnr = nr_fields; \ ++ } else { \ ++ *out++ = 0; \ ++ } + +-#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); +- BCH_ALLOC_FIELDS() ++ BCH_ALLOC_FIELDS_V2() + #undef x ++ BUG_ON(out > end); ++ ++ out = last_nonzero_field; ++ a->v.nr_fields = last_nonzero_fieldnr; ++ ++ bytes = (u8 *) out - (u8 *) &a->v; ++ set_bkey_val_bytes(&a->k, bytes); ++ memset_u64s_tail(&a->v, 0, bytes); ++} ++ ++struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++{ ++ struct bkey_alloc_unpacked ret = { ++ .dev = k.k->p.inode, ++ .bucket = k.k->p.offset, ++ .gen = 0, ++ }; + +- bytes = (void *) d - (void *) &dst->v; +- set_bkey_val_bytes(&dst->k, bytes); +- memset_u64s_tail(&dst->v, 0, bytes); ++ if (k.k->type == KEY_TYPE_alloc_v2) ++ bch2_alloc_unpack_v2(&ret, k); ++ else if (k.k->type == KEY_TYPE_alloc) ++ bch2_alloc_unpack_v1(&ret, k); ++ ++ return ret; ++} ++ ++void bch2_alloc_pack(struct bch_fs *c, ++ struct bkey_alloc_buf *dst, ++ const struct bkey_alloc_unpacked src) ++{ ++ if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2)) ++ bch2_alloc_pack_v2(dst, src); ++ else ++ bch2_alloc_pack_v1(dst, src); + } + + static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) + { + unsigned i, bytes = offsetof(struct bch_alloc, data); + +- for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) ++ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) + if (a->fields & (1 << i)) +- bytes += BCH_ALLOC_FIELD_BYTES[i]; ++ bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; + + return DIV_ROUND_UP(bytes, sizeof(u64)); + } + +-const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) ++const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) + { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + +@@ -190,20 +284,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) + return NULL; + } + +-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, +- struct bkey_s_c k) ++const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) + { +- struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); +- 
const void *d = a.v->data; +- unsigned i; ++ struct bkey_alloc_unpacked u; ++ ++ if (k.k->p.inode >= c->sb.nr_devices || ++ !c->devs[k.k->p.inode]) ++ return "invalid device"; + +- pr_buf(out, "gen %u", a.v->gen); ++ if (bch2_alloc_unpack_v2(&u, k)) ++ return "unpack error"; + +- for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) +- if (a.v->fields & (1 << i)) +- pr_buf(out, " %s %llu", +- bch2_alloc_field_names[i], +- get_alloc_field(a.v, &d, i)); ++ return NULL; ++} ++ ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); ++ ++ pr_buf(out, "gen %u oldest_gen %u data_type %u", ++ u.gen, u.oldest_gen, u.data_type); ++#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name); ++ BCH_ALLOC_FIELDS_V2() ++#undef x + } + + static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, +@@ -213,7 +317,9 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, + struct bucket *g; + struct bkey_alloc_unpacked u; + +- if (level || k.k->type != KEY_TYPE_alloc) ++ if (level || ++ (k.k->type != KEY_TYPE_alloc && ++ k.k->type != KEY_TYPE_alloc_v2)) + return 0; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); +@@ -281,8 +387,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + struct bucket *g; + struct bucket_mark m; + struct bkey_alloc_unpacked old_u, new_u; +- __BKEY_PADDED(k, 8) alloc_key; /* hack: */ +- struct bkey_i_alloc *a; ++ struct bkey_alloc_buf a; + int ret; + retry: + bch2_trans_begin(trans); +@@ -303,17 +408,14 @@ retry: + ca = bch_dev_bkey_exists(c, iter->pos.inode); + g = bucket(ca, iter->pos.offset); + m = READ_ONCE(g->mark); +- new_u = alloc_mem_to_key(g, m); ++ new_u = alloc_mem_to_key(iter, g, m); + percpu_up_read(&c->mark_lock); + + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + return 0; + +- a = bkey_alloc_init(&alloc_key.k); +- a->k.p = iter->pos; +- bch2_alloc_pack(a, new_u); +- +- bch2_trans_update(trans, iter, &a->k_i, ++ bch2_alloc_pack(c, &a, new_u); ++ bch2_trans_update(trans, iter, &a.k, + BTREE_TRIGGER_NORUN); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags); +@@ -473,9 +575,9 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + struct btree_iter *iter; + struct bucket *g; +- struct bkey_i_alloc *a; ++ struct bkey_alloc_buf *a; + struct bkey_alloc_unpacked u; +- u16 *time; ++ u64 *time; + int ret = 0; + + iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), +@@ -486,28 +588,24 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + if (ret) + goto out; + +- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ++ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + + percpu_down_read(&c->mark_lock); + g = bucket(ca, bucket_nr); +- u = alloc_mem_to_key(g, READ_ONCE(g->mark)); ++ u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); + percpu_up_read(&c->mark_lock); + +- bkey_alloc_init(&a->k_i); +- a->k.p = iter->pos; +- + time = rw == READ ? 
&u.read_time : &u.write_time; + if (*time == c->bucket_clock[rw].hand) + goto out; + + *time = c->bucket_clock[rw].hand; + +- bch2_alloc_pack(a, u); +- +- ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?: ++ bch2_alloc_pack(c, a, u); ++ ret = bch2_trans_update(trans, iter, &a->k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + out: + bch2_trans_iter_put(trans, iter); +@@ -863,14 +961,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + struct btree_iter *iter, + u64 *journal_seq, unsigned flags) + { +-#if 0 +- __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; +-#else +- /* hack: */ +- __BKEY_PADDED(k, 8) alloc_key; +-#endif + struct bch_fs *c = trans->c; +- struct bkey_i_alloc *a; ++ struct bkey_alloc_buf a; + struct bkey_alloc_unpacked u; + struct bucket *g; + struct bucket_mark m; +@@ -920,8 +1012,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + goto out; + } + +- BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); +- + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); + retry: + ret = bch2_btree_iter_traverse(iter); +@@ -931,7 +1021,7 @@ retry: + percpu_down_read(&c->mark_lock); + g = bucket(ca, iter->pos.offset); + m = READ_ONCE(g->mark); +- u = alloc_mem_to_key(g, m); ++ u = alloc_mem_to_key(iter, g, m); + + percpu_up_read(&c->mark_lock); + +@@ -944,11 +1034,8 @@ retry: + u.read_time = c->bucket_clock[READ].hand; + u.write_time = c->bucket_clock[WRITE].hand; + +- a = bkey_alloc_init(&alloc_key.k); +- a->k.p = iter->pos; +- bch2_alloc_pack(a, u); +- +- bch2_trans_update(trans, iter, &a->k_i, ++ bch2_alloc_pack(c, &a, u); ++ bch2_trans_update(trans, iter, &a.k, + BTREE_TRIGGER_BUCKET_INVALIDATE); + + /* +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index f60fcebff2ce..6fededcd9f86 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -7,12 +7,33 @@ + #include "debug.h" + + struct bkey_alloc_unpacked { ++ u64 bucket; ++ u8 dev; + u8 gen; ++ u8 oldest_gen; ++ u8 data_type; + #define x(_name, _bits) u##_bits _name; +- BCH_ALLOC_FIELDS() ++ BCH_ALLOC_FIELDS_V2() + #undef x + }; + ++struct bkey_alloc_buf { ++ struct bkey_i k; ++ ++ union { ++ struct { ++#define x(_name, _bits) + _bits / 8 ++ u8 _pad[8 + BCH_ALLOC_FIELDS_V1()]; ++#undef x ++ } _v1; ++ struct { ++#define x(_name, _bits) + 8 + _bits / 8 ++ u8 _pad[8 + BCH_ALLOC_FIELDS_V2()]; ++#undef x ++ } _v2; ++ }; ++} __attribute__((packed, aligned(8))); ++ + /* How out of date a pointer gen is allowed to be: */ + #define BUCKET_GC_GEN_MAX 96U + +@@ -20,23 +41,28 @@ struct bkey_alloc_unpacked { + static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, + struct bkey_alloc_unpacked r) + { +- return l.gen != r.gen +-#define x(_name, _bits) || l._name != r._name +- BCH_ALLOC_FIELDS() ++ return l.gen != r.gen || ++ l.oldest_gen != r.oldest_gen || ++ l.data_type != r.data_type ++#define x(_name, ...) 
|| l._name != r._name ++ BCH_ALLOC_FIELDS_V2() + #undef x + ; + } + + struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); +-void bch2_alloc_pack(struct bkey_i_alloc *, ++void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *, + const struct bkey_alloc_unpacked); + + int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + + static inline struct bkey_alloc_unpacked +-alloc_mem_to_key(struct bucket *g, struct bucket_mark m) ++alloc_mem_to_key(struct btree_iter *iter, ++ struct bucket *g, struct bucket_mark m) + { + return (struct bkey_alloc_unpacked) { ++ .dev = iter->pos.inode, ++ .bucket = iter->pos.offset, + .gen = m.gen, + .oldest_gen = g->oldest_gen, + .data_type = m.data_type, +@@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m) + + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) + +-const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); ++const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); ++const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_alloc (struct bkey_ops) { \ +- .key_invalid = bch2_alloc_invalid, \ ++ .key_invalid = bch2_alloc_v1_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++} ++ ++#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v2_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + } + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index a8fc9f8d3702..3ffd66e262f7 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -341,7 +341,8 @@ static inline void bkey_init(struct bkey *k) + x(reflink_v, 16) \ + x(inline_data, 17) \ + x(btree_ptr_v2, 18) \ +- x(indirect_inline_data, 19) ++ x(indirect_inline_data, 19) \ ++ x(alloc_v2, 20) + + enum bch_bkey_type { + #define x(name, nr) KEY_TYPE_##name = nr, +@@ -551,9 +552,11 @@ struct bch_extent_stripe_ptr { + #if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + block:8, +- idx:51; ++ redundancy:4, ++ idx:47; + #elif defined (__BIG_ENDIAN_BITFIELD) +- __u64 idx:51, ++ __u64 idx:47, ++ redundancy:4, + block:8, + type:5; + #endif +@@ -799,35 +802,40 @@ struct bch_alloc { + __u8 data[]; + } __attribute__((packed, aligned(8))); + +-#define BCH_ALLOC_FIELDS() \ ++#define BCH_ALLOC_FIELDS_V1() \ + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ +- x(oldest_gen, 8) ++ x(oldest_gen, 8) \ ++ x(stripe, 32) \ ++ x(stripe_redundancy, 8) ++ ++struct bch_alloc_v2 { ++ struct bch_val v; ++ __u8 nr_fields; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_FIELDS_V2() \ ++ x(read_time, 64) \ ++ x(write_time, 64) \ ++ x(dirty_sectors, 16) \ ++ x(cached_sectors, 16) \ ++ x(stripe, 32) \ ++ x(stripe_redundancy, 8) + + enum { +-#define x(name, bytes) BCH_ALLOC_FIELD_##name, +- BCH_ALLOC_FIELDS() ++#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, ++ BCH_ALLOC_FIELDS_V1() + #undef x + BCH_ALLOC_FIELD_NR + }; + +-static const unsigned BCH_ALLOC_FIELD_BYTES[] = { +-#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, +- BCH_ALLOC_FIELDS() +-#undef x +-}; +- +-#define x(name, bits) + (bits / 8) +-static const unsigned BKEY_ALLOC_VAL_U64s_MAX = +- DIV_ROUND_UP(offsetof(struct bch_alloc, data) +- BCH_ALLOC_FIELDS(), sizeof(u64)); +-#undef x +- +-#define 
BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) +- + /* Quotas: */ + + enum quota_types { +@@ -1333,7 +1341,8 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); + x(btree_updates_journalled, 13) \ + x(reflink_inline_data, 14) \ + x(new_varint, 15) \ +- x(journal_no_flush, 16) ++ x(journal_no_flush, 16) \ ++ x(alloc_v2, 17) + + #define BCH_SB_FEATURES_ALL \ + ((1ULL << BCH_FEATURE_new_siphash)| \ +@@ -1341,7 +1350,8 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_new_varint)| \ +- (1ULL << BCH_FEATURE_journal_no_flush)) ++ (1ULL << BCH_FEATURE_journal_no_flush)| \ ++ (1ULL << BCH_FEATURE_alloc_v2)) + + enum bch_sb_feature { + #define x(f, n) BCH_FEATURE_##f, +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 2c3b73a6fea3..48821f6c09aa 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -530,6 +530,7 @@ BKEY_VAL_ACCESSORS(reflink_v); + BKEY_VAL_ACCESSORS(inline_data); + BKEY_VAL_ACCESSORS(btree_ptr_v2); + BKEY_VAL_ACCESSORS(indirect_inline_data); ++BKEY_VAL_ACCESSORS(alloc_v2); + + /* byte order helpers */ + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index cb0f0e09a2c1..0bce4bfff9e8 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -685,7 +685,8 @@ static int bch2_mark_alloc(struct bch_fs *c, + struct bucket_mark old_m, m; + + /* We don't do anything for deletions - do we?: */ +- if (new.k->type != KEY_TYPE_alloc) ++ if (new.k->type != KEY_TYPE_alloc && ++ new.k->type != KEY_TYPE_alloc_v2) + return 0; + + /* +@@ -708,6 +709,7 @@ static int bch2_mark_alloc(struct bch_fs *c, + m.data_type = u.data_type; + m.dirty_sectors = u.dirty_sectors; + m.cached_sectors = u.cached_sectors; ++ m.stripe = u.stripe != 0; + + if (journal_seq) { + m.journal_seq_valid = 1; +@@ -721,6 +723,8 @@ static int bch2_mark_alloc(struct bch_fs *c, + g->io_time[WRITE] = u.write_time; + g->oldest_gen = u.oldest_gen; + g->gen_valid = 1; ++ g->stripe = u.stripe; ++ g->stripe_redundancy = u.stripe_redundancy; + + /* + * need to know if we're getting called from the invalidate path or +@@ -915,11 +919,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + +-static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, ++static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, + unsigned ptr_idx, + struct bch_fs_usage *fs_usage, +- u64 journal_seq, unsigned flags, +- bool enabled) ++ u64 journal_seq, unsigned flags) + { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; +@@ -932,8 +935,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, + char buf[200]; + int ret; + +- if (enabled) +- g->ec_redundancy = s->nr_redundant; ++ if (g->stripe && g->stripe != k.k->p.offset) { ++ bch2_fs_inconsistent(c, ++ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EINVAL; ++ } + + old = bucket_cmpxchg(g, new, ({ + ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, +@@ -941,23 +949,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, + if (ret) + return ret; + +- if (new.stripe && enabled) +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", +- ptr->dev, PTR_BUCKET_NR(ca, 
ptr), new.gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- +- if (!new.stripe && !enabled) +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "bucket %u:%zu gen %u: deleting stripe but not marked\n%s", +- ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- +- new.stripe = enabled; +- +- if ((flags & BTREE_TRIGGER_GC) && parity) { +- new.data_type = enabled ? BCH_DATA_parity : 0; +- new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0; ++ if (parity) { ++ new.data_type = BCH_DATA_parity; ++ new.dirty_sectors = le16_to_cpu(s->sectors); + } + + if (journal_seq) { +@@ -966,8 +960,8 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, + } + })); + +- if (!enabled) +- g->ec_redundancy = 0; ++ g->stripe = k.k->p.offset; ++ g->stripe_redundancy = s->nr_redundant; + + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + return 0; +@@ -1163,6 +1157,8 @@ static int bch2_mark_stripe(struct bch_fs *c, + unsigned i; + int ret; + ++ BUG_ON(gc && old_s); ++ + if (!m || (old_s && !m->alive)) { + bch_err_ratelimited(c, "error marking nonexistent stripe %zu", + idx); +@@ -1170,48 +1166,12 @@ static int bch2_mark_stripe(struct bch_fs *c, + } + + if (!new_s) { +- /* Deleting: */ +- for (i = 0; i < old_s->nr_blocks; i++) { +- ret = bucket_set_stripe(c, old, i, fs_usage, +- journal_seq, flags, false); +- if (ret) +- return ret; +- } +- +- if (!gc && m->on_heap) { +- spin_lock(&c->ec_stripes_heap_lock); +- bch2_stripes_heap_del(c, m, idx); +- spin_unlock(&c->ec_stripes_heap_lock); +- } +- +- if (gc) +- update_replicas(c, fs_usage, &m->r.e, +- -((s64) m->sectors * m->nr_redundant)); ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_del(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); + + memset(m, 0, sizeof(*m)); + } else { +- BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); +- BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); +- +- for (i = 0; i < new_s->nr_blocks; i++) { +- if (!old_s || +- memcmp(new_s->ptrs + i, +- old_s->ptrs + i, +- sizeof(struct bch_extent_ptr))) { +- +- if (old_s) { +- bucket_set_stripe(c, old, i, fs_usage, +- journal_seq, flags, false); +- if (ret) +- return ret; +- } +- ret = bucket_set_stripe(c, new, i, fs_usage, +- journal_seq, flags, true); +- if (ret) +- return ret; +- } +- } +- + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; +@@ -1220,27 +1180,13 @@ static int bch2_mark_stripe(struct bch_fs *c, + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { +- unsigned s = stripe_blockcount_get(new_s, i); +- +- /* +- * gc recalculates this field from stripe ptr +- * references: +- */ +- if (!gc) +- m->block_sectors[i] = s; +- m->blocks_nonempty += !!s; ++ m->block_sectors[i] = ++ stripe_blockcount_get(new_s, i); ++ m->blocks_nonempty += !!m->block_sectors[i]; + } + +- if (gc && old_s) +- update_replicas(c, fs_usage, &m->r.e, +- -((s64) m->sectors * m->nr_redundant)); +- + bch2_bkey_to_replicas(&m->r.e, new); + +- if (gc) +- update_replicas(c, fs_usage, &m->r.e, +- ((s64) m->sectors * m->nr_redundant)); +- + if (!gc) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, idx); +@@ -1248,6 +1194,25 @@ static int bch2_mark_stripe(struct bch_fs *c, + } + } + ++ if (gc) { ++ /* ++ * gc recalculates this field from stripe ptr ++ * references: ++ */ ++ memset(m->block_sectors, 0, sizeof(m->block_sectors)); ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) { ++ ret = 
mark_stripe_bucket(c, new, i, fs_usage, ++ journal_seq, flags); ++ if (ret) ++ return ret; ++ } ++ ++ update_replicas(c, fs_usage, &m->r.e, ++ ((s64) m->sectors * m->nr_redundant)); ++ } ++ + return 0; + } + +@@ -1271,6 +1236,7 @@ static int bch2_mark_key_locked(struct bch_fs *c, + + switch (k.k->type) { + case KEY_TYPE_alloc: ++ case KEY_TYPE_alloc_v2: + ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_btree_ptr: +@@ -1539,9 +1505,10 @@ static int trans_get_key(struct btree_trans *trans, + return ret; + } + +-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, +- const struct bch_extent_ptr *ptr, +- struct bkey_alloc_unpacked *u) ++static struct bkey_alloc_buf * ++bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, ++ const struct bch_extent_ptr *ptr, ++ struct bkey_alloc_unpacked *u) + { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +@@ -1549,8 +1516,13 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree + struct bucket *g; + struct btree_iter *iter; + struct bkey_s_c k; ++ struct bkey_alloc_buf *a; + int ret; + ++ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); ++ if (IS_ERR(a)) ++ return a; ++ + iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); + if (iter) { + *u = bch2_alloc_unpack(k); +@@ -1562,17 +1534,17 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree + ret = bch2_btree_iter_traverse(iter); + if (ret) { + bch2_trans_iter_put(trans, iter); +- return ret; ++ return ERR_PTR(ret); + } + + percpu_down_read(&c->mark_lock); + g = bucket(ca, pos.offset); +- *u = alloc_mem_to_key(g, READ_ONCE(g->mark)); ++ *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); + percpu_up_read(&c->mark_lock); + } + + *_iter = iter; +- return 0; ++ return a; + } + + static int bch2_trans_mark_pointer(struct btree_trans *trans, +@@ -1582,27 +1554,20 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; +- struct bkey_i_alloc *a; ++ struct bkey_alloc_buf *a; + int ret; + +- ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); +- if (ret) +- return ret; ++ a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); + + ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, + &u.dirty_sectors, &u.cached_sectors); + if (ret) + goto out; + +- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); +- ret = PTR_ERR_OR_ZERO(a); +- if (ret) +- goto out; +- +- bkey_alloc_init(&a->k_i); +- a->k.p = iter->pos; +- bch2_alloc_pack(a, u); +- bch2_trans_update(trans, iter, &a->k_i, 0); ++ bch2_alloc_pack(c, a, u); ++ bch2_trans_update(trans, iter, &a->k, 0); + out: + bch2_trans_iter_put(trans, iter); + return ret; +@@ -1713,34 +1678,51 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + } + + static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, +- const struct bch_extent_ptr *ptr, +- s64 sectors, bool parity) ++ struct bkey_s_c_stripe s, ++ unsigned idx, bool deleting) + { +- struct bkey_i_alloc *a; ++ struct bch_fs *c = trans->c; ++ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; ++ struct bkey_alloc_buf *a; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; +- int ret; ++ bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; ++ int ret = 0; + +- ret = 
bch2_trans_start_alloc_update(trans, &iter, ptr, &u); +- if (ret) +- return ret; ++ a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); + + if (parity) { ++ s64 sectors = le16_to_cpu(s.v->sectors); ++ ++ if (deleting) ++ sectors = -sectors; ++ + u.dirty_sectors += sectors; + u.data_type = u.dirty_sectors + ? BCH_DATA_parity + : 0; + } + +- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); +- ret = PTR_ERR_OR_ZERO(a); +- if (ret) +- goto err; ++ if (!deleting) { ++ if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, ++ "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", ++ iter->pos.inode, iter->pos.offset, u.gen, ++ u.stripe, s.k->p.offset)) { ++ ret = -EIO; ++ goto err; ++ } + +- bkey_alloc_init(&a->k_i); +- a->k.p = iter->pos; +- bch2_alloc_pack(a, u); +- bch2_trans_update(trans, iter, &a->k_i, 0); ++ u.stripe = s.k->p.offset; ++ u.stripe_redundancy = s.v->nr_redundant; ++ } else { ++ u.stripe = 0; ++ u.stripe_redundancy = 0; ++ } ++ ++ bch2_alloc_pack(c, a, u); ++ bch2_trans_update(trans, iter, &a->k, 0); + err: + bch2_trans_iter_put(trans, iter); + return ret; +@@ -1750,51 +1732,50 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) + { +- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe +- ? bkey_s_c_to_stripe(old).v : NULL; +- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe +- ? bkey_s_c_to_stripe(new).v : NULL; ++ struct bkey_s_c_stripe old_s = { NULL }; ++ struct bkey_s_c_stripe new_s = { NULL }; + struct bch_replicas_padded r; + unsigned i; + int ret = 0; + ++ if (old.k->type == KEY_TYPE_stripe) ++ old_s = bkey_s_c_to_stripe(old); ++ if (new.k->type == KEY_TYPE_stripe) ++ new_s = bkey_s_c_to_stripe(new); ++ + /* + * If the pointers aren't changing, we don't need to do anything: + */ +- if (new_s && old_s && +- !memcmp(old_s->ptrs, new_s->ptrs, +- new_s->nr_blocks * sizeof(struct bch_extent_ptr))) ++ if (new_s.k && old_s.k && ++ new_s.v->nr_blocks == old_s.v->nr_blocks && ++ new_s.v->nr_redundant == old_s.v->nr_redundant && ++ !memcmp(old_s.v->ptrs, new_s.v->ptrs, ++ new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; + +- if (new_s) { +- unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant; +- s64 sectors = le16_to_cpu(new_s->sectors); ++ if (new_s.k) { ++ s64 sectors = le16_to_cpu(new_s.v->sectors); + + bch2_bkey_to_replicas(&r.e, new); +- update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); ++ update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); + +- for (i = 0; i < new_s->nr_blocks; i++) { +- bool parity = i >= nr_data; +- +- ret = bch2_trans_mark_stripe_alloc_ref(trans, +- &new_s->ptrs[i], sectors, parity); ++ for (i = 0; i < new_s.v->nr_blocks; i++) { ++ ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, ++ i, false); + if (ret) + return ret; + } + } + +- if (old_s) { +- unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant; +- s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); ++ if (old_s.k) { ++ s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); + + bch2_bkey_to_replicas(&r.e, old); +- update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); ++ update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); + +- for (i = 0; i < old_s->nr_blocks; i++) { +- bool parity = i >= nr_data; +- +- ret = bch2_trans_mark_stripe_alloc_ref(trans, +- &old_s->ptrs[i], sectors, parity); ++ for (i = 0; i < old_s.v->nr_blocks; 
i++) { ++ ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, ++ i, true); + if (ret) + return ret; + } +@@ -2065,21 +2046,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; +- struct bkey_i_alloc *a; ++ struct bkey_alloc_buf *a; + struct bch_extent_ptr ptr = { + .dev = ca->dev_idx, + .offset = bucket_to_sector(ca, b), + }; + int ret = 0; + +- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); +- ret = PTR_ERR_OR_ZERO(a); +- if (ret) +- return ret; +- +- ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); +- if (ret) +- return ret; ++ a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); + + if (u.data_type && u.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +@@ -2112,10 +2088,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + u.data_type = type; + u.dirty_sectors = sectors; + +- bkey_alloc_init(&a->k_i); +- a->k.p = iter->pos; +- bch2_alloc_pack(a, u); +- bch2_trans_update(trans, iter, &a->k_i, 0); ++ bch2_alloc_pack(c, a, u); ++ bch2_trans_update(trans, iter, &a->k, 0); + out: + bch2_trans_iter_put(trans, iter); + return ret; +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 5fbe940a5f6f..866a895cc4d2 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -41,7 +41,8 @@ struct bucket { + u8 oldest_gen; + u8 gc_gen; + unsigned gen_valid:1; +- u8 ec_redundancy; ++ u8 stripe_redundancy; ++ u32 stripe; + }; + + struct bucket_array { +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 4e26ef6f5813..3f794a4d8575 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) + { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + ++ if (!bkey_cmp(k.k->p, POS_MIN)) ++ return "stripe at pos 0"; ++ + if (k.k->p.inode) + return "invalid stripe key"; + +@@ -279,10 +282,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) + struct bch_csum got = ec_block_checksum(buf, i, offset); + + if (bch2_crc_cmp(want, got)) { ++ char buf2[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); ++ + bch_err_ratelimited(c, +- "stripe checksum error at %u:%u: csum type %u, expected %llx got %llx", +- i, j, v->csum_type, +- want.lo, got.lo); ++ "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", ++ (void *) _RET_IP_, i, j, v->csum_type, ++ want.lo, got.lo, buf2); + clear_bit(i, buf->valid); + break; + } +@@ -335,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) + static void ec_block_endio(struct bio *bio) + { + struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); ++ struct bch_stripe *v = &ec_bio->buf->key.v; ++ struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + +@@ -343,6 +352,13 @@ static void ec_block_endio(struct bio *bio) + bch2_blk_status_to_str(bio->bi_status))) + clear_bit(ec_bio->idx, ec_bio->buf->valid); + ++ if (ptr_stale(ca, ptr)) { ++ bch_err_ratelimited(ca->fs, ++ "error %s stripe: stale pointer after io", ++ bio_data_dir(bio) == READ ? 
"reading from" : "writing to"); ++ clear_bit(ec_bio->idx, ec_bio->buf->valid); ++ } ++ + bio_put(&ec_bio->bio); + percpu_ref_put(&ca->io_ref); + closure_put(cl); +@@ -652,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c, + + static int ec_stripe_delete(struct bch_fs *c, size_t idx) + { +- //pr_info("deleting stripe %zu", idx); + return bch2_btree_delete_range(c, BTREE_ID_EC, + POS(0, idx), + POS(0, idx + 1), +@@ -795,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e, + *dst = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, ++ .redundancy = s->key.v.nr_redundant, + .idx = s->key.k.p.offset, + }; + } +@@ -1054,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, + if (!ob) + return; + +- //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); +- + ec = ob->ec; + mutex_lock(&ec->lock); + +@@ -1348,12 +1362,14 @@ static s64 get_existing_stripe(struct bch_fs *c, + struct stripe *m; + size_t heap_idx; + u64 stripe_idx; ++ s64 ret = -1; + + if (may_create_new_stripe(c)) + return -1; + + spin_lock(&c->ec_stripes_heap_lock); + for (heap_idx = 0; heap_idx < h->used; heap_idx++) { ++ /* No blocks worth reusing, stripe will just be deleted: */ + if (!h->data[heap_idx].blocks_nonempty) + continue; + +@@ -1365,13 +1381,12 @@ static s64 get_existing_stripe(struct bch_fs *c, + m->sectors == head->blocksize && + m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { + bch2_stripes_heap_del(c, m, stripe_idx); +- spin_unlock(&c->ec_stripes_heap_lock); +- return stripe_idx; ++ ret = stripe_idx; ++ break; + } + } +- + spin_unlock(&c->ec_stripes_heap_lock); +- return -1; ++ return ret; + } + + struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 67ba2c21627e..4a3a3291a31b 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -704,14 +704,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) + if (p.ptr.cached) + continue; + +- if (p.has_ec) { +- struct stripe *s = +- genradix_ptr(&c->stripes[0], p.ec.idx); +- +- WARN_ON(!s); +- if (s) +- replicas += s->nr_redundant; +- } ++ if (p.has_ec) ++ replicas += p.ec.redundancy; + + replicas++; + +@@ -734,16 +728,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + durability = max_t(unsigned, durability, ca->mi.durability); + +- if (p.has_ec) { +- struct stripe *s = +- genradix_ptr(&c->stripes[0], p.ec.idx); +- +- if (WARN_ON(!s)) +- goto out; ++ if (p.has_ec) ++ durability += p.ec.redundancy; + +- durability += s->nr_redundant; +- } +-out: + return durability; + } + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index d0acc1ee5cfe..e4f5db1d3977 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; + data_opts->rewrite_dev = p.ptr.dev; + +- if (p.has_ec) { +- struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx); +- +- data_opts->nr_replicas += m->nr_redundant; +- } ++ if (p.has_ec) ++ data_opts->nr_replicas += p.ec.redundancy; + + return DATA_REWRITE; + } +@@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c) + bucket_sectors_used(m) >= ca->mi.bucket_size) + continue; + +- WARN_ON(m.stripe && !g->ec_redundancy); ++ WARN_ON(m.stripe && !g->stripe_redundancy); + + e = (struct copygc_heap_entry) { + .dev 
= dev_idx, + .gen = m.gen, +- .replicas = 1 + g->ec_redundancy, ++ .replicas = 1 + g->stripe_redundancy, + .fragmentation = bucket_sectors_used(m) * (1U << 15) + / ca->mi.bucket_size, + .sectors = bucket_sectors_used(m), +-- +cgit v1.2.3 + + +From 4e8cf60659237ac2b302a6b76e7ebca55fcd0ad2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Jan 2021 15:28:59 -0500 +Subject: bcachefs: Persist 64 bit io clocks + +Originally, bcachefs - going back to bcache - stored, for each bucket, a +16 bit counter corresponding to how long it had been since the bucket +was read from. But, this required periodically rescaling counters on +every bucket to avoid wraparound. That wasn't an issue in bcache, where +we'd perodically rewrite the per bucket metadata all at once, but in +bcachefs we're trying to avoid having to walk every single bucket. + +This patch switches to persisting 64 bit io clocks, corresponding to the +64 bit bucket timestaps introduced in the previous patch with +KEY_TYPE_alloc_v2. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 225 ++++++++--------------------------------- + fs/bcachefs/alloc_types.h | 24 ----- + fs/bcachefs/bcachefs.h | 11 -- + fs/bcachefs/bcachefs_format.h | 18 +++- + fs/bcachefs/btree_gc.c | 6 +- + fs/bcachefs/buckets.h | 9 +- + fs/bcachefs/buckets_types.h | 2 +- + fs/bcachefs/clock.c | 8 +- + fs/bcachefs/clock_types.h | 2 +- + fs/bcachefs/journal.c | 3 + + fs/bcachefs/journal_io.c | 33 +++++- + fs/bcachefs/movinggc.c | 4 +- + fs/bcachefs/rebalance.c | 10 +- + fs/bcachefs/rebalance_types.h | 2 +- + fs/bcachefs/recovery.c | 19 ++-- + fs/bcachefs/super-io.c | 60 +++++------ + fs/bcachefs/super-io.h | 5 +- + fs/bcachefs/super.c | 6 -- + fs/bcachefs/sysfs.c | 4 +- + 19 files changed, 141 insertions(+), 310 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 1501af285c18..b77b34366f04 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -31,8 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { + #undef x + }; + +-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); +- + /* Ratelimiting/PD controllers */ + + static void pd_controllers_update(struct work_struct *work) +@@ -340,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, + + int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + { +- struct bch_dev *ca; +- unsigned i; +- int ret = 0; ++ int ret; + + down_read(&c->gc_lock); + ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, +@@ -358,22 +354,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + bch2_dev_usage_from_buckets(c); + percpu_up_write(&c->mark_lock); + +- mutex_lock(&c->bucket_clock[READ].lock); +- for_each_member_device(ca, c, i) { +- down_read(&ca->bucket_lock); +- bch2_recalc_oldest_io(c, ca, READ); +- up_read(&ca->bucket_lock); +- } +- mutex_unlock(&c->bucket_clock[READ].lock); +- +- mutex_lock(&c->bucket_clock[WRITE].lock); +- for_each_member_device(ca, c, i) { +- down_read(&ca->bucket_lock); +- bch2_recalc_oldest_io(c, ca, WRITE); +- up_read(&ca->bucket_lock); +- } +- mutex_unlock(&c->bucket_clock[WRITE].lock); +- + return 0; + } + +@@ -460,114 +440,6 @@ err: + + /* Bucket IO clocks: */ + +-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) +-{ +- struct bucket_clock *clock = &c->bucket_clock[rw]; +- struct bucket_array *buckets = bucket_array(ca); +- struct bucket *g; +- u16 max_last_io = 
0; +- unsigned i; +- +- lockdep_assert_held(&c->bucket_clock[rw].lock); +- +- /* Recalculate max_last_io for this device: */ +- for_each_bucket(g, buckets) +- max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); +- +- ca->max_last_bucket_io[rw] = max_last_io; +- +- /* Recalculate global max_last_io: */ +- max_last_io = 0; +- +- for_each_member_device(ca, c, i) +- max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); +- +- clock->max_last_io = max_last_io; +-} +- +-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) +-{ +- struct bucket_clock *clock = &c->bucket_clock[rw]; +- struct bucket_array *buckets; +- struct bch_dev *ca; +- struct bucket *g; +- unsigned i; +- +- trace_rescale_prios(c); +- +- for_each_member_device(ca, c, i) { +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); +- +- for_each_bucket(g, buckets) +- g->io_time[rw] = clock->hand - +- bucket_last_io(c, g, rw) / 2; +- +- bch2_recalc_oldest_io(c, ca, rw); +- +- up_read(&ca->bucket_lock); +- } +-} +- +-static inline u64 bucket_clock_freq(u64 capacity) +-{ +- return max(capacity >> 10, 2028ULL); +-} +- +-static void bch2_inc_clock_hand(struct io_timer *timer) +-{ +- struct bucket_clock *clock = container_of(timer, +- struct bucket_clock, rescale); +- struct bch_fs *c = container_of(clock, +- struct bch_fs, bucket_clock[clock->rw]); +- struct bch_dev *ca; +- u64 capacity; +- unsigned i; +- +- mutex_lock(&clock->lock); +- +- /* if clock cannot be advanced more, rescale prio */ +- if (clock->max_last_io >= U16_MAX - 2) +- bch2_rescale_bucket_io_times(c, clock->rw); +- +- BUG_ON(clock->max_last_io >= U16_MAX - 2); +- +- for_each_member_device(ca, c, i) +- ca->max_last_bucket_io[clock->rw]++; +- clock->max_last_io++; +- clock->hand++; +- +- mutex_unlock(&clock->lock); +- +- capacity = READ_ONCE(c->capacity); +- +- if (!capacity) +- return; +- +- /* +- * we only increment when 0.1% of the filesystem capacity has been read +- * or written too, this determines if it's time +- * +- * XXX: we shouldn't really be going off of the capacity of devices in +- * RW mode (that will be 0 when we're RO, yet we can still service +- * reads) +- */ +- timer->expire += bucket_clock_freq(capacity); +- +- bch2_io_timer_add(&c->io_clock[clock->rw], timer); +-} +- +-static void bch2_bucket_clock_init(struct bch_fs *c, int rw) +-{ +- struct bucket_clock *clock = &c->bucket_clock[rw]; +- +- clock->hand = 1; +- clock->rw = rw; +- clock->rescale.fn = bch2_inc_clock_hand; +- clock->rescale.expire = bucket_clock_freq(c->capacity); +- mutex_init(&clock->lock); +-} +- + int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) + { +@@ -577,7 +449,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + struct bucket *g; + struct bkey_alloc_buf *a; + struct bkey_alloc_unpacked u; +- u64 *time; ++ u64 *time, now; + int ret = 0; + + iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), +@@ -599,10 +471,11 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + percpu_up_read(&c->mark_lock); + + time = rw == READ ? 
&u.read_time : &u.write_time; +- if (*time == c->bucket_clock[rw].hand) ++ now = atomic64_read(&c->io_clock[rw].now); ++ if (*time == now) + goto out; + +- *time = c->bucket_clock[rw].hand; ++ *time = now; + + bch2_alloc_pack(c, a, u); + ret = bch2_trans_update(trans, iter, &a->k, 0) ?: +@@ -674,23 +547,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + return ret; + } + +-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, +- size_t bucket, +- struct bucket_mark mark) ++static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, ++ struct bucket_mark m) + { + u8 gc_gen; + +- if (!is_available_bucket(mark)) ++ if (!is_available_bucket(m)) + return false; + +- if (mark.owned_by_allocator) ++ if (m.owned_by_allocator) + return false; + + if (ca->buckets_nouse && +- test_bit(bucket, ca->buckets_nouse)) ++ test_bit(b, ca->buckets_nouse)) + return false; + +- gc_gen = bucket_gc_gen(ca, bucket); ++ gc_gen = bucket_gc_gen(bucket(ca, b)); + + if (gc_gen >= BUCKET_GC_GEN_MAX / 2) + ca->inc_gen_needs_gc++; +@@ -704,43 +576,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, + /* + * Determines what order we're going to reuse buckets, smallest bucket_key() + * first. +- * +- * +- * - We take into account the read prio of the bucket, which gives us an +- * indication of how hot the data is -- we scale the prio so that the prio +- * farthest from the clock is worth 1/8th of the closest. +- * +- * - The number of sectors of cached data in the bucket, which gives us an +- * indication of the cost in cache misses this eviction will cause. +- * +- * - If hotness * sectors used compares equal, we pick the bucket with the +- * smallest bucket_gc_gen() - since incrementing the same bucket's generation +- * number repeatedly forces us to run mark and sweep gc to avoid generation +- * number wraparound. 
+ */ + +-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, +- size_t b, struct bucket_mark m) ++static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, ++ u64 now, u64 last_seq_ondisk) + { +- unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); +- unsigned max_last_io = ca->max_last_bucket_io[READ]; +- +- /* +- * Time since last read, scaled to [0, 8) where larger value indicates +- * more recently read data: +- */ +- unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; +- +- /* How much we want to keep the data in this bucket: */ +- unsigned long data_wantness = +- (hotness + 1) * bucket_sectors_used(m); ++ unsigned used = bucket_sectors_used(m); + +- unsigned long needs_journal_commit = +- bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); ++ if (used) { ++ /* ++ * Prefer to keep buckets that have been read more recently, and ++ * buckets that have more data in them: ++ */ ++ u64 last_read = max_t(s64, 0, now - g->io_time[READ]); ++ u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); + +- return (data_wantness << 9) | +- (needs_journal_commit << 8) | +- (bucket_gc_gen(ca, b) / 16); ++ return -last_read_scaled; ++ } else { ++ /* ++ * Prefer to use buckets with smaller gc_gen so that we don't ++ * have to walk the btree and recalculate oldest_gen - but shift ++ * off the low bits so that buckets will still have equal sort ++ * keys when there's only a small difference, so that we can ++ * keep sequential buckets together: ++ */ ++ return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| ++ (bucket_gc_gen(g) >> 4); ++ } + } + + static inline int bucket_alloc_cmp(alloc_heap *h, +@@ -763,16 +625,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) + { + struct bucket_array *buckets; + struct alloc_heap_entry e = { 0 }; ++ u64 now, last_seq_ondisk; + size_t b, i, nr = 0; + +- ca->alloc_heap.used = 0; +- +- mutex_lock(&c->bucket_clock[READ].lock); + down_read(&ca->bucket_lock); + + buckets = bucket_array(ca); +- +- bch2_recalc_oldest_io(c, ca, READ); ++ ca->alloc_heap.used = 0; ++ now = atomic64_read(&c->io_clock[READ].now); ++ last_seq_ondisk = c->journal.last_seq_ondisk; + + /* + * Find buckets with lowest read priority, by building a maxheap sorted +@@ -780,8 +641,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) + * all buckets have been visited. 
+ */ + for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { +- struct bucket_mark m = READ_ONCE(buckets->b[b].mark); +- unsigned long key = bucket_sort_key(c, ca, b, m); ++ struct bucket *g = &buckets->b[b]; ++ struct bucket_mark m = READ_ONCE(g->mark); ++ unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); + + if (!bch2_can_invalidate_bucket(ca, b, m)) + continue; +@@ -816,7 +678,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) + } + + up_read(&ca->bucket_lock); +- mutex_unlock(&c->bucket_clock[READ].lock); + } + + static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) +@@ -1031,8 +892,8 @@ retry: + u.data_type = 0; + u.dirty_sectors = 0; + u.cached_sectors = 0; +- u.read_time = c->bucket_clock[READ].hand; +- u.write_time = c->bucket_clock[WRITE].hand; ++ u.read_time = atomic64_read(&c->io_clock[READ].now); ++ u.write_time = atomic64_read(&c->io_clock[WRITE].now); + + bch2_alloc_pack(c, &a, u); + bch2_trans_update(trans, iter, &a.k, +@@ -1542,8 +1403,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca) + void bch2_fs_allocator_background_init(struct bch_fs *c) + { + spin_lock_init(&c->freelist_lock); +- bch2_bucket_clock_init(c, READ); +- bch2_bucket_clock_init(c, WRITE); + + c->pd_controllers_update_seconds = 5; + INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index 1abfff5290bc..be164d6108bb 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -10,30 +10,6 @@ + + struct ec_bucket_buf; + +-/* There's two of these clocks, one for reads and one for writes: */ +-struct bucket_clock { +- /* +- * "now" in (read/write) IO time - incremented whenever we do X amount +- * of reads or writes. +- * +- * Goes with the bucket read/write prios: when we read or write to a +- * bucket we reset the bucket's prio to the current hand; thus hand - +- * prio = time since bucket was last read/written. +- * +- * The units are some amount (bytes/sectors) of data read/written, and +- * the units can change on the fly if we need to rescale to fit +- * everything in a u16 - your only guarantee is that the units are +- * consistent. +- */ +- u16 hand; +- u16 max_last_io; +- +- int rw; +- +- struct io_timer rescale; +- struct mutex lock; +-}; +- + enum alloc_reserve { + RESERVE_BTREE_MOVINGGC = -2, + RESERVE_BTREE = -1, +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index a65fade3a6c7..b8ba708c4a0d 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -452,9 +452,6 @@ struct bch_dev { + + size_t fifo_last_bucket; + +- /* last calculated minimum prio */ +- u16 max_last_bucket_io[2]; +- + size_t inc_gen_needs_gc; + size_t inc_gen_really_needs_gc; + +@@ -693,14 +690,6 @@ struct bch_fs { + struct mutex usage_scratch_lock; + struct bch_fs_usage *usage_scratch; + +- /* +- * When we invalidate buckets, we use both the priority and the amount +- * of good data to determine which buckets to reuse first - to weight +- * those together consistently we keep track of the smallest nonzero +- * priority of any bucket. 
+- */ +- struct bucket_clock bucket_clock[2]; +- + struct io_clock io_clock[2]; + + /* JOURNAL SEQ BLACKLIST */ +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 3ffd66e262f7..2df1949dc9da 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1139,8 +1139,8 @@ struct bch_sb_field_clean { + struct bch_sb_field field; + + __le32 flags; +- __le16 read_clock; +- __le16 write_clock; ++ __le16 _read_clock; /* no longer used */ ++ __le16 _write_clock; + __le64 journal_seq; + + union { +@@ -1504,7 +1504,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) + x(blacklist, 3) \ + x(blacklist_v2, 4) \ + x(usage, 5) \ +- x(data_usage, 6) ++ x(data_usage, 6) \ ++ x(clock, 7) + + enum { + #define x(f, nr) BCH_JSET_ENTRY_##f = nr, +@@ -1552,6 +1553,13 @@ struct jset_entry_data_usage { + struct bch_replicas_entry r; + } __attribute__((packed)); + ++struct jset_entry_clock { ++ struct jset_entry entry; ++ __u8 rw; ++ __u8 pad[7]; ++ __le64 time; ++} __attribute__((packed)); ++ + /* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique +@@ -1574,8 +1582,8 @@ struct jset { + + __u8 encrypted_start[0]; + +- __le16 read_clock; +- __le16 write_clock; ++ __le16 _read_clock; /* no longer used */ ++ __le16 _write_clock; + + /* Sequence number of oldest dirty journal entry */ + __le64 last_seq; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index bab5ebd37f04..303ace78ced6 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1489,7 +1489,7 @@ static int bch2_gc_thread(void *arg) + { + struct bch_fs *c = arg; + struct io_clock *clock = &c->io_clock[WRITE]; +- unsigned long last = atomic_long_read(&clock->now); ++ unsigned long last = atomic64_read(&clock->now); + unsigned last_kick = atomic_read(&c->kick_gc); + int ret; + +@@ -1510,7 +1510,7 @@ static int bch2_gc_thread(void *arg) + if (c->btree_gc_periodic) { + unsigned long next = last + c->capacity / 16; + +- if (atomic_long_read(&clock->now) >= next) ++ if (atomic64_read(&clock->now) >= next) + break; + + bch2_io_clock_schedule_timeout(clock, next); +@@ -1522,7 +1522,7 @@ static int bch2_gc_thread(void *arg) + } + __set_current_state(TASK_RUNNING); + +- last = atomic_long_read(&clock->now); ++ last = atomic64_read(&clock->now); + last_kick = atomic_read(&c->kick_gc); + + /* +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 37346240cb7b..659f1ba01b6f 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) + return __bucket(ca, b, false); + } + +-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) +-{ +- return c->bucket_clock[rw].hand - g->io_time[rw]; +-} +- + /* + * bucket_gc_gen() returns the difference between the bucket's current gen and + * the oldest gen of any pointer into that bucket in the btree. 
+ */ + +-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) ++static inline u8 bucket_gc_gen(struct bucket *g) + { +- struct bucket *g = bucket(ca, b); +- + return g->mark.gen - g->oldest_gen; + } + +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 866a895cc4d2..404c89a7a264 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -37,7 +37,7 @@ struct bucket { + const struct bucket_mark mark; + }; + +- u16 io_time[2]; ++ u64 io_time[2]; + u8 oldest_gen; + u8 gc_gen; + unsigned gen_valid:1; +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +index 1d1590de55e8..4324cfe7eed0 100644 +--- a/fs/bcachefs/clock.c ++++ b/fs/bcachefs/clock.c +@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) + + spin_lock(&clock->timer_lock); + +- if (time_after_eq((unsigned long) atomic_long_read(&clock->now), ++ if (time_after_eq((unsigned long) atomic64_read(&clock->now), + timer->expire)) { + spin_unlock(&clock->timer_lock); + timer->fn(timer); +@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, + void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) + { + struct io_timer *timer; +- unsigned long now = atomic_long_add_return(sectors, &clock->now); ++ unsigned long now = atomic64_add_return(sectors, &clock->now); + + while ((timer = get_expired_timer(clock, now))) + timer->fn(timer); +@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) + unsigned i; + + spin_lock(&clock->timer_lock); +- now = atomic_long_read(&clock->now); ++ now = atomic64_read(&clock->now); + + for (i = 0; i < clock->timers.used; i++) + pr_buf(out, "%ps:\t%li\n", +@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock) + + int bch2_io_clock_init(struct io_clock *clock) + { +- atomic_long_set(&clock->now, 0); ++ atomic64_set(&clock->now, 0); + spin_lock_init(&clock->timer_lock); + + clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); +diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h +index 92c740a47565..5fae0012d808 100644 +--- a/fs/bcachefs/clock_types.h ++++ b/fs/bcachefs/clock_types.h +@@ -26,7 +26,7 @@ struct io_timer { + typedef HEAP(struct io_timer *) io_timer_heap; + + struct io_clock { +- atomic_long_t now; ++ atomic64_t now; + u16 __percpu *pcpu_buf; + unsigned max_slop; + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index a7c5f5fddedb..e41f02773dd0 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1121,6 +1121,9 @@ int bch2_fs_journal_init(struct journal *j) + j->entry_u64s_reserved += + BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); + ++ j->entry_u64s_reserved += ++ 2 * (sizeof(struct jset_entry_clock) / sizeof(u64)); ++ + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 3c9ad5e98ebd..39b8cbe178b0 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -427,6 +427,32 @@ fsck_err: + return ret; + } + ++static int journal_entry_validate_clock(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes != sizeof(*clock), ++ c, "invalid journal entry 
clock: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++ if (journal_entry_err_on(clock->rw > 1, ++ c, "invalid journal entry clock: bad rw")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ + struct jset_entry_ops { + int (*validate)(struct bch_fs *, struct jset *, + struct jset_entry *, int); +@@ -1364,8 +1390,8 @@ void bch2_journal_write(struct closure *cl) + + end = bch2_btree_roots_to_journal_entries(c, jset->start, end); + +- end = bch2_journal_super_entries_add_common(c, end, +- le64_to_cpu(jset->seq)); ++ bch2_journal_super_entries_add_common(c, &end, ++ le64_to_cpu(jset->seq)); + u64s = (u64 *) end - (u64 *) start; + BUG_ON(u64s > j->entry_u64s_reserved); + +@@ -1374,10 +1400,7 @@ void bch2_journal_write(struct closure *cl) + + journal_write_compact(jset); + +- jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); +- jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); + jset->magic = cpu_to_le64(jset_magic(c)); +- + jset->version = c->sb.version < bcachefs_metadata_version_new_versioning + ? cpu_to_le32(BCH_JSET_VERSION_OLD) + : cpu_to_le32(c->sb.version); +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index e4f5db1d3977..f915b30ab6e0 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -298,7 +298,7 @@ static int bch2_copygc_thread(void *arg) + { + struct bch_fs *c = arg; + struct io_clock *clock = &c->io_clock[WRITE]; +- unsigned long last, wait; ++ u64 last, wait; + + set_freezable(); + +@@ -306,7 +306,7 @@ static int bch2_copygc_thread(void *arg) + if (kthread_wait_freezable(c->copy_gc_enabled)) + break; + +- last = atomic_long_read(&clock->now); ++ last = atomic64_read(&clock->now); + wait = bch2_copygc_wait_amount(c); + + if (wait > clock->max_slop) { +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index c3373c48fa81..d89920b848ee 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg) + unsigned long start, prev_start; + unsigned long prev_run_time, prev_run_cputime; + unsigned long cputime, prev_cputime; +- unsigned long io_start; ++ u64 io_start; + long throttle; + + set_freezable(); + +- io_start = atomic_long_read(&clock->now); ++ io_start = atomic64_read(&clock->now); + p = rebalance_work(c); + prev_start = jiffies; + prev_cputime = curr_cputime(); +@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg) + (20 - w.dev_most_full_percent), + 50); + +- if (atomic_long_read(&clock->now) + clock->max_slop < ++ if (atomic64_read(&clock->now) + clock->max_slop < + r->throttled_until_iotime) { + r->throttled_until_cputime = start + throttle; + r->state = REBALANCE_THROTTLED; +@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg) + max(p.dev_most_full_percent, 1U) / + max(w.dev_most_full_percent, 1U)); + +- io_start = atomic_long_read(&clock->now); ++ io_start = atomic64_read(&clock->now); + p = w; + prev_start = start; + prev_cputime = cputime; +@@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) + case REBALANCE_THROTTLED: + bch2_hprint(&PBUF(h1), + (r->throttled_until_iotime - +- atomic_long_read(&c->io_clock[WRITE].now)) << 9); ++ atomic64_read(&c->io_clock[WRITE].now)) << 9); + pr_buf(out, "throttled for %lu sec or %s io\n", + (r->throttled_until_cputime - jiffies) / HZ, + h1); +diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h 
+index 192c6be20ced..2f62a643c39f 100644 +--- a/fs/bcachefs/rebalance_types.h ++++ b/fs/bcachefs/rebalance_types.h +@@ -17,7 +17,7 @@ struct bch_fs_rebalance { + atomic64_t work_unknown_dev; + + enum rebalance_state state; +- unsigned long throttled_until_iotime; ++ u64 throttled_until_iotime; + unsigned long throttled_until_cputime; + struct bch_move_stats move_stats; + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index f470e0e233ce..55f7771e11c8 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -847,6 +847,12 @@ static int journal_replay_entry_early(struct bch_fs *c, + le64_to_cpu(bl_entry->end) + 1); + break; + } ++ case BCH_JSET_ENTRY_clock: { ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ ++ atomic64_set(&c->io_clock[clock->rw].now, clock->time); ++ } + } + + return ret; +@@ -861,9 +867,6 @@ static int journal_replay_early(struct bch_fs *c, + int ret; + + if (clean) { +- c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); +- c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); +- + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { +@@ -876,9 +879,6 @@ static int journal_replay_early(struct bch_fs *c, + if (i->ignore) + continue; + +- c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); +- c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); +- + vstruct_for_each(&i->j, entry) { + ret = journal_replay_entry_early(c, entry); + if (ret) +@@ -942,13 +942,6 @@ static int verify_superblock_clean(struct bch_fs *c, + return 0; + } + +- mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, +- "superblock read clock %u doesn't match journal %u after clean shutdown", +- clean->read_clock, j->read_clock); +- mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, +- "superblock write clock %u doesn't match journal %u after clean shutdown", +- clean->write_clock, j->write_clock); +- + for (i = 0; i < BTREE_ID_NR; i++) { + char buf1[200], buf2[200]; + struct bkey_i *k1, *k2; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 751efd28b672..068262917e10 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -963,29 +963,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c) + return ret; + } + +-static void +-entry_init_u64s(struct jset_entry *entry, unsigned u64s) ++static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) + { +- memset(entry, 0, u64s * sizeof(u64)); ++ struct jset_entry *entry = *end; ++ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + ++ memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. 
+ */ + entry->u64s = u64s - 1; +-} + +-static void +-entry_init_size(struct jset_entry *entry, size_t size) +-{ +- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); +- entry_init_u64s(entry, u64s); ++ *end = vstruct_next(*end); ++ return entry; + } + +-struct jset_entry * +-bch2_journal_super_entries_add_common(struct bch_fs *c, +- struct jset_entry *entry, +- u64 journal_seq) ++void bch2_journal_super_entries_add_common(struct bch_fs *c, ++ struct jset_entry **end, ++ u64 journal_seq) + { + unsigned i; + +@@ -1000,58 +996,58 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, + + { + struct jset_entry_usage *u = +- container_of(entry, struct jset_entry_usage, entry); ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); + +- entry_init_size(entry, sizeof(*u)); + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = FS_USAGE_INODES; + u->v = cpu_to_le64(c->usage_base->nr_inodes); +- +- entry = vstruct_next(entry); + } + + { + struct jset_entry_usage *u = +- container_of(entry, struct jset_entry_usage, entry); ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); + +- entry_init_size(entry, sizeof(*u)); + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = FS_USAGE_KEY_VERSION; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); +- +- entry = vstruct_next(entry); + } + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + struct jset_entry_usage *u = +- container_of(entry, struct jset_entry_usage, entry); ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); + +- entry_init_size(entry, sizeof(*u)); + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = FS_USAGE_RESERVED; + u->entry.level = i; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); +- +- entry = vstruct_next(entry); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + struct jset_entry_data_usage *u = +- container_of(entry, struct jset_entry_data_usage, entry); ++ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), ++ struct jset_entry_data_usage, entry); + +- entry_init_size(entry, sizeof(*u) + e->nr_devs); + u->entry.type = BCH_JSET_ENTRY_data_usage; + u->v = cpu_to_le64(c->usage_base->replicas[i]); + memcpy(&u->r, e, replicas_entry_bytes(e)); +- +- entry = vstruct_next(entry); + } + + percpu_up_write(&c->mark_lock); + +- return entry; ++ for (i = 0; i < 2; i++) { ++ struct jset_entry_clock *clock = ++ container_of(jset_entry_init(end, sizeof(*clock)), ++ struct jset_entry_clock, entry); ++ ++ clock->entry.type = BCH_JSET_ENTRY_clock; ++ clock->rw = i; ++ clock->time = atomic64_read(&c->io_clock[i].now); ++ } + } + + void bch2_fs_mark_clean(struct bch_fs *c) +@@ -1080,15 +1076,13 @@ void bch2_fs_mark_clean(struct bch_fs *c) + } + + sb_clean->flags = 0; +- sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); +- sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); + sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); + + /* Trying to catch outstanding bug: */ + BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); + + entry = sb_clean->start; +- entry = bch2_journal_super_entries_add_common(c, entry, 0); ++ bch2_journal_super_entries_add_common(c, &entry, 0); + entry = bch2_btree_roots_to_journal_entries(c, entry, entry); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +index 7a068158efca..1a35124f5f47 100644 +--- 
a/fs/bcachefs/super-io.h ++++ b/fs/bcachefs/super-io.h +@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) + + /* BCH_SB_FIELD_clean: */ + +-struct jset_entry * +-bch2_journal_super_entries_add_common(struct bch_fs *, +- struct jset_entry *, u64); ++void bch2_journal_super_entries_add_common(struct bch_fs *, ++ struct jset_entry **, u64); + + void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 054e995ae7bd..0d35df66e8cf 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -174,9 +174,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) + bch2_copygc_stop(c); + bch2_gc_thread_stop(c); + +- bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); +- bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); +- + /* + * Flush journal before stopping allocators, because flushing journal + * blacklist entries involves allocating new btree nodes: +@@ -399,9 +396,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + +- bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); +- bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); +- + for_each_rw_member(ca, c, i) { + ret = bch2_dev_allocator_start(ca); + if (ret) { +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 80964bdf6237..f934f12bc677 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, + { + int rw = (private ? 1 : 0); + +- return bucket_last_io(c, bucket(ca, b), rw); ++ return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; + } + + static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, +@@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, + static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, + size_t b, void *private) + { +- return bucket_gc_gen(ca, b); ++ return bucket_gc_gen(bucket(ca, b)); + } + + static int unsigned_cmp(const void *_l, const void *_r) +-- +cgit v1.2.3 + + +From fac777d9eb1f22705212d20dd6161aefd309e283 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Jan 2021 21:52:06 -0500 +Subject: bcachefs: Journal updates to dev usage + +This eliminates the need to scan every bucket to regenerate dev_usage at +mount time. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 4 -- + fs/bcachefs/bcachefs.h | 6 ++- + fs/bcachefs/bcachefs_format.h | 20 +++++++- + fs/bcachefs/btree_gc.c | 38 ++++++++++----- + fs/bcachefs/buckets.c | 102 +++++++++++++++++++++++++---------------- + fs/bcachefs/buckets.h | 7 ++- + fs/bcachefs/journal_io.c | 37 +++++++++++++++ + fs/bcachefs/recovery.c | 21 +++++++++ + fs/bcachefs/super-io.c | 22 ++++++++- + fs/bcachefs/super.c | 37 +++++++++------ + 10 files changed, 219 insertions(+), 75 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index b77b34366f04..476ddac4b266 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -350,10 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + return ret; + } + +- percpu_down_write(&c->mark_lock); +- bch2_dev_usage_from_buckets(c); +- percpu_up_write(&c->mark_lock); +- + return 0; + } + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index b8ba708c4a0d..04e0feb11609 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -430,7 +430,9 @@ struct bch_dev { + unsigned long *buckets_nouse; + struct rw_semaphore bucket_lock; + +- struct bch_dev_usage __percpu *usage[2]; ++ struct bch_dev_usage *usage_base; ++ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; ++ struct bch_dev_usage __percpu *usage_gc; + + /* Allocator: */ + struct task_struct __rcu *alloc_thread; +@@ -583,6 +585,8 @@ struct bch_fs { + + struct journal_entry_res replicas_journal_res; + ++ struct journal_entry_res dev_usage_journal_res; ++ + struct bch_disk_groups_cpu __rcu *disk_groups; + + struct bch_opts opts; +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 2df1949dc9da..30e77190d97a 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1505,7 +1505,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) + x(blacklist_v2, 4) \ + x(usage, 5) \ + x(data_usage, 6) \ +- x(clock, 7) ++ x(clock, 7) \ ++ x(dev_usage, 8) + + enum { + #define x(f, nr) BCH_JSET_ENTRY_##f = nr, +@@ -1560,6 +1561,23 @@ struct jset_entry_clock { + __le64 time; + } __attribute__((packed)); + ++struct jset_entry_dev_usage_type { ++ __le64 buckets; ++ __le64 sectors; ++ __le64 fragmented; ++} __attribute__((packed)); ++ ++struct jset_entry_dev_usage { ++ struct jset_entry entry; ++ __le32 dev; ++ __u32 pad; ++ ++ __le64 buckets_ec; ++ __le64 buckets_unavailable; ++ ++ struct jset_entry_dev_usage_type d[]; ++} __attribute__((packed)); ++ + /* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 303ace78ced6..c2c8a34f735d 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c) + ca->mi.nbuckets * sizeof(struct bucket)); + ca->buckets[1] = NULL; + +- free_percpu(ca->usage[1]); +- ca->usage[1] = NULL; ++ free_percpu(ca->usage_gc); ++ ca->usage_gc = NULL; + } + + free_percpu(c->usage_gc); +@@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c, + struct bch_dev *ca; + bool verify = (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); +- unsigned i; ++ unsigned i, dev; + int ret = 0; + + #define copy_field(_f, _msg, ...) 
\ +@@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c, + } + } + +- for_each_member_device(ca, c, i) { ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); ++ ++ for_each_member_device(ca, c, dev) { + struct bucket_array *dst = __bucket_array(ca, 0); + struct bucket_array *src = __bucket_array(ca, 1); + size_t b; +@@ -801,12 +804,23 @@ static int bch2_gc_done(struct bch_fs *c, + + dst->b[b].oldest_gen = src->b[b].oldest_gen; + } +- }; + +- for (i = 0; i < ARRAY_SIZE(c->usage); i++) +- bch2_fs_usage_acc_to_base(c, i); ++ { ++ struct bch_dev_usage *dst = ca->usage_base; ++ struct bch_dev_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) ca->usage_gc, ++ dev_usage_u64s()); ++ ++ copy_dev_field(buckets_ec, "buckets_ec"); ++ copy_dev_field(buckets_unavailable, "buckets_unavailable"); + +- bch2_dev_usage_from_buckets(c); ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); ++ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); ++ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); ++ } ++ } ++ }; + + { + unsigned nr = fs_usage_u64s(c); +@@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c) + + for_each_member_device(ca, c, i) { + BUG_ON(ca->buckets[1]); +- BUG_ON(ca->usage[1]); ++ BUG_ON(ca->usage_gc); + + ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), +@@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c) + return -ENOMEM; + } + +- ca->usage[1] = alloc_percpu(struct bch_dev_usage); +- if (!ca->usage[1]) { +- bch_err(c, "error allocating ca->usage[gc]"); ++ ca->usage_gc = alloc_percpu(struct bch_dev_usage); ++ if (!ca->usage_gc) { ++ bch_err(c, "error allocating ca->usage_gc"); + percpu_ref_put(&ca->ref); + return -ENOMEM; + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 0bce4bfff9e8..ef79f5cac64d 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) + void bch2_fs_usage_initialize(struct bch_fs *c) + { + struct bch_fs_usage *usage; ++ struct bch_dev *ca; + unsigned i; + + percpu_down_write(&c->mark_lock); +@@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c) + fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); + } + ++ for_each_member_device(ca, c, i) { ++ struct bch_dev_usage dev = bch2_dev_usage_read(ca); ++ ++ usage->hidden += (dev.d[BCH_DATA_sb].buckets + ++ dev.d[BCH_DATA_journal].buckets) * ++ ca->mi.bucket_size; ++ } ++ + percpu_up_write(&c->mark_lock); + } + +@@ -189,14 +198,27 @@ out_pool: + return ret; + } + ++static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, ++ unsigned journal_seq, ++ bool gc) ++{ ++ return this_cpu_ptr(gc ++ ? 
ca->usage_gc ++ : ca->usage[journal_seq & JOURNAL_BUF_MASK]); ++} ++ + struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) + { ++ struct bch_fs *c = ca->fs; + struct bch_dev_usage ret; ++ unsigned seq, i, u64s = dev_usage_u64s(); + +- memset(&ret, 0, sizeof(ret)); +- acc_u64s_percpu((u64 *) &ret, +- (u64 __percpu *) ca->usage[0], +- sizeof(ret) / sizeof(u64)); ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); ++ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) ++ acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; + } +@@ -261,7 +283,8 @@ retry: + + void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) + { +- unsigned u64s = fs_usage_u64s(c); ++ struct bch_dev *ca; ++ unsigned i, u64s = fs_usage_u64s(c); + + BUG_ON(idx >= ARRAY_SIZE(c->usage)); + +@@ -272,6 +295,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) + (u64 __percpu *) c->usage[idx], u64s); + percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, NULL) { ++ u64s = dev_usage_u64s(); ++ ++ acc_u64s_percpu((u64 *) ca->usage_base, ++ (u64 __percpu *) ca->usage[idx], u64s); ++ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); ++ } ++ rcu_read_unlock(); ++ + write_seqcount_end(&c->usage_lock); + preempt_enable(); + } +@@ -454,14 +487,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, + static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bch_fs_usage *fs_usage, + struct bucket_mark old, struct bucket_mark new, +- bool gc) ++ u64 journal_seq, bool gc) + { + struct bch_dev_usage *u; + + percpu_rwsem_assert_held(&c->mark_lock); + + preempt_disable(); +- u = this_cpu_ptr(ca->usage[gc]); ++ u = dev_usage_ptr(ca, journal_seq, gc); + + if (bucket_type(old)) + account_bucket(fs_usage, u, bucket_type(old), +@@ -491,31 +524,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + bch2_wake_allocator(ca); + } + +-__flatten +-void bch2_dev_usage_from_buckets(struct bch_fs *c) +-{ +- struct bch_dev *ca; +- struct bucket_mark old = { .v.counter = 0 }; +- struct bucket_array *buckets; +- struct bucket *g; +- unsigned i; +- int cpu; +- +- c->usage_base->hidden = 0; +- +- for_each_member_device(ca, c, i) { +- for_each_possible_cpu(cpu) +- memset(per_cpu_ptr(ca->usage[0], cpu), 0, +- sizeof(*ca->usage[0])); +- +- buckets = bucket_array(ca); +- +- for_each_bucket(g, buckets) +- bch2_dev_usage_update(c, ca, c->usage_base, +- old, g->mark, false); +- } +-} +- + static inline int update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, +@@ -653,7 +661,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + new.owned_by_allocator = owned_by_allocator; + })); + +- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ /* ++ * XXX: this is wrong, this means we'll be doing updates to the percpu ++ * buckets_alloc counter that don't have an open journal buffer and ++ * we'll race with the machinery that accumulates that to ca->usage_base ++ */ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc); + + BUG_ON(!gc && + !owned_by_allocator && !old.owned_by_allocator); +@@ -717,7 +730,7 @@ static int bch2_mark_alloc(struct bch_fs *c, + } + })); + +- bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); ++ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc); + + 
g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; +@@ -782,7 +795,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + + if (c) + bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), +- old, new, gc); ++ old, new, 0, gc); + + return 0; + } +@@ -963,7 +976,7 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + +- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); + return 0; + } + +@@ -1030,7 +1043,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, + old.v.counter, + new.v.counter)) != old.v.counter); + +- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); + + BUG_ON(!gc && bucket_became_unavailable(old, new)); + +@@ -2396,13 +2409,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca) + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); + +- free_percpu(ca->usage[0]); ++ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) ++ free_percpu(ca->usage[i]); ++ kfree(ca->usage_base); + } + + int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) + { +- if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) ++ unsigned i; ++ ++ ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); ++ if (!ca->usage_base) + return -ENOMEM; + ++ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { ++ ca->usage[i] = alloc_percpu(struct bch_dev_usage); ++ if (!ca->usage[i]) ++ return -ENOMEM; ++ } ++ + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; + } +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 659f1ba01b6f..6d15c455e7cc 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -162,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, + + struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); + +-void bch2_dev_usage_from_buckets(struct bch_fs *); +- + static inline u64 __dev_buckets_available(struct bch_dev *ca, + struct bch_dev_usage stats) + { +@@ -207,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c) + READ_ONCE(c->replicas.nr); + } + ++static inline unsigned dev_usage_u64s(void) ++{ ++ return sizeof(struct bch_dev_usage) / sizeof(u64); ++} ++ + void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); + struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 39b8cbe178b0..2abca1644cdc 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -453,6 +453,43 @@ fsck_err: + return ret; + } + ++static int journal_entry_validate_dev_usage(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */ ++ unsigned dev; ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < expected, ++ c, "invalid journal entry dev usage: bad size (%u < %u)", ++ bytes, expected)) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++ dev = le32_to_cpu(u->dev); ++ ++ if (journal_entry_err_on(!bch2_dev_exists2(c, dev), ++ c, "invalid journal entry dev usage: bad dev")) { ++ 
journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++ if (journal_entry_err_on(u->pad, ++ c, "invalid journal entry dev usage: bad pad")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ + struct jset_entry_ops { + int (*validate)(struct bch_fs *, struct jset *, + struct jset_entry *, int); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 55f7771e11c8..7ba098adcab9 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c, + case BCH_JSET_ENTRY_data_usage: { + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); ++ + ret = bch2_replicas_set_usage(c, &u->r, + le64_to_cpu(u->v)); + break; + } ++ case BCH_JSET_ENTRY_dev_usage: { ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / ++ sizeof(struct jset_entry_dev_usage_type); ++ unsigned i; ++ ++ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); ++ ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); ++ ++ for (i = 0; i < nr_types; i++) { ++ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); ++ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); ++ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); ++ } ++ ++ break; ++ } + case BCH_JSET_ENTRY_blacklist: { + struct jset_entry_blacklist *bl_entry = + container_of(entry, struct jset_entry_blacklist, entry); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 068262917e10..a510a25e2edb 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -983,7 +983,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) + { +- unsigned i; ++ struct bch_dev *ca; ++ unsigned i, dev; + + percpu_down_write(&c->mark_lock); + +@@ -1037,6 +1038,25 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, + memcpy(&u->r, e, replicas_entry_bytes(e)); + } + ++ for_each_member_device(ca, c, dev) { ++ unsigned b = sizeof(struct jset_entry_dev_usage) + ++ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; ++ struct jset_entry_dev_usage *u = ++ container_of(jset_entry_init(end, b), ++ struct jset_entry_dev_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_dev_usage; ++ u->dev = cpu_to_le32(dev); ++ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); ++ u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable); ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); ++ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); ++ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); ++ } ++ } ++ + percpu_up_write(&c->mark_lock); + + for (i = 0; i < 2; i++) { +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 0d35df66e8cf..c0b8e9cc9d5a 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -148,6 +148,22 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) + return c; + } + ++static void bch2_dev_usage_journal_reserve(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i, nr = 0, u64s = ++ (sizeof(struct jset_entry_dev_usage) + ++ sizeof(struct jset_entry_dev_usage_type) * 
BCH_DATA_NR); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, NULL) ++ nr++; ++ rcu_read_unlock(); ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->dev_usage_journal_res, u64s * nr); ++} ++ + /* Filesystem RO/RW: */ + + /* +@@ -772,6 +788,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_fs_fsio_init(c)) + goto err; + ++ bch2_dev_usage_journal_reserve(c); ++ + mi = bch2_sb_get_members(c->disk_sb.sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && +@@ -1510,6 +1528,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + + mutex_unlock(&c->sb_lock); + up_write(&c->state_lock); ++ ++ bch2_dev_usage_journal_reserve(c); + return 0; + err: + if (ca->mi.state == BCH_MEMBER_STATE_RW && +@@ -1519,19 +1539,6 @@ err: + return ret; + } + +-static void dev_usage_clear(struct bch_dev *ca) +-{ +- struct bucket_array *buckets; +- +- percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); +- +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); +- +- memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); +- up_read(&ca->bucket_lock); +-} +- + /* Add new device to running filesystem: */ + int bch2_dev_add(struct bch_fs *c, const char *path) + { +@@ -1589,8 +1596,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + if (ret) + goto err; + +- dev_usage_clear(ca); +- + down_write(&c->state_lock); + mutex_lock(&c->sb_lock); + +@@ -1644,6 +1649,8 @@ have_slot: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + ++ bch2_dev_usage_journal_reserve(c); ++ + err = "error marking superblock"; + ret = bch2_trans_mark_dev_sb(c, NULL, ca); + if (ret) +-- +cgit v1.2.3 + + +From 35b5d33d97e0122092109d0b20c4e2ed08415c8d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 2 Feb 2021 17:08:54 -0500 +Subject: bcachefs: Include device in btree IO error messages + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 76 +++++++++++++++++++++++++++----------------------- + fs/bcachefs/btree_io.h | 3 +- + fs/bcachefs/debug.c | 2 +- + 3 files changed, 44 insertions(+), 37 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 91e578b2d8c0..8a4fbdf47d23 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -608,11 +608,16 @@ static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, + } + + static void btree_err_msg(struct printbuf *out, struct bch_fs *c, ++ struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned offset, int write) + { +- pr_buf(out, "error validating btree node %sat btree ", +- write ? "before write " : ""); ++ pr_buf(out, "error validating btree node "); ++ if (write) ++ pr_buf(out, "before write "); ++ if (ca) ++ pr_buf(out, "on %s ", ca->name); ++ pr_buf(out, "at btree "); + btree_pos_to_text(out, c, b); + + pr_buf(out, "\n node offset %u", b->written); +@@ -631,7 +636,7 @@ enum btree_validate_ret { + BTREE_RETRY_READ = 64, + }; + +-#define btree_err(type, c, b, i, msg, ...) \ ++#define btree_err(type, c, ca, b, i, msg, ...) \ + ({ \ + __label__ out; \ + char _buf[300]; \ +@@ -642,7 +647,7 @@ enum btree_validate_ret { + if (buf2) \ + out = _PBUF(buf2, 4986); \ + \ +- btree_err_msg(&out, c, b, i, b->written, write); \ ++ btree_err_msg(&out, c, ca, b, i, b->written, write); \ + pr_buf(&out, ": " msg, ##__VA_ARGS__); \ + \ + if (type == BTREE_ERR_FIXABLE && \ +@@ -691,9 +696,9 @@ out: \ + + #define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) + +-static int validate_bset(struct bch_fs *c, struct btree *b, +- struct bset *i, unsigned sectors, +- int write, bool have_retry) ++static int validate_bset(struct bch_fs *c, struct bch_dev *ca, ++ struct btree *b, struct bset *i, ++ unsigned sectors, int write, bool have_retry) + { + unsigned version = le16_to_cpu(i->version); + const char *err; +@@ -702,18 +707,18 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + btree_err_on((version != BCH_BSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, +- BTREE_ERR_FATAL, c, b, i, ++ BTREE_ERR_FATAL, c, ca, b, i, + "unsupported bset version"); + + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, +- BTREE_ERR_FIXABLE, c, b, i, ++ BTREE_ERR_FIXABLE, c, ca, b, i, + "bset past end of btree node")) { + i->u64s = 0; + return 0; + } + + btree_err_on(b->written && !i->u64s, +- BTREE_ERR_FIXABLE, c, b, i, ++ BTREE_ERR_FIXABLE, c, ca, b, i, + "empty bset"); + + if (!b->written) { +@@ -727,16 +732,16 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + + /* XXX endianness */ + btree_err_on(bp->seq != bn->keys.seq, +- BTREE_ERR_MUST_RETRY, c, b, NULL, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect sequence number (wrong btree node)"); + } + + btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, +- BTREE_ERR_MUST_RETRY, c, b, i, ++ BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect btree id"); + + btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, +- BTREE_ERR_MUST_RETRY, c, b, i, ++ BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect level"); + + if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { +@@ -759,7 +764,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + } + + btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), +- BTREE_ERR_MUST_RETRY, c, b, NULL, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect min_key: got %llu:%llu should be %llu:%llu", + b->data->min_key.inode, + b->data->min_key.offset, +@@ -768,7 +773,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + } + + btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), +- BTREE_ERR_MUST_RETRY, c, b, i, ++ BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect max key %llu:%llu", + bn->max_key.inode, + bn->max_key.offset); +@@ -793,7 +798,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, + #endif + err = bch2_bkey_format_validate(&bn->format); + btree_err_on(err, +- BTREE_ERR_FATAL, c, b, i, ++ BTREE_ERR_FATAL, c, ca, b, i, + "invalid bkey format: %s", err); + + compat_bformat(b->c.level, b->c.btree_id, version, +@@ -825,14 +830,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + const char *invalid; + + if (btree_err_on(bkey_next(k) > vstruct_last(i), +- BTREE_ERR_FIXABLE, c, b, i, ++ BTREE_ERR_FIXABLE, c, NULL, b, i, + "key extends past end of bset")) { + i->u64s = cpu_to_le16((u64 *) k - i->_data); + break; + } + + if (btree_err_on(k->format > KEY_FORMAT_CURRENT, +- BTREE_ERR_FIXABLE, c, b, i, ++ BTREE_ERR_FIXABLE, c, NULL, b, i, + "invalid bkey format %u", k->format)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), +@@ -855,7 +860,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); +- btree_err(BTREE_ERR_FIXABLE, c, b, i, ++ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "invalid bkey: %s\n%s", invalid, buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); +@@ -889,7 +894,7 @@ 
static int validate_bset_keys(struct bch_fs *c, struct btree *b, + bch2_bkey_to_text(&PBUF(buf2), u.k); + + bch2_dump_bset(c, b, i, 0); +- btree_err(BTREE_ERR_FATAL, c, b, i, ++ btree_err(BTREE_ERR_FATAL, c, NULL, b, i, + "keys out of order: %s > %s", + buf1, buf2); + /* XXX: repair this */ +@@ -902,7 +907,8 @@ fsck_err: + return ret; + } + +-int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) ++int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ++ struct btree *b, bool have_retry) + { + struct btree_node_entry *bne; + struct sort_iter *iter; +@@ -919,15 +925,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + iter->size = (btree_blocks(c) + 1) * 2; + + if (bch2_meta_read_fault("btree")) +- btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, ++ btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "dynamic fault"); + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), +- BTREE_ERR_MUST_RETRY, c, b, NULL, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "bad magic"); + + btree_err_on(!b->data->keys.seq, +- BTREE_ERR_MUST_RETRY, c, b, NULL, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "bad btree header"); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { +@@ -935,7 +941,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(b->data->keys.seq != bp->seq, +- BTREE_ERR_MUST_RETRY, c, b, NULL, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "got wrong btree node (seq %llx want %llx)", + b->data->keys.seq, bp->seq); + } +@@ -950,7 +956,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + i = &b->data->keys; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), +- BTREE_ERR_WANT_RETRY, c, b, i, ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); + +@@ -958,7 +964,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + + btree_err_on(bch2_crc_cmp(csum, b->data->csum), +- BTREE_ERR_WANT_RETRY, c, b, i, ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); + + bset_encrypt(c, i, b->written << 9); +@@ -978,7 +984,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + break; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), +- BTREE_ERR_WANT_RETRY, c, b, i, ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); + +@@ -986,7 +992,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + btree_err_on(bch2_crc_cmp(csum, bne->csum), +- BTREE_ERR_WANT_RETRY, c, b, i, ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); + + bset_encrypt(c, i, b->written << 9); +@@ -994,7 +1000,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + sectors = vstruct_sectors(bne, c->block_bits); + } + +- ret = validate_bset(c, b, i, sectors, ++ ret = validate_bset(c, ca, b, i, sectors, + READ, have_retry); + if (ret) + goto fsck_err; +@@ -1016,7 +1022,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + true); + + btree_err_on(blacklisted && first, +- BTREE_ERR_FIXABLE, c, b, i, ++ BTREE_ERR_FIXABLE, c, ca, b, i, + "first btree node bset has blacklisted journal seq"); + if (blacklisted && !first) + continue; +@@ -1033,7 +1039,7 @@ int 
bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + bset_byte_offset(b, bne) < btree_bytes(c); + bne = (void *) bne + block_bytes(c)) + btree_err_on(bne->keys.seq == b->data->keys.seq, +- BTREE_ERR_WANT_RETRY, c, b, NULL, ++ BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + "found bset signature after last bset"); + + sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); +@@ -1068,7 +1074,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); +- btree_err(BTREE_ERR_FIXABLE, c, b, i, ++ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "invalid bkey %s: %s", buf, invalid); + + btree_keys_account_key_drop(&b->nr, 0, k); +@@ -1159,7 +1165,7 @@ start: + &failed, &rb->pick) > 0; + + if (!bio->bi_status && +- !bch2_btree_node_read_done(c, b, can_retry)) ++ !bch2_btree_node_read_done(c, ca, b, can_retry)) + break; + + if (!can_retry) { +@@ -1465,7 +1471,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) + return -1; + +- ret = validate_bset(c, b, i, sectors, WRITE, false) ?: ++ ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?: + validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); + if (ret) { + bch2_inconsistent_error(c); +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 3b61555ef906..89685bd57fc0 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -134,7 +134,8 @@ void bch2_btree_build_aux_trees(struct btree *); + void bch2_btree_init_next(struct bch_fs *, struct btree *, + struct btree_iter *); + +-int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); ++int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, ++ struct btree *, bool); + void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); + int bch2_btree_root_read(struct bch_fs *, enum btree_id, + const struct bkey_i *, unsigned); +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index bbe3fefa2651..06dbca32e189 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -79,7 +79,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + + memcpy(n_ondisk, n_sorted, btree_bytes(c)); + +- if (bch2_btree_node_read_done(c, v, false)) ++ if (bch2_btree_node_read_done(c, ca, v, false)) + goto out; + + n_sorted = c->verify_data->data; +-- +cgit v1.2.3 + + +From 4ac886b0f94eddb4450c0aeedf202c9d6c7cde3a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Feb 2021 13:10:55 -0500 +Subject: bcachefs: Fixes/improvements for journal entry reservations + +This fixes some arithmetic bugs in "bcachefs: Journal updates to dev +usage" - additionally, it cleans things up by switching everything that +goes in every journal entry to the journal_entry_res mechanism. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 ++- + fs/bcachefs/journal.c | 7 ------- + fs/bcachefs/replicas.c | 5 +++-- + fs/bcachefs/super.c | 15 +++++++++++---- + 4 files changed, 16 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 04e0feb11609..a9a631a74074 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -583,8 +583,9 @@ struct bch_fs { + struct bch_replicas_cpu replicas_gc; + struct mutex replicas_gc_lock; + ++ struct journal_entry_res btree_root_journal_res; + struct journal_entry_res replicas_journal_res; +- ++ struct journal_entry_res clock_journal_res; + struct journal_entry_res dev_usage_journal_res; + + struct bch_disk_groups_cpu __rcu *disk_groups; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index e41f02773dd0..c4cb4f05a66f 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1117,13 +1117,6 @@ int bch2_fs_journal_init(struct journal *j) + j->write_delay_ms = 1000; + j->reclaim_delay_ms = 100; + +- /* Btree roots: */ +- j->entry_u64s_reserved += +- BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); +- +- j->entry_u64s_reserved += +- 2 * (sizeof(struct jset_entry_clock) / sizeof(u64)); +- + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 3970c442f199..0330204fb4bf 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -1065,8 +1065,9 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) + + int bch2_fs_replicas_init(struct bch_fs *c) + { +- c->journal.entry_u64s_reserved += +- reserve_journal_replicas(c, &c->replicas); ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->replicas_journal_res, ++ reserve_journal_replicas(c, &c->replicas)); + + return replicas_table_update(c, &c->replicas); + } +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index c0b8e9cc9d5a..347e39d0bc15 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -152,8 +152,9 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c) + { + struct bch_dev *ca; + unsigned i, nr = 0, u64s = +- (sizeof(struct jset_entry_dev_usage) + +- sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR); ++ ((sizeof(struct jset_entry_dev_usage) + ++ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / ++ sizeof(u64); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) +@@ -788,14 +789,20 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_fs_fsio_init(c)) + goto err; + +- bch2_dev_usage_journal_reserve(c); +- + mi = bch2_sb_get_members(c->disk_sb.sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && + bch2_dev_alloc(c, i)) + goto err; + ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->btree_root_journal_res, ++ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); ++ bch2_dev_usage_journal_reserve(c); ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->clock_journal_res, ++ (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); ++ + mutex_lock(&bch_fs_list_lock); + err = bch2_fs_online(c); + mutex_unlock(&bch_fs_list_lock); +-- +cgit v1.2.3 + + +From 7d992a926b3ba1057fdcbfe958832581171106b6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Feb 2021 15:31:17 -0500 +Subject: bcachefs: Run fsck if BCH_FEATURE_alloc_v2 isn't set + +We're using BCH_FEATURE_alloc_v2 to also gate journalling 
updates to dev +usage - we don't have the code for reconstructing this from buckets +anymore, so we need to run fsck if it's not set. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 7ba098adcab9..8560023b4c7a 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1088,6 +1088,13 @@ int bch2_fs_recovery(struct bch_fs *c) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); + ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { ++ bch_info(c, "alloc_v2 feature bit not set, fsck required"); ++ c->opts.fsck = true; ++ c->opts.fix_errors = FSCK_OPT_YES; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_alloc_v2; ++ } ++ + if (!c->replicas.entries || + c->opts.rebuild_replicas) { + bch_info(c, "building replicas info"); +-- +cgit v1.2.3 + + +From bb157c1fe2f96757768874c2cb29f997beb6d070 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 6 Feb 2021 23:17:26 -0500 +Subject: bcachefs: Redo checks for sufficient devices + +When the replicas mechanism was added, for tracking data by which drives +it's replicated on, the check for whether we have sufficient devices was +never updated to make use of it. This patch finally does that. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_ioctl.h | 3 ++ + fs/bcachefs/opts.h | 5 +++ + fs/bcachefs/replicas.c | 98 ++++++++++++-------------------------------- + fs/bcachefs/replicas.h | 16 ++------ + fs/bcachefs/super-io.c | 7 ++-- + fs/bcachefs/super.c | 23 +++++------ + fs/bcachefs/sysfs.c | 9 ---- + 7 files changed, 51 insertions(+), 110 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +index 0e626b098d91..f1cb5d405129 100644 +--- a/fs/bcachefs/bcachefs_ioctl.h ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -14,6 +14,9 @@ + #define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) + #define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) + ++#define BCH_FORCE_IF_LOST \ ++ (BCH_FORCE_IF_DATA_LOST| \ ++ BCH_FORCE_IF_METADATA_LOST) + #define BCH_FORCE_IF_DEGRADED \ + (BCH_FORCE_IF_DATA_DEGRADED| \ + BCH_FORCE_IF_METADATA_DEGRADED) +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index d835a85338c6..c123c42630a6 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -222,6 +222,11 @@ enum opt_type { + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Allow mounting in degraded mode") \ ++ x(very_degraded, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Allow mounting in when data will be missing") \ + x(discard, u8, \ + OPT_MOUNT|OPT_DEVICE, \ + OPT_BOOL(), \ +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 0330204fb4bf..be73b458e4f6 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -958,94 +958,48 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { + + /* Query replicas: */ + +-struct replicas_status __bch2_replicas_status(struct bch_fs *c, +- struct bch_devs_mask online_devs) ++bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, ++ unsigned flags, bool print) + { +- struct bch_sb_field_members *mi; + struct bch_replicas_entry *e; +- unsigned i, nr_online, nr_offline; +- struct replicas_status ret; +- +- memset(&ret, 0, sizeof(ret)); +- +- for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) +- ret.replicas[i].redundancy = INT_MAX; +- +- mi = bch2_sb_get_members(c->disk_sb.sb); ++ bool ret = true; + + 
percpu_down_read(&c->mark_lock); +- + for_each_cpu_replicas_entry(&c->replicas, e) { +- if (e->data_type >= ARRAY_SIZE(ret.replicas)) +- panic("e %p data_type %u\n", e, e->data_type); ++ unsigned i, nr_online = 0, dflags = 0; ++ bool metadata = e->data_type < BCH_DATA_user; + +- nr_online = nr_offline = 0; ++ for (i = 0; i < e->nr_devs; i++) ++ nr_online += test_bit(e->devs[i], devs.d); + +- for (i = 0; i < e->nr_devs; i++) { +- BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, +- e->devs[i])); ++ if (nr_online < e->nr_required) ++ dflags |= metadata ++ ? BCH_FORCE_IF_METADATA_LOST ++ : BCH_FORCE_IF_DATA_LOST; + +- if (test_bit(e->devs[i], online_devs.d)) +- nr_online++; +- else +- nr_offline++; +- } ++ if (nr_online < e->nr_devs) ++ dflags |= metadata ++ ? BCH_FORCE_IF_METADATA_DEGRADED ++ : BCH_FORCE_IF_DATA_DEGRADED; + +- ret.replicas[e->data_type].redundancy = +- min(ret.replicas[e->data_type].redundancy, +- (int) nr_online - (int) e->nr_required); ++ if (dflags & ~flags) { ++ if (print) { ++ char buf[100]; + +- ret.replicas[e->data_type].nr_offline = +- max(ret.replicas[e->data_type].nr_offline, +- nr_offline); +- } ++ bch2_replicas_entry_to_text(&PBUF(buf), e); ++ bch_err(c, "insufficient devices online (%u) for replicas entry %s", ++ nr_online, buf); ++ } ++ ret = false; ++ break; ++ } + ++ } + percpu_up_read(&c->mark_lock); + +- for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) +- if (ret.replicas[i].redundancy == INT_MAX) +- ret.replicas[i].redundancy = 0; +- + return ret; + } + +-struct replicas_status bch2_replicas_status(struct bch_fs *c) +-{ +- return __bch2_replicas_status(c, bch2_online_devs(c)); +-} +- +-static bool have_enough_devs(struct replicas_status s, +- enum bch_data_type type, +- bool force_if_degraded, +- bool force_if_lost) +-{ +- return (!s.replicas[type].nr_offline || force_if_degraded) && +- (s.replicas[type].redundancy >= 0 || force_if_lost); +-} +- +-bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) +-{ +- return (have_enough_devs(s, BCH_DATA_journal, +- flags & BCH_FORCE_IF_METADATA_DEGRADED, +- flags & BCH_FORCE_IF_METADATA_LOST) && +- have_enough_devs(s, BCH_DATA_btree, +- flags & BCH_FORCE_IF_METADATA_DEGRADED, +- flags & BCH_FORCE_IF_METADATA_LOST) && +- have_enough_devs(s, BCH_DATA_user, +- flags & BCH_FORCE_IF_DATA_DEGRADED, +- flags & BCH_FORCE_IF_DATA_LOST)); +-} +- +-int bch2_replicas_online(struct bch_fs *c, bool meta) +-{ +- struct replicas_status s = bch2_replicas_status(c); +- +- return (meta +- ? 
min(s.replicas[BCH_DATA_journal].redundancy, +- s.replicas[BCH_DATA_btree].redundancy) +- : s.replicas[BCH_DATA_user].redundancy) + 1; +-} +- + unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) + { + struct bch_replicas_entry *e; +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +index a16ef23bde8a..9c8fd3d98247 100644 +--- a/fs/bcachefs/replicas.h ++++ b/fs/bcachefs/replicas.h +@@ -39,19 +39,9 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, + e->devs[0] = dev; + } + +-struct replicas_status { +- struct { +- int redundancy; +- unsigned nr_offline; +- } replicas[BCH_DATA_NR]; +-}; +- +-struct replicas_status __bch2_replicas_status(struct bch_fs *, +- struct bch_devs_mask); +-struct replicas_status bch2_replicas_status(struct bch_fs *); +-bool bch2_have_enough_devs(struct replicas_status, unsigned); +- +-int bch2_replicas_online(struct bch_fs *, bool); ++bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, ++ unsigned, bool); ++ + unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); + + int bch2_replicas_gc_end(struct bch_fs *, int); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index a510a25e2edb..47a0e20668e3 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -767,15 +767,13 @@ int bch2_write_super(struct bch_fs *c) + nr_wrote = dev_mask_nr(&sb_written); + + can_mount_with_written = +- bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), +- BCH_FORCE_IF_DEGRADED); ++ bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); + + for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) + sb_written.d[i] = ~sb_written.d[i]; + + can_mount_without_written = +- bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), +- BCH_FORCE_IF_DEGRADED); ++ bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); + + /* + * If we would be able to mount _without_ the devices we successfully +@@ -786,6 +784,7 @@ int bch2_write_super(struct bch_fs *c) + * mount with the devices we did successfully write to: + */ + if (bch2_fs_fatal_err_on(!nr_wrote || ++ !can_mount_with_written || + (can_mount_without_written && + !can_mount_with_written), c, + "Unable to write superblock to sufficient devices")) +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 347e39d0bc15..8e6b4413d820 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1259,7 +1259,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) + { + struct bch_devs_mask new_online_devs; +- struct replicas_status s; + struct bch_dev *ca2; + int i, nr_rw = 0, required; + +@@ -1295,9 +1294,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, + new_online_devs = bch2_online_devs(c); + __clear_bit(ca->dev_idx, new_online_devs.d); + +- s = __bch2_replicas_status(c, new_online_devs); +- +- return bch2_have_enough_devs(s, flags); ++ return bch2_have_enough_devs(c, new_online_devs, flags, false); + default: + BUG(); + } +@@ -1305,14 +1302,18 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, + + static bool bch2_fs_may_start(struct bch_fs *c) + { +- struct replicas_status s; + struct bch_sb_field_members *mi; + struct bch_dev *ca; +- unsigned i, flags = c->opts.degraded +- ? 
BCH_FORCE_IF_DEGRADED +- : 0; ++ unsigned i, flags = 0; ++ ++ if (c->opts.very_degraded) ++ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; + +- if (!c->opts.degraded) { ++ if (c->opts.degraded) ++ flags |= BCH_FORCE_IF_DEGRADED; ++ ++ if (!c->opts.degraded && ++ !c->opts.very_degraded) { + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + +@@ -1332,9 +1333,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) + mutex_unlock(&c->sb_lock); + } + +- s = bch2_replicas_status(c); +- +- return bch2_have_enough_devs(s, flags); ++ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); + } + + static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index f934f12bc677..bc4c3a77ea62 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -199,9 +199,6 @@ read_attribute(new_stripes); + + rw_attribute(pd_controllers_update_seconds); + +-read_attribute(meta_replicas_have); +-read_attribute(data_replicas_have); +- + read_attribute(io_timers_read); + read_attribute(io_timers_write); + +@@ -347,9 +344,6 @@ SHOW(bch2_fs) + + sysfs_print(promote_whole_extents, c->promote_whole_extents); + +- sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); +- sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); +- + /* Debugging: */ + + if (attr == &sysfs_alloc_debug) +@@ -520,9 +514,6 @@ struct attribute *bch2_fs_files[] = { + &sysfs_btree_node_size, + &sysfs_btree_cache_size, + +- &sysfs_meta_replicas_have, +- &sysfs_data_replicas_have, +- + &sysfs_journal_write_delay_ms, + &sysfs_journal_reclaim_delay_ms, + +-- +cgit v1.2.3 + + +From fbdf5f3b54ec13438fcb247a512e594c117e2e4a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Feb 2021 19:54:04 -0500 +Subject: bcachefs: Add flushed_seq_ondisk to journal_debug_to_text() + +Also, make the wait in bch2_journal_flush_seq() interruptible, not just +killable. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index c4cb4f05a66f..c0831ba51ef3 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -640,9 +640,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) + u64 start_time = local_clock(); + int ret, ret2; + +- ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); ++ ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + +- bch2_time_stats_update(j->flush_seq_time, start_time); ++ if (!ret) ++ bch2_time_stats_update(j->flush_seq_time, start_time); + + return ret ?: ret2 < 0 ? 
ret2 : 0; + } +@@ -1158,6 +1159,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + "seq:\t\t\t%llu\n" + "last_seq:\t\t%llu\n" + "last_seq_ondisk:\t%llu\n" ++ "flushed_seq_ondisk:\t%llu\n" + "prereserved:\t\t%u/%u\n" + "nr flush writes:\t%llu\n" + "nr noflush writes:\t%llu\n" +@@ -1170,6 +1172,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + journal_cur_seq(j), + journal_last_seq(j), + j->last_seq_ondisk, ++ j->flushed_seq_ondisk, + j->prereserved.reserved, + j->prereserved.remaining, + j->nr_flush_writes, +-- +cgit v1.2.3 + + +From ecea3a5a090532d727de244feacf59d960e43efd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Feb 2021 19:54:40 -0500 +Subject: bcachefs: Fix for hash_redo_key() in fsck + +It's possible we're calling hash_redo_key() because of a duplicate key - +easiest fix for that is to just not use BCH_HASH_SET_MUST_CREATE. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index df0f00f10bd7..c3e6137ffd75 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -193,7 +193,7 @@ static int hash_redo_key(const struct bch_hash_desc desc, + bch2_trans_update(trans, k_iter, &delete, 0); + + return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, +- tmp, BCH_HASH_SET_MUST_CREATE); ++ tmp, 0); + } + + static int fsck_hash_delete_at(struct btree_trans *trans, +-- +cgit v1.2.3 + + +From 1516de2909b4bfe7ebcdc4a1561a2427f24efd87 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 7 Feb 2021 18:52:13 -0500 +Subject: bcachefs: Simplify btree_iter_(next|prev)_leaf() + +There's no good reason for these functions to not be using +bch2_btree_iter_set_pos(). 
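
As a rough standalone model of the pattern this commit message describes - compute the target position, then funnel it through a single set-position helper instead of poking iterator internals - the following userspace sketch uses invented names (struct toy_iter, toy_iter_set_pos, bpos_successor, ...); it is not the bcachefs API, just the shape of the idea:

#include <stdbool.h>
#include <stdio.h>

struct bpos { unsigned long long inode, offset; };

static const struct bpos POS_MIN = { 0, 0 };
static const struct bpos POS_MAX = { ~0ULL, ~0ULL };

static int bpos_cmp(struct bpos l, struct bpos r)
{
        if (l.inode != r.inode)
                return l.inode < r.inode ? -1 : 1;
        if (l.offset != r.offset)
                return l.offset < r.offset ? -1 : 1;
        return 0;
}

static struct bpos bpos_successor(struct bpos p)
{
        if (p.offset != ~0ULL) {
                p.offset++;
        } else {
                p.offset = 0;
                p.inode++;
        }
        return p;
}

static struct bpos bpos_predecessor(struct bpos p)
{
        if (p.offset) {
                p.offset--;
        } else {
                p.offset = ~0ULL;
                p.inode--;
        }
        return p;
}

struct toy_iter {
        struct bpos pos;
        struct bpos node_min, node_max;  /* key range of the current leaf */
};

/* the one place where the iterator's position is allowed to change */
static void toy_iter_set_pos(struct toy_iter *iter, struct bpos new_pos)
{
        iter->pos = new_pos;
        /* a real iterator would also flag itself for re-traversal here */
}

/* advance past the current leaf: compute the target, then just set_pos() */
static bool toy_iter_next_leaf(struct toy_iter *iter)
{
        struct bpos next = iter->node_max;
        bool ret = bpos_cmp(next, POS_MAX) != 0;

        if (ret)
                next = bpos_successor(next);
        toy_iter_set_pos(iter, next);
        return ret;
}

/* rewind before the current leaf: same shape as the forward direction */
static bool toy_iter_prev_leaf(struct toy_iter *iter)
{
        struct bpos prev = iter->node_min;
        bool ret = bpos_cmp(prev, POS_MIN) != 0;

        if (ret)
                prev = bpos_predecessor(prev);
        toy_iter_set_pos(iter, prev);
        return ret;
}

int main(void)
{
        struct toy_iter it = {
                .pos      = { 1, 10 },
                .node_min = { 1, 0 },
                .node_max = { 1, 100 },
        };
        bool advanced = toy_iter_next_leaf(&it);
        bool rewound  = toy_iter_prev_leaf(&it);

        printf("advanced=%d rewound=%d pos=%llu:%llu\n",
               advanced, rewound, it.pos.inode, it.pos.offset);
        return 0;
}

Both directions share one code path for position changes, which is what lets the real helpers shrink in the hunk below.
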
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 27 +++++++++------------------ + 1 file changed, 9 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 401dfd2c450a..51480489de05 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1554,38 +1554,29 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + + static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) + { +- struct btree_iter_level *l = &iter->l[0]; +- bool ret; +- +- bkey_init(&iter->k); +- iter->k.p = iter->pos = l->b->key.k.p; ++ struct bpos next_pos = iter->l[0].b->key.k.p; ++ bool ret = bkey_cmp(next_pos, POS_MAX) != 0; + +- ret = bkey_cmp(iter->pos, POS_MAX) != 0; + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) +- iter->k.p = iter->pos = bkey_successor(iter->pos); ++ next_pos = bkey_successor(next_pos); + +- btree_iter_pos_changed(iter, 1); ++ bch2_btree_iter_set_pos(iter, next_pos); + return ret; + } + + static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + { +- struct btree_iter_level *l = &iter->l[0]; +- bool ret; +- +- bkey_init(&iter->k); +- iter->k.p = iter->pos = l->b->data->min_key; +- iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ struct bpos next_pos = iter->l[0].b->data->min_key; ++ bool ret = bkey_cmp(next_pos, POS_MIN) != 0; + +- ret = bkey_cmp(iter->pos, POS_MIN) != 0; + if (ret) { +- iter->k.p = iter->pos = bkey_predecessor(iter->pos); ++ next_pos = bkey_predecessor(next_pos); + + if (iter->flags & BTREE_ITER_IS_EXTENTS) +- iter->k.p = iter->pos = bkey_predecessor(iter->pos); ++ next_pos = bkey_predecessor(next_pos); + } + +- btree_iter_pos_changed(iter, -1); ++ bch2_btree_iter_set_pos(iter, next_pos); + return ret; + } + +-- +cgit v1.2.3 + + +From 2e209c1afb31e879fd63e7b82d62d647717511bb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 7 Feb 2021 20:16:21 -0500 +Subject: bcachefs: Kill bch2_btree_iter_set_pos_same_leaf() + +The only reason we were keeping this around was for +BTREE_INSERT_NOUNLOCK semantics - if bch2_btree_iter_set_pos() advances +to the next leaf node, it'll drop the lock on the node that we just +inserted to. + +But we don't rely on BTREE_INSERT_NOUNLOCK semantics for the extents +btree, just the inodes btree, and if we do need it for the extents btree +in the future we can do it more cleanly by cloning the iterator - this +lets us delete some special cases in the btree iterator code, which is +complicated enough as it is. 
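
To make the "clone the iterator" alternative mentioned above concrete, here is a hedged userspace sketch with invented names (toy_iter, toy_iter_clone, toy_iter_set_pos); it only models the idea that a copy can be repositioned while the original keeps whatever it had locked, and is not the bcachefs implementation:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct bpos { unsigned long long inode, offset; };

struct toy_iter {
        struct bpos pos;
        int holds_leaf_lock;    /* stands in for the node lock a caller relies on */
};

/* clone an iterator; the copy starts out identical to the original */
static struct toy_iter *toy_iter_clone(const struct toy_iter *src)
{
        struct toy_iter *copy = malloc(sizeof(*copy));

        if (copy)
                memcpy(copy, src, sizeof(*copy));
        return copy;
}

/* repositioning may walk off the leaf, dropping its lock - on this copy only */
static void toy_iter_set_pos(struct toy_iter *iter, struct bpos new_pos)
{
        iter->pos = new_pos;
        iter->holds_leaf_lock = 0;
}

int main(void)
{
        struct toy_iter src = { .pos = { 1, 10 }, .holds_leaf_lock = 1 };
        struct toy_iter *del = toy_iter_clone(&src);

        if (!del)
                return 1;

        /* the clone moves to the new position; src keeps its leaf locked */
        toy_iter_set_pos(del, (struct bpos) { .inode = 1, .offset = 20 });

        printf("src lock=%d clone pos=%llu:%llu\n",
               src.holds_leaf_lock, del->pos.inode, del->pos.offset);
        free(del);
        return 0;
}
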
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 37 +------------------------------------ + fs/bcachefs/btree_iter.h | 1 - + fs/bcachefs/btree_update_leaf.c | 8 ++------ + 3 files changed, 3 insertions(+), 43 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 51480489de05..2a439b267db5 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -516,12 +516,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, + if (!bch2_btree_node_relock(iter, level)) + return; + +- /* +- * Ideally this invariant would always be true, and hopefully in the +- * future it will be, but for now set_pos_same_leaf() breaks it: +- */ +- BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && +- !btree_iter_pos_in_node(iter, l->b)); ++ BUG_ON(!btree_iter_pos_in_node(iter, l->b)); + + /* + * node iterators don't use leaf node iterator: +@@ -1457,36 +1452,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + + /* Iterate across keys (in leaf nodes only) */ + +-void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) +-{ +- struct btree_iter_level *l = &iter->l[0]; +- +- EBUG_ON(iter->level != 0); +- EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); +- EBUG_ON(!btree_node_locked(iter, 0)); +- EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); +- +- bkey_init(&iter->k); +- iter->k.p = iter->pos = new_pos; +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +- +- btree_iter_advance_to_pos(iter, l, -1); +- +- /* +- * XXX: +- * keeping a node locked that's outside (even just outside) iter->pos +- * breaks __bch2_btree_node_lock(). This seems to only affect +- * bch2_btree_node_get_sibling so for now it's fixed there, but we +- * should try to get rid of this corner case. 
+- * +- * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) +- */ +- +- if (bch2_btree_node_iter_end(&l->iter) && +- btree_iter_pos_after_node(iter, l->b)) +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +-} +- + static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) + { + unsigned l = iter->level; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 9a7f8d0197ec..12c519ae2a60 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -174,7 +174,6 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); + + struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); + +-void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); + void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); + void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 967e1e4d9620..d09124fc46f2 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -951,12 +951,8 @@ retry: + + trans_for_each_iter(trans, iter) + if ((trans->iters_live & (1ULL << iter->idx)) && +- (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { +- if (trans->flags & BTREE_INSERT_NOUNLOCK) +- bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); +- else +- bch2_btree_iter_set_pos(iter, iter->pos_after_commit); +- } ++ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) ++ bch2_btree_iter_set_pos(iter, iter->pos_after_commit); + out: + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + +-- +cgit v1.2.3 + + +From 543c7df412f123320a8b58a97423a98242de6238 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 7 Feb 2021 21:28:58 -0500 +Subject: bcachefs: bch2_btree_iter_advance_pos() + +This adds a new common helper for advancing past the last key returned +by peek(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 41 +++++++++++++++++------------------------ + 1 file changed, 17 insertions(+), 24 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 2a439b267db5..f9d6f6ad8407 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1517,6 +1517,18 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + btree_iter_pos_changed(iter, cmp); + } + ++static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return false; ++ ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); ++ return true; ++} ++ + static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) + { + struct bpos next_pos = iter->l[0].b->key.k.p; +@@ -1623,14 +1635,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + { +- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ if (!bch2_btree_iter_advance_pos(iter)) + return bkey_s_c_null; + +- bch2_btree_iter_set_pos(iter, +- (iter->flags & BTREE_ITER_IS_EXTENTS) +- ? iter->k.p +- : bkey_successor(iter->k.p)); +- + return bch2_btree_iter_peek(iter); + } + +@@ -1682,10 +1689,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + k = __bch2_btree_iter_peek_with_updates(iter); + + if (k.k && bkey_deleted(k.k)) { +- bch2_btree_iter_set_pos(iter, +- (iter->flags & BTREE_ITER_IS_EXTENTS) +- ? 
iter->k.p +- : bkey_successor(iter->k.p)); ++ bch2_btree_iter_advance_pos(iter); + continue; + } + +@@ -1700,8 +1704,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + * iter->pos should always be equal to the key we just + * returned - except extents can straddle iter->pos: + */ +- if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || +- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + + iter->uptodate = BTREE_ITER_UPTODATE; +@@ -1710,14 +1713,9 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + + struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) + { +- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ if (!bch2_btree_iter_advance_pos(iter)) + return bkey_s_c_null; + +- bch2_btree_iter_set_pos(iter, +- (iter->flags & BTREE_ITER_IS_EXTENTS) +- ? iter->k.p +- : bkey_successor(iter->k.p)); +- + return bch2_btree_iter_peek_with_updates(iter); + } + +@@ -1882,14 +1880,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + + struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + { +- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ if (!bch2_btree_iter_advance_pos(iter)) + return bkey_s_c_null; + +- bch2_btree_iter_set_pos(iter, +- (iter->flags & BTREE_ITER_IS_EXTENTS) +- ? iter->k.p +- : bkey_successor(iter->k.p)); +- + return bch2_btree_iter_peek_slot(iter); + } + +-- +cgit v1.2.3 + + +From 71ec2e7de663b113702c142a02ae152282a4d714 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 7 Feb 2021 21:11:49 -0500 +Subject: bcachefs: Fix bch2_btree_iter_peek_prev() + +This makes bch2_btree_iter_peek_prev() and bch2_btree_iter_prev() +consistent with peek() and next(), w.r.t. iter->pos. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 48 +++++++++++++++++++++++++++++++----------------- + fs/bcachefs/fs-io.c | 10 ++++------ + 2 files changed, 35 insertions(+), 23 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f9d6f6ad8407..146ad2f531ab 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1519,13 +1519,27 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + + static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) + { +- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ struct bpos pos = iter->k.p; ++ ++ if (unlikely(!bkey_cmp(pos, POS_MAX))) ++ return false; ++ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ pos = bkey_successor(pos); ++ bch2_btree_iter_set_pos(iter, pos); ++ return true; ++} ++ ++static inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) ++{ ++ struct bpos pos = bkey_start_pos(&iter->k); ++ ++ if (unlikely(!bkey_cmp(pos, POS_MIN))) + return false; + +- bch2_btree_iter_set_pos(iter, +- (iter->flags & BTREE_ITER_IS_EXTENTS) +- ? 
iter->k.p +- : bkey_successor(iter->k.p)); ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ pos = bkey_predecessor(pos); ++ bch2_btree_iter_set_pos(iter, pos); + return true; + } + +@@ -1619,8 +1633,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + * iter->pos should always be equal to the key we just + * returned - except extents can straddle iter->pos: + */ +- if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || +- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + + iter->uptodate = BTREE_ITER_UPTODATE; +@@ -1743,7 +1756,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + return bkey_s_c_err(ret); + + k = __btree_iter_peek(iter, l); +- if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) ++ if (!k.k || ++ ((iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? bkey_cmp(bkey_start_pos(k.k), pos) >= 0 ++ : bkey_cmp(bkey_start_pos(k.k), pos) > 0)) + k = __btree_iter_prev(iter, l); + + if (likely(k.k)) +@@ -1754,8 +1770,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + } + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); +- iter->pos = bkey_start_pos(k.k); ++ ++ /* Extents can straddle iter->pos: */ ++ if (bkey_cmp(k.k->p, pos) < 0) ++ iter->pos = k.k->p; + iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify_level(iter, 0); + return k; + } + +@@ -1765,16 +1786,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) + { +- struct bpos pos = bkey_start_pos(&iter->k); +- +- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); +- bch2_btree_iter_checks(iter); +- +- if (unlikely(!bkey_cmp(pos, POS_MIN))) ++ if (!bch2_btree_iter_rewind_pos(iter)) + return bkey_s_c_null; + +- bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); +- + return bch2_btree_iter_peek_prev(iter); + } + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 718bf60f1857..76d6e64059f9 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2448,7 +2448,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + struct address_space *mapping = inode->v.i_mapping; + struct bkey_buf copy; + struct btree_trans trans; +- struct btree_iter *src, *dst; ++ struct btree_iter *src, *dst, *del; + loff_t shift, new_size; + u64 src_start; + int ret; +@@ -2518,6 +2518,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + POS(inode->v.i_ino, src_start >> 9), + BTREE_ITER_INTENT); + dst = bch2_trans_copy_iter(&trans, src); ++ del = bch2_trans_copy_iter(&trans, src); + + while (1) { + struct disk_reservation disk_res = +@@ -2538,8 +2539,6 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + if (!k.k || k.k->p.inode != inode->v.i_ino) + break; + +- BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); +- + if (insert && + bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) + break; +@@ -2571,6 +2570,7 @@ reassemble: + delete.k.p = copy.k->k.p; + delete.k.size = copy.k->k.size; + delete.k.p.offset -= shift >> 9; ++ bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k)); + + next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; + +@@ -2591,9 +2591,7 @@ reassemble: + BUG_ON(ret); + } + +- bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); +- +- ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: ++ ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?: + bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: + bch2_trans_commit(&trans, &disk_res, + &inode->ei_journal_seq, +-- +cgit v1.2.3 + + +From 1a3ae36e169e01c49f29df525d992ab9d5617bb3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 10 Feb 2021 13:39:48 -0500 +Subject: bcachefs: Assert that we're not trying to flush journal seq in the + future + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index c0831ba51ef3..395021b5ac8e 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -575,6 +575,8 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + + spin_lock(&j->lock); + ++ BUG_ON(seq > journal_cur_seq(j)); ++ + /* Recheck under lock: */ + if (j->err_seq && seq >= j->err_seq) { + ret = -EIO; +-- +cgit v1.2.3 + + +From e82fb398ea1e7af48c6139559076d9b82b1a0cd8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 11 Feb 2021 14:49:36 -0500 +Subject: bcachefs: Fix a shift greater than type size + +Found by UBSAN + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index bf1c7319669c..746173f15ae3 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -479,7 +479,7 @@ int bch2_inode_create(struct btree_trans *trans, + u64 min, max, start, *hint; + int ret; + +- unsigned cpu = raw_smp_processor_id(); ++ u64 cpu = raw_smp_processor_id(); + unsigned bits = (c->opts.inodes_32bit + ? 
31 : 63) - c->inode_shard_bits; + +-- +cgit v1.2.3 + + +From 985862cc3eded3087bc1e086e785fb3ed499ca64 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 12 Feb 2021 20:53:29 -0500 +Subject: bcachefs: Fsck fixes + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index c3e6137ffd75..b2d9d55b1951 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1072,6 +1072,11 @@ static void inc_link(struct bch_fs *c, nlink_table *links, + if (inum < range_start || inum >= *range_end) + return; + ++ if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) { ++ *range_end = inum; ++ return; ++ } ++ + link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); + if (!link) { + bch_verbose(c, "allocation failed during fsck - will need another pass"); +@@ -1346,23 +1351,25 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, + nlinks_iter = genradix_iter_init(links, 0); + + while ((k = bch2_btree_iter_peek(iter)).k && +- !(ret2 = bkey_err(k))) { ++ !(ret2 = bkey_err(k)) && ++ iter->pos.offset < range_end) { + peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); + + if (!link && (!k.k || iter->pos.offset >= range_end)) + break; + + nlinks_pos = range_start + nlinks_iter.pos; +- if (iter->pos.offset > nlinks_pos) { ++ ++ if (link && nlinks_pos < iter->pos.offset) { + /* Should have been caught by dirents pass: */ +- need_fsck_err_on(link && link->count, c, ++ need_fsck_err_on(link->count, c, + "missing inode %llu (nlink %u)", + nlinks_pos, link->count); + genradix_iter_advance(&nlinks_iter, links); + goto peek_nlinks; + } + +- if (iter->pos.offset < nlinks_pos || !link) ++ if (!link || nlinks_pos > iter->pos.offset) + link = &zero_links; + + if (k.k && k.k->type == KEY_TYPE_inode) { +-- +cgit v1.2.3 + + +From 72e7b9c7f1acd7a238ce1937d3d8cac60b74cb55 Mon Sep 17 00:00:00 2001 +From: Robbie Litchfield +Date: Wed, 10 Feb 2021 13:18:13 +1300 +Subject: bcachefs: Fix unnecessary read amplificaiton when allocating ec + stripes + +When allocating an erasure coding stripe, bcachefs will always reuse any +partial stripes before reserving a new stripe. This causes unnecessary +read amplification when preparing a stripe for writing. This patch changes +bcachefs to always reserve new stripes first, only relying on stripe reuse +when copygc needs more time to empty buckets from existing stripes. 
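
A minimal standalone sketch of the allocation policy described in this commit message, using stub names (struct stripe_head, try_reserve_new_stripe, try_reuse_existing_stripe are invented stand-ins, not bcachefs functions): reserve a fresh stripe first, and only fall back to reusing a partial stripe, with its read amplification cost, when the reservation fails:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct stripe_head {
        bool     allocated;
        bool     have_existing;
        unsigned reserved_sectors;
};

/* stand-in for reserving space for a brand new stripe */
static int try_reserve_new_stripe(struct stripe_head *h)
{
        (void) h;
        /* pretend the filesystem is too full for a fresh reservation */
        return -ENOSPC;
}

/* stand-in for finding and reading back a partially used existing stripe */
static int try_reuse_existing_stripe(struct stripe_head *h)
{
        /* reuse means reading existing blocks back in: read amplification */
        h->have_existing = true;
        return 0;
}

static int stripe_head_prepare(struct stripe_head *h)
{
        int ret = 0;

        /* prefer a fresh stripe: nothing old to read back */
        if (!h->allocated && !h->reserved_sectors && !h->have_existing)
                ret = try_reserve_new_stripe(h);

        /* only pay the reuse cost when the reservation failed */
        if (ret)
                ret = try_reuse_existing_stripe(h);

        return ret;
}

int main(void)
{
        struct stripe_head h = { 0 };
        int ret = stripe_head_prepare(&h);

        printf("ret=%d reused_existing=%d\n", ret, h.have_existing);
        return 0;
}

Under this ordering the reuse path is only taken when the reservation fails, i.e. when copygc has not yet freed enough buckets, which is exactly the behaviour the commit message describes.
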
+ +Signed-off-by: Robbie Litchfield +--- + fs/bcachefs/ec.c | 155 +++++++++++++++++++++++++++++++++---------------------- + 1 file changed, 92 insertions(+), 63 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 3f794a4d8575..adcffede0d48 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1389,6 +1389,72 @@ static s64 get_existing_stripe(struct bch_fs *c, + return ret; + } + ++static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, ++ struct ec_stripe_head *h) ++{ ++ unsigned i; ++ s64 idx; ++ int ret; ++ ++ idx = get_existing_stripe(c, h); ++ if (idx < 0) { ++ bch_err(c, "failed to find an existing stripe"); ++ return -ENOSPC; ++ } ++ ++ h->s->have_existing_stripe = true; ++ ret = get_stripe_key(c, idx, &h->s->existing_stripe); ++ if (ret) { ++ bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); ++ return ret; ++ } ++ ++ if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { ++ /* ++ * this is a problem: we have deleted from the ++ * stripes heap already ++ */ ++ BUG(); ++ } ++ ++ BUG_ON(h->s->existing_stripe.size != h->blocksize); ++ BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); ++ ++ for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { ++ if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { ++ __set_bit(i, h->s->blocks_gotten); ++ __set_bit(i, h->s->blocks_allocated); ++ } ++ ++ ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); ++ } ++ ++ bkey_copy(&h->s->new_stripe.key.k_i, ++ &h->s->existing_stripe.key.k_i); ++ ++ return 0; ++} ++ ++static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, ++ struct ec_stripe_head *h) ++{ ++ int ret; ++ ++ ret = bch2_disk_reservation_get(c, &h->s->res, ++ h->blocksize, ++ h->s->nr_parity, 0); ++ ++ if (ret) { ++ /* ++ * This means we need to wait for copygc to ++ * empty out buckets from existing stripes: ++ */ ++ bch_err(c, "failed to reserve stripe"); ++ } ++ ++ return ret; ++} ++ + struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, +@@ -1397,9 +1463,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + struct closure *cl) + { + struct ec_stripe_head *h; +- unsigned i; +- s64 idx; + int ret; ++ bool needs_stripe_new; + + h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc); + if (!h) { +@@ -1407,80 +1472,44 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + return NULL; + } + +- if (!h->s) { ++ needs_stripe_new = !h->s; ++ if (needs_stripe_new) { + if (ec_new_stripe_alloc(c, h)) { +- bch2_ec_stripe_head_put(c, h); ++ ret = -ENOMEM; + bch_err(c, "failed to allocate new stripe"); +- return NULL; +- } +- +- idx = get_existing_stripe(c, h); +- if (idx >= 0) { +- h->s->have_existing_stripe = true; +- ret = get_stripe_key(c, idx, &h->s->existing_stripe); +- if (ret) { +- bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); +- bch2_ec_stripe_head_put(c, h); +- return NULL; +- } +- +- if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { +- /* +- * this is a problem: we have deleted from the +- * stripes heap already +- */ +- BUG(); +- } +- +- BUG_ON(h->s->existing_stripe.size != h->blocksize); +- BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); +- +- for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { +- if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { +- __set_bit(i, h->s->blocks_gotten); +- __set_bit(i, h->s->blocks_allocated); +- } +- +- ec_block_io(c, 
&h->s->existing_stripe, READ, i, &h->s->iodone); +- } +- +- bkey_copy(&h->s->new_stripe.key.k_i, +- &h->s->existing_stripe.key.k_i); ++ goto err; + } + +- if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) { ++ if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) + BUG(); +- } + } + +- if (!h->s->allocated) { +- if (!h->s->have_existing_stripe && +- !h->s->res.sectors) { +- ret = bch2_disk_reservation_get(c, &h->s->res, +- h->blocksize, +- h->s->nr_parity, 0); +- if (ret) { +- /* +- * This means we need to wait for copygc to +- * empty out buckets from existing stripes: +- */ +- bch2_ec_stripe_head_put(c, h); +- h = NULL; +- goto out; +- } +- } ++ /* ++ * Try reserve a new stripe before reusing an ++ * existing stripe. This will prevent unnecessary ++ * read amplification during write oriented workloads. ++ */ ++ ret = 0; ++ if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe) ++ ret = __bch2_ec_stripe_head_reserve(c, h); ++ if (ret && needs_stripe_new) ++ ret = __bch2_ec_stripe_head_reuse(c, h); ++ if (ret) ++ goto err; + ++ if (!h->s->allocated) { + ret = new_stripe_alloc_buckets(c, h, cl); +- if (ret) { +- bch2_ec_stripe_head_put(c, h); +- h = ERR_PTR(-ret); +- goto out; +- } ++ if (ret) ++ goto err; + + h->s->allocated = true; + } +-out: ++ + return h; ++ ++err: ++ bch2_ec_stripe_head_put(c, h); ++ return ERR_PTR(-ret); + } + + void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +-- +cgit v1.2.3 + + +From 115bbdf82ba6e6e003f885f8a3ac69e9627e72e0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 17 Feb 2021 13:37:22 -0500 +Subject: bcachefs: Drop invalid stripe ptrs in fsck + +More repair code, now that we can repair extents during initial gc. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 67 ++++++++++++++++++++++++++++++++++---------------- + fs/bcachefs/extents.c | 9 +++++++ + fs/bcachefs/extents.h | 1 + + 3 files changed, 56 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index c2c8a34f735d..3eb03cf229fb 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -163,22 +163,23 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + struct bkey_s_c *k) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); +- const struct bch_extent_ptr *ptr; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; + bool do_update = false; + int ret = 0; + +- bkey_for_each_ptr(ptrs, ptr) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, true); +- struct bucket *g2 = PTR_BUCKET(ca, ptr, false); ++ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); ++ struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); + + if (fsck_err_on(!g->gen_valid, c, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", +- ptr->dev, PTR_BUCKET_NR(ca, ptr), +- bch2_data_types[ptr_data_type(k->k, ptr)], +- ptr->gen)) { +- if (!ptr->cached) { +- g2->_mark.gen = g->_mark.gen = ptr->gen; ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen)) { ++ if (!p.ptr.cached) { ++ g2->_mark.gen = g->_mark.gen = p.ptr.gen; + g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { +@@ -186,13 +187,13 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + } + } + +- if (fsck_err_on(gen_cmp(ptr->gen, 
g->mark.gen) > 0, c, ++ if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u", +- ptr->dev, PTR_BUCKET_NR(ca, ptr), +- bch2_data_types[ptr_data_type(k->k, ptr)], +- ptr->gen, g->mark.gen)) { +- if (!ptr->cached) { +- g2->_mark.gen = g->_mark.gen = ptr->gen; ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, g->mark.gen)) { ++ if (!p.ptr.cached) { ++ g2->_mark.gen = g->_mark.gen = p.ptr.gen; + g2->gen_valid = g->gen_valid = true; + g2->_mark.data_type = 0; + g2->_mark.dirty_sectors = 0; +@@ -204,16 +205,27 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + } + } + +- if (fsck_err_on(!ptr->cached && +- gen_cmp(ptr->gen, g->mark.gen) < 0, c, ++ if (fsck_err_on(!p.ptr.cached && ++ gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u", +- ptr->dev, PTR_BUCKET_NR(ca, ptr), +- bch2_data_types[ptr_data_type(k->k, ptr)], +- ptr->gen, g->mark.gen)) ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, g->mark.gen)) + do_update = true; ++ ++ if (p.has_ec) { ++ struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); ++ ++ if (fsck_err_on(!m || !m->alive, c, ++ "pointer to nonexistent stripe %llu", ++ (u64) p.ec.idx)) ++ do_update = true; ++ } + } + + if (do_update) { ++ struct bkey_ptrs ptrs; ++ union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct bkey_i *new; + +@@ -237,6 +249,19 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + (!ptr->cached && + gen_cmp(ptr->gen, g->mark.gen) < 0); + })); ++again: ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { ++ struct stripe *m = genradix_ptr(&c->stripes[true], ++ entry->stripe_ptr.idx); ++ ++ if (!m || !m->alive) { ++ bch2_bkey_extent_entry_drop(new, entry); ++ goto again; ++ } ++ } ++ } + + ret = bch2_journal_key_insert(c, btree_id, level, new); + if (ret) +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 4a3a3291a31b..ad3e88dd1829 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -778,6 +778,15 @@ void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + } + } + ++void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); ++ union bch_extent_entry *next = extent_entry_next(entry); ++ ++ memmove_u64s(entry, next, (u64 *) end - (u64 *) next); ++ k->k.u64s -= extent_entry_u64s(entry); ++} ++ + void bch2_bkey_append_ptr(struct bkey_i *k, + struct bch_extent_ptr ptr) + { +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index ebe0a04c7850..3988315fc404 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -546,6 +546,7 @@ unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); + void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, + unsigned, unsigned); + ++void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); + void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); + void bch2_extent_ptr_decoded_append(struct bkey_i *, + struct extent_ptr_decoded *); +-- +cgit v1.2.3 + + +From aee438cf141fcfc99b716c74e6840faad0af6177 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Mar 2021 19:37:40 -0500 +Subject: bcachefs: Ensure btree iterators are 
traversed in bch2_trans_commit() + +The upcoming patch to allow extents to span btree nodes will require +this... and this assertion seems to be popping, and it's not a very good +assertion anyways. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index d09124fc46f2..4d1882341efa 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -929,9 +929,14 @@ int __bch2_trans_commit(struct btree_trans *trans) + } + + trans_for_each_update2(trans, i) { +- BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); + BUG_ON(i->iter->locks_want < 1); + ++ ret = bch2_btree_iter_traverse(i->iter); ++ if (unlikely(ret)) { ++ trace_trans_restart_traverse(trans->ip); ++ goto out; ++ } ++ + u64s = jset_u64s(i->k->k.u64s); + if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && + likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) +-- +cgit v1.2.3 + + +From b767de579e5e5f3c10961d57b7428a14f554c47e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 11 Feb 2021 21:57:32 -0500 +Subject: bcachefs: iter->real_pos + +We need to differentiate between the search position of a btree +iterator, vs. what it actually points at (what we found). This matters +for extents, where iter->pos will typically be the start of the key we +found and iter->real_pos will be the end of the key we found (which soon +won't necessarily be in the same btree node!) and it will also matter +for snapshots. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 208 ++++++++++++++++++++++------------------ + fs/bcachefs/btree_types.h | 2 + + fs/bcachefs/btree_update_leaf.c | 2 +- + 3 files changed, 120 insertions(+), 92 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 146ad2f531ab..a69a429eac57 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -34,13 +34,13 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) + static inline bool btree_iter_pos_before_node(struct btree_iter *iter, + struct btree *b) + { +- return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; ++ return bkey_cmp(iter->real_pos, b->data->min_key) < 0; + } + + static inline bool btree_iter_pos_after_node(struct btree_iter *iter, + struct btree *b) + { +- return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; ++ return bkey_cmp(b->key.k.p, iter->real_pos) < 0; + } + + static inline bool btree_iter_pos_in_node(struct btree_iter *iter, +@@ -491,7 +491,6 @@ static void bch2_btree_iter_verify_cached(struct btree_iter *iter) + static void bch2_btree_iter_verify_level(struct btree_iter *iter, + unsigned level) + { +- struct bpos pos = btree_iter_search_key(iter); + struct btree_iter_level *l = &iter->l[level]; + struct btree_node_iter tmp = l->iter; + bool locked = btree_node_locked(iter, level); +@@ -539,12 +538,12 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, + : bch2_btree_node_iter_prev_all(&tmp, l->b); + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + +- if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { ++ if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) { + msg = "before"; + goto err; + } + +- if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { ++ if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { + msg = "after"; + goto err; + } +@@ -567,12 +566,11 @@ err: + } + + panic("iterator should be %s key at level %u:\n" +- 
"iter pos %s %llu:%llu\n" ++ "iter pos %llu:%llu\n" + "prev key %s\n" + "cur key %s\n", + msg, level, +- iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>", +- iter->pos.inode, iter->pos.offset, ++ iter->real_pos.inode, iter->real_pos.offset, + buf1, buf2); + } + +@@ -580,12 +578,24 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + { + unsigned i; + +- bch2_btree_trans_verify_locks(iter->trans); ++ EBUG_ON(iter->btree_id >= BTREE_ID_NR); ++ ++ bch2_btree_iter_verify_locks(iter); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + bch2_btree_iter_verify_level(iter, i); + } + ++static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) ++{ ++ enum btree_iter_type type = btree_iter_type(iter); ++ ++ BUG_ON((type == BTREE_ITER_KEYS || ++ type == BTREE_ITER_CACHED) && ++ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || ++ bkey_cmp(iter->pos, iter->k.p) > 0)); ++} ++ + void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) + { + struct btree_iter *iter; +@@ -601,6 +611,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) + + static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} + static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} ++static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} + + #endif + +@@ -626,12 +637,11 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + struct bkey_packed *where) + { + struct btree_iter_level *l = &iter->l[b->c.level]; +- struct bpos pos = btree_iter_search_key(iter); + + if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) + return; + +- if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) ++ if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0) + bch2_btree_node_iter_advance(&l->iter, l->b); + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +@@ -666,7 +676,6 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, + bool iter_current_key_modified = + orig_iter_pos >= offset && + orig_iter_pos <= offset + clobber_u64s; +- struct bpos iter_pos = btree_iter_search_key(iter); + + btree_node_iter_for_each(node_iter, set) + if (set->end == old_end) +@@ -674,7 +683,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, + + /* didn't find the bset in the iterator - might have to readd it: */ + if (new_u64s && +- bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { ++ bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { + bch2_btree_node_iter_push(node_iter, b, where, end); + goto fixup_done; + } else { +@@ -689,7 +698,7 @@ found: + return; + + if (new_u64s && +- bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { ++ bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { + set->k = offset; + } else if (set->k < offset + clobber_u64s) { + set->k = offset + new_u64s; +@@ -825,12 +834,11 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, + struct btree_iter_level *l, + int max_advance) + { +- struct bpos pos = btree_iter_search_key(iter); + struct bkey_packed *k; + int nr_advanced = 0; + + while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && +- bkey_iter_pos_cmp(l->b, k, &pos) < 0) { ++ bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { + if (max_advance > 0 && nr_advanced >= max_advance) + return false; + +@@ -893,10 +901,9 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) + static inline void __btree_iter_init(struct btree_iter *iter, + unsigned level) + { +- struct bpos pos = btree_iter_search_key(iter); + 
struct btree_iter_level *l = &iter->l[level]; + +- bch2_btree_node_iter_init(&l->iter, l->b, &pos); ++ bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos); + + /* + * Iterators to interior nodes should always be pointed at the first non +@@ -1342,21 +1349,6 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) + return ret; + } + +-static inline void bch2_btree_iter_checks(struct btree_iter *iter) +-{ +- enum btree_iter_type type = btree_iter_type(iter); +- +- EBUG_ON(iter->btree_id >= BTREE_ID_NR); +- +- BUG_ON((type == BTREE_ITER_KEYS || +- type == BTREE_ITER_CACHED) && +- (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || +- bkey_cmp(iter->pos, iter->k.p) > 0)); +- +- bch2_btree_iter_verify_locks(iter); +- bch2_btree_iter_verify_level(iter, iter->level); +-} +- + /* Iterate across nodes (leaf and interior nodes) */ + + struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) +@@ -1365,7 +1357,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + int ret; + + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); +- bch2_btree_iter_checks(iter); ++ bch2_btree_iter_verify(iter); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return iter->l[iter->level].b; +@@ -1380,7 +1372,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + + BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); + +- iter->pos = b->key.k.p; ++ iter->pos = iter->real_pos = b->key.k.p; + iter->uptodate = BTREE_ITER_UPTODATE; + + bch2_btree_iter_verify(iter); +@@ -1394,7 +1386,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + int ret; + + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); +- bch2_btree_iter_checks(iter); ++ bch2_btree_iter_verify(iter); + + /* already got to end? */ + if (!btree_iter_node(iter, iter->level)) +@@ -1431,7 +1423,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + if (btree_node_read_locked(iter, iter->level)) + btree_node_unlock(iter, iter->level); + +- iter->pos = bkey_successor(iter->pos); ++ iter->pos = iter->real_pos = bkey_successor(iter->pos); + iter->level = iter->min_depth; + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +@@ -1442,7 +1434,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + b = iter->l[iter->level].b; + } + +- iter->pos = b->key.k.p; ++ iter->pos = iter->real_pos = b->key.k.p; + iter->uptodate = BTREE_ITER_UPTODATE; + + bch2_btree_iter_verify(iter); +@@ -1490,57 +1482,55 @@ out: + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } + +-void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, +- bool strictly_greater) ++static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) + { +- struct bpos old = btree_iter_search_key(iter); +- int cmp; ++ int cmp = bkey_cmp(new_pos, iter->real_pos); + +- iter->flags &= ~BTREE_ITER_IS_EXTENTS; +- iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0; ++ iter->real_pos = new_pos; ++ ++ btree_iter_pos_changed(iter, cmp); + ++ bch2_btree_iter_verify(iter); ++} ++ ++void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, ++ bool strictly_greater) ++{ + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; + +- cmp = bkey_cmp(btree_iter_search_key(iter), old); ++ iter->flags &= ~BTREE_ITER_IS_EXTENTS; ++ iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; + +- btree_iter_pos_changed(iter, cmp); ++ btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + } + + void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + { +- int cmp = bkey_cmp(new_pos, iter->pos); +- +- bkey_init(&iter->k); +- iter->k.p = iter->pos = new_pos; +- +- btree_iter_pos_changed(iter, cmp); ++ __bch2_btree_iter_set_pos(iter, new_pos, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) != 0); + } + + static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) + { + struct bpos pos = iter->k.p; ++ bool ret = bkey_cmp(pos, POS_MAX) != 0; + +- if (unlikely(!bkey_cmp(pos, POS_MAX))) +- return false; +- +- if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(pos); + bch2_btree_iter_set_pos(iter, pos); +- return true; ++ return ret; + } + + static inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) + { + struct bpos pos = bkey_start_pos(&iter->k); ++ bool ret = bkey_cmp(pos, POS_MIN) != 0; + +- if (unlikely(!bkey_cmp(pos, POS_MIN))) +- return false; +- +- if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_predecessor(pos); + bch2_btree_iter_set_pos(iter, pos); +- return true; ++ return ret; + } + + static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) +@@ -1548,10 +1538,16 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) + struct bpos next_pos = iter->l[0].b->key.k.p; + bool ret = bkey_cmp(next_pos, POS_MAX) != 0; + +- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) +- next_pos = bkey_successor(next_pos); ++ /* ++ * Typically, we don't want to modify iter->pos here, since that ++ * indicates where we searched from - unless we got to the end of the ++ * btree, in that case we want iter->pos to reflect that: ++ */ ++ if (ret) ++ btree_iter_set_search_pos(iter, bkey_successor(next_pos)); ++ else ++ bch2_btree_iter_set_pos(iter, POS_MAX); + +- bch2_btree_iter_set_pos(iter, next_pos); + return ret; + } + +@@ -1560,14 +1556,11 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + struct bpos next_pos = iter->l[0].b->data->min_key; + bool ret = bkey_cmp(next_pos, POS_MIN) != 0; + +- if (ret) { +- next_pos = bkey_predecessor(next_pos); +- +- if (iter->flags & BTREE_ITER_IS_EXTENTS) +- next_pos = bkey_predecessor(next_pos); +- } ++ if (ret) ++ btree_iter_set_search_pos(iter, bkey_predecessor(next_pos)); ++ else ++ bch2_btree_iter_set_pos(iter, POS_MIN); + +- bch2_btree_iter_set_pos(iter, next_pos); + return ret; + } + +@@ -1610,7 +1603,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + int ret; + + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); +- bch2_btree_iter_checks(iter); ++ bch2_btree_iter_verify(iter); ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + + if (iter->uptodate == BTREE_ITER_UPTODATE && + !bkey_deleted(&iter->k)) +@@ -1636,9 +1632,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + ++ iter->real_pos = k.k->p; ++ + iter->uptodate = BTREE_ITER_UPTODATE; + +- bch2_btree_iter_verify_level(iter, 0); ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); + return k; + } + +@@ -1692,7 +1691,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + int ret; + + 
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); +- bch2_btree_iter_checks(iter); ++ bch2_btree_iter_verify(iter); + + while (1) { + ret = bch2_btree_iter_traverse(iter); +@@ -1714,8 +1713,8 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + } + + /* +- * iter->pos should always be equal to the key we just +- * returned - except extents can straddle iter->pos: ++ * iter->pos should be mononotically increasing, and always be equal to ++ * the key we just returned - except extents can straddle iter->pos: + */ + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); +@@ -1744,7 +1743,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + int ret; + + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); +- bch2_btree_iter_checks(iter); ++ bch2_btree_iter_verify(iter); ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ btree_iter_set_search_pos(iter, iter->pos); + + if (iter->uptodate == BTREE_ITER_UPTODATE && + !bkey_deleted(&iter->k)) +@@ -1752,8 +1754,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + + while (1) { + ret = bch2_btree_iter_traverse(iter); +- if (unlikely(ret)) +- return bkey_s_c_err(ret); ++ if (unlikely(ret)) { ++ k = bkey_s_c_err(ret); ++ goto no_key; ++ } + + k = __btree_iter_peek(iter, l); + if (!k.k || +@@ -1765,8 +1769,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + if (likely(k.k)) + break; + +- if (!btree_iter_set_pos_to_prev_leaf(iter)) +- return bkey_s_c_null; ++ if (!btree_iter_set_pos_to_prev_leaf(iter)) { ++ k = bkey_s_c_null; ++ goto no_key; ++ } + } + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); +@@ -1774,10 +1780,23 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + /* Extents can straddle iter->pos: */ + if (bkey_cmp(k.k->p, pos) < 0) + iter->pos = k.k->p; +- iter->uptodate = BTREE_ITER_UPTODATE; +- +- bch2_btree_iter_verify_level(iter, 0); ++ iter->real_pos = k.k->p; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++out: ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); + return k; ++no_key: ++ /* ++ * __btree_iter_peek() may have set iter->k to a key we didn't want, and ++ * then we errored going to the previous leaf - make sure it's ++ * consistent with iter->pos: ++ */ ++ BUG_ON(bkey_cmp(pos, iter->pos) && ++ bkey_cmp(iter->pos, POS_MIN)); ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos; ++ goto out; + } + + /** +@@ -1830,7 +1849,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) + */ + + EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); +- bch2_btree_iter_verify_level(iter, 0); ++ bch2_btree_iter_verify(iter); + return k; + } + +@@ -1853,7 +1872,9 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) + iter->k = n; + iter->uptodate = BTREE_ITER_UPTODATE; + +- bch2_btree_iter_verify_level(iter, 0); ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); ++ + return (struct bkey_s_c) { &iter->k, NULL }; + } + +@@ -1864,7 +1885,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + int ret; + + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); +- bch2_btree_iter_checks(iter); ++ bch2_btree_iter_verify(iter); ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); +@@ -1888,7 +1912,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + } + + iter->uptodate = 
BTREE_ITER_UPTODATE; +- bch2_btree_iter_verify_level(iter, 0); ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); + return k; + } + +@@ -1906,7 +1931,7 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) + int ret; + + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); +- bch2_btree_iter_checks(iter); ++ bch2_btree_iter_verify(iter); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) +@@ -1937,6 +1962,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, + bkey_init(&iter->k); + iter->k.p = pos; + iter->flags = flags; ++ iter->real_pos = btree_iter_search_key(iter); + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + iter->btree_id = btree_id; + iter->level = 0; +@@ -2076,7 +2102,7 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + + if (best && + bkey_cmp(bpos_diff(best->pos, pos), +- bpos_diff(iter->pos, pos)) < 0) ++ bpos_diff(iter->real_pos, pos)) < 0) + continue; + + best = iter; +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 631bf4694f4d..e57ed9d86ee3 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -247,6 +247,8 @@ enum btree_iter_uptodate { + struct btree_iter { + struct btree_trans *trans; + struct bpos pos; ++ /* what we're searching for/what the iterator actually points to: */ ++ struct bpos real_pos; + struct bpos pos_after_commit; + + u16 flags; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 4d1882341efa..f2503ad47baa 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -219,7 +219,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + +- BUG_ON(bkey_cmp(insert->k.p, iter->pos)); ++ BUG_ON(bkey_cmp(insert->k.p, iter->real_pos)); + BUG_ON(bch2_debug_check_bkeys && + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + __btree_node_type(iter->level, iter->btree_id))); +-- +cgit v1.2.3 + + +From de668c90dc80886395633275fe4cc422d5deacad Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 10 Feb 2021 16:13:57 -0500 +Subject: bcachefs: Extents may now cross btree node boundaries + +When snapshots arrive, we won't necessarily be able to arbitrarily split +existis - when we need to split an existing extent, we'll have to check +if the extent was overwritten in child snapshots and if so emit a +whiteout for the split in the child snapshot. + +Because extents couldn't span btree nodes previously, journal replay +would sometimes have to split existing extents. That's no good anymore, +but fortunately since extent handling has already been lifted above most +of the btree code there's no real need for that rule anymore. 
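
To make the relaxed invariant concrete, here is a small standalone sketch (invented toy types, not kernel code): an extent is indexed by its end position, so it "fits" a node whenever its end lies within the node's key range, even if its start now falls before the node's min_key - which is why replay no longer has to split such keys:

#include <stdbool.h>
#include <stdio.h>

struct bpos { unsigned long long inode, offset; };

static int bpos_cmp(struct bpos l, struct bpos r)
{
        if (l.inode != r.inode)
                return l.inode < r.inode ? -1 : 1;
        if (l.offset != r.offset)
                return l.offset < r.offset ? -1 : 1;
        return 0;
}

/* extents are indexed by their end position, like bkeys by k.p */
struct toy_extent { struct bpos start, end; };
struct toy_node   { struct bpos min_key, max_key; };

/* only the end position has to land inside the node's key range now */
static bool extent_fits_node(struct toy_extent e, struct toy_node n)
{
        return bpos_cmp(e.end, n.min_key) >= 0 &&
               bpos_cmp(e.end, n.max_key) <= 0;
}

/* the start may fall before min_key: the extent straddles the boundary */
static bool extent_crosses_boundary(struct toy_extent e, struct toy_node n)
{
        return extent_fits_node(e, n) &&
               bpos_cmp(e.start, n.min_key) < 0;
}

int main(void)
{
        struct toy_node   n = { .min_key = { 1, 100 }, .max_key = { 1, 200 } };
        struct toy_extent e = { .start   = { 1,  90 }, .end     = { 1, 120 } };

        printf("fits=%d crosses=%d\n",
               extent_fits_node(e, n), extent_crosses_boundary(e, n));
        return 0;
}

In this model, journal replay can insert the straddling key as-is instead of cutting it at min_key, matching the rule change the hunks below implement.
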
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 14 +++-- + fs/bcachefs/btree_iter.c | 58 +++++++-------------- + fs/bcachefs/btree_update_leaf.c | 39 ++++++++------ + fs/bcachefs/buckets.c | 67 +++++++++--------------- + fs/bcachefs/extent_update.c | 29 +++-------- + fs/bcachefs/recovery.c | 113 +--------------------------------------- + fs/bcachefs/super-io.c | 4 +- + 7 files changed, 87 insertions(+), 237 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 30e77190d97a..618f49ac9f82 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1342,13 +1342,19 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); + x(reflink_inline_data, 14) \ + x(new_varint, 15) \ + x(journal_no_flush, 16) \ +- x(alloc_v2, 17) ++ x(alloc_v2, 17) \ ++ x(extents_across_btree_nodes, 18) ++ ++#define BCH_SB_FEATURES_ALWAYS \ ++ ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ ++ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ ++ (1ULL << BCH_FEATURE_btree_updates_journalled)|\ ++ (1ULL << BCH_FEATURE_extents_across_btree_nodes)) + + #define BCH_SB_FEATURES_ALL \ +- ((1ULL << BCH_FEATURE_new_siphash)| \ +- (1ULL << BCH_FEATURE_new_extent_overwrite)| \ ++ (BCH_SB_FEATURES_ALWAYS| \ ++ (1ULL << BCH_FEATURE_new_siphash)| \ + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ +- (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_new_varint)| \ + (1ULL << BCH_FEATURE_journal_no_flush)| \ + (1ULL << BCH_FEATURE_alloc_v2)) +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index a69a429eac57..c41fe4e0bc00 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1814,11 +1814,8 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) + static inline struct bkey_s_c + __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) + { +- struct btree_iter_level *l = &iter->l[0]; +- struct btree_node_iter node_iter; + struct bkey_s_c k; +- struct bkey n; +- int ret; ++ struct bpos pos, next_start; + + /* keys & holes can't span inode numbers: */ + if (iter->pos.offset == KEY_OFFSET_MAX) { +@@ -1826,50 +1823,31 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); +- +- ret = bch2_btree_iter_traverse(iter); +- if (unlikely(ret)) +- return bkey_s_c_err(ret); + } + +- /* +- * iterator is now at the correct position for inserting at iter->pos, +- * but we need to keep iterating until we find the first non whiteout so +- * we know how big a hole we have, if any: +- */ +- +- node_iter = l->iter; +- k = __btree_iter_unpack(iter, l, &iter->k, +- bch2_btree_node_iter_peek(&node_iter, l->b)); +- +- if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { +- /* +- * We're not setting iter->uptodate because the node iterator +- * doesn't necessarily point at the key we're returning: +- */ ++ pos = iter->pos; ++ k = bch2_btree_iter_peek(iter); ++ iter->pos = pos; + +- EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); +- bch2_btree_iter_verify(iter); ++ if (bkey_err(k)) + return k; +- } + +- /* hole */ ++ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) ++ return k; + +- if (!k.k) +- k.k = &l->b->key.k; ++ next_start = k.k ? bkey_start_pos(k.k) : POS_MAX; + +- bkey_init(&n); +- n.p = iter->pos; +- bch2_key_resize(&n, ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos; ++ bch2_key_resize(&iter->k, + min_t(u64, KEY_SIZE_MAX, +- (k.k->p.inode == n.p.inode +- ? 
bkey_start_offset(k.k) ++ (next_start.inode == iter->pos.inode ++ ? next_start.offset + : KEY_OFFSET_MAX) - +- n.p.offset)); ++ iter->pos.offset)); + +- EBUG_ON(!n.size); ++ EBUG_ON(!iter->k.size); + +- iter->k = n; + iter->uptodate = BTREE_ITER_UPTODATE; + + bch2_btree_iter_verify_entry_exit(iter); +@@ -1893,13 +1871,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); + ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return __bch2_btree_iter_peek_slot_extents(iter); ++ + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + +- if (iter->flags & BTREE_ITER_IS_EXTENTS) +- return __bch2_btree_iter_peek_slot_extents(iter); +- + k = __btree_iter_peek_all(iter, l, &iter->k); + + EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f2503ad47baa..e18756b38fc2 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -62,9 +62,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); +- EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && +- bkey_cmp(bkey_start_pos(&insert->k), +- bkey_predecessor(b->data->min_key)) < 0); + EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); + EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(insert->k.u64s > +@@ -705,26 +702,31 @@ static inline int btree_iter_pos_cmp(const struct btree_iter *l, + bkey_cmp(l->pos, r->pos); + } + +-static void bch2_trans_update2(struct btree_trans *trans, ++static int bch2_trans_update2(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) + { + struct btree_insert_entry *i, n = (struct btree_insert_entry) { + .iter = iter, .k = insert + }; ++ int ret; + + btree_insert_entry_checks(trans, n.iter, n.k); + +- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); +- + EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); + ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return ret; ++ ++ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + trans_for_each_update2(trans, i) { + if (btree_iter_pos_cmp(n.iter, i->iter) == 0) { + *i = n; +- return; ++ return 0; + } + + if (btree_iter_pos_cmp(n.iter, i->iter) <= 0) +@@ -733,6 +735,7 @@ static void bch2_trans_update2(struct btree_trans *trans, + + array_insert_item(trans->updates2, trans->nr_updates2, + i - trans->updates2, n); ++ return 0; + } + + static int extent_update_to_keys(struct btree_trans *trans, +@@ -753,9 +756,9 @@ static int extent_update_to_keys(struct btree_trans *trans, + + iter->flags |= BTREE_ITER_INTENT; + __bch2_btree_iter_set_pos(iter, insert->k.p, false); +- bch2_trans_update2(trans, iter, insert); ++ ret = bch2_trans_update2(trans, iter, insert); + bch2_trans_iter_put(trans, iter); +- return 0; ++ return ret; + } + + static int extent_handle_overwrites(struct btree_trans *trans, +@@ -785,8 +788,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bch2_cut_back(start, update); + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); +- bch2_trans_update2(trans, update_iter, update); ++ ret = bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); ++ if (ret) ++ goto err; + } + + if (bkey_cmp(k.k->p, 
end) > 0) { +@@ -800,8 +805,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bch2_cut_front(end, update); + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); +- bch2_trans_update2(trans, update_iter, update); ++ ret = bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); ++ if (ret) ++ goto err; + } else { + update_iter = bch2_trans_copy_iter(trans, iter); + +@@ -815,8 +822,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, + update->k.size = 0; + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); +- bch2_trans_update2(trans, update_iter, update); ++ ret = bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); ++ if (ret) ++ goto err; + } + + k = bch2_btree_iter_next_with_updates(iter); +@@ -921,11 +930,11 @@ int __bch2_trans_commit(struct btree_trans *trans) + trans_for_each_update(trans, i) { + if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { + ret = extent_update_to_keys(trans, i->iter, i->k); +- if (ret) +- goto out; + } else { +- bch2_trans_update2(trans, i->iter, i->k); ++ ret = bch2_trans_update2(trans, i->iter, i->k); + } ++ if (ret) ++ goto out; + } + + trans_for_each_update2(trans, i) { +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index ef79f5cac64d..ba7a472a1bb7 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1318,9 +1318,6 @@ int bch2_mark_update(struct btree_trans *trans, + unsigned flags) + { + struct bch_fs *c = trans->c; +- struct btree *b = iter_l(iter)->b; +- struct btree_node_iter node_iter = iter_l(iter)->iter; +- struct bkey_packed *_old; + struct bkey_s_c old; + struct bkey unpacked; + int ret = 0; +@@ -1360,23 +1357,24 @@ int bch2_mark_update(struct btree_trans *trans, + BTREE_TRIGGER_OVERWRITE|flags); + } + } else { ++ struct btree_iter *copy; ++ + BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), + 0, new->k.size, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|flags); + +- while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { +- unsigned offset = 0; +- s64 sectors; ++ copy = bch2_trans_copy_iter(trans, iter); + +- old = bkey_disassemble(b, _old, &unpacked); +- sectors = -((s64) old.k->size); ++ for_each_btree_key_continue(copy, 0, old, ret) { ++ unsigned offset = 0; ++ s64 sectors = -((s64) old.k->size); + + flags |= BTREE_TRIGGER_OVERWRITE; + + if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) +- return 0; ++ break; + + switch (bch2_extent_overlap(&new->k, old.k)) { + case BCH_EXTENT_OVERLAP_ALL: +@@ -1409,9 +1407,8 @@ int bch2_mark_update(struct btree_trans *trans, + trans->journal_res.seq, flags) ?: 1; + if (ret <= 0) + break; +- +- bch2_btree_node_iter_advance(&node_iter, b); + } ++ bch2_trans_iter_put(trans, copy); + } + + return ret; +@@ -1442,27 +1439,20 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + pr_err("overlapping with"); + + if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { +- struct btree *b = iter_l(i->iter)->b; +- struct btree_node_iter node_iter = iter_l(i->iter)->iter; +- struct bkey_packed *_k; +- +- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { +- struct bkey unpacked; +- struct bkey_s_c k; ++ struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter); ++ struct bkey_s_c k; ++ int ret; + +- pr_info("_k %px format %u", _k, _k->format); +- k = bkey_disassemble(b, _k, &unpacked); +- +- if (btree_node_is_extents(b) ++ for_each_btree_key_continue(copy, 0, k, ret) { ++ if 
(btree_node_type_is_extents(i->iter->btree_id) + ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 + : bkey_cmp(i->k->k.p, k.k->p)) + break; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + pr_err("%s", buf); +- +- bch2_btree_node_iter_advance(&node_iter, b); + } ++ bch2_trans_iter_put(trans, copy); + } else { + struct bkey_cached *ck = (void *) i->iter->l[0].b; + +@@ -1857,8 +1847,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + } + + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); +- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); +- + bch2_trans_update(trans, iter, n, 0); + out: + ret = sectors; +@@ -1984,15 +1972,13 @@ int bch2_trans_mark_update(struct btree_trans *trans, + BTREE_TRIGGER_OVERWRITE|flags); + } + } else { +- struct btree *b = iter_l(iter)->b; +- struct btree_node_iter node_iter = iter_l(iter)->iter; +- struct bkey_packed *_old; +- struct bkey unpacked; ++ struct btree_iter *copy; ++ struct bkey _old; + + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + +- bkey_init(&unpacked); +- old = (struct bkey_s_c) { &unpacked, NULL }; ++ bkey_init(&_old); ++ old = (struct bkey_s_c) { &_old, NULL }; + + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), + 0, new->k.size, +@@ -2000,18 +1986,16 @@ int bch2_trans_mark_update(struct btree_trans *trans, + if (ret) + return ret; + +- while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { +- unsigned flags = BTREE_TRIGGER_OVERWRITE; +- unsigned offset = 0; +- s64 sectors; ++ copy = bch2_trans_copy_iter(trans, iter); + +- old = bkey_disassemble(b, _old, &unpacked); +- sectors = -((s64) old.k->size); ++ for_each_btree_key_continue(copy, 0, old, ret) { ++ unsigned offset = 0; ++ s64 sectors = -((s64) old.k->size); + + flags |= BTREE_TRIGGER_OVERWRITE; + + if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) +- return 0; ++ break; + + switch (bch2_extent_overlap(&new->k, old.k)) { + case BCH_EXTENT_OVERLAP_ALL: +@@ -2042,10 +2026,9 @@ int bch2_trans_mark_update(struct btree_trans *trans, + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), + offset, sectors, flags); + if (ret) +- return ret; +- +- bch2_btree_node_iter_advance(&node_iter, b); ++ break; + } ++ bch2_trans_iter_put(trans, copy); + } + + return ret; +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 5c43678e94a3..16d2bca8a662 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -99,24 +99,12 @@ int bch2_extent_atomic_end(struct btree_iter *iter, + struct bpos *end) + { + struct btree_trans *trans = iter->trans; +- struct btree *b; +- struct btree_node_iter node_iter; +- struct bkey_packed *_k; +- unsigned nr_iters = 0; ++ struct btree_iter *copy; ++ struct bkey_s_c k; ++ unsigned nr_iters = 0; + int ret; + +- ret = bch2_btree_iter_traverse(iter); +- if (ret) +- return ret; +- +- b = iter->l[0].b; +- node_iter = iter->l[0].iter; +- +- BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && +- bkey_cmp(bkey_start_pos(&insert->k), +- bkey_predecessor(b->data->min_key)) < 0); +- +- *end = bpos_min(insert->k.p, b->key.k.p); ++ *end = insert->k.p; + + /* extent_update_to_keys(): */ + nr_iters += 1; +@@ -126,9 +114,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, + if (ret < 0) + return ret; + +- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { +- struct bkey unpacked; +- struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ++ copy = bch2_trans_copy_iter(trans, iter); ++ ++ for_each_btree_key_continue(copy, 0, k, ret) { + unsigned offset = 0; + + if 
(bkey_cmp(bkey_start_pos(k.k), *end) >= 0) +@@ -155,10 +143,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, + &nr_iters, EXTENT_ITERS_MAX); + if (ret) + break; +- +- bch2_btree_node_iter_advance(&node_iter, b); + } + ++ bch2_trans_iter_put(trans, copy); + return ret < 0 ? ret : 0; + } + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 8560023b4c7a..54ac9cc470af 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -506,115 +506,6 @@ static void replay_now_at(struct journal *j, u64 seq) + bch2_journal_pin_put(j, j->replay_journal_seq++); + } + +-static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, +- struct bkey_i *k) +-{ +- struct btree_trans trans; +- struct btree_iter *iter, *split_iter; +- /* +- * We might cause compressed extents to be split, so we need to pass in +- * a disk_reservation: +- */ +- struct disk_reservation disk_res = +- bch2_disk_reservation_init(c, 0); +- struct bkey_i *split; +- struct bpos atomic_end; +- /* +- * Some extents aren't equivalent - w.r.t. what the triggers do +- * - if they're split: +- */ +- bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || +- k->k.type == KEY_TYPE_reflink_p; +- bool remark = false; +- int ret; +- +- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); +-retry: +- bch2_trans_begin(&trans); +- +- iter = bch2_trans_get_iter(&trans, btree_id, +- bkey_start_pos(&k->k), +- BTREE_ITER_INTENT); +- +- do { +- ret = bch2_btree_iter_traverse(iter); +- if (ret) +- goto err; +- +- atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); +- +- split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); +- ret = PTR_ERR_OR_ZERO(split); +- if (ret) +- goto err; +- +- if (!remark && +- remark_if_split && +- bkey_cmp(atomic_end, k->k.p) < 0) { +- ret = bch2_disk_reservation_add(c, &disk_res, +- k->k.size * +- bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), +- BCH_DISK_RESERVATION_NOFAIL); +- BUG_ON(ret); +- +- remark = true; +- } +- +- bkey_copy(split, k); +- bch2_cut_front(iter->pos, split); +- bch2_cut_back(atomic_end, split); +- +- split_iter = bch2_trans_copy_iter(&trans, iter); +- +- /* +- * It's important that we don't go through the +- * extent_handle_overwrites() and extent_update_to_keys() path +- * here: journal replay is supposed to treat extents like +- * regular keys +- */ +- __bch2_btree_iter_set_pos(split_iter, split->k.p, false); +- bch2_trans_update(&trans, split_iter, split, +- BTREE_TRIGGER_NORUN); +- bch2_trans_iter_put(&trans, split_iter); +- +- bch2_btree_iter_set_pos(iter, split->k.p); +- +- if (remark) { +- ret = bch2_trans_mark_key(&trans, +- bkey_s_c_null, +- bkey_i_to_s_c(split), +- 0, split->k.size, +- BTREE_TRIGGER_INSERT); +- if (ret) +- goto err; +- } +- } while (bkey_cmp(iter->pos, k->k.p) < 0); +- +- if (remark) { +- ret = bch2_trans_mark_key(&trans, +- bkey_i_to_s_c(k), +- bkey_s_c_null, +- 0, -((s64) k->k.size), +- BTREE_TRIGGER_OVERWRITE); +- if (ret) +- goto err; +- } +- +- ret = bch2_trans_commit(&trans, &disk_res, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_JOURNAL_REPLAY); +-err: +- bch2_trans_iter_put(&trans, iter); +- +- if (ret == -EINTR) +- goto retry; +- +- bch2_disk_reservation_put(c, &disk_res); +- +- return bch2_trans_exit(&trans) ?: ret; +-} +- + static int __bch2_journal_replay_key(struct btree_trans *trans, + enum btree_id id, unsigned level, + struct bkey_i *k) +@@ -753,9 +644,7 @@ static int bch2_journal_replay(struct bch_fs *c, + + replay_now_at(j, keys.journal_seq_base + i->journal_seq); + +- ret = 
i->k->k.size +- ? bch2_extent_replay_key(c, i->btree_id, i->k) +- : bch2_journal_replay_key(c, i); ++ ret = bch2_journal_replay_key(c, i); + if (ret) + goto err; + } +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 47a0e20668e3..09598ec95171 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -953,9 +953,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS; + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +-- +cgit v1.2.3 + + +From 9c9b66cddbd63a1862e4569afefef6d4729e996e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 23 Feb 2021 15:16:41 -0500 +Subject: bcachefs: Add error message for some allocation failures + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 5 ++++- + fs/bcachefs/btree_gc.c | 38 ++++++++++++++++++++++++++++++-------- + fs/bcachefs/journal_reclaim.c | 4 +++- + fs/bcachefs/movinggc.c | 4 +++- + fs/bcachefs/rebalance.c | 4 +++- + fs/bcachefs/recovery.c | 13 ++++++++++--- + 6 files changed, 53 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 476ddac4b266..319f401d87c8 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1387,8 +1387,11 @@ int bch2_dev_allocator_start(struct bch_dev *ca) + + p = kthread_create(bch2_allocator_thread, ca, + "bch-alloc/%s", ca->name); +- if (IS_ERR(p)) ++ if (IS_ERR(p)) { ++ bch_err(ca->fs, "error creating allocator thread: %li", ++ PTR_ERR(p)); + return PTR_ERR(p); ++ } + + get_task_struct(p); + rcu_assign_pointer(ca->alloc_thread, p); +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 3eb03cf229fb..e38f066680a9 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -115,8 +115,10 @@ static int bch2_gc_check_topology(struct bch_fs *c, + } + + new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL); +- if (!new) ++ if (!new) { ++ bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; ++ } + + bkey_copy(new, cur.k); + +@@ -235,8 +237,10 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + } + + new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); +- if (!new) ++ if (!new) { ++ bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; ++ } + + bkey_reassemble(new, *k); + +@@ -302,8 +306,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + "superblock not marked as containing replicas (type %u)", + k.k->type)) { + ret = bch2_mark_bkey_replicas(c, k); +- if (ret) +- return ret; ++ if (ret) { ++ bch_err(c, "error marking bkey replicas: %i", ret); ++ goto err; ++ } + } + + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k); +@@ -321,6 +327,9 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + + bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); + fsck_err: ++err: ++ if (ret) ++ bch_err(c, "%s: ret %i", __func__, ret); + return ret; + } + +@@ -448,8 +457,10 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + k, &max_stale, true); +- if (ret) ++ if (ret) { ++ bch_err(c, "%s: error %i from 
bch2_gc_mark_key", __func__, ret); + break; ++ } + + if (b->c.level) { + bch2_bkey_buf_reassemble(&cur, c, k); +@@ -493,8 +504,11 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + continue; + } + +- if (ret) ++ if (ret) { ++ bch_err(c, "%s: error %i getting btree node", ++ __func__, ret); + break; ++ } + + ret = bch2_gc_btree_init_recurse(c, child, + target_depth); +@@ -551,6 +565,8 @@ static int bch2_gc_btree_init(struct bch_fs *c, + fsck_err: + six_unlock_read(&b->c.lock); + ++ if (ret) ++ bch_err(c, "%s: ret %i", __func__, ret); + return ret; + } + +@@ -574,8 +590,10 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial) + int ret = initial + ? bch2_gc_btree_init(c, id) + : bch2_gc_btree(c, id, initial); +- if (ret) ++ if (ret) { ++ bch_err(c, "%s: ret %i", __func__, ret); + return ret; ++ } + } + + return 0; +@@ -881,6 +899,8 @@ static int bch2_gc_done(struct bch_fs *c, + #undef copy_stripe_field + #undef copy_field + fsck_err: ++ if (ret) ++ bch_err(c, "%s: ret %i", __func__, ret); + return ret; + } + +@@ -1601,8 +1621,10 @@ int bch2_gc_thread_start(struct bch_fs *c) + BUG_ON(c->gc_thread); + + p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); +- if (IS_ERR(p)) ++ if (IS_ERR(p)) { ++ bch_err(c, "error creating gc thread: %li", PTR_ERR(p)); + return PTR_ERR(p); ++ } + + get_task_struct(p); + c->gc_thread = p; +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 4e3cf219fb91..bbf8e5ad8aa0 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -691,8 +691,10 @@ int bch2_journal_reclaim_start(struct journal *j) + + p = kthread_create(bch2_journal_reclaim_thread, j, + "bch-reclaim/%s", c->name); +- if (IS_ERR(p)) ++ if (IS_ERR(p)) { ++ bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p)); + return PTR_ERR(p); ++ } + + get_task_struct(p); + j->reclaim_thread = p; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index f915b30ab6e0..0b1faee5094c 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -348,8 +348,10 @@ int bch2_copygc_start(struct bch_fs *c) + return -ENOMEM; + + t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); +- if (IS_ERR(t)) ++ if (IS_ERR(t)) { ++ bch_err(c, "error creating copygc thread: %li", PTR_ERR(t)); + return PTR_ERR(t); ++ } + + get_task_struct(t); + +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index d89920b848ee..e101c7f23910 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -315,8 +315,10 @@ int bch2_rebalance_start(struct bch_fs *c) + return 0; + + p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); +- if (IS_ERR(p)) ++ if (IS_ERR(p)) { ++ bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p)); + return PTR_ERR(p); ++ } + + get_task_struct(p); + rcu_assign_pointer(c->rebalance.thread, p); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 54ac9cc470af..0aeaaadbf3f8 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -122,8 +122,11 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + }; + + new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); +- if (!new_keys.d) ++ if (!new_keys.d) { ++ bch_err(c, "%s: error allocating new key array (size %zu)", ++ __func__, new_keys.size); + return -ENOMEM; ++ } + + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); +@@ -145,8 +148,10 @@ int bch2_journal_key_delete(struct bch_fs *c, enum 
btree_id id, + kmalloc(sizeof(struct bkey), GFP_KERNEL); + int ret; + +- if (!whiteout) ++ if (!whiteout) { ++ bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; ++ } + + bkey_init(&whiteout->k); + whiteout->k.p = pos; +@@ -1330,8 +1335,10 @@ int bch2_fs_initialize(struct bch_fs *c) + &lostfound, + 0, 0, S_IFDIR|0700, 0, + NULL, NULL)); +- if (ret) ++ if (ret) { ++ bch_err(c, "error creating lost+found"); + goto err; ++ } + + if (enabled_qtypes(c)) { + ret = bch2_fs_quota_read(c); +-- +cgit v1.2.3 + + +From c3be231d91442e6d6bdac2a1e9680169d81149a6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 23 Feb 2021 21:41:25 -0500 +Subject: bcachefs: Fix for bch2_btree_node_get_noiter() returning -ENOMEM + +bch2_btree_node_get_noiter() isn't used from the btree iterator code, +which retries with the btree node cache cannibalize lock held on +-ENOMEM, so we should do it ourself if necessary. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 4fa3f80a805e..19c219cb317b 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -843,7 +843,7 @@ retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { + if (nofill) +- return NULL; ++ goto out; + + b = bch2_btree_node_fill(c, NULL, k, btree_id, + level, SIX_LOCK_read, true); +@@ -852,8 +852,12 @@ retry: + if (!b) + goto retry; + ++ if (IS_ERR(b) && ++ !bch2_btree_cache_cannibalize_lock(c, NULL)) ++ goto retry; ++ + if (IS_ERR(b)) +- return b; ++ goto out; + } else { + lock_node: + ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); +@@ -888,7 +892,8 @@ lock_node: + + if (unlikely(btree_node_read_error(b))) { + six_unlock_read(&b->c.lock); +- return ERR_PTR(-EIO); ++ b = ERR_PTR(-EIO); ++ goto out; + } + + EBUG_ON(b->c.btree_id != btree_id); +@@ -897,7 +902,8 @@ lock_node: + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bkey_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); +- ++out: ++ bch2_btree_cache_cannibalize_unlock(c); + return b; + } + +-- +cgit v1.2.3 + + +From 6315cb2e7cf75eaee586eddfa881084f13bba2a9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 5 Mar 2021 18:00:55 -0500 +Subject: bcachefs: Create allocator threads when allocating filesystem + +We're seeing failures to mount because of a failure to start the +allocator threads, which currently happens fairly late in the mount +process, after walking all metadata, and kthread_create() fails if +something has tried to kill the mount process, which is probably not +what we want. + +This patch avoids this issue by creating, but not starting, the +allocator threads when we preallocate all of our other in memory data +structures. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 15 ++++++++++++++- + fs/bcachefs/super.c | 11 +++++++++++ + 2 files changed, 25 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 319f401d87c8..ef75d9b8abd1 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1068,6 +1068,12 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) + return 0; + } + ++static inline bool allocator_thread_running(struct bch_dev *ca) ++{ ++ return ca->mi.state == BCH_MEMBER_STATE_RW && ++ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); ++} ++ + /** + * bch_allocator_thread - move buckets from free_inc to reserves + * +@@ -1084,9 +1090,16 @@ static int bch2_allocator_thread(void *arg) + int ret; + + set_freezable(); +- ca->allocator_state = ALLOCATOR_RUNNING; + + while (1) { ++ if (!allocator_thread_running(ca)) { ++ ca->allocator_state = ALLOCATOR_STOPPED; ++ if (kthread_wait_freezable(allocator_thread_running(ca))) ++ break; ++ } ++ ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ + cond_resched(); + if (kthread_should_stop()) + break; +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 8e6b4413d820..e43f7790094a 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -424,6 +424,9 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + + set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + ++ for_each_rw_member(ca, c, i) ++ bch2_wake_allocator(ca); ++ + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) { + bch_err(c, "error starting journal reclaim: %i", ret); +@@ -1000,6 +1003,8 @@ static void bch2_dev_release(struct kobject *kobj) + + static void bch2_dev_free(struct bch_dev *ca) + { ++ bch2_dev_allocator_stop(ca); ++ + cancel_work_sync(&ca->io_error_work); + + if (ca->kobj.state_in_sysfs && +@@ -1164,6 +1169,12 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) + if (!ca) + goto err; + ++ if (ca->mi.state == BCH_MEMBER_STATE_RW && ++ bch2_dev_allocator_start(ca)) { ++ bch2_dev_free(ca); ++ goto err; ++ } ++ + bch2_dev_attach(c, ca, dev_idx); + out: + pr_verbose_init(c->opts, "ret %i", ret); +-- +cgit v1.2.3 + + +From a95f8bafcde2bb8ee36d52d367fdd954000b35cd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 7 Mar 2021 19:04:16 -0500 +Subject: bcachefs: Don't call into journal reclaim when we're not supposed to + +This was causing a deadlock when btree_update_nodes_writtes() invokes +journal reclaim because of the btree cache being too dirty. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e18756b38fc2..53ea91b32fd5 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -284,7 +284,8 @@ btree_key_can_insert_cached(struct btree_trans *trans, + BUG_ON(iter->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && +- bch2_btree_key_cache_must_wait(trans->c)) ++ bch2_btree_key_cache_must_wait(trans->c) && ++ !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + + if (u64s <= ck->u64s) +-- +cgit v1.2.3 + + +From 4bc5037a5d1fe99cdf259ed2d184fd154ad19d4d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 7 Mar 2021 21:43:21 -0500 +Subject: bcachefs: Don't use inode btree key cache in fsck code + +We had a cache coherency bug with the btree key cache in the fsck code - +this fixes fsck to be consistent about not using it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 15 +++++++++------ + fs/bcachefs/inode.c | 19 +++++++++++++++---- + fs/bcachefs/inode.h | 2 ++ + 3 files changed, 26 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index b2d9d55b1951..66c9dad2ef3e 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -58,7 +58,7 @@ static int __remove_dirent(struct btree_trans *trans, + buf[name.len] = '\0'; + name.name = buf; + +- ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); ++ ret = __bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode, 0); + if (ret && ret != -EINTR) + bch_err(c, "remove_dirent: err %i looking up directory inode", ret); + if (ret) +@@ -126,8 +126,8 @@ static int walk_inode(struct btree_trans *trans, + struct inode_walker *w, u64 inum) + { + if (inum != w->cur_inum) { +- int ret = bch2_inode_find_by_inum_trans(trans, inum, +- &w->inode); ++ int ret = __bch2_inode_find_by_inum_trans(trans, inum, ++ &w->inode, 0); + + if (ret && ret != -ENOENT) + return ret; +@@ -673,7 +673,7 @@ retry: + continue; + } + +- ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); ++ ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0); + if (ret && ret != -ENOENT) + break; + +@@ -787,7 +787,9 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) + + bch_verbose(c, "checking root directory"); + +- ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ __bch2_inode_find_by_inum_trans(&trans, BCACHEFS_ROOT_INO, ++ root_inode, 0)); + if (ret && ret != -ENOENT) + return ret; + +@@ -834,7 +836,8 @@ static int check_lostfound(struct bch_fs *c, + goto create_lostfound; + } + +- ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ __bch2_inode_find_by_inum_trans(&trans, inum, lostfound_inode, 0)); + if (ret && ret != -ENOENT) + return ret; + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 746173f15ae3..81feb47fe8f9 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -628,16 +628,19 @@ err: + return ret; + } + +-int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, +- struct bch_inode_unpacked *inode) ++int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode, ++ unsigned flags) + { + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + iter = 
bch2_trans_get_iter(trans, BTREE_ID_INODES, +- POS(0, inode_nr), BTREE_ITER_CACHED); +- k = bch2_btree_iter_peek_cached(iter); ++ POS(0, inode_nr), flags); ++ k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED ++ ? bch2_btree_iter_peek_cached(iter) ++ : bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -650,6 +653,14 @@ err: + return ret; + } + ++int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ return __bch2_inode_find_by_inum_trans(trans, inode_nr, ++ inode, BTREE_ITER_CACHED); ++ ++} ++ + int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, + struct bch_inode_unpacked *inode) + { +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index dbdfcf63d079..1caf036ae928 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -73,6 +73,8 @@ int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); + + int bch2_inode_rm(struct bch_fs *, u64, bool); + ++int __bch2_inode_find_by_inum_trans(struct btree_trans *, u64, ++ struct bch_inode_unpacked *, unsigned); + int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *); + int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); +-- +cgit v1.2.3 + + +From 0e2840760acc3c22af4715dac09772fabad29b29 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Feb 2021 05:05:18 -0500 +Subject: bcachefs: Fix a 64 bit divide on 32 bit + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 2abca1644cdc..aff95824a50d 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -837,13 +837,15 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + + for (i = 0; i < j->nr_ptrs; i++) { + struct bch_dev *ca = c->devs[j->ptrs[i].dev]; ++ u64 offset; ++ ++ div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); + + if (i) + pr_buf(out, " "); + pr_buf(out, "%u:%llu (offset %llu)", + j->ptrs[i].dev, +- (u64) j->ptrs[i].offset, +- (u64) j->ptrs[i].offset % ca->mi.bucket_size); ++ (u64) j->ptrs[i].offset, offset); + } + } + +-- +cgit v1.2.3 + + +From 3f1a691835661e1ecd3698fc4e6225c446b944eb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 24 Feb 2021 01:16:49 -0500 +Subject: bcachefs: Dump journal state when we get stuck + +We had a bug reported where the journal is failing to allocate a journal +write - this should help figure out what's going on. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index aff95824a50d..0d361f5c39b5 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1386,6 +1386,7 @@ void bch2_journal_write(struct closure *cl) + struct jset_entry *start, *end; + struct jset *jset; + struct bio *bio; ++ char *journal_debug_buf = NULL; + bool validate_before_checksum = false; + unsigned i, sectors, bytes, u64s, nr_rw_members = 0; + int ret; +@@ -1487,6 +1488,12 @@ retry_alloc: + goto retry_alloc; + } + ++ if (ret) { ++ journal_debug_buf = kmalloc(4096, GFP_ATOMIC); ++ if (journal_debug_buf) ++ __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); ++ } ++ + /* + * write is allocated, no longer need to account for it in + * bch2_journal_space_available(): +@@ -1501,7 +1508,9 @@ retry_alloc: + spin_unlock(&j->lock); + + if (ret) { +- bch_err(c, "Unable to allocate journal write"); ++ bch_err(c, "Unable to allocate journal write:\n%s", ++ journal_debug_buf); ++ kfree(journal_debug_buf); + bch2_fatal_error(c); + continue_at(cl, journal_write_done, system_highpri_wq); + return; +-- +cgit v1.2.3 + + +From 0841ac3ecb0f1e60ccb1e2da05925b6b4faddfaa Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 14 Mar 2021 19:01:14 -0400 +Subject: bcachefs: Add code to scan for/rewite old btree nodes + +This adds a new data job type to scan for btree nodes in the old extent +format, and rewrite them. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_ioctl.h | 17 +++-- + fs/bcachefs/btree_io.c | 5 ++ + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update_interior.c | 1 + + fs/bcachefs/move.c | 131 +++++++++++++++++++++++++++++------- + fs/bcachefs/move.h | 6 +- + fs/bcachefs/movinggc.c | 6 +- + fs/bcachefs/rebalance.c | 3 +- + 8 files changed, 132 insertions(+), 38 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +index f1cb5d405129..f679fc2151bc 100644 +--- a/fs/bcachefs/bcachefs_ioctl.h ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -171,10 +171,11 @@ struct bch_ioctl_disk_set_state { + }; + + enum bch_data_ops { +- BCH_DATA_OP_SCRUB = 0, +- BCH_DATA_OP_REREPLICATE = 1, +- BCH_DATA_OP_MIGRATE = 2, +- BCH_DATA_OP_NR = 3, ++ BCH_DATA_OP_SCRUB = 0, ++ BCH_DATA_OP_REREPLICATE = 1, ++ BCH_DATA_OP_MIGRATE = 2, ++ BCH_DATA_OP_REWRITE_OLD_NODES = 3, ++ BCH_DATA_OP_NR = 4, + }; + + /* +@@ -187,11 +188,13 @@ enum bch_data_ops { + * job. The file descriptor is O_CLOEXEC. 
+ */ + struct bch_ioctl_data { +- __u32 op; ++ __u16 op; ++ __u8 start_btree; ++ __u8 end_btree; + __u32 flags; + +- struct bpos start; +- struct bpos end; ++ struct bpos start_pos; ++ struct bpos end_pos; + + union { + struct { +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 8a4fbdf47d23..a84a473101dc 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -920,6 +920,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + unsigned u64s; + int ret, retry_read = 0, write = READ; + ++ b->version_ondisk = U16_MAX; ++ + iter = mempool_alloc(&c->fill_iter, GFP_NOIO); + sort_iter_init(iter, b); + iter->size = (btree_blocks(c) + 1) * 2; +@@ -1000,6 +1002,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + sectors = vstruct_sectors(bne, c->block_bits); + } + ++ b->version_ondisk = min(b->version_ondisk, ++ le16_to_cpu(i->version)); ++ + ret = validate_bset(c, ca, b, i, sectors, + READ, have_retry); + if (ret) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index e57ed9d86ee3..03894e923037 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -76,6 +76,7 @@ struct btree { + u16 written; + u8 nsets; + u8 nr_key_bits; ++ u16 version_ondisk; + + struct bkey_format format; + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index dd1b8f6ef9b0..275dcabbbdd6 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -286,6 +286,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + bch2_bset_init_first(b, &b->data->keys); + b->c.level = level; + b->c.btree_id = as->btree_id; ++ b->version_ondisk = c->sb.version; + + memset(&b->nr, 0, sizeof(b->nr)); + b->data->magic = cpu_to_le64(bset_magic(c)); +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index b4c315cf68df..602072b3d4d2 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -532,7 +532,7 @@ static int __bch2_move_data(struct bch_fs *c, + + stats->data_type = BCH_DATA_user; + stats->btree_id = btree_id; +- stats->pos = POS_MIN; ++ stats->pos = start; + + iter = bch2_trans_get_iter(&trans, btree_id, start, + BTREE_ITER_PREFETCH); +@@ -647,14 +647,15 @@ out: + } + + int bch2_move_data(struct bch_fs *c, ++ enum btree_id start_btree_id, struct bpos start_pos, ++ enum btree_id end_btree_id, struct bpos end_pos, + struct bch_ratelimit *rate, + struct write_point_specifier wp, +- struct bpos start, +- struct bpos end, + move_pred_fn pred, void *arg, + struct bch_move_stats *stats) + { + struct moving_context ctxt = { .stats = stats }; ++ enum btree_id id; + int ret; + + closure_init_stack(&ctxt.cl); +@@ -663,10 +664,23 @@ int bch2_move_data(struct bch_fs *c, + + stats->data_type = BCH_DATA_user; + +- ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, +- pred, arg, stats, BTREE_ID_EXTENTS) ?: +- __bch2_move_data(c, &ctxt, rate, wp, start, end, +- pred, arg, stats, BTREE_ID_REFLINK); ++ for (id = start_btree_id; ++ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); ++ id++) { ++ stats->btree_id = id; ++ ++ if (id != BTREE_ID_EXTENTS && ++ id != BTREE_ID_REFLINK) ++ continue; ++ ++ ret = __bch2_move_data(c, &ctxt, rate, wp, ++ id == start_btree_id ? start_pos : POS_MIN, ++ id == end_btree_id ? 
end_pos : POS_MAX, ++ pred, arg, stats, id); ++ if (ret) ++ break; ++ } ++ + + move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + closure_sync(&ctxt.cl); +@@ -680,16 +694,22 @@ int bch2_move_data(struct bch_fs *c, + return ret; + } + ++typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *, ++ struct btree *, struct bch_io_opts *, ++ struct data_opts *); ++ + static int bch2_move_btree(struct bch_fs *c, +- move_pred_fn pred, +- void *arg, ++ enum btree_id start_btree_id, struct bpos start_pos, ++ enum btree_id end_btree_id, struct bpos end_pos, ++ move_btree_pred pred, void *arg, + struct bch_move_stats *stats) + { ++ bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_trans trans; + struct btree_iter *iter; + struct btree *b; +- unsigned id; ++ enum btree_id id; + struct data_opts data_opts; + enum data_cmd cmd; + int ret = 0; +@@ -698,16 +718,24 @@ static int bch2_move_btree(struct bch_fs *c, + + stats->data_type = BCH_DATA_btree; + +- for (id = 0; id < BTREE_ID_NR; id++) { ++ for (id = start_btree_id; ++ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); ++ id++) { + stats->btree_id = id; + +- for_each_btree_node(&trans, iter, id, POS_MIN, ++ for_each_btree_node(&trans, iter, id, ++ id == start_btree_id ? start_pos : POS_MIN, + BTREE_ITER_PREFETCH, b) { ++ if (kthread && kthread_should_stop()) ++ goto out; ++ ++ if ((cmp_int(id, end_btree_id) ?: ++ bkey_cmp(b->key.k.p, end_pos)) > 0) ++ break; ++ + stats->pos = iter->pos; + +- switch ((cmd = pred(c, arg, +- bkey_i_to_s_c(&b->key), +- &io_opts, &data_opts))) { ++ switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) { + case DATA_SKIP: + goto next; + case DATA_SCRUB: +@@ -727,7 +755,7 @@ next: + + ret = bch2_trans_iter_free(&trans, iter) ?: ret; + } +- ++out: + bch2_trans_exit(&trans); + + return ret; +@@ -786,6 +814,38 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, + return DATA_REWRITE; + } + ++static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg, ++ struct btree *b, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); ++} ++ ++static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg, ++ struct btree *b, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); ++} ++ ++static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg, ++ struct btree *b, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ if (b->version_ondisk != c->sb.version || ++ btree_node_need_rewrite(b)) { ++ data_opts->target = 0; ++ data_opts->nr_replicas = 1; ++ data_opts->btree_insert_flags = 0; ++ return DATA_REWRITE; ++ } ++ ++ return DATA_SKIP; ++} ++ + int bch2_data_job(struct bch_fs *c, + struct bch_move_stats *stats, + struct bch_ioctl_data op) +@@ -797,17 +857,20 @@ int bch2_data_job(struct bch_fs *c, + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, -1); + +- ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; ++ ret = bch2_move_btree(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ rereplicate_btree_pred, c, stats) ?: ret; + + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + + ret = bch2_replicas_gc2(c) ?: ret; + +- ret = bch2_move_data(c, NULL, +- writepoint_hashed((unsigned long) 
current), +- op.start, +- op.end, ++ ret = bch2_move_data(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ NULL, writepoint_hashed((unsigned long) current), + rereplicate_pred, c, stats) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + break; +@@ -818,16 +881,32 @@ int bch2_data_job(struct bch_fs *c, + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); + +- ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; ++ ret = bch2_move_btree(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ migrate_btree_pred, &op, stats) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + +- ret = bch2_move_data(c, NULL, +- writepoint_hashed((unsigned long) current), +- op.start, +- op.end, ++ ret = bch2_move_data(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ NULL, writepoint_hashed((unsigned long) current), + migrate_pred, &op, stats) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + break; ++ case BCH_DATA_OP_REWRITE_OLD_NODES: ++ ret = bch2_move_btree(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ rewrite_old_nodes_pred, &op, stats) ?: ret; ++ ++ if (!ret) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version_min = c->disk_sb.sb->version; ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ break; + default: + ret = -EINVAL; + } +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +index b04bc669226d..403ca695c875 100644 +--- a/fs/bcachefs/move.h ++++ b/fs/bcachefs/move.h +@@ -52,9 +52,11 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, + struct bkey_s_c, + struct bch_io_opts *, struct data_opts *); + +-int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, ++int bch2_move_data(struct bch_fs *, ++ enum btree_id, struct bpos, ++ enum btree_id, struct bpos, ++ struct bch_ratelimit *, + struct write_point_specifier, +- struct bpos, struct bpos, + move_pred_fn, void *, + struct bch_move_stats *); + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 0b1faee5094c..03668e481f7a 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -219,9 +219,11 @@ static int bch2_copygc(struct bch_fs *c) + sizeof(h->data[0]), + bucket_offset_cmp, NULL); + +- ret = bch2_move_data(c, &c->copygc_pd.rate, ++ ret = bch2_move_data(c, ++ 0, POS_MIN, ++ BTREE_ID_NR, POS_MAX, ++ &c->copygc_pd.rate, + writepoint_ptr(&c->copygc_write_point), +- POS_MIN, POS_MAX, + copygc_pred, NULL, + &move_stats); + +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index e101c7f23910..9b5078ba6028 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -239,10 +239,11 @@ static int bch2_rebalance_thread(void *arg) + rebalance_work_reset(c); + + bch2_move_data(c, ++ 0, POS_MIN, ++ BTREE_ID_NR, POS_MAX, + /* ratelimiting disabled for now */ + NULL, /* &r->pd.rate, */ + writepoint_ptr(&c->rebalance_write_point), +- POS_MIN, POS_MAX, + rebalance_pred, NULL, + &r->move_stats); + } +-- +cgit v1.2.3 + + +From fd936c392a64c6b7efa4c36b95921dda6b247798 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 22 Mar 2021 18:39:16 -0400 +Subject: bcachefs: Scan for old btree nodes if necessary on mount + +We dropped support for !BTREE_NODE_NEW_EXTENT_OVERWRITE but it turned +out there were people who still had filesystems with btree nodes in that +format in the wild. This adds a new compat feature that indicates we've +scanned for and rewritten nodes in the old format, and does that scan at +mount time if the option isn't set. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 1 + + fs/bcachefs/btree_gc.c | 3 ++- + fs/bcachefs/move.c | 31 ++++++++++++++++++++----------- + fs/bcachefs/move.h | 2 ++ + fs/bcachefs/rebalance.c | 3 +++ + fs/bcachefs/recovery.c | 16 ++++++++++++++++ + 6 files changed, 44 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 618f49ac9f82..266796d40e0e 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1369,6 +1369,7 @@ enum bch_sb_feature { + enum bch_sb_compat { + BCH_COMPAT_FEAT_ALLOC_INFO = 0, + BCH_COMPAT_FEAT_ALLOC_METADATA = 1, ++ BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE = 2, + }; + + /* options: */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index e38f066680a9..0f53217706f3 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1618,7 +1618,8 @@ int bch2_gc_thread_start(struct bch_fs *c) + { + struct task_struct *p; + +- BUG_ON(c->gc_thread); ++ if (c->gc_thread) ++ return 0; + + p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); + if (IS_ERR(p)) { +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 602072b3d4d2..1a546ff505fd 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -846,6 +846,25 @@ static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg, + return DATA_SKIP; + } + ++int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) ++{ ++ int ret; ++ ++ ret = bch2_move_btree(c, ++ 0, POS_MIN, ++ BTREE_ID_NR, POS_MAX, ++ rewrite_old_nodes_pred, c, stats); ++ if (!ret) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; ++ c->disk_sb.sb->version_min = c->disk_sb.sb->version; ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ return ret; ++} ++ + int bch2_data_job(struct bch_fs *c, + struct bch_move_stats *stats, + struct bch_ioctl_data op) +@@ -895,17 +914,7 @@ int bch2_data_job(struct bch_fs *c, + ret = bch2_replicas_gc2(c) ?: ret; + break; + case BCH_DATA_OP_REWRITE_OLD_NODES: +- ret = bch2_move_btree(c, +- op.start_btree, op.start_pos, +- op.end_btree, op.end_pos, +- rewrite_old_nodes_pred, &op, stats) ?: ret; +- +- if (!ret) { +- mutex_lock(&c->sb_lock); +- c->disk_sb.sb->version_min = c->disk_sb.sb->version; +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); +- } ++ ret = bch2_scan_old_btree_nodes(c, stats); + break; + default: + ret = -EINVAL; +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +index 403ca695c875..5076153689d1 100644 +--- a/fs/bcachefs/move.h ++++ b/fs/bcachefs/move.h +@@ -52,6 +52,8 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, + struct bkey_s_c, + struct bch_io_opts *, struct data_opts *); + ++int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); ++ + int bch2_move_data(struct bch_fs *, + enum btree_id, struct bpos, + enum btree_id, struct bpos, +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index 9b5078ba6028..aa9bbdbfa65e 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -312,6 +312,9 @@ int bch2_rebalance_start(struct bch_fs *c) + { + struct task_struct *p; + ++ if (c->rebalance.thread) ++ return 0; ++ + if (c->opts.nochanges) + return 0; + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 0aeaaadbf3f8..e322dc35f992 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -16,6 +16,7 @@ + #include "journal_io.h" + #include 
"journal_reclaim.h" + #include "journal_seq_blacklist.h" ++#include "move.h" + #include "quota.h" + #include "recovery.h" + #include "replicas.h" +@@ -1200,6 +1201,20 @@ use_clean: + bch_verbose(c, "quotas done"); + } + ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE))) { ++ struct bch_move_stats stats = { 0 }; ++ ++ bch_verbose(c, "scanning for old btree nodes"); ++ ret = bch2_fs_read_write(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_scan_old_btree_nodes(c, &stats); ++ if (ret) ++ goto err; ++ bch_verbose(c, "scanning for old btree nodes done"); ++ } ++ + mutex_lock(&c->sb_lock); + if (c->opts.version_upgrade) { + if (c->sb.version < bcachefs_metadata_version_new_versioning) +@@ -1271,6 +1286,7 @@ int bch2_fs_initialize(struct bch_fs *c) + le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +-- +cgit v1.2.3 + + +From e235c49d7b0e52b3e7a59bfc1d27d2c6e995e52e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Mar 2021 23:55:36 -0400 +Subject: bcachefs: Fix bkey format generation for 32 bit fields + +Having a packed format that can represent a field larger than the +unpacked type breaks bkey_packed_successor() assertions - we need to fix this to start using the snapshot filed. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 5 +++-- + fs/bcachefs/bkey.c | 7 ++++++- + fs/bcachefs/move.c | 28 +++++++++++++++++++++++++++- + fs/bcachefs/recovery.c | 4 +++- + 4 files changed, 39 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 266796d40e0e..e580c8c25b53 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1367,9 +1367,10 @@ enum bch_sb_feature { + }; + + enum bch_sb_compat { +- BCH_COMPAT_FEAT_ALLOC_INFO = 0, +- BCH_COMPAT_FEAT_ALLOC_METADATA = 1, ++ BCH_COMPAT_FEAT_ALLOC_INFO = 0, ++ BCH_COMPAT_FEAT_ALLOC_METADATA = 1, + BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE = 2, ++ BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE = 3, + }; + + /* options: */ +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +index c06d0a965be1..e1906f257ef2 100644 +--- a/fs/bcachefs/bkey.c ++++ b/fs/bcachefs/bkey.c +@@ -551,7 +551,12 @@ void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) + static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, + unsigned bits, u64 offset) + { +- offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ ++ bits = min(bits, unpacked_bits); ++ ++ offset = bits == unpacked_bits ? 
0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); + + f->bits_per_field[i] = bits; + f->field_offset[i] = cpu_to_le64(offset); +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 1a546ff505fd..b163d48be31d 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -830,13 +830,38 @@ static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg, + return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + } + ++static bool bformat_needs_redo(struct bkey_format *f) ++{ ++ unsigned i; ++ ++ for (i = 0; i < f->nr_fields; i++) { ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (f->bits_per_field[i] > unpacked_bits) ++ return true; ++ ++ if ((f->bits_per_field[i] == unpacked_bits) && field_offset) ++ return true; ++ ++ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & ++ unpacked_mask) < ++ field_offset) ++ return true; ++ } ++ ++ return false; ++} ++ + static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) + { + if (b->version_ondisk != c->sb.version || +- btree_node_need_rewrite(b)) { ++ btree_node_need_rewrite(b) || ++ bformat_needs_redo(&b->format)) { + data_opts->target = 0; + data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = 0; +@@ -857,6 +882,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) + if (!ret) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE; + c->disk_sb.sb->version_min = c->disk_sb.sb->version; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index e322dc35f992..edcf6389d2fd 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1201,7 +1201,8 @@ use_clean: + bch_verbose(c, "quotas done"); + } + +- if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE))) { ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE)) || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) { + struct bch_move_stats stats = { 0 }; + + bch_verbose(c, "scanning for old btree nodes"); +@@ -1287,6 +1288,7 @@ int bch2_fs_initialize(struct bch_fs *c) + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE; + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +-- +cgit v1.2.3 + + +From abf54e0a89e50b213d890a5c16a137040414569a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 11 Mar 2021 21:46:23 -0500 +Subject: bcachefs: Fix an allocator startup race + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index e43f7790094a..c8ed2e00a1df 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1169,6 +1169,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) + if (!ca) + goto err; + ++ ca->fs = c; ++ + if (ca->mi.state == BCH_MEMBER_STATE_RW && + bch2_dev_allocator_start(ca)) { + bch2_dev_free(ca); +-- +cgit v1.2.3 + + 
+From 20ac10ccc489052c5d2aa77a6bb2a4a71c0eade0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 24 Mar 2021 22:11:22 -0400 +Subject: bcachefs: Fix some (spurious) warnings about uninitialized vars + +These are only complained about when building in userspace, for some +reason. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/extents.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 0f53217706f3..42ba9ce510ef 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -166,7 +166,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); + const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; ++ struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + int ret = 0; + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index ad3e88dd1829..595dd0add509 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -697,7 +697,7 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; ++ struct extent_ptr_decoded p = { 0 }; + unsigned replicas = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +-- +cgit v1.2.3 + + +From aaae336ebca279e42cbb271ca70f8412b1c65a3d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 4 Apr 2021 21:57:35 -0400 +Subject: bcachefs: Use x-macros for compat feature bits + +This is to generate strings for them, so that we can print them out. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 14 ++++++++++---- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/move.c | 7 +++++-- + fs/bcachefs/opts.c | 7 +++++++ + fs/bcachefs/opts.h | 1 + + fs/bcachefs/recovery.c | 28 ++++++++++++++-------------- + fs/bcachefs/super-io.c | 5 ++--- + 7 files changed, 40 insertions(+), 24 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index e580c8c25b53..73eeeb10472a 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1366,11 +1366,17 @@ enum bch_sb_feature { + BCH_FEATURE_NR, + }; + ++#define BCH_SB_COMPAT() \ ++ x(alloc_info, 0) \ ++ x(alloc_metadata, 1) \ ++ x(extents_above_btree_updates_done, 2) \ ++ x(bformat_overflow_done, 3) ++ + enum bch_sb_compat { +- BCH_COMPAT_FEAT_ALLOC_INFO = 0, +- BCH_COMPAT_FEAT_ALLOC_METADATA = 1, +- BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE = 2, +- BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE = 3, ++#define x(f, n) BCH_COMPAT_##f, ++ BCH_SB_COMPAT() ++#undef x ++ BCH_COMPAT_NR, + }; + + /* options: */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 42ba9ce510ef..eb9f6a139aa6 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -762,7 +762,7 @@ static int bch2_gc_done(struct bch_fs *c, + { + struct bch_dev *ca; + bool verify = (!initial || +- (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); ++ (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; + int ret = 0; + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index b163d48be31d..75b7046d6042 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -758,6 +758,9 @@ next: + out: + bch2_trans_exit(&trans); + ++ if (ret) ++ bch_err(c, "error %i in bch2_move_btree", ret); ++ + return ret; + } + +@@ -881,8 +884,8 @@ int bch2_scan_old_btree_nodes(struct 
bch_fs *c, struct bch_move_stats *stats) + rewrite_old_nodes_pred, c, stats); + if (!ret) { + mutex_lock(&c->sb_lock); +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; + c->disk_sb.sb->version_min = c->disk_sb.sb->version; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index 97a36ac0beea..d53b6dccd161 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -23,6 +23,13 @@ const char * const bch2_sb_features[] = { + NULL + }; + ++const char * const bch2_sb_compat[] = { ++#define x(f, n) #f, ++ BCH_SB_COMPAT() ++#undef x ++ NULL ++}; ++ + const char * const bch2_csum_opts[] = { + "none", + "crc32c", +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index c123c42630a6..7ce2b3adb8d7 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -10,6 +10,7 @@ + + extern const char * const bch2_error_actions[]; + extern const char * const bch2_sb_features[]; ++extern const char * const bch2_sb_compat[]; + extern const char * const bch2_csum_opts[]; + extern const char * const bch2_compression_opts[]; + extern const char * const bch2_str_hash_types[]; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index edcf6389d2fd..c42919277c72 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -933,7 +933,7 @@ static int read_btree_roots(struct bch_fs *c) + + if (i == BTREE_ID_ALLOC && + c->opts.reconstruct_alloc) { +- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + continue; + } + +@@ -943,7 +943,7 @@ static int read_btree_roots(struct bch_fs *c) + "invalid btree root %s", + bch2_btree_ids[i]); + if (i == BTREE_ID_ALLOC) +- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + } + + ret = bch2_btree_root_read(c, i, &r->key, r->level); +@@ -953,7 +953,7 @@ static int read_btree_roots(struct bch_fs *c) + "error reading btree root %s", + bch2_btree_ids[i]); + if (i == BTREE_ID_ALLOC) +- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + } + } + +@@ -1020,7 +1020,7 @@ int bch2_fs_recovery(struct bch_fs *c) + last_journal_entry && + !journal_entry_empty(last_journal_entry), c, + "filesystem marked clean but journal not empty")) { +- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + } +@@ -1061,7 +1061,7 @@ use_clean: + } + + if (c->opts.reconstruct_alloc) { +- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + drop_alloc_keys(&c->journal_keys); + } + +@@ -1114,8 +1114,8 @@ use_clean: + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + + if (c->opts.fsck || +- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || +- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA)) || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || + test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + bch_info(c, "starting mark and sweep"); + err = "error in mark and sweep"; +@@ -1201,11 +1201,11 @@ use_clean: + bch_verbose(c, "quotas done"); + 
} + +- if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE)) || +- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) { ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { + struct bch_move_stats stats = { 0 }; + +- bch_verbose(c, "scanning for old btree nodes"); ++ bch_info(c, "scanning for old btree nodes"); + ret = bch2_fs_read_write(c); + if (ret) + goto err; +@@ -1213,7 +1213,7 @@ use_clean: + ret = bch2_scan_old_btree_nodes(c, &stats); + if (ret) + goto err; +- bch_verbose(c, "scanning for old btree nodes done"); ++ bch_info(c, "scanning for old btree nodes done"); + } + + mutex_lock(&c->sb_lock); +@@ -1227,7 +1227,7 @@ use_clean: + } + + if (!test_bit(BCH_FS_ERROR, &c->flags)) { +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; + write_sb = true; + } + +@@ -1287,8 +1287,8 @@ int bch2_fs_initialize(struct bch_fs *c) + le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 09598ec95171..86f1feff3aaa 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -375,7 +375,6 @@ static void bch2_sb_update(struct bch_fs *c) + ca->mi = bch2_mi_to_cpu(mi->members + i); + } + +-/* doesn't copy member info */ + static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) + { + struct bch_sb_field *src_f, *dst_f; +@@ -1079,8 +1078,8 @@ void bch2_fs_mark_clean(struct bch_fs *c) + + SET_BCH_SB_CLEAN(c->disk_sb.sb, true); + +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata; + c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); + c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); + +-- +cgit v1.2.3 + + +From 57be7458e9316b00821e4b14a98a62d84b7c9fd8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 5 Apr 2021 00:53:42 -0400 +Subject: bcachefs: Add a cond_seched() to the allocator thread + +This is just a band-aid fix for now. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index ef75d9b8abd1..8fbae9e95f4e 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1127,6 +1127,7 @@ static int bch2_allocator_thread(void *arg) + pr_debug("free_inc now empty"); + + do { ++ cond_resched(); + /* + * Find some buckets that we can invalidate, either + * they're completely unused, or only contain clean data +-- +cgit v1.2.3 + + +From 52bee387ad6c45df88671d5b8504997459ade22f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 2 Apr 2021 23:41:10 -0400 +Subject: bcachefs: Don't fail mounts due to devices that are marked as failed + +If a given set of replicas is entirely on failed devices, don't fail the +mount: we will still fail the mount if we have some copies on non failed +devices. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/replicas.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index be73b458e4f6..1480a6468dce 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -966,11 +966,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + + percpu_down_read(&c->mark_lock); + for_each_cpu_replicas_entry(&c->replicas, e) { +- unsigned i, nr_online = 0, dflags = 0; ++ unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; + bool metadata = e->data_type < BCH_DATA_user; + +- for (i = 0; i < e->nr_devs; i++) ++ for (i = 0; i < e->nr_devs; i++) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); ++ + nr_online += test_bit(e->devs[i], devs.d); ++ nr_failed += ca->mi.state == BCH_MEMBER_STATE_FAILED; ++ } ++ ++ if (nr_failed == e->nr_devs) ++ continue; + + if (nr_online < e->nr_required) + dflags |= metadata +-- +cgit v1.2.3 + + +From bd00455eef753ac669caae4ec6d8b1730443a172 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 9 Apr 2021 19:04:57 -0400 +Subject: bcachefs: Fix bch2_write_super to obey very_degraded option + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 86f1feff3aaa..677dbb84b81e 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -696,8 +696,12 @@ int bch2_write_super(struct bch_fs *c) + const char *err; + struct bch_devs_mask sb_written; + bool wrote, can_mount_without_written, can_mount_with_written; ++ unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; + int ret = 0; + ++ if (c->opts.very_degraded) ++ degraded_flags |= BCH_FORCE_IF_LOST; ++ + lockdep_assert_held(&c->sb_lock); + + closure_init_stack(cl); +@@ -766,13 +770,13 @@ int bch2_write_super(struct bch_fs *c) + nr_wrote = dev_mask_nr(&sb_written); + + can_mount_with_written = +- bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); ++ bch2_have_enough_devs(c, sb_written, degraded_flags, false); + + for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) + sb_written.d[i] = ~sb_written.d[i]; + + can_mount_without_written = +- bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); ++ bch2_have_enough_devs(c, sb_written, degraded_flags, false); + + /* + * If we would be able to mount _without_ the devices we successfully +-- +cgit v1.2.3 + + +From da89d5a2beae860bca92c5fbcf920e77c1d68c4d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 
Apr 2021 16:54:11 -0400 +Subject: bcachefs: Bring back metadata only gc + +This is useful for the filesystem dump debugging tool - when we're +hitting bugs we want to skip as much of the recovery process as +possible, and the dump tool only needs to know where metadata lives. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 65 ++++++++++++++++++++++++++++++++------------------ + fs/bcachefs/btree_gc.h | 2 +- + fs/bcachefs/recovery.c | 4 +++- + 3 files changed, 46 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index eb9f6a139aa6..fa64aea2db02 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -378,12 +378,13 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + } + + static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, +- bool initial) ++ bool initial, bool metadata_only) + { + struct btree_trans trans; + struct btree_iter *iter; + struct btree *b; +- unsigned depth = bch2_expensive_debug_checks ? 0 ++ unsigned depth = metadata_only ? 1 ++ : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; + u8 max_stale = 0; +@@ -526,11 +527,13 @@ fsck_err: + } + + static int bch2_gc_btree_init(struct bch_fs *c, +- enum btree_id btree_id) ++ enum btree_id btree_id, ++ bool metadata_only) + { + struct btree *b; +- unsigned target_depth = bch2_expensive_debug_checks ? 0 +- : !btree_node_type_needs_gc(btree_id) ? 1 ++ unsigned target_depth = metadata_only ? 1 ++ : bch2_expensive_debug_checks ? 0 ++ : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; + u8 max_stale = 0; + int ret = 0; +@@ -576,7 +579,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) + (int) btree_id_to_gc_phase(r); + } + +-static int bch2_gc_btrees(struct bch_fs *c, bool initial) ++static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) + { + enum btree_id ids[BTREE_ID_NR]; + unsigned i; +@@ -588,8 +591,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial) + for (i = 0; i < BTREE_ID_NR; i++) { + enum btree_id id = ids[i]; + int ret = initial +- ? bch2_gc_btree_init(c, id) +- : bch2_gc_btree(c, id, initial); ++ ? bch2_gc_btree_init(c, id, metadata_only) ++ : bch2_gc_btree(c, id, initial, metadata_only); + if (ret) { + bch_err(c, "%s: ret %i", __func__, ret); + return ret; +@@ -758,10 +761,10 @@ static void bch2_gc_free(struct bch_fs *c) + } + + static int bch2_gc_done(struct bch_fs *c, +- bool initial) ++ bool initial, bool metadata_only) + { + struct bch_dev *ca; +- bool verify = (!initial || ++ bool verify = !metadata_only && (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; + int ret = 0; +@@ -800,7 +803,7 @@ static int bch2_gc_done(struct bch_fs *c, + #define copy_fs_field(_f, _msg, ...) 
\ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + +- { ++ if (!metadata_only) { + struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); + struct stripe *dst, *src; + +@@ -873,20 +876,28 @@ static int bch2_gc_done(struct bch_fs *c, + + copy_fs_field(hidden, "hidden"); + copy_fs_field(btree, "btree"); +- copy_fs_field(data, "data"); +- copy_fs_field(cached, "cached"); +- copy_fs_field(reserved, "reserved"); +- copy_fs_field(nr_inodes,"nr_inodes"); + +- for (i = 0; i < BCH_REPLICAS_MAX; i++) +- copy_fs_field(persistent_reserved[i], +- "persistent_reserved[%i]", i); ++ if (!metadata_only) { ++ copy_fs_field(data, "data"); ++ copy_fs_field(cached, "cached"); ++ copy_fs_field(reserved, "reserved"); ++ copy_fs_field(nr_inodes,"nr_inodes"); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ copy_fs_field(persistent_reserved[i], ++ "persistent_reserved[%i]", i); ++ } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + char buf[80]; + ++ if (metadata_only && ++ (e->data_type == BCH_DATA_user || ++ e->data_type == BCH_DATA_cached)) ++ continue; ++ + bch2_replicas_entry_to_text(&PBUF(buf), e); + + copy_fs_field(replicas[i], "%s", buf); +@@ -904,7 +915,8 @@ fsck_err: + return ret; + } + +-static int bch2_gc_start(struct bch_fs *c) ++static int bch2_gc_start(struct bch_fs *c, ++ bool metadata_only) + { + struct bch_dev *ca; + unsigned i; +@@ -968,6 +980,13 @@ static int bch2_gc_start(struct bch_fs *c) + + d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; + d->gen_valid = s->gen_valid; ++ ++ if (metadata_only && ++ (s->mark.data_type == BCH_DATA_user || ++ s->mark.data_type == BCH_DATA_cached)) { ++ d->_mark = s->mark; ++ d->_mark.owned_by_allocator = 0; ++ } + } + }; + +@@ -994,7 +1013,7 @@ static int bch2_gc_start(struct bch_fs *c) + * move around - if references move backwards in the ordering GC + * uses, GC could skip past them + */ +-int bch2_gc(struct bch_fs *c, bool initial) ++int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) + { + struct bch_dev *ca; + u64 start_time = local_clock(); +@@ -1010,13 +1029,13 @@ int bch2_gc(struct bch_fs *c, bool initial) + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + again: +- ret = bch2_gc_start(c); ++ ret = bch2_gc_start(c, metadata_only); + if (ret) + goto out; + + bch2_mark_superblocks(c); + +- ret = bch2_gc_btrees(c, initial); ++ ret = bch2_gc_btrees(c, initial, metadata_only); + if (ret) + goto out; + +@@ -1054,7 +1073,7 @@ out: + bch2_journal_block(&c->journal); + + percpu_down_write(&c->mark_lock); +- ret = bch2_gc_done(c, initial); ++ ret = bch2_gc_done(c, initial, metadata_only); + + bch2_journal_unblock(&c->journal); + } else { +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index fa604efc70cc..f516faded269 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -6,7 +6,7 @@ + + void bch2_coalesce(struct bch_fs *); + +-int bch2_gc(struct bch_fs *, bool); ++int bch2_gc(struct bch_fs *, bool, bool); + int bch2_gc_gens(struct bch_fs *); + void bch2_gc_thread_stop(struct bch_fs *); + int bch2_gc_thread_start(struct bch_fs *); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index c42919277c72..740fdeafe1a2 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1117,9 +1117,11 @@ use_clean: + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || + test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) 
{ ++ bool metadata_only = c->opts.norecovery; ++ + bch_info(c, "starting mark and sweep"); + err = "error in mark and sweep"; +- ret = bch2_gc(c, true); ++ ret = bch2_gc(c, true, metadata_only); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); +-- +cgit v1.2.3 + + +From ac7db1efa8781983d7efc72b17dfbd80ae0dda61 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Apr 2021 18:02:57 -0400 +Subject: bcachefs: Fix a use-after-free in bch2_gc_mark_key() + +bch2_check_fix_ptrs() can update/reallocate k + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 43 ++++++++++++++++++++++++------------------- + 1 file changed, 24 insertions(+), 19 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index fa64aea2db02..11a9e5817219 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -281,10 +281,10 @@ fsck_err: + + static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + unsigned level, bool is_root, +- struct bkey_s_c k, ++ struct bkey_s_c *k, + u8 *max_stale, bool initial) + { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + unsigned flags = + BTREE_TRIGGER_GC| +@@ -293,28 +293,29 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + + if (initial) { + BUG_ON(bch2_journal_seq_verify && +- k.k->version.lo > journal_cur_seq(&c->journal)); ++ k->k->version.lo > journal_cur_seq(&c->journal)); + +- if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, ++ if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, + "key version number higher than recorded: %llu > %llu", +- k.k->version.lo, ++ k->k->version.lo, + atomic64_read(&c->key_version))) +- atomic64_set(&c->key_version, k.k->version.lo); ++ atomic64_set(&c->key_version, k->k->version.lo); + + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || +- fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, ++ fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, + "superblock not marked as containing replicas (type %u)", +- k.k->type)) { +- ret = bch2_mark_bkey_replicas(c, k); ++ k->k->type)) { ++ ret = bch2_mark_bkey_replicas(c, *k); + if (ret) { + bch_err(c, "error marking bkey replicas: %i", ret); + goto err; + } + } + +- ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k); ++ ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); + } + ++ ptrs = bch2_bkey_ptrs_c(*k); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); +@@ -325,7 +326,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + *max_stale = max(*max_stale, ptr_stale(ca, ptr)); + } + +- bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); ++ bch2_mark_key(c, *k, 0, k->k->size, NULL, 0, flags); + fsck_err: + err: + if (ret) +@@ -356,7 +357,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + bch2_bkey_debugcheck(c, b, k); + + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, +- k, max_stale, initial); ++ &k, max_stale, initial); + if (ret) + break; + +@@ -426,10 +427,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + + mutex_lock(&c->btree_root_lock); + b = c->btree_roots[btree_id].b; +- if (!btree_node_fake(b)) ++ if (!btree_node_fake(b)) { ++ struct bkey_s_c k = bkey_i_to_s_c(&b->key); ++ + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, +- bkey_i_to_s_c(&b->key), +- &max_stale, initial); ++ &k, 
&max_stale, initial); ++ } + gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); + mutex_unlock(&c->btree_root_lock); + +@@ -457,7 +460,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); + + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, +- k, &max_stale, true); ++ &k, &max_stale, true); + if (ret) { + bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); + break; +@@ -561,10 +564,12 @@ static int bch2_gc_btree_init(struct bch_fs *c, + if (b->c.level >= target_depth) + ret = bch2_gc_btree_init_recurse(c, b, target_depth); + +- if (!ret) ++ if (!ret) { ++ struct bkey_s_c k = bkey_i_to_s_c(&b->key); ++ + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, +- bkey_i_to_s_c(&b->key), +- &max_stale, true); ++ &k, &max_stale, true); ++ } + fsck_err: + six_unlock_read(&b->c.lock); + +-- +cgit v1.2.3 + + +From 90505cafb377facfa9796b87a093d75dab93abea Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Apr 2021 18:59:54 -0400 +Subject: bcachefs: Don't drop ptrs to btree nodes + +If a ptr gen doesn't match the bucket gen, the bucket likely doesn't +contain the data we want - but it's still possible the data we want +might have been overwritten, and for btree node pointers we can verify +whether or not the node is the one we wanted with the node's sequence +number, so it's better to keep the pointer and try reading from it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 51 ++++++++++++++++++++++++++++++++------------------ + fs/bcachefs/btree_io.c | 5 ++++- + 2 files changed, 37 insertions(+), 19 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 11a9e5817219..ea7ded70570e 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -244,25 +244,40 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + + bkey_reassemble(new, *k); + +- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, true); +- +- (ptr->cached && +- (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || +- (!ptr->cached && +- gen_cmp(ptr->gen, g->mark.gen) < 0); +- })); ++ if (level) { ++ /* ++ * We don't want to drop btree node pointers - if the ++ * btree node isn't there anymore, the read path will ++ * sort it out: ++ */ ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ ++ ptr->gen = g->mark.gen; ++ } ++ } else { ++ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ ++ (ptr->cached && ++ (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || ++ (!ptr->cached && ++ gen_cmp(ptr->gen, g->mark.gen) < 0); ++ })); + again: +- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); +- bkey_extent_entry_for_each(ptrs, entry) { +- if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { +- struct stripe *m = genradix_ptr(&c->stripes[true], +- entry->stripe_ptr.idx); +- +- if (!m || !m->alive) { +- bch2_bkey_extent_entry_drop(new, entry); +- goto again; ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { ++ struct stripe *m = genradix_ptr(&c->stripes[true], ++ entry->stripe_ptr.idx); ++ ++ if (!m || !m->alive) { 
++ bch2_bkey_extent_entry_drop(new, entry); ++ goto again; ++ } + } + } + } +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index a84a473101dc..5984f7cb8f33 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1207,14 +1207,17 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + struct btree_read_bio *rb; + struct bch_dev *ca; + struct bio *bio; ++ char buf[200]; + int ret; + ++ btree_pos_to_text(&PBUF(buf), c, b); + trace_btree_read(c, b); + + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick); + if (bch2_fs_fatal_err_on(ret <= 0, c, +- "btree node read error: no device to read from")) { ++ "btree node read error: no device to read from\n" ++ " at %s", buf)) { + set_btree_node_read_error(b); + return; + } +-- +cgit v1.2.3 + + +From df0299b825046d57b9a2031da1b92393312d248e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 13 Apr 2021 09:49:23 -0400 +Subject: bcachefs: Fix copygc threshold + +Awhile back the meaning of is_available_bucket() and thus also +bch_dev_usage->buckets_unavailable changed to include buckets that are +owned by the allocator - this was so that the stat could be persisted +like other allocation information, and wouldn't have to be regenerated +by walking each bucket at mount time. + +This broke copygc, which needs to consider buckets that are reclaimable +and haven't yet been grabbed by the allocator thread and moved onta +freelist. This patch fixes that by adding dev_buckets_reclaimable() for +copygc and the allocator thread, and cleans up some of the callers a bit. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 19 ++++--------------- + fs/bcachefs/alloc_foreground.c | 5 ++++- + fs/bcachefs/bcachefs.h | 2 +- + fs/bcachefs/buckets.h | 26 ++++++++++++++++---------- + fs/bcachefs/movinggc.c | 5 ++--- + fs/bcachefs/sysfs.c | 9 ++++++--- + 6 files changed, 33 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 8fbae9e95f4e..ee9f80be920e 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -46,7 +46,7 @@ static void pd_controllers_update(struct work_struct *work) + struct bch_dev_usage stats = bch2_dev_usage_read(ca); + + free += bucket_to_sector(ca, +- __dev_buckets_free(ca, stats)) << 9; ++ __dev_buckets_available(ca, stats)) << 9; + /* + * Bytes of internal fragmentation, which can be + * reclaimed by copy GC +@@ -499,7 +499,6 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + { + unsigned long gc_count = c->gc_count; + s64 available; +- unsigned i; + int ret = 0; + + ca->allocator_state = ALLOCATOR_BLOCKED; +@@ -515,19 +514,12 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + if (gc_count != c->gc_count) + ca->inc_gen_really_needs_gc = 0; + +- available = dev_buckets_available(ca); ++ available = dev_buckets_reclaimable(ca); + available -= ca->inc_gen_really_needs_gc; + +- spin_lock(&c->freelist_lock); +- for (i = 0; i < RESERVE_NR; i++) +- available -= fifo_used(&ca->free[i]); +- spin_unlock(&c->freelist_lock); +- + available = max(available, 0LL); + +- if (available > fifo_free(&ca->free_inc) || +- (available && +- !fifo_full(&ca->free[RESERVE_MOVINGGC]))) ++ if (available) + break; + + up_read(&c->gc_lock); +@@ -1189,7 +1181,7 @@ stop: + void bch2_recalc_capacity(struct bch_fs *c) + { + struct bch_dev *ca; +- u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; ++ u64 capacity = 0, 
reserved_sectors = 0, gc_reserve; + unsigned bucket_size_max = 0; + unsigned long ra_pages = 0; + unsigned i, j; +@@ -1232,8 +1224,6 @@ void bch2_recalc_capacity(struct bch_fs *c) + + dev_reserve *= ca->mi.bucket_size; + +- copygc_threshold += dev_reserve; +- + capacity += bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket); + +@@ -1251,7 +1241,6 @@ void bch2_recalc_capacity(struct bch_fs *c) + + reserved_sectors = min(reserved_sectors, capacity); + +- c->copygc_threshold = copygc_threshold; + c->capacity = capacity - reserved_sectors; + + c->bucket_size_max = bucket_size_max; +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 8f0b94f591be..2e7b19be02b9 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -109,7 +109,9 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) + spin_lock(&c->freelist_lock); + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; ++ + c->open_buckets_nr_free++; ++ ca->nr_open_buckets--; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); +@@ -316,6 +318,7 @@ out: + c->blocked_allocate = 0; + } + ++ ca->nr_open_buckets++; + spin_unlock(&c->freelist_lock); + + bch2_wake_allocator(ca); +@@ -351,7 +354,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, + struct dev_stripe_state *stripe) + { + u64 *v = stripe->next_alloc + ca->dev_idx; +- u64 free_space = dev_buckets_free(ca); ++ u64 free_space = dev_buckets_available(ca); + u64 free_space_inv = free_space + ? div64_u64(1ULL << 48, free_space) + : 1ULL << 48; +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index a9a631a74074..f7222e7a06cd 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -448,6 +448,7 @@ struct bch_dev { + */ + alloc_fifo free[RESERVE_NR]; + alloc_fifo free_inc; ++ unsigned nr_open_buckets; + + open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial_nr; +@@ -773,7 +774,6 @@ struct bch_fs { + copygc_heap copygc_heap; + struct bch_pd_controller copygc_pd; + struct write_point copygc_write_point; +- u64 copygc_threshold; + + /* STRIPES: */ + GENRADIX(struct stripe) stripes[2]; +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 6d15c455e7cc..9a91a4969783 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -175,25 +175,31 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, + return total - stats.buckets_unavailable; + } + +-/* +- * Number of reclaimable buckets - only for use by the allocator thread: +- */ + static inline u64 dev_buckets_available(struct bch_dev *ca) + { + return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); + } + +-static inline u64 __dev_buckets_free(struct bch_dev *ca, +- struct bch_dev_usage stats) ++static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca, ++ struct bch_dev_usage stats) + { +- return __dev_buckets_available(ca, stats) + +- fifo_used(&ca->free[RESERVE_NONE]) + +- fifo_used(&ca->free_inc); ++ struct bch_fs *c = ca->fs; ++ s64 available = __dev_buckets_available(ca, stats); ++ unsigned i; ++ ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) ++ available -= fifo_used(&ca->free[i]); ++ available -= fifo_used(&ca->free_inc); ++ available -= ca->nr_open_buckets; ++ spin_unlock(&c->freelist_lock); ++ ++ return max(available, 0LL); + } + +-static inline u64 dev_buckets_free(struct bch_dev *ca) ++static inline u64 dev_buckets_reclaimable(struct bch_dev 
*ca) + { +- return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); ++ return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca)); + } + + /* Filesystem usage: */ +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 03668e481f7a..8cd39efd2a6e 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -282,13 +282,12 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) + { + struct bch_dev *ca; + unsigned dev_idx; +- u64 fragmented_allowed = c->copygc_threshold; +- u64 fragmented = 0; ++ u64 fragmented_allowed = 0, fragmented = 0; + + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + +- fragmented_allowed += ((__dev_buckets_available(ca, usage) * ++ fragmented_allowed += ((__dev_buckets_reclaimable(ca, usage) * + ca->mi.bucket_size) >> 1); + fragmented += usage.d[BCH_DATA_user].fragmented; + } +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index bc4c3a77ea62..301b06346ef0 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -806,7 +806,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + "free[RESERVE_MOVINGGC]\t%zu/%zu\n" + "free[RESERVE_NONE]\t%zu/%zu\n" + "freelist_wait\t\t%s\n" +- "open buckets\t\t%u/%u (reserved %u)\n" ++ "open buckets allocated\t%u\n" ++ "open buckets this dev\t%u\n" ++ "open buckets total\t%u\n" + "open_buckets_wait\t%s\n" + "open_buckets_btree\t%u\n" + "open_buckets_user\t%u\n" +@@ -818,8 +820,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, + fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + c->freelist_wait.list.first ? "waiting" : "empty", +- c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, +- BTREE_NODE_OPEN_BUCKET_RESERVE, ++ OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ++ ca->nr_open_buckets, ++ OPEN_BUCKETS_COUNT, + c->open_buckets_wait.list.first ? "waiting" : "empty", + nr[BCH_DATA_btree], + nr[BCH_DATA_user], +-- +cgit v1.2.3 + + +From 802a951df2da58a6097463f9a25c3cd0caf0cf63 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 13 Apr 2021 14:45:55 -0400 +Subject: bcachefs: Add copygc wait to sysfs + +Currently debugging an issue with copygc not running when it's supposed +to, and this is an obvious first step. 
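As rough, back-of-the-envelope arithmetic for the copygc wait heuristic touched above (not kernel code; the single-device numbers are invented, and the full wait computation is not shown in this hunk): each rw device contributes half of its reclaimable capacity as a fragmentation allowance, and copygc roughly stays idle while the summed user fragmentation sits below that allowance.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical single-device numbers, in 512-byte sectors. */
	uint64_t reclaimable_buckets = 1000;	/* dev_buckets_reclaimable() */
	uint64_t bucket_size         = 1024;	/* 512 KiB buckets */
	uint64_t fragmented          = 100000;	/* usage.d[BCH_DATA_user].fragmented */

	/* Same shift as the hunk above: allow up to half of reclaimable space. */
	uint64_t fragmented_allowed = (reclaimable_buckets * bucket_size) >> 1;

	printf("allowance: %llu sectors, fragmentation: %llu sectors\n",
	       (unsigned long long)fragmented_allowed,
	       (unsigned long long)fragmented);
	printf("copygc %s\n", fragmented < fragmented_allowed
	       ? "keeps waiting" : "should run");
	return 0;
}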
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/movinggc.c | 3 +++ + fs/bcachefs/sysfs.c | 5 +++++ + 3 files changed, 9 insertions(+) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index f7222e7a06cd..dcae4a85f967 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -774,6 +774,7 @@ struct bch_fs { + copygc_heap copygc_heap; + struct bch_pd_controller copygc_pd; + struct write_point copygc_write_point; ++ s64 copygc_wait; + + /* STRIPES: */ + GENRADIX(struct stripe) stripes[2]; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 8cd39efd2a6e..b7ab9dce8787 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -311,11 +311,14 @@ static int bch2_copygc_thread(void *arg) + wait = bch2_copygc_wait_amount(c); + + if (wait > clock->max_slop) { ++ c->copygc_wait = last + wait; + bch2_kthread_io_clock_wait(clock, last + wait, + MAX_SCHEDULE_TIMEOUT); + continue; + } + ++ c->copygc_wait = 0; ++ + if (bch2_copygc(c)) + break; + } +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 301b06346ef0..f0b4e3116c26 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -188,6 +188,7 @@ rw_attribute(cache_replacement_policy); + rw_attribute(label); + + rw_attribute(copy_gc_enabled); ++read_attribute(copy_gc_wait); + sysfs_pd_controller_attribute(copy_gc); + + rw_attribute(rebalance_enabled); +@@ -336,6 +337,9 @@ SHOW(bch2_fs) + sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); + sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ + sysfs_pd_controller_show(copy_gc, &c->copygc_pd); ++ sysfs_hprint(copy_gc_wait, ++ max(0LL, c->copygc_wait - ++ atomic64_read(&c->io_clock[WRITE].now)) << 9); + + if (attr == &sysfs_rebalance_work) { + bch2_rebalance_work_to_text(&out, c); +@@ -563,6 +567,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_prune_cache, + + &sysfs_copy_gc_enabled, ++ &sysfs_copy_gc_wait, + + &sysfs_rebalance_enabled, + &sysfs_rebalance_work, +-- +cgit v1.2.3 + + +From e79bbb3ad5602dae94b9793d6827af697905a727 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 17 Apr 2021 20:24:54 -0400 +Subject: bcachefs: Rip out copygc pd controller + +We have a separate mechanism for ratelimiting copygc now - the pd +controller has only been causing problems. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 35 ----------------------------------- + fs/bcachefs/bcachefs.h | 4 ---- + fs/bcachefs/movinggc.c | 7 +------ + fs/bcachefs/super.c | 4 ---- + fs/bcachefs/sysfs.c | 11 ----------- + 5 files changed, 1 insertion(+), 60 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index ee9f80be920e..e115f4f1f1d4 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -31,38 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { + #undef x + }; + +-/* Ratelimiting/PD controllers */ +- +-static void pd_controllers_update(struct work_struct *work) +-{ +- struct bch_fs *c = container_of(to_delayed_work(work), +- struct bch_fs, +- pd_controllers_update); +- struct bch_dev *ca; +- s64 free = 0, fragmented = 0; +- unsigned i; +- +- for_each_member_device(ca, c, i) { +- struct bch_dev_usage stats = bch2_dev_usage_read(ca); +- +- free += bucket_to_sector(ca, +- __dev_buckets_available(ca, stats)) << 9; +- /* +- * Bytes of internal fragmentation, which can be +- * reclaimed by copy GC +- */ +- fragmented += max_t(s64, 0, (bucket_to_sector(ca, +- stats.d[BCH_DATA_user].buckets + +- stats.d[BCH_DATA_cached].buckets) - +- (stats.d[BCH_DATA_user].sectors + +- stats.d[BCH_DATA_cached].sectors)) << 9); +- } +- +- bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); +- schedule_delayed_work(&c->pd_controllers_update, +- c->pd_controllers_update_seconds * HZ); +-} +- + /* Persistent alloc info: */ + + static inline u64 alloc_field_v1_get(const struct bch_alloc *a, +@@ -1405,7 +1373,4 @@ int bch2_dev_allocator_start(struct bch_dev *ca) + void bch2_fs_allocator_background_init(struct bch_fs *c) + { + spin_lock_init(&c->freelist_lock); +- +- c->pd_controllers_update_seconds = 5; +- INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); + } +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index dcae4a85f967..cb2f96768075 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -665,9 +665,6 @@ struct bch_fs { + struct workqueue_struct *copygc_wq; + + /* ALLOCATION */ +- struct delayed_work pd_controllers_update; +- unsigned pd_controllers_update_seconds; +- + struct bch_devs_mask rw_devs[BCH_DATA_NR]; + + u64 capacity; /* sectors */ +@@ -772,7 +769,6 @@ struct bch_fs { + /* COPYGC */ + struct task_struct *copygc_thread; + copygc_heap copygc_heap; +- struct bch_pd_controller copygc_pd; + struct write_point copygc_write_point; + s64 copygc_wait; + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index b7ab9dce8787..852d42e49422 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -222,7 +222,7 @@ static int bch2_copygc(struct bch_fs *c) + ret = bch2_move_data(c, + 0, POS_MIN, + BTREE_ID_NR, POS_MAX, +- &c->copygc_pd.rate, ++ NULL, + writepoint_ptr(&c->copygc_write_point), + copygc_pred, NULL, + &move_stats); +@@ -328,9 +328,6 @@ static int bch2_copygc_thread(void *arg) + + void bch2_copygc_stop(struct bch_fs *c) + { +- c->copygc_pd.rate.rate = UINT_MAX; +- bch2_ratelimit_reset(&c->copygc_pd.rate); +- + if (c->copygc_thread) { + kthread_stop(c->copygc_thread); + put_task_struct(c->copygc_thread); +@@ -367,6 +364,4 @@ int bch2_copygc_start(struct bch_fs *c) + + void bch2_fs_copygc_init(struct bch_fs *c) + { +- bch2_pd_controller_init(&c->copygc_pd); +- c->copygc_pd.d_term = 0; + } +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index c8ed2e00a1df..8bead6afd65f 100644 +--- 
a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -286,7 +286,6 @@ void bch2_fs_read_only(struct bch_fs *c) + percpu_ref_kill(&c->writes); + + cancel_work_sync(&c->ec_stripe_delete_work); +- cancel_delayed_work(&c->pd_controllers_update); + + /* + * If we're not doing an emergency shutdown, we want to wait on +@@ -371,8 +370,6 @@ static int bch2_fs_read_write_late(struct bch_fs *c) + return ret; + } + +- schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); +- + schedule_work(&c->ec_stripe_delete_work); + + return 0; +@@ -563,7 +560,6 @@ void __bch2_fs_stop(struct bch_fs *c) + cancel_work_sync(&ca->io_error_work); + + cancel_work_sync(&c->btree_write_error_work); +- cancel_delayed_work_sync(&c->pd_controllers_update); + cancel_work_sync(&c->read_only_work); + + for (i = 0; i < c->sb.nr_devices; i++) +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index f0b4e3116c26..0af6d461496c 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -189,7 +189,6 @@ rw_attribute(label); + + rw_attribute(copy_gc_enabled); + read_attribute(copy_gc_wait); +-sysfs_pd_controller_attribute(copy_gc); + + rw_attribute(rebalance_enabled); + sysfs_pd_controller_attribute(rebalance); +@@ -198,8 +197,6 @@ rw_attribute(promote_whole_extents); + + read_attribute(new_stripes); + +-rw_attribute(pd_controllers_update_seconds); +- + read_attribute(io_timers_read); + read_attribute(io_timers_write); + +@@ -331,12 +328,8 @@ SHOW(bch2_fs) + + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + +- sysfs_print(pd_controllers_update_seconds, +- c->pd_controllers_update_seconds); +- + sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); + sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ +- sysfs_pd_controller_show(copy_gc, &c->copygc_pd); + sysfs_hprint(copy_gc_wait, + max(0LL, c->copygc_wait - + atomic64_read(&c->io_clock[WRITE].now)) << 9); +@@ -447,10 +440,7 @@ STORE(bch2_fs) + return ret; + } + +- sysfs_strtoul(pd_controllers_update_seconds, +- c->pd_controllers_update_seconds); + sysfs_pd_controller_store(rebalance, &c->rebalance.pd); +- sysfs_pd_controller_store(copy_gc, &c->copygc_pd); + + sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); + +@@ -572,7 +562,6 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_rebalance_enabled, + &sysfs_rebalance_work, + sysfs_pd_controller_files(rebalance), +- sysfs_pd_controller_files(copy_gc), + + &sysfs_new_stripes, + +-- +cgit v1.2.3 + + +From be9c217156df138229a714b7d01ac668c2941319 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 18 Apr 2021 17:54:56 -0400 +Subject: bcachefs: Add allocator thread state to sysfs + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 25 ++++++++++++++++--------- + fs/bcachefs/alloc_background.h | 4 +++- + fs/bcachefs/alloc_types.h | 12 ++++++++++++ + fs/bcachefs/bcachefs.h | 11 +---------- + fs/bcachefs/movinggc.c | 2 +- + fs/bcachefs/sysfs.c | 6 ++++-- + 6 files changed, 37 insertions(+), 23 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index e115f4f1f1d4..0f81d155c1ed 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -25,6 +25,13 @@ + #include + #include + ++const char * const bch2_allocator_states[] = { ++#define x(n) #n, ++ ALLOC_THREAD_STATES() ++#undef x ++ NULL ++}; ++ + static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { + #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, + BCH_ALLOC_FIELDS_V1() +@@ -469,7 +476,7 @@ static int 
wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + s64 available; + int ret = 0; + +- ca->allocator_state = ALLOCATOR_BLOCKED; ++ ca->allocator_state = ALLOCATOR_blocked; + closure_wake_up(&c->freelist_wait); + + while (1) { +@@ -497,7 +504,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + } + + __set_current_state(TASK_RUNNING); +- ca->allocator_state = ALLOCATOR_RUNNING; ++ ca->allocator_state = ALLOCATOR_running; + closure_wake_up(&c->freelist_wait); + + return ret; +@@ -978,15 +985,15 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t + fifo_pop(&ca->free_inc, bucket); + + closure_wake_up(&c->freelist_wait); +- ca->allocator_state = ALLOCATOR_RUNNING; ++ ca->allocator_state = ALLOCATOR_running; + + spin_unlock(&c->freelist_lock); + goto out; + } + } + +- if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { +- ca->allocator_state = ALLOCATOR_BLOCKED_FULL; ++ if (ca->allocator_state != ALLOCATOR_blocked_full) { ++ ca->allocator_state = ALLOCATOR_blocked_full; + closure_wake_up(&c->freelist_wait); + } + +@@ -1053,12 +1060,12 @@ static int bch2_allocator_thread(void *arg) + + while (1) { + if (!allocator_thread_running(ca)) { +- ca->allocator_state = ALLOCATOR_STOPPED; ++ ca->allocator_state = ALLOCATOR_stopped; + if (kthread_wait_freezable(allocator_thread_running(ca))) + break; + } + +- ca->allocator_state = ALLOCATOR_RUNNING; ++ ca->allocator_state = ALLOCATOR_running; + + cond_resched(); + if (kthread_should_stop()) +@@ -1139,7 +1146,7 @@ static int bch2_allocator_thread(void *arg) + + stop: + pr_debug("alloc thread stopping (ret %i)", ret); +- ca->allocator_state = ALLOCATOR_STOPPED; ++ ca->allocator_state = ALLOCATOR_stopped; + closure_wake_up(&c->freelist_wait); + return 0; + } +@@ -1319,7 +1326,7 @@ void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) + { + if (ca->alloc_thread) + closure_wait_event(&c->freelist_wait, +- ca->allocator_state != ALLOCATOR_RUNNING); ++ ca->allocator_state != ALLOCATOR_running); + } + + /* stop allocator thread: */ +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 6fededcd9f86..73e1c27c96e3 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -6,6 +6,8 @@ + #include "alloc_types.h" + #include "debug.h" + ++extern const char * const bch2_allocator_states[]; ++ + struct bkey_alloc_unpacked { + u64 bucket; + u8 dev; +@@ -100,7 +102,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) + p = rcu_dereference(ca->alloc_thread); + if (p) { + wake_up_process(p); +- ca->allocator_state = ALLOCATOR_RUNNING; ++ ca->allocator_state = ALLOCATOR_running; + } + rcu_read_unlock(); + } +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index be164d6108bb..4a1cd8b73d16 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -10,6 +10,18 @@ + + struct ec_bucket_buf; + ++#define ALLOC_THREAD_STATES() \ ++ x(stopped) \ ++ x(running) \ ++ x(blocked) \ ++ x(blocked_full) ++ ++enum allocator_states { ++#define x(n) ALLOCATOR_##n, ++ ALLOC_THREAD_STATES() ++#undef x ++}; ++ + enum alloc_reserve { + RESERVE_BTREE_MOVINGGC = -2, + RESERVE_BTREE = -1, +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index cb2f96768075..69eafed9feac 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -458,16 +458,7 @@ struct bch_dev { + size_t inc_gen_needs_gc; + size_t inc_gen_really_needs_gc; + +- /* +- * XXX: this should be an enum for allocator state, so as 
to include +- * error state +- */ +- enum { +- ALLOCATOR_STOPPED, +- ALLOCATOR_RUNNING, +- ALLOCATOR_BLOCKED, +- ALLOCATOR_BLOCKED_FULL, +- } allocator_state; ++ enum allocator_states allocator_state; + + alloc_heap alloc_heap; + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 852d42e49422..80772cff0f9d 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -108,7 +108,7 @@ static bool have_copygc_reserve(struct bch_dev *ca) + + spin_lock(&ca->fs->freelist_lock); + ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || +- ca->allocator_state != ALLOCATOR_RUNNING; ++ ca->allocator_state != ALLOCATOR_running; + spin_unlock(&ca->fs->freelist_lock); + + return ret; +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 0af6d461496c..c01d2cc55d1e 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -806,7 +806,8 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + "open_buckets_wait\t%s\n" + "open_buckets_btree\t%u\n" + "open_buckets_user\t%u\n" +- "btree reserve cache\t%u\n", ++ "btree reserve cache\t%u\n" ++ "thread state:\t\t%s\n", + stats.buckets_ec, + __dev_buckets_available(ca, stats), + stats.buckets_alloc, +@@ -820,7 +821,8 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + c->open_buckets_wait.list.first ? "waiting" : "empty", + nr[BCH_DATA_btree], + nr[BCH_DATA_user], +- c->btree_reserve_cache_nr); ++ c->btree_reserve_cache_nr, ++ bch2_allocator_states[ca->allocator_state]); + } + + static const char * const bch2_rw[] = { +-- +cgit v1.2.3 + + +From 6b34dcea9f520a08af4cfdb14b4aec15cd94f0cd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 18 Apr 2021 18:01:49 -0400 +Subject: bcachefs: Fix for copygc getting stuck waiting for reserve to be + filled + +This fixes a regression from the patch + bcachefs: Fix copygc dying on startup + +In general only the allocator thread itself should be updating +ca->allocator_state, the thread waking up the allocator setting it is an +ugly hack only needed to avoid racing with the copygc threads when we're +first starting up. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.h | 4 +--- + fs/bcachefs/super.c | 9 ++++++++- + 2 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 73e1c27c96e3..ad15a80602c0 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -100,10 +100,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) + + rcu_read_lock(); + p = rcu_dereference(ca->alloc_thread); +- if (p) { ++ if (p) + wake_up_process(p); +- ca->allocator_state = ALLOCATOR_running; +- } + rcu_read_unlock(); + } + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 8bead6afd65f..aaf49a6db1f4 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -900,9 +900,16 @@ int bch2_fs_start(struct bch_fs *c) + /* + * Allocator threads don't start filling copygc reserve until after we + * set BCH_FS_STARTED - wake them now: ++ * ++ * XXX ugly hack: ++ * Need to set ca->allocator_state here instead of relying on the ++ * allocator threads to do it to avoid racing with the copygc threads ++ * checking it and thinking they have no alloc reserve: + */ +- for_each_online_member(ca, c, i) ++ for_each_online_member(ca, c, i) { ++ ca->allocator_state = ALLOCATOR_running; + bch2_wake_allocator(ca); ++ } + + if (c->opts.read_only || c->opts.nochanges) { + bch2_fs_read_only(c); +-- +cgit v1.2.3 + + +From f90478644feb63c5d5155f371ba60821961dcb42 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 21 Jun 2021 16:30:52 -0400 +Subject: bcachefs: Start journal reclaim thread earlier + +Especially in userspace, we sometime run into resource exhaustion issues +with starting up threads after mark and sweep/fsck. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 2 +- + fs/bcachefs/super.c | 8 +------- + 2 files changed, 2 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 395021b5ac8e..594f1c754114 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1045,7 +1045,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + bch2_journal_space_available(j); + spin_unlock(&j->lock); + +- return 0; ++ return bch2_journal_reclaim_start(j); + } + + /* init/exit: */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index aaf49a6db1f4..6e3b4f10e2ae 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -269,7 +269,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes) + void bch2_fs_read_only(struct bch_fs *c) + { + if (!test_bit(BCH_FS_RW, &c->flags)) { +- BUG_ON(c->journal.reclaim_thread); ++ bch2_journal_reclaim_stop(&c->journal); + return; + } + +@@ -424,12 +424,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + for_each_rw_member(ca, c, i) + bch2_wake_allocator(ca); + +- ret = bch2_journal_reclaim_start(&c->journal); +- if (ret) { +- bch_err(c, "error starting journal reclaim: %i", ret); +- return ret; +- } +- + if (!early) { + ret = bch2_fs_read_write_late(c); + if (ret) +-- +cgit v1.2.3 + + +From f53598336f4aea30ab6464328eb86bbc66ae930e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Apr 2021 00:09:06 -0400 +Subject: bcachefs: Add a mempool for btree_trans bump allocator + +This allocation is required for filesystem operations to make forward +progress, thus needs a mempool. 
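The hunks below pair the existing krealloc() with a one-element mempool so the transaction's bump allocator can always be grown. A condensed sketch of that fallback pattern, with illustrative names (trans_mem_grow, old_bytes and pool_elem_size are not from the patch):

#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/string.h>

/*
 * Sketch of the fallback added below: try the normal allocator first, and if
 * that fails fall back to a single-element mempool sized for the largest
 * possible request, so the operation can still make forward progress.
 */
static void *trans_mem_grow(mempool_t *pool, void *old, size_t old_bytes,
			    size_t new_bytes, size_t pool_elem_size)
{
	void *p = krealloc(old, new_bytes, GFP_NOFS);

	if (!p && new_bytes <= pool_elem_size) {
		/* Emergency path: may wait for a pool element, but will not fail. */
		p = mempool_alloc(pool, GFP_KERNEL);
		if (old) {
			memcpy(p, old, old_bytes);	/* krealloc left 'old' untouched */
			kfree(old);
		}
	}

	return p;
}

A single preallocated element is enough for the forward-progress guarantee: a transaction that falls back to the pool element hands it back on exit, as the diff below does in bch2_trans_exit() when mem_bytes == BTREE_TRANS_MEM_MAX.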
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/btree_iter.c | 28 +++++++++++++++++++++++++--- + fs/bcachefs/btree_types.h | 2 ++ + 3 files changed, 28 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 69eafed9feac..735d8f70e5fd 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -645,6 +645,7 @@ struct bch_fs { + struct mutex btree_trans_lock; + struct list_head btree_trans_list; + mempool_t btree_iters_pool; ++ mempool_t btree_trans_mem_pool; + struct btree_iter_buf __percpu *btree_iters_bufs; + + struct srcu_struct btree_trans_barrier; +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c41fe4e0bc00..152369eb7f8a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2181,7 +2181,16 @@ static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) + if (size > trans->mem_bytes) { + size_t old_bytes = trans->mem_bytes; + size_t new_bytes = roundup_pow_of_two(size); +- void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); ++ void *new_mem; ++ ++ WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); ++ ++ new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); ++ if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { ++ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); ++ new_bytes = BTREE_TRANS_MEM_MAX; ++ kfree(trans->mem); ++ } + + if (!new_mem) + return -ENOMEM; +@@ -2292,6 +2301,11 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + if (expected_mem_bytes) { + trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); + trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); ++ ++ if (!unlikely(trans->mem)) { ++ trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); ++ trans->mem_bytes = BTREE_TRANS_MEM_MAX; ++ } + } + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); +@@ -2321,7 +2335,11 @@ int bch2_trans_exit(struct btree_trans *trans) + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + + kfree(trans->fs_usage_deltas); +- kfree(trans->mem); ++ ++ if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) ++ mempool_free(trans->mem, &trans->c->btree_trans_mem_pool); ++ else ++ kfree(trans->mem); + + #ifdef __KERNEL__ + /* +@@ -2329,6 +2347,7 @@ int bch2_trans_exit(struct btree_trans *trans) + */ + trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); + #endif ++ + if (trans->iters) + mempool_free(trans->iters, &trans->c->btree_iters_pool); + +@@ -2404,6 +2423,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + void bch2_fs_btree_iter_exit(struct bch_fs *c) + { ++ mempool_exit(&c->btree_trans_mem_pool); + mempool_exit(&c->btree_iters_pool); + cleanup_srcu_struct(&c->btree_trans_barrier); + } +@@ -2419,5 +2439,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) + mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * nr + + sizeof(struct btree_insert_entry) * nr + +- sizeof(struct btree_insert_entry) * nr); ++ sizeof(struct btree_insert_entry) * nr) ?: ++ mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, ++ BTREE_TRANS_MEM_MAX); + } +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 03894e923037..a8d362b3eef4 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -346,6 +346,8 @@ struct btree_insert_entry { + #define BTREE_ITER_MAX 32 + #endif + ++#define BTREE_TRANS_MEM_MAX (1U << 14) ++ + struct btree_trans { + struct bch_fs *c; + #ifdef 
CONFIG_BCACHEFS_DEBUG +-- +cgit v1.2.3 + + +From 34f2fbdfa29050a7fd84c89060fbb6bbfda54489 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Apr 2021 00:24:25 -0400 +Subject: bcachefs: Add a mempool for the replicas delta list + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 +++ + fs/bcachefs/btree_iter.c | 9 ++++++++- + fs/bcachefs/buckets.c | 20 ++++++++++++++++++-- + fs/bcachefs/replicas.c | 18 +++++++++++++++++- + fs/bcachefs/replicas.h | 1 + + fs/bcachefs/super.c | 7 +------ + 6 files changed, 48 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 735d8f70e5fd..4f5ccf4bc57c 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -547,6 +547,8 @@ struct btree_iter_buf { + struct btree_iter *iter; + }; + ++#define REPLICAS_DELTA_LIST_MAX (1U << 16) ++ + struct bch_fs { + struct closure cl; + +@@ -574,6 +576,7 @@ struct bch_fs { + struct bch_replicas_cpu replicas; + struct bch_replicas_cpu replicas_gc; + struct mutex replicas_gc_lock; ++ mempool_t replicas_delta_pool; + + struct journal_entry_res btree_root_journal_res; + struct journal_entry_res replicas_journal_res; +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 152369eb7f8a..c5323c0f904a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2334,7 +2334,14 @@ int bch2_trans_exit(struct btree_trans *trans) + + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + +- kfree(trans->fs_usage_deltas); ++ if (trans->fs_usage_deltas) { ++ if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == ++ REPLICAS_DELTA_LIST_MAX) ++ mempool_free(trans->fs_usage_deltas, ++ &trans->c->replicas_delta_pool); ++ else ++ kfree(trans->fs_usage_deltas); ++ } + + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) + mempool_free(trans->mem, &trans->c->btree_trans_mem_pool); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index ba7a472a1bb7..38939e2c1d8a 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -558,10 +558,26 @@ replicas_deltas_realloc(struct btree_trans *trans, unsigned more) + { + struct replicas_delta_list *d = trans->fs_usage_deltas; + unsigned new_size = d ? 
(d->size + more) * 2 : 128; ++ unsigned alloc_size = sizeof(*d) + new_size; ++ ++ WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); + + if (!d || d->used + more > d->size) { +- d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); +- BUG_ON(!d); ++ d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO); ++ ++ BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX); ++ ++ if (!d) { ++ d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO); ++ memset(d, 0, REPLICAS_DELTA_LIST_MAX); ++ ++ if (trans->fs_usage_deltas) ++ memcpy(d, trans->fs_usage_deltas, ++ trans->fs_usage_deltas->size + sizeof(*d)); ++ ++ new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); ++ kfree(trans->fs_usage_deltas); ++ } + + d->size = new_size; + trans->fs_usage_deltas = d; +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 1480a6468dce..fccdb630010c 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -1024,11 +1024,27 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) + return ret; + } + ++void bch2_fs_replicas_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ kfree(c->usage_scratch); ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ free_percpu(c->usage[i]); ++ kfree(c->usage_base); ++ kfree(c->replicas.entries); ++ kfree(c->replicas_gc.entries); ++ ++ mempool_exit(&c->replicas_delta_pool); ++} ++ + int bch2_fs_replicas_init(struct bch_fs *c) + { + bch2_journal_entry_res_resize(&c->journal, + &c->replicas_journal_res, + reserve_journal_replicas(c, &c->replicas)); + +- return replicas_table_update(c, &c->replicas); ++ return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, ++ REPLICAS_DELTA_LIST_MAX) ?: ++ replicas_table_update(c, &c->replicas); + } +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +index 9c8fd3d98247..8cb1f592f1b6 100644 +--- a/fs/bcachefs/replicas.h ++++ b/fs/bcachefs/replicas.h +@@ -77,6 +77,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); + extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; + extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; + ++void bch2_fs_replicas_exit(struct bch_fs *); + int bch2_fs_replicas_init(struct bch_fs *); + + #endif /* _BCACHEFS_REPLICAS_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 6e3b4f10e2ae..27ee527c789d 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -469,6 +469,7 @@ static void __bch2_fs_free(struct bch_fs *c) + bch2_fs_btree_iter_exit(c); + bch2_fs_btree_key_cache_exit(&c->btree_key_cache); + bch2_fs_btree_cache_exit(c); ++ bch2_fs_replicas_exit(c); + bch2_fs_journal_exit(&c->journal); + bch2_io_clock_exit(&c->io_clock[WRITE]); + bch2_io_clock_exit(&c->io_clock[READ]); +@@ -476,10 +477,6 @@ static void __bch2_fs_free(struct bch_fs *c) + bch2_journal_keys_free(&c->journal_keys); + bch2_journal_entries_free(&c->journal_entries); + percpu_free_rwsem(&c->mark_lock); +- kfree(c->usage_scratch); +- for (i = 0; i < ARRAY_SIZE(c->usage); i++) +- free_percpu(c->usage[i]); +- kfree(c->usage_base); + + if (c->btree_iters_bufs) + for_each_possible_cpu(cpu) +@@ -492,8 +489,6 @@ static void __bch2_fs_free(struct bch_fs *c) + bioset_exit(&c->btree_bio); + mempool_exit(&c->fill_iter); + percpu_ref_exit(&c->writes); +- kfree(c->replicas.entries); +- kfree(c->replicas_gc.entries); + kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); + kfree(c->unused_inode_hints); +-- +cgit v1.2.3 + + +From 6b2e40f21f95be079b2263a628787682108638c3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 
27 Dec 2021 22:11:54 -0500 +Subject: bcachefs: Fix bch2_btree_cache_scan() + +It was counting nodes on the freed list that it skips - because we want +to leave a few so that btree splits don't touch the allocator - as nodes +that it touched, meaning that if it was called with <= 3 nodes to +reclaim, and those nodes were on the freed list, it would never do any +work. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 19c219cb317b..76ff1f382794 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -280,13 +280,19 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + + i = 0; + list_for_each_entry_safe(b, t, &bc->freeable, list) { ++ /* ++ * Leave a few nodes on the freeable list, so that a btree split ++ * won't have to hit the system allocator: ++ */ ++ if (++i <= 3) ++ continue; ++ + touched++; + + if (freed >= nr) + break; + +- if (++i > 3 && +- !btree_node_reclaim(c, b)) { ++ if (!btree_node_reclaim(c, b)) { + btree_node_data_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +-- +cgit v1.2.3 + + +From feef92f19e7c21b77cc3febb9147331a88008c69 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Feb 2021 00:00:23 -0500 +Subject: bcachefs: Kill support for !BTREE_NODE_NEW_EXTENT_OVERWRITE() + +bcachefs has been aggressively migrating filesystems and btree nodes to +the new format for quite some time - this shouldn't affect anyone +anymore, and lets us delete a _lot_ of code. Also, it frees up +KEY_TYPE_discard for a new whiteout key type for snapshots. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 278 ++---------------------------------- + fs/bcachefs/bkey_sort.h | 8 -- + fs/bcachefs/btree_io.c | 195 ++----------------------- + fs/bcachefs/btree_types.h | 2 - + fs/bcachefs/btree_update_interior.c | 9 +- + fs/bcachefs/recovery.c | 6 + + 6 files changed, 30 insertions(+), 468 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 2e1d9cd65f43..a88670753cb0 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -14,9 +14,8 @@ static inline bool sort_iter_end(struct sort_iter *iter) + return !iter->used; + } + +-static inline void __sort_iter_sift(struct sort_iter *iter, +- unsigned from, +- sort_cmp_fn cmp) ++static inline void sort_iter_sift(struct sort_iter *iter, unsigned from, ++ sort_cmp_fn cmp) + { + unsigned i; + +@@ -27,18 +26,12 @@ static inline void __sort_iter_sift(struct sort_iter *iter, + swap(iter->data[i], iter->data[i + 1]); + } + +-static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) +-{ +- +- __sort_iter_sift(iter, 0, cmp); +-} +- + static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) + { + unsigned i = iter->used; + + while (i--) +- __sort_iter_sift(iter, i, cmp); ++ sort_iter_sift(iter, i, cmp); + } + + static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) +@@ -46,26 +39,20 @@ static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) + return !sort_iter_end(iter) ? 
iter->data->k : NULL; + } + +-static inline void __sort_iter_advance(struct sort_iter *iter, +- unsigned idx, sort_cmp_fn cmp) ++static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) + { +- struct sort_iter_set *i = iter->data + idx; ++ struct sort_iter_set *i = iter->data; + +- BUG_ON(idx >= iter->used); ++ BUG_ON(!iter->used); + + i->k = bkey_next_skip_noops(i->k, i->end); + + BUG_ON(i->k > i->end); + + if (i->k == i->end) +- array_remove_item(iter->data, iter->used, idx); ++ array_remove_item(iter->data, iter->used, 0); + else +- __sort_iter_sift(iter, idx, cmp); +-} +- +-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +-{ +- __sort_iter_advance(iter, 0, cmp); ++ sort_iter_sift(iter, 0, cmp); + } + + static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, +@@ -264,252 +251,3 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, + + return (u64 *) out - (u64 *) dst; + } +- +-/* Compat code for btree_node_old_extent_overwrite: */ +- +-/* +- * If keys compare equal, compare by pointer order: +- * +- * Necessary for sort_fix_overlapping() - if there are multiple keys that +- * compare equal in different sets, we have to process them newest to oldest. +- */ +-static inline int extent_sort_fix_overlapping_cmp(struct btree *b, +- struct bkey_packed *l, +- struct bkey_packed *r) +-{ +- struct bkey ul = bkey_unpack_key(b, l); +- struct bkey ur = bkey_unpack_key(b, r); +- +- return bkey_cmp(bkey_start_pos(&ul), +- bkey_start_pos(&ur)) ?: +- cmp_int((unsigned long) r, (unsigned long) l); +-} +- +-/* +- * The algorithm in extent_sort_fix_overlapping() relies on keys in the same +- * bset being ordered by start offset - but 0 size whiteouts (which are always +- * KEY_TYPE_deleted) break this ordering, so we need to skip over them: +- */ +-static void extent_iter_advance(struct sort_iter *iter, unsigned idx) +-{ +- struct sort_iter_set *i = iter->data + idx; +- +- do { +- i->k = bkey_next_skip_noops(i->k, i->end); +- } while (i->k != i->end && bkey_deleted(i->k)); +- +- if (i->k == i->end) +- array_remove_item(iter->data, iter->used, idx); +- else +- __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); +-} +- +-struct btree_nr_keys +-bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, +- struct sort_iter *iter) +-{ +- struct btree *b = iter->b; +- struct bkey_format *f = &b->format; +- struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; +- struct bkey_packed *out = dst->start; +- struct bkey l_unpacked, r_unpacked; +- struct bkey_s l, r; +- struct btree_nr_keys nr; +- struct bkey_buf split; +- unsigned i; +- +- memset(&nr, 0, sizeof(nr)); +- bch2_bkey_buf_init(&split); +- +- sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); +- for (i = 0; i < iter->used;) { +- if (bkey_deleted(iter->data[i].k)) +- __sort_iter_advance(iter, i, +- extent_sort_fix_overlapping_cmp); +- else +- i++; +- } +- +- while (!sort_iter_end(iter)) { +- l = __bkey_disassemble(b, _l->k, &l_unpacked); +- +- if (iter->used == 1) { +- extent_sort_append(c, f, &nr, &out, l); +- extent_iter_advance(iter, 0); +- continue; +- } +- +- r = __bkey_disassemble(b, _r->k, &r_unpacked); +- +- /* If current key and next key don't overlap, just append */ +- if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { +- extent_sort_append(c, f, &nr, &out, l); +- extent_iter_advance(iter, 0); +- continue; +- } +- +- /* Skip 0 size keys */ +- if (!r.k->size) { +- extent_iter_advance(iter, 1); +- continue; +- } +- +- /* +- * overlap: keep the newer key 
and trim the older key so they +- * don't overlap. comparing pointers tells us which one is +- * newer, since the bsets are appended one after the other. +- */ +- +- /* can't happen because of comparison func */ +- BUG_ON(_l->k < _r->k && +- !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); +- +- if (_l->k > _r->k) { +- /* l wins, trim r */ +- if (bkey_cmp(l.k->p, r.k->p) >= 0) { +- extent_iter_advance(iter, 1); +- } else { +- bch2_cut_front_s(l.k->p, r); +- extent_save(b, _r->k, r.k); +- __sort_iter_sift(iter, 1, +- extent_sort_fix_overlapping_cmp); +- } +- } else if (bkey_cmp(l.k->p, r.k->p) > 0) { +- +- /* +- * r wins, but it overlaps in the middle of l - split l: +- */ +- bch2_bkey_buf_reassemble(&split, c, l.s_c); +- bch2_cut_back(bkey_start_pos(r.k), split.k); +- +- bch2_cut_front_s(r.k->p, l); +- extent_save(b, _l->k, l.k); +- +- __sort_iter_sift(iter, 0, +- extent_sort_fix_overlapping_cmp); +- +- extent_sort_append(c, f, &nr, &out, +- bkey_i_to_s(split.k)); +- } else { +- bch2_cut_back_s(bkey_start_pos(r.k), l); +- extent_save(b, _l->k, l.k); +- } +- } +- +- dst->u64s = cpu_to_le16((u64 *) out - dst->_data); +- +- bch2_bkey_buf_exit(&split, c); +- return nr; +-} +- +-static inline int sort_extents_cmp(struct btree *b, +- struct bkey_packed *l, +- struct bkey_packed *r) +-{ +- return bch2_bkey_cmp_packed(b, l, r) ?: +- (int) bkey_deleted(l) - (int) bkey_deleted(r); +-} +- +-unsigned bch2_sort_extents(struct bkey_packed *dst, +- struct sort_iter *iter, +- bool filter_whiteouts) +-{ +- struct bkey_packed *in, *out = dst; +- +- sort_iter_sort(iter, sort_extents_cmp); +- +- while ((in = sort_iter_next(iter, sort_extents_cmp))) { +- if (bkey_deleted(in)) +- continue; +- +- if (bkey_whiteout(in) && +- (filter_whiteouts || !in->needs_whiteout)) +- continue; +- +- bkey_copy(out, in); +- out = bkey_next(out); +- } +- +- return (u64 *) out - (u64 *) dst; +-} +- +-static inline int sort_extent_whiteouts_cmp(struct btree *b, +- struct bkey_packed *l, +- struct bkey_packed *r) +-{ +- struct bkey ul = bkey_unpack_key(b, l); +- struct bkey ur = bkey_unpack_key(b, r); +- +- return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); +-} +- +-unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, +- struct sort_iter *iter) +-{ +- const struct bkey_format *f = &iter->b->format; +- struct bkey_packed *in, *out = dst; +- struct bkey_i l, r; +- bool prev = false, l_packed = false; +- u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); +- u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); +- u64 new_size; +- +- max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); +- +- sort_iter_sort(iter, sort_extent_whiteouts_cmp); +- +- while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { +- if (bkey_deleted(in)) +- continue; +- +- EBUG_ON(bkeyp_val_u64s(f, in)); +- EBUG_ON(in->type != KEY_TYPE_discard); +- +- r.k = bkey_unpack_key(iter->b, in); +- +- if (prev && +- bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { +- if (bkey_cmp(l.k.p, r.k.p) >= 0) +- continue; +- +- new_size = l_packed +- ? 
min(max_packed_size, max_packed_offset - +- bkey_start_offset(&l.k)) +- : KEY_SIZE_MAX; +- +- new_size = min(new_size, r.k.p.offset - +- bkey_start_offset(&l.k)); +- +- BUG_ON(new_size < l.k.size); +- +- bch2_key_resize(&l.k, new_size); +- +- if (bkey_cmp(l.k.p, r.k.p) >= 0) +- continue; +- +- bch2_cut_front(l.k.p, &r); +- } +- +- if (prev) { +- if (!bch2_bkey_pack(out, &l, f)) { +- BUG_ON(l_packed); +- bkey_copy(out, &l); +- } +- out = bkey_next(out); +- } +- +- l = r; +- prev = true; +- l_packed = bkey_packed(in); +- } +- +- if (prev) { +- if (!bch2_bkey_pack(out, &l, f)) { +- BUG_ON(l_packed); +- bkey_copy(out, &l); +- } +- out = bkey_next(out); +- } +- +- return (u64 *) out - (u64 *) dst; +-} +diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h +index 458a051fdac5..1059996dac78 100644 +--- a/fs/bcachefs/bkey_sort.h ++++ b/fs/bcachefs/bkey_sort.h +@@ -32,9 +32,6 @@ static inline void sort_iter_add(struct sort_iter *iter, + struct btree_nr_keys + bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, + struct sort_iter *); +-struct btree_nr_keys +-bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, +- struct sort_iter *); + + struct btree_nr_keys + bch2_sort_repack(struct bset *, struct btree *, +@@ -48,10 +45,5 @@ bch2_sort_repack_merge(struct bch_fs *, + + unsigned bch2_sort_keys(struct bkey_packed *, + struct sort_iter *, bool); +-unsigned bch2_sort_extents(struct bkey_packed *, +- struct sort_iter *, bool); +- +-unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, +- struct sort_iter *); + + #endif /* _BCACHEFS_BKEY_SORT_H */ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 5984f7cb8f33..eb04ad40fbc4 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -24,8 +24,7 @@ + + static void verify_no_dups(struct btree *b, + struct bkey_packed *start, +- struct bkey_packed *end, +- bool extents) ++ struct bkey_packed *end) + { + #ifdef CONFIG_BCACHEFS_DEBUG + struct bkey_packed *k, *p; +@@ -39,10 +38,7 @@ static void verify_no_dups(struct btree *b, + struct bkey l = bkey_unpack_key(b, p); + struct bkey r = bkey_unpack_key(b, k); + +- BUG_ON(extents +- ? 
bkey_cmp(l.p, bkey_start_pos(&r)) > 0 +- : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); +- //BUG_ON(bch2_bkey_cmp_packed(&b->format, p, k) >= 0); ++ BUG_ON(bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); + } + #endif + } +@@ -150,8 +146,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) + } + + verify_no_dups(b, new_whiteouts, +- (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), +- btree_node_old_extent_overwrite(b)); ++ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); + + memcpy_u64s(unwritten_whiteouts_start(c, b), + new_whiteouts, b->whiteout_u64s); +@@ -176,144 +171,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t, + } + } + +-static bool bch2_compact_extent_whiteouts(struct bch_fs *c, +- struct btree *b, +- enum compact_mode mode) +-{ +- const struct bkey_format *f = &b->format; +- struct bset_tree *t; +- struct bkey_packed *whiteouts = NULL; +- struct bkey_packed *u_start, *u_pos; +- struct sort_iter sort_iter; +- unsigned bytes, whiteout_u64s = 0, u64s; +- bool used_mempool, compacting = false; +- +- BUG_ON(!btree_node_is_extents(b)); +- +- for_each_bset(b, t) +- if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) +- whiteout_u64s += bset_dead_u64s(b, t); +- +- if (!whiteout_u64s) +- return false; +- +- bch2_sort_whiteouts(c, b); +- +- sort_iter_init(&sort_iter, b); +- +- whiteout_u64s += b->whiteout_u64s; +- bytes = whiteout_u64s * sizeof(u64); +- +- whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); +- u_start = u_pos = whiteouts; +- +- memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), +- b->whiteout_u64s); +- u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); +- +- sort_iter_add(&sort_iter, u_start, u_pos); +- +- for_each_bset(b, t) { +- struct bset *i = bset(b, t); +- struct bkey_packed *k, *n, *out, *start, *end; +- struct btree_node_entry *src = NULL, *dst = NULL; +- +- if (t != b->set && !bset_written(b, i)) { +- src = container_of(i, struct btree_node_entry, keys); +- dst = max(write_block(b), +- (void *) btree_bkey_last(b, t - 1)); +- } +- +- if (src != dst) +- compacting = true; +- +- if (!should_compact_bset(b, t, compacting, mode)) { +- if (src != dst) { +- memmove(dst, src, sizeof(*src) + +- le16_to_cpu(src->keys.u64s) * +- sizeof(u64)); +- i = &dst->keys; +- set_btree_bset(b, t, i); +- } +- continue; +- } +- +- compacting = true; +- u_start = u_pos; +- start = i->start; +- end = vstruct_last(i); +- +- if (src != dst) { +- memmove(dst, src, sizeof(*src)); +- i = &dst->keys; +- set_btree_bset(b, t, i); +- } +- +- out = i->start; +- +- for (k = start; k != end; k = n) { +- n = bkey_next_skip_noops(k, end); +- +- if (bkey_deleted(k)) +- continue; +- +- BUG_ON(bkey_whiteout(k) && +- k->needs_whiteout && +- bkey_written(b, k)); +- +- if (bkey_whiteout(k) && !k->needs_whiteout) +- continue; +- +- if (bkey_whiteout(k)) { +- memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); +- set_bkeyp_val_u64s(f, u_pos, 0); +- u_pos = bkey_next(u_pos); +- } else { +- bkey_copy(out, k); +- out = bkey_next(out); +- } +- } +- +- sort_iter_add(&sort_iter, u_start, u_pos); +- +- i->u64s = cpu_to_le16((u64 *) out - i->_data); +- set_btree_bset_end(b, t); +- bch2_bset_set_no_aux_tree(b, t); +- } +- +- b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; +- +- BUG_ON((void *) unwritten_whiteouts_start(c, b) < +- (void *) btree_bkey_last(b, bset_tree_last(b))); +- +- u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), +- &sort_iter); +- +- BUG_ON(u64s > b->whiteout_u64s); +- BUG_ON(u_pos != whiteouts && !u64s); +- +- if (u64s != 
b->whiteout_u64s) { +- void *src = unwritten_whiteouts_start(c, b); +- +- b->whiteout_u64s = u64s; +- memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); +- } +- +- verify_no_dups(b, +- unwritten_whiteouts_start(c, b), +- unwritten_whiteouts_end(c, b), +- true); +- +- btree_bounce_free(c, bytes, used_mempool, whiteouts); +- +- bch2_btree_build_aux_trees(b); +- +- bch_btree_keys_u64s_remaining(c, b); +- bch2_verify_btree_nr_keys(b); +- +- return true; +-} +- + static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) + { + struct bset_tree *t; +@@ -382,9 +239,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) + bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + enum compact_mode mode) + { +- return !btree_node_old_extent_overwrite(b) +- ? bch2_drop_whiteouts(b, mode) +- : bch2_compact_extent_whiteouts(c, b, mode); ++ return bch2_drop_whiteouts(b, mode); + } + + static void btree_node_sort(struct bch_fs *c, struct btree *b, +@@ -422,14 +277,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, + + start_time = local_clock(); + +- if (btree_node_old_extent_overwrite(b)) +- filter_whiteouts = bset_written(b, start_bset); +- +- u64s = (btree_node_old_extent_overwrite(b) +- ? bch2_sort_extents +- : bch2_sort_keys)(out->keys.start, +- &sort_iter, +- filter_whiteouts); ++ u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); + + out->keys.u64s = cpu_to_le16(u64s); + +@@ -971,11 +819,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + + bset_encrypt(c, i, b->written << 9); + +- if (btree_node_is_extents(b) && +- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { +- set_btree_node_old_extent_overwrite(b); +- set_btree_node_need_rewrite(b); +- } ++ btree_err_on(btree_node_is_extents(b) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), ++ BTREE_ERR_FATAL, c, NULL, b, NULL, ++ "btree node does not have NEW_EXTENT_OVERWRITE set"); + + sectors = vstruct_sectors(b->data, c->block_bits); + } else { +@@ -1052,9 +899,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + + set_btree_bset(b, b->set, &b->data->keys); + +- b->nr = (btree_node_old_extent_overwrite(b) +- ? bch2_extent_sort_fix_overlapping +- : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); ++ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); + + u64s = le16_to_cpu(sorted->keys.u64s); + *sorted = *b->data; +@@ -1598,24 +1443,14 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + i->journal_seq = cpu_to_le64(seq); + i->u64s = 0; + +- if (!btree_node_old_extent_overwrite(b)) { +- sort_iter_add(&sort_iter, +- unwritten_whiteouts_start(c, b), +- unwritten_whiteouts_end(c, b)); +- SET_BSET_SEPARATE_WHITEOUTS(i, false); +- } else { +- memcpy_u64s(i->start, +- unwritten_whiteouts_start(c, b), +- b->whiteout_u64s); +- i->u64s = cpu_to_le16(b->whiteout_u64s); +- SET_BSET_SEPARATE_WHITEOUTS(i, true); +- } ++ sort_iter_add(&sort_iter, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b)); ++ SET_BSET_SEPARATE_WHITEOUTS(i, false); + + b->whiteout_u64s = 0; + +- u64s = btree_node_old_extent_overwrite(b) +- ? 
bch2_sort_extents(vstruct_last(i), &sort_iter, false) +- : bch2_sort_keys(i->start, &sort_iter, false); ++ u64s = bch2_sort_keys(i->start, &sort_iter, false); + le16_add_cpu(&i->u64s, u64s); + + set_needs_whiteout(i, false); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index a8d362b3eef4..4667a0a4201c 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -418,7 +418,6 @@ enum btree_flags { + BTREE_NODE_just_written, + BTREE_NODE_dying, + BTREE_NODE_fake, +- BTREE_NODE_old_extent_overwrite, + BTREE_NODE_need_rewrite, + BTREE_NODE_never_write, + }; +@@ -433,7 +432,6 @@ BTREE_FLAG(write_in_flight); + BTREE_FLAG(just_written); + BTREE_FLAG(dying); + BTREE_FLAG(fake); +-BTREE_FLAG(old_extent_overwrite); + BTREE_FLAG(need_rewrite); + BTREE_FLAG(never_write); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 275dcabbbdd6..987c783a6cbd 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -303,14 +303,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + bp->v.sectors_written = 0; + } + +- if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) +- SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); +- +- if (btree_node_is_extents(b) && +- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { +- set_btree_node_old_extent_overwrite(b); +- set_btree_node_need_rewrite(b); +- } ++ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); + + bch2_btree_build_aux_trees(b); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 740fdeafe1a2..4d7badcc568b 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -983,6 +983,12 @@ int bch2_fs_recovery(struct bch_fs *c) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); + ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { ++ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); ++ ret = -EINVAL; ++ goto err; ++ } ++ + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { + bch_info(c, "alloc_v2 feature bit not set, fsck required"); + c->opts.fsck = true; +-- +cgit v1.2.3 + + +From dade080d85e9a610dcdb040a2b715024d3430f13 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Feb 2021 23:41:40 -0500 +Subject: bcachefs: KEY_TYPE_discard is no longer used + +KEY_TYPE_discard used to be used for extent whiteouts, but when handling +over overlapping extents was lifted above the core btree code it became +unused. This patch updates various code to reflect that. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 12 +++++----- + fs/bcachefs/bset.c | 11 +++++----- + fs/bcachefs/bset.h | 44 ++++++++++++------------------------- + fs/bcachefs/btree_io.c | 6 ++--- + fs/bcachefs/btree_iter.c | 2 +- + fs/bcachefs/btree_update_interior.c | 2 +- + fs/bcachefs/btree_update_leaf.c | 6 ++--- + fs/bcachefs/extents.c | 4 ++-- + 8 files changed, 35 insertions(+), 52 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index a88670753cb0..f2507079ed11 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -103,7 +103,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + sort_iter_sort(iter, key_sort_fix_overlapping_cmp); + + while ((k = sort_iter_peek(iter))) { +- if (!bkey_whiteout(k) && ++ if (!bkey_deleted(k) && + !should_drop_next_key(iter)) { + bkey_copy(out, k); + btree_keys_account_key_add(&nr, 0, out); +@@ -123,7 +123,7 @@ static void extent_sort_append(struct bch_fs *c, + struct bkey_packed **out, + struct bkey_s k) + { +- if (!bkey_whiteout(k.k)) { ++ if (!bkey_deleted(k.k)) { + if (!bch2_bkey_pack_key(*out, k.k, f)) + memcpy_u64s_small(*out, k.k, BKEY_U64s); + +@@ -148,7 +148,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, + memset(&nr, 0, sizeof(nr)); + + while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { +- if (filter_whiteouts && bkey_whiteout(in)) ++ if (filter_whiteouts && bkey_deleted(in)) + continue; + + if (bch2_bkey_transform(out_f, out, bkey_packed(in) +@@ -181,7 +181,7 @@ bch2_sort_repack_merge(struct bch_fs *c, + bch2_bkey_buf_init(&k); + + while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { +- if (filter_whiteouts && bkey_whiteout(k_packed)) ++ if (filter_whiteouts && bkey_deleted(k_packed)) + continue; + + /* +@@ -227,7 +227,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, + while ((in = sort_iter_next(iter, sort_keys_cmp))) { + bool needs_whiteout = false; + +- if (bkey_whiteout(in) && ++ if (bkey_deleted(in) && + (filter_whiteouts || !in->needs_whiteout)) + continue; + +@@ -239,7 +239,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, + in = sort_iter_next(iter, sort_keys_cmp); + } + +- if (bkey_whiteout(in)) { ++ if (bkey_deleted(in)) { + memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); + set_bkeyp_val_u64s(f, out, 0); + } else { +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 1c7318c6e46f..756cbae6541d 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -144,7 +144,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) + + for_each_bset(b, t) + bset_tree_for_each_key(b, t, k) +- if (!bkey_whiteout(k)) ++ if (!bkey_deleted(k)) + btree_keys_account_key_add(&nr, t - b->set, k); + + BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); +@@ -1120,7 +1120,7 @@ void bch2_bset_insert(struct btree *b, + if (bch2_bkey_pack_key(&packed, &insert->k, f)) + src = &packed; + +- if (!bkey_whiteout(&insert->k)) ++ if (!bkey_deleted(&insert->k)) + btree_keys_account_key_add(&b->nr, t - b->set, src); + + if (src->u64s != clobber_u64s) { +@@ -1657,15 +1657,14 @@ found: + return prev; + } + +-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, +- struct btree *b, +- unsigned min_key_type) ++struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, ++ struct btree *b) + { + struct bkey_packed *prev; + + do { + prev = bch2_btree_node_iter_prev_all(iter, b); +- } while (prev && prev->type < min_key_type); ++ } while (prev && bkey_deleted(prev)); + + return prev; + } +diff --git 
a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 469294cc716c..54b364c8f28c 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -400,7 +400,7 @@ bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) + static inline struct bkey_packed * + bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) + { +- return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); ++ return bch2_bkey_prev_filter(b, t, k, 1); + } + + enum bch_extent_overlap { +@@ -506,33 +506,23 @@ __bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, + } + + static inline struct bkey_packed * +-bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, +- struct btree *b, +- unsigned min_key_type) ++bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) + { +- while (!bch2_btree_node_iter_end(iter)) { +- struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); +- +- if (k->type >= min_key_type) +- return k; +- +- bch2_btree_node_iter_advance(iter, b); +- } +- +- return NULL; +-} +- +-static inline struct bkey_packed * +-bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, +- struct btree *b) +-{ +- return bch2_btree_node_iter_peek_filter(iter, b, 0); ++ return !bch2_btree_node_iter_end(iter) ++ ? __btree_node_offset_to_key(b, iter->data->k) ++ : NULL; + } + + static inline struct bkey_packed * + bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) + { +- return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); ++ struct bkey_packed *k; ++ ++ while ((k = bch2_btree_node_iter_peek_all(iter, b)) && ++ bkey_deleted(k)) ++ bch2_btree_node_iter_advance(iter, b); ++ ++ return k; + } + + static inline struct bkey_packed * +@@ -548,14 +538,8 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) + + struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, + struct btree *); +-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, +- struct btree *, unsigned); +- +-static inline struct bkey_packed * +-bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) +-{ +- return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); +-} ++struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, ++ struct btree *); + + struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, + struct btree *, +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index eb04ad40fbc4..abbe44cd3525 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -215,7 +215,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) + for (k = start; k != end; k = n) { + n = bkey_next_skip_noops(k, end); + +- if (!bkey_whiteout(k)) { ++ if (!bkey_deleted(k)) { + bkey_copy(out, k); + out = bkey_next(out); + } else { +@@ -725,11 +725,11 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + /* + * with the separate whiteouts thing (used for extents), the + * second set of keys actually can have whiteouts too, so we +- * can't solely go off bkey_whiteout()... ++ * can't solely go off bkey_deleted()... 
+ */ + + if (!seen_non_whiteout && +- (!bkey_whiteout(k) || ++ (!bkey_deleted(k) || + (prev && bkey_iter_cmp(b, prev, k) > 0))) { + *whiteout_u64s = k->_data - i->_data; + seen_non_whiteout = true; +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c5323c0f904a..b55f35b98955 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -534,7 +534,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, + * whiteouts) + */ + p = level || btree_node_type_is_extents(iter->btree_id) +- ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) ++ ? bch2_btree_node_iter_prev(&tmp, l->b) + : bch2_btree_node_iter_prev_all(&tmp, l->b); + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 987c783a6cbd..2a0a28bd304c 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -90,7 +90,7 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) + + for_each_bset(b, t) + bset_tree_for_each_key(b, t, k) +- if (!bkey_whiteout(k)) { ++ if (!bkey_deleted(k)) { + uk = bkey_unpack_key(b, k); + bch2_bkey_format_add_key(s, &uk); + } +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 53ea91b32fd5..4c032f0bd1d3 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -73,13 +73,13 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + k = NULL; + + /* @k is the key being overwritten/deleted, if any: */ +- EBUG_ON(k && bkey_whiteout(k)); ++ EBUG_ON(k && bkey_deleted(k)); + + /* Deleting, but not found? nothing to do: */ +- if (bkey_whiteout(&insert->k) && !k) ++ if (bkey_deleted(&insert->k) && !k) + return false; + +- if (bkey_whiteout(&insert->k)) { ++ if (bkey_deleted(&insert->k)) { + /* Deleting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 595dd0add509..1c661c7a79ea 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -972,9 +972,9 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) + + /* will only happen if all pointers were cached: */ + if (!bch2_bkey_nr_ptrs(k.s_c)) +- k.k->type = KEY_TYPE_discard; ++ k.k->type = KEY_TYPE_deleted; + +- return bkey_whiteout(k.k); ++ return bkey_deleted(k.k); + } + + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, +-- +cgit v1.2.3 + + +From 5d9bf909f2ed6e34a6e0b7d43e6bc834098241ba Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Feb 2021 19:09:53 -0500 +Subject: bcachefs: Rename KEY_TYPE_whiteout -> KEY_TYPE_hash_whiteout + +Snapshots are going to need a different whiteout key type. Also, switch +to using BCH_BKEY_TYPES() to define the bkey value accessors. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 18 +++++++++++++++++- + fs/bcachefs/bkey.h | 21 +++------------------ + fs/bcachefs/bkey_methods.c | 6 +++--- + fs/bcachefs/dirent.c | 4 ++-- + fs/bcachefs/fsck.c | 2 +- + fs/bcachefs/str_hash.h | 8 ++++---- + 6 files changed, 30 insertions(+), 29 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 73eeeb10472a..c74cb7ebe9f2 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -326,7 +326,7 @@ static inline void bkey_init(struct bkey *k) + x(discard, 1) \ + x(error, 2) \ + x(cookie, 3) \ +- x(whiteout, 4) \ ++ x(hash_whiteout, 4) \ + x(btree_ptr, 5) \ + x(extent, 6) \ + x(reservation, 7) \ +@@ -351,11 +351,27 @@ enum bch_bkey_type { + KEY_TYPE_MAX, + }; + ++struct bch_deleted { ++ struct bch_val v; ++}; ++ ++struct bch_discard { ++ struct bch_val v; ++}; ++ ++struct bch_error { ++ struct bch_val v; ++}; ++ + struct bch_cookie { + struct bch_val v; + __le64 cookie; + }; + ++struct bch_hash_whiteout { ++ struct bch_val v; ++}; ++ + /* Extents */ + + /* +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 48821f6c09aa..a22a1dc6df78 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -403,7 +403,7 @@ static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion + * functions. + */ +-#define BKEY_VAL_ACCESSORS(name) \ ++#define x(name, ...) \ + struct bkey_i_##name { \ + union { \ + struct bkey k; \ +@@ -514,23 +514,8 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ + return k; \ + } + +-BKEY_VAL_ACCESSORS(cookie); +-BKEY_VAL_ACCESSORS(btree_ptr); +-BKEY_VAL_ACCESSORS(extent); +-BKEY_VAL_ACCESSORS(reservation); +-BKEY_VAL_ACCESSORS(inode); +-BKEY_VAL_ACCESSORS(inode_generation); +-BKEY_VAL_ACCESSORS(dirent); +-BKEY_VAL_ACCESSORS(xattr); +-BKEY_VAL_ACCESSORS(alloc); +-BKEY_VAL_ACCESSORS(quota); +-BKEY_VAL_ACCESSORS(stripe); +-BKEY_VAL_ACCESSORS(reflink_p); +-BKEY_VAL_ACCESSORS(reflink_v); +-BKEY_VAL_ACCESSORS(inline_data); +-BKEY_VAL_ACCESSORS(btree_ptr_v2); +-BKEY_VAL_ACCESSORS(indirect_inline_data); +-BKEY_VAL_ACCESSORS(alloc_v2); ++BCH_BKEY_TYPES(); ++#undef x + + /* byte order helpers */ + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index f5779795a4b2..756bf5aeee9b 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -59,7 +59,7 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, + .key_invalid = key_type_cookie_invalid, \ + } + +-#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ ++#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ + } + +@@ -270,9 +270,9 @@ static const struct old_bkey_type { + {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, + {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, + {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, +- {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, ++ {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_hash_whiteout }, + {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, +- {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, ++ {BKEY_TYPE_XATTRS, 129, KEY_TYPE_hash_whiteout }, + {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, + {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, + }; +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index f34bfda8ab0d..d2ebf1e5819d 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -262,7 +262,7 @@ int bch2_dirent_rename(struct btree_trans *trans, 
+ * overwrite old_dst - just make sure to use a + * whiteout when deleting src: + */ +- new_src->k.type = KEY_TYPE_whiteout; ++ new_src->k.type = KEY_TYPE_hash_whiteout; + } + } else { + /* Check if we need a whiteout to delete src: */ +@@ -272,7 +272,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + goto out; + + if (ret) +- new_src->k.type = KEY_TYPE_whiteout; ++ new_src->k.type = KEY_TYPE_hash_whiteout; + } + } + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 66c9dad2ef3e..7f78edcfe565 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -257,7 +257,7 @@ static void hash_set_chain_start(struct btree_trans *trans, + struct hash_check *h, + struct btree_iter *k_iter, struct bkey_s_c k) + { +- bool hole = (k.k->type != KEY_TYPE_whiteout && ++ bool hole = (k.k->type != KEY_TYPE_hash_whiteout && + k.k->type != desc.key_type); + + if (hole || k.k->p.offset > h->chain_end + 1) +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index f6b694b9346b..952b146af750 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -156,7 +156,7 @@ bch2_hash_lookup(struct btree_trans *trans, + if (k.k->type == desc.key_type) { + if (!desc.cmp_key(k, key)) + return iter; +- } else if (k.k->type == KEY_TYPE_whiteout) { ++ } else if (k.k->type == KEY_TYPE_hash_whiteout) { + ; + } else { + /* hole, not found */ +@@ -210,7 +210,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { + if (k.k->type != desc.key_type && +- k.k->type != KEY_TYPE_whiteout) ++ k.k->type != KEY_TYPE_hash_whiteout) + break; + + if (k.k->type == desc.key_type && +@@ -254,7 +254,7 @@ int bch2_hash_set(struct btree_trans *trans, + !(flags & BCH_HASH_SET_MUST_REPLACE)) + slot = bch2_trans_copy_iter(trans, iter); + +- if (k.k->type != KEY_TYPE_whiteout) ++ if (k.k->type != KEY_TYPE_hash_whiteout) + goto not_found; + } + +@@ -303,7 +303,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, + + bkey_init(&delete->k); + delete->k.p = iter->pos; +- delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; ++ delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; + + bch2_trans_update(trans, iter, delete, 0); + return 0; +-- +cgit v1.2.3 + + +From 7ce106c2fe2559858c31397b9f9875b8b541e2d6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Feb 2021 19:27:37 -0500 +Subject: bcachefs: Rename BTREE_ID enums for consistency with other enums + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 10 ++++---- + fs/bcachefs/bcachefs.h | 18 ++++++------- + fs/bcachefs/bcachefs_format.h | 20 +++++++-------- + fs/bcachefs/bkey_methods.c | 30 +++++++++++----------- + fs/bcachefs/btree_cache.c | 7 ------ + fs/bcachefs/btree_cache.h | 2 -- + fs/bcachefs/btree_gc.h | 2 +- + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/btree_io.h | 4 +-- + fs/bcachefs/btree_types.h | 30 +++++++++++----------- + fs/bcachefs/btree_update_interior.c | 2 +- + fs/bcachefs/btree_update_leaf.c | 2 +- + fs/bcachefs/buckets.c | 10 ++++---- + fs/bcachefs/dirent.c | 6 ++--- + fs/bcachefs/ec.c | 16 ++++++------ + fs/bcachefs/extent_update.c | 2 +- + fs/bcachefs/extents.c | 2 +- + fs/bcachefs/fs-io.c | 14 +++++------ + fs/bcachefs/fs.c | 2 +- + fs/bcachefs/fsck.c | 26 +++++++++---------- + fs/bcachefs/inode.c | 18 ++++++------- + fs/bcachefs/io.c | 18 ++++++------- + fs/bcachefs/migrate.c | 4 +-- + fs/bcachefs/move.c | 6 ++--- + fs/bcachefs/opts.c | 7 ++++++ + fs/bcachefs/opts.h | 1 + + fs/bcachefs/quota.c | 12 ++++----- + fs/bcachefs/recovery.c | 20 +++++++-------- + fs/bcachefs/reflink.c | 6 ++--- + fs/bcachefs/super-io.c | 2 +- + fs/bcachefs/super.c | 4 +-- + fs/bcachefs/sysfs.c | 2 +- + fs/bcachefs/tests.c | 50 ++++++++++++++++++------------------- + fs/bcachefs/xattr.c | 4 +-- + 34 files changed, 180 insertions(+), 181 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 0f81d155c1ed..d677a4e64fb9 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -316,7 +316,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + int ret; + + down_read(&c->gc_lock); +- ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, ++ ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, + NULL, bch2_alloc_read_fn); + up_read(&c->gc_lock); + +@@ -344,7 +344,7 @@ retry: + bch2_trans_begin(trans); + + ret = bch2_btree_key_cache_flush(trans, +- BTREE_ID_ALLOC, iter->pos); ++ BTREE_ID_alloc, iter->pos); + if (ret) + goto err; + +@@ -386,7 +386,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + for_each_member_device(ca, c, i) { +@@ -423,7 +423,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + u64 *time, now; + int ret = 0; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), ++ iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); +@@ -927,7 +927,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, + POS(ca->dev_idx, 0), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 4f5ccf4bc57c..6cded61f8a78 100644 +--- 
a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -370,14 +370,14 @@ enum gc_phase { + GC_PHASE_START, + GC_PHASE_SB, + +- GC_PHASE_BTREE_EC, +- GC_PHASE_BTREE_EXTENTS, +- GC_PHASE_BTREE_INODES, +- GC_PHASE_BTREE_DIRENTS, +- GC_PHASE_BTREE_XATTRS, +- GC_PHASE_BTREE_ALLOC, +- GC_PHASE_BTREE_QUOTAS, +- GC_PHASE_BTREE_REFLINK, ++ GC_PHASE_BTREE_stripes, ++ GC_PHASE_BTREE_extents, ++ GC_PHASE_BTREE_inodes, ++ GC_PHASE_BTREE_dirents, ++ GC_PHASE_BTREE_xattrs, ++ GC_PHASE_BTREE_alloc, ++ GC_PHASE_BTREE_quotas, ++ GC_PHASE_BTREE_reflink, + + GC_PHASE_PENDING_DELETE, + GC_PHASE_ALLOC, +@@ -722,7 +722,7 @@ struct bch_fs { + * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] + * has been marked by GC. + * +- * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) ++ * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) + * + * Protected by gc_pos_lock. Only written to by GC thread, so GC thread + * can read without a lock. +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index c74cb7ebe9f2..c15e5a9c16f7 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1651,18 +1651,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); + + /* Btree: */ + +-#define BCH_BTREE_IDS() \ +- x(EXTENTS, 0, "extents") \ +- x(INODES, 1, "inodes") \ +- x(DIRENTS, 2, "dirents") \ +- x(XATTRS, 3, "xattrs") \ +- x(ALLOC, 4, "alloc") \ +- x(QUOTAS, 5, "quotas") \ +- x(EC, 6, "stripes") \ +- x(REFLINK, 7, "reflink") ++#define BCH_BTREE_IDS() \ ++ x(extents, 0) \ ++ x(inodes, 1) \ ++ x(dirents, 2) \ ++ x(xattrs, 3) \ ++ x(alloc, 4) \ ++ x(quotas, 5) \ ++ x(stripes, 6) \ ++ x(reflink, 7) + + enum btree_id { +-#define x(kwd, val, name) BTREE_ID_##kwd = val, ++#define x(kwd, val) BTREE_ID_##kwd = val, + BCH_BTREE_IDS() + #undef x + BTREE_ID_NR +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 756bf5aeee9b..79e249f49971 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -104,7 +104,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + if (k.k->u64s < BKEY_U64s) + return "u64s too small"; + +- if (type == BKEY_TYPE_BTREE && ++ if (type == BKEY_TYPE_btree && + bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; + +@@ -122,7 +122,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + if (k.k->p.snapshot) + return "nonzero snapshot"; + +- if (type != BKEY_TYPE_BTREE && ++ if (type != BKEY_TYPE_btree && + !bkey_cmp(k.k->p, POS_MAX)) + return "POS_MAX key"; + +@@ -263,18 +263,18 @@ static const struct old_bkey_type { + u8 old; + u8 new; + } bkey_renumber_table[] = { +- {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, +- {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, +- {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, +- {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, +- {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, +- {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, +- {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, +- {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_hash_whiteout }, +- {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, +- {BKEY_TYPE_XATTRS, 129, KEY_TYPE_hash_whiteout }, +- {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, +- {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, ++ {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, ++ {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, ++ {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, ++ {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, ++ {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, ++ {BKEY_TYPE_inodes, 130, 
KEY_TYPE_inode_generation }, ++ {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, ++ {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, ++ {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, ++ {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, ++ {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, ++ {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, + }; + + void bch2_bkey_renumber(enum btree_node_type btree_node_type, +@@ -320,7 +320,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, + break; + case 2: + if (version < bcachefs_metadata_version_inode_btree_change && +- btree_id == BTREE_ID_INODES) { ++ btree_id == BTREE_ID_inodes) { + if (!bkey_packed(k)) { + struct bkey_i *u = packed_to_bkey(k); + swap(u->k.p.inode, u->k.p.offset); +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 76ff1f382794..30f601ccedc0 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -13,13 +13,6 @@ + #include + #include + +-const char * const bch2_btree_ids[] = { +-#define x(kwd, val, name) name, +- BCH_BTREE_IDS() +-#undef x +- NULL +-}; +- + void bch2_recalc_btree_reserve(struct bch_fs *c) + { + unsigned i, reserve = 16; +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 5fffae92effb..217988696a77 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -7,8 +7,6 @@ + + struct btree_iter; + +-extern const char * const bch2_btree_ids[]; +- + void bch2_recalc_btree_reserve(struct bch_fs *); + + void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index f516faded269..d5559827ed7f 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -57,7 +57,7 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) + static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) + { + switch (id) { +-#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; ++#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; + BCH_BTREE_IDS() + #undef x + default: +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index abbe44cd3525..c65e6e475083 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1321,7 +1321,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + unsigned whiteout_u64s = 0; + int ret; + +- if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) + return -1; + + ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?: +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 89685bd57fc0..16ce6dff6af7 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -193,7 +193,7 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id, + int write, struct bkey_format *f) + { + if (version < bcachefs_metadata_version_inode_btree_change && +- btree_id == BTREE_ID_INODES) { ++ btree_id == BTREE_ID_inodes) { + swap(f->bits_per_field[BKEY_FIELD_INODE], + f->bits_per_field[BKEY_FIELD_OFFSET]); + swap(f->field_offset[BKEY_FIELD_INODE], +@@ -209,7 +209,7 @@ static inline void compat_bpos(unsigned level, enum btree_id btree_id, + bch2_bpos_swab(p); + + if (version < bcachefs_metadata_version_inode_btree_change && +- btree_id == BTREE_ID_INODES) ++ btree_id == BTREE_ID_inodes) + swap(p->inode, p->offset); + } + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 4667a0a4201c..6b221d9c6ae4 100644 +--- a/fs/bcachefs/btree_types.h 
++++ b/fs/bcachefs/btree_types.h +@@ -545,16 +545,16 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i) + } + + enum btree_node_type { +-#define x(kwd, val, name) BKEY_TYPE_##kwd = val, ++#define x(kwd, val) BKEY_TYPE_##kwd = val, + BCH_BTREE_IDS() + #undef x +- BKEY_TYPE_BTREE, ++ BKEY_TYPE_btree, + }; + + /* Type of a key in btree @id at level @level: */ + static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) + { +- return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; ++ return level ? BKEY_TYPE_btree : (enum btree_node_type) id; + } + + /* Type of keys @b contains: */ +@@ -566,8 +566,8 @@ static inline enum btree_node_type btree_node_type(struct btree *b) + static inline bool btree_node_type_is_extents(enum btree_node_type type) + { + switch (type) { +- case BKEY_TYPE_EXTENTS: +- case BKEY_TYPE_REFLINK: ++ case BKEY_TYPE_extents: ++ case BKEY_TYPE_reflink: + return true; + default: + return false; +@@ -590,18 +590,18 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) + } + + #define BTREE_NODE_TYPE_HAS_TRIGGERS \ +- ((1U << BKEY_TYPE_EXTENTS)| \ +- (1U << BKEY_TYPE_ALLOC)| \ +- (1U << BKEY_TYPE_INODES)| \ +- (1U << BKEY_TYPE_REFLINK)| \ +- (1U << BKEY_TYPE_EC)| \ +- (1U << BKEY_TYPE_BTREE)) ++ ((1U << BKEY_TYPE_extents)| \ ++ (1U << BKEY_TYPE_alloc)| \ ++ (1U << BKEY_TYPE_inodes)| \ ++ (1U << BKEY_TYPE_reflink)| \ ++ (1U << BKEY_TYPE_stripes)| \ ++ (1U << BKEY_TYPE_btree)) + + #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ +- ((1U << BKEY_TYPE_EXTENTS)| \ +- (1U << BKEY_TYPE_INODES)| \ +- (1U << BKEY_TYPE_EC)| \ +- (1U << BKEY_TYPE_REFLINK)) ++ ((1U << BKEY_TYPE_extents)| \ ++ (1U << BKEY_TYPE_inodes)| \ ++ (1U << BKEY_TYPE_stripes)| \ ++ (1U << BKEY_TYPE_reflink)) + + enum btree_trigger_flags { + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 2a0a28bd304c..d090509c0519 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1196,7 +1196,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + struct bkey_packed *src, *dst, *n; + struct bset *i; + +- BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); ++ BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); + + bch2_btree_node_iter_init(&node_iter, b, &k->k.p); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 4c032f0bd1d3..2e6f14452894 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -341,7 +341,7 @@ static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) + { + return (((BTREE_NODE_TYPE_HAS_TRIGGERS & + ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) | +- (1U << BTREE_ID_EC)) & ++ (1U << BTREE_ID_stripes)) & + (1U << iter->btree_id); + } + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 38939e2c1d8a..5fa0a28a6cf1 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1506,7 +1506,7 @@ static int trans_get_key(struct btree_trans *trans, + struct btree_iter **iter, + struct bkey_s_c *k) + { +- unsigned flags = btree_id != BTREE_ID_ALLOC ++ unsigned flags = btree_id != BTREE_ID_alloc + ? 
BTREE_ITER_SLOTS + : BTREE_ITER_CACHED; + int ret; +@@ -1542,11 +1542,11 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_it + if (IS_ERR(a)) + return a; + +- iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); ++ iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k); + if (iter) { + *u = bch2_alloc_unpack(k); + } else { +- iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, ++ iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); +@@ -1603,7 +1603,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + struct bch_replicas_padded r; + int ret = 0; + +- ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.ec.idx), &iter, &k); ++ ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k); + if (ret < 0) + return ret; + +@@ -1827,7 +1827,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + __le64 *refcount; + s64 ret; + +- ret = trans_get_key(trans, BTREE_ID_REFLINK, ++ ret = trans_get_key(trans, BTREE_ID_reflink, + POS(0, idx), &iter, &k); + if (ret < 0) + return ret; +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index d2ebf1e5819d..b0625176ab35 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -64,7 +64,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) + } + + const struct bch_hash_desc bch2_dirent_hash_desc = { +- .btree_id = BTREE_ID_DIRENTS, ++ .btree_id = BTREE_ID_dirents, + .key_type = KEY_TYPE_dirent, + .hash_key = dirent_hash_key, + .hash_bkey = dirent_hash_bkey, +@@ -332,7 +332,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) + struct bkey_s_c k; + int ret; + +- for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, ++ for_each_btree_key(trans, iter, BTREE_ID_dirents, + POS(dir_inum, 0), 0, k, ret) { + if (k.k->p.inode > dir_inum) + break; +@@ -357,7 +357,7 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, ++ for_each_btree_key(&trans, iter, BTREE_ID_dirents, + POS(inum, ctx->pos), 0, k, ret) { + if (k.k->p.inode > inum) + break; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index adcffede0d48..5f80881c2496 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -433,7 +433,7 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip + int ret; + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) +@@ -668,7 +668,7 @@ void bch2_stripes_heap_update(struct bch_fs *c, + + static int ec_stripe_delete(struct bch_fs *c, size_t idx) + { +- return bch2_btree_delete_range(c, BTREE_ID_EC, ++ return bch2_btree_delete_range(c, BTREE_ID_stripes, + POS(0, idx), + POS(0, idx + 1), + NULL); +@@ -713,7 +713,7 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, + retry: + bch2_trans_begin(&trans); + +- for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, ++ for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (start_pos.offset) { +@@ -765,7 +765,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, + unsigned i; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_EC, ++ iter 
= bch2_trans_get_iter(trans, BTREE_ID_stripes, + new->k.p, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); +@@ -831,7 +831,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + + /* XXX this doesn't support the reflink btree */ + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + bkey_start_pos(pos), + BTREE_ITER_INTENT); + +@@ -1604,7 +1604,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + genradix_for_each(&c->stripes[0], giter, m) { +@@ -1645,7 +1645,7 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, + + int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) + { +- int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, ++ int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_stripes, + NULL, bch2_stripes_read_fn); + if (ret) + bch_err(c, "error reading stripes: %i", ret); +@@ -1663,7 +1663,7 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0); + + k = bch2_btree_iter_prev(iter); + if (!IS_ERR_OR_NULL(k.k)) +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 16d2bca8a662..bb4b2b4352e0 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -62,7 +62,7 @@ static int count_iters_for_insert(struct btree_trans *trans, + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, +- BTREE_ID_REFLINK, POS(0, idx + offset), ++ BTREE_ID_reflink, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret2) { + if (bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 1c661c7a79ea..818609347bd9 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -677,7 +677,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, pos, + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 76d6e64059f9..8a5dcf5fa75f 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -890,7 +890,7 @@ void bch2_readahead(struct readahead_control *ractl) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, + BTREE_ITER_SLOTS); + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); +@@ -936,7 +936,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, + BTREE_ITER_SLOTS); + + bchfs_read(&trans, iter, rbio, inum, NULL); +@@ -2138,7 +2138,7 @@ static inline int range_has_data(struct bch_fs *c, + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { 
++ for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + +@@ -2514,7 +2514,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + goto err; + } + +- src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ src = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inode->v.i_ino, src_start >> 9), + BTREE_ITER_INTENT); + dst = bch2_trans_copy_iter(&trans, src); +@@ -2669,7 +2669,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + truncate_pagecache_range(&inode->v, offset, end - 1); + } + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inode->v.i_ino, block_start >> 9), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + end_pos = POS(inode->v.i_ino, block_end >> 9); +@@ -3002,7 +3002,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), 0, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { + break; +@@ -3097,7 +3097,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + BTREE_ITER_SLOTS, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 444f8f279742..ff000cc7ba6e 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -915,7 +915,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + bch2_bkey_buf_init(&prev); + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(ei->v.i_ino, start >> 9), 0); + retry: + while ((k = bch2_btree_iter_peek(iter)).k && +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 7f78edcfe565..ebc234b0b6fe 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -24,7 +24,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) + u64 sectors = 0; + int ret; + +- for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, ++ for_each_btree_key(trans, iter, BTREE_ID_extents, + POS(inum, 0), 0, k, ret) { + if (k.k->p.inode != inum) + break; +@@ -396,7 +396,7 @@ err_redo: + if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" + "hash table key at wrong offset: btree %u, offset %llu, " + "hashed to %llu chain starts at %llu\n%s", +- buf, strlen(buf), BTREE_ID_DIRENTS, ++ buf, strlen(buf), BTREE_ID_dirents, + k->k->p.offset, hash, h->chain->pos.offset, + (bch2_bkey_val_to_text(&PBUF(buf), c, + *k), buf))) { +@@ -415,7 +415,7 @@ err_redo: + + static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) + { +- return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ return bch2_btree_delete_range(c, BTREE_ID_extents, + POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), + POS(inode_nr + 1, 0), NULL); + } +@@ -474,7 +474,7 @@ static int check_extents(struct bch_fs *c) + + bch_verbose(c, "checking extents"); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT); + retry: +@@ -537,7 +537,7 @@ retry: + + bch2_inode_pack(c, &p, &w.inode); + +- ret = bch2_btree_insert(c, 
BTREE_ID_INODES, ++ ret = bch2_btree_insert(c, BTREE_ID_inodes, + &p.inode.k_i, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); +@@ -595,7 +595,7 @@ static int check_dirents(struct bch_fs *c) + + hash_check_init(&h); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), 0); + retry: + for_each_btree_key_continue(iter, 0, k, ret) { +@@ -747,7 +747,7 @@ static int check_xattrs(struct bch_fs *c) + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), 0); + retry: + for_each_btree_key_continue(iter, 0, k, ret) { +@@ -810,7 +810,7 @@ create_root: + + bch2_inode_pack(c, &packed, root_inode); + +- return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, ++ return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); +@@ -958,7 +958,7 @@ next: + if (e->offset == U64_MAX) + goto up; + +- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, ++ for_each_btree_key(&trans, iter, BTREE_ID_dirents, + POS(e->inum, e->offset + 1), 0, k, ret) { + if (k.k->p.inode != e->inum) + break; +@@ -1011,7 +1011,7 @@ up: + path.nr--; + } + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS_MIN, 0); + retry: + for_each_btree_key_continue(iter, 0, k, ret) { + if (k.k->type != KEY_TYPE_inode) +@@ -1108,7 +1108,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, + + inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); + +- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) { + switch (k.k->type) { + case KEY_TYPE_dirent: + d = bkey_s_c_to_dirent(k); +@@ -1349,7 +1349,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, + POS(0, range_start), 0); + nlinks_iter = genradix_iter_init(links, 0); + +@@ -1475,7 +1475,7 @@ int bch2_fsck_walk_inodes_only(struct bch_fs *c) + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_inode) + continue; + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 81feb47fe8f9..e72c49e18f13 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -300,7 +300,7 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), ++ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_CACHED|flags); + k = bch2_btree_iter_peek_cached(iter); + ret = bkey_err(k); +@@ -498,7 +498,7 @@ int bch2_inode_create(struct btree_trans *trans, + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + again: +- for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), ++ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(iter->pos, POS(0, max)) > 0) + break; +@@ -513,7 +513,7 @@ again: + * cache before using a slot: + */ + if (k.k->type != KEY_TYPE_inode && 
+- !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos)) ++ !bch2_btree_key_cache_find(c, BTREE_ID_inodes, iter->pos)) + goto found_slot; + } + +@@ -560,11 +560,11 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + * XXX: the dirent could ideally would delete whiteouts when they're no + * longer needed + */ +- ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS, ++ ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, + start, end, NULL) ?: +- bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS, ++ bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs, + start, end, NULL) ?: +- bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS, ++ bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents, + start, end, NULL); + if (ret) + goto err; +@@ -574,11 +574,11 @@ retry: + bi_generation = 0; + + if (cached) { +- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_CACHED|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_cached(iter); + } else { +- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + } +@@ -636,7 +636,7 @@ int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, ++ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, + POS(0, inode_nr), flags); + k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED + ? bch2_btree_iter_peek_cached(iter) +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 03ce492b4f81..ca16ea473d80 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -398,7 +398,7 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inum, start), + BTREE_ITER_INTENT); + +@@ -425,7 +425,7 @@ int bch2_write_index_default(struct bch_write_op *op) + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + bkey_start_pos(&k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + +@@ -1537,8 +1537,8 @@ static struct promote_op *promote_alloc(struct bch_fs *c, + + promote = __promote_alloc(c, + k.k->type == KEY_TYPE_reflink_v +- ? BTREE_ID_REFLINK +- : BTREE_ID_EXTENTS, ++ ? 
BTREE_ID_reflink ++ : BTREE_ID_extents, + k, pos, pick, opts, sectors, rbio); + if (!promote) + return NULL; +@@ -1634,7 +1634,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + rbio->pos, BTREE_ITER_SLOTS); + retry: + rbio->bio.bi_status = 0; +@@ -1689,7 +1689,7 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, + retry: + bch2_trans_begin(&trans); + +- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS, k, ret) { + unsigned bytes, sectors, offset_into_extent; +@@ -1808,7 +1808,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + if (crc_is_compressed(rbio->pick.crc)) + return 0; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, ++ iter = bch2_trans_get_iter(trans, BTREE_ID_extents, rbio->pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + if ((ret = bkey_err(k))) +@@ -2018,7 +2018,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + + *offset_into_extent; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, ++ iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, + POS(0, reflink_offset), + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); +@@ -2319,7 +2319,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) + retry: + bch2_trans_begin(&trans); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inode, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_SLOTS); + while (1) { +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 6241ff0c129f..1db2c2d6b970 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -99,8 +99,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + + static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + { +- return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: +- __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); ++ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?: ++ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink); + } + + static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 75b7046d6042..bc003e45a9f6 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -579,7 +579,7 @@ peek: + if (!bkey_extent_is_direct_data(k.k)) + goto next_nondata; + +- if (btree_id == BTREE_ID_EXTENTS && ++ if (btree_id == BTREE_ID_extents && + cur_inum != k.k->p.inode) { + struct bch_inode_unpacked inode; + +@@ -669,8 +669,8 @@ int bch2_move_data(struct bch_fs *c, + id++) { + stats->btree_id = id; + +- if (id != BTREE_ID_EXTENTS && +- id != BTREE_ID_REFLINK) ++ if (id != BTREE_ID_extents && ++ id != BTREE_ID_reflink) + continue; + + ret = __bch2_move_data(c, &ctxt, rate, wp, +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index d53b6dccd161..a6c734efe328 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -30,6 +30,13 @@ const char * const bch2_sb_compat[] = { + NULL + }; + ++const char * const bch2_btree_ids[] = { ++#define x(name, ...) 
#name, ++ BCH_BTREE_IDS() ++#undef x ++ NULL ++}; ++ + const char * const bch2_csum_opts[] = { + "none", + "crc32c", +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 7ce2b3adb8d7..a1bbe1dc0b94 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -11,6 +11,7 @@ + extern const char * const bch2_error_actions[]; + extern const char * const bch2_sb_features[]; + extern const char * const bch2_sb_compat[]; ++extern const char * const bch2_btree_ids[]; + extern const char * const bch2_csum_opts[]; + extern const char * const bch2_compression_opts[]; + extern const char * const bch2_str_hash_types[]; +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index d3032a46e7f3..041da982d051 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -363,7 +363,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), ++ for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0), + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->p.inode != type) + break; +@@ -435,7 +435,7 @@ int bch2_fs_quota_read(struct bch_fs *c) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + switch (k.k->type) { + case KEY_TYPE_inode: +@@ -526,7 +526,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) + if (c->opts.usrquota) + return -EINVAL; + +- ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_USR, 0), + POS(QTYP_USR + 1, 0), + NULL); +@@ -538,7 +538,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) + if (c->opts.grpquota) + return -EINVAL; + +- ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_GRP, 0), + POS(QTYP_GRP + 1, 0), + NULL); +@@ -550,7 +550,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) + if (c->opts.prjquota) + return -EINVAL; + +- ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_PRJ, 0), + POS(QTYP_PRJ + 1, 0), + NULL); +@@ -718,7 +718,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, ++ iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 4d7badcc568b..b68fcd1d19e4 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -33,7 +33,7 @@ static void drop_alloc_keys(struct journal_keys *keys) + size_t src, dst; + + for (src = 0, dst = 0; src < keys->nr; src++) +- if (keys->d[src].btree_id != BTREE_ID_ALLOC) ++ if (keys->d[src].btree_id != BTREE_ID_alloc) + keys->d[dst++] = keys->d[src]; + + keys->nr = dst; +@@ -554,7 +554,7 @@ static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) + struct btree_iter *iter; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, ++ iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); +@@ -606,7 +606,7 @@ static int bch2_journal_replay(struct bch_fs *c, + for_each_journal_key(keys, i) { + cond_resched(); + +- if 
(!i->level && i->btree_id == BTREE_ID_ALLOC) { ++ if (!i->level && i->btree_id == BTREE_ID_alloc) { + j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; + ret = bch2_alloc_replay_key(c, i->k); + if (ret) +@@ -645,7 +645,7 @@ static int bch2_journal_replay(struct bch_fs *c, + for_each_journal_key(keys, i) { + cond_resched(); + +- if (i->level || i->btree_id == BTREE_ID_ALLOC) ++ if (i->level || i->btree_id == BTREE_ID_alloc) + continue; + + replay_now_at(j, keys.journal_seq_base + i->journal_seq); +@@ -931,28 +931,28 @@ static int read_btree_roots(struct bch_fs *c) + if (!r->alive) + continue; + +- if (i == BTREE_ID_ALLOC && ++ if (i == BTREE_ID_alloc && + c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + continue; + } + + if (r->error) { +- __fsck_err(c, i == BTREE_ID_ALLOC ++ __fsck_err(c, i == BTREE_ID_alloc + ? FSCK_CAN_IGNORE : 0, + "invalid btree root %s", + bch2_btree_ids[i]); +- if (i == BTREE_ID_ALLOC) ++ if (i == BTREE_ID_alloc) + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + } + + ret = bch2_btree_root_read(c, i, &r->key, r->level); + if (ret) { +- __fsck_err(c, i == BTREE_ID_ALLOC ++ __fsck_err(c, i == BTREE_ID_alloc + ? FSCK_CAN_IGNORE : 0, + "error reading btree root %s", + bch2_btree_ids[i]); +- if (i == BTREE_ID_ALLOC) ++ if (i == BTREE_ID_alloc) + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + } + } +@@ -1346,7 +1346,7 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_inode_pack(c, &packed_inode, &root_inode); + + err = "error creating root directory"; +- ret = bch2_btree_insert(c, BTREE_ID_INODES, ++ ret = bch2_btree_insert(c, BTREE_ID_inodes, + &packed_inode.inode.k_i, + NULL, NULL, 0); + if (ret) +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 930547de3309..a2cc078597f2 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -119,7 +119,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + if (orig->k.type == KEY_TYPE_inline_data) + bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); + +- for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, ++ for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink, + POS(0, c->reflink_hint), + BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { + if (reflink_iter->pos.inode) { +@@ -219,9 +219,9 @@ s64 bch2_remap_range(struct bch_fs *c, + bch2_bkey_buf_init(&new_src); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + +- src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, ++ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start, + BTREE_ITER_INTENT); +- dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, ++ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, + BTREE_ITER_INTENT); + + while (1) { +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 677dbb84b81e..66a8bcb4f0b9 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -942,7 +942,7 @@ void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) + for (entry = clean->start; + entry < (struct jset_entry *) vstruct_end(&clean->field); + entry = vstruct_next(entry)) +- bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); ++ bch2_bkey_renumber(BKEY_TYPE_btree, bkey_to_packed(entry->start), write); + } + + int bch2_fs_mark_dirty(struct bch_fs *c) +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 27ee527c789d..aba4a32b84ab 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1430,7 +1430,7 @@ int 
bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) + + for (i = 0; i < ca->mi.nbuckets; i++) { + ret = bch2_btree_key_cache_flush(&trans, +- BTREE_ID_ALLOC, POS(ca->dev_idx, i)); ++ BTREE_ID_alloc, POS(ca->dev_idx, i)); + if (ret) + break; + } +@@ -1439,7 +1439,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) + if (ret) + return ret; + +- return bch2_btree_delete_range(c, BTREE_ID_ALLOC, ++ return bch2_btree_delete_range(c, BTREE_ID_alloc, + POS(ca->dev_idx, 0), + POS(ca->dev_idx + 1, 0), + NULL); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index c01d2cc55d1e..e5fa6683452f 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -259,7 +259,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) + if (k.k->type == KEY_TYPE_extent) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index f1d09e3ada09..dfb12fdd4814 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -13,12 +13,12 @@ static void delete_test_keys(struct bch_fs *c) + { + int ret; + +- ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ ret = bch2_btree_delete_range(c, BTREE_ID_extents, + POS(0, 0), POS(0, U64_MAX), + NULL); + BUG_ON(ret); + +- ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, + POS(0, 0), POS(0, U64_MAX), + NULL); + BUG_ON(ret); +@@ -37,7 +37,7 @@ static int test_delete(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(iter); +@@ -82,7 +82,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(iter); +@@ -130,7 +130,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) + bkey_cookie_init(&k.k_i); + k.k.p.offset = i; + +- ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { + bch_err(c, "insert error in test_iterate: %i", ret); +@@ -142,7 +142,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + POS_MIN, 0, k, ret) { + if (k.k->p.inode) + break; +@@ -184,7 +184,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) + k.k.p.offset = i + 8; + k.k.size = 8; + +- ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { + bch_err(c, "insert error in test_iterate_extents: %i", ret); +@@ -196,7 +196,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, + POS_MIN, 0, k, ret) { + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; +@@ -237,7 +237,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + bkey_cookie_init(&k.k_i); + k.k.p.offset = 
i * 2; + +- ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { + bch_err(c, "insert error in test_iterate_slots: %i", ret); +@@ -249,7 +249,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + 0, k, ret) { + if (k.k->p.inode) + break; +@@ -265,7 +265,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + BTREE_ITER_SLOTS, k, ret) { + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); +@@ -300,7 +300,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + k.k.p.offset = i + 16; + k.k.size = 8; + +- ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { + bch_err(c, "insert error in test_iterate_slots_extents: %i", ret); +@@ -312,7 +312,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, + 0, k, ret) { + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); +@@ -326,7 +326,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, + BTREE_ITER_SLOTS, k, ret) { + BUG_ON(bkey_deleted(k.k) != !(i % 16)); + +@@ -354,7 +354,7 @@ static int test_peek_end(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); + + k = bch2_btree_iter_peek(iter); + BUG_ON(k.k); +@@ -374,7 +374,7 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0); + + k = bch2_btree_iter_peek(iter); + BUG_ON(k.k); +@@ -403,7 +403,7 @@ static int insert_test_extent(struct bch_fs *c, + k.k_i.k.size = end - start; + k.k_i.k.version.lo = test_version++; + +- ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) + bch_err(c, "insert error in insert_test_extent: %i", ret); +@@ -475,7 +475,7 @@ static int rand_insert(struct bch_fs *c, u64 nr) + k.k.p.offset = test_rand(); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); + if (ret) { + bch_err(c, "error in rand_insert: %i", ret); + break; +@@ -495,7 +495,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr) + u64 i; + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); + + for (i = 0; i < nr; i++) { + bch2_btree_iter_set_pos(iter, POS(0, test_rand())); +@@ -522,7 +522,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr) + u64 i; + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); ++ iter = 
bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); + + for (i = 0; i < nr; i++) { + bch2_btree_iter_set_pos(iter, POS(0, test_rand())); +@@ -561,7 +561,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) + struct bkey_s_c k; + int ret = 0; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, ++ iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); +@@ -616,7 +616,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + insert.k.p = iter->pos; + +@@ -643,7 +643,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) + ; + bch2_trans_exit(&trans); + return ret; +@@ -658,7 +658,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + BTREE_ITER_INTENT, k, ret) { + struct bkey_i_cookie u; + +@@ -679,7 +679,7 @@ static int seq_delete(struct bch_fs *c, u64 nr) + { + int ret; + +- ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, + POS(0, 0), POS(0, U64_MAX), + NULL); + if (ret) +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 2c4034f12147..91bfc340957b 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -61,7 +61,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) + } + + const struct bch_hash_desc bch2_xattr_hash_desc = { +- .btree_id = BTREE_ID_XATTRS, ++ .btree_id = BTREE_ID_xattrs, + .key_type = KEY_TYPE_xattr, + .hash_key = xattr_hash_key, + .hash_bkey = xattr_hash_bkey, +@@ -279,7 +279,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + POS(inum, 0), 0, k, ret) { + BUG_ON(k.k->p.inode < inum); + +-- +cgit v1.2.3 + + +From 0aa34563839a8b092730ee8eac63d7619f1df548 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Feb 2021 19:47:58 -0500 +Subject: bcachefs: Use x-macros for more enums + +This patch standardizes all the enums that have associated string tables +(probably more enums should have string tables). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 8 ++--- + fs/bcachefs/bcachefs_format.h | 73 ++++++++++++++++++++++++++++-------------- + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/checksum.h | 6 ++-- + fs/bcachefs/error.c | 10 +++--- + fs/bcachefs/extents.c | 2 +- + fs/bcachefs/journal_io.c | 6 ++-- + fs/bcachefs/opts.c | 45 ++++++-------------------- + fs/bcachefs/opts.h | 11 +++---- + fs/bcachefs/replicas.c | 2 +- + fs/bcachefs/str_hash.h | 6 ++-- + fs/bcachefs/super.c | 40 +++++++++++------------ + fs/bcachefs/super.h | 10 +++--- + fs/bcachefs/sysfs.c | 2 +- + 14 files changed, 110 insertions(+), 113 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index d677a4e64fb9..37539431c7f2 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -720,13 +720,13 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) + ca->inc_gen_needs_gc = 0; + + switch (ca->mi.replacement) { +- case CACHE_REPLACEMENT_LRU: ++ case BCH_CACHE_REPLACEMENT_lru: + find_reclaimable_buckets_lru(c, ca); + break; +- case CACHE_REPLACEMENT_FIFO: ++ case BCH_CACHE_REPLACEMENT_fifo: + find_reclaimable_buckets_fifo(c, ca); + break; +- case CACHE_REPLACEMENT_RANDOM: ++ case BCH_CACHE_REPLACEMENT_random: + find_reclaimable_buckets_random(c, ca); + break; + } +@@ -1037,7 +1037,7 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) + + static inline bool allocator_thread_running(struct bch_dev *ca) + { +- return ca->mi.state == BCH_MEMBER_STATE_RW && ++ return ca->mi.state == BCH_MEMBER_STATE_rw && + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); + } + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index c15e5a9c16f7..97e548994512 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -987,19 +987,29 @@ LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); + LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); + #endif + ++#define BCH_MEMBER_STATES() \ ++ x(rw, 0) \ ++ x(ro, 1) \ ++ x(failed, 2) \ ++ x(spare, 3) ++ + enum bch_member_state { +- BCH_MEMBER_STATE_RW = 0, +- BCH_MEMBER_STATE_RO = 1, +- BCH_MEMBER_STATE_FAILED = 2, +- BCH_MEMBER_STATE_SPARE = 3, +- BCH_MEMBER_STATE_NR = 4, ++#define x(t, n) BCH_MEMBER_STATE_##t = n, ++ BCH_MEMBER_STATES() ++#undef x ++ BCH_MEMBER_STATE_NR + }; + +-enum cache_replacement { +- CACHE_REPLACEMENT_LRU = 0, +- CACHE_REPLACEMENT_FIFO = 1, +- CACHE_REPLACEMENT_RANDOM = 2, +- CACHE_REPLACEMENT_NR = 3, ++#define BCH_CACHE_REPLACEMENT_POLICIES() \ ++ x(lru, 0) \ ++ x(fifo, 1) \ ++ x(random, 2) ++ ++enum bch_cache_replacement_policies { ++#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n, ++ BCH_CACHE_REPLACEMENT_POLICIES() ++#undef x ++ BCH_CACHE_REPLACEMENT_NR + }; + + struct bch_sb_field_members { +@@ -1401,11 +1411,16 @@ enum bch_sb_compat { + + #define BCH_BKEY_PTRS_MAX 16U + ++#define BCH_ERROR_ACTIONS() \ ++ x(continue, 0) \ ++ x(ro, 1) \ ++ x(panic, 2) ++ + enum bch_error_actions { +- BCH_ON_ERROR_CONTINUE = 0, +- BCH_ON_ERROR_RO = 1, +- BCH_ON_ERROR_PANIC = 2, +- BCH_NR_ERROR_ACTIONS = 3, ++#define x(t, n) BCH_ON_ERROR_##t = n, ++ BCH_ERROR_ACTIONS() ++#undef x ++ BCH_ON_ERROR_NR + }; + + enum bch_str_hash_type { +@@ -1416,11 +1431,16 @@ enum bch_str_hash_type { + BCH_STR_HASH_NR = 4, + }; + ++#define BCH_STR_HASH_OPTS() \ ++ x(crc32c, 0) \ ++ x(crc64, 1) \ ++ x(siphash, 2) ++ + enum bch_str_hash_opts { +- 
BCH_STR_HASH_OPT_CRC32C = 0, +- BCH_STR_HASH_OPT_CRC64 = 1, +- BCH_STR_HASH_OPT_SIPHASH = 2, +- BCH_STR_HASH_OPT_NR = 3, ++#define x(t, n) BCH_STR_HASH_OPT_##t = n, ++ BCH_STR_HASH_OPTS() ++#undef x ++ BCH_STR_HASH_OPT_NR + }; + + enum bch_csum_type { +@@ -1455,11 +1475,16 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) + } + } + ++#define BCH_CSUM_OPTS() \ ++ x(none, 0) \ ++ x(crc32c, 1) \ ++ x(crc64, 2) ++ + enum bch_csum_opts { +- BCH_CSUM_OPT_NONE = 0, +- BCH_CSUM_OPT_CRC32C = 1, +- BCH_CSUM_OPT_CRC64 = 2, +- BCH_CSUM_OPT_NR = 3, ++#define x(t, n) BCH_CSUM_OPT_##t = n, ++ BCH_CSUM_OPTS() ++#undef x ++ BCH_CSUM_OPT_NR + }; + + #define BCH_COMPRESSION_TYPES() \ +@@ -1471,7 +1496,7 @@ enum bch_csum_opts { + x(incompressible, 5) + + enum bch_compression_type { +-#define x(t, n) BCH_COMPRESSION_TYPE_##t, ++#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, + BCH_COMPRESSION_TYPES() + #undef x + BCH_COMPRESSION_TYPE_NR +@@ -1484,7 +1509,7 @@ enum bch_compression_type { + x(zstd, 3) + + enum bch_compression_opts { +-#define x(t, n) BCH_COMPRESSION_OPT_##t, ++#define x(t, n) BCH_COMPRESSION_OPT_##t = n, + BCH_COMPRESSION_OPTS() + #undef x + BCH_COMPRESSION_OPT_NR +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c65e6e475083..0d71910d681c 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -954,7 +954,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + +- if (ca->mi.state != BCH_MEMBER_STATE_RW) ++ if (ca->mi.state != BCH_MEMBER_STATE_rw) + set_btree_node_need_rewrite(b); + } + out: +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +index 24dee8039d57..728b7ef1a149 100644 +--- a/fs/bcachefs/checksum.h ++++ b/fs/bcachefs/checksum.h +@@ -77,11 +77,11 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, + bool data) + { + switch (type) { +- case BCH_CSUM_OPT_NONE: ++ case BCH_CSUM_OPT_none: + return BCH_CSUM_NONE; +- case BCH_CSUM_OPT_CRC32C: ++ case BCH_CSUM_OPT_crc32c: + return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; +- case BCH_CSUM_OPT_CRC64: ++ case BCH_CSUM_OPT_crc64: + return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; + default: + BUG(); +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +index cd46706fb6f5..a8ee1db8aa39 100644 +--- a/fs/bcachefs/error.c ++++ b/fs/bcachefs/error.c +@@ -11,13 +11,13 @@ bool bch2_inconsistent_error(struct bch_fs *c) + set_bit(BCH_FS_ERROR, &c->flags); + + switch (c->opts.errors) { +- case BCH_ON_ERROR_CONTINUE: ++ case BCH_ON_ERROR_continue: + return false; +- case BCH_ON_ERROR_RO: ++ case BCH_ON_ERROR_ro: + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "emergency read only"); + return true; +- case BCH_ON_ERROR_PANIC: ++ case BCH_ON_ERROR_panic: + panic(bch2_fmt(c, "panic after error")); + return true; + default: +@@ -38,10 +38,10 @@ void bch2_io_error_work(struct work_struct *work) + bool dev; + + down_write(&c->state_lock); +- dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, ++ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); + if (dev +- ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, ++ ? 
__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED) + : bch2_fs_emergency_read_only(c)) + bch_err(ca, +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 818609347bd9..dbaded8176cb 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -725,7 +725,7 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + + ca = bch_dev_bkey_exists(c, p.ptr.dev); + +- if (ca->mi.state != BCH_MEMBER_STATE_FAILED) ++ if (ca->mi.state != BCH_MEMBER_STATE_failed) + durability = max_t(unsigned, durability, ca->mi.durability); + + if (p.has_ec) +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 0d361f5c39b5..37465bdbe4a9 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -871,8 +871,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) + continue; + +- if ((ca->mi.state == BCH_MEMBER_STATE_RW || +- ca->mi.state == BCH_MEMBER_STATE_RO) && ++ if ((ca->mi.state == BCH_MEMBER_STATE_rw || ++ ca->mi.state == BCH_MEMBER_STATE_ro) && + percpu_ref_tryget(&ca->io_ref)) + closure_call(&ca->journal.read, + bch2_journal_read_device, +@@ -1065,7 +1065,7 @@ static void __journal_write_alloc(struct journal *j, + * it: + */ + if (!ca->mi.durability || +- ca->mi.state != BCH_MEMBER_STATE_RW || ++ ca->mi.state != BCH_MEMBER_STATE_rw || + !ja->nr || + bch2_bkey_has_device(bkey_i_to_s_c(&w->key), + ca->dev_idx) || +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index a6c734efe328..0cfbb56a57c1 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -9,86 +9,59 @@ + #include "super-io.h" + #include "util.h" + ++#define x(t, n) #t, ++ + const char * const bch2_error_actions[] = { +- "continue", +- "remount-ro", +- "panic", ++ BCH_ERROR_ACTIONS() + NULL + }; + + const char * const bch2_sb_features[] = { +-#define x(f, n) #f, + BCH_SB_FEATURES() +-#undef x + NULL + }; + + const char * const bch2_sb_compat[] = { +-#define x(f, n) #f, + BCH_SB_COMPAT() +-#undef x + NULL + }; + + const char * const bch2_btree_ids[] = { +-#define x(name, ...) 
#name, + BCH_BTREE_IDS() +-#undef x + NULL + }; + + const char * const bch2_csum_opts[] = { +- "none", +- "crc32c", +- "crc64", ++ BCH_CSUM_OPTS() + NULL + }; + + const char * const bch2_compression_opts[] = { +-#define x(t, n) #t, + BCH_COMPRESSION_OPTS() +-#undef x + NULL + }; + + const char * const bch2_str_hash_types[] = { +- "crc32c", +- "crc64", +- "siphash", ++ BCH_STR_HASH_OPTS() + NULL + }; + + const char * const bch2_data_types[] = { +-#define x(t, n) #t, + BCH_DATA_TYPES() +-#undef x + NULL + }; + + const char * const bch2_cache_replacement_policies[] = { +- "lru", +- "fifo", +- "random", ++ BCH_CACHE_REPLACEMENT_POLICIES() + NULL + }; + +-/* Default is -1; we skip past it for struct cached_dev's cache mode */ +-const char * const bch2_cache_modes[] = { +- "default", +- "writethrough", +- "writeback", +- "writearound", +- "none", ++const char * const bch2_member_states[] = { ++ BCH_MEMBER_STATES() + NULL + }; + +-const char * const bch2_dev_state[] = { +- "readwrite", +- "readonly", +- "failed", +- "spare", +- NULL +-}; ++#undef x + + void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) + { +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index a1bbe1dc0b94..f5d55427bf83 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -17,8 +17,7 @@ extern const char * const bch2_compression_opts[]; + extern const char * const bch2_str_hash_types[]; + extern const char * const bch2_data_types[]; + extern const char * const bch2_cache_replacement_policies[]; +-extern const char * const bch2_cache_modes[]; +-extern const char * const bch2_dev_state[]; ++extern const char * const bch2_member_states[]; + + /* + * Mount options; we also store defaults in the superblock. +@@ -91,7 +90,7 @@ enum opt_type { + x(errors, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_error_actions), \ +- BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ ++ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + NULL, "Action to take on filesystem error") \ + x(metadata_replicas, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +@@ -116,12 +115,12 @@ enum opt_type { + x(metadata_checksum, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ +- BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ ++ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(data_checksum, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_STR(bch2_csum_opts), \ +- BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ ++ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(compression, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ +@@ -136,7 +135,7 @@ enum opt_type { + x(str_hash, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_str_hash_types), \ +- BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ ++ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ + NULL, "Hash function for directory entries and xattrs")\ + x(metadata_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index fccdb630010c..e45a6d6b103c 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -973,7 +973,7 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); + + nr_online += test_bit(e->devs[i], devs.d); +- nr_failed += ca->mi.state == BCH_MEMBER_STATE_FAILED; ++ nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; + } + + if (nr_failed == e->nr_devs) +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 
952b146af750..b85f895de346 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -18,11 +18,11 @@ static inline enum bch_str_hash_type + bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) + { + switch (opt) { +- case BCH_STR_HASH_OPT_CRC32C: ++ case BCH_STR_HASH_OPT_crc32c: + return BCH_STR_HASH_CRC32C; +- case BCH_STR_HASH_OPT_CRC64: ++ case BCH_STR_HASH_OPT_crc64: + return BCH_STR_HASH_CRC64; +- case BCH_STR_HASH_OPT_SIPHASH: ++ case BCH_STR_HASH_OPT_siphash: + return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) + ? BCH_STR_HASH_SIPHASH + : BCH_STR_HASH_SIPHASH_OLD; +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index aba4a32b84ab..10d3c616b9c7 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1163,7 +1163,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) + + ca->fs = c; + +- if (ca->mi.state == BCH_MEMBER_STATE_RW && ++ if (ca->mi.state == BCH_MEMBER_STATE_rw && + bch2_dev_allocator_start(ca)) { + bch2_dev_free(ca); + goto err; +@@ -1270,16 +1270,16 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, + lockdep_assert_held(&c->state_lock); + + switch (new_state) { +- case BCH_MEMBER_STATE_RW: ++ case BCH_MEMBER_STATE_rw: + return true; +- case BCH_MEMBER_STATE_RO: +- if (ca->mi.state != BCH_MEMBER_STATE_RW) ++ case BCH_MEMBER_STATE_ro: ++ if (ca->mi.state != BCH_MEMBER_STATE_rw) + return true; + + /* do we have enough devices to write to? */ + for_each_member_device(ca2, c, i) + if (ca2 != ca) +- nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; ++ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; + + required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) + ? c->opts.metadata_replicas +@@ -1289,10 +1289,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, + : c->opts.data_replicas_required); + + return nr_rw >= required; +- case BCH_MEMBER_STATE_FAILED: +- case BCH_MEMBER_STATE_SPARE: +- if (ca->mi.state != BCH_MEMBER_STATE_RW && +- ca->mi.state != BCH_MEMBER_STATE_RO) ++ case BCH_MEMBER_STATE_failed: ++ case BCH_MEMBER_STATE_spare: ++ if (ca->mi.state != BCH_MEMBER_STATE_rw && ++ ca->mi.state != BCH_MEMBER_STATE_ro) + return true; + + /* do we have enough devices to read from? 
*/ +@@ -1329,8 +1329,8 @@ static bool bch2_fs_may_start(struct bch_fs *c) + ca = bch_dev_locked(c, i); + + if (!bch2_dev_is_online(ca) && +- (ca->mi.state == BCH_MEMBER_STATE_RW || +- ca->mi.state == BCH_MEMBER_STATE_RO)) { ++ (ca->mi.state == BCH_MEMBER_STATE_rw || ++ ca->mi.state == BCH_MEMBER_STATE_ro)) { + mutex_unlock(&c->sb_lock); + return false; + } +@@ -1363,7 +1363,7 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) + { + lockdep_assert_held(&c->state_lock); + +- BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); ++ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); + + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); +@@ -1386,10 +1386,10 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + if (!bch2_dev_state_allowed(c, ca, new_state, flags)) + return -EINVAL; + +- if (new_state != BCH_MEMBER_STATE_RW) ++ if (new_state != BCH_MEMBER_STATE_rw) + __bch2_dev_read_only(c, ca); + +- bch_notice(ca, "%s", bch2_dev_state[new_state]); ++ bch_notice(ca, "%s", bch2_member_states[new_state]); + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); +@@ -1397,7 +1397,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +- if (new_state == BCH_MEMBER_STATE_RW && ++ if (new_state == BCH_MEMBER_STATE_rw && + __bch2_dev_read_write(c, ca)) + ret = -ENOMEM; + +@@ -1459,7 +1459,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + */ + percpu_ref_put(&ca->ref); + +- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { + bch_err(ca, "Cannot remove without losing data"); + goto err; + } +@@ -1543,7 +1543,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + bch2_dev_usage_journal_reserve(c); + return 0; + err: +- if (ca->mi.state == BCH_MEMBER_STATE_RW && ++ if (ca->mi.state == BCH_MEMBER_STATE_rw && + !percpu_ref_is_zero(&ca->io_ref)) + __bch2_dev_read_write(c, ca); + up_write(&c->state_lock); +@@ -1667,7 +1667,7 @@ have_slot: + if (ret) + goto err_late; + +- if (ca->mi.state == BCH_MEMBER_STATE_RW) { ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) { + err = __bch2_dev_read_write(c, ca); + if (err) + goto err_late; +@@ -1728,7 +1728,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) + goto err; + } + +- if (ca->mi.state == BCH_MEMBER_STATE_RW) { ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) { + err = __bch2_dev_read_write(c, ca); + if (err) + goto err; +@@ -1762,7 +1762,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) + return 0; + } + +- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { + bch_err(ca, "Cannot offline required disk"); + up_write(&c->state_lock); + return -EINVAL; +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +index 2820ca110598..6cab506150a8 100644 +--- a/fs/bcachefs/super.h ++++ b/fs/bcachefs/super.h +@@ -34,7 +34,7 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca) + static inline bool bch2_dev_is_readable(struct bch_dev *ca) + { + return bch2_dev_is_online(ca) && +- ca->mi.state != BCH_MEMBER_STATE_FAILED; ++ ca->mi.state != BCH_MEMBER_STATE_failed; + } + + static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) +@@ -42,8 +42,8 @@ static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + +- if (ca->mi.state == 
BCH_MEMBER_STATE_RW || +- (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) ++ if (ca->mi.state == BCH_MEMBER_STATE_rw || ++ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) + return true; + + percpu_ref_put(&ca->io_ref); +@@ -158,11 +158,11 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, + __for_each_online_member(ca, c, iter, ~0) + + #define for_each_rw_member(ca, c, iter) \ +- __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) ++ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) + + #define for_each_readable_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, \ +- (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) ++ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) + + /* + * If a key exists that references a device, the device won't be going away and +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index e5fa6683452f..a6a0a3f6f205 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -888,7 +888,7 @@ SHOW(bch2_dev) + } + + if (attr == &sysfs_state_rw) { +- bch2_string_opt_to_text(&out, bch2_dev_state, ++ bch2_string_opt_to_text(&out, bch2_member_states, + ca->mi.state); + pr_buf(&out, "\n"); + return out.pos - buf; +-- +cgit v1.2.3 + + +From 4bc9961f21e748ef8d2932a259ec711dbbf0b2d2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Feb 2021 20:51:57 -0500 +Subject: bcachefs: Improve handling of extents in bch2_trans_update() + +The transaction update/commit path cares about whether it's inserting +extents or regular keys; extents require extra passes (handling of +overlapping extents) but sometimes we want to skip all that. This +clarifies things by adding a new member to btree_insert_entry specifying +whether the key being inserted is an extent, instead of overloading +BTREE_ITER_IS_EXTENTS. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 21 +-- + fs/bcachefs/btree_update_leaf.c | 283 ++++++++++++++++++++-------------------- + 2 files changed, 155 insertions(+), 149 deletions(-) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 6b221d9c6ae4..fd00b9c06bfc 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -335,7 +335,11 @@ struct bkey_cached { + + struct btree_insert_entry { + unsigned trigger_flags; ++ u8 bkey_type; ++ u8 btree_id; ++ u8 level; + unsigned trans_triggers_run:1; ++ unsigned is_extent:1; + struct bkey_i *k; + struct btree_iter *iter; + }; +@@ -589,19 +593,20 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) + return btree_node_type_is_extents(btree_iter_key_type(iter)); + } + +-#define BTREE_NODE_TYPE_HAS_TRIGGERS \ ++#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_extents)| \ +- (1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ +- (1U << BKEY_TYPE_reflink)| \ + (1U << BKEY_TYPE_stripes)| \ ++ (1U << BKEY_TYPE_reflink)| \ + (1U << BKEY_TYPE_btree)) + +-#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ +- ((1U << BKEY_TYPE_extents)| \ +- (1U << BKEY_TYPE_inodes)| \ +- (1U << BKEY_TYPE_stripes)| \ +- (1U << BKEY_TYPE_reflink)) ++#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ ++ ((1U << BKEY_TYPE_alloc)| \ ++ (1U << BKEY_TYPE_stripes)) ++ ++#define BTREE_NODE_TYPE_HAS_TRIGGERS \ ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ ++ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + + enum btree_trigger_flags { + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 2e6f14452894..6ac359629afa 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -21,6 +21,14 @@ + #include + #include + ++static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, ++ const struct btree_insert_entry *r) ++{ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ -cmp_int(l->level, r->level) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p); ++} ++ + static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) + { +@@ -211,15 +219,15 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + /* Normal update interface: */ + + static inline void btree_insert_entry_checks(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) ++ struct btree_insert_entry *i) + { + struct bch_fs *c = trans->c; + +- BUG_ON(bkey_cmp(insert->k.p, iter->real_pos)); + BUG_ON(bch2_debug_check_bkeys && +- bch2_bkey_invalid(c, bkey_i_to_s_c(insert), +- __btree_node_type(iter->level, iter->btree_id))); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type)); ++ BUG_ON(bkey_cmp(i->k->k.p, i->iter->real_pos)); ++ BUG_ON(i->level != i->iter->level); ++ BUG_ON(i->btree_id != i->iter->btree_id); + } + + static noinline int +@@ -332,19 +340,6 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + } + } + +-static inline bool iter_has_trans_triggers(struct btree_iter *iter) +-{ +- return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); +-} +- +-static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) +-{ +- return (((BTREE_NODE_TYPE_HAS_TRIGGERS & +- ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) | +- (1U << BTREE_ID_stripes)) & +- (1U << iter->btree_id); +-} +- + static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) + { + __bch2_btree_iter_unlock(iter); +@@ -405,7 +400,7 @@ 
bch2_trans_commit_write_locked(struct btree_trans *trans, + return ret; + } + +- if (btree_node_type_needs_gc(i->iter->btree_id)) ++ if (btree_node_type_needs_gc(i->bkey_type)) + marking = true; + } + +@@ -459,7 +454,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + } + + trans_for_each_update(trans, i) +- if (iter_has_nontrans_triggers(i->iter)) ++ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) + bch2_mark_update(trans, i->iter, i->k, + fs_usage, i->trigger_flags); + +@@ -531,7 +526,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + trans_for_each_update2(trans, i) +- btree_insert_entry_checks(trans, i->iter, i->k); ++ btree_insert_entry_checks(trans, i); + bch2_btree_trans_verify_locks(trans); + + trans_for_each_update2(trans, i) +@@ -696,69 +691,64 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + +-static inline int btree_iter_pos_cmp(const struct btree_iter *l, +- const struct btree_iter *r) +-{ +- return cmp_int(l->btree_id, r->btree_id) ?: +- bkey_cmp(l->pos, r->pos); +-} +- +-static int bch2_trans_update2(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) ++static int __bch2_trans_update2(struct btree_trans *trans, ++ struct btree_insert_entry n) + { +- struct btree_insert_entry *i, n = (struct btree_insert_entry) { +- .iter = iter, .k = insert +- }; +- int ret; ++ struct btree_insert_entry *i; + +- btree_insert_entry_checks(trans, n.iter, n.k); ++ btree_insert_entry_checks(trans, &n); + + EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); + +- ret = bch2_btree_iter_traverse(iter); +- if (unlikely(ret)) +- return ret; +- +- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + +- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; +- +- trans_for_each_update2(trans, i) { +- if (btree_iter_pos_cmp(n.iter, i->iter) == 0) { +- *i = n; +- return 0; +- } +- +- if (btree_iter_pos_cmp(n.iter, i->iter) <= 0) ++ trans_for_each_update2(trans, i) ++ if (btree_insert_entry_cmp(&n, i) <= 0) + break; +- } + +- array_insert_item(trans->updates2, trans->nr_updates2, +- i - trans->updates2, n); ++ if (i < trans->updates2 + trans->nr_updates2 && ++ !btree_insert_entry_cmp(&n, i)) ++ *i = n; ++ else ++ array_insert_item(trans->updates2, trans->nr_updates2, ++ i - trans->updates2, n); ++ + return 0; + } + ++static int bch2_trans_update2(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ return __bch2_trans_update2(trans, (struct btree_insert_entry) { ++ .bkey_type = __btree_node_type(iter->level, iter->btree_id), ++ .btree_id = iter->btree_id, ++ .level = iter->level, ++ .iter = iter, ++ .k = insert, ++ }); ++} ++ + static int extent_update_to_keys(struct btree_trans *trans, +- struct btree_iter *orig_iter, +- struct bkey_i *insert) ++ struct btree_insert_entry n) + { +- struct btree_iter *iter; + int ret; + +- ret = bch2_extent_can_insert(trans, orig_iter, insert); ++ if (bkey_deleted(&n.k->k)) ++ return 0; ++ ++ ret = bch2_extent_can_insert(trans, n.iter, n.k); + if (ret) + return ret; + +- if (bkey_deleted(&insert->k)) +- return 0; ++ n.iter = bch2_trans_copy_iter(trans, n.iter); + +- iter = bch2_trans_copy_iter(trans, orig_iter); ++ n.iter->flags |= BTREE_ITER_INTENT; ++ __bch2_btree_iter_set_pos(n.iter, n.k->k.p, false); ++ n.is_extent = false; + +- iter->flags |= BTREE_ITER_INTENT; +- __bch2_btree_iter_set_pos(iter, insert->k.p, false); +- ret = bch2_trans_update2(trans, iter, 
insert); +- bch2_trans_iter_put(trans, iter); ++ ret = __bch2_trans_update2(trans, n); ++ bch2_trans_iter_put(trans, n.iter); + return ret; + } + +@@ -868,7 +858,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && + !(i->trigger_flags & BTREE_TRIGGER_NORUN)) + bch2_btree_key_cache_verify_clean(trans, +- i->iter->btree_id, i->iter->pos); ++ i->btree_id, i->k->k.p); + #endif + + /* +@@ -879,24 +869,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + trans_trigger_run = false; + + trans_for_each_update(trans, i) { +- ret = bch2_btree_iter_traverse(i->iter); +- if (unlikely(ret)) { +- trace_trans_restart_traverse(trans->ip); +- goto out; +- } +- +- /* +- * We're not using bch2_btree_iter_upgrade here because +- * we know trans->nounlock can't be set: +- */ +- if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) && +- !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) { +- trace_trans_restart_upgrade(trans->ip); +- ret = -EINTR; +- goto out; +- } +- +- if (iter_has_trans_triggers(i->iter) && ++ if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + !i->trans_triggers_run) { + i->trans_triggers_run = true; + trans_trigger_run = true; +@@ -914,39 +887,46 @@ int __bch2_trans_commit(struct btree_trans *trans) + + /* Turn extents updates into keys: */ + trans_for_each_update(trans, i) +- if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ++ if (i->is_extent) { + struct bpos start = bkey_start_pos(&i->k->k); + + while (i + 1 < trans->updates + trans->nr_updates && +- i[0].iter->btree_id == i[1].iter->btree_id && ++ i[0].btree_id == i[1].btree_id && + !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) + i++; + +- ret = extent_handle_overwrites(trans, i->iter->btree_id, ++ ret = extent_handle_overwrites(trans, i->btree_id, + start, i->k->k.p); + if (ret) + goto out; + } + + trans_for_each_update(trans, i) { +- if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { +- ret = extent_update_to_keys(trans, i->iter, i->k); +- } else { +- ret = bch2_trans_update2(trans, i->iter, i->k); +- } ++ ret = i->is_extent ++ ? 
extent_update_to_keys(trans, *i) ++ : __bch2_trans_update2(trans, *i); + if (ret) + goto out; + } + + trans_for_each_update2(trans, i) { +- BUG_ON(i->iter->locks_want < 1); +- + ret = bch2_btree_iter_traverse(i->iter); + if (unlikely(ret)) { + trace_trans_restart_traverse(trans->ip); + goto out; + } + ++ /* ++ * We're not using bch2_btree_iter_upgrade here because ++ * we know trans->nounlock can't be set: ++ */ ++ if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) && ++ !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) { ++ trace_trans_restart_upgrade(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ + u64s = jset_u64s(i->k->k.u64s); + if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && + likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) +@@ -989,80 +969,101 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_trigger_flags flags) + { + struct btree_insert_entry *i, n = (struct btree_insert_entry) { +- .trigger_flags = flags, .iter = iter, .k = k ++ .trigger_flags = flags, ++ .bkey_type = __btree_node_type(iter->level, iter->btree_id), ++ .btree_id = iter->btree_id, ++ .level = iter->level, ++ .is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0, ++ .iter = iter, ++ .k = k + }; + ++ BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); ++ + #ifdef CONFIG_BCACHEFS_DEBUG + BUG_ON(bkey_cmp(iter->pos, +- (iter->flags & BTREE_ITER_IS_EXTENTS) +- ? bkey_start_pos(&k->k) +- : k->k.p)); ++ n.is_extent ? bkey_start_pos(&k->k) : k->k.p)); + + trans_for_each_update(trans, i) { + BUG_ON(bkey_cmp(i->iter->pos, +- (i->iter->flags & BTREE_ITER_IS_EXTENTS) +- ? bkey_start_pos(&i->k->k) +- : i->k->k.p)); ++ i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p)); + + BUG_ON(i != trans->updates && +- btree_iter_pos_cmp(i[-1].iter, i[0].iter) >= 0); ++ btree_insert_entry_cmp(i - 1, i) >= 0); + } + #endif + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + +- if (btree_node_type_is_extents(iter->btree_id)) { ++ if (n.is_extent) { + iter->pos_after_commit = k->k.p; + iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; + } + + /* +- * Pending updates are kept sorted: first, find position of new update: ++ * Pending updates are kept sorted: first, find position of new update, ++ * then delete/trim any updates the new update overwrites: + */ +- trans_for_each_update(trans, i) +- if (btree_iter_pos_cmp(iter, i->iter) <= 0) +- break; ++ if (!n.is_extent) { ++ trans_for_each_update(trans, i) ++ if (btree_insert_entry_cmp(&n, i) <= 0) ++ break; + +- /* +- * Now delete/trim any updates the new update overwrites: +- */ +- if (i > trans->updates && +- i[-1].iter->btree_id == iter->btree_id && +- bkey_cmp(iter->pos, i[-1].k->k.p) < 0) +- bch2_cut_back(n.iter->pos, i[-1].k); +- +- while (i < trans->updates + trans->nr_updates && +- iter->btree_id == i->iter->btree_id && +- bkey_cmp(n.k->k.p, i->k->k.p) >= 0) +- array_remove_item(trans->updates, trans->nr_updates, +- i - trans->updates); +- +- if (i < trans->updates + trans->nr_updates && +- iter->btree_id == i->iter->btree_id && +- bkey_cmp(n.k->k.p, i->iter->pos) > 0) { +- /* +- * When we have an extent that overwrites the start of another +- * update, trimming that extent will mean the iterator's +- * position has to change since the iterator position has to +- * match the extent's start pos - but we don't want to change +- * the iterator pos if some other code is using it, so we may +- * need to clone it: +- */ +- if (trans->iters_live & (1ULL << i->iter->idx)) { +- i->iter = 
bch2_trans_copy_iter(trans, i->iter); ++ if (i < trans->updates + trans->nr_updates && ++ !btree_insert_entry_cmp(&n, i)) ++ *i = n; ++ else ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, n); ++ } else { ++ trans_for_each_update(trans, i) ++ if (btree_insert_entry_cmp(&n, i) < 0) ++ break; + +- i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; +- bch2_trans_iter_put(trans, i->iter); ++ while (i > trans->updates && ++ i[-1].btree_id == n.btree_id && ++ bkey_cmp(bkey_start_pos(&n.k->k), ++ bkey_start_pos(&i[-1].k->k)) <= 0) { ++ --i; ++ array_remove_item(trans->updates, trans->nr_updates, ++ i - trans->updates); + } + +- bch2_cut_front(n.k->k.p, i->k); +- bch2_btree_iter_set_pos(i->iter, n.k->k.p); +- } ++ if (i > trans->updates && ++ i[-1].btree_id == n.btree_id && ++ bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0) ++ bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k); + +- EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); ++ if (i < trans->updates + trans->nr_updates && ++ i->btree_id == n.btree_id && ++ bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) { ++ /* We don't handle splitting extents here: */ ++ BUG_ON(bkey_cmp(bkey_start_pos(&n.k->k), ++ bkey_start_pos(&i->k->k)) > 0); ++ ++ /* ++ * When we have an extent that overwrites the start of another ++ * update, trimming that extent will mean the iterator's ++ * position has to change since the iterator position has to ++ * match the extent's start pos - but we don't want to change ++ * the iterator pos if some other code is using it, so we may ++ * need to clone it: ++ */ ++ if (trans->iters_live & (1ULL << i->iter->idx)) { ++ i->iter = bch2_trans_copy_iter(trans, i->iter); ++ ++ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ bch2_trans_iter_put(trans, i->iter); ++ } ++ ++ bch2_cut_front(n.k->k.p, i->k); ++ bch2_btree_iter_set_pos(i->iter, n.k->k.p); ++ } ++ ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, n); ++ } + +- array_insert_item(trans->updates, trans->nr_updates, +- i - trans->updates, n); + return 0; + } + +-- +cgit v1.2.3 + + +From 2c3303d16c5ac4f006f9a4b757984f69e0cadc60 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Feb 2021 22:19:34 -0500 +Subject: bcachefs: btree_iter_live() + +New helper to clean things up a bit - also, improve iter->flags +handling. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 28 +++++++++------------------- + fs/bcachefs/btree_iter.h | 11 +++++++++++ + fs/bcachefs/btree_types.h | 7 ------- + fs/bcachefs/btree_update_leaf.c | 7 +++---- + 4 files changed, 23 insertions(+), 30 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index b55f35b98955..f048afa17256 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1701,7 +1701,8 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + k = __bch2_btree_iter_peek_with_updates(iter); + + if (k.k && bkey_deleted(k.k)) { +- bch2_btree_iter_advance_pos(iter); ++ if (!bch2_btree_iter_advance_pos(iter)) ++ return bkey_s_c_null; + continue; + } + +@@ -2008,7 +2009,7 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, +- (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", ++ btree_iter_live(trans, iter) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", + (void *) iter->ip_allocated); +@@ -2089,31 +2090,20 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + if (!best) { + iter = btree_trans_iter_alloc(trans); + bch2_btree_iter_init(trans, iter, btree_id, pos, flags); +- } else if ((trans->iters_live & (1ULL << best->idx)) || +- (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { ++ } else if (btree_iter_keep(trans, best)) { + iter = btree_trans_iter_alloc(trans); + btree_iter_copy(iter, best); + } else { + iter = best; + } + +- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; +- iter->flags &= ~BTREE_ITER_USER_FLAGS; +- iter->flags |= flags & BTREE_ITER_USER_FLAGS; ++ flags |= iter->flags & BTREE_ITER_ERROR; ++ iter->flags = flags; + +- if (iter->flags & BTREE_ITER_INTENT) { +- if (!iter->locks_want) { +- __bch2_btree_iter_unlock(iter); +- iter->locks_want = 1; +- } +- } else ++ if (!(iter->flags & BTREE_ITER_INTENT)) + bch2_btree_iter_downgrade(iter); +- +- BUG_ON(iter->btree_id != btree_id); +- BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); +- BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); +- BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); +- BUG_ON(trans->iters_live & (1ULL << iter->idx)); ++ else if (!iter->locks_want) ++ __bch2_btree_iter_upgrade_nounlock(iter, 1); + + trans->iters_live |= 1ULL << iter->idx; + trans->iters_touched |= 1ULL << iter->idx; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 12c519ae2a60..e2469436f53b 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -289,6 +289,17 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); + ++static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ return (trans->iters_live & (1ULL << iter->idx)) != 0; ++} ++ ++static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ return btree_iter_live(trans, iter) || ++ (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); ++} ++ + #define TRANS_RESET_NOTRAVERSE (1 << 0) + + void bch2_trans_reset(struct btree_trans *, unsigned); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index fd00b9c06bfc..48c82050e145 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -216,13 +216,6 @@ enum btree_iter_type { + #define BTREE_ITER_CACHED_NOFILL (1 << 9) + #define BTREE_ITER_CACHED_NOCREATE (1 << 10) + +-#define BTREE_ITER_USER_FLAGS \ +- (BTREE_ITER_SLOTS \ +- |BTREE_ITER_INTENT \ +- |BTREE_ITER_PREFETCH \ +- |BTREE_ITER_CACHED_NOFILL \ +- |BTREE_ITER_CACHED_NOCREATE) +- + enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, + BTREE_ITER_NEED_PEEK = 1, +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 6ac359629afa..f6bb55388ba3 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -512,8 +512,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + */ + trans_for_each_iter(trans, iter) { + if (iter->nodes_locked != iter->nodes_intent_locked) { +- if ((iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || +- (trans->iters_live & (1ULL << iter->idx))) { ++ if (btree_iter_keep(trans, iter)) { + if (!bch2_btree_iter_upgrade(iter, 1)) { + trace_trans_restart_upgrade(trans->ip); + return -EINTR; +@@ -945,7 +944,7 @@ retry: + goto err; + + trans_for_each_iter(trans, iter) +- if ((trans->iters_live & (1ULL << iter->idx)) && ++ if (btree_iter_live(trans, iter) && + (iter->flags & 
BTREE_ITER_SET_POS_AFTER_COMMIT)) + bch2_btree_iter_set_pos(iter, iter->pos_after_commit); + out: +@@ -1049,7 +1048,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + * the iterator pos if some other code is using it, so we may + * need to clone it: + */ +- if (trans->iters_live & (1ULL << i->iter->idx)) { ++ if (btree_iter_live(trans, i->iter)) { + i->iter = bch2_trans_copy_iter(trans, i->iter); + + i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; +-- +cgit v1.2.3 + + +From 393d8d1764679ecd7c265c7d49e644fe0533fd20 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Mar 2021 12:10:49 -0500 +Subject: bcachefs: Delete some dead code + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/quota.c | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 041da982d051..8e272519ce0e 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -746,7 +746,6 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, + struct qc_dqblk *qdq) + { + struct bch_fs *c = sb->s_fs_info; +- struct btree_trans trans; + struct bkey_i_quota new_quota; + int ret; + +@@ -756,14 +755,10 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, + bkey_quota_init(&new_quota.k_i); + new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); + +- bch2_trans_init(&trans, c, 0, 0); +- + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, + bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: + __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); + +- bch2_trans_exit(&trans); +- + return ret; + } + +-- +cgit v1.2.3 + + +From 73bb834c0f811a2b301d6b884057e512b7dd9ea3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 2 Mar 2021 22:45:28 -0500 +Subject: bcachefs: btree_iter_prev_slot() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 8 ++++++++ + fs/bcachefs/btree_iter.h | 1 + + 2 files changed, 9 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f048afa17256..d1b89110434d 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1904,6 +1904,14 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + return bch2_btree_iter_peek_slot(iter); + } + ++struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) ++{ ++ if (!bch2_btree_iter_rewind_pos(iter)) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ + struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) + { + struct bkey_cached *ck; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index e2469436f53b..3ae6c29c6dad 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -171,6 +171,7 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + + struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); + + struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); + +-- +cgit v1.2.3 + + +From f6bc5cfec4bf4e504a4117416a15e447b2a85815 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 4 Mar 2021 15:20:22 -0500 +Subject: bcachefs: Use bch2_bpos_to_text() more consistently + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 7 +++--- + fs/bcachefs/btree_cache.c | 15 ++++++------ + fs/bcachefs/btree_gc.c | 22 ++++++++--------- + fs/bcachefs/btree_io.c | 47 +++++++++++++------------------------ + 
fs/bcachefs/btree_iter.c | 45 ++++++++++++++++++----------------- + fs/bcachefs/btree_update_interior.c | 18 ++++++-------- + fs/bcachefs/rebalance.c | 8 +++---- + 7 files changed, 72 insertions(+), 90 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 756cbae6541d..87f951e14061 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -1729,9 +1729,10 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + uk = bkey_unpack_key(b, k); + pr_buf(out, + " failed unpacked at depth %u\n" +- "\t%llu:%llu\n", +- ilog2(j), +- uk.p.inode, uk.p.offset); ++ "\t", ++ ilog2(j)); ++ bch2_bpos_to_text(out, uk.p); ++ pr_buf(out, "\n"); + break; + } + } +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 30f601ccedc0..e765d8061b06 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -1063,15 +1063,14 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + + bch2_btree_keys_stats(b, &stats); + +- pr_buf(out, +- "l %u %llu:%llu - %llu:%llu:\n" +- " ptrs: ", +- b->c.level, +- b->data->min_key.inode, +- b->data->min_key.offset, +- b->data->max_key.inode, +- b->data->max_key.offset); ++ pr_buf(out, "l %u ", b->c.level); ++ bch2_bpos_to_text(out, b->data->min_key); ++ pr_buf(out, " - "); ++ bch2_bpos_to_text(out, b->data->max_key); ++ pr_buf(out, ":\n" ++ " ptrs: "); + bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ + pr_buf(out, "\n" + " format: u64s %u fields %u %u %u %u %u\n" + " unpack fn len: %u\n" +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index ea7ded70570e..840dab42b9d3 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -73,12 +73,13 @@ static int bch2_gc_check_topology(struct bch_fs *c, + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); + +- if (bkey_deleted(&prev->k->k)) +- scnprintf(buf1, sizeof(buf1), "start of node: %llu:%llu", +- node_start.inode, +- node_start.offset); +- else ++ if (bkey_deleted(&prev->k->k)) { ++ struct printbuf out = PBUF(buf1); ++ pr_buf(&out, "start of node: "); ++ bch2_bpos_to_text(&out, node_start); ++ } else { + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); ++ } + + if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" +@@ -554,6 +555,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, + : !btree_node_type_needs_gc(btree_id) ? 
1 + : 0; + u8 max_stale = 0; ++ char buf[100]; + int ret = 0; + + b = c->btree_roots[btree_id].b; +@@ -563,16 +565,14 @@ static int bch2_gc_btree_init(struct bch_fs *c, + + six_lock_read(&b->c.lock, NULL, NULL); + if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, +- "btree root with incorrect min_key: %llu:%llu", +- b->data->min_key.inode, +- b->data->min_key.offset)) { ++ "btree root with incorrect min_key: %s", ++ (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { + BUG(); + } + + if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, +- "btree root with incorrect min_key: %llu:%llu", +- b->data->max_key.inode, +- b->data->max_key.offset)) { ++ "btree root with incorrect max_key: %s", ++ (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { + BUG(); + } + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 0d71910d681c..4acaa14a80ff 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -488,12 +488,12 @@ enum btree_validate_ret { + ({ \ + __label__ out; \ + char _buf[300]; \ +- char *buf2 = _buf; \ ++ char *_buf2 = _buf; \ + struct printbuf out = PBUF(_buf); \ + \ +- buf2 = kmalloc(4096, GFP_ATOMIC); \ +- if (buf2) \ +- out = _PBUF(buf2, 4986); \ ++ _buf2 = kmalloc(4096, GFP_ATOMIC); \ ++ if (_buf2) \ ++ out = _PBUF(_buf2, 4986); \ + \ + btree_err_msg(&out, c, ca, b, i, b->written, write); \ + pr_buf(&out, ": " msg, ##__VA_ARGS__); \ +@@ -501,13 +501,13 @@ enum btree_validate_ret { + if (type == BTREE_ERR_FIXABLE && \ + write == READ && \ + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ +- mustfix_fsck_err(c, "%s", buf2); \ ++ mustfix_fsck_err(c, "%s", _buf2); \ + goto out; \ + } \ + \ + switch (write) { \ + case READ: \ +- bch_err(c, "%s", buf2); \ ++ bch_err(c, "%s", _buf2); \ + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ +@@ -528,7 +528,7 @@ enum btree_validate_ret { + } \ + break; \ + case WRITE: \ +- bch_err(c, "corrupt metadata before write: %s", buf2); \ ++ bch_err(c, "corrupt metadata before write: %s", _buf2); \ + \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ +@@ -537,8 +537,8 @@ enum btree_validate_ret { + break; \ + } \ + out: \ +- if (buf2 != _buf) \ +- kfree(buf2); \ ++ if (_buf2 != _buf) \ ++ kfree(_buf2); \ + true; \ + }) + +@@ -550,6 +550,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + { + unsigned version = le16_to_cpu(i->version); + const char *err; ++ char buf1[100]; ++ char buf2[100]; + int ret = 0; + + btree_err_on((version != BCH_BSET_VERSION_OLD && +@@ -613,37 +615,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + + btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, +- "incorrect min_key: got %llu:%llu should be %llu:%llu", +- b->data->min_key.inode, +- b->data->min_key.offset, +- bp->min_key.inode, +- bp->min_key.offset); ++ "incorrect min_key: got %s should be %s", ++ (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), ++ (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); + } + + btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, ca, b, i, +- "incorrect max key %llu:%llu", +- bn->max_key.inode, +- bn->max_key.offset); ++ "incorrect max key %s", ++ (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); + + if (write) + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + +- /* XXX: ideally we would be validating min_key too */ +-#if 0 +- /* +- * not correct anymore, due to btree node write error +- * handling +- * +- * need to 
add bn->seq to btree keys and verify +- * against that +- */ +- btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), +- bn->ptr), +- BTREE_ERR_FATAL, c, b, i, +- "incorrect backpointer"); +-#endif + err = bch2_bkey_format_validate(&bn->format); + btree_err_on(err, + BTREE_ERR_FATAL, c, ca, b, i, +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index d1b89110434d..577b983ef6bc 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -495,7 +495,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, + struct btree_node_iter tmp = l->iter; + bool locked = btree_node_locked(iter, level); + struct bkey_packed *p, *k; +- char buf1[100], buf2[100]; ++ char buf1[100], buf2[100], buf3[100]; + const char *msg; + + if (!bch2_debug_check_iterators) +@@ -552,26 +552,26 @@ unlock: + btree_node_unlock(iter, level); + return; + err: +- strcpy(buf1, "(none)"); + strcpy(buf2, "(none)"); ++ strcpy(buf3, "(none)"); ++ ++ bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); +- bch2_bkey_to_text(&PBUF(buf1), &uk); ++ bch2_bkey_to_text(&PBUF(buf2), &uk); + } + + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); +- bch2_bkey_to_text(&PBUF(buf2), &uk); ++ bch2_bkey_to_text(&PBUF(buf3), &uk); + } + + panic("iterator should be %s key at level %u:\n" +- "iter pos %llu:%llu\n" ++ "iter pos %s\n" + "prev key %s\n" + "cur key %s\n", +- msg, level, +- iter->real_pos.inode, iter->real_pos.offset, +- buf1, buf2); ++ msg, level, buf1, buf2, buf3); + } + + static void bch2_btree_iter_verify(struct btree_iter *iter) +@@ -876,22 +876,23 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) + if (!k || + bkey_deleted(k) || + bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { +- char buf[100]; ++ char buf1[100]; ++ char buf2[100]; ++ char buf3[100]; ++ char buf4[100]; + struct bkey uk = bkey_unpack_key(b, k); + + bch2_dump_btree_node(iter->trans->c, l->b); +- bch2_bkey_to_text(&PBUF(buf), &uk); ++ bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); ++ bch2_bkey_to_text(&PBUF(buf2), &uk); ++ bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); ++ bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); + panic("parent iter doesn't point to new node:\n" +- "iter pos %s %llu:%llu\n" ++ "iter pos %s %s\n" + "iter key %s\n" +- "new node %llu:%llu-%llu:%llu\n", +- bch2_btree_ids[iter->btree_id], +- iter->pos.inode, +- iter->pos.offset, +- buf, +- b->data->min_key.inode, +- b->data->min_key.offset, +- b->key.k.p.inode, b->key.k.p.offset); ++ "new node %s-%s\n", ++ bch2_btree_ids[iter->btree_id], buf1, ++ buf2, buf3, buf4); + } + + if (!parent_locked) +@@ -2011,12 +2012,12 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) + + struct btree_iter *iter; + struct btree_insert_entry *i; ++ char buf[100]; + + trans_for_each_iter(trans, iter) +- printk(KERN_ERR "iter: btree %s pos %llu:%llu%s%s%s %ps\n", ++ printk(KERN_ERR "iter: btree %s pos %s%s%s%s %ps\n", + bch2_btree_ids[iter->btree_id], +- iter->pos.inode, +- iter->pos.offset, ++ (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf), + btree_iter_live(trans, iter) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index d090509c0519..4c0e3d7c8ddf 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -35,6 +35,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + struct bkey_s_c k; + struct bkey_s_c_btree_ptr_v2 bp; + struct bkey unpacked; ++ char buf1[100], buf2[100]; + + BUG_ON(!b->c.level); + +@@ -51,24 +52,19 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + + if (bkey_cmp(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); +- panic("expected next min_key %llu:%llu got %llu:%llu\n", +- next_node.inode, +- next_node.offset, +- bp.v->min_key.inode, +- bp.v->min_key.offset); ++ panic("expected next min_key %s got %s\n", ++ (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), ++ (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2)); + } + + bch2_btree_node_iter_advance(&iter, b); + + if (bch2_btree_node_iter_end(&iter)) { +- + if (bkey_cmp(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); +- panic("expected end %llu:%llu got %llu:%llu\n", +- b->key.k.p.inode, +- b->key.k.p.offset, +- k.k->p.inode, +- k.k->p.offset); ++ panic("expected end %s got %s\n", ++ (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), ++ (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2)); + } + break; + } +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index aa9bbdbfa65e..a0dbf41d1d37 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -281,10 +281,10 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) + h1); + break; + case REBALANCE_RUNNING: +- pr_buf(out, "running\n"); +- pr_buf(out, "pos %llu:%llu\n", +- r->move_stats.pos.inode, +- r->move_stats.pos.offset); ++ pr_buf(out, "running\n" ++ "pos "); ++ bch2_bpos_to_text(out, r->move_stats.pos); ++ pr_buf(out, "\n"); + break; + } + } +-- +cgit v1.2.3 + + +From 654c89e11e303f851eda255b80651072bf40bd83 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 4 Mar 2021 16:26:19 -0500 +Subject: bcachefs: Fix bpos_diff() + +Previously, bpos_diff() did not handle borrows correctly. Minor thing +considering how it was used, but worth fixing. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.h | 31 +++++++++++++++++++++++++++++++ + fs/bcachefs/btree_iter.c | 8 -------- + 2 files changed, 31 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index a22a1dc6df78..629288a60926 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -175,6 +175,37 @@ static inline struct bpos bpos_max(struct bpos l, struct bpos r) + return bkey_cmp(l, r) > 0 ? 
l : r; + } + ++#define sbb(a, b, borrow) \ ++do { \ ++ typeof(a) d1, d2; \ ++ \ ++ d1 = a - borrow; \ ++ borrow = d1 > a; \ ++ \ ++ d2 = d1 - b; \ ++ borrow += d2 > d1; \ ++ a = d2; \ ++} while (0) ++ ++/* returns a - b: */ ++static inline struct bpos bpos_sub(struct bpos a, struct bpos b) ++{ ++ int borrow = 0; ++ ++ sbb(a.snapshot, b.snapshot, borrow); ++ sbb(a.offset, b.offset, borrow); ++ sbb(a.inode, b.inode, borrow); ++ return a; ++} ++ ++static inline struct bpos bpos_diff(struct bpos l, struct bpos r) ++{ ++ if (bkey_cmp(l, r) > 0) ++ swap(l, r); ++ ++ return bpos_sub(r, l); ++} ++ + void bch2_bpos_swab(struct bpos *); + void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 577b983ef6bc..f3f7a6fefb14 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2067,14 +2067,6 @@ static inline void btree_iter_copy(struct btree_iter *dst, + dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; + } + +-static inline struct bpos bpos_diff(struct bpos l, struct bpos r) +-{ +- if (bkey_cmp(l, r) > 0) +- swap(l, r); +- +- return POS(r.inode - l.inode, r.offset - l.offset); +-} +- + static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + unsigned btree_id, struct bpos pos, + unsigned flags) +-- +cgit v1.2.3 + + +From 2abf619d56fcd16b3dc6457ca92719d43e3e2547 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 4 Mar 2021 19:06:26 -0500 +Subject: bcachefs: Fix compat code for superblock + +The bkey compat code wasn't being run for btree roots in the superblock +clean section - this patch fixes it to use the journal entry validate +code. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 1 + + fs/bcachefs/journal_io.c | 108 +++++++++++++++++++++--------------------- + fs/bcachefs/journal_io.h | 3 ++ + fs/bcachefs/recovery.c | 8 ++-- + fs/bcachefs/super-io.c | 31 +++++++++--- + fs/bcachefs/super-io.h | 2 +- + 6 files changed, 88 insertions(+), 65 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 97e548994512..244c458d6ed3 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1310,6 +1310,7 @@ LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); + LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); + + LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); ++LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); + + /* 61-64 unused */ + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 37465bdbe4a9..54f2e2053bc0 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -202,22 +202,19 @@ static void journal_entry_null_range(void *start, void *end) + + #define FSCK_DELETED_KEY 5 + +-static int journal_validate_key(struct bch_fs *c, struct jset *jset, ++static int journal_validate_key(struct bch_fs *c, const char *where, + struct jset_entry *entry, + unsigned level, enum btree_id btree_id, +- struct bkey_i *k, +- const char *type, int write) ++ struct bkey_i *k, const char *type, ++ unsigned version, int big_endian, int write) + { + void *next = vstruct_next(entry); + const char *invalid; +- unsigned version = le32_to_cpu(jset->version); + int ret = 0; + + if (journal_entry_err_on(!k->k.u64s, c, +- "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0", +- type, le64_to_cpu(jset->seq), +- (u64 *) entry - jset->_data, +- le32_to_cpu(jset->u64s), ++ "invalid %s in %s 
entry offset %zi/%u: k->u64s 0", ++ type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s))) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); +@@ -227,10 +224,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), c, +- "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry", +- type, le64_to_cpu(jset->seq), +- (u64 *) entry - jset->_data, +- le32_to_cpu(jset->u64s), ++ "invalid %s in %s entry offset %zi/%u: extends past end of journal entry", ++ type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s))) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); +@@ -239,10 +234,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, +- "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u", +- type, le64_to_cpu(jset->seq), +- (u64 *) entry - jset->_data, +- le32_to_cpu(jset->u64s), ++ "invalid %s in %s entry offset %zi/%u: bad format %u", ++ type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), + k->k.format)) { +@@ -253,9 +246,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + } + + if (!write) +- bch2_bkey_compat(level, btree_id, version, +- JSET_BIG_ENDIAN(jset), write, +- NULL, bkey_to_packed(k)); ++ bch2_bkey_compat(level, btree_id, version, big_endian, ++ write, NULL, bkey_to_packed(k)); + + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id)); +@@ -263,10 +255,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); +- mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s", +- type, le64_to_cpu(jset->seq), +- (u64 *) entry - jset->_data, +- le32_to_cpu(jset->u64s), ++ mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", ++ type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), + invalid, buf); +@@ -278,25 +268,24 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, + } + + if (write) +- bch2_bkey_compat(level, btree_id, version, +- JSET_BIG_ENDIAN(jset), write, +- NULL, bkey_to_packed(k)); ++ bch2_bkey_compat(level, btree_id, version, big_endian, ++ write, NULL, bkey_to_packed(k)); + fsck_err: + return ret; + } + + static int journal_entry_validate_btree_keys(struct bch_fs *c, +- struct jset *jset, ++ const char *where, + struct jset_entry *entry, +- int write) ++ unsigned version, int big_endian, int write) + { + struct bkey_i *k = entry->start; + + while (k != vstruct_last(entry)) { +- int ret = journal_validate_key(c, jset, entry, ++ int ret = journal_validate_key(c, where, entry, + entry->level, + entry->btree_id, +- k, "key", write); ++ k, "key", version, big_endian, write); + if (ret == FSCK_DELETED_KEY) + continue; + +@@ -307,9 +296,9 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, + } + + static int journal_entry_validate_btree_root(struct bch_fs *c, +- struct jset *jset, ++ const char *where, + struct jset_entry *entry, +- int write) ++ unsigned version, int big_endian, int write) + { + struct bkey_i *k = entry->start; + int ret = 0; +@@ -328,25 +317,25 @@ static int journal_entry_validate_btree_root(struct bch_fs *c, + return 0; + } + +- return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, +- "btree root", write); ++ 
return journal_validate_key(c, where, entry, 1, entry->btree_id, k, ++ "btree root", version, big_endian, write); + fsck_err: + return ret; + } + + static int journal_entry_validate_prio_ptrs(struct bch_fs *c, +- struct jset *jset, ++ const char *where, + struct jset_entry *entry, +- int write) ++ unsigned version, int big_endian, int write) + { + /* obsolete, don't care: */ + return 0; + } + + static int journal_entry_validate_blacklist(struct bch_fs *c, +- struct jset *jset, ++ const char *where, + struct jset_entry *entry, +- int write) ++ unsigned version, int big_endian, int write) + { + int ret = 0; + +@@ -359,9 +348,9 @@ fsck_err: + } + + static int journal_entry_validate_blacklist_v2(struct bch_fs *c, +- struct jset *jset, ++ const char *where, + struct jset_entry *entry, +- int write) ++ unsigned version, int big_endian, int write) + { + struct jset_entry_blacklist_v2 *bl_entry; + int ret = 0; +@@ -385,9 +374,9 @@ fsck_err: + } + + static int journal_entry_validate_usage(struct bch_fs *c, +- struct jset *jset, ++ const char *where, + struct jset_entry *entry, +- int write) ++ unsigned version, int big_endian, int write) + { + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); +@@ -406,9 +395,9 @@ fsck_err: + } + + static int journal_entry_validate_data_usage(struct bch_fs *c, +- struct jset *jset, ++ const char *where, + struct jset_entry *entry, +- int write) ++ unsigned version, int big_endian, int write) + { + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); +@@ -428,9 +417,9 @@ fsck_err: + } + + static int journal_entry_validate_clock(struct bch_fs *c, +- struct jset *jset, ++ const char *where, + struct jset_entry *entry, +- int write) ++ unsigned version, int big_endian, int write) + { + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); +@@ -454,9 +443,9 @@ fsck_err: + } + + static int journal_entry_validate_dev_usage(struct bch_fs *c, +- struct jset *jset, ++ const char *where, + struct jset_entry *entry, +- int write) ++ unsigned version, int big_endian, int write) + { + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); +@@ -491,8 +480,8 @@ fsck_err: + } + + struct jset_entry_ops { +- int (*validate)(struct bch_fs *, struct jset *, +- struct jset_entry *, int); ++ int (*validate)(struct bch_fs *, const char *, ++ struct jset_entry *, unsigned, int, int); + }; + + static const struct jset_entry_ops bch2_jset_entry_ops[] = { +@@ -504,22 +493,29 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { + #undef x + }; + +-static int journal_entry_validate(struct bch_fs *c, struct jset *jset, +- struct jset_entry *entry, int write) ++int bch2_journal_entry_validate(struct bch_fs *c, const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) + { + return entry->type < BCH_JSET_ENTRY_NR +- ? bch2_jset_entry_ops[entry->type].validate(c, jset, +- entry, write) ++ ? 
bch2_jset_entry_ops[entry->type].validate(c, where, entry, ++ version, big_endian, write) + : 0; + } + + static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + int write) + { ++ char buf[100]; + struct jset_entry *entry; + int ret = 0; + + vstruct_for_each(jset, entry) { ++ scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u", ++ le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s)); ++ + if (journal_entry_err_on(vstruct_next(entry) > + vstruct_last(jset), c, + "journal entry extends past end of jset")) { +@@ -527,7 +523,9 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + break; + } + +- ret = journal_entry_validate(c, jset, entry, write); ++ ret = bch2_journal_entry_validate(c, buf, entry, ++ le32_to_cpu(jset->version), ++ JSET_BIG_ENDIAN(jset), write); + if (ret) + break; + } +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +index a4931ab93a68..f34281a28f12 100644 +--- a/fs/bcachefs/journal_io.h ++++ b/fs/bcachefs/journal_io.h +@@ -40,6 +40,9 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ + vstruct_for_each_safe(entry, k, _n) + ++int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *, ++ unsigned, int, int); ++ + int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); + + void bch2_journal_write(struct closure *); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index b68fcd1d19e4..11d4894b3d63 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -908,9 +908,11 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) + return ERR_PTR(-ENOMEM); + } + +- if (le16_to_cpu(c->disk_sb.sb->version) < +- bcachefs_metadata_version_bkey_renumber) +- bch2_sb_clean_renumber(clean, READ); ++ ret = bch2_sb_clean_validate(c, clean, READ); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++ } + + mutex_unlock(&c->sb_lock); + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 66a8bcb4f0b9..761695c4afa1 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -9,6 +9,7 @@ + #include "error.h" + #include "io.h" + #include "journal.h" ++#include "journal_io.h" + #include "journal_seq_blacklist.h" + #include "replicas.h" + #include "quota.h" +@@ -712,6 +713,8 @@ int bch2_write_super(struct bch_fs *c) + if (test_bit(BCH_FS_ERROR, &c->flags)) + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); + ++ SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); ++ + for_each_online_member(ca, c, i) + bch2_sb_from_fs(c, ca); + +@@ -935,14 +938,23 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { + + /* BCH_SB_FIELD_clean: */ + +-void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) ++int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) + { + struct jset_entry *entry; ++ int ret; + + for (entry = clean->start; + entry < (struct jset_entry *) vstruct_end(&clean->field); +- entry = vstruct_next(entry)) +- bch2_bkey_renumber(BKEY_TYPE_btree, bkey_to_packed(entry->start), write); ++ entry = vstruct_next(entry)) { ++ ret = bch2_journal_entry_validate(c, "superblock", entry, ++ le16_to_cpu(c->disk_sb.sb->version), ++ BCH_SB_BIG_ENDIAN(c->disk_sb.sb), ++ write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; + } + + int bch2_fs_mark_dirty(struct bch_fs *c) +@@ -1075,6 +1087,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) + 
struct bch_sb_field_clean *sb_clean; + struct jset_entry *entry; + unsigned u64s; ++ int ret; + + mutex_lock(&c->sb_lock); + if (BCH_SB_CLEAN(c->disk_sb.sb)) +@@ -1109,9 +1122,15 @@ void bch2_fs_mark_clean(struct bch_fs *c) + memset(entry, 0, + vstruct_end(&sb_clean->field) - (void *) entry); + +- if (le16_to_cpu(c->disk_sb.sb->version) < +- bcachefs_metadata_version_bkey_renumber) +- bch2_sb_clean_renumber(sb_clean, WRITE); ++ /* ++ * this should be in the write path, and we should be validating every ++ * superblock section: ++ */ ++ ret = bch2_sb_clean_validate(c, sb_clean, WRITE); ++ if (ret) { ++ bch_err(c, "error writing marking filesystem clean: validate error"); ++ goto out; ++ } + + bch2_write_super(c); + out: +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +index 1a35124f5f47..b64ac2fbbf8b 100644 +--- a/fs/bcachefs/super-io.h ++++ b/fs/bcachefs/super-io.h +@@ -125,7 +125,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) + void bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry **, u64); + +-void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); ++int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int); + + int bch2_fs_mark_dirty(struct bch_fs *); + void bch2_fs_mark_clean(struct bch_fs *); +-- +cgit v1.2.3 + + +From f94d4724f638766c2fd654b4d8163f560c374f49 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 4 Mar 2021 22:11:28 -0500 +Subject: bcachefs: Simplify for_each_btree_key() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.h | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 3ae6c29c6dad..0ac8337eba98 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -243,11 +243,9 @@ static inline int bkey_err(struct bkey_s_c k) + _start, _flags, _k, _ret) \ + for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \ + (_start), (_flags)), \ +- (_ret) = PTR_ERR_OR_ZERO(((_k) = \ +- __bch2_btree_iter_peek(_iter, _flags)).k); \ +- !_ret && (_k).k; \ +- (_ret) = PTR_ERR_OR_ZERO(((_k) = \ +- __bch2_btree_iter_next(_iter, _flags)).k)) ++ (_k) = __bch2_btree_iter_peek(_iter, _flags); \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ (_k) = __bch2_btree_iter_next(_iter, _flags)) + + #define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ + for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ +-- +cgit v1.2.3 + + +From 1277ce1fcd2760246f123602ba2af768cb6a1b5d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 4 Mar 2021 22:40:41 -0500 +Subject: bcachefs: Simplify bch2_btree_iter_peek_prev() + +Since we added iter->real_pos, btree_iter_set_pos_to_(next|prev)_leaf no +longer modify iter->pos, so we don't have to save it at the start +anymore. 
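+
+Condensed from the hunk below, the change amounts to dropping the saved copy
+of the start position and comparing against iter->pos directly:
+
+    /* before: */
+    struct bpos pos = iter->pos;
+    ...
+    if (bkey_cmp(bkey_start_pos(k.k), pos) > 0)
+
+    /* after, using the iterator position directly: */
+    if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)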
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f3f7a6fefb14..95913a25e121 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1739,7 +1739,6 @@ struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + { +- struct bpos pos = iter->pos; + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + int ret; +@@ -1764,8 +1763,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + k = __btree_iter_peek(iter, l); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) +- ? bkey_cmp(bkey_start_pos(k.k), pos) >= 0 +- : bkey_cmp(bkey_start_pos(k.k), pos) > 0)) ++ ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 ++ : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)) + k = __btree_iter_prev(iter, l); + + if (likely(k.k)) +@@ -1777,10 +1776,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + } + } + +- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); ++ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); + + /* Extents can straddle iter->pos: */ +- if (bkey_cmp(k.k->p, pos) < 0) ++ if (bkey_cmp(k.k->p, iter->pos) < 0) + iter->pos = k.k->p; + iter->real_pos = k.k->p; + iter->uptodate = BTREE_ITER_UPTODATE; +@@ -1794,8 +1793,6 @@ no_key: + * then we errored going to the previous leaf - make sure it's + * consistent with iter->pos: + */ +- BUG_ON(bkey_cmp(pos, iter->pos) && +- bkey_cmp(iter->pos, POS_MIN)); + bkey_init(&iter->k); + iter->k.p = iter->pos; + goto out; +-- +cgit v1.2.3 + + +From f80e662518017849a5f7a93b076ae546afee6439 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Feb 2021 20:44:55 -0500 +Subject: bcachefs: __bch2_trans_get_iter() refactoring, BTREE_ITER_NOT_EXTENTS + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 70 +++++++++++++++-------------------------- + fs/bcachefs/btree_iter.h | 1 - + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update_leaf.c | 16 +++++----- + fs/bcachefs/fsck.c | 3 +- + fs/bcachefs/recovery.c | 2 +- + 6 files changed, 38 insertions(+), 55 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 95913a25e121..cf3901c22e87 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1494,24 +1494,14 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p + bch2_btree_iter_verify(iter); + } + +-void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, +- bool strictly_greater) ++void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + { + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; + +- iter->flags &= ~BTREE_ITER_IS_EXTENTS; +- iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; +- + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + } + +-void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +-{ +- __bch2_btree_iter_set_pos(iter, new_pos, +- (iter->flags & BTREE_ITER_IS_EXTENTS) != 0); +-} +- + static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) + { + struct bpos pos = iter->k.p; +@@ -1932,27 +1922,17 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) + } + + static inline void bch2_btree_iter_init(struct btree_trans *trans, +- struct btree_iter *iter, enum btree_id btree_id, +- struct bpos pos, unsigned flags) ++ struct btree_iter *iter, enum btree_id btree_id) + { + struct bch_fs *c = trans->c; + unsigned i; + +- if (btree_node_type_is_extents(btree_id) && +- !(flags & BTREE_ITER_NODES)) +- flags |= BTREE_ITER_IS_EXTENTS; +- + iter->trans = trans; +- iter->pos = pos; +- bkey_init(&iter->k); +- iter->k.p = pos; +- iter->flags = flags; +- iter->real_pos = btree_iter_search_key(iter); + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + iter->btree_id = btree_id; + iter->level = 0; + iter->min_depth = 0; +- iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0; ++ iter->locks_want = 0; + iter->nodes_locked = 0; + iter->nodes_intent_locked = 0; + for (i = 0; i < ARRAY_SIZE(iter->l); i++) +@@ -2064,12 +2044,16 @@ static inline void btree_iter_copy(struct btree_iter *dst, + dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; + } + +-static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, +- unsigned btree_id, struct bpos pos, +- unsigned flags) ++struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, ++ unsigned btree_id, struct bpos pos, ++ unsigned flags) + { + struct btree_iter *iter, *best = NULL; + ++ /* We always want a fresh iterator for node iterators: */ ++ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES) ++ goto alloc_iter; ++ + trans_for_each_iter(trans, iter) { + if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) + continue; +@@ -2084,10 +2068,10 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + + best = iter; + } +- ++alloc_iter: + if (!best) { + iter = btree_trans_iter_alloc(trans); +- bch2_btree_iter_init(trans, iter, btree_id, pos, flags); ++ bch2_btree_iter_init(trans, iter, btree_id); + } else if (btree_iter_keep(trans, best)) { + iter = btree_trans_iter_alloc(trans); + btree_iter_copy(iter, best); +@@ -2095,7 +2079,14 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + iter = best; + } + +- flags |= iter->flags & BTREE_ITER_ERROR; ++ trans->iters_live |= 1ULL << iter->idx; ++ trans->iters_touched |= 1ULL << iter->idx; ++ ++ if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && ++ btree_node_type_is_extents(btree_id) && ++ !(flags & BTREE_ITER_NOT_EXTENTS)) ++ flags |= BTREE_ITER_IS_EXTENTS; ++ + iter->flags = flags; + + if (!(iter->flags & BTREE_ITER_INTENT)) +@@ -2103,21 +2094,8 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + else if (!iter->locks_want) + __bch2_btree_iter_upgrade_nounlock(iter, 1); + +- trans->iters_live |= 1ULL << iter->idx; +- trans->iters_touched |= 1ULL << iter->idx; +- +- return iter; +-} +- +-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, +- enum btree_id btree_id, +- struct bpos pos, unsigned flags) +-{ +- struct btree_iter *iter = +- __btree_trans_get_iter(trans, btree_id, pos, flags); ++ bch2_btree_iter_set_pos(iter, pos); + +- __bch2_btree_iter_set_pos(iter, pos, +- 
btree_node_type_is_extents(btree_id)); + return iter; + } + +@@ -2129,8 +2107,10 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + unsigned flags) + { + struct btree_iter *iter = +- __btree_trans_get_iter(trans, btree_id, pos, +- flags|BTREE_ITER_NODES); ++ __bch2_trans_get_iter(trans, btree_id, pos, ++ BTREE_ITER_NODES| ++ BTREE_ITER_NOT_EXTENTS| ++ flags); + unsigned i; + + BUG_ON(bkey_cmp(iter->pos, pos)); +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 0ac8337eba98..bd0c429bd91a 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -175,7 +175,6 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); + + struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); + +-void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); + void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + + /* Sort order for locking btree iterators: */ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 48c82050e145..96c4cd4ba1ea 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -215,6 +215,7 @@ enum btree_iter_type { + #define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) + #define BTREE_ITER_CACHED_NOFILL (1 << 9) + #define BTREE_ITER_CACHED_NOCREATE (1 << 10) ++#define BTREE_ITER_NOT_EXTENTS (1 << 11) + + enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f6bb55388ba3..d7937bdf804b 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -740,10 +740,9 @@ static int extent_update_to_keys(struct btree_trans *trans, + if (ret) + return ret; + +- n.iter = bch2_trans_copy_iter(trans, n.iter); +- +- n.iter->flags |= BTREE_ITER_INTENT; +- __bch2_btree_iter_set_pos(n.iter, n.k->k.p, false); ++ n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_NOT_EXTENTS); + n.is_extent = false; + + ret = __bch2_trans_update2(trans, n); +@@ -777,7 +776,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bkey_reassemble(update, k); + bch2_cut_back(start, update); + +- __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; ++ bch2_btree_iter_set_pos(update_iter, update->k.p); + ret = bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + if (ret) +@@ -794,7 +794,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bkey_reassemble(update, k); + bch2_cut_front(end, update); + +- __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; ++ bch2_btree_iter_set_pos(update_iter, update->k.p); + ret = bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + if (ret) +@@ -811,7 +812,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, + update->k.type = KEY_TYPE_deleted; + update->k.size = 0; + +- __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; ++ bch2_btree_iter_set_pos(update_iter, update->k.p); + ret = bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + if (ret) +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index ebc234b0b6fe..7f6b4ac48f3d 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -442,7 +442,8 @@ static int bch2_fix_overlapping_extent(struct btree_trans 
*trans, + * We don't want to go through the + * extent_handle_overwrites path: + */ +- __bch2_btree_iter_set_pos(u_iter, u->k.p, false); ++ u_iter->flags &= ~BTREE_ITER_IS_EXTENTS; ++ bch2_btree_iter_set_pos(u_iter, u->k.p); + + /* + * XXX: this is going to leave disk space +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 11d4894b3d63..03a25dd5acc6 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -529,7 +529,7 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + * want that here, journal replay is supposed to treat extents like + * regular keys: + */ +- __bch2_btree_iter_set_pos(iter, k->k.p, false); ++ BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); +-- +cgit v1.2.3 + + +From e7ae1bfb9f342f8ef0756b120f53f2f2c91d03ea Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 8 Mar 2021 17:09:13 -0500 +Subject: bcachefs: Fix locking in bch2_btree_iter_traverse_cached() + +bch2_btree_iter_traverse() is supposed to ensure we have the correct +type of lock - it was downgrading if necessary, but if we entered with a +read lock it wasn't upgrading to an intent lock, oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 4357aefdb668..2230da8b3acd 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -297,7 +297,14 @@ fill: + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + + iter->uptodate = BTREE_ITER_NEED_PEEK; +- bch2_btree_iter_downgrade(iter); ++ ++ if (!(iter->flags & BTREE_ITER_INTENT)) ++ bch2_btree_iter_downgrade(iter); ++ else if (!iter->locks_want) { ++ if (!__bch2_btree_iter_upgrade(iter, 1)) ++ ret = -EINTR; ++ } ++ + return ret; + err: + if (ret != -EINTR) { +-- +cgit v1.2.3 + + +From cf432a5b46466e48838576e92f51fd08350c03c1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 12 Mar 2021 16:55:28 -0500 +Subject: bcachefs: Have fsck check for stripe pointers matching stripe + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 21 ++++++++++++++++++++- + fs/bcachefs/buckets.c | 2 ++ + fs/bcachefs/ec.c | 3 ++- + fs/bcachefs/ec.h | 37 ++++++++++++++++++++++++++----------- + fs/bcachefs/ec_types.h | 1 + + 5 files changed, 51 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 840dab42b9d3..2b5dfdbb602e 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -223,6 +223,11 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx)) + do_update = true; ++ ++ if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, ++ "pointer does not match stripe %llu", ++ (u64) p.ec.idx)) ++ do_update = true; + } + } + +@@ -274,8 +279,22 @@ again: + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct stripe *m = genradix_ptr(&c->stripes[true], + entry->stripe_ptr.idx); ++ union bch_extent_entry *next_ptr; ++ ++ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) ++ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) ++ goto found; ++ next_ptr = NULL; ++found: ++ if (!next_ptr) { ++ bch_err(c, "aieee, found stripe ptr with no data ptr"); ++ continue; ++ } + +- if (!m || !m->alive) { ++ if (!m || !m->alive || ++ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], ++ 
&next_ptr->ptr, ++ m->sectors)) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 5fa0a28a6cf1..5b92e9fc3ea6 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1212,6 +1212,8 @@ static int bch2_mark_stripe(struct bch_fs *c, + m->block_sectors[i] = + stripe_blockcount_get(new_s, i); + m->blocks_nonempty += !!m->block_sectors[i]; ++ ++ m->ptrs[i] = new_s->ptrs[i]; + } + + bch2_bkey_to_replicas(&m->r.e, new); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 5f80881c2496..600d324d4725 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -151,7 +151,8 @@ static int bkey_matches_stripe(struct bch_stripe *s, + + bkey_for_each_ptr(ptrs, ptr) + for (i = 0; i < nr_data; i++) +- if (__bch2_ptr_matches_stripe(s, ptr, i)) ++ if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, ++ le16_to_cpu(s->sectors))) + return i; + + return -1; +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 765baa9d9264..744e51eaf327 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -84,27 +84,42 @@ static inline void stripe_csum_set(struct bch_stripe *s, + memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); + } + +-static inline bool __bch2_ptr_matches_stripe(const struct bch_stripe *s, +- const struct bch_extent_ptr *ptr, +- unsigned block) ++static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, ++ const struct bch_extent_ptr *data_ptr, ++ unsigned sectors) ++{ ++ return data_ptr->dev == stripe_ptr->dev && ++ data_ptr->gen == stripe_ptr->gen && ++ data_ptr->offset >= stripe_ptr->offset && ++ data_ptr->offset < stripe_ptr->offset + sectors; ++} ++ ++static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, ++ struct extent_ptr_decoded p) + { + unsigned nr_data = s->nr_blocks - s->nr_redundant; + +- if (block >= nr_data) ++ BUG_ON(!p.has_ec); ++ ++ if (p.ec.block >= nr_data) + return false; + +- return ptr->dev == s->ptrs[block].dev && +- ptr->gen == s->ptrs[block].gen && +- ptr->offset >= s->ptrs[block].offset && +- ptr->offset < s->ptrs[block].offset + le16_to_cpu(s->sectors); ++ return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, ++ le16_to_cpu(s->sectors)); + } + +-static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, +- struct extent_ptr_decoded p) ++static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m, ++ struct extent_ptr_decoded p) + { ++ unsigned nr_data = m->nr_blocks - m->nr_redundant; ++ + BUG_ON(!p.has_ec); + +- return __bch2_ptr_matches_stripe(s, &p.ptr, p.ec.block); ++ if (p.ec.block >= nr_data) ++ return false; ++ ++ return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, ++ m->sectors); + } + + struct bch_read_bio; +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +index 847770166223..3fc31222459a 100644 +--- a/fs/bcachefs/ec_types.h ++++ b/fs/bcachefs/ec_types.h +@@ -22,6 +22,7 @@ struct stripe { + unsigned on_heap:1; + u8 blocks_nonempty; + u16 block_sectors[BCH_BKEY_PTRS_MAX]; ++ struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; + + struct bch_replicas_padded r; + }; +-- +cgit v1.2.3 + + +From 6a61bee66d3730f36ac38176d9007dbf499e37b3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 12 Mar 2021 17:52:42 -0500 +Subject: bcachefs: Use __bch2_trans_do() in a few more places + +Minor cleanup, it was being open coded. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 52 +++++++++++++++++++--------------------------------- + 1 file changed, 19 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index ff000cc7ba6e..7d1f00138df1 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -416,16 +416,12 @@ static int __bch2_link(struct bch_fs *c, + mutex_lock(&inode->ei_update_lock); + bch2_trans_init(&trans, c, 4, 1024); + +- do { +- bch2_trans_begin(&trans); +- ret = bch2_link_trans(&trans, ++ ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK, ++ bch2_link_trans(&trans, + dir->v.i_ino, + inode->v.i_ino, &dir_u, &inode_u, +- &dentry->d_name) ?: +- bch2_trans_commit(&trans, NULL, +- &inode->ei_journal_seq, +- BTREE_INSERT_NOUNLOCK); +- } while (ret == -EINTR); ++ &dentry->d_name)); + + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); +@@ -472,17 +468,12 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) + bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); + bch2_trans_init(&trans, c, 4, 1024); + +- do { +- bch2_trans_begin(&trans); +- +- ret = bch2_unlink_trans(&trans, ++ ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL, ++ bch2_unlink_trans(&trans, + dir->v.i_ino, &dir_u, +- &inode_u, &dentry->d_name) ?: +- bch2_trans_commit(&trans, NULL, +- &dir->ei_journal_seq, +- BTREE_INSERT_NOUNLOCK| +- BTREE_INSERT_NOFAIL); +- } while (ret == -EINTR); ++ &inode_u, &dentry->d_name)); + + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); +@@ -599,21 +590,16 @@ static int bch2_rename2(struct user_namespace *mnt_userns, + goto err; + } + +-retry: +- bch2_trans_begin(&trans); +- ret = bch2_rename_trans(&trans, +- src_dir->v.i_ino, &src_dir_u, +- dst_dir->v.i_ino, &dst_dir_u, +- &src_inode_u, +- &dst_inode_u, +- &src_dentry->d_name, +- &dst_dentry->d_name, +- mode) ?: +- bch2_trans_commit(&trans, NULL, +- &journal_seq, +- BTREE_INSERT_NOUNLOCK); +- if (ret == -EINTR) +- goto retry; ++ ret = __bch2_trans_do(&trans, NULL, &journal_seq, ++ BTREE_INSERT_NOUNLOCK, ++ bch2_rename_trans(&trans, ++ src_dir->v.i_ino, &src_dir_u, ++ dst_dir->v.i_ino, &dst_dir_u, ++ &src_inode_u, ++ &dst_inode_u, ++ &src_dentry->d_name, ++ &dst_dentry->d_name, ++ mode)); + if (unlikely(ret)) + goto err; + +-- +cgit v1.2.3 + + +From 6a1ca007a925855af46e525aab265e51a2237080 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 2 Mar 2021 18:35:30 -0500 +Subject: bcachefs: Kill ei_str_hash + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 15 +++++++++------ + fs/bcachefs/acl.h | 4 ++-- + fs/bcachefs/fs-ioctl.c | 4 ++-- + fs/bcachefs/fs.c | 7 +++---- + fs/bcachefs/fs.h | 2 -- + fs/bcachefs/xattr.c | 19 ++++++++++--------- + 6 files changed, 26 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index 250e9304666e..f111898f6c4f 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -216,6 +216,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) + { + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c_xattr xattr; +@@ -229,7 +230,7 @@ retry: + bch2_trans_begin(&trans); + + iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, +- &inode->ei_str_hash, inode->v.i_ino, ++ &hash, inode->v.i_ino, + 
&X_SEARCH(acl_to_xattr_type(type), "", 0), + 0); + if (IS_ERR(iter)) { +@@ -291,6 +292,7 @@ int bch2_set_acl(struct user_namespace *mnt_userns, + struct btree_trans trans; + struct btree_iter *inode_iter; + struct bch_inode_unpacked inode_u; ++ struct bch_hash_info hash_info; + struct posix_acl *acl; + umode_t mode; + int ret; +@@ -315,9 +317,9 @@ retry: + goto err; + } + +- ret = bch2_set_acl_trans(&trans, &inode_u, +- &inode->ei_str_hash, +- acl, type); ++ hash_info = bch2_hash_info_init(c, &inode_u); ++ ++ ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type); + if (ret) + goto btree_err; + +@@ -346,10 +348,11 @@ err: + } + + int bch2_acl_chmod(struct btree_trans *trans, +- struct bch_inode_info *inode, ++ struct bch_inode_unpacked *inode, + umode_t mode, + struct posix_acl **new_acl) + { ++ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); + struct btree_iter *iter; + struct bkey_s_c_xattr xattr; + struct bkey_i_xattr *new; +@@ -357,7 +360,7 @@ int bch2_acl_chmod(struct btree_trans *trans, + int ret = 0; + + iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, +- &inode->ei_str_hash, inode->v.i_ino, ++ &hash_info, inode->bi_inum, + &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), + BTREE_ITER_INTENT); + if (IS_ERR(iter)) +diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h +index c008d58f2126..f7c758369faf 100644 +--- a/fs/bcachefs/acl.h ++++ b/fs/bcachefs/acl.h +@@ -33,7 +33,7 @@ int bch2_set_acl_trans(struct btree_trans *, + const struct bch_hash_info *, + struct posix_acl *, int); + int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); +-int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, ++int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, + umode_t, struct posix_acl **); + + #else +@@ -47,7 +47,7 @@ static inline int bch2_set_acl_trans(struct btree_trans *trans, + } + + static inline int bch2_acl_chmod(struct btree_trans *trans, +- struct bch_inode_info *inode, ++ struct bch_inode_unpacked *inode, + umode_t mode, + struct posix_acl **new_acl) + { +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +index 9988fe2e8c45..ef2ab3e7dfa5 100644 +--- a/fs/bcachefs/fs-ioctl.c ++++ b/fs/bcachefs/fs-ioctl.c +@@ -183,6 +183,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + struct bch_inode_info *src, + const char __user *name) + { ++ struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); + struct bch_inode_info *dst; + struct inode *vinode = NULL; + char *kname = NULL; +@@ -202,8 +203,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + qstr.name = kname; + + ret = -ENOENT; +- inum = bch2_dirent_lookup(c, src->v.i_ino, +- &src->ei_str_hash, ++ inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, + &qstr); + if (!inum) + goto err1; +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 7d1f00138df1..d73bebbc28f7 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -370,11 +370,11 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, + { + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); + struct inode *vinode = NULL; + u64 inum; + +- inum = bch2_dirent_lookup(c, dir->v.i_ino, +- &dir->ei_str_hash, ++ inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, + &dentry->d_name); + + if (inum) +@@ -723,7 +723,7 @@ retry: + bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); + + if (attr->ia_valid & ATTR_MODE) { +- ret = 
bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); ++ ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); + if (ret) + goto btree_err; + } +@@ -1152,7 +1152,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c, + inode->ei_flags = 0; + inode->ei_journal_seq = 0; + inode->ei_quota_reserved = 0; +- inode->ei_str_hash = bch2_hash_info_init(c, bi); + inode->ei_qid = bch_qid(bi); + + inode->v.i_mapping->a_ops = &bch_address_space_operations; +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index 3df85ffb450c..2d82ed7dd740 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -45,8 +45,6 @@ struct bch_inode_info { + struct mutex ei_quota_lock; + struct bch_qid ei_qid; + +- struct bch_hash_info ei_str_hash; +- + /* copy of inode in btree: */ + struct bch_inode_unpacked ei_inode; + }; +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 91bfc340957b..92c6a071320d 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -121,6 +121,7 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, + int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, + const char *name, void *buffer, size_t size, int type) + { ++ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c_xattr xattr; +@@ -128,8 +129,8 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, +- &inode->ei_str_hash, inode->v.i_ino, ++ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash, ++ inode->v.i_ino, + &X_SEARCH(type, name, strlen(name)), + 0); + if (IS_ERR(iter)) { +@@ -239,7 +240,7 @@ static int bch2_xattr_emit(struct dentry *dentry, + } + + static int bch2_xattr_list_bcachefs(struct bch_fs *c, +- struct bch_inode_info *inode, ++ struct bch_inode_unpacked *inode, + struct xattr_buf *buf, + bool all) + { +@@ -249,12 +250,12 @@ static int bch2_xattr_list_bcachefs(struct bch_fs *c, + u64 v; + + for (id = 0; id < Inode_opt_nr; id++) { +- v = bch2_inode_opt_get(&inode->ei_inode, id); ++ v = bch2_inode_opt_get(inode, id); + if (!v) + continue; + + if (!all && +- !(inode->ei_inode.bi_fields_set & (1 << id))) ++ !(inode->bi_fields_set & (1 << id))) + continue; + + ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], +@@ -298,11 +299,11 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) + if (ret) + return ret; + +- ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); ++ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); + if (ret) + return ret; + +- ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); ++ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); + if (ret) + return ret; + +@@ -327,10 +328,10 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, + { + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + + return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, +- bch2_xattr_set(&trans, inode->v.i_ino, +- &inode->ei_str_hash, ++ bch2_xattr_set(&trans, inode->v.i_ino, &hash, + name, value, size, + handler->flags, flags)); + } +-- +cgit v1.2.3 + + +From 63d819fe95371199d34514cc31050f9d1f419910 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 12 Mar 2021 20:29:28 -0500 +Subject: bcachefs: Consolidate bch2_read_retry and bch2_read() + +Signed-off-by: Kent Overstreet 
+--- + fs/bcachefs/io.c | 124 +++++++++++++------------------------------------------ + fs/bcachefs/io.h | 19 ++++++++- + 2 files changed, 46 insertions(+), 97 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index ca16ea473d80..5d9935d5b5c0 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1671,82 +1671,6 @@ err: + goto out; + } + +-static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, +- struct bvec_iter bvec_iter, u64 inode, +- struct bch_io_failures *failed, unsigned flags) +-{ +- struct btree_trans trans; +- struct btree_iter *iter; +- struct bkey_buf sk; +- struct bkey_s_c k; +- int ret; +- +- flags &= ~BCH_READ_LAST_FRAGMENT; +- flags |= BCH_READ_MUST_CLONE; +- +- bch2_bkey_buf_init(&sk); +- bch2_trans_init(&trans, c, 0, 0); +-retry: +- bch2_trans_begin(&trans); +- +- for_each_btree_key(&trans, iter, BTREE_ID_extents, +- POS(inode, bvec_iter.bi_sector), +- BTREE_ITER_SLOTS, k, ret) { +- unsigned bytes, sectors, offset_into_extent; +- +- bch2_bkey_buf_reassemble(&sk, c, k); +- +- offset_into_extent = iter->pos.offset - +- bkey_start_offset(k.k); +- sectors = k.k->size - offset_into_extent; +- +- ret = bch2_read_indirect_extent(&trans, +- &offset_into_extent, &sk); +- if (ret) +- break; +- +- k = bkey_i_to_s_c(sk.k); +- +- sectors = min(sectors, k.k->size - offset_into_extent); +- +- bch2_trans_unlock(&trans); +- +- bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; +- swap(bvec_iter.bi_size, bytes); +- +- ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, +- offset_into_extent, failed, flags); +- switch (ret) { +- case READ_RETRY: +- goto retry; +- case READ_ERR: +- goto err; +- }; +- +- if (bytes == bvec_iter.bi_size) +- goto out; +- +- swap(bvec_iter.bi_size, bytes); +- bio_advance_iter(&rbio->bio, &bvec_iter, bytes); +- } +- +- if (ret == -EINTR) +- goto retry; +- /* +- * If we get here, it better have been because there was an error +- * reading a btree node +- */ +- BUG_ON(!ret); +- bch_err_inum_ratelimited(c, inode, +- "read error %i from btree lookup", ret); +-err: +- rbio->bio.bi_status = BLK_STS_IOERR; +-out: +- bch2_trans_exit(&trans); +- bch2_bkey_buf_exit(&sk, c); +- bch2_rbio_done(rbio); +-} +- + static void bch2_rbio_retry(struct work_struct *work) + { + struct bch_read_bio *rbio = +@@ -1769,10 +1693,14 @@ static void bch2_rbio_retry(struct work_struct *work) + flags |= BCH_READ_IN_RETRY; + flags &= ~BCH_READ_MAY_PROMOTE; + +- if (flags & BCH_READ_NODECODE) ++ if (flags & BCH_READ_NODECODE) { + bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); +- else +- bch2_read_retry(c, rbio, iter, inode, &failed, flags); ++ } else { ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ __bch2_read(c, rbio, iter, inode, &failed, flags); ++ } + } + + static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, +@@ -2270,6 +2198,9 @@ out: + ret = READ_RETRY; + } + ++ if (!ret) ++ goto out_read_done; ++ + return ret; + } + +@@ -2296,23 +2227,17 @@ out_read_done: + return 0; + } + +-void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) ++void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, u64 inode, ++ struct bch_io_failures *failed, unsigned flags) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_buf sk; + struct bkey_s_c k; +- unsigned flags = BCH_READ_RETRY_IF_STALE| +- BCH_READ_MAY_PROMOTE| +- BCH_READ_USER_MAPPED; + int ret; + +- BUG_ON(rbio->_state); + BUG_ON(flags & BCH_READ_NODECODE); +- BUG_ON(flags & 
BCH_READ_IN_RETRY); +- +- rbio->c = c; +- rbio->start_time = local_clock(); + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); +@@ -2320,13 +2245,13 @@ retry: + bch2_trans_begin(&trans); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, +- POS(inode, rbio->bio.bi_iter.bi_sector), ++ POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + + bch2_btree_iter_set_pos(iter, +- POS(inode, rbio->bio.bi_iter.bi_sector)); ++ POS(inode, bvec_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); +@@ -2358,19 +2283,26 @@ retry: + */ + bch2_trans_unlock(&trans); + +- bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; +- swap(rbio->bio.bi_iter.bi_size, bytes); ++ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; ++ swap(bvec_iter.bi_size, bytes); + +- if (rbio->bio.bi_iter.bi_size == bytes) ++ if (bvec_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + +- bch2_read_extent(&trans, rbio, k, offset_into_extent, flags); ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, ++ offset_into_extent, failed, flags); ++ switch (ret) { ++ case READ_RETRY: ++ goto retry; ++ case READ_ERR: ++ goto err; ++ }; + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + +- swap(rbio->bio.bi_iter.bi_size, bytes); +- bio_advance(&rbio->bio, bytes); ++ swap(bvec_iter.bi_size, bytes); ++ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + } + out: + bch2_trans_exit(&trans); +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 04f6baa1daf7..65b9b62bc07f 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -152,7 +152,24 @@ static inline void bch2_read_extent(struct btree_trans *trans, + offset_into_extent, NULL, flags); + } + +-void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); ++void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, ++ u64, struct bch_io_failures *, unsigned flags); ++ ++static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, ++ u64 inode) ++{ ++ struct bch_io_failures failed = { .nr = 0 }; ++ ++ BUG_ON(rbio->_state); ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ ++ __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed, ++ BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE| ++ BCH_READ_USER_MAPPED); ++} + + static inline struct bch_read_bio *rbio_init(struct bio *bio, + struct bch_io_opts opts) +-- +cgit v1.2.3 + + +From 15e3fa3c49e36e48fd3a4d0e6e31dac7a2a3b2d1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 14 Mar 2021 21:30:08 -0400 +Subject: bcachefs: Fix read retry path for indirect extents + +In the read path, for retry of indirect extents to work we need to +differentiate between the location in the btree the read was for, vs. +the location where we found the data. This patch adds that plumbing to +bch_read_bio. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 6 ++++-- + fs/bcachefs/fs.c | 4 +++- + fs/bcachefs/io.c | 46 +++++++++++++++++++++++++--------------------- + fs/bcachefs/io.h | 23 +++++++++++++---------- + fs/bcachefs/io_types.h | 14 ++++++++++++-- + fs/bcachefs/move.c | 8 +++++--- + 6 files changed, 62 insertions(+), 39 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 8a5dcf5fa75f..ef27582a68a9 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -813,6 +813,7 @@ retry: + while (1) { + struct bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; ++ enum btree_id data_btree = BTREE_ID_extents; + + bch2_btree_iter_set_pos(iter, + POS(inum, rbio->bio.bi_iter.bi_sector)); +@@ -828,7 +829,7 @@ retry: + + bch2_bkey_buf_reassemble(&sk, c, k); + +- ret = bch2_read_indirect_extent(trans, ++ ret = bch2_read_indirect_extent(trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; +@@ -852,7 +853,8 @@ retry: + if (bkey_extent_is_allocation(k.k)) + bch2_add_page_sectors(&rbio->bio, k); + +- bch2_read_extent(trans, rbio, k, offset_into_extent, flags); ++ bch2_read_extent(trans, rbio, iter->pos, ++ data_btree, k, offset_into_extent, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) + break; +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index d73bebbc28f7..b96f5cf16deb 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -907,6 +907,8 @@ retry: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(iter->pos, end) < 0) { ++ enum btree_id data_btree = BTREE_ID_extents; ++ + if (!bkey_extent_is_data(k.k) && + k.k->type != KEY_TYPE_reservation) { + bch2_btree_iter_next(iter); +@@ -919,7 +921,7 @@ retry: + + bch2_bkey_buf_reassemble(&cur, c, k); + +- ret = bch2_read_indirect_extent(&trans, ++ ret = bch2_read_indirect_extent(&trans, &data_btree, + &offset_into_extent, &cur); + if (ret) + break; +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 5d9935d5b5c0..3faefd1fd8be 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1634,8 +1634,8 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, +- rbio->pos, BTREE_ITER_SLOTS); ++ iter = bch2_trans_get_iter(&trans, rbio->data_btree, ++ rbio->read_pos, BTREE_ITER_SLOTS); + retry: + rbio->bio.bi_status = 0; + +@@ -1649,14 +1649,17 @@ retry: + + if (!bch2_bkey_matches_ptr(c, k, + rbio->pick.ptr, +- rbio->pos.offset - ++ rbio->data_pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + +- ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags); ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, ++ rbio->read_pos, ++ rbio->data_btree, ++ k, 0, failed, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) +@@ -1678,7 +1681,7 @@ static void bch2_rbio_retry(struct work_struct *work) + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; +- u64 inode = rbio->pos.inode; ++ u64 inode = rbio->read_pos.inode; + struct bch_io_failures failed = { .nr = 0 }; + + trace_read_retry(&rbio->bio); +@@ -1726,7 +1729,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) + { + struct bch_fs *c = rbio->c; +- u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; ++ u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; 
+ struct bch_extent_crc_unpacked new_crc; + struct btree_iter *iter = NULL; + struct bkey_i *new; +@@ -1736,7 +1739,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + if (crc_is_compressed(rbio->pick.crc)) + return 0; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_extents, rbio->pos, ++ iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + if ((ret = bkey_err(k))) +@@ -1869,14 +1872,14 @@ csum_err: + return; + } + +- bch2_dev_inum_io_error(ca, rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, ++ bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)", + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, crc.csum_type); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + return; + decompression_err: +- bch_err_inum_ratelimited(c, rbio->pos.inode, ++ bch_err_inum_ratelimited(c, rbio->read_pos.inode, + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + return; +@@ -1899,13 +1902,9 @@ static void bch2_read_endio(struct bio *bio) + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + +- /* +- * XXX: rbio->pos is not what we want here when reading from indirect +- * extents +- */ + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, +- rbio->pos.inode, +- rbio->pos.offset, ++ rbio->read_pos.inode, ++ rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); +@@ -1970,7 +1969,8 @@ err: + } + + int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, +- struct bvec_iter iter, struct bkey_s_c k, ++ struct bvec_iter iter, struct bpos read_pos, ++ enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, + struct bch_io_failures *failed, unsigned flags) + { +@@ -1980,7 +1980,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bch_dev *ca; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; +- struct bpos pos = bkey_start_pos(k.k); ++ struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; + + if (bkey_extent_is_inline_data(k.k)) { +@@ -2056,7 +2056,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + pick.crc.offset || + offset_into_extent)); + +- pos.offset += offset_into_extent; ++ data_pos.offset += offset_into_extent; + pick.ptr.offset += pick.crc.offset + + offset_into_extent; + offset_into_extent = 0; +@@ -2128,7 +2128,9 @@ get_bio: + /* XXX: only initialize this if needed */ + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; +- rbio->pos = pos; ++ rbio->read_pos = read_pos; ++ rbio->data_btree = data_btree; ++ rbio->data_pos = data_pos; + rbio->version = k.k->version; + rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); +@@ -2249,6 +2251,7 @@ retry: + BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; ++ enum btree_id data_btree = BTREE_ID_extents; + + bch2_btree_iter_set_pos(iter, + POS(inode, bvec_iter.bi_sector)); +@@ -2264,7 +2267,7 @@ retry: + + bch2_bkey_buf_reassemble(&sk, c, k); + +- ret = bch2_read_indirect_extent(&trans, ++ ret = bch2_read_indirect_extent(&trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + goto err; +@@ -2289,7 +2292,8 @@ retry: + if (bvec_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + +- ret = 
__bch2_read_extent(&trans, rbio, bvec_iter, k, ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos, ++ data_btree, k, + offset_into_extent, failed, flags); + switch (ret) { + case READ_RETRY: +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 65b9b62bc07f..2ac03c049c92 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -117,12 +117,15 @@ int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, + struct bkey_buf *); + + static inline int bch2_read_indirect_extent(struct btree_trans *trans, ++ enum btree_id *data_btree, + unsigned *offset_into_extent, + struct bkey_buf *k) + { +- return k->k->k.type == KEY_TYPE_reflink_p +- ? __bch2_read_indirect_extent(trans, offset_into_extent, k) +- : 0; ++ if (k->k->k.type != KEY_TYPE_reflink_p) ++ return 0; ++ ++ *data_btree = BTREE_ID_reflink; ++ return __bch2_read_indirect_extent(trans, offset_into_extent, k); + } + + enum bch_read_flags { +@@ -139,17 +142,17 @@ enum bch_read_flags { + }; + + int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, +- struct bvec_iter, struct bkey_s_c, unsigned, ++ struct bvec_iter, struct bpos, enum btree_id, ++ struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + + static inline void bch2_read_extent(struct btree_trans *trans, +- struct bch_read_bio *rbio, +- struct bkey_s_c k, +- unsigned offset_into_extent, +- unsigned flags) ++ struct bch_read_bio *rbio, struct bpos read_pos, ++ enum btree_id data_btree, struct bkey_s_c k, ++ unsigned offset_into_extent, unsigned flags) + { +- __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k, +- offset_into_extent, NULL, flags); ++ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, ++ data_btree, k, offset_into_extent, NULL, flags); + } + + void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +index b23727d212b9..e7aca7c9823a 100644 +--- a/fs/bcachefs/io_types.h ++++ b/fs/bcachefs/io_types.h +@@ -58,8 +58,18 @@ struct bch_read_bio { + struct bch_devs_list devs_have; + + struct extent_ptr_decoded pick; +- /* start pos of data we read (may not be pos of data we want) */ +- struct bpos pos; ++ ++ /* ++ * pos we read from - different from data_pos for indirect extents: ++ */ ++ struct bpos read_pos; ++ ++ /* ++ * start pos of data we read (may not be pos of data we want) - for ++ * promote, narrow extents paths: ++ */ ++ enum btree_id data_btree; ++ struct bpos data_pos; + struct bversion version; + + struct promote_op *promote; +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index bc003e45a9f6..9cf670673ad3 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -209,9 +209,9 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) + BUG_ON(!m->op.wbio.bio.bi_vcnt); + + m->ptr = rbio->pick.ptr; +- m->offset = rbio->pos.offset - rbio->pick.crc.offset; ++ m->offset = rbio->data_pos.offset - rbio->pick.crc.offset; + m->op.devs_have = rbio->devs_have; +- m->op.pos = rbio->pos; ++ m->op.pos = rbio->data_pos; + m->op.version = rbio->version; + m->op.crc = rbio->pick.crc; + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; +@@ -493,7 +493,9 @@ static int bch2_move_extent(struct btree_trans *trans, + * ctxt when doing wakeup + */ + closure_get(&ctxt->cl); +- bch2_read_extent(trans, &io->rbio, k, 0, ++ bch2_read_extent(trans, &io->rbio, ++ bkey_start_pos(k.k), ++ btree_id, k, 0, + BCH_READ_NODECODE| + BCH_READ_LAST_FRAGMENT); + return 0; +-- +cgit v1.2.3 + + +From 
96535cc9c778a2d2f37fa37a001b95821c6e51f2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Mar 2021 17:26:19 -0400 +Subject: bcachefs: Kill reflink option + +An option was added to control whether reflink support was on or off +because for a long time, reflink + inline data extent support was +missing - but that's since been fixed, so we can drop the option now. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 2 +- + fs/bcachefs/fs-io.c | 3 --- + fs/bcachefs/opts.h | 5 ----- + fs/bcachefs/reflink.c | 3 --- + 4 files changed, 1 insertion(+), 12 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 244c458d6ed3..f1526ce6812d 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1309,7 +1309,7 @@ LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); + + LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); + +-LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); ++/* bit 61 was reflink option */ + LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); + + /* 61-64 unused */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index ef27582a68a9..2ec8f3fa94ce 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2868,9 +2868,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + u64 aligned_len; + loff_t ret = 0; + +- if (!c->opts.reflink) +- return -EOPNOTSUPP; +- + if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) + return -EINVAL; + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index f5d55427bf83..001e865c5555 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -213,11 +213,6 @@ enum opt_type { + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false, \ + NULL, "Enable project quotas") \ +- x(reflink, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +- OPT_BOOL(), \ +- BCH_SB_REFLINK, true, \ +- NULL, "Enable reflink support") \ + x(degraded, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index a2cc078597f2..e0eb2c66300c 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -204,9 +204,6 @@ s64 bch2_remap_range(struct bch_fs *c, + u64 src_done, dst_done; + int ret = 0, ret2 = 0; + +- if (!c->opts.reflink) +- return -EOPNOTSUPP; +- + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; + +-- +cgit v1.2.3 + + +From e1f68c1f65f3af5acdaa27a349da01972d369686 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Mar 2021 21:18:50 -0400 +Subject: bcachefs: Fix a btree iterator leak + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 2 +- + fs/bcachefs/reflink.c | 6 ++++-- + 2 files changed, 5 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index cf3901c22e87..68cf7a2eeb9c 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1992,7 +1992,7 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) + char buf[100]; + + trans_for_each_iter(trans, iter) +- printk(KERN_ERR "iter: btree %s pos %s%s%s%s %ps\n", ++ printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n", + bch2_btree_ids[iter->btree_id], + (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf), + btree_iter_live(trans, iter) ? 
" live" : "", +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index e0eb2c66300c..e9a6a5f639b4 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -157,8 +157,10 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + bch2_trans_update(trans, reflink_iter, r_v, 0); + + r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); +- if (IS_ERR(r_p)) +- return PTR_ERR(r_p); ++ if (IS_ERR(r_p)) { ++ ret = PTR_ERR(r_p); ++ goto err; ++ } + + orig->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(orig); +-- +cgit v1.2.3 + + +From 3370fd85bd8688c1d6f64a811547b76a22ea679b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 16 Mar 2021 01:52:55 -0400 +Subject: bcachefs: Kill btree_iter_pos_changed() + +this is used in only one place now, so just inline it into the caller. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 14 ++++---------- + 1 file changed, 4 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 68cf7a2eeb9c..7c3930c81a49 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1445,13 +1445,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + + /* Iterate across keys (in leaf nodes only) */ + +-static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) ++static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) + { ++ int cmp = bkey_cmp(new_pos, iter->real_pos); + unsigned l = iter->level; + + if (!cmp) + goto out; + ++ iter->real_pos = new_pos; ++ + if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { + btree_node_unlock(iter, 0); + iter->l[0].b = BTREE_ITER_NO_NODE_UP; +@@ -1481,15 +1484,6 @@ out: + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + else + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +-} +- +-static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) +-{ +- int cmp = bkey_cmp(new_pos, iter->real_pos); +- +- iter->real_pos = new_pos; +- +- btree_iter_pos_changed(iter, cmp); + + bch2_btree_iter_verify(iter); + } +-- +cgit v1.2.3 + + +From 540bb492825bdc76c5d2aee264b540a5cc8555be Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Mar 2021 13:23:01 -0400 +Subject: bcachefs: Add a print statement for when we go read-write + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 10d3c616b9c7..ce8b37513af7 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -393,6 +393,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + (!early || c->opts.read_only))) + return -EROFS; + ++ bch_info(c, "going read-write"); ++ + ret = bch2_fs_mark_dirty(c); + if (ret) + goto err; +-- +cgit v1.2.3 + + +From a938fabe25b5afa6402a473b0206b0cd33c3b8f9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Mar 2021 16:30:01 -0400 +Subject: bcachefs: Don't list non journal devs in journal_debug_to_text() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 594f1c754114..eaf521c9afaf 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1234,6 +1234,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; + ++ if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) ++ 
continue; ++ + if (!ja->nr) + continue; + +-- +cgit v1.2.3 + + +From e5e396e1d43e6180b4b09c991b2eefcf06948779 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Mar 2021 16:32:46 -0400 +Subject: bcachefs: Fix btree iterator leak in extent_handle_overwrites() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 14 +++++--------- + 1 file changed, 5 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index d7937bdf804b..ed3009b8b157 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -754,7 +754,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos start, struct bpos end) + { +- struct btree_iter *iter = NULL, *update_iter; ++ struct btree_iter *iter, *update_iter; + struct bkey_i *update; + struct bkey_s_c k; + int ret = 0; +@@ -767,8 +767,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, + break; + + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { +- update_iter = bch2_trans_copy_iter(trans, iter); +- + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; +@@ -776,6 +774,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bkey_reassemble(update, k); + bch2_cut_back(start, update); + ++ update_iter = bch2_trans_copy_iter(trans, iter); + update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; + bch2_btree_iter_set_pos(update_iter, update->k.p); + ret = bch2_trans_update2(trans, update_iter, update); +@@ -785,8 +784,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, + } + + if (bkey_cmp(k.k->p, end) > 0) { +- update_iter = bch2_trans_copy_iter(trans, iter); +- + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; +@@ -794,6 +791,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bkey_reassemble(update, k); + bch2_cut_front(end, update); + ++ update_iter = bch2_trans_copy_iter(trans, iter); + update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; + bch2_btree_iter_set_pos(update_iter, update->k.p); + ret = bch2_trans_update2(trans, update_iter, update); +@@ -801,8 +799,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, + if (ret) + goto err; + } else { +- update_iter = bch2_trans_copy_iter(trans, iter); +- + update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; +@@ -812,6 +808,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, + update->k.type = KEY_TYPE_deleted; + update->k.size = 0; + ++ update_iter = bch2_trans_copy_iter(trans, iter); + update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; + bch2_btree_iter_set_pos(update_iter, update->k.p); + ret = bch2_trans_update2(trans, update_iter, update); +@@ -823,8 +820,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, + k = bch2_btree_iter_next_with_updates(iter); + } + err: +- if (!IS_ERR_OR_NULL(iter)) +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_put(trans, iter); + return ret; + } + +-- +cgit v1.2.3 + + +From de76143ef0b6a1166d24f36f62cc93765de7f2c0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Mar 2021 22:34:54 -0400 +Subject: bcachefs: Fsck code refactoring + +Change fsck code to always put btree iterators - also, make some flow +control improvements to deal with lock restarts better, and refactor +check_extents() to not walk extents twice for counting/checking +i_sectors. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 +- + fs/bcachefs/btree_iter.h | 2 + + fs/bcachefs/fsck.c | 198 +++++++++++++++++++++++------------------------ + 3 files changed, 102 insertions(+), 102 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 7c3930c81a49..a0d37a30a91a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1496,7 +1496,7 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + } + +-static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) ++inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) + { + struct bpos pos = iter->k.p; + bool ret = bkey_cmp(pos, POS_MAX) != 0; +@@ -1507,7 +1507,7 @@ static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) + return ret; + } + +-static inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) ++inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) + { + struct bpos pos = bkey_start_pos(&iter->k); + bool ret = bkey_cmp(pos, POS_MIN) != 0; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index bd0c429bd91a..76f0f8f3c125 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -175,6 +175,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); + + struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); + ++bool bch2_btree_iter_advance_pos(struct btree_iter *); ++bool bch2_btree_iter_rewind_pos(struct btree_iter *); + void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + + /* Sort order for locking btree iterators: */ +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 7f6b4ac48f3d..033d37891c60 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -319,7 +319,7 @@ static int hash_check_key(struct btree_trans *trans, + bch_err(c, "hash_redo_key err %i", ret); + return ret; + } +- return 1; ++ return -EINTR; + } + + ret = hash_check_duplicates(trans, desc, h, k_iter, k); +@@ -413,18 +413,10 @@ err_redo: + goto err; + } + +-static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) +-{ +- return bch2_btree_delete_range(c, BTREE_ID_extents, +- POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), +- POS(inode_nr + 1, 0), NULL); +-} +- +-static int bch2_fix_overlapping_extent(struct btree_trans *trans, +- struct btree_iter *iter, ++static int fix_overlapping_extent(struct btree_trans *trans, + struct bkey_s_c k, struct bpos cut_at) + { +- struct btree_iter *u_iter; ++ struct btree_iter *iter; + struct bkey_i *u; + int ret; + +@@ -436,22 +428,24 @@ static int bch2_fix_overlapping_extent(struct btree_trans *trans, + bkey_reassemble(u, k); + bch2_cut_front(cut_at, u); + +- u_iter = bch2_trans_copy_iter(trans, iter); + + /* +- * We don't want to go through the +- * extent_handle_overwrites path: ++ * We don't want to go through the extent_handle_overwrites path: ++ * ++ * XXX: this is going to screw up disk accounting, extent triggers ++ * assume things about extent overwrites - we should be running the ++ * triggers manually here + */ +- u_iter->flags &= ~BTREE_ITER_IS_EXTENTS; +- bch2_btree_iter_set_pos(u_iter, u->k.p); ++ iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p, ++ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + +- /* +- * XXX: this is going to leave disk space +- * accounting slightly wrong +- */ +- ret = bch2_trans_update(trans, u_iter, u, 0); +- bch2_trans_iter_put(trans, u_iter); +- 
return ret; ++ BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); ++ bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(trans, iter); ++ ++ return bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); + } + + /* +@@ -466,7 +460,7 @@ static int check_extents(struct bch_fs *c) + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_buf prev; +- u64 i_sectors; ++ u64 i_sectors = 0; + int ret = 0; + + bch2_bkey_buf_init(&prev); +@@ -479,97 +473,86 @@ static int check_extents(struct bch_fs *c) + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT); + retry: +- for_each_btree_key_continue(iter, 0, k, ret) { +- /* +- * due to retry errors we might see the same extent twice: +- */ +- if (bkey_cmp(prev.k->k.p, k.k->p) && +- bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k))) { ++ if (w.have_inode && ++ w.cur_inum != k.k->p.inode && ++ !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && ++ fsck_err_on(w.inode.bi_sectors != i_sectors, c, ++ "inode %llu has incorrect i_sectors: got %llu, should be %llu", ++ w.inode.bi_inum, ++ w.inode.bi_sectors, i_sectors)) { ++ struct btree_iter *inode_iter = ++ bch2_trans_get_iter(&trans, BTREE_ID_inodes, ++ POS(0, w.cur_inum), ++ BTREE_ITER_INTENT); ++ ++ w.inode.bi_sectors = i_sectors; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_inode_write(&trans, inode_iter, &w.inode)); ++ bch2_trans_iter_put(&trans, inode_iter); ++ if (ret) ++ break; ++ } ++ ++ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + char buf1[200]; + char buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); + bch2_bkey_val_to_text(&PBUF(buf2), c, k); + +- if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { +- ret = __bch2_trans_do(&trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_fix_overlapping_extent(&trans, +- iter, k, prev.k->k.p)); +- if (ret) +- goto err; +- } ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) ++ return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR; + } +- bch2_bkey_buf_reassemble(&prev, c, k); + + ret = walk_inode(&trans, &w, k.k->p.inode); + if (ret) + break; + ++ if (w.first_this_inode) ++ i_sectors = 0; ++ + if (fsck_err_on(!w.have_inode, c, +- "extent type %u for missing inode %llu", +- k.k->type, k.k->p.inode) || ++ "extent type %u for missing inode %llu", ++ k.k->type, k.k->p.inode) || + fsck_err_on(w.have_inode && +- !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, +- "extent type %u for non regular file, inode %llu mode %o", +- k.k->type, k.k->p.inode, w.inode.bi_mode)) { +- bch2_trans_unlock(&trans); +- +- ret = bch2_inode_truncate(c, k.k->p.inode, 0); +- if (ret) +- goto err; +- continue; ++ !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, ++ "extent type %u for non regular file, inode %llu mode %o", ++ k.k->type, k.k->p.inode, w.inode.bi_mode)) { ++ bch2_fs_lazy_rw(c); ++ return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, ++ POS(k.k->p.inode, 0), ++ POS(k.k->p.inode, U64_MAX), ++ NULL) ?: -EINTR; + } + +- if (fsck_err_on(w.first_this_inode && +- w.have_inode && +- !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && +- w.inode.bi_sectors != +- (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), +- c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", +- w.inode.bi_inum, +- w.inode.bi_sectors, i_sectors)) { +- struct bkey_inode_buf p; +- +- 
w.inode.bi_sectors = i_sectors; +- +- bch2_trans_unlock(&trans); +- +- bch2_inode_pack(c, &p, &w.inode); +- +- ret = bch2_btree_insert(c, BTREE_ID_inodes, +- &p.inode.k_i, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); +- if (ret) { +- bch_err(c, "error in fsck: error %i updating inode", ret); +- goto err; +- } +- +- /* revalidate iterator: */ +- k = bch2_btree_iter_peek(iter); ++ if (fsck_err_on(w.have_inode && ++ !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ k.k->type != KEY_TYPE_reservation && ++ k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, ++ "extent type %u offset %llu past end of inode %llu, i_size %llu", ++ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { ++ bch2_fs_lazy_rw(c); ++ return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, ++ POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9), ++ POS(k.k->p.inode, U64_MAX), ++ NULL) ?: -EINTR; + } + +- if (fsck_err_on(w.have_inode && +- !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && +- k.k->type != KEY_TYPE_reservation && +- k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, +- "extent type %u offset %llu past end of inode %llu, i_size %llu", +- k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { +- bch2_trans_unlock(&trans); ++ if (bkey_extent_is_allocation(k.k)) ++ i_sectors += k.k->size; ++ bch2_bkey_buf_reassemble(&prev, c, k); + +- ret = bch2_inode_truncate(c, k.k->p.inode, +- w.inode.bi_size); +- if (ret) +- goto err; +- continue; +- } ++ bch2_btree_iter_advance_pos(iter); + } +-err: + fsck_err: + if (ret == -EINTR) + goto retry; ++ bch2_trans_iter_put(&trans, iter); + bch2_bkey_buf_exit(&prev, c); + return bch2_trans_exit(&trans) ?: ret; + } +@@ -599,7 +582,8 @@ static int check_dirents(struct bch_fs *c) + iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), 0); + retry: +- for_each_btree_key_continue(iter, 0, k, ret) { ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k))) { + struct bkey_s_c_dirent d; + struct bch_inode_unpacked target; + bool have_target; +@@ -718,6 +702,8 @@ retry: + goto err; + + } ++ ++ bch2_btree_iter_advance_pos(iter); + } + + hash_stop_chain(&trans, &h); +@@ -726,6 +712,8 @@ fsck_err: + if (ret == -EINTR) + goto retry; + ++ bch2_trans_iter_put(&trans, h.chain); ++ bch2_trans_iter_put(&trans, iter); + return bch2_trans_exit(&trans) ?: ret; + } + +@@ -751,7 +739,8 @@ static int check_xattrs(struct bch_fs *c) + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), 0); + retry: +- for_each_btree_key_continue(iter, 0, k, ret) { ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k))) { + ret = walk_inode(&trans, &w, k.k->p.inode); + if (ret) + break; +@@ -761,7 +750,7 @@ retry: + k.k->p.inode)) { + ret = bch2_btree_delete_at(&trans, iter, 0); + if (ret) +- goto err; ++ break; + continue; + } + +@@ -771,12 +760,16 @@ retry: + ret = hash_check_key(&trans, bch2_xattr_hash_desc, + &h, iter, k); + if (ret) +- goto fsck_err; ++ break; ++ ++ bch2_btree_iter_advance_pos(iter); + } +-err: + fsck_err: + if (ret == -EINTR) + goto retry; ++ ++ bch2_trans_iter_put(&trans, h.chain); ++ bch2_trans_iter_put(&trans, iter); + return bch2_trans_exit(&trans) ?: ret; + } + +@@ -1127,6 +1120,8 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, + + bch2_trans_cond_resched(&trans); + } ++ bch2_trans_iter_put(&trans, iter); ++ + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) + bch_err(c, "error in fsck: btree error %i while 
walking dirents", ret); +@@ -1279,8 +1274,10 @@ static int check_inode(struct btree_trans *trans, + * XXX: need to truncate partial blocks too here - or ideally + * just switch units to bytes and that issue goes away + */ +- +- ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), ++ POS(u.bi_inum, U64_MAX), ++ NULL); + if (ret) { + bch_err(c, "error in fsck: error %i truncating inode", ret); + return ret; +@@ -1392,10 +1389,11 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); + if (nlinks_pos == iter->pos.offset) + genradix_iter_advance(&nlinks_iter, links); + +- bch2_btree_iter_next(iter); ++ bch2_btree_iter_advance_pos(iter); + bch2_trans_cond_resched(&trans); + } + fsck_err: ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + + if (ret2) +-- +cgit v1.2.3 + + +From 1a604be69cb7154fb92f57840a2e29dcc56a56c8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Mar 2021 22:54:18 -0400 +Subject: bcachefs: btree_iter_set_dontneed() + +This is a bit clearer than using bch2_btree_iter_free(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 ++-- + fs/bcachefs/btree_iter.h | 5 +++++ + fs/bcachefs/btree_key_cache.c | 21 ++++++++++----------- + 3 files changed, 17 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index a0d37a30a91a..0fa2241e3cdc 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1972,7 +1972,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, + if (IS_ERR_OR_NULL(iter)) + return 0; + +- trans->iters_touched &= ~(1ULL << iter->idx); ++ set_btree_iter_dontneed(trans, iter); + + return bch2_trans_iter_put(trans, iter); + } +@@ -2133,7 +2133,7 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, + * We don't need to preserve this iter since it's cheap to copy it + * again - this will cause trans_iter_put() to free it right away: + */ +- trans->iters_touched &= ~(1ULL << iter->idx); ++ set_btree_iter_dontneed(trans, iter); + + return iter; + } +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 76f0f8f3c125..c839bfe6ffa4 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -300,6 +300,11 @@ static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter + (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + } + ++static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ trans->iters_touched &= ~(1ULL << iter->idx); ++} ++ + #define TRANS_RESET_NOTRAVERSE (1 << 0) + + void bch2_trans_reset(struct btree_trans *, unsigned); +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 2230da8b3acd..0b3545637bb3 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -171,23 +171,21 @@ static int btree_key_cache_fill(struct btree_trans *trans, + ck->key.pos, BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); +- if (ret) { +- bch2_trans_iter_put(trans, iter); +- return ret; +- } ++ if (ret) ++ goto err; + + if (!bch2_btree_node_relock(ck_iter, 0)) { +- bch2_trans_iter_put(trans, iter); + trace_transaction_restart_ip(trans->ip, _THIS_IP_); +- return -EINTR; ++ ret = -EINTR; ++ goto err; + } + + if (k.k->u64s > ck->u64s) { + new_u64s = roundup_pow_of_two(k.k->u64s); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { +- 
bch2_trans_iter_put(trans, iter); +- return -ENOMEM; ++ ret = -ENOMEM; ++ goto err; + } + } + +@@ -203,9 +201,10 @@ static int btree_key_cache_fill(struct btree_trans *trans, + bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); + + /* We're not likely to need this iterator again: */ +- bch2_trans_iter_free(trans, iter); +- +- return 0; ++ set_btree_iter_dontneed(trans, iter); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; + } + + static int bkey_cached_check_fn(struct six_lock *lock, void *p) +-- +cgit v1.2.3 + + +From fe434d3e69700c80c2e9e7de5e5eadf816f8d236 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Mar 2021 20:29:11 -0400 +Subject: bcachefs: Require all btree iterators to be freed + +We keep running into occasional bugs with btree transaction iterators +overflowing - this will make those bugs more visible. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 20 ++++++++++------- + fs/bcachefs/alloc_background.c | 4 ++-- + fs/bcachefs/btree_gc.c | 12 ++++++++--- + fs/bcachefs/btree_io.c | 1 + + fs/bcachefs/btree_iter.c | 17 +++++++++++++++ + fs/bcachefs/debug.c | 4 ++++ + fs/bcachefs/dirent.c | 3 +++ + fs/bcachefs/ec.c | 4 +++- + fs/bcachefs/extents.c | 2 ++ + fs/bcachefs/fs-io.c | 49 +++++++++++++++++++++++------------------- + fs/bcachefs/fs.c | 3 +++ + fs/bcachefs/fsck.c | 3 ++- + fs/bcachefs/inode.c | 1 + + fs/bcachefs/io.c | 38 ++++++++++++++++---------------- + fs/bcachefs/migrate.c | 9 ++++++-- + fs/bcachefs/move.c | 3 +++ + fs/bcachefs/quota.c | 7 +++++- + fs/bcachefs/reflink.c | 26 +++++++++------------- + fs/bcachefs/tests.c | 28 ++++++++++++++++++------ + fs/bcachefs/xattr.c | 18 +++++++++------- + 20 files changed, 163 insertions(+), 89 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index f111898f6c4f..e7f69cab5a6a 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -243,12 +243,12 @@ retry: + } + + xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); +- + acl = bch2_acl_from_disk(xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + + if (!IS_ERR(acl)) + set_cached_acl(&inode->v, type, acl); ++ bch2_trans_iter_put(&trans, iter); + out: + bch2_trans_exit(&trans); + return acl; +@@ -314,7 +314,7 @@ retry: + if (type == ACL_TYPE_ACCESS) { + ret = posix_acl_update_mode(mnt_userns, &inode->v, &mode, &acl); + if (ret) +- goto err; ++ goto btree_err; + } + + hash_info = bch2_hash_info_init(c, &inode_u); +@@ -331,6 +331,8 @@ retry: + &inode->ei_journal_seq, + BTREE_INSERT_NOUNLOCK); + btree_err: ++ bch2_trans_iter_put(&trans, inode_iter); ++ + if (ret == -EINTR) + goto retry; + if (unlikely(ret)) +@@ -357,21 +359,22 @@ int bch2_acl_chmod(struct btree_trans *trans, + struct bkey_s_c_xattr xattr; + struct bkey_i_xattr *new; + struct posix_acl *acl; +- int ret = 0; ++ int ret; + + iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, + &hash_info, inode->bi_inum, + &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), + BTREE_ITER_INTENT); +- if (IS_ERR(iter)) +- return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0; ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret == -ENOENT ? 
0 : ret; + + xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); +- + acl = bch2_acl_from_disk(xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); +- if (IS_ERR_OR_NULL(acl)) +- return PTR_ERR(acl); ++ ret = PTR_ERR_OR_ZERO(acl); ++ if (ret || !acl) ++ goto err; + + ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); + if (ret) +@@ -388,6 +391,7 @@ int bch2_acl_chmod(struct btree_trans *trans, + *new_acl = acl; + acl = NULL; + err: ++ bch2_trans_iter_put(trans, iter); + kfree(acl); + return ret; + } +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 37539431c7f2..71569dca3056 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -385,7 +385,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); +- + iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + +@@ -405,6 +404,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) + } + } + err: ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -926,7 +926,6 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); +- + iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, + POS(ca->dev_idx, 0), + BTREE_ITER_CACHED| +@@ -942,6 +941,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) + (!fifo_empty(&ca->free_inc) + ? BTREE_INSERT_NOWAIT : 0)); + ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + + /* If we used NOWAIT, don't return the error: */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 2b5dfdbb602e..d89bc00aae0c 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -456,6 +456,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + + bch2_trans_cond_resched(&trans); + } ++ bch2_trans_iter_put(&trans, iter); ++ + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) + return ret; +@@ -1212,6 +1214,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + + bch2_btree_iter_next(iter); + } ++ bch2_trans_iter_put(&trans, iter); + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); +@@ -1509,6 +1512,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) + struct btree *b; + bool kthread = (current->flags & PF_KTHREAD) != 0; + unsigned i; ++ int ret = 0; + + /* Sliding window of adjacent btree nodes */ + struct btree *merge[GC_MERGE_NODES]; +@@ -1557,8 +1561,8 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) + lock_seq[0] = merge[0]->c.lock.state.seq; + + if (kthread && kthread_should_stop()) { +- bch2_trans_exit(&trans); +- return -ESHUTDOWN; ++ ret = -ESHUTDOWN; ++ break; + } + + bch2_trans_cond_resched(&trans); +@@ -1573,7 +1577,9 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) + memset(merge + 1, 0, + (GC_MERGE_NODES - 1) * sizeof(merge[0])); + } +- return bch2_trans_exit(&trans); ++ bch2_trans_iter_put(&trans, iter); ++ ++ return bch2_trans_exit(&trans) ?: ret; + } + + /** +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 4acaa14a80ff..cda0611418de 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1208,6 +1208,7 @@ retry: + if (ret) + goto err; + out: ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&k, c); + bio_put(&wbio->wbio.bio); +diff --git a/fs/bcachefs/btree_iter.c 
b/fs/bcachefs/btree_iter.c +index 0fa2241e3cdc..5a27b0bd3bc4 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -9,6 +9,7 @@ + #include "btree_locking.h" + #include "btree_update.h" + #include "debug.h" ++#include "error.h" + #include "extents.h" + #include "journal.h" + +@@ -2116,6 +2117,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + for (i = 0; i < ARRAY_SIZE(iter->l); i++) + iter->l[i].b = NULL; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; ++ iter->ip_allocated = _RET_IP_; + + return iter; + } +@@ -2223,6 +2225,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + (void *) &trans->fs_usage_deltas->memset_start); + } + ++ bch2_trans_cond_resched(trans); ++ + if (!(flags & TRANS_RESET_NOTRAVERSE)) + bch2_btree_iter_traverse_all(trans); + } +@@ -2287,6 +2291,19 @@ int bch2_trans_exit(struct btree_trans *trans) + bch2_trans_unlock(trans); + + #ifdef CONFIG_BCACHEFS_DEBUG ++ if (trans->iters_live) { ++ struct btree_iter *iter; ++ ++ bch_err(c, "btree iterators leaked!"); ++ trans_for_each_iter(trans, iter) ++ if (btree_iter_live(trans, iter)) ++ printk(KERN_ERR " btree %s allocated at %pS\n", ++ bch2_btree_ids[iter->btree_id], ++ (void *) iter->ip_allocated); ++ /* Be noisy about this: */ ++ bch2_fatal_error(c); ++ } ++ + mutex_lock(&trans->c->btree_trans_lock); + list_del(&trans->list); + mutex_unlock(&trans->c->btree_trans_lock); +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 06dbca32e189..cce747da8b9e 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -242,6 +242,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, + if (!i->size) + break; + } ++ bch2_trans_iter_put(&trans, iter); ++ + bch2_trans_exit(&trans); + + return err < 0 ? err : i->ret; +@@ -294,6 +296,8 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + if (!i->size) + break; + } ++ bch2_trans_iter_put(&trans, iter); ++ + bch2_trans_exit(&trans); + + return err < 0 ? 
err : i->ret; +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index b0625176ab35..592dd80cf963 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -321,6 +321,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, + + k = bch2_btree_iter_peek_slot(iter); + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); ++ bch2_trans_iter_put(&trans, iter); + out: + bch2_trans_exit(&trans); + return inum; +@@ -379,6 +380,8 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) + break; + ctx->pos = dirent.k->p.offset + 1; + } ++ bch2_trans_iter_put(&trans, iter); ++ + ret = bch2_trans_exit(&trans) ?: ret; + + return ret; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 600d324d4725..500094e7000d 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -873,6 +873,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + if (ret) + break; + } ++ bch2_trans_iter_put(&trans, iter); + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); +@@ -1663,12 +1664,13 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); +- + iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0); + + k = bch2_btree_iter_prev(iter); + if (!IS_ERR_OR_NULL(k.k)) + idx = k.k->p.offset + 1; ++ ++ bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans); + if (ret) + return ret; +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index dbaded8176cb..26541c7bd616 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -688,6 +688,8 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + break; + } + } ++ bch2_trans_iter_put(&trans, iter); ++ + bch2_trans_exit(&trans); + + return ret; +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 2ec8f3fa94ce..fa85ca78460b 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -891,7 +891,6 @@ void bch2_readahead(struct readahead_control *ractl) + BUG_ON(ret); + + bch2_trans_init(&trans, c, 0, 0); +- + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, + BTREE_ITER_SLOTS); + +@@ -920,6 +919,7 @@ void bch2_readahead(struct readahead_control *ractl) + + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + kfree(readpages_iter.pages); + } +@@ -943,6 +943,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, + + bchfs_read(&trans, iter, rbio, inum, NULL); + ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + } + +@@ -2149,6 +2150,7 @@ static inline int range_has_data(struct bch_fs *c, + break; + } + } ++ bch2_trans_iter_put(&trans, iter); + + return bch2_trans_exit(&trans) ?: ret; + } +@@ -2319,6 +2321,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); + ret = PTR_ERR_OR_ZERO(iter); ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + + if (ret) +@@ -2453,14 +2456,11 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + struct btree_iter *src, *dst, *del; + loff_t shift, new_size; + u64 src_start; +- int ret; ++ int ret = 0; + + if ((offset | len) & (block_bytes(c) - 1)) + return -EINVAL; + +- bch2_bkey_buf_init(©); +- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); +- + /* + * We need i_mutex to keep the page cache consistent with the extents + * btree, and the btree consistent with i_size - we don't need outside +@@ -2516,13 
+2516,15 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + goto err; + } + ++ bch2_bkey_buf_init(©); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); + src = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inode->v.i_ino, src_start >> 9), + BTREE_ITER_INTENT); + dst = bch2_trans_copy_iter(&trans, src); + del = bch2_trans_copy_iter(&trans, src); + +- while (1) { ++ while (ret == 0 || ret == -EINTR) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; +@@ -2536,7 +2538,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + ? bch2_btree_iter_peek_prev(src) + : bch2_btree_iter_peek(src); + if ((ret = bkey_err(k))) +- goto bkey_err; ++ continue; + + if (!k.k || k.k->p.inode != inode->v.i_ino) + break; +@@ -2556,7 +2558,7 @@ reassemble: + + ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); + if (ret) +- goto bkey_err; ++ continue; + + if (bkey_cmp(atomic_end, copy.k->k.p)) { + if (insert) { +@@ -2599,18 +2601,18 @@ reassemble: + &inode->ei_journal_seq, + BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(c, &disk_res); +-bkey_err: ++ + if (!ret) + bch2_btree_iter_set_pos(src, next_pos); +- +- if (ret == -EINTR) +- ret = 0; +- if (ret) +- goto err; +- +- bch2_trans_cond_resched(&trans); + } +- bch2_trans_unlock(&trans); ++ bch2_trans_iter_put(&trans, del); ++ bch2_trans_iter_put(&trans, dst); ++ bch2_trans_iter_put(&trans, src); ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(©, c); ++ ++ if (ret) ++ goto err; + + if (!insert) { + i_size_write(&inode->v, new_size); +@@ -2620,8 +2622,6 @@ bkey_err: + mutex_unlock(&inode->ei_update_lock); + } + err: +- bch2_trans_exit(&trans); +- bch2_bkey_buf_exit(©, c); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); + return ret; +@@ -2676,7 +2676,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + end_pos = POS(inode->v.i_ino, block_end >> 9); + +- while (bkey_cmp(iter->pos, end_pos) < 0) { ++ while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { + s64 i_sectors_delta = 0; + struct disk_reservation disk_res = { 0 }; + struct quota_res quota_res = { 0 }; +@@ -2740,9 +2740,11 @@ bkey_err: + bch2_disk_reservation_put(c, &disk_res); + if (ret == -EINTR) + ret = 0; +- if (ret) +- goto err; + } ++ bch2_trans_iter_put(&trans, iter); ++ ++ if (ret) ++ goto err; + + /* + * Do we need to extend the file? 
+@@ -2764,6 +2766,7 @@ bkey_err: + ret = PTR_ERR_OR_ZERO(inode_iter); + } while (ret == -EINTR); + ++ bch2_trans_iter_put(&trans, inode_iter); + bch2_trans_unlock(&trans); + + if (ret) +@@ -3011,6 +3014,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) + } else if (k.k->p.offset >> 9 > isize) + break; + } ++ bch2_trans_iter_put(&trans, iter); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) +@@ -3114,6 +3118,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) + offset = max(offset, bkey_start_offset(k.k) << 9); + } + } ++ bch2_trans_iter_put(&trans, iter); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index b96f5cf16deb..43f17967ea54 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -734,6 +734,8 @@ retry: + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + btree_err: ++ bch2_trans_iter_put(&trans, inode_iter); ++ + if (ret == -EINTR) + goto retry; + if (unlikely(ret)) +@@ -960,6 +962,7 @@ retry: + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), + FIEMAP_EXTENT_LAST); + ++ bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans) ?: ret; + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 033d37891c60..f8e0b24d087a 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1485,11 +1485,12 @@ int bch2_fsck_walk_inodes_only(struct bch_fs *c) + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED)) { + ret = check_inode(&trans, NULL, iter, inode, NULL); +- BUG_ON(ret == -EINTR); + if (ret) + break; + } + } ++ bch2_trans_iter_put(&trans, iter); ++ + BUG_ON(ret == -EINTR); + + return bch2_trans_exit(&trans) ?: ret; +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index e72c49e18f13..c9b31afc7c97 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -620,6 +620,7 @@ retry: + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); ++ bch2_trans_iter_put(&trans, iter); + err: + if (ret == -EINTR) + goto retry; +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 3faefd1fd8be..263f7a6db42a 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -404,6 +404,8 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, + + ret = bch2_fpunch_at(&trans, iter, POS(inum, end), + journal_seq, i_sectors_delta); ++ ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + + if (ret == -EINTR) +@@ -450,6 +452,7 @@ int bch2_write_index_default(struct bch_write_op *op) + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + +@@ -1666,6 +1669,7 @@ retry: + goto err; + out: + bch2_rbio_done(rbio); ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + return; +@@ -2259,7 +2263,7 @@ retry: + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) +- goto err; ++ break; + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); +@@ -2270,7 +2274,7 @@ retry: + ret = bch2_read_indirect_extent(&trans, &data_btree, + &offset_into_extent, &sk); + if (ret) +- goto err; ++ break; + + k = bkey_i_to_s_c(sk.k); + +@@ -2295,12 +2299,8 @@ retry: + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos, + data_btree, k, + offset_into_extent, failed, flags); +- switch (ret) { +- case READ_RETRY: +- goto retry; +- case READ_ERR: +- goto err; +- }; ++ if (ret) ++ break; + + if (flags & BCH_READ_LAST_FRAGMENT) 
+ break; +@@ -2308,19 +2308,19 @@ retry: + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + } +-out: +- bch2_trans_exit(&trans); +- bch2_bkey_buf_exit(&sk, c); +- return; +-err: +- if (ret == -EINTR) ++ bch2_trans_iter_put(&trans, iter); ++ ++ if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) + goto retry; + +- bch_err_inum_ratelimited(c, inode, +- "read error %i from btree lookup", ret); +- rbio->bio.bi_status = BLK_STS_IOERR; +- bch2_rbio_done(rbio); +- goto out; ++ if (ret) { ++ bch_err_inum_ratelimited(c, inode, ++ "read error %i from btree lookup", ret); ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ bch2_rbio_done(rbio); ++ } ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); + } + + void bch2_fs_io_exit(struct bch_fs *c) +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 1db2c2d6b970..4d8b4169923d 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -88,6 +88,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + if (ret) + break; + } ++ bch2_trans_iter_put(&trans, iter); + + ret = bch2_trans_exit(&trans) ?: ret; + bch2_bkey_buf_exit(&sk, c); +@@ -135,20 +136,24 @@ retry: + dev_idx, flags, true); + if (ret) { + bch_err(c, "Cannot drop device without losing data"); +- goto err; ++ break; + } + + ret = bch2_btree_node_update_key(c, iter, b, k.k); + if (ret == -EINTR) { + b = bch2_btree_iter_peek_node(iter); ++ ret = 0; + goto retry; + } + if (ret) { + bch_err(c, "Error updating btree node key: %i", ret); +- goto err; ++ break; + } + } + bch2_trans_iter_free(&trans, iter); ++ ++ if (ret) ++ goto err; + } + + /* flush relevant btree updates */ +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 9cf670673ad3..1403af076f10 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -196,6 +196,7 @@ nomatch: + goto next; + } + out: ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); +@@ -642,6 +643,8 @@ next_nondata: + bch2_trans_cond_resched(&trans); + } + out: ++ ++ bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans) ?: ret; + bch2_bkey_buf_exit(&sk, c); + +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 8e272519ce0e..35b409e0f366 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -372,6 +372,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) + if (ret) + break; + } ++ bch2_trans_iter_put(&trans, iter); + + return bch2_trans_exit(&trans) ?: ret; + } +@@ -449,6 +450,8 @@ int bch2_fs_quota_read(struct bch_fs *c) + KEY_TYPE_QUOTA_NOCHECK); + } + } ++ bch2_trans_iter_put(&trans, iter); ++ + return bch2_trans_exit(&trans) ?: ret; + } + +@@ -739,7 +742,9 @@ static int bch2_set_quota_trans(struct btree_trans *trans, + if (qdq->d_fieldmask & QC_INO_HARD) + new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); + +- return bch2_trans_update(trans, iter, &new_quota->k_i, 0); ++ ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0); ++ bch2_trans_iter_put(trans, iter); ++ return ret; + } + + static int bch2_set_quota(struct super_block *sb, struct kqid qid, +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index e9a6a5f639b4..0978ad92614c 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -223,20 +223,18 @@ s64 bch2_remap_range(struct bch_fs *c, + dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, + BTREE_ITER_INTENT); + +- while (1) { ++ while (ret == 0 || 
ret == -EINTR) { + bch2_trans_begin(&trans); + +- trans.mem_top = 0; +- + if (fatal_signal_pending(current)) { + ret = -EINTR; +- goto err; ++ break; + } + + src_k = get_next_src(src_iter, src_end); + ret = bkey_err(src_k); + if (ret) +- goto btree_err; ++ continue; + + src_done = bpos_min(src_iter->pos, src_end).offset - + src_start.offset; +@@ -245,8 +243,6 @@ s64 bch2_remap_range(struct bch_fs *c, + if (bkey_cmp(dst_iter->pos, dst_want) < 0) { + ret = bch2_fpunch_at(&trans, dst_iter, dst_want, + journal_seq, i_sectors_delta); +- if (ret) +- goto btree_err; + continue; + } + +@@ -265,7 +261,7 @@ s64 bch2_remap_range(struct bch_fs *c, + ret = bch2_make_extent_indirect(&trans, src_iter, + new_src.k); + if (ret) +- goto btree_err; ++ continue; + + BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); + } +@@ -294,20 +290,16 @@ s64 bch2_remap_range(struct bch_fs *c, + NULL, journal_seq, + new_i_size, i_sectors_delta); + if (ret) +- goto btree_err; ++ continue; + + dst_done = dst_iter->pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); + bch2_btree_iter_set_pos(src_iter, src_want); +-btree_err: +- if (ret == -EINTR) +- ret = 0; +- if (ret) +- goto err; + } ++ bch2_trans_iter_put(&trans, dst_iter); ++ bch2_trans_iter_put(&trans, src_iter); + +- BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); +-err: ++ BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end)); + BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); + + dst_done = dst_iter->pos.offset - dst_start.offset; +@@ -329,6 +321,8 @@ err: + ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, journal_seq, 0); + } ++ ++ bch2_trans_iter_put(&trans, inode_iter); + } while (ret2 == -EINTR); + + ret = bch2_trans_exit(&trans) ?: ret; +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index dfb12fdd4814..14b85dc22342 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -67,6 +67,7 @@ static int test_delete(struct bch_fs *c, u64 nr) + goto err; + } + err: ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -106,6 +107,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) + goto err; + } + err: ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -113,7 +115,7 @@ err: + static int test_iterate(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter *iter = NULL; + struct bkey_s_c k; + u64 i; + int ret = 0; +@@ -159,6 +161,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) + + BUG_ON(i); + err: ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -166,7 +169,7 @@ err: + static int test_iterate_extents(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter *iter = NULL; + struct bkey_s_c k; + u64 i; + int ret = 0; +@@ -213,6 +216,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) + + BUG_ON(i); + err: ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -257,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + BUG_ON(k.k->p.offset != i); + i += 2; + } +- bch2_trans_iter_free(&trans, iter); ++ bch2_trans_iter_put(&trans, iter); + + BUG_ON(i != nr * 2); + +@@ -274,6 +278,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + if (i == nr * 2) + break; + } ++ bch2_trans_iter_put(&trans, iter); + err: + bch2_trans_exit(&trans); + return ret; +@@ -318,7 +323,7 @@ static int 
test_iterate_slots_extents(struct bch_fs *c, u64 nr) + BUG_ON(k.k->size != 8); + i += 16; + } +- bch2_trans_iter_free(&trans, iter); ++ bch2_trans_iter_put(&trans, iter); + + BUG_ON(i != nr); + +@@ -337,6 +342,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + if (i == nr) + break; + } ++ bch2_trans_iter_put(&trans, iter); + err: + bch2_trans_exit(&trans); + return 0; +@@ -362,6 +368,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr) + k = bch2_btree_iter_peek(iter); + BUG_ON(k.k); + ++ bch2_trans_iter_put(&trans, iter); ++ + bch2_trans_exit(&trans); + return 0; + } +@@ -382,6 +390,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) + k = bch2_btree_iter_peek(iter); + BUG_ON(k.k); + ++ bch2_trans_iter_put(&trans, iter); ++ + bch2_trans_exit(&trans); + return 0; + } +@@ -508,7 +518,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr) + } + } + +- bch2_trans_iter_free(&trans, iter); ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -549,7 +559,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr) + } + } + +- bch2_trans_iter_free(&trans, iter); ++ bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -630,6 +640,8 @@ static int seq_insert(struct bch_fs *c, u64 nr) + if (++i == nr) + break; + } ++ bch2_trans_iter_put(&trans, iter); ++ + bch2_trans_exit(&trans); + return ret; + } +@@ -645,6 +657,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr) + + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) + ; ++ bch2_trans_iter_put(&trans, iter); ++ + bch2_trans_exit(&trans); + return ret; + } +@@ -671,6 +685,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) + break; + } + } ++ bch2_trans_iter_put(&trans, iter); ++ + bch2_trans_exit(&trans); + return ret; + } +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 92c6a071320d..8985a21b122c 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -133,12 +133,9 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, + inode->v.i_ino, + &X_SEARCH(type, name, strlen(name)), + 0); +- if (IS_ERR(iter)) { +- bch2_trans_exit(&trans); +- BUG_ON(PTR_ERR(iter) == -EINTR); +- +- return PTR_ERR(iter) == -ENOENT ? -ENODATA : PTR_ERR(iter); +- } ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ goto err; + + xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + ret = le16_to_cpu(xattr.v->x_val_len); +@@ -148,9 +145,12 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, + else + memcpy(buffer, xattr_val(xattr.v), ret); + } +- ++ bch2_trans_iter_put(&trans, iter); ++err: + bch2_trans_exit(&trans); +- return ret; ++ ++ BUG_ON(ret == -EINTR); ++ return ret == -ENOENT ? 
-ENODATA : ret; + } + + int bch2_xattr_set(struct btree_trans *trans, u64 inum, +@@ -294,6 +294,8 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) + if (ret) + break; + } ++ bch2_trans_iter_put(&trans, iter); ++ + ret = bch2_trans_exit(&trans) ?: ret; + + if (ret) +-- +cgit v1.2.3 + + +From acbac06954c16d3af6bce6ea3e3299f4ed91bfb2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Mar 2021 20:40:31 -0400 +Subject: bcachefs: Assert that iterators aren't being double freed + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 1 + + fs/bcachefs/buckets.c | 4 ++++ + 2 files changed, 5 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5a27b0bd3bc4..197100f0aee3 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1956,6 +1956,7 @@ int bch2_trans_iter_put(struct btree_trans *trans, + return 0; + + BUG_ON(trans->iters + iter->idx != iter); ++ BUG_ON(!btree_iter_live(trans, iter)); + + ret = btree_iter_err(iter); + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 5b92e9fc3ea6..5729123e515d 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1497,6 +1497,10 @@ static struct btree_iter *trans_get_update(struct btree_trans *trans, + bkey_cmp(pos, i->k->k.p) < 0 + : !bkey_cmp(pos, i->iter->pos))) { + *k = bkey_i_to_s_c(i->k); ++ ++ /* ugly hack.. */ ++ BUG_ON(btree_iter_live(trans, i->iter)); ++ trans->iters_live |= 1ULL << i->iter->idx; + return i->iter; + } + +-- +cgit v1.2.3 + + +From 4b1d5a529bc3ff98100067e6b8271e84e8ce909f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Mar 2021 23:19:05 -0400 +Subject: bcachefs: Kill bkey ops->debugcheck method + +This code used to be used for running some assertions on alloc info at +runtime, but it long predates fsck and hasn't been good for much in +ages - we can delete it now. 
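/*
 * Illustrative sketch (toy types and names, not the bcachefs API) of the
 * iterator discipline the hunks above enforce: every iterator obtained
 * from a btree transaction must be put before the transaction exits, a
 * put of an already-freed iterator trips an assertion, and
 * bch2_trans_exit() now reports any still-live iterators as leaks.  A
 * transaction tracks live iterators in a 64-bit bitmap, which is what
 * this toy models.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_MAX_ITERS	64

struct toy_trans {
	uint64_t	iters_live;	/* bit set => iterator slot in use */
};

static int toy_iter_get(struct toy_trans *trans)
{
	for (int idx = 0; idx < TOY_MAX_ITERS; idx++)
		if (!(trans->iters_live & (1ULL << idx))) {
			trans->iters_live |= 1ULL << idx;
			return idx;
		}
	return -1;			/* out of iterator slots */
}

static void toy_iter_put(struct toy_trans *trans, int idx)
{
	/* putting an iterator that isn't live is a double free: */
	if (!(trans->iters_live & (1ULL << idx)))
		fprintf(stderr, "double free of iterator %d!\n", idx);
	trans->iters_live &= ~(1ULL << idx);
}

static void toy_trans_exit(struct toy_trans *trans)
{
	/* mirrors the new leak report in bch2_trans_exit() above: */
	if (trans->iters_live)
		fprintf(stderr, "btree iterators leaked! bitmap=%llx\n",
			(unsigned long long) trans->iters_live);
}

int main(void)
{
	struct toy_trans trans = { 0 };
	int iter = toy_iter_get(&trans);

	if (iter < 0)
		return 1;

	/* ... use the iterator ... */

	toy_iter_put(&trans, iter);	/* forgetting this is now reported */
	toy_trans_exit(&trans);
	return 0;
}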
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 5 --- + fs/bcachefs/bkey_methods.h | 1 - + fs/bcachefs/btree_gc.c | 4 --- + fs/bcachefs/extents.c | 83 ---------------------------------------------- + fs/bcachefs/extents.h | 5 --- + 5 files changed, 98 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 79e249f49971..878befb5b9ef 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -149,7 +149,6 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) + + void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) + { +- const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + const char *invalid; + + BUG_ON(!k.k->u64s); +@@ -161,11 +160,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); +- return; + } +- +- if (ops->key_debugcheck) +- ops->key_debugcheck(c, k); + } + + void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +index 0bca725ae3b8..bfa6f112aeed 100644 +--- a/fs/bcachefs/bkey_methods.h ++++ b/fs/bcachefs/bkey_methods.h +@@ -26,7 +26,6 @@ struct bkey_ops { + /* Returns reason for being invalid if invalid, else NULL: */ + const char * (*key_invalid)(const struct bch_fs *, + struct bkey_s_c); +- void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + void (*swab)(struct bkey_s); +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index d89bc00aae0c..2bba36117a5a 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -389,8 +389,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + bkey_init(&prev.k->k); + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { +- bch2_bkey_debugcheck(c, b, k); +- + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + &k, max_stale, initial); + if (ret) +@@ -491,8 +489,6 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + bkey_init(&prev.k->k); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { +- bch2_bkey_debugcheck(c, b, k); +- + BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 26541c7bd616..148100be1013 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -164,46 +164,6 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) + return bch2_bkey_ptrs_invalid(c, k); + } + +-void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) +-{ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const struct bch_extent_ptr *ptr; +- const char *err; +- char buf[160]; +- struct bucket_mark mark; +- struct bch_dev *ca; +- +- if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) +- return; +- +- if (!percpu_down_read_trylock(&c->mark_lock)) +- return; +- +- bkey_for_each_ptr(ptrs, ptr) { +- ca = bch_dev_bkey_exists(c, ptr->dev); +- +- mark = ptr_bucket_mark(ca, ptr); +- +- err = "stale"; +- if (gen_after(mark.gen, ptr->gen)) +- goto err; +- +- err = "inconsistent"; +- if (mark.data_type != BCH_DATA_btree || +- mark.dirty_sectors < c->opts.btree_node_size) +- goto err; +- } +-out: +- percpu_up_read(&c->mark_lock); +- return; +-err: +- bch2_fs_inconsistent(c, 
"%s btree pointer %s: bucket %zi gen %i mark %08x", +- err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), +- PTR_BUCKET_NR(ca, ptr), +- mark.gen, (unsigned) mark.v.counter); +- goto out; +-} +- + void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) + { +@@ -247,49 +207,6 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) + return bch2_bkey_ptrs_invalid(c, k); + } + +-void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) +-{ +- struct bkey_s_c_extent e = bkey_s_c_to_extent(k); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- char buf[160]; +- +- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || +- !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) +- return; +- +- if (!percpu_down_read_trylock(&c->mark_lock)) +- return; +- +- extent_for_each_ptr_decode(e, p, entry) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); +- struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); +- unsigned stale = gen_after(mark.gen, p.ptr.gen); +- unsigned disk_sectors = ptr_disk_sectors(p); +- unsigned mark_sectors = p.ptr.cached +- ? mark.cached_sectors +- : mark.dirty_sectors; +- +- bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, +- "stale dirty pointer (ptr gen %u bucket %u", +- p.ptr.gen, mark.gen); +- +- bch2_fs_inconsistent_on(stale > 96, c, +- "key too stale: %i", stale); +- +- bch2_fs_inconsistent_on(!stale && +- (mark.data_type != BCH_DATA_user || +- mark_sectors < disk_sectors), c, +- "extent pointer not marked: %s:\n" +- "type %u sectors %u < %u", +- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), +- mark.data_type, +- mark_sectors, disk_sectors); +- } +- +- percpu_up_read(&c->mark_lock); +-} +- + void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) + { +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 3988315fc404..2ee50a24501e 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -368,7 +368,6 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + /* KEY_TYPE_btree_ptr: */ + + const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); +-void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +@@ -379,14 +378,12 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + + #define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ +- .key_debugcheck = bch2_btree_ptr_debugcheck, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ + } + + #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ +- .key_debugcheck = bch2_btree_ptr_debugcheck, \ + .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ +@@ -395,14 +392,12 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + /* KEY_TYPE_extent: */ + + const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); +-void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); + void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + enum merge_result bch2_extent_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); + + #define bch2_bkey_ops_extent (struct bkey_ops) { \ + .key_invalid = bch2_extent_invalid, \ +- .key_debugcheck = bch2_extent_debugcheck, \ + .val_to_text = bch2_extent_to_text, \ + .swab = bch2_ptr_swab, \ + 
.key_normalize = bch2_extent_normalize, \ +-- +cgit v1.2.3 + + +From 67049ec25b456f6acda7022bb7534db0f4cf725e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 19 Mar 2021 16:37:24 -0400 +Subject: bcachefs: Don't overwrite snapshot field in bch2_cut_back() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 148100be1013..66b93ea91127 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -1184,7 +1184,7 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) + + len = where.offset - bkey_start_offset(k.k); + +- k.k->p = where; ++ k.k->p.offset = where.offset; + k.k->size = len; + + if (!len) { +-- +cgit v1.2.3 + + +From dff4e45520283f697651887bdab861ffb19ce233 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 16:03:23 -0400 +Subject: bcachefs: Validate bset version field against sb version fields + +The superblock version fields need to be accurate to know whether a +filesystem is supported, thus we should be verifying them. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/btree_io.c | 20 ++++++++++++++++++++ + fs/bcachefs/super-io.c | 1 + + 3 files changed, 22 insertions(+) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 6cded61f8a78..17e3d55a1f06 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -593,6 +593,7 @@ struct bch_fs { + uuid_le user_uuid; + + u16 version; ++ u16 version_min; + u16 encoded_extent_max; + + u8 nr_devices; +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index cda0611418de..002025856236 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -560,6 +560,26 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + BTREE_ERR_FATAL, c, ca, b, i, + "unsupported bset version"); + ++ if (btree_err_on(version < c->sb.version_min, ++ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "bset version %u older than superblock version_min %u", ++ version, c->sb.version_min)) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version_min = cpu_to_le16(version); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (btree_err_on(version > c->sb.version, ++ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "bset version %u newer than superblock version %u", ++ version, c->sb.version)) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version = cpu_to_le16(version); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, + BTREE_ERR_FIXABLE, c, ca, b, i, + "bset past end of btree node")) { +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 761695c4afa1..6bb12d5e09e3 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -362,6 +362,7 @@ static void bch2_sb_update(struct bch_fs *c) + c->sb.uuid = src->uuid; + c->sb.user_uuid = src->user_uuid; + c->sb.version = le16_to_cpu(src->version); ++ c->sb.version_min = le16_to_cpu(src->version_min); + c->sb.nr_devices = src->nr_devices; + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); +-- +cgit v1.2.3 + + +From 44b27152195ab65172bcddd93ac3558f64194067 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 16:20:40 -0400 +Subject: bcachefs: Don't unconditially version_upgrade in initialize + +This is mkfs's job. Also, clean up the handling of feature bits some. 
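/*
 * Illustrative sketch (plain C stand-ins, not the in-tree types) of the
 * range check added by the bset-validation patch above: the superblock
 * records [version_min, version], every bset's version field must fall
 * inside that range, and an out-of-range but otherwise valid version
 * widens the range and forces a superblock rewrite instead of failing
 * the read.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_sb {
	unsigned	version_min;	/* oldest metadata version on disk */
	unsigned	version;	/* newest metadata version on disk */
};

/* Returns true if the superblock was widened and needs to be rewritten. */
static bool toy_validate_bset_version(struct toy_sb *sb, unsigned bset_version)
{
	bool write_sb = false;

	if (bset_version < sb->version_min) {
		fprintf(stderr,
			"bset version %u older than superblock version_min %u, fixing\n",
			bset_version, sb->version_min);
		sb->version_min = bset_version;
		write_sb = true;
	}

	if (bset_version > sb->version) {
		fprintf(stderr,
			"bset version %u newer than superblock version %u, fixing\n",
			bset_version, sb->version);
		sb->version = bset_version;
		write_sb = true;
	}

	return write_sb;
}

int main(void)
{
	struct toy_sb sb = { .version_min = 10, .version = 14 };

	if (toy_validate_bset_version(&sb, 9))	/* widens version_min to 9 */
		printf("superblock now covers [%u, %u]\n",
		       sb.version_min, sb.version);
	return 0;
}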
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 24 +----------------------- + fs/bcachefs/bcachefs_format.h | 4 ++-- + fs/bcachefs/recovery.c | 36 +++++++++++++++--------------------- + 3 files changed, 18 insertions(+), 46 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 71569dca3056..a10c1a41e4c9 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -114,25 +114,6 @@ static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, + #undef x + } + +-static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst, +- const struct bkey_alloc_unpacked src) +-{ +- struct bkey_i_alloc *a = bkey_alloc_init(&dst->k); +- void *d = a->v.data; +- unsigned bytes, idx = 0; +- +- a->k.p = POS(src.dev, src.bucket); +- a->v.fields = 0; +- a->v.gen = src.gen; +- +-#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name); +- BCH_ALLOC_FIELDS_V1() +-#undef x +- bytes = (void *) d - (void *) &a->v; +- set_bkey_val_bytes(&a->k, bytes); +- memset_u64s_tail(&a->v, 0, bytes); +-} +- + static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) + { +@@ -225,10 +206,7 @@ void bch2_alloc_pack(struct bch_fs *c, + struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) + { +- if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2)) +- bch2_alloc_pack_v2(dst, src); +- else +- bch2_alloc_pack_v1(dst, src); ++ bch2_alloc_pack_v2(dst, src); + } + + static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index f1526ce6812d..3de414ceb267 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1376,6 +1376,7 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); + ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_btree_updates_journalled)|\ ++ (1ULL << BCH_FEATURE_alloc_v2)|\ + (1ULL << BCH_FEATURE_extents_across_btree_nodes)) + + #define BCH_SB_FEATURES_ALL \ +@@ -1383,8 +1384,7 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); + (1ULL << BCH_FEATURE_new_siphash)| \ + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ + (1ULL << BCH_FEATURE_new_varint)| \ +- (1ULL << BCH_FEATURE_journal_no_flush)| \ +- (1ULL << BCH_FEATURE_alloc_v2)) ++ (1ULL << BCH_FEATURE_journal_no_flush)) + + enum bch_sb_feature { + #define x(f, n) BCH_FEATURE_##f, +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 03a25dd5acc6..92f7568175eb 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -991,11 +991,17 @@ int bch2_fs_recovery(struct bch_fs *c) + goto err; + } + ++ if (!c->sb.clean && ++ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { ++ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); ++ ret = -EINVAL; ++ goto err; ++ } ++ + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { + bch_info(c, "alloc_v2 feature bit not set, fsck required"); + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_alloc_v2; + } + + if (!c->replicas.entries || +@@ -1061,13 +1067,6 @@ use_clean: + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + +- if (!c->sb.clean && +- !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { +- bch_err(c, "filesystem needs recovery from older version; run 
fsck from older bcachefs-tools to fix"); +- ret = -EINVAL; +- goto err; +- } +- + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + drop_alloc_keys(&c->journal_keys); +@@ -1228,9 +1227,6 @@ use_clean: + + mutex_lock(&c->sb_lock); + if (c->opts.version_upgrade) { +- if (c->sb.version < bcachefs_metadata_version_new_versioning) +- c->disk_sb.sb->version_min = +- le16_to_cpu(bcachefs_metadata_version_min); + c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + write_sb = true; +@@ -1288,19 +1284,17 @@ int bch2_fs_initialize(struct bch_fs *c) + bch_notice(c, "initializing new filesystem"); + + mutex_lock(&c->sb_lock); +- for_each_online_member(ca, c, i) +- bch2_mark_dev_superblock(c, ca, 0); +- mutex_unlock(&c->sb_lock); +- +- mutex_lock(&c->sb_lock); +- c->disk_sb.sb->version = c->disk_sb.sb->version_min = +- le16_to_cpu(bcachefs_metadata_version_current); +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; +- c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; + +- bch2_write_super(c); ++ if (c->opts.version_upgrade) { ++ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ bch2_write_super(c); ++ } ++ ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, 0); + mutex_unlock(&c->sb_lock); + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); +-- +cgit v1.2.3 + + +From 68c8d83cbea1a6842709f8755985d5970e43e340 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Mar 2021 22:05:39 -0400 +Subject: bcachefs: Fix iterator picking + +comparison was wrong + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 197100f0aee3..76bd4e6e563d 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2058,7 +2058,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + continue; + + if (best && +- bkey_cmp(bpos_diff(best->pos, pos), ++ bkey_cmp(bpos_diff(best->real_pos, pos), + bpos_diff(iter->real_pos, pos)) < 0) + continue; + +-- +cgit v1.2.3 + + +From 59abd4aec2a18a450c1a907d10afd4b4a4f9981f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Mar 2021 22:13:30 -0400 +Subject: bcachefs: Optimize bch2_btree_iter_verify_level() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 76bd4e6e563d..bb785b7700c5 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -492,9 +492,9 @@ static void bch2_btree_iter_verify_cached(struct btree_iter *iter) + static void bch2_btree_iter_verify_level(struct btree_iter *iter, + unsigned level) + { +- struct btree_iter_level *l = &iter->l[level]; +- struct btree_node_iter tmp = l->iter; +- bool locked = btree_node_locked(iter, level); ++ struct btree_iter_level *l; ++ struct btree_node_iter tmp; ++ bool locked; + struct bkey_packed *p, *k; + char buf1[100], buf2[100], buf3[100]; + const char *msg; +@@ -502,6 +502,10 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, + if (!bch2_debug_check_iterators) + return; + ++ l = 
&iter->l[level]; ++ tmp = l->iter; ++ locked = btree_node_locked(iter, level); ++ + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + if (!level) + bch2_btree_iter_verify_cached(iter); +-- +cgit v1.2.3 + + +From 2db3b276a10eab29c803c9a4835f4ded18f73397 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Mar 2021 21:04:57 -0400 +Subject: bcachefs: Switch extent_handle_overwrites() to one key at a time + +Prep work for snapshots + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 103 ++++++++++++++++++---------------------- + 1 file changed, 46 insertions(+), 57 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index ed3009b8b157..d9308bd49fc9 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -690,8 +690,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + +-static int __bch2_trans_update2(struct btree_trans *trans, +- struct btree_insert_entry n) ++static void __bch2_trans_update2(struct btree_trans *trans, ++ struct btree_insert_entry n) + { + struct btree_insert_entry *i; + +@@ -711,15 +711,13 @@ static int __bch2_trans_update2(struct btree_trans *trans, + else + array_insert_item(trans->updates2, trans->nr_updates2, + i - trans->updates2, n); +- +- return 0; + } + +-static int bch2_trans_update2(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) ++static void bch2_trans_update2(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) + { +- return __bch2_trans_update2(trans, (struct btree_insert_entry) { ++ __bch2_trans_update2(trans, (struct btree_insert_entry) { + .bkey_type = __btree_node_type(iter->level, iter->btree_id), + .btree_id = iter->btree_id, + .level = iter->level, +@@ -745,82 +743,81 @@ static int extent_update_to_keys(struct btree_trans *trans, + BTREE_ITER_NOT_EXTENTS); + n.is_extent = false; + +- ret = __bch2_trans_update2(trans, n); ++ __bch2_trans_update2(trans, n); + bch2_trans_iter_put(trans, n.iter); +- return ret; ++ return 0; + } + + static int extent_handle_overwrites(struct btree_trans *trans, + enum btree_id btree_id, +- struct bpos start, struct bpos end) ++ struct bkey_i *insert) + { + struct btree_iter *iter, *update_iter; ++ struct bpos start = bkey_start_pos(&insert->k); + struct bkey_i *update; + struct bkey_s_c k; + int ret = 0; + +- iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); ++ iter = bch2_trans_get_iter(trans, btree_id, start, ++ BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_with_updates(iter); + + while (k.k && !(ret = bkey_err(k))) { +- if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) ++ if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) + break; + + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) +- goto err; ++ break; + + bkey_reassemble(update, k); ++ + bch2_cut_back(start, update); + +- update_iter = bch2_trans_copy_iter(trans, iter); +- update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; +- bch2_btree_iter_set_pos(update_iter, update->k.p); +- ret = bch2_trans_update2(trans, update_iter, update); ++ update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); +- if (ret) +- goto err; + } + +- if (bkey_cmp(k.k->p, end) > 0) { +- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if 
(bkey_cmp(k.k->p, insert->k.p) < 0 || ++ (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) { ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + if ((ret = PTR_ERR_OR_ZERO(update))) +- goto err; ++ break; + +- bkey_reassemble(update, k); +- bch2_cut_front(end, update); ++ bkey_init(&update->k); ++ update->k.p = k.k->p; + +- update_iter = bch2_trans_copy_iter(trans, iter); +- update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; +- bch2_btree_iter_set_pos(update_iter, update->k.p); +- ret = bch2_trans_update2(trans, update_iter, update); ++ update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); +- if (ret) +- goto err; +- } else { +- update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); ++ } ++ ++ if (bkey_cmp(k.k->p, insert->k.p) > 0) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) +- goto err; ++ break; + +- update->k = *k.k; +- set_bkey_val_u64s(&update->k, 0); +- update->k.type = KEY_TYPE_deleted; +- update->k.size = 0; ++ bkey_reassemble(update, k); ++ bch2_cut_front(insert->k.p, update); + +- update_iter = bch2_trans_copy_iter(trans, iter); +- update_iter->flags &= ~BTREE_ITER_IS_EXTENTS; +- bch2_btree_iter_set_pos(update_iter, update->k.p); +- ret = bch2_trans_update2(trans, update_iter, update); ++ update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); +- if (ret) +- goto err; ++ break; + } + + k = bch2_btree_iter_next_with_updates(iter); + } +-err: + bch2_trans_iter_put(trans, iter); ++ + return ret; + } + +@@ -885,24 +882,16 @@ int __bch2_trans_commit(struct btree_trans *trans) + /* Turn extents updates into keys: */ + trans_for_each_update(trans, i) + if (i->is_extent) { +- struct bpos start = bkey_start_pos(&i->k->k); +- +- while (i + 1 < trans->updates + trans->nr_updates && +- i[0].btree_id == i[1].btree_id && +- !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) +- i++; +- +- ret = extent_handle_overwrites(trans, i->btree_id, +- start, i->k->k.p); +- if (ret) ++ ret = extent_handle_overwrites(trans, i->btree_id, i->k); ++ if (unlikely(ret)) + goto out; + } + + trans_for_each_update(trans, i) { + ret = i->is_extent + ? 
extent_update_to_keys(trans, *i) +- : __bch2_trans_update2(trans, *i); +- if (ret) ++ : (__bch2_trans_update2(trans, *i), 0); ++ if (unlikely(ret)) + goto out; + } + +-- +cgit v1.2.3 + + +From 3001ba7c62557f14f1101f75515f5dfa9d78b4a6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 00:03:34 -0400 +Subject: bcachefs: Get disk reservation when overwriting data in old snapshot + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 263f7a6db42a..07b7a648b0c9 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -214,9 +214,10 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + +- *disk_sectors_delta += sectors * +- (int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) - +- bch2_bkey_nr_ptrs_fully_allocated(old)); ++ *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); ++ *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot ++ ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) ++ : 0; + + if (!*should_check_enospc && + (new_replicas > bch2_bkey_replicas(c, old) || +-- +cgit v1.2.3 + + +From 57f766d9c4fafc03a3e932bf8f6c5cd96f5f5fce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 16:55:25 -0400 +Subject: bcachefs: Replace bch2_btree_iter_next() calls with + bch2_btree_iter_advance + +The way btree iterators work internally has been changing, particularly +with the iter->real_pos changes, and bch2_btree_iter_next() is no longer +hyper optimized - it's just advance followed by peek, so it's more +efficient to just call advance where we're not using the return value of +bch2_btree_iter_next(). 
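/*
 * Illustrative sketch (a toy cursor over an int array, not the bcachefs
 * iterator) of the point made above: next() is nothing more than
 * advance() followed by peek(), so a loop that already peeks at the top
 * only needs a bare advance() at the bottom.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_iter {
	const int	*keys;
	size_t		nr;
	size_t		pos;
};

static const int *toy_peek(struct toy_iter *iter)
{
	return iter->pos < iter->nr ? &iter->keys[iter->pos] : NULL;
}

static bool toy_advance(struct toy_iter *iter)
{
	if (iter->pos >= iter->nr)
		return false;
	iter->pos++;
	return true;
}

/* next == advance-then-peek, exactly as described above: */
static const int *toy_next(struct toy_iter *iter)
{
	return toy_advance(iter) ? toy_peek(iter) : NULL;
}

int main(void)
{
	static const int keys[] = { 1, 2, 3 };
	struct toy_iter iter = { keys, 3, 0 };
	const int *k;

	/* The loop shape the conversions above settle on: peek, work, advance. */
	while ((k = toy_peek(&iter))) {
		printf("key %d\n", *k);
		toy_advance(&iter);	/* return value unused: no need for next() */
	}

	/* Same walk via next(); in this toy it skips the starting key, since
	 * next() advances before peeking. */
	iter.pos = 0;
	while ((k = toy_next(&iter)))
		printf("next saw %d\n", *k);

	return 0;
}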
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/btree_iter.c | 16 ++++++++-------- + fs/bcachefs/btree_iter.h | 4 ++-- + fs/bcachefs/debug.c | 2 +- + fs/bcachefs/ec.c | 4 ++-- + fs/bcachefs/fs.c | 2 +- + fs/bcachefs/fsck.c | 8 ++++---- + fs/bcachefs/migrate.c | 2 +- + fs/bcachefs/move.c | 2 +- + 9 files changed, 21 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 2bba36117a5a..c57bcc3f841c 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1208,7 +1208,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + } + } + +- bch2_btree_iter_next(iter); ++ bch2_btree_iter_advance(iter); + } + bch2_trans_iter_put(&trans, iter); + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index bb785b7700c5..3e6c167d9fd0 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1501,7 +1501,7 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + } + +-inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) ++inline bool bch2_btree_iter_advance(struct btree_iter *iter) + { + struct bpos pos = iter->k.p; + bool ret = bkey_cmp(pos, POS_MAX) != 0; +@@ -1512,7 +1512,7 @@ inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) + return ret; + } + +-inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) ++inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + { + struct bpos pos = bkey_start_pos(&iter->k); + bool ret = bkey_cmp(pos, POS_MIN) != 0; +@@ -1637,7 +1637,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + { +- if (!bch2_btree_iter_advance_pos(iter)) ++ if (!bch2_btree_iter_advance(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek(iter); +@@ -1691,7 +1691,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + k = __bch2_btree_iter_peek_with_updates(iter); + + if (k.k && bkey_deleted(k.k)) { +- if (!bch2_btree_iter_advance_pos(iter)) ++ if (!bch2_btree_iter_advance(iter)) + return bkey_s_c_null; + continue; + } +@@ -1716,7 +1716,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + + struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) + { +- if (!bch2_btree_iter_advance_pos(iter)) ++ if (!bch2_btree_iter_advance(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_with_updates(iter); +@@ -1793,7 +1793,7 @@ no_key: + */ + struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) + { +- if (!bch2_btree_iter_rewind_pos(iter)) ++ if (!bch2_btree_iter_rewind(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_prev(iter); +@@ -1885,7 +1885,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + + struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + { +- if (!bch2_btree_iter_advance_pos(iter)) ++ if (!bch2_btree_iter_advance(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +@@ -1893,7 +1893,7 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + + struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) + { +- if (!bch2_btree_iter_rewind_pos(iter)) ++ if (!bch2_btree_iter_rewind(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 
c839bfe6ffa4..1276d8aaf652 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -175,8 +175,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); + + struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); + +-bool bch2_btree_iter_advance_pos(struct btree_iter *); +-bool bch2_btree_iter_rewind_pos(struct btree_iter *); ++bool bch2_btree_iter_advance(struct btree_iter *); ++bool bch2_btree_iter_rewind(struct btree_iter *); + void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); + + /* Sort order for locking btree iterators: */ +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index cce747da8b9e..c6d49f44aa01 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -356,7 +356,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + if (err) + break; + +- bch2_btree_iter_next(iter); ++ bch2_btree_iter_advance(iter); + i->from = iter->pos; + + err = flush_buf(i); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 500094e7000d..a13d4e138314 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -842,13 +842,13 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { +- bch2_btree_iter_next(iter); ++ bch2_btree_iter_advance(iter); + continue; + } + + block = bkey_matches_stripe(&s->key.v, k); + if (block < 0) { +- bch2_btree_iter_next(iter); ++ bch2_btree_iter_advance(iter); + continue; + } + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 43f17967ea54..095b3109ed29 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -913,7 +913,7 @@ retry: + + if (!bkey_extent_is_data(k.k) && + k.k->type != KEY_TYPE_reservation) { +- bch2_btree_iter_next(iter); ++ bch2_btree_iter_advance(iter); + continue; + } + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index f8e0b24d087a..ffb30ef7ef00 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -547,7 +547,7 @@ retry: + i_sectors += k.k->size; + bch2_bkey_buf_reassemble(&prev, c, k); + +- bch2_btree_iter_advance_pos(iter); ++ bch2_btree_iter_advance(iter); + } + fsck_err: + if (ret == -EINTR) +@@ -703,7 +703,7 @@ retry: + + } + +- bch2_btree_iter_advance_pos(iter); ++ bch2_btree_iter_advance(iter); + } + + hash_stop_chain(&trans, &h); +@@ -762,7 +762,7 @@ retry: + if (ret) + break; + +- bch2_btree_iter_advance_pos(iter); ++ bch2_btree_iter_advance(iter); + } + fsck_err: + if (ret == -EINTR) +@@ -1389,7 +1389,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); + if (nlinks_pos == iter->pos.offset) + genradix_iter_advance(&nlinks_iter, links); + +- bch2_btree_iter_advance_pos(iter); ++ bch2_btree_iter_advance(iter); + bch2_trans_cond_resched(&trans); + } + fsck_err: +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 4d8b4169923d..ef69a19f494a 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -53,7 +53,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { + if (!bch2_bkey_has_device(k, dev_idx)) { +- bch2_btree_iter_next(iter); ++ bch2_btree_iter_advance(iter); + continue; + } + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 1403af076f10..7448ea36abd9 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -639,7 +639,7 @@ next: + atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), + &stats->sectors_seen); + next_nondata: +- bch2_btree_iter_next(iter); 
++ bch2_btree_iter_advance(iter); + bch2_trans_cond_resched(&trans); + } + out: +-- +cgit v1.2.3 + + +From 4a25c8248fdf0d2c4da4332d2e608386328a0c77 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 18:09:02 -0400 +Subject: bcachefs: Have btree_iter_next_node() use btree_iter_set_search_pos() + +btree node iterators need to obey the regular btree node invarionts +w.r.t. iter->real_pos; once they do, bch2_btree_iter_traverse will have +less that it needs to check. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 32 ++++++++++++-------------------- + 1 file changed, 12 insertions(+), 20 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 3e6c167d9fd0..51e52319912d 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -16,6 +16,8 @@ + #include + #include + ++static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); ++ + static inline bool is_btree_node(struct btree_iter *iter, unsigned l) + { + return l < BTREE_MAX_DEPTH && +@@ -1144,11 +1146,6 @@ err: + return ret; + } + +-static void btree_iter_up(struct btree_iter *iter) +-{ +- btree_node_unlock(iter, iter->level++); +-} +- + static int btree_iter_traverse_one(struct btree_iter *, unsigned long); + + static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) +@@ -1400,11 +1397,11 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + + bch2_trans_cond_resched(iter->trans); + +- btree_iter_up(iter); +- +- if (!bch2_btree_node_relock(iter, iter->level)) +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ btree_node_unlock(iter, iter->level); ++ iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; ++ iter->level++; + ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + ret = bch2_btree_iter_traverse(iter); + if (ret) + return NULL; +@@ -1419,20 +1416,15 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + * Haven't gotten to the end of the parent node: go back down to + * the next child node + */ ++ btree_iter_set_search_pos(iter, bkey_successor(iter->pos)); + +- /* +- * We don't really want to be unlocking here except we can't +- * directly tell btree_iter_traverse() "traverse to this level" +- * except by setting iter->level, so we have to unlock so we +- * don't screw up our lock invariants: +- */ +- if (btree_node_read_locked(iter, iter->level)) +- btree_node_unlock(iter, iter->level); +- +- iter->pos = iter->real_pos = bkey_successor(iter->pos); +- iter->level = iter->min_depth; ++ /* Unlock to avoid screwing up our lock invariants: */ ++ btree_node_unlock(iter, iter->level); + ++ iter->level = iter->min_depth; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ bch2_btree_iter_verify(iter); ++ + ret = bch2_btree_iter_traverse(iter); + if (ret) + return NULL; +-- +cgit v1.2.3 + + +From a6483f349ec0f5e69c97aab62dd29e39da475730 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 17:09:55 -0400 +Subject: bcachefs: Iterators are now always consistent with iter->real_pos + +This means bch2_btree_iter_traverse_one() can be made more efficient. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 20 ++------------------ + 1 file changed, 2 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 51e52319912d..3cc191da6616 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1234,9 +1234,9 @@ static inline bool btree_iter_good_node(struct btree_iter *iter, + !bch2_btree_node_relock(iter, l)) + return false; + +- if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) ++ if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) + return false; +- if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) ++ if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) + return false; + return true; + } +@@ -1287,24 +1287,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + if (unlikely(iter->level >= BTREE_MAX_DEPTH)) + return 0; + +- /* +- * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos +- * here unnecessary +- */ + iter->level = btree_iter_up_until_good_node(iter, 0); + +- /* +- * If we've got a btree node locked (i.e. we aren't about to relock the +- * root) - advance its node iterator if necessary: +- * +- * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary +- */ +- if (is_btree_node(iter, iter->level)) { +- BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); +- +- btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); +- } +- + /* + * Note: iter->nodes[iter->level] may be temporarily NULL here - that + * would indicate to other code that we got to the end of the btree, +-- +cgit v1.2.3 + + +From faaa76492f35966d287f90fc755367499ba62acc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 17:01:34 -0400 +Subject: bcachefs: Kill btree_iter_peek_uptodate() + +Since we're no longer doing next() immediately followed by peek(), this +optimization isn't doing anything anymore. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 51 ------------------------------------------------ + 1 file changed, 51 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 3cc191da6616..c18ee533a034 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1346,9 +1346,6 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); + bch2_btree_iter_verify(iter); + +- if (iter->uptodate == BTREE_ITER_UPTODATE) +- return iter->l[iter->level].b; +- + ret = bch2_btree_iter_traverse(iter); + if (ret) + return NULL; +@@ -1360,7 +1357,6 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); + + iter->pos = iter->real_pos = b->key.k.p; +- iter->uptodate = BTREE_ITER_UPTODATE; + + bch2_btree_iter_verify(iter); + +@@ -1417,7 +1413,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + } + + iter->pos = iter->real_pos = b->key.k.p; +- iter->uptodate = BTREE_ITER_UPTODATE; + + bch2_btree_iter_verify(iter); + +@@ -1530,34 +1525,6 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + return ret; + } + +-/** +- * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key +- * it currently points to +- */ +-static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) +-{ +- struct btree_iter_level *l = &iter->l[0]; +- struct bkey_s_c ret = { .k = &iter->k }; +- +- if (!bkey_deleted(&iter->k)) { +- struct bkey_packed *_k = +- __bch2_btree_node_iter_peek_all(&l->iter, l->b); +- +- ret.v = bkeyp_val(&l->b->format, _k); +- +- if (bch2_debug_check_iterators) { +- struct bkey k = bkey_unpack_key(l->b, _k); +- +- BUG_ON(memcmp(&k, &iter->k, sizeof(k))); +- } +- +- if (bch2_debug_check_bkeys) +- bch2_bkey_debugcheck(iter->trans->c, l->b, ret); +- } +- +- return ret; +-} +- + /** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position +@@ -1574,10 +1541,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + +- if (iter->uptodate == BTREE_ITER_UPTODATE && +- !bkey_deleted(&iter->k)) +- return btree_iter_peek_uptodate(iter); +- + while (1) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) +@@ -1600,8 +1563,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + + iter->real_pos = k.k->p; + +- iter->uptodate = BTREE_ITER_UPTODATE; +- + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + return k; +@@ -1686,7 +1647,6 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + +- iter->uptodate = BTREE_ITER_UPTODATE; + return k; + } + +@@ -1714,10 +1674,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + + btree_iter_set_search_pos(iter, iter->pos); + +- if (iter->uptodate == BTREE_ITER_UPTODATE && +- !bkey_deleted(&iter->k)) +- return btree_iter_peek_uptodate(iter); +- + while (1) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) { +@@ -1747,7 +1703,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + if (bkey_cmp(k.k->p, iter->pos) < 0) + iter->pos = k.k->p; + iter->real_pos = k.k->p; +- iter->uptodate = BTREE_ITER_UPTODATE; + out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); +@@ 
-1812,8 +1767,6 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) + + EBUG_ON(!iter->k.size); + +- iter->uptodate = BTREE_ITER_UPTODATE; +- + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + +@@ -1832,9 +1785,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + +- if (iter->uptodate == BTREE_ITER_UPTODATE) +- return btree_iter_peek_uptodate(iter); +- + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return __bch2_btree_iter_peek_slot_extents(iter); + +@@ -1853,7 +1803,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + k = (struct bkey_s_c) { &iter->k, NULL }; + } + +- iter->uptodate = BTREE_ITER_UPTODATE; + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + return k; +-- +cgit v1.2.3 + + +From ad2a2b501afce36b307043ee5878ab3fb23f6647 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 19:22:58 -0400 +Subject: bcachefs: Internal btree iterator renaming + +This just gives some internal helpers some better names. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 26 +++++++++++++------------- + 1 file changed, 13 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c18ee533a034..91d9452a8642 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -815,23 +815,23 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, + } + + /* peek_all() doesn't skip deleted keys */ +-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, +- struct btree_iter_level *l, +- struct bkey *u) ++static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ struct bkey *u) + { + return __btree_iter_unpack(iter, l, u, + bch2_btree_node_iter_peek_all(&l->iter, l->b)); + } + +-static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, +- struct btree_iter_level *l) ++static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter, ++ struct btree_iter_level *l) + { + return __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&l->iter, l->b)); + } + +-static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, +- struct btree_iter_level *l) ++static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter, ++ struct btree_iter_level *l) + { + return __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_prev(&l->iter, l->b)); +@@ -1546,7 +1546,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + if (unlikely(ret)) + return bkey_s_c_err(ret); + +- k = __btree_iter_peek(iter, l); ++ k = btree_iter_level_peek(iter, l); + if (likely(k.k)) + break; + +@@ -1600,7 +1600,7 @@ static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) + static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + { + struct btree_iter_level *l = &iter->l[0]; +- struct bkey_s_c k = __btree_iter_peek(iter, l); ++ struct bkey_s_c k = btree_iter_level_peek(iter, l); + struct bkey_s_c u = __btree_trans_updates_peek(iter); + + if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) +@@ -1681,12 +1681,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + goto no_key; + } + +- k = __btree_iter_peek(iter, l); ++ k = btree_iter_level_peek(iter, l); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) + ? 
bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 + : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)) +- k = __btree_iter_prev(iter, l); ++ k = btree_iter_level_prev(iter, l); + + if (likely(k.k)) + break; +@@ -1709,7 +1709,7 @@ out: + return k; + no_key: + /* +- * __btree_iter_peek() may have set iter->k to a key we didn't want, and ++ * btree_iter_level_peek() may have set iter->k to a key we didn't want, and + * then we errored going to the previous leaf - make sure it's + * consistent with iter->pos: + */ +@@ -1792,7 +1792,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + if (unlikely(ret)) + return bkey_s_c_err(ret); + +- k = __btree_iter_peek_all(iter, l, &iter->k); ++ k = btree_iter_level_peek_all(iter, l, &iter->k); + + EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); + +-- +cgit v1.2.3 + + +From 707fcf9b8a3d68120fab486b5b73d07f33a2991e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 19:32:01 -0400 +Subject: bcachefs: Improve iter->real_pos handling + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 21 ++++++++++++++------- + 1 file changed, 14 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 91d9452a8642..993d613a8208 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -826,15 +826,21 @@ static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, + static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter, + struct btree_iter_level *l) + { +- return __btree_iter_unpack(iter, l, &iter->k, ++ struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ iter->real_pos = k.k ? k.k->p : l->b->key.k.p; ++ return k; + } + + static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter, + struct btree_iter_level *l) + { +- return __btree_iter_unpack(iter, l, &iter->k, ++ struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_prev(&l->iter, l->b)); ++ ++ iter->real_pos = k.k ? 
k.k->p : l->b->data->min_key; ++ return k; + } + + static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, +@@ -1531,7 +1537,6 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + { +- struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + int ret; + +@@ -1546,7 +1551,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + if (unlikely(ret)) + return bkey_s_c_err(ret); + +- k = btree_iter_level_peek(iter, l); ++ k = btree_iter_level_peek(iter, &iter->l[0]); + if (likely(k.k)) + break; + +@@ -1561,8 +1566,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + +- iter->real_pos = k.k->p; +- + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + return k; +@@ -1619,6 +1622,9 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + bch2_btree_iter_verify(iter); ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + + while (1) { + ret = bch2_btree_iter_traverse(iter); +@@ -1647,6 +1653,8 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); + return k; + } + +@@ -1702,7 +1710,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + /* Extents can straddle iter->pos: */ + if (bkey_cmp(k.k->p, iter->pos) < 0) + iter->pos = k.k->p; +- iter->real_pos = k.k->p; + out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); +-- +cgit v1.2.3 + + +From 6839e3a44798e84fd017fa5132f1d1f6f4bf47b6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 19:43:31 -0400 +Subject: bcachefs: Consolidate bch2_btree_iter_peek() and peek_with_updates() + +Ideally we'll be getting rid of peek_with_updates(), but the callers +will need to be checked. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 127 ++++++++++++++++++----------------------------- + 1 file changed, 47 insertions(+), 80 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 993d613a8208..2d53a75decff 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1531,12 +1531,28 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + return ret; + } + +-/** +- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's +- * current position +- */ +-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos) + { ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update2(trans, i) ++ if ((cmp_int(btree_id, i->iter->btree_id) ?: ++ bkey_cmp(pos, i->k->k.p)) <= 0) { ++ if (btree_id == i->iter->btree_id) ++ return i->k; ++ break; ++ } ++ ++ return NULL; ++} ++ ++static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates) ++{ ++ struct bpos search_key = btree_iter_search_key(iter); ++ struct bkey_i *next_update = with_updates ++ ? 
btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) ++ : NULL; + struct bkey_s_c k; + int ret; + +@@ -1544,7 +1560,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + +- btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); ++ btree_iter_set_search_pos(iter, search_key); + + while (1) { + ret = bch2_btree_iter_traverse(iter); +@@ -1552,16 +1568,28 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + return bkey_s_c_err(ret); + + k = btree_iter_level_peek(iter, &iter->l[0]); +- if (likely(k.k)) ++ ++ if (next_update && ++ bkey_cmp(next_update->k.p, iter->real_pos) <= 0) ++ k = bkey_i_to_s_c(next_update); ++ ++ if (likely(k.k)) { ++ if (bkey_deleted(k.k)) { ++ btree_iter_set_search_pos(iter, ++ bkey_successor(k.k->p)); ++ continue; ++ } ++ + break; ++ } + + if (!btree_iter_set_pos_to_next_leaf(iter)) + return bkey_s_c_null; + } + + /* +- * iter->pos should always be equal to the key we just +- * returned - except extents can straddle iter->pos: ++ * iter->pos should be mononotically increasing, and always be equal to ++ * the key we just returned - except extents can straddle iter->pos: + */ + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); +@@ -1571,6 +1599,15 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + return k; + } + ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ return __btree_iter_peek(iter, false); ++} ++ + /** + * bch2_btree_iter_next: returns first key greater than iterator's current + * position +@@ -1583,79 +1620,9 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + return bch2_btree_iter_peek(iter); + } + +-static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) +-{ +- struct bpos pos = btree_iter_search_key(iter); +- struct btree_trans *trans = iter->trans; +- struct btree_insert_entry *i; +- +- trans_for_each_update2(trans, i) +- if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: +- bkey_cmp(pos, i->k->k.p)) <= 0) +- break; +- +- return i < trans->updates2 + trans->nr_updates2 && +- iter->btree_id == i->iter->btree_id +- ? 
bkey_i_to_s_c(i->k) +- : bkey_s_c_null; +-} +- +-static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) +-{ +- struct btree_iter_level *l = &iter->l[0]; +- struct bkey_s_c k = btree_iter_level_peek(iter, l); +- struct bkey_s_c u = __btree_trans_updates_peek(iter); +- +- if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) +- return k; +- if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { +- iter->k = *u.k; +- return u; +- } +- return bkey_s_c_null; +-} +- + struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) + { +- struct bkey_s_c k; +- int ret; +- +- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); +- bch2_btree_iter_verify(iter); +- bch2_btree_iter_verify_entry_exit(iter); +- +- btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); +- +- while (1) { +- ret = bch2_btree_iter_traverse(iter); +- if (unlikely(ret)) +- return bkey_s_c_err(ret); +- +- k = __bch2_btree_iter_peek_with_updates(iter); +- +- if (k.k && bkey_deleted(k.k)) { +- if (!bch2_btree_iter_advance(iter)) +- return bkey_s_c_null; +- continue; +- } +- +- if (likely(k.k)) +- break; +- +- if (!btree_iter_set_pos_to_next_leaf(iter)) +- return bkey_s_c_null; +- } +- +- /* +- * iter->pos should be mononotically increasing, and always be equal to +- * the key we just returned - except extents can straddle iter->pos: +- */ +- if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) +- iter->pos = bkey_start_pos(k.k); +- +- bch2_btree_iter_verify_entry_exit(iter); +- bch2_btree_iter_verify(iter); +- return k; ++ return __btree_iter_peek(iter, true); + } + + struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) +-- +cgit v1.2.3 + + +From dfd25f9c1d94cc4ea64bc5f77e244bae3a897471 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 21:16:52 -0400 +Subject: bcachefs: Update iter->real_pos lazily + +peek() has to update iter->real_pos - there's no need for +bch2_btree_iter_set_pos() to update it as well. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 9 +-------- + fs/bcachefs/btree_iter.h | 7 ++++++- + 2 files changed, 7 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 2d53a75decff..69b24e0cb955 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1470,14 +1470,6 @@ out: + bch2_btree_iter_verify(iter); + } + +-void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +-{ +- bkey_init(&iter->k); +- iter->k.p = iter->pos = new_pos; +- +- btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); +-} +- + inline bool bch2_btree_iter_advance(struct btree_iter *iter) + { + struct bpos pos = iter->k.p; +@@ -1994,6 +1986,7 @@ alloc_iter: + __bch2_btree_iter_upgrade_nounlock(iter, 1); + + bch2_btree_iter_set_pos(iter, pos); ++ btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + + return iter; + } +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 1276d8aaf652..3ae19e2900a6 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -177,7 +177,12 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); + + bool bch2_btree_iter_advance(struct btree_iter *); + bool bch2_btree_iter_rewind(struct btree_iter *); +-void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); ++ ++static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; ++} + + /* Sort order for locking btree iterators: */ + static inline int btree_iter_lock_cmp(const struct btree_iter *l, +-- +cgit v1.2.3 + + +From c3f5b135f973d7ca628f1cef1d06428a32e654cb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 22 Mar 2021 15:50:02 -0400 +Subject: bcachefs: Include snapshot field in bch2_bpos_to_text + +More prep work for snapshots. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 878befb5b9ef..641169ef91b5 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -169,8 +169,22 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) + pr_buf(out, "POS_MIN"); + else if (!bkey_cmp(pos, POS_MAX)) + pr_buf(out, "POS_MAX"); +- else +- pr_buf(out, "%llu:%llu", pos.inode, pos.offset); ++ else { ++ if (pos.inode == U64_MAX) ++ pr_buf(out, "U64_MAX"); ++ else ++ pr_buf(out, "%llu", pos.inode); ++ pr_buf(out, ":"); ++ if (pos.offset == U64_MAX) ++ pr_buf(out, "U64_MAX"); ++ else ++ pr_buf(out, "%llu", pos.offset); ++ pr_buf(out, ":"); ++ if (pos.snapshot == U32_MAX) ++ pr_buf(out, "U32_MAX"); ++ else ++ pr_buf(out, "%u", pos.snapshot); ++ } + } + + void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) +@@ -185,8 +199,7 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) + + bch2_bpos_to_text(out, k->p); + +- pr_buf(out, " snap %u len %u ver %llu", +- k->p.snapshot, k->size, k->version.lo); ++ pr_buf(out, " len %u ver %llu", k->size, k->version.lo); + } else { + pr_buf(out, "(null)"); + } +-- +cgit v1.2.3 + + +From faa1616ad32da359bc3b210915155ad8d0ac028b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 22 Mar 2021 17:23:30 -0400 +Subject: bcachefs: Add an .invalid method for bch2_btree_ptr_v2 + +It was using the method for btree_ptr_v1, but that wasn't checking all +the fields. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 18 +++++++++++++++++- + fs/bcachefs/extents.h | 3 ++- + 2 files changed, 19 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 66b93ea91127..a7e0408213a9 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -158,7 +158,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + + const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) + { +- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) ++ if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) + return "value too big"; + + return bch2_bkey_ptrs_invalid(c, k); +@@ -170,6 +170,22 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + bch2_bkey_ptrs_to_text(out, c, k); + } + ++const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) ++ return "value too small"; ++ ++ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) ++ return "value too big"; ++ ++ if (bp.v->min_key.snapshot) ++ return "invalid min_key.snapshot"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ + void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) + { +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 2ee50a24501e..c8069dfb90ff 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -371,6 +371,7 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + ++const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, +@@ -383,7 +384,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + } + + #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ +- .key_invalid = bch2_btree_ptr_invalid, \ ++ .key_invalid = bch2_btree_ptr_v2_invalid, \ + .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ +-- +cgit v1.2.3 + + +From 35aae6c10a8038be65babbdfd274d8e47282921f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Mar 2021 22:01:12 -0400 +Subject: bcachefs: Improve inode deletion code + +It had some silly redundancies. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/inode.c | 45 ++++++++++++++------------------------------- + 1 file changed, 14 insertions(+), 31 deletions(-) + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index c9b31afc7c97..4559e77f91f0 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -542,12 +542,12 @@ found_slot: + int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter *iter = NULL; + struct bkey_i_inode_generation delete; + struct bpos start = POS(inode_nr, 0); + struct bpos end = POS(inode_nr + 1, 0); ++ struct bch_inode_unpacked inode_u; + struct bkey_s_c k; +- u64 bi_generation; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +@@ -571,8 +571,6 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + retry: + bch2_trans_begin(&trans); + +- bi_generation = 0; +- + if (cached) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_CACHED|BTREE_ITER_INTENT); +@@ -587,41 +585,26 @@ retry: + if (ret) + goto err; + +- bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, trans.c, +- "inode %llu not found when deleting", +- inode_nr); +- +- switch (k.k->type) { +- case KEY_TYPE_inode: { +- struct bch_inode_unpacked inode_u; +- +- if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) +- bi_generation = inode_u.bi_generation + 1; +- break; +- } +- case KEY_TYPE_inode_generation: { +- struct bkey_s_c_inode_generation g = +- bkey_s_c_to_inode_generation(k); +- bi_generation = le32_to_cpu(g.v->bi_generation); +- break; +- } ++ if (k.k->type != KEY_TYPE_inode) { ++ bch2_fs_inconsistent(trans.c, ++ "inode %llu not found when deleting", ++ inode_nr); ++ ret = -EIO; ++ goto err; + } + +- if (!bi_generation) { +- bkey_init(&delete.k); +- delete.k.p.offset = inode_nr; +- } else { +- bkey_inode_generation_init(&delete.k_i); +- delete.k.p.offset = inode_nr; +- delete.v.bi_generation = cpu_to_le32(bi_generation); +- } ++ bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); ++ ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p = iter->pos; ++ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + bch2_trans_update(&trans, iter, &delete.k_i, 0); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +- bch2_trans_iter_put(&trans, iter); + err: ++ bch2_trans_iter_put(&trans, iter); + if (ret == -EINTR) + goto retry; + +-- +cgit v1.2.3 + + +From 04fc8ce317e4a96fb54fc01851e395f982b9308b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 23 Mar 2021 21:22:50 -0400 +Subject: bcachefs: Split btree_iter_traverse and bch2_btree_iter_traverse() + +External (to the btree iterator code) users of bch2_btree_iter_traverse +expect that on success the iterator will be pointed at iter->pos and +have that position locked - but since we split iter->pos and +iter->real_pos, that means it has to update iter->real_pos if necessary. + +Internal users don't expect it to modify iter->real_pos, so we need two +separate functions. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 40 ++++++++++++++++++++++++++++++++-------- + fs/bcachefs/btree_iter.h | 10 +--------- + 2 files changed, 33 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 69b24e0cb955..ce03b3f98e91 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1329,7 +1329,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + return 0; + } + +-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ++static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) + { + struct btree_trans *trans = iter->trans; + int ret; +@@ -1342,6 +1342,30 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) + return ret; + } + ++/* ++ * Note: ++ * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is ++ * for internal btree iterator users ++ * ++ * bch2_btree_iter_traverse sets iter->real_pos to iter->pos, ++ * btree_iter_traverse() does not: ++ */ ++static inline int __must_check ++btree_iter_traverse(struct btree_iter *iter) ++{ ++ return iter->uptodate >= BTREE_ITER_NEED_RELOCK ++ ? __bch2_btree_iter_traverse(iter) ++ : 0; ++} ++ ++int __must_check ++bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); ++ ++ return btree_iter_traverse(iter); ++} ++ + /* Iterate across nodes (leaf and interior nodes) */ + + struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) +@@ -1352,7 +1376,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); + bch2_btree_iter_verify(iter); + +- ret = bch2_btree_iter_traverse(iter); ++ ret = btree_iter_traverse(iter); + if (ret) + return NULL; + +@@ -1388,7 +1412,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + iter->level++; + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- ret = bch2_btree_iter_traverse(iter); ++ ret = btree_iter_traverse(iter); + if (ret) + return NULL; + +@@ -1411,7 +1435,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_iter_verify(iter); + +- ret = bch2_btree_iter_traverse(iter); ++ ret = btree_iter_traverse(iter); + if (ret) + return NULL; + +@@ -1555,7 +1579,7 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi + btree_iter_set_search_pos(iter, search_key); + + while (1) { +- ret = bch2_btree_iter_traverse(iter); ++ ret = btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + +@@ -1642,7 +1666,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + btree_iter_set_search_pos(iter, iter->pos); + + while (1) { +- ret = bch2_btree_iter_traverse(iter); ++ ret = btree_iter_traverse(iter); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto no_key; +@@ -1754,7 +1778,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return __bch2_btree_iter_peek_slot_extents(iter); + +- ret = bch2_btree_iter_traverse(iter); ++ ret = btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + +@@ -1798,7 +1822,7 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); + bch2_btree_iter_verify(iter); + +- ret = bch2_btree_iter_traverse(iter); ++ ret = btree_iter_traverse(iter); + if 
(unlikely(ret)) + return bkey_s_c_err(ret); + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 3ae19e2900a6..8768f4cb96fa 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -145,15 +145,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); + + void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); + +-int __must_check __bch2_btree_iter_traverse(struct btree_iter *); +- +-static inline int __must_check +-bch2_btree_iter_traverse(struct btree_iter *iter) +-{ +- return iter->uptodate >= BTREE_ITER_NEED_RELOCK +- ? __bch2_btree_iter_traverse(iter) +- : 0; +-} ++int __must_check bch2_btree_iter_traverse(struct btree_iter *); + + int bch2_btree_iter_traverse_all(struct btree_trans *); + +-- +cgit v1.2.3 + + +From e4341368ef16707a7ff5effcab8fc6ea9de1ed60 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 23 Mar 2021 23:52:27 -0400 +Subject: bcachefs: Use pcpu mode of six locks for interior nodes + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 6 ++++++ + fs/bcachefs/btree_iter.c | 16 ++++++++++++---- + fs/bcachefs/btree_update_interior.c | 5 +++++ + 3 files changed, 23 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index e765d8061b06..16099641a42d 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -146,6 +146,11 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, + b->c.level = level; + b->c.btree_id = id; + ++ if (level) ++ six_lock_pcpu_alloc(&b->c.lock); ++ else ++ six_lock_pcpu_free_rcu(&b->c.lock); ++ + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); + if (!ret) +@@ -392,6 +397,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) + while (!list_empty(&bc->freed)) { + b = list_first_entry(&bc->freed, struct btree, list); + list_del(&b->list); ++ six_lock_pcpu_free(&b->c.lock); + kfree(b); + } + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ce03b3f98e91..533316f1a1c4 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -79,11 +79,19 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) + * goes to 0, and it's safe because we have the node intent + * locked: + */ +- atomic64_sub(__SIX_VAL(read_lock, readers), +- &b->c.lock.state.counter); ++ if (!b->c.lock.readers) ++ atomic64_sub(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ else ++ this_cpu_sub(*b->c.lock.readers, readers); ++ + btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); +- atomic64_add(__SIX_VAL(read_lock, readers), +- &b->c.lock.state.counter); ++ ++ if (!b->c.lock.readers) ++ atomic64_add(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ else ++ this_cpu_add(*b->c.lock.readers, readers); + } + + bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 4c0e3d7c8ddf..a661bc0cf98a 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -988,6 +988,11 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) + list_del_init(&b->list); + mutex_unlock(&c->btree_cache.lock); + ++ if (b->c.level) ++ six_lock_pcpu_alloc(&b->c.lock); ++ else ++ six_lock_pcpu_free(&b->c.lock); ++ + mutex_lock(&c->btree_root_lock); + BUG_ON(btree_node_root(c, b) && + (b->c.level < btree_node_root(c, b)->c.level || +-- +cgit v1.2.3 + + +From 
c4e38749d32be3182d336cf8d73b25a9c0c91eb9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 24 Mar 2021 22:49:05 -0400 +Subject: bcachefs: Increase default journal size + +The default was 1/256th of the device and capped at 512MB, which is +fairly tiny these days. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index eaf521c9afaf..690b0358e437 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -914,14 +914,17 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) + if (dynamic_fault("bcachefs:add:journal_alloc")) + return -ENOMEM; + ++ /* 1/128th of the device by default: */ ++ nr = ca->mi.nbuckets >> 7; ++ + /* +- * clamp journal size to 1024 buckets or 512MB (in sectors), whichever ++ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever + * is smaller: + */ +- nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, ++ nr = clamp_t(unsigned, nr, + BCH_JOURNAL_BUCKETS_MIN, +- min(1 << 10, +- (1 << 20) / ca->mi.bucket_size)); ++ min(1 << 13, ++ (1 << 24) / ca->mi.bucket_size)); + + return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + } +-- +cgit v1.2.3 + + +From c9beb59094a627e45e958f3cb01335dfbe5ea7d0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 24 Mar 2021 20:22:51 -0400 +Subject: bcachefs: Drop bkey noops + +Bkey noops were introduced to deal with trimming inline data extents in +place in the btree: if the u64s field of a bkey was 0, that u64 was a +noop and we'd start looking for the next bkey immediately after it. + +But extent handling has been lifted above the btree - we no longer +modify existing extents in place in the btree, and the compatibilty code +for old style extent btree nodes is gone, so we can completely drop this +code. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.h | 10 ---------- + fs/bcachefs/bkey_sort.c | 2 +- + fs/bcachefs/bset.c | 26 +++++++++++--------------- + fs/bcachefs/bset.h | 2 +- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/btree_io.c | 14 ++++++-------- + fs/bcachefs/btree_update_interior.c | 4 ++-- + 7 files changed, 22 insertions(+), 38 deletions(-) + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 629288a60926..33dd57dab4c8 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -33,16 +33,6 @@ struct bkey_s { + + #define bkey_next(_k) vstruct_next(_k) + +-static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, +- struct bkey_packed *end) +-{ +- k = bkey_next(k); +- +- while (k != end && !k->u64s) +- k = (void *) ((u64 *) k + 1); +- return k; +-} +- + #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) + + static inline size_t bkey_val_bytes(const struct bkey *k) +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index f2507079ed11..537ab7919e88 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -45,7 +45,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) + + BUG_ON(!iter->used); + +- i->k = bkey_next_skip_noops(i->k, i->end); ++ i->k = bkey_next(i->k); + + BUG_ON(i->k > i->end); + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 87f951e14061..1446839100cf 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -78,7 +78,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, + for (_k = i->start; + _k < vstruct_last(i); + _k = _n) { +- _n = bkey_next_skip_noops(_k, vstruct_last(i)); ++ _n = bkey_next(_k); + + k = bkey_disassemble(b, _k, &uk); + if (c) +@@ -544,7 +544,7 @@ start: + rw_aux_tree(b, t)[j - 1].offset); + } + +- k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ k = bkey_next(k); + BUG_ON(k >= btree_bkey_last(b, t)); + } + } +@@ -759,7 +759,7 @@ retry: + /* First we figure out where the first key in each cacheline is */ + eytzinger1_for_each(j, t->size) { + while (bkey_to_cacheline(b, t, k) < cacheline) +- prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ prev = k, k = bkey_next(k); + + if (k >= btree_bkey_last(b, t)) { + /* XXX: this path sucks */ +@@ -776,7 +776,7 @@ retry: + } + + while (k != btree_bkey_last(b, t)) +- prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ prev = k, k = bkey_next(k); + + t->max_key = bkey_unpack_pos(b, prev); + +@@ -911,7 +911,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; + + while ((p = __bkey_prev(b, t, k)) && !ret) { +- for (i = p; i != k; i = bkey_next_skip_noops(i, k)) ++ for (i = p; i != k; i = bkey_next(i)) + if (i->type >= min_key_type) + ret = i; + +@@ -922,10 +922,10 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + BUG_ON(ret >= orig_k); + + for (i = ret +- ? bkey_next_skip_noops(ret, orig_k) ++ ? 
bkey_next(ret) + : btree_bkey_first(b, t); + i != orig_k; +- i = bkey_next_skip_noops(i, orig_k)) ++ i = bkey_next(i)) + BUG_ON(i->type >= min_key_type); + } + +@@ -960,7 +960,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, + /* signal to make_bfloat() that they're uninitialized: */ + min_key.u64s = max_key.u64s = 0; + +- if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { ++ if (bkey_next(k) == btree_bkey_last(b, t)) { + t->max_key = bkey_unpack_pos(b, k); + + for (j = 1; j < t->size; j = j * 2 + 1) +@@ -1084,7 +1084,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, + struct bkey_packed *k = start; + + while (1) { +- k = bkey_next_skip_noops(k, end); ++ k = bkey_next(k); + if (k == end) + break; + +@@ -1334,12 +1334,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, + while (m != btree_bkey_last(b, t) && + bkey_iter_cmp_p_or_unp(b, m, + lossy_packed_search, search) < 0) +- m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); ++ m = bkey_next(m); + + if (!packed_search) + while (m != btree_bkey_last(b, t) && + bkey_iter_pos_cmp(b, m, search) < 0) +- m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); ++ m = bkey_next(m); + + if (bch2_expensive_debug_checks) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); +@@ -1573,10 +1573,6 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, + + EBUG_ON(iter->data->k > iter->data->end); + +- while (!__btree_node_iter_set_end(iter, 0) && +- !__bch2_btree_node_iter_peek_all(iter, b)->u64s) +- iter->data->k++; +- + if (unlikely(__btree_node_iter_set_end(iter, 0))) { + bch2_btree_node_iter_set_drop(iter, iter->data); + return; +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 54b364c8f28c..80ea75b2935a 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -305,7 +305,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, + #define bset_tree_for_each_key(_b, _t, _k) \ + for (_k = btree_bkey_first(_b, _t); \ + _k != btree_bkey_last(_b, _t); \ +- _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) ++ _k = bkey_next(_k)) + + static inline bool bset_has_ro_aux_tree(struct bset_tree *t) + { +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index c57bcc3f841c..4e503364011b 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1373,7 +1373,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + k < vstruct_last(s2) && + vstruct_blocks_plus(n1->data, c->block_bits, + u64s + k->u64s) <= blocks; +- k = bkey_next_skip_noops(k, vstruct_last(s2))) { ++ k = bkey_next(k)) { + last = k; + u64s += k->u64s; + } +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 002025856236..9e909a73004f 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -32,9 +32,9 @@ static void verify_no_dups(struct btree *b, + if (start == end) + return; + +- for (p = start, k = bkey_next_skip_noops(start, end); ++ for (p = start, k = bkey_next(start); + k != end; +- p = k, k = bkey_next_skip_noops(k, end)) { ++ p = k, k = bkey_next(k)) { + struct bkey l = bkey_unpack_key(b, p); + struct bkey r = bkey_unpack_key(b, k); + +@@ -47,9 +47,7 @@ static void set_needs_whiteout(struct bset *i, int v) + { + struct bkey_packed *k; + +- for (k = i->start; +- k != vstruct_last(i); +- k = bkey_next_skip_noops(k, vstruct_last(i))) ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + k->needs_whiteout = v; + } + +@@ -213,7 +211,7 @@ static bool 
bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) + out = i->start; + + for (k = start; k != end; k = n) { +- n = bkey_next_skip_noops(k, end); ++ n = bkey_next(k); + + if (!bkey_deleted(k)) { + bkey_copy(out, k); +@@ -754,7 +752,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + } + + prev = k; +- k = bkey_next_skip_noops(k, vstruct_last(i)); ++ k = bkey_next(k); + } + fsck_err: + return ret; +@@ -947,7 +945,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + bp.v->mem_ptr = 0; + } + +- k = bkey_next_skip_noops(k, vstruct_last(i)); ++ k = bkey_next(k); + } + + bch2_bset_build_aux_tree(b, b->set, false); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index a661bc0cf98a..60fe93635602 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1119,7 +1119,7 @@ static struct btree *__btree_split_node(struct btree_update *as, + */ + k = set1->start; + while (1) { +- struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); ++ struct bkey_packed *n = bkey_next(k); + + if (n == vstruct_last(set1)) + break; +@@ -1216,7 +1216,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + i = btree_bset_first(b); + src = dst = i->start; + while (src != vstruct_last(i)) { +- n = bkey_next_skip_noops(src, vstruct_last(i)); ++ n = bkey_next(src); + if (!bkey_deleted(src)) { + memmove_u64s_down(dst, src, src->u64s); + dst = bkey_next(dst); +-- +cgit v1.2.3 + + +From 14b3b70d25802a1c49e5e60017db66051b8eb6d8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 26 Mar 2021 20:08:56 -0400 +Subject: bcachefs: Generate better bkey formats when splitting nodes + +On btree node split, we weren't ensuring the min_key of the new larger +node packs in the new format for this node. This triggers some painful +slowpaths in the bset.c aux search tree code - this patch fixes that by +calculating a new format for the new node with the new min_key. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 53 ++++++++++++++++++++++++------------- + 1 file changed, 34 insertions(+), 19 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 60fe93635602..bfbdeef966a5 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1095,10 +1095,11 @@ static struct btree *__btree_split_node(struct btree_update *as, + struct btree *n1, + struct btree_iter *iter) + { ++ struct bkey_format_state s; + size_t nr_packed = 0, nr_unpacked = 0; + struct btree *n2; + struct bset *set1, *set2; +- struct bkey_packed *k, *prev = NULL; ++ struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; + + n2 = bch2_btree_node_alloc(as, n1->c.level); + bch2_btree_update_add_new_node(as, n2); +@@ -1108,8 +1109,6 @@ static struct btree *__btree_split_node(struct btree_update *as, + SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); + n2->key.k.p = n1->key.k.p; + +- btree_node_set_format(n2, n2->data->format); +- + set1 = btree_bset_first(n1); + set2 = btree_bset_first(n2); + +@@ -1136,33 +1135,49 @@ static struct btree *__btree_split_node(struct btree_update *as, + } + + BUG_ON(!prev); ++ set2_start = k; ++ set2_end = vstruct_last(set1); + +- btree_set_max(n1, bkey_unpack_pos(n1, prev)); +- btree_set_min(n2, bkey_successor(n1->key.k.p)); +- +- set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); +- set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); +- ++ set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data); + set_btree_bset_end(n1, n1->set); +- set_btree_bset_end(n2, n2->set); +- +- n2->nr.live_u64s = le16_to_cpu(set2->u64s); +- n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); +- n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; +- n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; + + n1->nr.live_u64s = le16_to_cpu(set1->u64s); + n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); + n1->nr.packed_keys = nr_packed; + n1->nr.unpacked_keys = nr_unpacked; + ++ btree_set_max(n1, bkey_unpack_pos(n1, prev)); ++ btree_set_min(n2, bkey_successor(n1->key.k.p)); ++ ++ bch2_bkey_format_init(&s); ++ bch2_bkey_format_add_pos(&s, n2->data->min_key); ++ bch2_bkey_format_add_pos(&s, n2->data->max_key); ++ ++ for (k = set2_start; k != set2_end; k = bkey_next(k)) { ++ struct bkey uk = bkey_unpack_key(n1, k); ++ bch2_bkey_format_add_key(&s, &uk); ++ } ++ ++ n2->data->format = bch2_bkey_format_done(&s); ++ btree_node_set_format(n2, n2->data->format); ++ ++ out = set2->start; ++ memset(&n2->nr, 0, sizeof(n2->nr)); ++ ++ for (k = set2_start; k != set2_end; k = bkey_next(k)) { ++ BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k) ++ ? 
&n1->format : &bch2_bkey_format_current, k)); ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ btree_keys_account_key_add(&n2->nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ set2->u64s = cpu_to_le16((u64 *) out - set2->_data); ++ set_btree_bset_end(n2, n2->set); ++ + BUG_ON(!set1->u64s); + BUG_ON(!set2->u64s); + +- memcpy_u64s(set2->start, +- vstruct_end(set1), +- le16_to_cpu(set2->u64s)); +- + btree_node_reset_sib_u64s(n1); + btree_node_reset_sib_u64s(n2); + +-- +cgit v1.2.3 + + +From 0156dc13933d7ba5e562b8d12df5b980439b279b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 26 Mar 2021 20:10:59 -0400 +Subject: bcachefs: Fix building of aux search trees + +We weren't packing the min/max keys, which was a major oversight and +completely disabled generating bkey_floats for adjacent nodes. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 29 +++++++++++++++++++---------- + 1 file changed, 19 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 1446839100cf..f36254cdc15b 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -686,16 +686,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + + if (is_power_of_2(j) && + !min_key->u64s) { +- k = (void *) min_key; +- bkey_init(&k->k); +- k->k.p = b->data->min_key; ++ if (!bkey_pack_pos(min_key, b->data->min_key, b)) { ++ k = (void *) min_key; ++ bkey_init(&k->k); ++ k->k.p = b->data->min_key; ++ } + } + + if (is_power_of_2(j + 1) && + !max_key->u64s) { +- k = (void *) max_key; +- bkey_init(&k->k); +- k->k.p = t->max_key; ++ if (!bkey_pack_pos(max_key, b->data->max_key, b)) { ++ k = (void *) max_key; ++ bkey_init(&k->k); ++ k->k.p = t->max_key; ++ } + } + + __make_bfloat(b, t, j, min_key, max_key); +@@ -780,10 +784,15 @@ retry: + + t->max_key = bkey_unpack_pos(b, prev); + +- bkey_init(&min_key.k); +- min_key.k.p = b->data->min_key; +- bkey_init(&max_key.k); +- max_key.k.p = t->max_key; ++ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { ++ bkey_init(&min_key.k); ++ min_key.k.p = b->data->min_key; ++ } ++ ++ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { ++ bkey_init(&max_key.k); ++ max_key.k.p = t->max_key; ++ } + + /* Then we build the tree */ + eytzinger1_for_each(j, t->size) +-- +cgit v1.2.3 + + +From e016d8a31d1e4b2d61ec5fd3422989a53406b2f6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 26 Mar 2021 20:29:04 -0400 +Subject: bcachefs: Fix packed bkey format calculation for new btree roots + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 3 +++ + fs/bcachefs/btree_update_interior.c | 10 ++++++---- + 2 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 4e503364011b..cb7ed7e35a9a 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1311,6 +1311,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + /* Find a format that all keys in @old_nodes can pack into */ + bch2_bkey_format_init(&format_state); + ++ /* ++ * XXX: this won't correctly take it account the new min/max keys: ++ */ + for (i = 0; i < nr_old_nodes; i++) + __bch2_btree_calc_format(&format_state, old_nodes[i]); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index bfbdeef966a5..dfd35f67cdfc 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -82,8 +82,6 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) + struct 
bset_tree *t; + struct bkey uk; + +- bch2_bkey_format_add_pos(s, b->data->min_key); +- + for_each_bset(b, t) + bset_tree_for_each_key(b, t, k) + if (!bkey_deleted(k)) { +@@ -97,6 +95,8 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b) + struct bkey_format_state s; + + bch2_bkey_format_init(&s); ++ bch2_bkey_format_add_pos(&s, b->data->min_key); ++ bch2_bkey_format_add_pos(&s, b->data->max_key); + __bch2_btree_calc_format(&s, b); + + return bch2_bkey_format_done(&s); +@@ -1578,8 +1578,10 @@ retry: + } + + bch2_bkey_format_init(&new_s); +- __bch2_btree_calc_format(&new_s, b); +- __bch2_btree_calc_format(&new_s, m); ++ bch2_bkey_format_add_pos(&new_s, prev->data->min_key); ++ __bch2_btree_calc_format(&new_s, prev); ++ __bch2_btree_calc_format(&new_s, next); ++ bch2_bkey_format_add_pos(&new_s, next->data->max_key); + new_f = bch2_bkey_format_done(&new_s); + + sib_u64s = btree_node_u64s_with_format(b, &new_f) + +-- +cgit v1.2.3 + + +From deba1c1fc3e97aa0e42f18bf01fd4899beaa7893 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 27 Mar 2021 20:58:57 -0400 +Subject: bcachefs: Fix for bch2_trans_commit() unlocking when it's not + supposed to + +When we pass BTREE_INSERT_NOUNLOCK bch2_trans_commit isn't supposed to +unlock after a successful commit, but it was calling +bch2_trans_cond_resched() - oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 3 ++- + fs/bcachefs/btree_iter.h | 1 + + fs/bcachefs/btree_update_leaf.c | 8 ++++++-- + 3 files changed, 9 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 533316f1a1c4..dcdfb75f7308 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2154,7 +2154,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + (void *) &trans->fs_usage_deltas->memset_start); + } + +- bch2_trans_cond_resched(trans); ++ if (!(flags & TRANS_RESET_NOUNLOCK)) ++ bch2_trans_cond_resched(trans); + + if (!(flags & TRANS_RESET_NOTRAVERSE)) + bch2_btree_iter_traverse_all(trans); +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 8768f4cb96fa..176661b3b879 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -303,6 +303,7 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr + } + + #define TRANS_RESET_NOTRAVERSE (1 << 0) ++#define TRANS_RESET_NOUNLOCK (1 << 1) + + void bch2_trans_reset(struct btree_trans *, unsigned); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index d9308bd49fc9..5e7790917495 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -826,7 +826,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + struct btree_insert_entry *i = NULL; + struct btree_iter *iter; + bool trans_trigger_run; +- unsigned u64s; ++ unsigned u64s, reset_flags = 0; + int ret = 0; + + if (!trans->nr_updates) +@@ -940,7 +940,11 @@ out: + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); + out_reset: +- bch2_trans_reset(trans, !ret ? 
TRANS_RESET_NOTRAVERSE : 0); ++ if (!ret) ++ reset_flags |= TRANS_RESET_NOTRAVERSE; ++ if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK)) ++ reset_flags |= TRANS_RESET_NOUNLOCK; ++ bch2_trans_reset(trans, reset_flags); + + return ret; + err: +-- +cgit v1.2.3 + + +From 3fbce466fd5327660e248ef5359acc4978123f3e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 27 Mar 2021 21:00:26 -0400 +Subject: bcachefs: Simplify btree_node_iter_init_pack_failed() + +Since we now make sure to always generate packed bkey formats that can +pack the min_key of a btree node, this path should actually never +happen. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 19 +++++++------------ + 1 file changed, 7 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index f36254cdc15b..e4346c10268f 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -1179,8 +1179,7 @@ void bch2_bset_delete(struct btree *b, + __flatten + static struct bkey_packed *bset_search_write_set(const struct btree *b, + struct bset_tree *t, +- struct bpos *search, +- const struct bkey_packed *packed_search) ++ struct bpos *search) + { + unsigned l = 0, r = t->size; + +@@ -1247,9 +1246,6 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, + prefetch(&base->f[n << 4]); + + f = &base->f[n]; +- +- if (!unlikely(packed_search)) +- goto slowpath; + if (unlikely(f->exponent >= BFLOAT_FAILED)) + goto slowpath; + +@@ -1313,7 +1309,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, + case BSET_NO_AUX_TREE: + return btree_bkey_first(b, t); + case BSET_RW_AUX_TREE: +- return bset_search_write_set(b, t, search, lossy_packed_search); ++ return bset_search_write_set(b, t, search); + case BSET_RO_AUX_TREE: + /* + * Each node in the auxiliary search tree covers a certain range +@@ -1412,16 +1408,15 @@ noinline __flatten __attribute__((cold)) + static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, + struct btree *b, struct bpos *search) + { +- struct bset_tree *t; ++ struct bkey_packed *k; + + trace_bkey_pack_pos_fail(search); + +- for_each_bset(b, t) +- __bch2_btree_node_iter_push(iter, b, +- bch2_bset_search(b, t, search, NULL, NULL), +- btree_bkey_last(b, t)); ++ bch2_btree_node_iter_init_from_start(iter, b); + +- bch2_btree_node_iter_sort(iter, b); ++ while ((k = bch2_btree_node_iter_peek(iter, b)) && ++ bkey_iter_pos_cmp(b, k, search) < 0) ++ bch2_btree_node_iter_advance(iter, b); + } + + /** +-- +cgit v1.2.3 + + +From 6078615a06c6d751bdec9379ba35495ca6941e76 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 24 Mar 2021 23:37:33 -0400 +Subject: bcachefs: btree key cache locking improvements + +The btree key cache mutex was becoming a significant bottleneck - it was +mainly used to protect the lists of dirty, clean and freed cached keys. + +This patch eliminates the dirty and clean lists - instead, when we need +to scan for keys to drop from the cache we iterate over the rhashtable, +and thus we're able to remove most uses of that lock. 
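Condensing what that looks like in practice (a sketch paraphrased from the hunks below; the bcachefs types and helpers are the ones this patch touches, and the function name here is illustrative, not the patch's): the clean-list walk becomes an RCU walk of the hash table that evicts the first non-dirty entry it can lock.

static struct bkey_cached *evict_one_clean_key(struct btree_key_cache *c)
{
	/* sketch of the bkey_cached_reuse()/shrinker scan pattern added below */
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				rcu_read_unlock();
				return ck;
			}
		}
	rcu_read_unlock();

	return NULL;
}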
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 184 +++++++++++++++++++++++++++--------------- + fs/bcachefs/btree_key_cache.h | 8 +- + fs/bcachefs/btree_types.h | 7 +- + fs/bcachefs/journal_reclaim.c | 4 +- + 4 files changed, 130 insertions(+), 73 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 0b3545637bb3..30c76c7a8c12 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -70,7 +70,7 @@ static void bkey_cached_evict(struct btree_key_cache *c, + bch2_btree_key_cache_params)); + memset(&ck->key, ~0, sizeof(ck->key)); + +- c->nr_keys--; ++ atomic_long_dec(&c->nr_keys); + } + + static void bkey_cached_free(struct btree_key_cache *bc, +@@ -99,12 +99,6 @@ bkey_cached_alloc(struct btree_key_cache *c) + { + struct bkey_cached *ck; + +- list_for_each_entry_reverse(ck, &c->freed, list) +- if (bkey_cached_lock_for_evict(ck)) { +- c->nr_freed--; +- return ck; +- } +- + ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); + if (likely(ck)) { + INIT_LIST_HEAD(&ck->list); +@@ -114,11 +108,39 @@ bkey_cached_alloc(struct btree_key_cache *c) + return ck; + } + +- list_for_each_entry(ck, &c->clean, list) ++ return NULL; ++} ++ ++static struct bkey_cached * ++bkey_cached_reuse(struct btree_key_cache *c) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct bkey_cached *ck; ++ unsigned i; ++ ++ mutex_lock(&c->lock); ++ list_for_each_entry_reverse(ck, &c->freed, list) + if (bkey_cached_lock_for_evict(ck)) { +- bkey_cached_evict(c, ck); ++ c->nr_freed--; ++ list_del(&ck->list); ++ mutex_unlock(&c->lock); + return ck; + } ++ mutex_unlock(&c->lock); ++ ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(c->table.tbl, &c->table); ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && ++ bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(c, ck); ++ rcu_read_unlock(); ++ return ck; ++ } ++ } ++ rcu_read_unlock(); + + return NULL; + } +@@ -129,10 +151,17 @@ btree_key_cache_create(struct btree_key_cache *c, + struct bpos pos) + { + struct bkey_cached *ck; ++ bool was_new = true; + + ck = bkey_cached_alloc(c); +- if (!ck) +- return ERR_PTR(-ENOMEM); ++ ++ if (unlikely(!ck)) { ++ ck = bkey_cached_reuse(c); ++ if (unlikely(!ck)) ++ return ERR_PTR(-ENOMEM); ++ ++ was_new = false; ++ } + + ck->c.level = 0; + ck->c.btree_id = btree_id; +@@ -141,17 +170,26 @@ btree_key_cache_create(struct btree_key_cache *c, + ck->valid = false; + ck->flags = 1U << BKEY_CACHED_ACCESSED; + +- if (rhashtable_lookup_insert_fast(&c->table, ++ if (unlikely(rhashtable_lookup_insert_fast(&c->table, + &ck->hash, +- bch2_btree_key_cache_params)) { ++ bch2_btree_key_cache_params))) { + /* We raced with another fill: */ +- bkey_cached_free(c, ck); ++ ++ if (likely(was_new)) { ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++ kfree(ck); ++ } else { ++ mutex_lock(&c->lock); ++ bkey_cached_free(c, ck); ++ mutex_unlock(&c->lock); ++ } ++ + return NULL; + } + +- c->nr_keys++; ++ atomic_long_inc(&c->nr_keys); + +- list_move(&ck->list, &c->clean); + six_unlock_write(&ck->c.lock); + + return ck; +@@ -238,11 +276,8 @@ retry: + return 0; + } + +- mutex_lock(&c->btree_key_cache.lock); + ck = btree_key_cache_create(&c->btree_key_cache, + iter->btree_id, iter->pos); +- mutex_unlock(&c->btree_key_cache.lock); +- + ret = PTR_ERR_OR_ZERO(ck); + if (ret) + goto err; +@@ -370,15 +405,13 @@ err: + bch2_journal_pin_drop(j, &ck->journal); + 
bch2_journal_preres_put(j, &ck->res); + ++ BUG_ON(!btree_node_locked(c_iter, 0)); ++ + if (!evict) { +- mutex_lock(&c->btree_key_cache.lock); + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); +- c->btree_key_cache.nr_dirty--; ++ atomic_long_dec(&c->btree_key_cache.nr_dirty); + } +- +- list_move_tail(&ck->list, &c->btree_key_cache.clean); +- mutex_unlock(&c->btree_key_cache.lock); + } else { + evict: + BUG_ON(!btree_node_intent_locked(c_iter, 0)); +@@ -388,13 +421,14 @@ evict: + + six_lock_write(&ck->c.lock, NULL, NULL); + +- mutex_lock(&c->btree_key_cache.lock); + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); +- c->btree_key_cache.nr_dirty--; ++ atomic_long_dec(&c->btree_key_cache.nr_dirty); + } + + bkey_cached_evict(&c->btree_key_cache, ck); ++ ++ mutex_lock(&c->btree_key_cache.lock); + bkey_cached_free(&c->btree_key_cache, ck); + mutex_unlock(&c->btree_key_cache.lock); + } +@@ -475,16 +509,11 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, + ck->valid = true; + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { +- mutex_lock(&c->btree_key_cache.lock); +- list_move(&ck->list, &c->btree_key_cache.dirty); +- + set_bit(BKEY_CACHED_DIRTY, &ck->flags); +- c->btree_key_cache.nr_dirty++; ++ atomic_long_inc(&c->btree_key_cache.nr_dirty); + + if (bch2_nr_btree_keys_need_flush(c)) + kick_reclaim = true; +- +- mutex_unlock(&c->btree_key_cache.lock); + } + + bch2_journal_pin_update(&c->journal, trans->journal_res.seq, +@@ -509,9 +538,11 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; ++ struct bucket_table *tbl; + struct bkey_cached *ck, *t; + size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; +- unsigned flags; ++ unsigned start, flags; ++ int srcu_idx; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_FS) +@@ -519,6 +550,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + else if (!mutex_trylock(&bc->lock)) + return -1; + ++ srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + flags = memalloc_nofs_save(); + + /* +@@ -540,23 +572,47 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + if (scanned >= nr) + goto out; + +- list_for_each_entry_safe(ck, t, &bc->clean, list) { +- if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) +- clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); +- else if (bkey_cached_lock_for_evict(ck)) { +- bkey_cached_evict(bc, ck); +- bkey_cached_free(bc, ck); +- } ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); ++ if (bc->shrink_iter >= tbl->size) ++ bc->shrink_iter = 0; ++ start = bc->shrink_iter; + +- scanned++; +- if (scanned >= nr) { +- if (&t->list != &bc->clean) +- list_move_tail(&bc->clean, &t->list); +- goto out; ++ do { ++ struct rhash_head *pos, *next; ++ ++ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); ++ ++ while (!rht_is_a_nulls(pos)) { ++ next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); ++ ck = container_of(pos, struct bkey_cached, hash); ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) ++ goto next; ++ ++ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) ++ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); ++ else if (bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(bc, ck); ++ bkey_cached_free(bc, ck); ++ } ++ ++ scanned++; ++ if (scanned >= nr) ++ break; ++next: ++ pos = 
next; + } +- } ++ ++ bc->shrink_iter++; ++ if (bc->shrink_iter >= tbl->size) ++ bc->shrink_iter = 0; ++ } while (scanned < nr && bc->shrink_iter != start); ++ ++ rcu_read_unlock(); + out: + memalloc_nofs_restore(flags); ++ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + mutex_unlock(&bc->lock); + + return freed; +@@ -569,41 +625,45 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; + +- return bc->nr_keys; ++ return atomic_long_read(&bc->nr_keys); + } + + void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) + { + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); ++ struct bucket_table *tbl; + struct bkey_cached *ck, *n; ++ struct rhash_head *pos; ++ unsigned i; + + if (bc->shrink.list.next) + unregister_shrinker(&bc->shrink); + + mutex_lock(&bc->lock); +- list_splice(&bc->dirty, &bc->clean); + +- list_for_each_entry_safe(ck, n, &bc->clean, list) { ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ bkey_cached_evict(bc, ck); ++ list_add(&ck->list, &bc->freed); ++ } ++ rcu_read_unlock(); ++ ++ list_for_each_entry_safe(ck, n, &bc->freed, list) { + cond_resched(); + + bch2_journal_pin_drop(&c->journal, &ck->journal); + bch2_journal_preres_put(&c->journal, &ck->res); + +- kfree(ck->k); + list_del(&ck->list); ++ kfree(ck->k); + kmem_cache_free(bch2_key_cache, ck); +- bc->nr_keys--; + } + +- BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal)); +- BUG_ON(bc->nr_keys); +- +- list_for_each_entry_safe(ck, n, &bc->freed, list) { +- cond_resched(); ++ BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal)); ++ BUG_ON(atomic_long_read(&bc->nr_keys)); + +- list_del(&ck->list); +- kmem_cache_free(bch2_key_cache, ck); +- } + mutex_unlock(&bc->lock); + + if (bc->table_init_done) +@@ -614,8 +674,6 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) + { + mutex_init(&c->lock); + INIT_LIST_HEAD(&c->freed); +- INIT_LIST_HEAD(&c->clean); +- INIT_LIST_HEAD(&c->dirty); + } + + int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) +@@ -641,8 +699,8 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) + void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) + { + pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); +- pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys); +- pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty); ++ pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys)); ++ pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty)); + } + + void bch2_btree_key_cache_exit(void) +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index 2f8b5521718a..02715cd258ab 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -3,8 +3,8 @@ + + static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) + { +- size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); +- size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); ++ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); ++ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 1024 + nr_keys / 2; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); +@@ -12,8 +12,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) + + static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) + { +- size_t nr_dirty = 
READ_ONCE(c->btree_key_cache.nr_dirty); +- size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); ++ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); ++ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + + return nr_dirty > max_dirty && +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 96c4cd4ba1ea..bff27bffb54f 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -292,13 +292,12 @@ struct btree_key_cache { + struct rhashtable table; + bool table_init_done; + struct list_head freed; +- struct list_head clean; +- struct list_head dirty; + struct shrinker shrink; ++ unsigned shrink_iter; + + size_t nr_freed; +- size_t nr_keys; +- size_t nr_dirty; ++ atomic_long_t nr_keys; ++ atomic_long_t nr_dirty; + }; + + struct bkey_cached_key { +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index bbf8e5ad8aa0..4a5b50ed71b0 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -610,8 +610,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + j->prereserved.remaining, + atomic_read(&c->btree_cache.dirty), + c->btree_cache.used, +- c->btree_key_cache.nr_dirty, +- c->btree_key_cache.nr_keys); ++ atomic_long_read(&c->btree_key_cache.nr_dirty), ++ atomic_long_read(&c->btree_key_cache.nr_keys)); + + nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); + +-- +cgit v1.2.3 + + +From 85f5948350d6e1f20995ea70ae217a8aa0a52bae Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Feb 2021 21:51:56 -0500 +Subject: bcachefs: Add a mechanism for running callbacks at trans commit time + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 1 + + fs/bcachefs/btree_types.h | 9 +++++++++ + fs/bcachefs/btree_update.h | 2 ++ + fs/bcachefs/btree_update_leaf.c | 16 ++++++++++++++++ + 4 files changed, 28 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index dcdfb75f7308..f3c90204332f 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2144,6 +2144,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + trans->nr_updates2 = 0; + trans->mem_top = 0; + ++ trans->hooks = NULL; + trans->extra_journal_entries = NULL; + trans->extra_journal_entry_u64s = 0; + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index bff27bffb54f..f5eb970db8f9 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -343,6 +343,14 @@ struct btree_insert_entry { + #define BTREE_ITER_MAX 32 + #endif + ++struct btree_trans_commit_hook; ++typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); ++ ++struct btree_trans_commit_hook { ++ btree_trans_commit_hook_fn *fn; ++ struct btree_trans_commit_hook *next; ++}; ++ + #define BTREE_TRANS_MEM_MAX (1U << 14) + + struct btree_trans { +@@ -379,6 +387,7 @@ struct btree_trans { + struct btree_insert_entry *updates2; + + /* update path: */ ++ struct btree_trans_commit_hook *hooks; + struct jset_entry *extra_journal_entries; + unsigned extra_journal_entry_u64s; + struct journal_entry_pin *journal_pin; +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index a25138080169..4ce12ae29a55 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -77,6 +77,8 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, + + int bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct 
bkey_i *, enum btree_trigger_flags); ++void bch2_trans_commit_hook(struct btree_trans *, ++ struct btree_trans_commit_hook *); + int __bch2_trans_commit(struct btree_trans *); + + /** +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5e7790917495..5e1e9309211e 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -369,6 +369,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage = NULL; + struct btree_insert_entry *i; ++ struct btree_trans_commit_hook *h; + unsigned u64s = 0; + bool marking = false; + int ret; +@@ -386,6 +387,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + prefetch(&trans->c->journal.flags); + ++ h = trans->hooks; ++ while (h) { ++ ret = h->fn(trans, h); ++ if (ret) ++ return ret; ++ h = h->next; ++ } ++ + trans_for_each_update2(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) +@@ -1057,6 +1066,13 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + return 0; + } + ++void bch2_trans_commit_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *h) ++{ ++ h->next = trans->hooks; ++ trans->hooks = h; ++} ++ + int __bch2_btree_insert(struct btree_trans *trans, + enum btree_id id, struct bkey_i *k) + { +-- +cgit v1.2.3 + + +From cffa222b7041676940f32cb597080e9b431aaae5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 4 Mar 2021 16:20:16 -0500 +Subject: bcachefs: Split out bpos_cmp() and bkey_cmp() + +With snapshots, we're going to need to differentiate between comparisons +that should and shouldn't include the snapshot field. bpos_cmp is now +the comparison function that does include the snapshot field, used by +core btree code. + +Upper level filesystem code generally does _not_ want to compare against +the snapshot field - that code wants keys to compare as equal even when +one of them is in an ancestor snapshot. 
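Concretely (an illustrative snippet rather than part of the patch, using only the two helpers defined in the hunk below): the same inode:offset in two different snapshots is one key as far as filesystem code is concerned, but two distinct keys to the core btree code.

	struct bpos a = { .inode = 1, .offset = 4096, .snapshot = 1 };
	struct bpos b = { .inode = 1, .offset = 4096, .snapshot = 2 };

	BUG_ON(bkey_cmp(a, b) != 0);	/* fs level: same key */
	BUG_ON(bpos_cmp(a, b) >= 0);	/* btree level: a sorts strictly before b */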
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.c | 8 ++++---- + fs/bcachefs/bkey.h | 26 ++++++++++++-------------- + fs/bcachefs/bkey_methods.c | 10 +++++----- + fs/bcachefs/bset.c | 12 ++++++------ + fs/bcachefs/bset.h | 20 +------------------- + fs/bcachefs/btree_cache.c | 10 +++++----- + fs/bcachefs/btree_gc.c | 14 +++++++------- + fs/bcachefs/btree_gc.h | 10 +++------- + fs/bcachefs/btree_io.c | 6 +++--- + fs/bcachefs/btree_io.h | 4 ++-- + fs/bcachefs/btree_iter.c | 20 ++++++++++---------- + fs/bcachefs/btree_key_cache.c | 8 ++++---- + fs/bcachefs/btree_update_interior.c | 4 ++-- + fs/bcachefs/btree_update_leaf.c | 8 ++++---- + fs/bcachefs/debug.c | 4 ++-- + fs/bcachefs/extents.h | 18 ++++++++++++++++++ + fs/bcachefs/recovery.c | 16 ++++++++-------- + 17 files changed, 96 insertions(+), 102 deletions(-) + +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +index e1906f257ef2..3e427b3d34a5 100644 +--- a/fs/bcachefs/bkey.c ++++ b/fs/bcachefs/bkey.c +@@ -1045,7 +1045,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, + high_word(f, r), + b->nr_key_bits); + +- EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), ++ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), + bkey_unpack_pos(b, r))); + return ret; + } +@@ -1055,7 +1055,7 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) + { +- return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); ++ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); + } + + __pure __flatten +@@ -1076,7 +1076,7 @@ int bch2_bkey_cmp_packed(const struct btree *b, + r = (void*) &unpacked; + } + +- return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); ++ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); + } + + __pure __flatten +@@ -1087,7 +1087,7 @@ int __bch2_bkey_cmp_left_packed(const struct btree *b, + const struct bkey *l_unpacked; + + return unlikely(l_unpacked = packed_to_bkey_c(l)) +- ? bkey_cmp(l_unpacked->p, *r) ++ ? bpos_cmp(l_unpacked->p, *r) + : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); + } + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 33dd57dab4c8..41bfae6aae0d 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -140,29 +140,27 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, + return bkey_cmp_left_packed(b, l, &r); + } + +-#if 1 ++static __always_inline int bpos_cmp(struct bpos l, struct bpos r) ++{ ++ return cmp_int(l.inode, r.inode) ?: ++ cmp_int(l.offset, r.offset) ?: ++ cmp_int(l.snapshot, r.snapshot); ++} ++ + static __always_inline int bkey_cmp(struct bpos l, struct bpos r) + { +- if (l.inode != r.inode) +- return l.inode < r.inode ? -1 : 1; +- if (l.offset != r.offset) +- return l.offset < r.offset ? -1 : 1; +- if (l.snapshot != r.snapshot) +- return l.snapshot < r.snapshot ? -1 : 1; +- return 0; ++ return cmp_int(l.inode, r.inode) ?: ++ cmp_int(l.offset, r.offset); + } +-#else +-int bkey_cmp(struct bpos l, struct bpos r); +-#endif + + static inline struct bpos bpos_min(struct bpos l, struct bpos r) + { +- return bkey_cmp(l, r) < 0 ? l : r; ++ return bpos_cmp(l, r) < 0 ? l : r; + } + + static inline struct bpos bpos_max(struct bpos l, struct bpos r) + { +- return bkey_cmp(l, r) > 0 ? l : r; ++ return bpos_cmp(l, r) > 0 ? 
l : r; + } + + #define sbb(a, b, borrow) \ +@@ -190,7 +188,7 @@ static inline struct bpos bpos_sub(struct bpos a, struct bpos b) + + static inline struct bpos bpos_diff(struct bpos l, struct bpos r) + { +- if (bkey_cmp(l, r) > 0) ++ if (bpos_cmp(l, r) > 0) + swap(l, r); + + return bpos_sub(r, l); +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 641169ef91b5..5e7eadeb3b57 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -138,10 +138,10 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + + const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) + { +- if (bkey_cmp(k.k->p, b->data->min_key) < 0) ++ if (bpos_cmp(k.k->p, b->data->min_key) < 0) + return "key before start of btree node"; + +- if (bkey_cmp(k.k->p, b->data->max_key) > 0) ++ if (bpos_cmp(k.k->p, b->data->max_key) > 0) + return "key past end of btree node"; + + return NULL; +@@ -165,9 +165,9 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) + + void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) + { +- if (!bkey_cmp(pos, POS_MIN)) ++ if (!bpos_cmp(pos, POS_MIN)) + pr_buf(out, "POS_MIN"); +- else if (!bkey_cmp(pos, POS_MAX)) ++ else if (!bpos_cmp(pos, POS_MAX)) + pr_buf(out, "POS_MAX"); + else { + if (pos.inode == U64_MAX) +@@ -256,7 +256,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, + !ops->key_merge || + l.k->type != r.k->type || + bversion_cmp(l.k->version, r.k->version) || +- bkey_cmp(l.k->p, bkey_start_pos(r.k))) ++ bpos_cmp(l.k->p, bkey_start_pos(r.k))) + return BCH_MERGE_NOMERGE; + + ret = ops->key_merge(c, l, r); +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index e4346c10268f..44b202798632 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -93,13 +93,13 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, + + n = bkey_unpack_key(b, _n); + +- if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { ++ if (bpos_cmp(n.p, k.k->p) < 0) { + printk(KERN_ERR "Key skipped backwards\n"); + continue; + } + + if (!bkey_deleted(k.k) && +- !bkey_cmp(n.p, k.k->p)) ++ !bpos_cmp(n.p, k.k->p)) + printk(KERN_ERR "Duplicate keys\n"); + } + } +@@ -534,7 +534,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, + goto start; + while (1) { + if (rw_aux_to_bkey(b, t, j) == k) { +- BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, ++ BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k, + bkey_unpack_pos(b, k))); + start: + if (++j == t->size) +@@ -1186,7 +1186,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, + while (l + 1 != r) { + unsigned m = (l + r) >> 1; + +- if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) ++ if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) + l = m; + else + r = m; +@@ -1318,7 +1318,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, + * start and end - handle that here: + */ + +- if (bkey_cmp(*search, t->max_key) > 0) ++ if (bpos_cmp(*search, t->max_key) > 0) + return btree_bkey_last(b, t); + + return bset_search_tree(b, t, search, lossy_packed_search); +@@ -1468,7 +1468,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, + struct bkey_packed *k[MAX_BSETS]; + unsigned i; + +- EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); ++ EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); + bset_aux_tree_verify(b); + + memset(iter, 0, sizeof(*iter)); +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 80ea75b2935a..506da4e0c911 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -378,7 +378,7 @@ static inline int 
bkey_cmp_p_or_unp(const struct btree *b, + EBUG_ON(r_packed && !bkey_packed(r_packed)); + + if (unlikely(!bkey_packed(l))) +- return bkey_cmp(packed_to_bkey_c(l)->p, *r); ++ return bpos_cmp(packed_to_bkey_c(l)->p, *r); + + if (likely(r_packed)) + return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); +@@ -403,24 +403,6 @@ bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) + return bch2_bkey_prev_filter(b, t, k, 1); + } + +-enum bch_extent_overlap { +- BCH_EXTENT_OVERLAP_ALL = 0, +- BCH_EXTENT_OVERLAP_BACK = 1, +- BCH_EXTENT_OVERLAP_FRONT = 2, +- BCH_EXTENT_OVERLAP_MIDDLE = 3, +-}; +- +-/* Returns how k overlaps with m */ +-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, +- const struct bkey *m) +-{ +- int cmp1 = bkey_cmp(k->p, m->p) < 0; +- int cmp2 = bkey_cmp(bkey_start_pos(k), +- bkey_start_pos(m)) > 0; +- +- return (cmp1 << 1) + cmp2; +-} +- + /* Btree key iteration */ + + void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 16099641a42d..fd96677bd50f 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -820,9 +820,9 @@ lock_node: + + EBUG_ON(b->c.btree_id != iter->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); +- EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); ++ EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && +- bkey_cmp(b->data->min_key, ++ bpos_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); + + return b; +@@ -903,9 +903,9 @@ lock_node: + + EBUG_ON(b->c.btree_id != btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); +- EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); ++ EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && +- bkey_cmp(b->data->min_key, ++ bpos_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); + out: + bch2_btree_cache_cannibalize_unlock(c); +@@ -1017,7 +1017,7 @@ out: + if (sib != btree_prev_sib) + swap(n1, n2); + +- if (bkey_cmp(bkey_successor(n1->key.k.p), ++ if (bpos_cmp(bkey_successor(n1->key.k.p), + n2->data->min_key)) { + char buf1[200], buf2[200]; + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index cb7ed7e35a9a..661425024a72 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -81,7 +81,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); + } + +- if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, ++ if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", +@@ -92,7 +92,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, + } + + if (fsck_err_on(is_last && +- bkey_cmp(cur.k->k.p, node_end), c, ++ bpos_cmp(cur.k->k.p, node_end), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", +@@ -489,8 +489,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + bkey_init(&prev.k->k); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { +- BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); +- BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); ++ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + &k, &max_stale, true); +@@ -581,13 +581,13 @@ static int 
bch2_gc_btree_init(struct bch_fs *c, + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); +- if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, ++ if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, + "btree root with incorrect min_key: %s", + (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { + BUG(); + } + +- if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, ++ if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, + "btree root with incorrect max_key: %s", + (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { + BUG(); +@@ -1448,7 +1448,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + unsigned j; + + for (j = 0; j < nr_new_nodes; j++) +- if (!bkey_cmp(old_nodes[i]->key.k.p, ++ if (!bpos_cmp(old_nodes[i]->key.k.p, + new_nodes[j]->key.k.p)) + goto next; + +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index d5559827ed7f..44b7d121610f 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -45,13 +45,9 @@ static inline struct gc_pos gc_phase(enum gc_phase phase) + + static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) + { +- if (l.phase != r.phase) +- return l.phase < r.phase ? -1 : 1; +- if (bkey_cmp(l.pos, r.pos)) +- return bkey_cmp(l.pos, r.pos); +- if (l.level != r.level) +- return l.level < r.level ? -1 : 1; +- return 0; ++ return cmp_int(l.phase, r.phase) ?: ++ bpos_cmp(l.pos, r.pos) ?: ++ cmp_int(l.level, r.level); + } + + static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 9e909a73004f..f7f265d49d0c 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -38,7 +38,7 @@ static void verify_no_dups(struct btree *b, + struct bkey l = bkey_unpack_key(b, p); + struct bkey r = bkey_unpack_key(b, k); + +- BUG_ON(bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); ++ BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); + } + #endif + } +@@ -631,14 +631,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + b->data->max_key = b->key.k.p; + } + +- btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), ++ btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect min_key: got %s should be %s", + (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), + (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); + } + +- btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), ++ btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect max key %s", + (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 16ce6dff6af7..f155a6cc1755 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -220,7 +220,7 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, + { + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && +- bkey_cmp(bn->min_key, POS_MIN) && ++ bpos_cmp(bn->min_key, POS_MIN) && + write) + bn->min_key = bkey_predecessor(bn->min_key); + +@@ -229,7 +229,7 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && +- bkey_cmp(bn->min_key, POS_MIN) && ++ bpos_cmp(bn->min_key, POS_MIN) && + !write) + bn->min_key = bkey_successor(bn->min_key); + } +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 
f3c90204332f..b64eb94aa746 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -37,13 +37,13 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) + static inline bool btree_iter_pos_before_node(struct btree_iter *iter, + struct btree *b) + { +- return bkey_cmp(iter->real_pos, b->data->min_key) < 0; ++ return bpos_cmp(iter->real_pos, b->data->min_key) < 0; + } + + static inline bool btree_iter_pos_after_node(struct btree_iter *iter, + struct btree *b) + { +- return bkey_cmp(b->key.k.p, iter->real_pos) < 0; ++ return bpos_cmp(b->key.k.p, iter->real_pos) < 0; + } + + static inline bool btree_iter_pos_in_node(struct btree_iter *iter, +@@ -293,7 +293,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + + /* Must lock btree nodes in key order: */ + if (btree_node_locked(linked, level) && +- bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, ++ bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, + btree_iter_type(linked))) <= 0) { + deadlock_iter = linked; + reason = 7; +@@ -1392,7 +1392,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + if (!b) + return NULL; + +- BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); ++ BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); + + iter->pos = iter->real_pos = b->key.k.p; + +@@ -1429,7 +1429,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + if (!b) + return NULL; + +- if (bkey_cmp(iter->pos, b->key.k.p) < 0) { ++ if (bpos_cmp(iter->pos, b->key.k.p) < 0) { + /* + * Haven't gotten to the end of the parent node: go back down to + * the next child node +@@ -1461,7 +1461,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + + static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) + { +- int cmp = bkey_cmp(new_pos, iter->real_pos); ++ int cmp = bpos_cmp(new_pos, iter->real_pos); + unsigned l = iter->level; + + if (!cmp) +@@ -1505,7 +1505,7 @@ out: + inline bool bch2_btree_iter_advance(struct btree_iter *iter) + { + struct bpos pos = iter->k.p; +- bool ret = bkey_cmp(pos, POS_MAX) != 0; ++ bool ret = bpos_cmp(pos, POS_MAX) != 0; + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(pos); +@@ -1516,7 +1516,7 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) + inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + { + struct bpos pos = bkey_start_pos(&iter->k); +- bool ret = bkey_cmp(pos, POS_MIN) != 0; ++ bool ret = bpos_cmp(pos, POS_MIN) != 0; + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_predecessor(pos); +@@ -1527,7 +1527,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) + { + struct bpos next_pos = iter->l[0].b->key.k.p; +- bool ret = bkey_cmp(next_pos, POS_MAX) != 0; ++ bool ret = bpos_cmp(next_pos, POS_MAX) != 0; + + /* + * Typically, we don't want to modify iter->pos here, since that +@@ -1545,7 +1545,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) + static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + { + struct bpos next_pos = iter->l[0].b->data->min_key; +- bool ret = bkey_cmp(next_pos, POS_MIN) != 0; ++ bool ret = bpos_cmp(next_pos, POS_MIN) != 0; + + if (ret) + btree_iter_set_search_pos(iter, bkey_predecessor(next_pos)); +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 30c76c7a8c12..98cf092a0b95 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ 
b/fs/bcachefs/btree_key_cache.c +@@ -21,7 +21,7 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const struct bkey_cached_key *key = arg->key; + + return cmp_int(ck->key.btree_id, key->btree_id) ?: +- bkey_cmp(ck->key.pos, key->pos); ++ bpos_cmp(ck->key.pos, key->pos); + } + + static const struct rhashtable_params bch2_btree_key_cache_params = { +@@ -251,7 +251,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) + const struct btree_iter *iter = p; + + return ck->key.btree_id == iter->btree_id && +- !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; ++ !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1; + } + + __flatten +@@ -292,7 +292,7 @@ retry: + if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, + bkey_cached_check_fn, iter, _THIS_IP_)) { + if (ck->key.btree_id != iter->btree_id || +- bkey_cmp(ck->key.pos, iter->pos)) { ++ bpos_cmp(ck->key.pos, iter->pos)) { + goto retry; + } + +@@ -302,7 +302,7 @@ retry: + } + + if (ck->key.btree_id != iter->btree_id || +- bkey_cmp(ck->key.pos, iter->pos)) { ++ bpos_cmp(ck->key.pos, iter->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index dfd35f67cdfc..4f82cb330ed8 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -50,7 +50,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + break; + bp = bkey_s_c_to_btree_ptr_v2(k); + +- if (bkey_cmp(next_node, bp.v->min_key)) { ++ if (bpos_cmp(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); + panic("expected next min_key %s got %s\n", + (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), +@@ -60,7 +60,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + bch2_btree_node_iter_advance(&iter, b); + + if (bch2_btree_node_iter_end(&iter)) { +- if (bkey_cmp(k.k->p, b->key.k.p)) { ++ if (bpos_cmp(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); + panic("expected end %s got %s\n", + (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5e1e9309211e..7f79836a57ca 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -26,7 +26,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + { + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: +- bkey_cmp(l->k->k.p, r->k->k.p); ++ bpos_cmp(l->k->k.p, r->k->k.p); + } + + static inline bool same_leaf_as_prev(struct btree_trans *trans, +@@ -70,8 +70,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); +- EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); +- EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); ++ EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); ++ EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(iter->trans->c, b)); + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); +@@ -225,7 +225,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, + + BUG_ON(bch2_debug_check_bkeys && + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type)); +- BUG_ON(bkey_cmp(i->k->k.p, i->iter->real_pos)); ++ BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos)); + BUG_ON(i->level != i->iter->level); + 
BUG_ON(i->btree_id != i->iter->btree_id); + } +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index c6d49f44aa01..2cedf6578f5b 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -273,7 +273,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + if (err) + return err; + +- if (!i->size || !bkey_cmp(POS_MAX, i->from)) ++ if (!i->size || !bpos_cmp(POS_MAX, i->from)) + return i->ret; + + bch2_trans_init(&trans, i->c, 0, 0); +@@ -289,7 +289,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + * can't easily correctly restart a btree node traversal across + * all nodes, meh + */ +- i->from = bkey_cmp(POS_MAX, b->key.k.p) ++ i->from = bpos_cmp(POS_MAX, b->key.k.p) + ? bkey_successor(b->key.k.p) + : b->key.k.p; + +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index c8069dfb90ff..ccee43a2019d 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -582,6 +582,24 @@ void bch2_ptr_swab(struct bkey_s); + + /* Generic extent code: */ + ++enum bch_extent_overlap { ++ BCH_EXTENT_OVERLAP_ALL = 0, ++ BCH_EXTENT_OVERLAP_BACK = 1, ++ BCH_EXTENT_OVERLAP_FRONT = 2, ++ BCH_EXTENT_OVERLAP_MIDDLE = 3, ++}; ++ ++/* Returns how k overlaps with m */ ++static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, ++ const struct bkey *m) ++{ ++ int cmp1 = bkey_cmp(k->p, m->p) < 0; ++ int cmp2 = bkey_cmp(bkey_start_pos(k), ++ bkey_start_pos(m)) > 0; ++ ++ return (cmp1 << 1) + cmp2; ++} ++ + int bch2_cut_front_s(struct bpos, struct bkey_s); + int bch2_cut_back_s(struct bpos, struct bkey_s); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 92f7568175eb..596f7c1e4245 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -48,14 +48,14 @@ static int __journal_key_cmp(enum btree_id l_btree_id, + { + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: +- bkey_cmp(l_pos, r->k->k.p)); ++ bpos_cmp(l_pos, r->k->k.p)); + } + + static int journal_key_cmp(struct journal_key *l, struct journal_key *r) + { + return (cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: +- bkey_cmp(l->k->k.p, r->k->k.p)); ++ bpos_cmp(l->k->k.p, r->k->k.p)); + } + + static size_t journal_key_search(struct journal_keys *journal_keys, +@@ -90,7 +90,7 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign + if (iter->idx > idx || + (iter->idx == idx && + biter->last && +- bkey_cmp(n->k.p, biter->unpacked.p) <= 0)) ++ bpos_cmp(n->k.p, biter->unpacked.p) <= 0)) + iter->idx++; + } + +@@ -238,7 +238,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * + bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); + + if (btree_k.k && journal_k.k) { +- int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); ++ int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p); + + if (!cmp) + bch2_journal_iter_advance_btree(iter); +@@ -256,7 +256,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * + ret = iter->last == journal ? 
journal_k : btree_k; + + if (iter->b && +- bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { ++ bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) { + iter->journal.idx = iter->journal.keys->nr; + iter->last = none; + return bkey_s_c_null; +@@ -419,7 +419,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) + + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: +- bkey_cmp(l->k->k.p, r->k->k.p) ?: ++ bpos_cmp(l->k->k.p, r->k->k.p) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); + } +@@ -490,7 +490,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + while (src + 1 < keys.d + keys.nr && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && +- !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) ++ !bpos_cmp(src[0].k->k.p, src[1].k->k.p)) + src++; + + *dst++ = *src++; +@@ -581,7 +581,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) + return cmp_int(r->level, l->level) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->btree_id, r->btree_id) ?: +- bkey_cmp(l->k->k.p, r->k->k.p); ++ bpos_cmp(l->k->k.p, r->k->k.p); + } + + static int bch2_journal_replay(struct bch_fs *c, +-- +cgit v1.2.3 + + +From 1259ba201519ecefd8ff817536fdec6d1c1c3a87 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 24 Mar 2021 18:02:16 -0400 +Subject: bcachefs: Start using bpos.snapshot field + +This patch starts treating the bpos.snapshot field like part of the key +in the btree code: + +* bpos_successor() and bpos_predecessor() now include the snapshot field +* Keys in btrees that will be using snapshots (extents, inodes, dirents + and xattrs) now always have their snapshot field set to U32_MAX + +The btree iterator code gets a new flag, BTREE_ITER_ALL_SNAPSHOTS, that +determines whether we're iterating over keys in all snapshots or not - +internally, this controlls whether bkey_(successor|predecessor) +increment/decrement the snapshot field, or only the higher bits of the +key. + +We add a new member to struct btree_iter, iter->snapshot: when +BTREE_ITER_ALL_SNAPSHOTS is not set, iter->pos.snapshot should always +equal iter->snapshot, which will be 0 for btrees that don't use +snapshots, and alsways U32_MAX for btrees that will use snapshots +(until we enable snapshot creation). + +This patch also introduces a new metadata version number, and compat +code for reading from/writing to older versions - this isn't a forced +upgrade (yet). 
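To make the new ordering concrete (illustrative values only, using the helpers added by the hunks below): with the snapshot field included in the key, "the next position" depends on whether the iterator is walking all snapshots or staying within one.

	struct bpos p = { .inode = 1, .offset = 10, .snapshot = 7 };

	/* BTREE_ITER_ALL_SNAPSHOTS / core btree code: bump the snapshot field first */
	struct bpos all = bpos_successor(p);		/* { 1, 10, 8 } */

	/*
	 * Single-snapshot iteration: advance to the next offset; the iterator's
	 * bkey_successor() below then sets .snapshot back to iter->snapshot.
	 */
	struct bpos one = bpos_nosnap_successor(p);	/* { 1, 11, 0 } */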
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 24 +++++------ + fs/bcachefs/bkey.c | 17 +++++--- + fs/bcachefs/bkey.h | 42 +++++++++++++++----- + fs/bcachefs/bkey_methods.c | 36 +++++++++++++++-- + fs/bcachefs/bset.c | 2 +- + fs/bcachefs/btree_cache.c | 2 +- + fs/bcachefs/btree_gc.c | 8 ++-- + fs/bcachefs/btree_io.c | 12 ++---- + fs/bcachefs/btree_io.h | 26 ++++++++++-- + fs/bcachefs/btree_iter.c | 79 ++++++++++++++++++++++++++++++++----- + fs/bcachefs/btree_iter.h | 3 ++ + fs/bcachefs/btree_types.h | 16 +++++++- + fs/bcachefs/btree_update_interior.c | 12 ++++-- + fs/bcachefs/btree_update_leaf.c | 14 +++++-- + fs/bcachefs/debug.c | 6 ++- + fs/bcachefs/extents.c | 7 ++-- + fs/bcachefs/fsck.c | 1 + + fs/bcachefs/inode.c | 1 + + fs/bcachefs/io.c | 5 +++ + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/recovery.c | 8 ++++ + fs/bcachefs/tests.c | 1 + + 22 files changed, 251 insertions(+), 73 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 3de414ceb267..375d1c7ed392 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -138,19 +138,18 @@ struct bpos { + #define KEY_SNAPSHOT_MAX ((__u32)~0U) + #define KEY_SIZE_MAX ((__u32)~0U) + +-static inline struct bpos POS(__u64 inode, __u64 offset) ++static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) + { +- struct bpos ret; +- +- ret.inode = inode; +- ret.offset = offset; +- ret.snapshot = 0; +- +- return ret; ++ return (struct bpos) { ++ .inode = inode, ++ .offset = offset, ++ .snapshot = snapshot, ++ }; + } + +-#define POS_MIN POS(0, 0) +-#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) ++#define POS_MIN SPOS(0, 0, 0) ++#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) ++#define POS(_inode, _offset) SPOS(_inode, _offset, 0) + + /* Empty placeholder struct, for container_of() */ + struct bch_val { +@@ -1204,7 +1203,8 @@ enum bcachefs_metadata_version { + bcachefs_metadata_version_new_versioning = 10, + bcachefs_metadata_version_bkey_renumber = 10, + bcachefs_metadata_version_inode_btree_change = 11, +- bcachefs_metadata_version_max = 12, ++ bcachefs_metadata_version_snapshot = 12, ++ bcachefs_metadata_version_max = 13, + }; + + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) +@@ -1742,7 +1742,7 @@ struct btree_node { + /* Closed interval: */ + struct bpos min_key; + struct bpos max_key; +- struct bch_extent_ptr ptr; ++ struct bch_extent_ptr _ptr; /* not used anymore */ + struct bkey_format format; + + union { +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +index 3e427b3d34a5..3af56062601f 100644 +--- a/fs/bcachefs/bkey.c ++++ b/fs/bcachefs/bkey.c +@@ -614,15 +614,19 @@ const char *bch2_bkey_format_validate(struct bkey_format *f) + return "incorrect number of fields"; + + for (i = 0; i < f->nr_fields; i++) { ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 field_offset = le64_to_cpu(f->field_offset[i]); + +- if (f->bits_per_field[i] > 64) ++ if (f->bits_per_field[i] > unpacked_bits) + return "field too large"; + +- if (field_offset && +- (f->bits_per_field[i] == 64 || +- (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < +- field_offset))) ++ if ((f->bits_per_field[i] == unpacked_bits) && field_offset) ++ return "offset + bits overflow"; ++ ++ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & ++ unpacked_mask) < ++ field_offset) + return "offset + bits overflow"; + + 
bits += f->bits_per_field[i]; +@@ -1123,11 +1127,12 @@ void bch2_bkey_pack_test(void) + struct bkey_packed p; + + struct bkey_format test_format = { +- .key_u64s = 2, ++ .key_u64s = 3, + .nr_fields = BKEY_NR_FIELDS, + .bits_per_field = { + 13, + 64, ++ 32, + }, + }; + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 41bfae6aae0d..2e45d88fab03 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -250,24 +250,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format) + format->bits_per_field[BKEY_FIELD_SNAPSHOT]; + } + +-static inline struct bpos bkey_successor(struct bpos p) ++static inline struct bpos bpos_successor(struct bpos p) + { +- struct bpos ret = p; ++ if (!++p.snapshot && ++ !++p.offset && ++ !++p.inode) ++ BUG(); + +- if (!++ret.offset) +- BUG_ON(!++ret.inode); ++ return p; ++} + +- return ret; ++static inline struct bpos bpos_predecessor(struct bpos p) ++{ ++ if (!p.snapshot-- && ++ !p.offset-- && ++ !p.inode--) ++ BUG(); ++ ++ return p; + } + +-static inline struct bpos bkey_predecessor(struct bpos p) ++static inline struct bpos bpos_nosnap_successor(struct bpos p) + { +- struct bpos ret = p; ++ p.snapshot = 0; + +- if (!ret.offset--) +- BUG_ON(!ret.inode--); ++ if (!++p.offset && ++ !++p.inode) ++ BUG(); + +- return ret; ++ return p; ++} ++ ++static inline struct bpos bpos_nosnap_predecessor(struct bpos p) ++{ ++ p.snapshot = 0; ++ ++ if (!p.offset-- && ++ !p.inode--) ++ BUG(); ++ ++ return p; + } + + static inline u64 bkey_start_offset(const struct bkey *k) +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 5e7eadeb3b57..6fe95b802e13 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -119,9 +119,16 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + return "nonzero size field"; + } + +- if (k.k->p.snapshot) ++ if (type != BKEY_TYPE_btree && ++ !btree_type_has_snapshots(type) && ++ k.k->p.snapshot) + return "nonzero snapshot"; + ++ if (type != BKEY_TYPE_btree && ++ btree_type_has_snapshots(type) && ++ k.k->p.snapshot != U32_MAX) ++ return "invalid snapshot field"; ++ + if (type != BKEY_TYPE_btree && + !bkey_cmp(k.k->p, POS_MAX)) + return "POS_MAX key"; +@@ -310,14 +317,15 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, + const struct bkey_ops *ops; + struct bkey uk; + struct bkey_s u; ++ unsigned nr_compat = 5; + int i; + + /* + * Do these operations in reverse order in the write path: + */ + +- for (i = 0; i < 4; i++) +- switch (!write ? i : 3 - i) { ++ for (i = 0; i < nr_compat; i++) ++ switch (!write ? i : nr_compat - 1 - i) { + case 0: + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_key(f, k); +@@ -351,6 +359,28 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, + } + break; + case 3: ++ if (version < bcachefs_metadata_version_snapshot && ++ (level || btree_type_has_snapshots(btree_id))) { ++ struct bkey_i *u = packed_to_bkey(k); ++ ++ if (u) { ++ u->k.p.snapshot = write ++ ? 0 : U32_MAX; ++ } else { ++ u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT]; ++ u64 max_packed = min_packed + ++ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); ++ ++ uk = __bch2_bkey_unpack_key(f, k); ++ uk.p.snapshot = write ++ ? 
min_packed : min_t(u64, U32_MAX, max_packed); ++ ++ BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); ++ } ++ } ++ ++ break; ++ case 4: + if (!bkey_packed(k)) { + u = bkey_i_to_s(packed_to_bkey(k)); + } else { +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 44b202798632..a8e2ebbe8ace 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -1450,7 +1450,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, + * to the search key is going to have 0 sectors after the search key. + * + * But this does mean that we can't just search for +- * bkey_successor(start_of_range) to get the first extent that overlaps with ++ * bpos_successor(start_of_range) to get the first extent that overlaps with + * the range we want - if we're unlucky and there's an extent that ends + * exactly where we searched, then there could be a deleted key at the same + * position and we'd get that when we search instead of the preceding extent +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index fd96677bd50f..ac59cb8c75c1 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -1017,7 +1017,7 @@ out: + if (sib != btree_prev_sib) + swap(n1, n2); + +- if (bpos_cmp(bkey_successor(n1->key.k.p), ++ if (bpos_cmp(bpos_successor(n1->key.k.p), + n2->data->min_key)) { + char buf1[200], buf2[200]; + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 661425024a72..7b26d743112e 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -64,7 +64,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, + struct bpos node_end = b->data->max_key; + struct bpos expected_start = bkey_deleted(&prev->k->k) + ? node_start +- : bkey_successor(prev->k->k.p); ++ : bpos_successor(prev->k->k.p); + char buf1[200], buf2[200]; + bool update_min = false; + bool update_max = false; +@@ -1187,7 +1187,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, +- BTREE_ITER_PREFETCH); ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); + + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { +@@ -1405,7 +1407,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + n1->key.k.p = n1->data->max_key = + bkey_unpack_pos(n1, last); + +- n2->data->min_key = bkey_successor(n1->data->max_key); ++ n2->data->min_key = bpos_successor(n1->data->max_key); + + memcpy_u64s(vstruct_last(s1), + s2->start, u64s); +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index f7f265d49d0c..04328456dcec 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -612,12 +612,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect level"); + +- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { +- u64 *p = (u64 *) &bn->ptr; +- +- *p = swab64(*p); +- } +- + if (!write) + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); +@@ -1328,8 +1322,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) + return -1; + +- ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?: +- validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); ++ ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: ++ validate_bset(c, NULL, b, i, sectors, WRITE, false); + if (ret) { + bch2_inconsistent_error(c); 
+ dump_stack(); +@@ -1482,7 +1476,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + validate_before_checksum = true; + + /* validate_bset will be modifying: */ +- if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change) ++ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) + validate_before_checksum = true; + + /* if we're going to be encrypting, check metadata validity first: */ +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index f155a6cc1755..9c14cd30a09e 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -189,8 +189,8 @@ void bch2_btree_flush_all_writes(struct bch_fs *); + void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); + + static inline void compat_bformat(unsigned level, enum btree_id btree_id, +- unsigned version, unsigned big_endian, +- int write, struct bkey_format *f) ++ unsigned version, unsigned big_endian, ++ int write, struct bkey_format *f) + { + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_inodes) { +@@ -199,6 +199,16 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id, + swap(f->field_offset[BKEY_FIELD_INODE], + f->field_offset[BKEY_FIELD_OFFSET]); + } ++ ++ if (version < bcachefs_metadata_version_snapshot && ++ (level || btree_type_has_snapshots(btree_id))) { ++ u64 max_packed = ++ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); ++ ++ f->field_offset[BKEY_FIELD_SNAPSHOT] = write ++ ? 0 ++ : U32_MAX - max_packed; ++ } + } + + static inline void compat_bpos(unsigned level, enum btree_id btree_id, +@@ -222,16 +232,24 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, + btree_node_type_is_extents(btree_id) && + bpos_cmp(bn->min_key, POS_MIN) && + write) +- bn->min_key = bkey_predecessor(bn->min_key); ++ bn->min_key = bpos_nosnap_predecessor(bn->min_key); ++ ++ if (version < bcachefs_metadata_version_snapshot && ++ write) ++ bn->max_key.snapshot = 0; + + compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); + compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); + ++ if (version < bcachefs_metadata_version_snapshot && ++ !write) ++ bn->max_key.snapshot = U32_MAX; ++ + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && + bpos_cmp(bn->min_key, POS_MIN) && + !write) +- bn->min_key = bkey_successor(bn->min_key); ++ bn->min_key = bpos_nosnap_successor(bn->min_key); + } + + #endif /* _BCACHEFS_BTREE_IO_H */ +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index b64eb94aa746..4a3f3d5b860f 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -18,6 +18,36 @@ + + static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); + ++static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) ++{ ++ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); ++ ++ /* Are we iterating over keys in all snapshots? */ ++ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { ++ p = bpos_successor(p); ++ } else { ++ p = bpos_nosnap_successor(p); ++ p.snapshot = iter->snapshot; ++ } ++ ++ return p; ++} ++ ++static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) ++{ ++ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); ++ ++ /* Are we iterating over keys in all snapshots? 
*/ ++ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { ++ p = bpos_predecessor(p); ++ } else { ++ p = bpos_nosnap_predecessor(p); ++ p.snapshot = iter->snapshot; ++ } ++ ++ return p; ++} ++ + static inline bool is_btree_node(struct btree_iter *iter, unsigned l) + { + return l < BTREE_MAX_DEPTH && +@@ -30,7 +60,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) + + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(pos, POS_MAX)) +- pos = bkey_successor(pos); ++ pos = bkey_successor(iter, pos); + return pos; + } + +@@ -591,10 +621,24 @@ err: + + static void bch2_btree_iter_verify(struct btree_iter *iter) + { ++ enum btree_iter_type type = btree_iter_type(iter); + unsigned i; + + EBUG_ON(iter->btree_id >= BTREE_ID_NR); + ++ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && ++ iter->pos.snapshot != iter->snapshot); ++ ++ BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); ++ ++ BUG_ON(type == BTREE_ITER_NODES && ++ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); ++ ++ BUG_ON(type != BTREE_ITER_NODES && ++ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && ++ !btree_type_has_snapshots(iter->btree_id)); ++ + bch2_btree_iter_verify_locks(iter); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) +@@ -605,6 +649,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) + { + enum btree_iter_type type = btree_iter_type(iter); + ++ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && ++ iter->pos.snapshot != iter->snapshot); ++ + BUG_ON((type == BTREE_ITER_KEYS || + type == BTREE_ITER_CACHED) && + (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || +@@ -1434,7 +1481,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + * Haven't gotten to the end of the parent node: go back down to + * the next child node + */ +- btree_iter_set_search_pos(iter, bkey_successor(iter->pos)); ++ btree_iter_set_search_pos(iter, bpos_successor(iter->pos)); + + /* Unlock to avoid screwing up our lock invariants: */ + btree_node_unlock(iter, iter->level); +@@ -1508,7 +1555,7 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) + bool ret = bpos_cmp(pos, POS_MAX) != 0; + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) +- pos = bkey_successor(pos); ++ pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; + } +@@ -1519,7 +1566,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + bool ret = bpos_cmp(pos, POS_MIN) != 0; + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) +- pos = bkey_predecessor(pos); ++ pos = bkey_predecessor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; + } +@@ -1535,7 +1582,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) + * btree, in that case we want iter->pos to reflect that: + */ + if (ret) +- btree_iter_set_search_pos(iter, bkey_successor(next_pos)); ++ btree_iter_set_search_pos(iter, bpos_successor(next_pos)); + else + bch2_btree_iter_set_pos(iter, POS_MAX); + +@@ -1548,7 +1595,7 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + bool ret = bpos_cmp(next_pos, POS_MIN) != 0; + + if (ret) +- btree_iter_set_search_pos(iter, bkey_predecessor(next_pos)); ++ btree_iter_set_search_pos(iter, bpos_predecessor(next_pos)); + else + bch2_btree_iter_set_pos(iter, POS_MIN); + +@@ -1594,13 +1641,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi + k = btree_iter_level_peek(iter, &iter->l[0]); + + if (next_update && +- bkey_cmp(next_update->k.p, 
iter->real_pos) <= 0) ++ bpos_cmp(next_update->k.p, iter->real_pos) <= 0) + k = bkey_i_to_s_c(next_update); + + if (likely(k.k)) { + if (bkey_deleted(k.k)) { + btree_iter_set_search_pos(iter, +- bkey_successor(k.k->p)); ++ bkey_successor(iter, k.k->p)); + continue; + } + +@@ -1739,7 +1786,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; + +- bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); ++ bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos)); + } + + pos = iter->pos; +@@ -1973,6 +2020,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + { + struct btree_iter *iter, *best = NULL; + ++ if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && ++ !btree_type_has_snapshots(btree_id)) ++ flags &= ~BTREE_ITER_ALL_SNAPSHOTS; ++ ++ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ pos.snapshot = btree_type_has_snapshots(btree_id) ++ ? U32_MAX : 0; ++ + /* We always want a fresh iterator for node iterators: */ + if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES) + goto alloc_iter; +@@ -2007,11 +2062,14 @@ alloc_iter: + + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + btree_node_type_is_extents(btree_id) && +- !(flags & BTREE_ITER_NOT_EXTENTS)) ++ !(flags & BTREE_ITER_NOT_EXTENTS) && ++ !(flags & BTREE_ITER_ALL_SNAPSHOTS)) + flags |= BTREE_ITER_IS_EXTENTS; + + iter->flags = flags; + ++ iter->snapshot = pos.snapshot; ++ + if (!(iter->flags & BTREE_ITER_INTENT)) + bch2_btree_iter_downgrade(iter); + else if (!iter->locks_want) +@@ -2034,6 +2092,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + __bch2_trans_get_iter(trans, btree_id, pos, + BTREE_ITER_NODES| + BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| + flags); + unsigned i; + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 176661b3b879..7585f989ad50 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -172,6 +172,9 @@ bool bch2_btree_iter_rewind(struct btree_iter *); + + static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + { ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ new_pos.snapshot = iter->snapshot; ++ + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; + } +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index f5eb970db8f9..038cd1f94376 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -216,6 +216,7 @@ enum btree_iter_type { + #define BTREE_ITER_CACHED_NOFILL (1 << 9) + #define BTREE_ITER_CACHED_NOCREATE (1 << 10) + #define BTREE_ITER_NOT_EXTENTS (1 << 11) ++#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) + + enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, +@@ -245,6 +246,8 @@ struct btree_iter { + /* what we're searching for/what the iterator actually points to: */ + struct bpos real_pos; + struct bpos pos_after_commit; ++ /* When we're filtering by snapshot, the snapshot ID we're looking for: */ ++ unsigned snapshot; + + u16 flags; + u8 idx; +@@ -329,7 +332,7 @@ struct bkey_cached { + struct btree_insert_entry { + unsigned trigger_flags; + u8 bkey_type; +- u8 btree_id; ++ enum btree_id btree_id:8; + u8 level; + unsigned trans_triggers_run:1; + unsigned is_extent:1; +@@ -610,6 +613,17 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + ++#define BTREE_ID_HAS_SNAPSHOTS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_inodes)| \ ++ (1U << 
BTREE_ID_dirents)| \ ++ (1U << BTREE_ID_xattrs)) ++ ++static inline bool btree_type_has_snapshots(enum btree_id id) ++{ ++ return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; ++} ++ + enum btree_trigger_flags { + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 4f82cb330ed8..19dfc32e8c68 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -69,7 +69,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + break; + } + +- next_node = bkey_successor(k.k->p); ++ next_node = bpos_successor(k.k->p); + } + #endif + } +@@ -289,7 +289,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + b->data->flags = 0; + SET_BTREE_NODE_ID(b->data, as->btree_id); + SET_BTREE_NODE_LEVEL(b->data, level); +- b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); +@@ -1100,6 +1099,7 @@ static struct btree *__btree_split_node(struct btree_update *as, + struct btree *n2; + struct bset *set1, *set2; + struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; ++ struct bpos n1_pos; + + n2 = bch2_btree_node_alloc(as, n1->c.level); + bch2_btree_update_add_new_node(as, n2); +@@ -1146,8 +1146,12 @@ static struct btree *__btree_split_node(struct btree_update *as, + n1->nr.packed_keys = nr_packed; + n1->nr.unpacked_keys = nr_unpacked; + +- btree_set_max(n1, bkey_unpack_pos(n1, prev)); +- btree_set_min(n2, bkey_successor(n1->key.k.p)); ++ n1_pos = bkey_unpack_pos(n1, prev); ++ if (as->c->sb.version < bcachefs_metadata_version_snapshot) ++ n1_pos.snapshot = U32_MAX; ++ ++ btree_set_max(n1, n1_pos); ++ btree_set_min(n2, bpos_successor(n1->key.k.p)); + + bch2_bkey_format_init(&s); + bch2_bkey_format_add_pos(&s, n2->data->min_key); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 7f79836a57ca..e73577ef853c 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -223,9 +223,17 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + +- BUG_ON(bch2_debug_check_bkeys && +- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type)); +- BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos)); ++ if (bch2_debug_check_bkeys) { ++ const char *invalid = bch2_bkey_invalid(c, ++ bkey_i_to_s_c(i->k), i->bkey_type); ++ if (invalid) { ++ char buf[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); ++ panic("invalid bkey %s on insert: %s\n", buf, invalid); ++ } ++ } ++ BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); + BUG_ON(i->level != i->iter->level); + BUG_ON(i->btree_id != i->iter->btree_id); + } +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 2cedf6578f5b..acf600387c9f 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -222,7 +222,9 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, + + bch2_trans_init(&trans, i->c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); ++ iter = bch2_trans_get_iter(&trans, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(iter); + + while (k.k && !(err = bkey_err(k))) { +@@ -290,7 +292,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + * all nodes, meh + */ + i->from = 
bpos_cmp(POS_MAX, b->key.k.p) +- ? bkey_successor(b->key.k.p) ++ ? bpos_successor(b->key.k.p) + : b->key.k.p; + + if (!i->size) +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index a7e0408213a9..b07d39555eb6 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -180,7 +180,8 @@ const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; + +- if (bp.v->min_key.snapshot) ++ if (c->sb.version < bcachefs_metadata_version_snapshot && ++ bp.v->min_key.snapshot) + return "invalid min_key.snapshot"; + + return bch2_bkey_ptrs_invalid(c, k); +@@ -212,8 +213,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, + btree_node_type_is_extents(btree_id) && + bkey_cmp(bp.v->min_key, POS_MIN)) + bp.v->min_key = write +- ? bkey_predecessor(bp.v->min_key) +- : bkey_successor(bp.v->min_key); ++ ? bpos_nosnap_predecessor(bp.v->min_key) ++ : bpos_nosnap_successor(bp.v->min_key); + } + + /* KEY_TYPE_extent: */ +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index ffb30ef7ef00..a3acae0ddfa9 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1318,6 +1318,7 @@ static int check_inode(struct btree_trans *trans, + struct bkey_inode_buf p; + + bch2_inode_pack(c, &p, &u); ++ p.inode.k.p = iter->pos; + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 4559e77f91f0..40b176fc1788 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -332,6 +332,7 @@ int bch2_inode_write(struct btree_trans *trans, + return PTR_ERR(inode_p); + + bch2_inode_pack(trans->c, inode_p, inode); ++ inode_p->inode.k.p.snapshot = iter->snapshot; + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return 0; + } +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 07b7a648b0c9..c484e58acbec 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -322,6 +322,9 @@ int bch2_extent_update(struct btree_trans *trans, + + if (i_sectors_delta || new_i_size) { + bch2_inode_pack(trans->c, &inode_p, &inode_u); ++ ++ inode_p.inode.k.p.snapshot = iter->snapshot; ++ + bch2_trans_update(trans, inode_iter, + &inode_p.inode.k_i, 0); + } +@@ -437,6 +440,8 @@ int bch2_write_index_default(struct bch_write_op *op) + + k = bch2_keylist_front(keys); + ++ k->k.p.snapshot = iter->snapshot; ++ + bch2_bkey_buf_realloc(&sk, c, k->k.u64s); + bkey_copy(sk.k, k); + bch2_cut_front(iter->pos, sk.k); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 54f2e2053bc0..c7fa03cfbde6 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1452,7 +1452,7 @@ void bch2_journal_write(struct closure *cl) + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) + validate_before_checksum = true; + +- if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change) ++ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) + validate_before_checksum = true; + + if (validate_before_checksum && +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 596f7c1e4245..a3a6abb88d6f 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -998,6 +998,13 @@ int bch2_fs_recovery(struct bch_fs *c) + goto err; + } + ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { ++ bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); ++ ret = -EINVAL; ++ goto err; ++ ++ } ++ + if 
(!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { + bch_info(c, "alloc_v2 feature bit not set, fsck required"); + c->opts.fsck = true; +@@ -1340,6 +1347,7 @@ int bch2_fs_initialize(struct bch_fs *c) + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; + bch2_inode_pack(c, &packed_inode, &root_inode); ++ packed_inode.inode.k.p.snapshot = U32_MAX; + + err = "error creating root directory"; + ret = bch2_btree_insert(c, BTREE_ID_inodes, +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 14b85dc22342..7507b6bcc13f 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -483,6 +483,7 @@ static int rand_insert(struct bch_fs *c, u64 nr) + for (i = 0; i < nr; i++) { + bkey_cookie_init(&k.k_i); + k.k.p.offset = test_rand(); ++ k.k.p.snapshot = U32_MAX; + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); +-- +cgit v1.2.3 + + +From 8aee97b68bd5279cb7253cef3484e2ab9fb489e1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 2 Mar 2021 18:35:30 -0500 +Subject: bcachefs: Inode backpointers + +This patch adds two new inode fields, bi_dir and bi_dir_offset, that +point back to the inode's dirent. + +Since we're only adding fields for a single backpointer, files that have +been hardlinked won't necessarily have valid backpointers: we also add a +new inode flag, BCH_INODE_BACKPTR_UNTRUSTED, that's set if an inode has +ever had multiple links to it. That's ok, because we only really need +this functionality for directories, which can never have multiple +hardlinks - when we add subvolumes, we'll need a way to enemurate and +print subvolumes, and this will let us reconstruct a path to a subvolume +root given a subvolume root inode. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 9 ++++-- + fs/bcachefs/dirent.c | 18 ++++++++---- + fs/bcachefs/dirent.h | 6 ++-- + fs/bcachefs/fs-common.c | 68 +++++++++++++++++++++++++++++++++---------- + fs/bcachefs/fsck.c | 43 +++++++++++++++++++++++++++ + fs/bcachefs/inode.c | 18 ++++-------- + fs/bcachefs/inode.h | 3 +- + 7 files changed, 125 insertions(+), 40 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 375d1c7ed392..ead7268bf898 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -706,7 +706,9 @@ struct bch_inode_generation { + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ +- x(bi_fields_set, 16) ++ x(bi_fields_set, 16) \ ++ x(bi_dir, 64) \ ++ x(bi_dir_offset, 64) + + /* subset of BCH_INODE_FIELDS */ + #define BCH_INODE_OPTS() \ +@@ -742,6 +744,7 @@ enum { + __BCH_INODE_I_SIZE_DIRTY= 5, + __BCH_INODE_I_SECTORS_DIRTY= 6, + __BCH_INODE_UNLINKED = 7, ++ __BCH_INODE_BACKPTR_UNTRUSTED = 8, + + /* bits 20+ reserved for packed fields below: */ + }; +@@ -754,6 +757,7 @@ enum { + #define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) + #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) + #define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) ++#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) + + LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); + LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +@@ -1204,7 +1208,8 @@ enum bcachefs_metadata_version { + bcachefs_metadata_version_bkey_renumber = 10, + bcachefs_metadata_version_inode_btree_change = 11, + bcachefs_metadata_version_snapshot = 12, +- bcachefs_metadata_version_max = 13, 
++ bcachefs_metadata_version_inode_backpointers = 13, ++ bcachefs_metadata_version_max = 14, + }; + + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 592dd80cf963..cf4ce2e7f29c 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -141,7 +141,7 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + int bch2_dirent_create(struct btree_trans *trans, + u64 dir_inum, const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, +- int flags) ++ u64 *dir_offset, int flags) + { + struct bkey_i_dirent *dirent; + int ret; +@@ -151,8 +151,11 @@ int bch2_dirent_create(struct btree_trans *trans, + if (ret) + return ret; + +- return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, +- dir_inum, &dirent->k_i, flags); ++ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, ++ dir_inum, &dirent->k_i, flags); ++ *dir_offset = dirent->k.p.offset; ++ ++ return ret; + } + + static void dirent_copy_target(struct bkey_i_dirent *dst, +@@ -165,8 +168,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, + int bch2_dirent_rename(struct btree_trans *trans, + u64 src_dir, struct bch_hash_info *src_hash, + u64 dst_dir, struct bch_hash_info *dst_hash, +- const struct qstr *src_name, u64 *src_inum, +- const struct qstr *dst_name, u64 *dst_inum, ++ const struct qstr *src_name, u64 *src_inum, u64 *src_offset, ++ const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, + enum bch_rename_mode mode) + { + struct btree_iter *src_iter = NULL, *dst_iter = NULL; +@@ -255,7 +258,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + new_dst->k.p = src_iter->pos; + bch2_trans_update(trans, src_iter, + &new_dst->k_i, 0); +- goto out; ++ goto out_set_offset; + } else { + /* If we're overwriting, we can't insert new_dst + * at a different slot because it has to +@@ -278,6 +281,9 @@ int bch2_dirent_rename(struct btree_trans *trans, + + bch2_trans_update(trans, src_iter, &new_src->k_i, 0); + bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); ++out_set_offset: ++ *src_offset = new_src->k.p.offset; ++ *dst_offset = new_dst->k.p.offset; + out: + bch2_trans_iter_put(trans, src_iter); + bch2_trans_iter_put(trans, dst_iter); +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index 34769371dd13..e1d8ce377d43 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -31,7 +31,7 @@ static inline unsigned dirent_val_u64s(unsigned len) + + int bch2_dirent_create(struct btree_trans *, u64, + const struct bch_hash_info *, u8, +- const struct qstr *, u64, int); ++ const struct qstr *, u64, u64 *, int); + + int bch2_dirent_delete_at(struct btree_trans *, + const struct bch_hash_info *, +@@ -46,8 +46,8 @@ enum bch_rename_mode { + int bch2_dirent_rename(struct btree_trans *, + u64, struct bch_hash_info *, + u64, struct bch_hash_info *, +- const struct qstr *, u64 *, +- const struct qstr *, u64 *, ++ const struct qstr *, u64 *, u64 *, ++ const struct qstr *, u64 *, u64 *, + enum bch_rename_mode); + + struct btree_iter * +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 503ce1920f39..83c2168ce480 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -20,8 +20,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + { + struct bch_fs *c = trans->c; + struct btree_iter *dir_iter = NULL; ++ struct btree_iter *inode_iter = NULL; + struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); +- u64 
now = bch2_current_time(trans->c); ++ u64 now = bch2_current_time(c); ++ u64 dir_offset = 0; + int ret; + + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); +@@ -34,7 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + if (!name) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + +- ret = bch2_inode_create(trans, new_inode); ++ inode_iter = bch2_inode_create(trans, new_inode); ++ ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto err; + +@@ -66,11 +69,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(new_inode->bi_mode), + name, new_inode->bi_inum, ++ &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + } ++ ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { ++ new_inode->bi_dir = dir_u->bi_inum; ++ new_inode->bi_dir_offset = dir_offset; ++ } ++ ++ ret = bch2_inode_write(trans, inode_iter, new_inode); + err: ++ bch2_trans_iter_put(trans, inode_iter); + bch2_trans_iter_put(trans, dir_iter); + return ret; + } +@@ -79,9 +91,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + u64 inum, struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, const struct qstr *name) + { ++ struct bch_fs *c = trans->c; + struct btree_iter *dir_iter = NULL, *inode_iter = NULL; + struct bch_hash_info dir_hash; +- u64 now = bch2_current_time(trans->c); ++ u64 now = bch2_current_time(c); ++ u64 dir_offset = 0; + int ret; + + inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); +@@ -92,6 +106,8 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + inode_u->bi_ctime = now; + bch2_inode_nlink_inc(inode_u); + ++ inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; ++ + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); + ret = PTR_ERR_OR_ZERO(dir_iter); + if (ret) +@@ -99,12 +115,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + + dir_u->bi_mtime = dir_u->bi_ctime = now; + +- dir_hash = bch2_hash_info_init(trans->c, dir_u); ++ dir_hash = bch2_hash_info_init(c, dir_u); + +- ret = bch2_dirent_create(trans, dir_inum, &dir_hash, +- mode_to_type(inode_u->bi_mode), +- name, inum, BCH_HASH_SET_MUST_CREATE) ?: +- bch2_inode_write(trans, dir_iter, dir_u) ?: ++ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, ++ mode_to_type(inode_u->bi_mode), ++ name, inum, &dir_offset, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ goto err; ++ ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { ++ inode_u->bi_dir = dir_inum; ++ inode_u->bi_dir_offset = dir_offset; ++ } ++ ++ ret = bch2_inode_write(trans, dir_iter, dir_u) ?: + bch2_inode_write(trans, inode_iter, inode_u); + err: + bch2_trans_iter_put(trans, dir_iter); +@@ -117,10 +142,11 @@ int bch2_unlink_trans(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + const struct qstr *name) + { ++ struct bch_fs *c = trans->c; + struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, + *inode_iter = NULL; + struct bch_hash_info dir_hash; +- u64 inum, now = bch2_current_time(trans->c); ++ u64 inum, now = bch2_current_time(c); + struct bkey_s_c k; + int ret; + +@@ -129,7 +155,7 @@ int bch2_unlink_trans(struct btree_trans *trans, + if (ret) + goto err; + +- dir_hash = bch2_hash_info_init(trans->c, dir_u); ++ dir_hash = bch2_hash_info_init(c, dir_u); + + dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, + name, BTREE_ITER_INTENT); +@@ -195,10 +221,12 @@ int bch2_rename_trans(struct btree_trans *trans, 
+ const struct qstr *dst_name, + enum bch_rename_mode mode) + { ++ struct bch_fs *c = trans->c; + struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; + struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; + struct bch_hash_info src_hash, dst_hash; +- u64 src_inode, dst_inode, now = bch2_current_time(trans->c); ++ u64 src_inode, src_offset, dst_inode, dst_offset; ++ u64 now = bch2_current_time(c); + int ret; + + src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, +@@ -207,7 +235,7 @@ int bch2_rename_trans(struct btree_trans *trans, + if (ret) + goto err; + +- src_hash = bch2_hash_info_init(trans->c, src_dir_u); ++ src_hash = bch2_hash_info_init(c, src_dir_u); + + if (dst_dir != src_dir) { + dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, +@@ -216,7 +244,7 @@ int bch2_rename_trans(struct btree_trans *trans, + if (ret) + goto err; + +- dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); ++ dst_hash = bch2_hash_info_init(c, dst_dir_u); + } else { + dst_dir_u = src_dir_u; + dst_hash = src_hash; +@@ -225,8 +253,8 @@ int bch2_rename_trans(struct btree_trans *trans, + ret = bch2_dirent_rename(trans, + src_dir, &src_hash, + dst_dir, &dst_hash, +- src_name, &src_inode, +- dst_name, &dst_inode, ++ src_name, &src_inode, &src_offset, ++ dst_name, &dst_inode, &dst_offset, + mode); + if (ret) + goto err; +@@ -245,6 +273,16 @@ int bch2_rename_trans(struct btree_trans *trans, + goto err; + } + ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { ++ src_inode_u->bi_dir = dst_dir_u->bi_inum; ++ src_inode_u->bi_dir_offset = dst_offset; ++ ++ if (mode == BCH_RENAME_EXCHANGE) { ++ dst_inode_u->bi_dir = src_dir_u->bi_inum; ++ dst_inode_u->bi_dir_offset = src_offset; ++ } ++ } ++ + if (mode == BCH_RENAME_OVERWRITE) { + if (S_ISDIR(src_inode_u->bi_mode) != + S_ISDIR(dst_inode_u->bi_mode)) { +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index a3acae0ddfa9..d65b3e100f78 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -675,6 +675,39 @@ retry: + continue; + } + ++ if (!target.bi_nlink && ++ !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && ++ (target.bi_dir != k.k->p.inode || ++ target.bi_dir_offset != k.k->p.offset) && ++ (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, ++ "inode %llu has wrong backpointer:\n" ++ "got %llu:%llu\n" ++ "should be %llu:%llu", ++ d_inum, ++ target.bi_dir, ++ target.bi_dir_offset, ++ k.k->p.inode, ++ k.k->p.offset) || ++ c->opts.version_upgrade)) { ++ struct bkey_inode_buf p; ++ ++ target.bi_dir = k.k->p.inode; ++ target.bi_dir_offset = k.k->p.offset; ++ bch2_trans_unlock(&trans); ++ ++ bch2_inode_pack(c, &p, &target); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_inodes, ++ &p.inode.k_i, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret) { ++ bch_err(c, "error in fsck: error %i updating inode", ret); ++ goto err; ++ } ++ continue; ++ } ++ + if (fsck_err_on(have_target && + d.v->d_type != + mode_to_type(target.bi_mode), c, +@@ -1314,6 +1347,16 @@ static int check_inode(struct btree_trans *trans, + do_update = true; + } + ++ if (!S_ISDIR(u.bi_mode) && ++ u.bi_nlink && ++ !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && ++ (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, ++ "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") || ++ c->opts.version_upgrade)) { ++ u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; ++ do_update = true; ++ } ++ + if (do_update) { + struct bkey_inode_buf p; + +diff --git a/fs/bcachefs/inode.c 
b/fs/bcachefs/inode.c +index 40b176fc1788..f1665ca85da6 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -470,11 +470,10 @@ static inline u32 bkey_generation(struct bkey_s_c k) + } + } + +-int bch2_inode_create(struct btree_trans *trans, +- struct bch_inode_unpacked *inode_u) ++struct btree_iter *bch2_inode_create(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u) + { + struct bch_fs *c = trans->c; +- struct bkey_inode_buf *inode_p; + struct btree_iter *iter = NULL; + struct bkey_s_c k; + u64 min, max, start, *hint; +@@ -494,10 +493,6 @@ int bch2_inode_create(struct btree_trans *trans, + + if (start >= max || start < min) + start = min; +- +- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); +- if (IS_ERR(inode_p)) +- return PTR_ERR(inode_p); + again: + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { +@@ -521,7 +516,7 @@ again: + bch2_trans_iter_put(trans, iter); + + if (ret) +- return ret; ++ return ERR_PTR(ret); + + if (start != min) { + /* Retry from start */ +@@ -529,15 +524,12 @@ again: + goto again; + } + +- return -ENOSPC; ++ return ERR_PTR(-ENOSPC); + found_slot: + *hint = k.k->p.offset; + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); +- +- ret = bch2_inode_write(trans, iter, inode_u); +- bch2_trans_iter_put(trans, iter); +- return ret; ++ return iter; + } + + int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 1caf036ae928..6bad6dfb7989 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -69,7 +69,8 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); + +-int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); ++struct btree_iter *bch2_inode_create(struct btree_trans *, ++ struct bch_inode_unpacked *); + + int bch2_inode_rm(struct bch_fs *, u64, bool); + +-- +cgit v1.2.3 + + +From b808fb1c99dc5682a660968e9dbf3fe4cb3e23bb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Mar 2021 19:18:30 -0400 +Subject: bcachefs: Change inode allocation code for snapshots + +For snapshots, when we allocate a new inode we want to allocate an inode +number that isn't in use in any other subvolume. We won't be able to use +ITER_SLOTS for this, inode allocation needs to change to use +BTREE_ITER_ALL_SNAPSHOTS. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-common.c | 6 +++- + fs/bcachefs/inode.c | 78 ++++++++++++++++++++++++++++++++++--------------- + fs/bcachefs/inode.h | 2 +- + 3 files changed, 61 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 83c2168ce480..281a6135e599 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -36,7 +36,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + if (!name) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + +- inode_iter = bch2_inode_create(trans, new_inode); ++ inode_iter = bch2_inode_create(trans, new_inode, U32_MAX); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto err; +@@ -80,6 +80,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + new_inode->bi_dir_offset = dir_offset; + } + ++ /* XXX use bch2_btree_iter_set_snapshot() */ ++ inode_iter->snapshot = U32_MAX; ++ bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); ++ + ret = bch2_inode_write(trans, inode_iter, new_inode); + err: + bch2_trans_iter_put(trans, inode_iter); +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index f1665ca85da6..d4c328397156 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -471,12 +471,13 @@ static inline u32 bkey_generation(struct bkey_s_c k) + } + + struct btree_iter *bch2_inode_create(struct btree_trans *trans, +- struct bch_inode_unpacked *inode_u) ++ struct bch_inode_unpacked *inode_u, ++ u32 snapshot) + { + struct bch_fs *c = trans->c; + struct btree_iter *iter = NULL; + struct bkey_s_c k; +- u64 min, max, start, *hint; ++ u64 min, max, start, pos, *hint; + int ret; + + u64 cpu = raw_smp_processor_id(); +@@ -493,39 +494,70 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, + + if (start >= max || start < min) + start = min; ++ ++ pos = start; ++ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos), ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); + again: +- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { +- if (bkey_cmp(iter->pos, POS(0, max)) > 0) +- break; ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(k.k->p, POS(0, max)) < 0) { ++ while (pos < iter->pos.offset) { ++ if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) ++ goto found_slot; ++ ++ pos++; ++ } ++ ++ if (k.k->p.snapshot == snapshot && ++ k.k->type != KEY_TYPE_inode && ++ !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } + + /* +- * There's a potential cache coherency issue with the btree key +- * cache code here - we're iterating over the btree, skipping +- * that cache. 
We should never see an empty slot that isn't +- * actually empty due to a pending update in the key cache +- * because the update that creates the inode isn't done with a +- * cached iterator, but - better safe than sorry, check the +- * cache before using a slot: ++ * We don't need to iterate over keys in every snapshot once ++ * we've found just one: + */ +- if (k.k->type != KEY_TYPE_inode && +- !bch2_btree_key_cache_find(c, BTREE_ID_inodes, iter->pos)) ++ pos = iter->pos.offset + 1; ++ bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ } ++ ++ while (!ret && pos < max) { ++ if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) + goto found_slot; ++ ++ pos++; + } + +- bch2_trans_iter_put(trans, iter); ++ if (!ret && start == min) ++ ret = -ENOSPC; + +- if (ret) ++ if (ret) { ++ bch2_trans_iter_put(trans, iter); + return ERR_PTR(ret); +- +- if (start != min) { +- /* Retry from start */ +- start = min; +- goto again; + } + +- return ERR_PTR(-ENOSPC); ++ /* Retry from start */ ++ pos = start = min; ++ bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ goto again; + found_slot: ++ bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_put(trans, iter); ++ return ERR_PTR(ret); ++ } ++ ++ /* We may have raced while the iterator wasn't pointing at pos: */ ++ if (k.k->type == KEY_TYPE_inode || ++ bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) ++ goto again; ++ + *hint = k.k->p.offset; + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 6bad6dfb7989..23c322d9a85b 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -70,7 +70,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + struct bch_inode_unpacked *); + + struct btree_iter *bch2_inode_create(struct btree_trans *, +- struct bch_inode_unpacked *); ++ struct bch_inode_unpacked *, u32); + + int bch2_inode_rm(struct bch_fs *, u64, bool); + +-- +cgit v1.2.3 + + +From 0f78d1b74c4e555e598b4f5375b9c6c2fc8ccfb5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 16 Mar 2021 18:08:10 -0400 +Subject: bcachefs: Don't use bch2_inode_find_by_inum() in move.c + +Since move.c isn't aware of what subvolume we're in, we can't use the +standard inode lookup code - fortunately, we're just using it for +reading IO options. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 42 ++++++++++++++++++++++++++++++++++++------ + 1 file changed, 36 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 7448ea36abd9..5b108490d7c4 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -509,6 +509,32 @@ err: + return ret; + } + ++static int lookup_inode(struct btree_trans *trans, struct bpos pos, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos, ++ BTREE_ITER_ALL_SNAPSHOTS); ++ k = bch2_btree_iter_peek(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = k.k->type == KEY_TYPE_inode ? 
0 : -EIO; ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ + static int __bch2_move_data(struct bch_fs *c, + struct moving_context *ctxt, + struct bch_ratelimit *rate, +@@ -566,7 +592,7 @@ static int __bch2_move_data(struct bch_fs *c, + try_to_freeze(); + } + } while (delay); +-peek: ++ + k = bch2_btree_iter_peek(iter); + + stats->pos = iter->pos; +@@ -586,14 +612,18 @@ peek: + cur_inum != k.k->p.inode) { + struct bch_inode_unpacked inode; + +- /* don't hold btree locks while looking up inode: */ +- bch2_trans_unlock(&trans); +- + io_opts = bch2_opts_to_inode_opts(c->opts); +- if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) ++ ++ ret = lookup_inode(&trans, ++ SPOS(0, k.k->p.inode, k.k->p.snapshot), ++ &inode); ++ if (ret == -EINTR) ++ continue; ++ ++ if (!ret) + bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); ++ + cur_inum = k.k->p.inode; +- goto peek; + } + + switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { +-- +cgit v1.2.3 + + +From a18a829ad1b3c56d162c61d8faf7d0236b5b18aa Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 28 Mar 2021 20:57:59 -0400 +Subject: bcachefs: Have journal reclaim thread flush more aggressively + +This adds a new watermark for the journal reclaim when flushing btree +key cache entries - it should try and stay ahead of where foreground +threads doing transaction commits will enter direct journal reclaim. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.h | 9 +++++++++ + fs/bcachefs/journal_reclaim.c | 2 +- + 2 files changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index 02715cd258ab..4e1e5a9c7656 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -1,6 +1,15 @@ + #ifndef _BCACHEFS_BTREE_KEY_CACHE_H + #define _BCACHEFS_BTREE_KEY_CACHE_H + ++static inline size_t bch2_nr_btree_keys_want_flush(struct bch_fs *c) ++{ ++ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); ++ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); ++ size_t max_dirty = nr_keys / 4; ++ ++ return max_t(ssize_t, 0, nr_dirty - max_dirty); ++} ++ + static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) + { + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 4a5b50ed71b0..93b5e07e05bc 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -602,7 +602,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + if (fifo_free(&j->pin) <= 32) + min_nr = 1; + +- min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c)); ++ min_nr = max(min_nr, bch2_nr_btree_keys_want_flush(c)); + + trace_journal_reclaim_start(c, + min_nr, +-- +cgit v1.2.3 + + +From e42a85cbe6c9abb4796565e7ba25afa9d81e8f14 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 28 Mar 2021 21:20:22 -0400 +Subject: bcachefs: Free iterator in bch2_btree_delete_range_trans() + +This is specifically to speed up bch2_inode_rm(), so that we're not +traversing iterators we're done with. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e73577ef853c..3744fb487e6c 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1187,7 +1187,7 @@ retry: + goto retry; + } + +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_free(trans, iter); + return ret; + } + +-- +cgit v1.2.3 + + +From fc3a2cf0fc2e4f61bfa4119fc3f83628afcc3594 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 29 Mar 2021 00:19:05 -0400 +Subject: bcachefs: Add repair code for out of order keys in a btree node. + +This just drops the offending key - in the bug report where this was +seen, it was clearly a single bit memory error, and fsck will fix the +missing key. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 36 ++++++++++++++---------------------- + 1 file changed, 14 insertions(+), 22 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 04328456dcec..509c77b900b4 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -578,6 +578,10 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + mutex_unlock(&c->sb_lock); + } + ++ btree_err_on(BSET_SEPARATE_WHITEOUTS(i), ++ BTREE_ERR_FATAL, c, ca, b, i, ++ "BSET_SEPARATE_WHITEOUTS no longer supported"); ++ + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, + BTREE_ERR_FIXABLE, c, ca, b, i, + "bset past end of btree node")) { +@@ -660,14 +664,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + { + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; +- bool seen_non_whiteout = false; + int ret = 0; + +- if (!BSET_SEPARATE_WHITEOUTS(i)) { +- seen_non_whiteout = true; +- *whiteout_u64s = 0; +- } +- + for (k = i->start; + k != vstruct_last(i);) { + struct bkey_s u; +@@ -719,18 +717,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + BSET_BIG_ENDIAN(i), write, + &b->format, k); + +- /* +- * with the separate whiteouts thing (used for extents), the +- * second set of keys actually can have whiteouts too, so we +- * can't solely go off bkey_deleted()... +- */ +- +- if (!seen_non_whiteout && +- (!bkey_deleted(k) || +- (prev && bkey_iter_cmp(b, prev, k) > 0))) { +- *whiteout_u64s = k->_data - i->_data; +- seen_non_whiteout = true; +- } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { ++ if (prev && bkey_iter_cmp(b, prev, k) > 0) { + char buf1[80]; + char buf2[80]; + struct bkey up = bkey_unpack_key(b, prev); +@@ -739,10 +726,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + bch2_bkey_to_text(&PBUF(buf2), u.k); + + bch2_dump_bset(c, b, i, 0); +- btree_err(BTREE_ERR_FATAL, c, NULL, b, i, +- "keys out of order: %s > %s", +- buf1, buf2); +- /* XXX: repair this */ ++ ++ if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "keys out of order: %s > %s", ++ buf1, buf2)) { ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } + } + + prev = k; +-- +cgit v1.2.3 + + +From 4ec16de53162db0b10eb336b807a29e1c91524b2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 28 Mar 2021 20:56:25 -0400 +Subject: bcachefs: Don't use write side of mark_lock in journal write path + +The write side of percpu rwsemaphors is really expensive, and we +shouldn't be taking it at all in steady state operation. 
+ +Fortunately, in bch2_journal_super_entries_add_common(), we don't need +to - we have a seqlock, usage_lock for accumulating percpu usage +counters to the base counters. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 +- + fs/bcachefs/btree_update_leaf.c | 6 +- + fs/bcachefs/buckets.c | 168 ++++++++++++++++++---------------------- + fs/bcachefs/buckets.h | 20 ++--- + fs/bcachefs/buckets_types.h | 12 ++- + fs/bcachefs/chardev.c | 6 +- + fs/bcachefs/replicas.c | 6 +- + fs/bcachefs/super-io.c | 4 +- + fs/bcachefs/super.c | 2 + + fs/bcachefs/sysfs.c | 4 +- + 10 files changed, 104 insertions(+), 127 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 17e3d55a1f06..abb30fe03aa7 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -684,10 +684,11 @@ struct bch_fs { + struct bch_fs_usage *usage_base; + struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_fs_usage __percpu *usage_gc; ++ u64 __percpu *online_reserved; + + /* single element mempool: */ + struct mutex usage_scratch_lock; +- struct bch_fs_usage *usage_scratch; ++ struct bch_fs_usage_online *usage_scratch; + + struct io_clock io_clock[2]; + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 3744fb487e6c..a45aac1b1af3 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -375,7 +375,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) + { + struct bch_fs *c = trans->c; +- struct bch_fs_usage *fs_usage = NULL; ++ struct bch_fs_usage_online *fs_usage = NULL; + struct btree_insert_entry *i; + struct btree_trans_commit_hook *h; + unsigned u64s = 0; +@@ -464,7 +464,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + /* Must be called under mark_lock: */ + if (marking && trans->fs_usage_deltas && +- bch2_replicas_delta_list_apply(c, fs_usage, ++ bch2_replicas_delta_list_apply(c, &fs_usage->u, + trans->fs_usage_deltas)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto err; +@@ -473,7 +473,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) + bch2_mark_update(trans, i->iter, i->k, +- fs_usage, i->trigger_flags); ++ &fs_usage->u, i->trigger_flags); + + if (marking) + bch2_trans_fs_usage_apply(trans, fs_usage); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 5729123e515d..ce4cd9c741a1 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -167,7 +167,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c) + percpu_up_write(&c->mark_lock); + } + +-void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) ++void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) + { + if (fs_usage == c->usage_scratch) + mutex_unlock(&c->usage_scratch_lock); +@@ -175,11 +175,11 @@ void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) + kfree(fs_usage); + } + +-struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) ++struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *c) + { +- struct bch_fs_usage *ret; +- unsigned bytes = fs_usage_u64s(c) * sizeof(u64); +- ++ struct bch_fs_usage_online *ret; ++ unsigned bytes = sizeof(struct bch_fs_usage_online) + sizeof(u64) * ++ READ_ONCE(c->replicas.nr); + ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); + if (ret) + return ret; +@@ -252,30 +252,28 @@ u64 
bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) + return ret; + } + +-struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) ++struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) + { +- struct bch_fs_usage *ret; +- unsigned seq, i, v, u64s = fs_usage_u64s(c); +-retry: +- ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); +- if (unlikely(!ret)) +- return NULL; ++ struct bch_fs_usage_online *ret; ++ unsigned seq, i, u64s; + + percpu_down_read(&c->mark_lock); + +- v = fs_usage_u64s(c); +- if (unlikely(u64s != v)) { +- u64s = v; ++ ret = kmalloc(sizeof(struct bch_fs_usage_online) + ++ sizeof(u64) + c->replicas.nr, GFP_NOFS); ++ if (unlikely(!ret)) { + percpu_up_read(&c->mark_lock); +- kfree(ret); +- goto retry; ++ return NULL; + } + ++ ret->online_reserved = percpu_u64_get(c->online_reserved); ++ ++ u64s = fs_usage_u64s(c); + do { + seq = read_seqcount_begin(&c->usage_lock); +- memcpy(ret, c->usage_base, u64s * sizeof(u64)); ++ memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) +- acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[i], u64s); ++ acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; +@@ -311,31 +309,31 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) + + void bch2_fs_usage_to_text(struct printbuf *out, + struct bch_fs *c, +- struct bch_fs_usage *fs_usage) ++ struct bch_fs_usage_online *fs_usage) + { + unsigned i; + + pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); + + pr_buf(out, "hidden:\t\t\t\t%llu\n", +- fs_usage->hidden); ++ fs_usage->u.hidden); + pr_buf(out, "data:\t\t\t\t%llu\n", +- fs_usage->data); ++ fs_usage->u.data); + pr_buf(out, "cached:\t\t\t\t%llu\n", +- fs_usage->cached); ++ fs_usage->u.cached); + pr_buf(out, "reserved:\t\t\t%llu\n", +- fs_usage->reserved); ++ fs_usage->u.reserved); + pr_buf(out, "nr_inodes:\t\t\t%llu\n", +- fs_usage->nr_inodes); ++ fs_usage->u.nr_inodes); + pr_buf(out, "online reserved:\t\t%llu\n", + fs_usage->online_reserved); + + for (i = 0; +- i < ARRAY_SIZE(fs_usage->persistent_reserved); ++ i < ARRAY_SIZE(fs_usage->u.persistent_reserved); + i++) { + pr_buf(out, "%u replicas:\n", i + 1); + pr_buf(out, "\treserved:\t\t%llu\n", +- fs_usage->persistent_reserved[i]); ++ fs_usage->u.persistent_reserved[i]); + } + + for (i = 0; i < c->replicas.nr; i++) { +@@ -344,7 +342,7 @@ void bch2_fs_usage_to_text(struct printbuf *out, + + pr_buf(out, "\t"); + bch2_replicas_entry_to_text(out, e); +- pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); ++ pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]); + } + } + +@@ -360,12 +358,12 @@ static u64 avail_factor(u64 r) + return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); + } + +-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) ++u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) + { +- return min(fs_usage->hidden + +- fs_usage->btree + +- fs_usage->data + +- reserve_factor(fs_usage->reserved + ++ return min(fs_usage->u.hidden + ++ fs_usage->u.btree + ++ fs_usage->u.data + ++ reserve_factor(fs_usage->u.reserved + + fs_usage->online_reserved), + c->capacity); + } +@@ -382,7 +380,7 @@ __bch2_fs_usage_read_short(struct bch_fs *c) + data = bch2_fs_usage_read_one(c, &c->usage_base->data) + + bch2_fs_usage_read_one(c, &c->usage_base->btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + +- bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); ++ 
percpu_u64_get(c->online_reserved); + + ret.used = min(ret.capacity, data + reserve_factor(reserved)); + ret.free = ret.capacity - ret.used; +@@ -436,43 +434,6 @@ static bool bucket_became_unavailable(struct bucket_mark old, + !is_available_bucket(new); + } + +-int bch2_fs_usage_apply(struct bch_fs *c, +- struct bch_fs_usage *fs_usage, +- struct disk_reservation *disk_res, +- unsigned journal_seq) +-{ +- s64 added = fs_usage->data + fs_usage->reserved; +- s64 should_not_have_added; +- int ret = 0; +- +- percpu_rwsem_assert_held(&c->mark_lock); +- +- /* +- * Not allowed to reduce sectors_available except by getting a +- * reservation: +- */ +- should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); +- if (WARN_ONCE(should_not_have_added > 0, +- "disk usage increased by %lli more than reservation of %llu", +- added, disk_res ? disk_res->sectors : 0)) { +- atomic64_sub(should_not_have_added, &c->sectors_available); +- added -= should_not_have_added; +- ret = -1; +- } +- +- if (added > 0) { +- disk_res->sectors -= added; +- fs_usage->online_reserved -= added; +- } +- +- preempt_disable(); +- acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), +- (u64 *) fs_usage, fs_usage_u64s(c)); +- preempt_enable(); +- +- return ret; +-} +- + static inline void account_bucket(struct bch_fs_usage *fs_usage, + struct bch_dev_usage *dev_usage, + enum bch_data_type type, +@@ -504,8 +465,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + account_bucket(fs_usage, u, bucket_type(new), + 1, ca->mi.bucket_size); + +- u->buckets_alloc += +- (int) new.owned_by_allocator - (int) old.owned_by_allocator; + u->buckets_ec += (int) new.stripe - (int) old.stripe; + u->buckets_unavailable += + is_unavailable_bucket(new) - is_unavailable_bucket(old); +@@ -669,7 +628,6 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + bool gc) + { +- struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); + struct bucket *g = __bucket(ca, b, gc); + struct bucket_mark old, new; + +@@ -677,13 +635,6 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + new.owned_by_allocator = owned_by_allocator; + })); + +- /* +- * XXX: this is wrong, this means we'll be doing updates to the percpu +- * buckets_alloc counter that don't have an open journal buffer and +- * we'll race with the machinery that accumulates that to ca->usage_base +- */ +- bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc); +- + BUG_ON(!gc && + !owned_by_allocator && !old.owned_by_allocator); + +@@ -1432,8 +1383,47 @@ int bch2_mark_update(struct btree_trans *trans, + return ret; + } + ++static int bch2_fs_usage_apply(struct bch_fs *c, ++ struct bch_fs_usage_online *src, ++ struct disk_reservation *disk_res, ++ unsigned journal_seq) ++{ ++ struct bch_fs_usage *dst = fs_usage_ptr(c, journal_seq, false); ++ s64 added = src->u.data + src->u.reserved; ++ s64 should_not_have_added; ++ int ret = 0; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ /* ++ * Not allowed to reduce sectors_available except by getting a ++ * reservation: ++ */ ++ should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); ++ if (WARN_ONCE(should_not_have_added > 0, ++ "disk usage increased by %lli more than reservation of %llu", ++ added, disk_res ? 
disk_res->sectors : 0)) { ++ atomic64_sub(should_not_have_added, &c->sectors_available); ++ added -= should_not_have_added; ++ ret = -1; ++ } ++ ++ if (added > 0) { ++ disk_res->sectors -= added; ++ src->online_reserved -= added; ++ } ++ ++ this_cpu_add(*c->online_reserved, src->online_reserved); ++ ++ preempt_disable(); ++ acc_u64s((u64 *) dst, (u64 *) &src->u, fs_usage_u64s(c)); ++ preempt_enable(); ++ ++ return ret; ++} ++ + void bch2_trans_fs_usage_apply(struct btree_trans *trans, +- struct bch_fs_usage *fs_usage) ++ struct bch_fs_usage_online *fs_usage) + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; +@@ -2213,16 +2203,6 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, + + /* Disk reservations: */ + +-void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) +-{ +- percpu_down_read(&c->mark_lock); +- this_cpu_sub(c->usage[0]->online_reserved, +- res->sectors); +- percpu_up_read(&c->mark_lock); +- +- res->sectors = 0; +-} +- + #define SECTORS_CACHE 1024 + + int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, +@@ -2256,7 +2236,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + + out: + pcpu->sectors_available -= sectors; +- this_cpu_add(c->usage[0]->online_reserved, sectors); ++ this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + + preempt_enable(); +@@ -2273,7 +2253,7 @@ recalculate: + (flags & BCH_DISK_RESERVATION_NOFAIL)) { + atomic64_set(&c->sectors_available, + max_t(s64, 0, sectors_available - sectors)); +- this_cpu_add(c->usage[0]->online_reserved, sectors); ++ this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + ret = 0; + } else { +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 9a91a4969783..af8cb74d71e0 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -216,19 +216,19 @@ static inline unsigned dev_usage_u64s(void) + return sizeof(struct bch_dev_usage) / sizeof(u64); + } + +-void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); +-struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); ++void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage_online *); ++struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *); + + u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); + +-struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); ++struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); + + void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); + + void bch2_fs_usage_to_text(struct printbuf *, +- struct bch_fs *, struct bch_fs_usage *); ++ struct bch_fs *, struct bch_fs_usage_online *); + +-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); ++u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); + + struct bch_fs_usage_short + bch2_fs_usage_read_short(struct bch_fs *); +@@ -246,8 +246,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + + int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, + s64, struct bch_fs_usage *, u64, unsigned); +-int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, +- struct disk_reservation *, unsigned); + + int bch2_mark_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bch_fs_usage *, unsigned); +@@ -259,7 +257,7 @@ int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, + unsigned, s64, unsigned); + int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, + struct bkey_i 
*insert, unsigned); +-void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); ++void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage_online *); + + int bch2_trans_mark_metadata_bucket(struct btree_trans *, + struct disk_reservation *, struct bch_dev *, +@@ -269,13 +267,11 @@ int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *, + + /* disk reservations: */ + +-void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); +- + static inline void bch2_disk_reservation_put(struct bch_fs *c, + struct disk_reservation *res) + { +- if (res->sectors) +- __bch2_disk_reservation_put(c, res); ++ this_cpu_sub(*c->online_reserved, res->sectors); ++ res->sectors = 0; + } + + #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 404c89a7a264..b6ea67506cc2 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -53,7 +53,6 @@ struct bucket_array { + }; + + struct bch_dev_usage { +- u64 buckets_alloc; + u64 buckets_ec; + u64 buckets_unavailable; + +@@ -66,12 +65,6 @@ struct bch_dev_usage { + + struct bch_fs_usage { + /* all fields are in units of 512 byte sectors: */ +- +- u64 online_reserved; +- +- /* fields after online_reserved are cleared/recalculated by gc: */ +- u64 gc_start[0]; +- + u64 hidden; + u64 btree; + u64 data; +@@ -91,6 +84,11 @@ struct bch_fs_usage { + u64 replicas[]; + }; + ++struct bch_fs_usage_online { ++ u64 online_reserved; ++ struct bch_fs_usage u; ++}; ++ + struct bch_fs_usage_short { + u64 capacity; + u64 used; +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index 49842ec88390..c61601476c0d 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -379,7 +379,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, + { + struct bch_ioctl_fs_usage *arg = NULL; + struct bch_replicas_usage *dst_e, *dst_end; +- struct bch_fs_usage *src; ++ struct bch_fs_usage_online *src; + u32 replica_entries_bytes; + unsigned i; + int ret = 0; +@@ -405,7 +405,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, + arg->online_reserved = src->online_reserved; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) +- arg->persistent_reserved[i] = src->persistent_reserved[i]; ++ arg->persistent_reserved[i] = src->u.persistent_reserved[i]; + + dst_e = arg->replicas; + dst_end = (void *) arg->replicas + replica_entries_bytes; +@@ -419,7 +419,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, + break; + } + +- dst_e->sectors = src->replicas[i]; ++ dst_e->sectors = src->u.replicas[i]; + dst_e->r = *src_e; + + /* recheck after setting nr_devs: */ +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index e45a6d6b103c..068fbca1dd54 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -271,11 +271,13 @@ static int replicas_table_update(struct bch_fs *c, + struct bch_replicas_cpu *new_r) + { + struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; +- struct bch_fs_usage *new_scratch = NULL; ++ struct bch_fs_usage_online *new_scratch = NULL; + struct bch_fs_usage __percpu *new_gc = NULL; + struct bch_fs_usage *new_base = NULL; + unsigned i, bytes = sizeof(struct bch_fs_usage) + + sizeof(u64) * new_r->nr; ++ unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + ++ sizeof(u64) * new_r->nr; + int ret = 0; + + memset(new_usage, 0, sizeof(new_usage)); +@@ -286,7 +288,7 @@ static int replicas_table_update(struct bch_fs *c, + goto err; + + if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || +- 
!(new_scratch = kmalloc(bytes, GFP_KERNEL)) || ++ !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || + (c->usage_gc && + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) + goto err; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 6bb12d5e09e3..f8f57caa417a 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -999,7 +999,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct bch_dev *ca; + unsigned i, dev; + +- percpu_down_write(&c->mark_lock); ++ percpu_down_read(&c->mark_lock); + + if (!journal_seq) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) +@@ -1070,7 +1070,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, + } + } + +- percpu_up_write(&c->mark_lock); ++ percpu_up_read(&c->mark_lock); + + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index ce8b37513af7..70a4d0dcc395 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -484,6 +484,7 @@ static void __bch2_fs_free(struct bch_fs *c) + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); + ++ free_percpu(c->online_reserved); + free_percpu(c->btree_iters_bufs); + free_percpu(c->pcpu); + mempool_exit(&c->large_bkey_pool); +@@ -759,6 +760,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + BIOSET_NEED_BVECS) || + !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || ++ !(c->online_reserved = alloc_percpu(u64)) || + mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, + btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index a6a0a3f6f205..4b83a98621d7 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -230,7 +230,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) + + static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) + { +- struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); ++ struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); + + if (!fs_usage) + return -ENOMEM; +@@ -794,7 +794,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + pr_buf(out, + "ec\t%16llu\n" + "available%15llu\n" +- "alloc\t%16llu\n" + "\n" + "free_inc\t\t%zu/%zu\n" + "free[RESERVE_MOVINGGC]\t%zu/%zu\n" +@@ -810,7 +809,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + "thread state:\t\t%s\n", + stats.buckets_ec, + __dev_buckets_available(ca, stats), +- stats.buckets_alloc, + fifo_used(&ca->free_inc), ca->free_inc.size, + fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, + fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, +-- +cgit v1.2.3 + + +From 600c15030e59a934821bd456624afe612c4a9b97 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 31 Mar 2021 17:52:52 -0400 +Subject: bcachefs: Don't make foreground writes wait behind journal reclaim + too long + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 33 +++++++++++++++++++++++++++------ + fs/bcachefs/journal.c | 1 + + fs/bcachefs/journal_reclaim.c | 6 ++++++ + fs/bcachefs/journal_types.h | 1 + + 4 files changed, 35 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index a45aac1b1af3..0348ba782cb5 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ 
-586,6 +586,28 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + return 0; + } + ++static int journal_reclaim_wait_done(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ return ret; ++ ++ ret = !bch2_btree_key_cache_must_wait(c); ++ if (ret) ++ return ret; ++ ++ if (mutex_trylock(&c->journal.reclaim_lock)) { ++ ret = bch2_journal_reclaim(&c->journal); ++ mutex_unlock(&c->journal.reclaim_lock); ++ } ++ ++ if (!ret) ++ ret = !bch2_btree_key_cache_must_wait(c); ++ return ret; ++} ++ + static noinline + int bch2_trans_commit_error(struct btree_trans *trans, + struct btree_insert_entry *i, +@@ -668,13 +690,12 @@ int bch2_trans_commit_error(struct btree_trans *trans, + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); + +- do { +- mutex_lock(&c->journal.reclaim_lock); +- ret = bch2_journal_reclaim(&c->journal); +- mutex_unlock(&c->journal.reclaim_lock); +- } while (!ret && bch2_btree_key_cache_must_wait(c)); ++ wait_event(c->journal.reclaim_wait, ++ (ret = journal_reclaim_wait_done(c))); ++ if (ret < 0) ++ return ret; + +- if (!ret && bch2_trans_relock(trans)) ++ if (bch2_trans_relock(trans)) + return 0; + + trace_trans_restart_journal_reclaim(trans->ip); +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 690b0358e437..063505abc641 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1114,6 +1114,7 @@ int bch2_fs_journal_init(struct journal *j) + spin_lock_init(&j->err_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); ++ init_waitqueue_head(&j->reclaim_wait); + init_waitqueue_head(&j->pin_flush_wait); + mutex_init(&j->reclaim_lock); + mutex_init(&j->discard_lock); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 93b5e07e05bc..32ac6da4672b 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -604,6 +604,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + + min_nr = max(min_nr, bch2_nr_btree_keys_want_flush(c)); + ++ /* Don't do too many without delivering wakeup: */ ++ min_nr = min(min_nr, 128UL); ++ + trace_journal_reclaim_start(c, + min_nr, + j->prereserved.reserved, +@@ -620,6 +623,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + else + j->nr_background_reclaim += nr_flushed; + trace_journal_reclaim_finish(c, nr_flushed); ++ ++ if (nr_flushed) ++ wake_up(&j->reclaim_wait); + } while (min_nr && nr_flushed); + + memalloc_noreclaim_restore(flags); +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index d17a1ff82a18..7fcf5150db2c 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -243,6 +243,7 @@ struct journal { + spinlock_t err_lock; + + struct mutex reclaim_lock; ++ wait_queue_head_t reclaim_wait; + struct task_struct *reclaim_thread; + bool reclaim_kicked; + u64 nr_direct_reclaim; +-- +cgit v1.2.3 + + +From fa7892e58f03552f02843574c056a135ee8f3dd7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 31 Mar 2021 14:42:36 -0400 +Subject: bcachefs: Move btree lock debugging to slowpath fn + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 22 +++++++++++++++++----- + fs/bcachefs/btree_locking.h | 15 +-------------- + 2 files changed, 18 insertions(+), 19 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 4a3f3d5b860f..d229312b261c 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -246,6 
+246,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + struct btree_iter *linked, *deadlock_iter = NULL; + u64 start_time = local_clock(); + unsigned reason = 9; ++ bool ret; + + /* Check if it's safe to block: */ + trans_for_each_iter(trans, linked) { +@@ -354,12 +355,23 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + if (six_trylock_type(&b->c.lock, type)) + return true; + +- if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) +- return false; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->locking_iter_idx = iter->idx; ++ trans->locking_pos = pos; ++ trans->locking_btree_id = iter->btree_id; ++ trans->locking_level = level; ++ trans->locking = b; ++#endif + +- bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], +- start_time); +- return true; ++ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->locking = NULL; ++#endif ++ if (ret) ++ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], ++ start_time); ++ return ret; + } + + /* Btree iterator locking: */ +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index 38323e32731f..b166a94753f7 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -187,27 +187,14 @@ static inline bool btree_node_lock(struct btree *b, + unsigned long ip) + { + struct btree_trans *trans = iter->trans; +- bool ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + +-#ifdef CONFIG_BCACHEFS_DEBUG +- trans->locking = b; +- trans->locking_iter_idx = iter->idx; +- trans->locking_pos = pos; +- trans->locking_btree_id = iter->btree_id; +- trans->locking_level = level; +-#endif +- ret = likely(six_trylock_type(&b->c.lock, type)) || ++ return likely(six_trylock_type(&b->c.lock, type)) || + btree_node_lock_increment(trans, b, level, type) || + __bch2_btree_node_lock(b, pos, level, iter, type, + should_sleep_fn, p, ip); +- +-#ifdef CONFIG_BCACHEFS_DEBUG +- trans->locking = NULL; +-#endif +- return ret; + } + + bool __bch2_btree_node_relock(struct btree_iter *, unsigned); +-- +cgit v1.2.3 + + +From 0d097e0854b2980c568ff9798d93806126900020 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 30 Mar 2021 20:35:46 -0400 +Subject: bcachefs: Improve bch2_trans_relock() + +We're getting away from relying on iter->uptodate - this changes +bch2_trans_relock() to more directly specify which iterators should be +relocked. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index d229312b261c..478839d9930b 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -504,13 +504,12 @@ void bch2_trans_downgrade(struct btree_trans *trans) + bool bch2_trans_relock(struct btree_trans *trans) + { + struct btree_iter *iter; +- bool ret = true; + + trans_for_each_iter(trans, iter) +- if (iter->uptodate == BTREE_ITER_NEED_RELOCK) +- ret &= bch2_btree_iter_relock(iter, true); +- +- return ret; ++ if (btree_iter_keep(trans, iter) && ++ !bch2_btree_iter_relock(iter, true)) ++ return false; ++ return true; + } + + void bch2_trans_unlock(struct btree_trans *trans) +-- +cgit v1.2.3 + + +From 0835b4af9d387208158e0139c677d38d1938472f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 31 Mar 2021 21:07:37 -0400 +Subject: bcachefs: Add a sysfs var for average btree write size + +Useful number for performance tuning. 
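/*
 * Illustrative userspace sketch (made-up names, not the kernel code): the
 * new sysfs number is simply total sectors written divided by the number
 * of btree node writes, with a guard for the "no writes yet" case.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t avg_write_sectors(uint64_t nr_writes, uint64_t nr_sectors)
{
    return nr_writes ? nr_sectors / nr_writes : 0;  /* avoid divide-by-zero */
}

int main(void)
{
    /* e.g. 1000 btree node writes totalling 512000 sectors */
    printf("%llu sectors per write\n",
           (unsigned long long)avg_write_sectors(1000, 512000));
    return 0;
}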
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 +++ + fs/bcachefs/btree_io.c | 3 +++ + fs/bcachefs/sysfs.c | 12 ++++++++++++ + 3 files changed, 18 insertions(+) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index abb30fe03aa7..13fafa42153d 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -798,6 +798,9 @@ struct bch_fs { + struct bio_set dio_write_bioset; + struct bio_set dio_read_bioset; + ++ ++ atomic64_t btree_writes_nr; ++ atomic64_t btree_writes_sectors; + struct bio_list btree_write_error_list; + struct work_struct btree_write_error_work; + spinlock_t btree_write_error_lock; +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 509c77b900b4..7e6858e3af2b 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1550,6 +1550,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + + b->written += sectors_to_write; + ++ atomic64_inc(&c->btree_writes_nr); ++ atomic64_add(sectors_to_write, &c->btree_writes_sectors); ++ + /* XXX: submitting IO with btree locks held: */ + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); + bch2_bkey_buf_exit(&k, c); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 4b83a98621d7..dd9b54e0d80b 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -153,6 +153,8 @@ read_attribute(io_latency_stats_read); + read_attribute(io_latency_stats_write); + read_attribute(congested); + ++read_attribute(btree_avg_write_size); ++ + read_attribute(bucket_quantiles_last_read); + read_attribute(bucket_quantiles_last_write); + read_attribute(bucket_quantiles_fragmentation); +@@ -228,6 +230,14 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) + return ret; + } + ++static size_t bch2_btree_avg_write_size(struct bch_fs *c) ++{ ++ u64 nr = atomic64_read(&c->btree_writes_nr); ++ u64 sectors = atomic64_read(&c->btree_writes_sectors); ++ ++ return nr ? div64_u64(sectors, nr) : 0; ++} ++ + static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) + { + struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); +@@ -316,6 +326,7 @@ SHOW(bch2_fs) + sysfs_print(block_size, block_bytes(c)); + sysfs_print(btree_node_size, btree_bytes(c)); + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); ++ sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); + + sysfs_print(read_realloc_races, + atomic_long_read(&c->read_realloc_races)); +@@ -507,6 +518,7 @@ struct attribute *bch2_fs_files[] = { + &sysfs_block_size, + &sysfs_btree_node_size, + &sysfs_btree_cache_size, ++ &sysfs_btree_avg_write_size, + + &sysfs_journal_write_delay_ms, + &sysfs_journal_reclaim_delay_ms, +-- +cgit v1.2.3 + + +From 6999900627ccb168006c20c1b34120ee654c9ecb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 31 Mar 2021 15:21:37 -0400 +Subject: bcachefs: Improve bch2_btree_update_start() + +bch2_btree_update_start() is now responsible for taking gc_lock and +upgrading the iterator to lock parent nodes - greatly simplifying error +handling and all of the callers. 
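/*
 * Illustrative sketch of the locking pattern this commit centralizes,
 * using a pthread analogue and made-up helper names
 * (drop_transaction_locks(), retake_transaction_locks()) rather than the
 * kernel API: try the lock without blocking; on failure drop our other
 * locks first, block, then revalidate before continuing, returning -EINTR
 * to ask the caller for a restart.
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

static void drop_transaction_locks(void)   { /* stand-in for bch2_trans_unlock() */ }
static bool retake_transaction_locks(void) { return true; /* stand-in for relock */ }

static int start_update(pthread_rwlock_t *gc_lock)
{
    if (pthread_rwlock_tryrdlock(gc_lock) == 0)
        return 0;                       /* fast path: no blocking needed */

    drop_transaction_locks();           /* avoid deadlock while we block */
    pthread_rwlock_rdlock(gc_lock);

    if (!retake_transaction_locks()) {  /* world may have changed under us */
        pthread_rwlock_unlock(gc_lock);
        return -EINTR;                  /* caller restarts the operation */
    }
    return 0;
}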
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 5 +- + fs/bcachefs/btree_update_interior.c | 329 ++++++++++++------------------------ + fs/bcachefs/btree_update_interior.h | 4 +- + 3 files changed, 114 insertions(+), 224 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 7b26d743112e..53a677894a79 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1336,11 +1336,10 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + return; + } + +- as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ as = bch2_btree_update_start(iter, old_nodes[0]->c.level, + btree_update_reserve_required(c, parent) + nr_old_nodes, + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE, +- NULL); ++ BTREE_INSERT_USE_RESERVE); + if (IS_ERR(as)) { + trace_btree_gc_coalesce_fail(c, + BTREE_GC_COALESCE_FAIL_RESERVE_GET); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 19dfc32e8c68..aeb6b3b7bc89 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -458,6 +458,10 @@ static void bch2_btree_update_free(struct btree_update *as) + { + struct bch_fs *c = as->c; + ++ if (as->took_gc_lock) ++ up_read(&c->gc_lock); ++ as->took_gc_lock = false; ++ + bch2_journal_preres_put(&c->journal, &as->journal_preres); + + bch2_journal_pin_drop(&c->journal, &as->journal); +@@ -893,24 +897,31 @@ void bch2_btree_update_done(struct btree_update *as) + { + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + ++ if (as->took_gc_lock) ++ up_read(&as->c->gc_lock); ++ as->took_gc_lock = false; ++ + bch2_btree_reserve_put(as); + + continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); + } + + struct btree_update * +-bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, +- unsigned nr_nodes, unsigned flags, +- struct closure *cl) ++bch2_btree_update_start(struct btree_iter *iter, unsigned level, ++ unsigned nr_nodes, unsigned flags) + { ++ struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct btree_update *as; ++ struct closure cl; + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0; + int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) + ? 
JOURNAL_RES_GET_RECLAIM : 0; + int ret = 0; + ++ closure_init_stack(&cl); ++retry: + /* + * This check isn't necessary for correctness - it's just to potentially + * prevent us from doing a lot of work that'll end up being wasted: +@@ -919,12 +930,36 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + if (ret) + return ERR_PTR(ret); + ++ /* ++ * XXX: figure out how far we might need to split, ++ * instead of locking/reserving all the way to the root: ++ */ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ++ trace_trans_restart_iter_upgrade(trans->ip); ++ return ERR_PTR(-EINTR); ++ } ++ ++ if (flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&c->gc_lock); ++ else if (!down_read_trylock(&c->gc_lock)) { ++ if (flags & BTREE_INSERT_NOUNLOCK) ++ return ERR_PTR(-EINTR); ++ ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ if (!bch2_trans_relock(trans)) { ++ up_read(&c->gc_lock); ++ return ERR_PTR(-EINTR); ++ } ++ } ++ + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); + memset(as, 0, sizeof(*as)); + closure_init(&as->cl, NULL); + as->c = c; + as->mode = BTREE_INTERIOR_NO_UPDATE; +- as->btree_id = id; ++ as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); ++ as->btree_id = iter->btree_id; + INIT_LIST_HEAD(&as->list); + INIT_LIST_HEAD(&as->unwritten_list); + INIT_LIST_HEAD(&as->write_blocked_list); +@@ -936,8 +971,14 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + BTREE_UPDATE_JOURNAL_RES, + journal_flags|JOURNAL_RES_GET_NONBLOCK); + if (ret == -EAGAIN) { +- if (flags & BTREE_INSERT_NOUNLOCK) +- return ERR_PTR(-EINTR); ++ /* ++ * this would be cleaner if bch2_journal_preres_get() took a ++ * closure argument ++ */ ++ if (flags & BTREE_INSERT_NOUNLOCK) { ++ ret = -EINTR; ++ goto err; ++ } + + bch2_trans_unlock(trans); + +@@ -945,7 +986,7 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + BTREE_UPDATE_JOURNAL_RES, + journal_flags); + if (ret) +- return ERR_PTR(ret); ++ goto err; + + if (!bch2_trans_relock(trans)) { + ret = -EINTR; +@@ -960,7 +1001,8 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + if (ret) + goto err; + +- ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); ++ ret = bch2_btree_reserve_get(as, nr_nodes, flags, ++ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); + if (ret) + goto err; + +@@ -975,6 +1017,18 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, + return as; + err: + bch2_btree_update_free(as); ++ ++ if (ret == -EAGAIN) { ++ BUG_ON(flags & BTREE_INSERT_NOUNLOCK); ++ ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ ret = -EINTR; ++ } ++ ++ if (ret == -EINTR && bch2_trans_relock(trans)) ++ goto retry; ++ + return ERR_PTR(ret); + } + +@@ -1419,6 +1473,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + ++ lockdep_assert_held(&c->gc_lock); + BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); + BUG_ON(!b->c.level); + BUG_ON(!as || as->b); +@@ -1466,67 +1521,17 @@ split: + int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + unsigned flags) + { +- struct btree_trans *trans = iter->trans; + struct btree *b = iter_l(iter)->b; + struct btree_update *as; +- struct closure cl; +- int ret = 0; +- +- closure_init_stack(&cl); +- +- /* Hack, because gc and splitting nodes doesn't mix yet: */ +- if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && +- !down_read_trylock(&c->gc_lock)) { +- if (flags & BTREE_INSERT_NOUNLOCK) { +- trace_transaction_restart_ip(trans->ip, _THIS_IP_); +- return -EINTR; +- } +- +- bch2_trans_unlock(trans); +- down_read(&c->gc_lock); +- +- if (!bch2_trans_relock(trans)) +- ret = -EINTR; +- } +- +- /* +- * XXX: figure out how far we might need to split, +- * instead of locking/reserving all the way to the root: +- */ +- if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { +- trace_trans_restart_iter_upgrade(trans->ip); +- ret = -EINTR; +- goto out; +- } +- +- as = bch2_btree_update_start(trans, iter->btree_id, +- btree_update_reserve_required(c, b), flags, +- !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); +- if (IS_ERR(as)) { +- ret = PTR_ERR(as); +- if (ret == -EAGAIN) { +- BUG_ON(flags & BTREE_INSERT_NOUNLOCK); +- bch2_trans_unlock(trans); +- ret = -EINTR; + +- trace_transaction_restart_ip(trans->ip, _THIS_IP_); +- } +- goto out; +- } ++ as = bch2_btree_update_start(iter, iter->level, ++ btree_update_reserve_required(c, b), flags); ++ if (IS_ERR(as)) ++ return PTR_ERR(as); + + btree_split(as, b, iter, NULL, flags); + bch2_btree_update_done(as); +- +- /* +- * We haven't successfully inserted yet, so don't downgrade all the way +- * back to read locks; +- */ +- __bch2_btree_iter_downgrade(iter, 1); +-out: +- if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) +- up_read(&c->gc_lock); +- closure_sync(&cl); +- return ret; ++ return 0; + } + + void __bch2_foreground_maybe_merge(struct bch_fs *c, +@@ -1541,13 +1546,10 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, + struct bkey_format new_f; + struct bkey_i delete; + struct btree *b, *m, *n, *prev, *next, *parent; +- struct closure cl; + size_t sib_u64s; + int ret = 0; + + BUG_ON(!btree_node_locked(iter, level)); +- +- closure_init_stack(&cl); + retry: + BUG_ON(!btree_node_locked(iter, level)); + +@@ -1605,25 +1607,15 @@ retry: + goto out; + } + +- /* We're changing btree topology, doesn't mix with gc: */ +- if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && +- !down_read_trylock(&c->gc_lock)) +- goto err_cycle_gc_lock; +- +- if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { +- ret = -EINTR; +- goto err_unlock; +- } +- +- as = bch2_btree_update_start(trans, iter->btree_id, ++ as = bch2_btree_update_start(iter, level, + btree_update_reserve_required(c, parent) + 1, + flags| + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE, +- !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); +- if (IS_ERR(as)) { +- ret = PTR_ERR(as); +- goto err_unlock; ++ BTREE_INSERT_USE_RESERVE); ++ ret = PTR_ERR_OR_ZERO(as); ++ if (ret) { ++ six_unlock_intent(&m->c.lock); ++ goto err; + } + + trace_btree_merge(c, b); +@@ -1671,9 +1663,6 @@ retry: + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as); +- +- if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) +- up_read(&c->gc_lock); + out: + bch2_btree_trans_verify_locks(trans); + +@@ -1686,58 +1675,52 @@ out: + * split path, and downgrading to read locks in there is potentially + * confusing: + */ +- closure_sync(&cl); + return; +- +-err_cycle_gc_lock: +- six_unlock_intent(&m->c.lock); +- +- if (flags & BTREE_INSERT_NOUNLOCK) +- goto out; +- +- bch2_trans_unlock(trans); +- +- down_read(&c->gc_lock); +- up_read(&c->gc_lock); +- ret = -EINTR; +- goto err; +- +-err_unlock: +- six_unlock_intent(&m->c.lock); +- if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) +- up_read(&c->gc_lock); + err: + BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); + +- if ((ret == -EAGAIN || ret == -EINTR) && +- !(flags & BTREE_INSERT_NOUNLOCK)) { ++ if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { + bch2_trans_unlock(trans); +- closure_sync(&cl); + ret = bch2_btree_iter_traverse(iter); +- if (ret) +- goto out; +- +- goto retry; ++ if (!ret) ++ goto retry; + } + + goto out; + } + +-static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, +- struct btree *b, unsigned flags, +- struct closure *cl) ++/** ++ * bch_btree_node_rewrite - Rewrite/move a btree node ++ */ ++int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ++ __le64 seq, unsigned flags) + { +- struct btree *n, *parent = btree_node_parent(iter, b); ++ struct btree *b, *n, *parent; + struct btree_update *as; ++ int ret; ++ ++ flags |= 
BTREE_INSERT_NOFAIL; ++retry: ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto out; ++ ++ b = bch2_btree_iter_peek_node(iter); ++ if (!b || b->data->keys.seq != seq) ++ goto out; + +- as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ parent = btree_node_parent(iter, b); ++ as = bch2_btree_update_start(iter, b->c.level, + (parent + ? btree_update_reserve_required(c, parent) + : 0) + 1, +- flags, cl); +- if (IS_ERR(as)) { ++ flags); ++ ret = PTR_ERR_OR_ZERO(as); ++ if (ret == -EINTR) ++ goto retry; ++ if (ret) { + trace_btree_gc_rewrite_node_fail(c, b); +- return PTR_ERR(as); ++ goto out; + } + + bch2_btree_interior_update_will_free_node(as, b); +@@ -1768,60 +1751,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as); +- return 0; +-} +- +-/** +- * bch_btree_node_rewrite - Rewrite/move a btree node +- * +- * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. +- * btree_check_reserve() has to wait) +- */ +-int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, +- __le64 seq, unsigned flags) +-{ +- struct btree_trans *trans = iter->trans; +- struct closure cl; +- struct btree *b; +- int ret; +- +- flags |= BTREE_INSERT_NOFAIL; +- +- closure_init_stack(&cl); +- +- bch2_btree_iter_upgrade(iter, U8_MAX); +- +- if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { +- if (!down_read_trylock(&c->gc_lock)) { +- bch2_trans_unlock(trans); +- down_read(&c->gc_lock); +- } +- } +- +- while (1) { +- ret = bch2_btree_iter_traverse(iter); +- if (ret) +- break; +- +- b = bch2_btree_iter_peek_node(iter); +- if (!b || b->data->keys.seq != seq) +- break; +- +- ret = __btree_node_rewrite(c, iter, b, flags, &cl); +- if (ret != -EAGAIN && +- ret != -EINTR) +- break; +- +- bch2_trans_unlock(trans); +- closure_sync(&cl); +- } +- ++out: + bch2_btree_iter_downgrade(iter); +- +- if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) +- up_read(&c->gc_lock); +- +- closure_sync(&cl); + return ret; + } + +@@ -1892,71 +1823,34 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, + struct btree_update *as = NULL; + struct btree *new_hash = NULL; + struct closure cl; +- int ret; ++ int ret = 0; + + closure_init_stack(&cl); + +- if (!bch2_btree_iter_upgrade(iter, U8_MAX)) +- return -EINTR; +- +- if (!down_read_trylock(&c->gc_lock)) { +- bch2_trans_unlock(iter->trans); +- down_read(&c->gc_lock); +- +- if (!bch2_trans_relock(iter->trans)) { +- ret = -EINTR; +- goto err; +- } +- } +- + /* + * check btree_ptr_hash_val() after @b is locked by + * btree_iter_traverse(): + */ + if (btree_ptr_hash_val(new_key) != b->hash_val) { +- /* bch2_btree_reserve_get will unlock */ + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + if (ret) { + bch2_trans_unlock(iter->trans); +- up_read(&c->gc_lock); + closure_sync(&cl); +- down_read(&c->gc_lock); +- +- if (!bch2_trans_relock(iter->trans)) { +- ret = -EINTR; +- goto err; +- } ++ if (!bch2_trans_relock(iter->trans)) ++ return -EINTR; + } + + new_hash = bch2_btree_node_mem_alloc(c); + } +-retry: +- as = bch2_btree_update_start(iter->trans, iter->btree_id, +- parent ? btree_update_reserve_required(c, parent) : 0, +- BTREE_INSERT_NOFAIL, &cl); + ++ as = bch2_btree_update_start(iter, b->c.level, ++ parent ? 
btree_update_reserve_required(c, parent) : 0, ++ BTREE_INSERT_NOFAIL); + if (IS_ERR(as)) { + ret = PTR_ERR(as); +- if (ret == -EAGAIN) +- ret = -EINTR; +- +- if (ret == -EINTR) { +- bch2_trans_unlock(iter->trans); +- up_read(&c->gc_lock); +- closure_sync(&cl); +- down_read(&c->gc_lock); +- +- if (bch2_trans_relock(iter->trans)) +- goto retry; +- } +- + goto err; + } + +- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); +- if (ret) +- goto err_free_update; +- + __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); + + bch2_btree_iter_downgrade(iter); +@@ -1969,12 +1863,9 @@ err: + six_unlock_write(&new_hash->c.lock); + six_unlock_intent(&new_hash->c.lock); + } +- up_read(&c->gc_lock); + closure_sync(&cl); ++ bch2_btree_cache_cannibalize_unlock(c); + return ret; +-err_free_update: +- bch2_btree_update_free(as); +- goto err; + } + + /* Init code: */ +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 45d212730fd7..2a6b51ece0f8 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -48,6 +48,7 @@ struct btree_update { + } mode; + + unsigned nodes_written:1; ++ unsigned took_gc_lock:1; + + enum btree_id btree_id; + +@@ -120,8 +121,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + + void bch2_btree_update_done(struct btree_update *); + struct btree_update * +-bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, +- unsigned, struct closure *); ++bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned); + + void bch2_btree_interior_update_will_free_node(struct btree_update *, + struct btree *); +-- +cgit v1.2.3 + + +From d83f20436fa3acdb9875ff9dd260c3f08f431841 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 31 Mar 2021 15:39:16 -0400 +Subject: bcachefs: Change where merging of interior btree nodes is trigger + from + +Previously, we were doing btree node merging from +bch2_btree_insert_node() - but this is called from the split path, when +we're in the middle of creating new nodes and deleting new nodes and the +iterators are in a weird state. + +Also, this means we're starting a new btree_update while in the middle +of an existing one, and that's asking for deadlocks. + +Much simpler and saner to trigger btree node merging _after_ the whole +btree node split path is finished. 
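/*
 * Sketch of the new ordering, with hypothetical helpers (not the kernel
 * API): only once the split path has fully completed do we walk the
 * ancestor levels and opportunistically try a merge at each one, instead
 * of kicking off a merge from inside the split itself.
 */
#include <stdbool.h>

static bool node_exists_at_level(unsigned level) { return level < 4; /* toy tree */ }
static int  try_merge_at_level(unsigned level)   { (void)level; return 0; }

static int merge_after_split(unsigned leaf_level, unsigned max_depth)
{
    int ret = 0;
    unsigned l;

    for (l = leaf_level + 1; l < max_depth && !ret; l++) {
        if (!node_exists_at_level(l))
            break;
        ret = try_merge_at_level(l);    /* nonzero means: restart the transaction */
    }
    return ret;
}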
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 16 +++++++--------- + 1 file changed, 7 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index aeb6b3b7bc89..0aa3840b0c2f 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1505,14 +1505,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + bch2_btree_node_unlock_write(b, iter); + + btree_node_interior_verify(c, b); +- +- /* +- * when called from the btree_split path the new nodes aren't added to +- * the btree iterator yet, so the merge path's unlock/wait/relock dance +- * won't work: +- */ +- bch2_foreground_maybe_merge(c, iter, b->c.level, +- flags|BTREE_INSERT_NOUNLOCK); + return; + split: + btree_split(as, b, iter, keys, flags); +@@ -1523,6 +1515,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + { + struct btree *b = iter_l(iter)->b; + struct btree_update *as; ++ unsigned l; ++ int ret = 0; + + as = bch2_btree_update_start(iter, iter->level, + btree_update_reserve_required(c, b), flags); +@@ -1531,7 +1525,11 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + + btree_split(as, b, iter, NULL, flags); + bch2_btree_update_done(as); +- return 0; ++ ++ for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) ++ bch2_foreground_maybe_merge(c, iter, l, flags); ++ ++ return ret; + } + + void __bch2_foreground_maybe_merge(struct bch_fs *c, +-- +cgit v1.2.3 + + +From 764f997cf99fa04485f641fc025bc518e808c70a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 29 Mar 2021 01:13:31 -0400 +Subject: bcachefs: Kill bch2_btree_node_get_sibling() + +This patch reworks the btree node merge path to use a second btree +iterator to get the sibling node - which means +bch2_btree_iter_get_sibling() can be deleted. Also, it uses +bch2_btree_iter_traverse_all() if necessary - which means it should be +more reliable. We don't currently even try to make it work when +trans->nounlock is set - after a BTREE_INSERT_NOUNLOCK transaction +commit, hopefully this will be a worthwhile tradeoff. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 134 +----------------------------------- + fs/bcachefs/btree_cache.h | 3 - + fs/bcachefs/btree_update_interior.c | 61 ++++++++++------ + 3 files changed, 43 insertions(+), 155 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index ac59cb8c75c1..218c3488391b 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -912,136 +912,6 @@ out: + return b; + } + +-struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, +- struct btree_iter *iter, +- struct btree *b, +- enum btree_node_sibling sib) +-{ +- struct btree_trans *trans = iter->trans; +- struct btree *parent; +- struct btree_node_iter node_iter; +- struct bkey_packed *k; +- struct bkey_buf tmp; +- struct btree *ret = NULL; +- unsigned level = b->c.level; +- +- bch2_bkey_buf_init(&tmp); +- +- parent = btree_iter_node(iter, level + 1); +- if (!parent) +- return NULL; +- +- /* +- * There's a corner case where a btree_iter might have a node locked +- * that is just outside its current pos - when +- * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. 
+- * +- * But the lock ordering checks in __bch2_btree_node_lock() go off of +- * iter->pos, not the node's key: so if the iterator is marked as +- * needing to be traversed, we risk deadlock if we don't bail out here: +- */ +- if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) +- return ERR_PTR(-EINTR); +- +- if (!bch2_btree_node_relock(iter, level + 1)) { +- ret = ERR_PTR(-EINTR); +- goto out; +- } +- +- node_iter = iter->l[parent->c.level].iter; +- +- k = bch2_btree_node_iter_peek_all(&node_iter, parent); +- BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); +- +- k = sib == btree_prev_sib +- ? bch2_btree_node_iter_prev(&node_iter, parent) +- : (bch2_btree_node_iter_advance(&node_iter, parent), +- bch2_btree_node_iter_peek(&node_iter, parent)); +- if (!k) +- goto out; +- +- bch2_bkey_buf_unpack(&tmp, c, parent, k); +- +- ret = bch2_btree_node_get(c, iter, tmp.k, level, +- SIX_LOCK_intent, _THIS_IP_); +- +- if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { +- struct btree_iter *linked; +- +- if (!bch2_btree_node_relock(iter, level + 1)) +- goto out; +- +- /* +- * We might have got -EINTR because trylock failed, and we're +- * holding other locks that would cause us to deadlock: +- */ +- trans_for_each_iter(trans, linked) +- if (btree_iter_lock_cmp(iter, linked) < 0) +- __bch2_btree_iter_unlock(linked); +- +- if (sib == btree_prev_sib) +- btree_node_unlock(iter, level); +- +- ret = bch2_btree_node_get(c, iter, tmp.k, level, +- SIX_LOCK_intent, _THIS_IP_); +- +- /* +- * before btree_iter_relock() calls btree_iter_verify_locks(): +- */ +- if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) +- btree_node_unlock(iter, level + 1); +- +- if (!bch2_btree_node_relock(iter, level)) { +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); +- +- if (!IS_ERR(ret)) { +- six_unlock_intent(&ret->c.lock); +- ret = ERR_PTR(-EINTR); +- } +- } +- +- bch2_trans_relock(trans); +- } +-out: +- if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) +- btree_node_unlock(iter, level + 1); +- +- if (PTR_ERR_OR_ZERO(ret) == -EINTR) +- bch2_btree_iter_upgrade(iter, level + 2); +- +- BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); +- +- if (!IS_ERR_OR_NULL(ret)) { +- struct btree *n1 = ret, *n2 = b; +- +- if (sib != btree_prev_sib) +- swap(n1, n2); +- +- if (bpos_cmp(bpos_successor(n1->key.k.p), +- n2->data->min_key)) { +- char buf1[200], buf2[200]; +- +- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key)); +- bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key)); +- +- bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n" +- "prev: %s\n" +- "next: %s\n", +- bch2_btree_ids[iter->btree_id], level, +- buf1, buf2); +- +- six_unlock_intent(&ret->c.lock); +- ret = NULL; +- } +- } +- +- bch2_btree_trans_verify_locks(trans); +- +- bch2_bkey_buf_exit(&tmp, c); +- +- return ret; +-} +- + void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) +@@ -1081,7 +951,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + " format: u64s %u fields %u %u %u %u %u\n" + " unpack fn len: %u\n" + " bytes used %zu/%zu (%zu%% full)\n" +- " sib u64s: %u, %u (merge threshold %zu)\n" ++ " sib u64s: %u, %u (merge threshold %u)\n" + " nr packed keys %u\n" + " nr unpacked keys %u\n" + " floats %zu\n" +@@ -1098,7 +968,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + b->nr.live_u64s * 100 / btree_max_u64s(c), + b->sib_u64s[0], + b->sib_u64s[1], +- 
BTREE_FOREGROUND_MERGE_THRESHOLD(c), ++ c->btree_foreground_merge_threshold, + b->nr.packed_keys, + b->nr.unpacked_keys, + stats.floats, +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 217988696a77..aa8fe4a1b04b 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -26,9 +26,6 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, + struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, + enum btree_id, unsigned, bool); + +-struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, +- struct btree *, enum btree_node_sibling); +- + void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, enum btree_id, unsigned); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 0aa3840b0c2f..b8a2fe562a17 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1539,36 +1539,50 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, + enum btree_node_sibling sib) + { + struct btree_trans *trans = iter->trans; ++ struct btree_iter *sib_iter = NULL; + struct btree_update *as; + struct bkey_format_state new_s; + struct bkey_format new_f; + struct bkey_i delete; + struct btree *b, *m, *n, *prev, *next, *parent; ++ struct bpos sib_pos; + size_t sib_u64s; + int ret = 0; + ++ if (trans->nounlock) ++ return; ++ + BUG_ON(!btree_node_locked(iter, level)); + retry: ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto err; ++ + BUG_ON(!btree_node_locked(iter, level)); + + b = iter->l[level].b; + +- parent = btree_node_parent(iter, b); +- if (!parent) ++ if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || ++ (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) { ++ b->sib_u64s[sib] = U16_MAX; + goto out; ++ } + +- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) +- goto out; ++ sib_pos = sib == btree_prev_sib ++ ? 
bpos_predecessor(b->data->min_key) ++ : bpos_successor(b->data->max_key); + +- /* XXX: can't be holding read locks */ +- m = bch2_btree_node_get_sibling(c, iter, b, sib); +- if (IS_ERR(m)) { +- ret = PTR_ERR(m); ++ sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id, ++ sib_pos, U8_MAX, level, ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(sib_iter); ++ if (ret) + goto err; +- } + +- /* NULL means no sibling: */ +- if (!m) { ++ m = sib_iter->l[level].b; ++ ++ if (btree_node_parent(iter, b) != ++ btree_node_parent(sib_iter, m)) { + b->sib_u64s[sib] = U16_MAX; + goto out; + } +@@ -1581,6 +1595,8 @@ retry: + next = m; + } + ++ BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)); ++ + bch2_bkey_format_init(&new_s); + bch2_bkey_format_add_pos(&new_s, prev->data->min_key); + __bch2_btree_calc_format(&new_s, prev); +@@ -1598,23 +1614,21 @@ retry: + } + + sib_u64s = min(sib_u64s, btree_max_u64s(c)); ++ sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); + b->sib_u64s[sib] = sib_u64s; + +- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { +- six_unlock_intent(&m->c.lock); ++ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) + goto out; +- } + ++ parent = btree_node_parent(iter, b); + as = bch2_btree_update_start(iter, level, + btree_update_reserve_required(c, parent) + 1, + flags| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE); + ret = PTR_ERR_OR_ZERO(as); +- if (ret) { +- six_unlock_intent(&m->c.lock); ++ if (ret) + goto err; +- } + + trace_btree_merge(c, b); + +@@ -1648,6 +1662,7 @@ retry: + bch2_btree_update_get_open_buckets(as, n); + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ six_lock_increment(&m->c.lock, SIX_LOCK_intent); + bch2_btree_iter_node_drop(iter, b); + bch2_btree_iter_node_drop(iter, m); + +@@ -1663,6 +1678,7 @@ retry: + bch2_btree_update_done(as); + out: + bch2_btree_trans_verify_locks(trans); ++ bch2_trans_iter_free(trans, sib_iter); + + /* + * Don't downgrade locks here: we're called after successful insert, +@@ -1675,11 +1691,16 @@ out: + */ + return; + err: +- BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); ++ bch2_trans_iter_put(trans, sib_iter); ++ sib_iter = NULL; ++ ++ if (ret == -EINTR && bch2_trans_relock(trans)) { ++ ret = 0; ++ goto retry; ++ } + + if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { +- bch2_trans_unlock(trans); +- ret = bch2_btree_iter_traverse(iter); ++ ret = bch2_btree_iter_traverse_all(trans); + if (!ret) + goto retry; + } +-- +cgit v1.2.3 + + +From 97aede782b06f9c3bef7af7ba498571d4a34a9d2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 31 Mar 2021 16:16:39 -0400 +Subject: bcachefs: bch2_foreground_maybe_merge() now correctly reports lock + restarts + +This means that btree node splits don't have to automatically trigger a +transaction restart. 
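/*
 * Sketch only, hypothetical names: the value of returning a status from
 * the merge helper is that 0 can mean "nothing to do right now" while
 * -EINTR means "locks were dropped, restart the transaction", and the
 * first nonzero status must not be swallowed.
 */
#include <errno.h>

static int maybe_merge_with_prev_sibling(void) { return 0; }
static int maybe_merge_with_next_sibling(void) { return 0; }

static int maybe_merge_both_siblings(void)
{
    int ret = maybe_merge_with_prev_sibling();

    /* stop at the first nonzero status so a requested restart is not lost */
    return ret ? ret : maybe_merge_with_next_sibling();
}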
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 24 ++++++++++-------------- + fs/bcachefs/btree_update_interior.h | 24 ++++++++++++------------ + 2 files changed, 22 insertions(+), 26 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index b8a2fe562a17..beb9a367fd80 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1527,16 +1527,16 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + bch2_btree_update_done(as); + + for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) +- bch2_foreground_maybe_merge(c, iter, l, flags); ++ ret = bch2_foreground_maybe_merge(c, iter, l, flags); + + return ret; + } + +-void __bch2_foreground_maybe_merge(struct bch_fs *c, +- struct btree_iter *iter, +- unsigned level, +- unsigned flags, +- enum btree_node_sibling sib) ++int __bch2_foreground_maybe_merge(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, ++ unsigned flags, ++ enum btree_node_sibling sib) + { + struct btree_trans *trans = iter->trans; + struct btree_iter *sib_iter = NULL; +@@ -1547,10 +1547,7 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree *b, *m, *n, *prev, *next, *parent; + struct bpos sib_pos; + size_t sib_u64s; +- int ret = 0; +- +- if (trans->nounlock) +- return; ++ int ret = 0, ret2 = 0; + + BUG_ON(!btree_node_locked(iter, level)); + retry: +@@ -1689,17 +1686,16 @@ out: + * split path, and downgrading to read locks in there is potentially + * confusing: + */ +- return; ++ return ret ?: ret2; + err: + bch2_trans_iter_put(trans, sib_iter); + sib_iter = NULL; + +- if (ret == -EINTR && bch2_trans_relock(trans)) { +- ret = 0; ++ if (ret == -EINTR && bch2_trans_relock(trans)) + goto retry; +- } + + if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { ++ ret2 = ret; + ret = bch2_btree_iter_traverse_all(trans); + if (!ret) + goto retry; +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 2a6b51ece0f8..f2925b0d7f17 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -132,10 +132,10 @@ void bch2_btree_insert_node(struct btree_update *, struct btree *, + unsigned); + int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); + +-void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, +- unsigned, unsigned, enum btree_node_sibling); ++int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, ++ unsigned, unsigned, enum btree_node_sibling); + +-static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, ++static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, unsigned flags, + enum btree_node_sibling sib) +@@ -143,27 +143,27 @@ static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, + struct btree *b; + + if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) +- return; ++ return 0; + + if (!bch2_btree_node_relock(iter, level)) +- return; ++ return 0; + + b = iter->l[level].b; + if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) +- return; ++ return 0; + +- __bch2_foreground_maybe_merge(c, iter, level, flags, sib); ++ return __bch2_foreground_maybe_merge(c, iter, level, flags, sib); + } + +-static inline void bch2_foreground_maybe_merge(struct bch_fs *c, ++static inline int bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, + unsigned 
flags) + { +- bch2_foreground_maybe_merge_sibling(c, iter, level, flags, +- btree_prev_sib); +- bch2_foreground_maybe_merge_sibling(c, iter, level, flags, +- btree_next_sib); ++ return bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ btree_prev_sib) ?: ++ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ btree_next_sib); + } + + void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); +-- +cgit v1.2.3 + + +From 4ea20d484b5686231921dcd5bed88ce992d6d3b4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 29 Mar 2021 01:13:31 -0400 +Subject: bcachefs: Move btree node merging to before transaction commit + +Currently, BTREE_INSERT_NOUNLOCK makes it hard to ensure btree node +merging happens reliably - since btree node merging happens after +transaction commit, we can't drop btree locks and block when starting +the btree update. + +This patch moves it to before transaction commit - and failure to do a +merge that we wanted to do just restarts the transaction. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 76 ++++++++++++++++++++++++++++++++--------- + 1 file changed, 60 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 0348ba782cb5..f8e394e2e767 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -492,20 +492,75 @@ err: + return ret; + } + ++static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ struct btree_insert_entry *i; ++ struct btree *b = iter_l(iter)->b; ++ struct bkey_s_c old; ++ int u64s_delta = 0; ++ int ret; ++ ++ /* ++ * Inserting directly into interior nodes is an uncommon operation with ++ * various weird edge cases: also, a lot of things about ++ * BTREE_ITER_NODES iters need to be audited ++ */ ++ if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS)) ++ return 0; ++ ++ BUG_ON(iter->level); ++ ++ trans_for_each_update2(trans, i) { ++ if (iter_l(i->iter)->b != b) ++ continue; ++ ++ old = bch2_btree_iter_peek_slot(i->iter); ++ ret = bkey_err(old); ++ if (ret) ++ return ret; ++ ++ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; ++ u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; ++ } ++ ++ return u64s_delta <= 0 ++ ? 
(bch2_foreground_maybe_merge(trans->c, iter, iter->level, ++ trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR) ++ : 0; ++} ++ + /* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ + static inline int do_bch2_trans_commit(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) + { ++ struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct btree_iter *iter; + int ret; + ++ trans_for_each_update2(trans, i) { ++ struct btree *b; ++ ++ BUG_ON(!btree_node_intent_locked(i->iter, i->level)); ++ ++ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED) ++ continue; ++ ++ b = iter_l(i->iter)->b; ++ if (b->sib_u64s[0] < c->btree_foreground_merge_threshold || ++ b->sib_u64s[1] < c->btree_foreground_merge_threshold) { ++ ret = maybe_do_btree_merge(trans, i->iter); ++ if (unlikely(ret)) ++ return ret; ++ } ++ } ++ + trans_for_each_update2(trans, i) +- BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); ++ BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + +- ret = bch2_journal_preres_get(&trans->c->journal, ++ ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, trans->journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK| + ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) +@@ -547,7 +602,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + trans_for_each_update2(trans, i) + if (!same_leaf_as_prev(trans, i)) +- bch2_btree_node_lock_for_insert(trans->c, ++ bch2_btree_node_lock_for_insert(c, + iter_l(i->iter)->b, i->iter); + + ret = bch2_trans_commit_write_locked(trans, stopped_at); +@@ -558,29 +613,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + i->iter); + + if (!ret && trans->journal_pin) +- bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, ++ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + trans->journal_pin, NULL); + + /* + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: + */ +- bch2_journal_res_put(&trans->c->journal, &trans->journal_res); ++ bch2_journal_res_put(&c->journal, &trans->journal_res); + + if (unlikely(ret)) + return ret; + +- if (trans->flags & BTREE_INSERT_NOUNLOCK) +- trans->nounlock = true; +- +- trans_for_each_update2(trans, i) +- if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && +- !same_leaf_as_prev(trans, i)) +- bch2_foreground_maybe_merge(trans->c, i->iter, +- 0, trans->flags); +- +- trans->nounlock = false; +- + bch2_trans_downgrade(trans); + + return 0; +-- +cgit v1.2.3 + + +From 4a96402580c0bc3f6de6c59c0b221b9cebc5bed0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 31 Mar 2021 16:43:50 -0400 +Subject: bcachefs: Drop trans->nounlock + +Since we're no longer doing btree node merging post commit, we can now +delete a bunch of code. 
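/*
 * Sketch of the simplification, with made-up types and helpers: with the
 * "no unlock allowed" mode gone, raising an iterator's wanted lock depth
 * has a single code path - bump locks_want and (re)take locks, which is
 * now always permitted to drop and retake them.
 */
#include <stdbool.h>

struct iter_sketch {
    unsigned locks_want;
};

static bool get_locks_up_to(struct iter_sketch *it, unsigned want)
{
    (void)it; (void)want;
    return true;                        /* stand-in for btree_iter_get_locks() */
}

static bool upgrade_iter(struct iter_sketch *it, unsigned new_want)
{
    if (it->locks_want >= new_want)
        return true;                    /* already holding enough */

    it->locks_want = new_want;
    return get_locks_up_to(it, new_want);   /* may drop and retake locks */
}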
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 94 +++++++++++++++++------------------------ + fs/bcachefs/btree_iter.h | 5 +-- + fs/bcachefs/btree_locking.h | 9 +--- + fs/bcachefs/btree_types.h | 1 - + fs/bcachefs/btree_update_leaf.c | 9 ++-- + 5 files changed, 44 insertions(+), 74 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 478839d9930b..7355816eafa0 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -267,17 +267,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { +- if (!(trans->nounlock)) { +- linked->locks_want = max_t(unsigned, +- linked->locks_want, +- __fls(linked->nodes_locked) + 1); +- if (!btree_iter_get_locks(linked, true, false)) { +- deadlock_iter = linked; +- reason = 1; +- } +- } else { ++ linked->locks_want = max_t(unsigned, ++ linked->locks_want, ++ __fls(linked->nodes_locked) + 1); ++ if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; +- reason = 2; ++ reason = 1; + } + } + +@@ -307,18 +302,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + * we're about to lock, it must have the ancestors locked too: + */ + if (level > __fls(linked->nodes_locked)) { +- if (!(trans->nounlock)) { +- linked->locks_want = +- max(level + 1, max_t(unsigned, +- linked->locks_want, +- iter->locks_want)); +- if (!btree_iter_get_locks(linked, true, false)) { +- deadlock_iter = linked; +- reason = 5; +- } +- } else { ++ linked->locks_want = ++ max(level + 1, max_t(unsigned, ++ linked->locks_want, ++ iter->locks_want)); ++ if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; +- reason = 6; ++ reason = 5; + } + } + +@@ -441,30 +431,6 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + return false; + } + +-bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, +- unsigned new_locks_want) +-{ +- unsigned l = iter->level; +- +- EBUG_ON(iter->locks_want >= new_locks_want); +- +- iter->locks_want = new_locks_want; +- +- do { +- if (!btree_iter_node(iter, l)) +- break; +- +- if (!bch2_btree_node_upgrade(iter, l)) { +- iter->locks_want = l; +- return false; +- } +- +- l++; +- } while (l < iter->locks_want); +- +- return true; +-} +- + void __bch2_btree_iter_downgrade(struct btree_iter *iter, + unsigned downgrade_to) + { +@@ -1046,7 +1012,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) + + trans_for_each_iter(iter->trans, linked) + if (linked->l[level].b == b) { +- __btree_node_unlock(linked, level); ++ btree_node_unlock(linked, level); + linked->l[level].b = BTREE_ITER_NO_NODE_DROP; + } + } +@@ -2083,8 +2049,10 @@ alloc_iter: + + if (!(iter->flags & BTREE_ITER_INTENT)) + bch2_btree_iter_downgrade(iter); +- else if (!iter->locks_want) +- __bch2_btree_iter_upgrade_nounlock(iter, 1); ++ else if (!iter->locks_want) { ++ iter->locks_want = 1; ++ btree_iter_get_locks(iter, true, false); ++ } + + bch2_btree_iter_set_pos(iter, pos); + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); +@@ -2349,11 +2317,22 @@ bch2_btree_iter_node_to_text(struct printbuf *out, + struct btree_bkey_cached_common *_b, + enum btree_iter_type type) + { +- pr_buf(out, " %px l=%u %s:", +- _b, _b->level, bch2_btree_ids[_b->btree_id]); ++ pr_buf(out, " l=%u %s:", ++ _b->level, bch2_btree_ids[_b->btree_id]); + bch2_bpos_to_text(out, btree_node_pos(_b, type)); + } + ++static bool trans_has_btree_nodes_locked(struct 
btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ if (btree_iter_type(iter) != BTREE_ITER_CACHED && ++ iter->nodes_locked) ++ return true; ++ return false; ++} ++ + void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + { + #ifdef CONFIG_BCACHEFS_DEBUG +@@ -2364,14 +2343,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { +- pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); ++ if (!trans_has_btree_nodes_locked(trans)) ++ continue; ++ ++ pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); + + trans_for_each_iter(trans, iter) { + if (!iter->nodes_locked) + continue; + +- pr_buf(out, " iter %u %s:", ++ pr_buf(out, " iter %u %c %s:", + iter->idx, ++ btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', + bch2_btree_ids[iter->btree_id]); + bch2_bpos_to_text(out, iter->pos); + pr_buf(out, "\n"); +@@ -2390,17 +2373,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + b = READ_ONCE(trans->locking); + if (b) { +- pr_buf(out, " locking iter %u l=%u %s:", ++ iter = &trans->iters[trans->locking_iter_idx]; ++ pr_buf(out, " locking iter %u %c l=%u %s:", + trans->locking_iter_idx, ++ btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', + trans->locking_level, + bch2_btree_ids[trans->locking_btree_id]); + bch2_bpos_to_text(out, trans->locking_pos); + +- + pr_buf(out, " node "); + bch2_btree_iter_node_to_text(out, + (void *) b, +- btree_iter_type(&trans->iters[trans->locking_iter_idx])); ++ btree_iter_type(iter)); + pr_buf(out, "\n"); + } + } +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 7585f989ad50..1a11e68911ba 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -116,7 +116,6 @@ bool bch2_trans_relock(struct btree_trans *); + void bch2_trans_unlock(struct btree_trans *); + + bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); +-bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); + + static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, + unsigned new_locks_want) +@@ -124,9 +123,7 @@ static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + + return iter->locks_want < new_locks_want +- ? (!iter->trans->nounlock +- ? __bch2_btree_iter_upgrade(iter, new_locks_want) +- : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) ++ ? 
__bch2_btree_iter_upgrade(iter, new_locks_want) + : iter->uptodate <= BTREE_ITER_NEED_PEEK; + } + +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index b166a94753f7..7532bcdef967 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -95,7 +95,7 @@ btree_lock_want(struct btree_iter *iter, int level) + return BTREE_NODE_UNLOCKED; + } + +-static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) ++static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) + { + int lock_type = btree_node_locked_type(iter, level); + +@@ -106,13 +106,6 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) + mark_btree_node_unlocked(iter, level); + } + +-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) +-{ +- EBUG_ON(!level && iter->trans->nounlock); +- +- __btree_node_unlock(iter, level); +-} +- + static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) + { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 038cd1f94376..abbc548666e9 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -374,7 +374,6 @@ struct btree_trans { + u8 nr_updates2; + unsigned used_mempool:1; + unsigned error:1; +- unsigned nounlock:1; + unsigned in_traverse_all:1; + + u64 iters_linked; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f8e394e2e767..bfd985b86450 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -984,17 +984,14 @@ int __bch2_trans_commit(struct btree_trans *trans) + goto out; + } + +- /* +- * We're not using bch2_btree_iter_upgrade here because +- * we know trans->nounlock can't be set: +- */ +- if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) && +- !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) { ++ if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { + trace_trans_restart_upgrade(trans->ip); + ret = -EINTR; + goto out; + } + ++ BUG_ON(!btree_node_intent_locked(i->iter, i->level)); ++ + u64s = jset_u64s(i->k->k.u64s); + if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && + likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) +-- +cgit v1.2.3 + + +From 269f139c883d09cd10b0afe78968327272e1fcb4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 31 Mar 2021 16:10:21 -0400 +Subject: bcachefs: Fix BTREE_FOREGROUND_MERGE_HYSTERESIS + +We were multiplying instead of dividing - oops. 
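The one-character fix that follows swaps a left shift for a right shift, and the difference is easiest to see with numbers. This standalone snippet recomputes both versions of the hysteresis for a representative, made-up btree_max_u64s value; the macros mirror the patch but are reproduced here purely for illustration.

#include <stdio.h>

#define MAX_U64S          2048U                       /* made-up example value */
#define MERGE_THRESHOLD   (MAX_U64S * 1 / 3)
#define HYSTERESIS_OLD    (MERGE_THRESHOLD + (MERGE_THRESHOLD << 2))  /* buggy */
#define HYSTERESIS_FIXED  (MERGE_THRESHOLD + (MERGE_THRESHOLD >> 2))  /* fixed */

int main(void)
{
        printf("threshold=%u old=%u fixed=%u\n",
               MERGE_THRESHOLD, HYSTERESIS_OLD, HYSTERESIS_FIXED);
        return 0;
}

The buggy definition works out to five times the merge threshold, the fixed one to 1.25 times it.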
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index aa8fe4a1b04b..4791c3b64452 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -89,7 +89,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) + #define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) + #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ +- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) + + #define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) + +-- +cgit v1.2.3 + + +From 00b6ede71e1b948d527fb7f1ea2a23f654eeef47 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 2 Apr 2021 21:29:05 -0400 +Subject: bcachefs: Increase commality between BTREE_ITER_NODES and + BTREE_ITER_KEYS + +Eventually BTREE_ITER_NODES should be going away. This patch is to fix a +transaction iterator overflow in the btree node merge path because +BTREE_ITER_NODES iterators couldn't be reused. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 81 ++++++++++++++++++++++++++---------------------- + fs/bcachefs/btree_iter.h | 13 +++++--- + 2 files changed, 53 insertions(+), 41 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 7355816eafa0..d6f5c383e7ba 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -432,25 +432,24 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + } + + void __bch2_btree_iter_downgrade(struct btree_iter *iter, +- unsigned downgrade_to) ++ unsigned new_locks_want) + { +- unsigned l, new_locks_want = downgrade_to ?: +- (iter->flags & BTREE_ITER_INTENT ? 1 : 0); ++ unsigned l; + +- if (iter->locks_want < downgrade_to) { +- iter->locks_want = new_locks_want; ++ EBUG_ON(iter->locks_want < new_locks_want); + +- while (iter->nodes_locked && +- (l = __fls(iter->nodes_locked)) >= iter->locks_want) { +- if (l > iter->level) { +- btree_node_unlock(iter, l); +- } else { +- if (btree_node_intent_locked(iter, l)) { +- six_lock_downgrade(&iter->l[l].b->c.lock); +- iter->nodes_intent_locked ^= 1 << l; +- } +- break; ++ iter->locks_want = new_locks_want; ++ ++ while (iter->nodes_locked && ++ (l = __fls(iter->nodes_locked)) >= iter->locks_want) { ++ if (l > iter->level) { ++ btree_node_unlock(iter, l); ++ } else { ++ if (btree_node_intent_locked(iter, l)) { ++ six_lock_downgrade(&iter->l[l].b->c.lock); ++ iter->nodes_intent_locked ^= 1 << l; + } ++ break; + } + } + +@@ -1993,6 +1992,8 @@ static inline void btree_iter_copy(struct btree_iter *dst, + + struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + unsigned btree_id, struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, + unsigned flags) + { + struct btree_iter *iter, *best = NULL; +@@ -2005,10 +2006,6 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + pos.snapshot = btree_type_has_snapshots(btree_id) + ? 
U32_MAX : 0; + +- /* We always want a fresh iterator for node iterators: */ +- if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES) +- goto alloc_iter; +- + trans_for_each_iter(trans, iter) { + if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) + continue; +@@ -2023,7 +2020,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + + best = iter; + } +-alloc_iter: ++ + if (!best) { + iter = btree_trans_iter_alloc(trans); + bch2_btree_iter_init(trans, iter, btree_id); +@@ -2047,13 +2044,26 @@ alloc_iter: + + iter->snapshot = pos.snapshot; + +- if (!(iter->flags & BTREE_ITER_INTENT)) +- bch2_btree_iter_downgrade(iter); +- else if (!iter->locks_want) { +- iter->locks_want = 1; ++ locks_want = min(locks_want, BTREE_MAX_DEPTH); ++ ++ if (locks_want > iter->locks_want) { ++ iter->locks_want = locks_want; + btree_iter_get_locks(iter, true, false); ++ } else if (locks_want < iter->locks_want) { ++ __bch2_btree_iter_downgrade(iter, locks_want); + } + ++ while (iter->level < depth) { ++ btree_node_unlock(iter, iter->level); ++ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; ++ iter->level++; ++ } ++ ++ while (iter->level > depth) ++ iter->l[--iter->level].b = BTREE_ITER_NO_NODE_INIT; ++ ++ iter->min_depth = depth; ++ + bch2_btree_iter_set_pos(iter, pos); + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + +@@ -2069,21 +2079,16 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + { + struct btree_iter *iter = + __bch2_trans_get_iter(trans, btree_id, pos, +- BTREE_ITER_NODES| +- BTREE_ITER_NOT_EXTENTS| +- BTREE_ITER_ALL_SNAPSHOTS| +- flags); +- unsigned i; ++ locks_want, depth, ++ BTREE_ITER_NODES| ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ flags); + + BUG_ON(bkey_cmp(iter->pos, pos)); +- +- iter->locks_want = locks_want; +- iter->level = depth; +- iter->min_depth = depth; +- +- for (i = 0; i < ARRAY_SIZE(iter->l); i++) +- iter->l[i].b = NULL; +- iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; ++ BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH)); ++ BUG_ON(iter->level != depth); ++ BUG_ON(iter->min_depth != depth); + iter->ip_allocated = _RET_IP_; + + return iter; +@@ -2322,6 +2327,7 @@ bch2_btree_iter_node_to_text(struct printbuf *out, + bch2_bpos_to_text(out, btree_node_pos(_b, type)); + } + ++#ifdef CONFIG_BCACHEFS_DEBUG + static bool trans_has_btree_nodes_locked(struct btree_trans *trans) + { + struct btree_iter *iter; +@@ -2332,6 +2338,7 @@ static bool trans_has_btree_nodes_locked(struct btree_trans *trans) + return true; + return false; + } ++#endif + + void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + { +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 1a11e68911ba..455f2fe4929c 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -131,8 +131,10 @@ void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); + + static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) + { +- if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) +- __bch2_btree_iter_downgrade(iter, 0); ++ unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); ++ ++ if (iter->locks_want > new_locks_want) ++ __bch2_btree_iter_downgrade(iter, new_locks_want); + } + + void bch2_trans_downgrade(struct btree_trans *); +@@ -258,14 +260,17 @@ int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); + void bch2_trans_unlink_iters(struct btree_trans *); + + struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, +- struct bpos, unsigned); ++ struct bpos, unsigned, ++ unsigned, unsigned); + + static inline struct btree_iter * + bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, + struct bpos pos, unsigned flags) + { + struct btree_iter *iter = +- __bch2_trans_get_iter(trans, btree_id, pos, flags); ++ __bch2_trans_get_iter(trans, btree_id, pos, ++ (flags & BTREE_ITER_INTENT) != 0, 0, ++ flags); + iter->ip_allocated = _THIS_IP_; + return iter; + } +-- +cgit v1.2.3 + + +From 1e9ceca853760b6b2d82c6599036b97c300d0c3e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 3 Apr 2021 18:37:09 -0400 +Subject: bcachefs: Fix this_cpu_ptr() usage + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index ce4cd9c741a1..879e14245766 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1388,7 +1388,7 @@ static int bch2_fs_usage_apply(struct bch_fs *c, + struct disk_reservation *disk_res, + unsigned journal_seq) + { +- struct bch_fs_usage *dst = fs_usage_ptr(c, journal_seq, false); ++ struct bch_fs_usage *dst; + s64 added = src->u.data + src->u.reserved; + s64 should_not_have_added; + int ret = 0; +@@ -1416,6 +1416,7 @@ static int bch2_fs_usage_apply(struct bch_fs *c, + this_cpu_add(*c->online_reserved, src->online_reserved); + + preempt_disable(); ++ dst = fs_usage_ptr(c, journal_seq, false); + acc_u64s((u64 *) dst, (u64 *) &src->u, fs_usage_u64s(c)); + preempt_enable(); + +-- +cgit v1.2.3 + + +From 35e77acc279a693ae1853d96d56b1f9fa80a70cf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 3 Apr 2021 19:27:05 -0400 +Subject: bcachefs: Fix journal deadlock + +After we get a journal reservation, we need to use it - if we erorr out +of a transaction commit, we'll be eating into space in the journal and +if our transaction needs to make forward progress in order to reclaim +space in the journal, we'll deadlock. 
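The reordering in the diff that follows (moving the replicas check ahead of the journal reservation) enforces a simple rule: every check that can fail runs before journal space is reserved, so bailing out of a commit never leaves the journal holding space that only the failed transaction could have freed. A minimal userspace sketch of that ordering, with invented names and a toy journal, purely for illustration:

#include <stdio.h>
#include <stdbool.h>

struct journal { unsigned remaining; };

static bool replicas_marked(void)
{
        return false;                     /* pretend the check fails */
}

static int journal_res_get(struct journal *j, unsigned u64s)
{
        if (j->remaining < u64s)
                return -1;
        j->remaining -= u64s;
        return 0;
}

static int commit(struct journal *j, unsigned u64s)
{
        /* 1) every fallible check first */
        if (!replicas_marked())
                return -2;                /* restart without touching the journal */

        /* 2) only now consume journal space */
        if (journal_res_get(j, u64s))
                return -1;

        /* ... write the entries, which uses up the reserved space ... */
        return 0;
}

int main(void)
{
        struct journal j = { .remaining = 64 };

        printf("commit=%d remaining=%u\n", commit(&j, 16), j.remaining);
        return 0;
}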
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index bfd985b86450..36eb85929086 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -426,6 +426,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + fs_usage = bch2_fs_usage_scratch_get(c); + } + ++ /* Must be called under mark_lock: */ ++ if (marking && trans->fs_usage_deltas && ++ bch2_replicas_delta_list_apply(c, &fs_usage->u, ++ trans->fs_usage_deltas)) { ++ ret = BTREE_INSERT_NEED_MARK_REPLICAS; ++ goto err; ++ } ++ + /* + * Don't get journal reservation until after we know insert will + * succeed: +@@ -462,14 +470,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + i->k->k.version = MAX_VERSION; + } + +- /* Must be called under mark_lock: */ +- if (marking && trans->fs_usage_deltas && +- bch2_replicas_delta_list_apply(c, &fs_usage->u, +- trans->fs_usage_deltas)) { +- ret = BTREE_INSERT_NEED_MARK_REPLICAS; +- goto err; +- } +- + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) + bch2_mark_update(trans, i->iter, i->k, +-- +cgit v1.2.3 + + +From 845ee57cb7eac878fdf50e529bd99a87cc5396e9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 3 Apr 2021 16:24:13 -0400 +Subject: bcachefs: Be more careful about JOURNAL_RES_GET_RESERVED + +JOURNAL_RES_GET_RESERVED should only be used for updatse that need to be +done to free up space in the journal. In particular, when we're flushing +keys from the key cache, if we're flushing them out of order we +shouldn't be using it, since we're using up our remaining space in the +journal without dropping a pin that will let us make forward progress. + +With this patch, BTREE_INSERT_JOURNAL_RECLAIM without +BTREE_INSERT_JOURNAL_RESERVED may return -EAGAIN - we can't wait on +journal reclaim if we're already in journal reclaim. + +This means we need to propagate these errors up to journal reclaim, +indicating that flushing a journal pin should be retried in the future. + +This is prep work for a patch to change the way journal reclaim works, +to split out flushing key cache keys because the btree key cache is too +dirty from journal reclaim because we need space in the journal. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 24 ++++++++++----- + fs/bcachefs/btree_update_interior.c | 9 ++++-- + fs/bcachefs/btree_update_leaf.c | 15 ++++++--- + fs/bcachefs/journal.c | 24 +++++++++++++++ + fs/bcachefs/journal.h | 3 +- + fs/bcachefs/journal_reclaim.c | 61 +++++++++++++++++++++++-------------- + fs/bcachefs/journal_types.h | 3 +- + 7 files changed, 99 insertions(+), 40 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 98cf092a0b95..215b2e1963e1 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -352,6 +352,7 @@ err: + static int btree_key_cache_flush_pos(struct btree_trans *trans, + struct bkey_cached_key key, + u64 journal_seq, ++ unsigned commit_flags, + bool evict) + { + struct bch_fs *c = trans->c; +@@ -390,12 +391,17 @@ retry: + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_JOURNAL_RESERVED| +- BTREE_INSERT_JOURNAL_RECLAIM); ++ (ck->journal.seq == journal_last_seq(j) ++ ? 
BTREE_INSERT_JOURNAL_RESERVED ++ : 0)| ++ commit_flags); + err: + if (ret == -EINTR) + goto retry; + ++ if (ret == -EAGAIN) ++ goto out; ++ + if (ret) { + bch2_fs_fatal_err_on(!bch2_journal_error(j), c, + "error flushing key cache: %i", ret); +@@ -438,15 +444,16 @@ out: + return ret; + } + +-static void btree_key_cache_journal_flush(struct journal *j, +- struct journal_entry_pin *pin, +- u64 seq) ++static int btree_key_cache_journal_flush(struct journal *j, ++ struct journal_entry_pin *pin, ++ u64 seq) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bkey_cached *ck = + container_of(pin, struct bkey_cached, journal); + struct bkey_cached_key key; + struct btree_trans trans; ++ int ret = 0; + + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + +@@ -461,10 +468,13 @@ static void btree_key_cache_journal_flush(struct journal *j, + six_unlock_read(&ck->c.lock); + + bch2_trans_init(&trans, c, 0, 0); +- btree_key_cache_flush_pos(&trans, key, seq, false); ++ ret = btree_key_cache_flush_pos(&trans, key, seq, ++ BTREE_INSERT_JOURNAL_RECLAIM, false); + bch2_trans_exit(&trans); + unlock: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); ++ ++ return ret; + } + + /* +@@ -480,7 +490,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, + if (!bch2_btree_key_cache_find(c, id, pos)) + return 0; + +- return btree_key_cache_flush_pos(trans, key, 0, true); ++ return btree_key_cache_flush_pos(trans, key, 0, 0, true); + } + + bool bch2_btree_insert_key_cached(struct btree_trans *trans, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index beb9a367fd80..f61f41436580 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -916,10 +916,12 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level, + struct closure cl; + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0; +- int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) +- ? 
JOURNAL_RES_GET_RECLAIM : 0; ++ int journal_flags = 0; + int ret = 0; + ++ if (flags & BTREE_INSERT_JOURNAL_RESERVED) ++ journal_flags |= JOURNAL_RES_GET_RESERVED; ++ + closure_init_stack(&cl); + retry: + /* +@@ -982,6 +984,9 @@ retry: + + bch2_trans_unlock(trans); + ++ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) ++ goto err; ++ + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, + journal_flags); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 36eb85929086..7379db6648b3 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -134,7 +134,7 @@ fix_iter: + return true; + } + +-static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, ++static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + unsigned i, u64 seq) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); +@@ -145,14 +145,15 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + bch2_btree_node_write_cond(c, b, + (btree_current_write(b) == w && w->journal.seq == seq)); + six_unlock_read(&b->c.lock); ++ return 0; + } + +-static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) + { + return __btree_node_flush(j, pin, 0, seq); + } + +-static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) + { + return __btree_node_flush(j, pin, 1, seq); + } +@@ -563,8 +564,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, trans->journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK| +- ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) +- ? JOURNAL_RES_GET_RECLAIM : 0)); ++ ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) ++ ? 
JOURNAL_RES_GET_RESERVED : 0)); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, + trans->journal_preres_u64s); +@@ -721,6 +722,10 @@ int bch2_trans_commit_error(struct btree_trans *trans, + case BTREE_INSERT_NEED_JOURNAL_RES: + bch2_trans_unlock(trans); + ++ if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && ++ !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) ++ return -EAGAIN; ++ + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); + if (ret) + return ret; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 063505abc641..425f8e1719ca 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -11,6 +11,7 @@ + #include "btree_gc.h" + #include "btree_update.h" + #include "buckets.h" ++#include "error.h" + #include "journal.h" + #include "journal_io.h" + #include "journal_reclaim.h" +@@ -450,6 +451,27 @@ unlock: + if (!ret) + goto retry; + ++ if ((ret == cur_entry_journal_full || ++ ret == cur_entry_journal_pin_full) && ++ !can_discard && ++ j->reservations.idx == j->reservations.unwritten_idx && ++ (flags & JOURNAL_RES_GET_RESERVED)) { ++ char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); ++ ++ bch_err(c, "Journal stuck!"); ++ if (journal_debug_buf) { ++ bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); ++ bch_err(c, "%s", journal_debug_buf); ++ ++ bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j); ++ bch_err(c, "Journal pins:\n%s", journal_debug_buf); ++ kfree(journal_debug_buf); ++ } ++ ++ bch2_fatal_error(c); ++ dump_stack(); ++ } ++ + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: +@@ -1167,6 +1189,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + "last_seq_ondisk:\t%llu\n" + "flushed_seq_ondisk:\t%llu\n" + "prereserved:\t\t%u/%u\n" ++ "each entry reserved:\t%u\n" + "nr flush writes:\t%llu\n" + "nr noflush writes:\t%llu\n" + "nr direct reclaim:\t%llu\n" +@@ -1181,6 +1204,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + j->flushed_seq_ondisk, + j->prereserved.reserved, + j->prereserved.remaining, ++ j->entry_u64s_reserved, + j->nr_flush_writes, + j->nr_noflush_writes, + j->nr_direct_reclaim, +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index bda8cb97d321..221f5bb01e95 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -306,7 +306,6 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, + #define JOURNAL_RES_GET_NONBLOCK (1 << 0) + #define JOURNAL_RES_GET_CHECK (1 << 1) + #define JOURNAL_RES_GET_RESERVED (1 << 2) +-#define JOURNAL_RES_GET_RECLAIM (1 << 3) + + static inline int journal_res_get_fast(struct journal *j, + struct journal_res *res, +@@ -444,7 +443,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, + * into the reclaim path and deadlock: + */ + +- if (!(flags & JOURNAL_RES_GET_RECLAIM) && ++ if (!(flags & JOURNAL_RES_GET_RESERVED) && + new.reserved > new.remaining) + return 0; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 32ac6da4672b..d7ff66b2ccb6 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -239,7 +239,7 @@ void bch2_journal_space_available(struct journal *j) + u64s_remaining = (u64) clean << 6; + u64s_remaining -= (u64) total << 3; + u64s_remaining = max(0LL, u64s_remaining); +- u64s_remaining /= 2; ++ u64s_remaining /= 4; + u64s_remaining = min_t(u64, u64s_remaining, 
U32_MAX); + out: + j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; +@@ -353,6 +353,9 @@ static inline void __journal_pin_drop(struct journal *j, + if (!journal_pin_active(pin)) + return; + ++ if (j->flush_in_progress == pin) ++ j->flush_in_progress_dropped = true; ++ + pin_list = journal_seq_pin(j, pin->seq); + pin->seq = 0; + list_del_init(&pin->list); +@@ -439,34 +442,27 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret = NULL; + +- if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) +- return NULL; +- +- spin_lock(&j->lock); +- + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) + if (*seq > max_seq || + (ret = list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list))) + break; + +- if (ret) { +- list_move(&ret->list, &pin_list->flushed); +- BUG_ON(j->flush_in_progress); +- j->flush_in_progress = ret; +- } +- +- spin_unlock(&j->lock); +- + return ret; + } + + /* returns true if we did work */ +-static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, +- unsigned min_nr) ++static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, ++ unsigned min_nr) + { + struct journal_entry_pin *pin; +- u64 seq, ret = 0; ++ size_t nr_flushed = 0; ++ journal_pin_flush_fn flush_fn; ++ u64 seq; ++ int err; ++ ++ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) ++ return 0; + + lockdep_assert_held(&j->reclaim_lock); + +@@ -475,23 +471,42 @@ static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, + + j->last_flushed = jiffies; + ++ spin_lock(&j->lock); + pin = journal_get_next_pin(j, min_nr + ? U64_MAX : seq_to_flush, &seq); ++ if (pin) { ++ BUG_ON(j->flush_in_progress); ++ j->flush_in_progress = pin; ++ j->flush_in_progress_dropped = false; ++ flush_fn = pin->flush; ++ } ++ spin_unlock(&j->lock); ++ + if (!pin) + break; + + if (min_nr) + min_nr--; + +- pin->flush(j, pin, seq); ++ err = flush_fn(j, pin, seq); + +- BUG_ON(j->flush_in_progress != pin); ++ spin_lock(&j->lock); ++ /* Pin might have been dropped or rearmed: */ ++ if (likely(!err && !j->flush_in_progress_dropped)) ++ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); + j->flush_in_progress = NULL; ++ j->flush_in_progress_dropped = false; ++ spin_unlock(&j->lock); ++ + wake_up(&j->pin_flush_wait); +- ret++; ++ ++ if (err) ++ break; ++ ++ nr_flushed++; + } + +- return ret; ++ return nr_flushed; + } + + static u64 journal_seq_to_flush(struct journal *j) +@@ -556,8 +571,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool kthread = (current->flags & PF_KTHREAD) != 0; +- u64 seq_to_flush, nr_flushed = 0; +- size_t min_nr; ++ u64 seq_to_flush; ++ size_t min_nr, nr_flushed; + unsigned flags; + int ret = 0; + +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 7fcf5150db2c..98f1a6e222de 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -50,7 +50,7 @@ struct journal_entry_pin_list { + + struct journal; + struct journal_entry_pin; +-typedef void (*journal_pin_flush_fn)(struct journal *j, ++typedef int (*journal_pin_flush_fn)(struct journal *j, + struct journal_entry_pin *, u64); + + struct journal_entry_pin { +@@ -251,6 +251,7 @@ struct journal { + + unsigned long last_flushed; + struct journal_entry_pin *flush_in_progress; ++ bool flush_in_progress_dropped; + wait_queue_head_t pin_flush_wait; + + /* protects advancing 
ja->discard_idx: */ +-- +cgit v1.2.3 + + +From 379ad24e04716b3b7e9fe79cc532dcec26008b8e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 3 Apr 2021 19:41:09 -0400 +Subject: bcachefs: Fix livelock calling bch2_mark_bkey_replicas() + +The bug was that we were trying to find a replicas entry that wasn't +sorted - but, we can also simplify the code by not using +bch2_mark_bkey_replicas and instead ensuring the list of replicas +entries exists directly. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 8 +++----- + fs/bcachefs/buckets.c | 13 +++++++++++++ + fs/bcachefs/buckets.h | 2 ++ + 3 files changed, 18 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 7379db6648b3..834409d8c785 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -707,11 +707,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, + case BTREE_INSERT_NEED_MARK_REPLICAS: + bch2_trans_unlock(trans); + +- trans_for_each_update(trans, i) { +- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); +- if (ret) +- return ret; +- } ++ ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); ++ if (ret) ++ return ret; + + if (bch2_trans_relock(trans)) + return 0; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 879e14245766..85a1d28599e0 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -561,6 +561,7 @@ static inline void update_replicas_list(struct btree_trans *trans, + n = (void *) d->d + d->used; + n->delta = sectors; + memcpy(&n->r, r, replicas_entry_bytes(r)); ++ bch2_replicas_entry_sort(&n->r); + d->used += b; + } + +@@ -611,6 +612,18 @@ unwind: + return -1; + } + ++int bch2_replicas_delta_list_mark(struct bch_fs *c, ++ struct replicas_delta_list *r) ++{ ++ struct replicas_delta *d = r->d; ++ struct replicas_delta *top = (void *) r->d + r->used; ++ int ret = 0; ++ ++ for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) ++ ret = bch2_mark_replicas(c, &d->r); ++ return ret; ++} ++ + #define do_mark_fn(fn, c, pos, flags, ...) \ + ({ \ + int gc, ret = 0; \ +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index af8cb74d71e0..1b83a768ba06 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -253,6 +253,8 @@ int bch2_mark_update(struct btree_trans *, struct btree_iter *, + int bch2_replicas_delta_list_apply(struct bch_fs *, + struct bch_fs_usage *, + struct replicas_delta_list *); ++int bch2_replicas_delta_list_mark(struct bch_fs *, ++ struct replicas_delta_list *); + int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, + unsigned, s64, unsigned); + int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, +-- +cgit v1.2.3 + + +From d3e190e211501a4ae92bcf61b818c9a1c41b85ce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 3 Apr 2021 20:29:05 -0400 +Subject: bcachefs: Kill bch2_fs_usage_scratch_get() + +This is an important cleanup, eliminating an unnecessary copy in the +transaction commit path. 
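The cleanup described above, and implemented in the diff that follows, replaces the allocate-accumulate-copy pattern with a single pass that applies the packed delta list straight to the destination counters. This self-contained sketch walks a packed list of variable-length records the same way (8-byte aligned records, advancing by each record's own size); the record layout is a simplified stand-in, not struct replicas_delta.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct delta_hdr {
        int64_t delta;                    /* sectors added or removed */
        uint8_t nr_devs;                  /* length of the variable tail */
};

/* total bytes for one record, rounded up to 8-byte alignment */
static size_t delta_bytes(const struct delta_hdr *d)
{
        return (sizeof(*d) + d->nr_devs + 7) & ~(size_t) 7;
}

int main(void)
{
        unsigned char list[128] = { 0 };
        size_t used = 0;
        int64_t usage = 0;                /* stands in for the fs usage counters */

        /* build a packed list of two records of different sizes */
        for (int i = 0; i < 2; i++) {
                struct delta_hdr d = { .delta = 10 * (i + 1),
                                       .nr_devs = (uint8_t) (i + 1) };

                memcpy(list + used, &d, sizeof(d));
                used += delta_bytes(&d);
        }

        /* apply the whole list directly, no intermediate scratch copy */
        for (size_t off = 0; off < used;) {
                struct delta_hdr d;

                memcpy(&d, list + off, sizeof(d));
                usage += d.delta;
                off += delta_bytes(&d);
        }

        printf("usage = %lld sectors\n", (long long) usage);
        return 0;
}

In the patch itself, replicas_delta_next() does the equivalent pointer arithmetic over struct replicas_delta entries.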
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 1 + + fs/bcachefs/btree_update_interior.c | 4 - + fs/bcachefs/btree_update_leaf.c | 12 +- + fs/bcachefs/buckets.c | 211 ++++++++++++------------------------ + fs/bcachefs/buckets.h | 10 +- + fs/bcachefs/buckets_types.h | 16 --- + fs/bcachefs/replicas.c | 37 +++++++ + fs/bcachefs/replicas.h | 25 +++++ + 8 files changed, 138 insertions(+), 178 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index d6f5c383e7ba..6d1f7ece2ace 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -12,6 +12,7 @@ + #include "error.h" + #include "extents.h" + #include "journal.h" ++#include "replicas.h" + + #include + #include +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index f61f41436580..00144707988f 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -437,10 +437,6 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, + goto err_free; + } + +- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); +- if (ret) +- goto err_free; +- + as->prealloc_nodes[as->nr_prealloc_nodes++] = b; + } + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 834409d8c785..8517c1fe4bd4 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -376,7 +376,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) + { + struct bch_fs *c = trans->c; +- struct bch_fs_usage_online *fs_usage = NULL; + struct btree_insert_entry *i; + struct btree_trans_commit_hook *h; + unsigned u64s = 0; +@@ -424,13 +423,11 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + if (marking) { + percpu_down_read(&c->mark_lock); +- fs_usage = bch2_fs_usage_scratch_get(c); + } + + /* Must be called under mark_lock: */ + if (marking && trans->fs_usage_deltas && +- bch2_replicas_delta_list_apply(c, &fs_usage->u, +- trans->fs_usage_deltas)) { ++ !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto err; + } +@@ -474,10 +471,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) + bch2_mark_update(trans, i->iter, i->k, +- &fs_usage->u, i->trigger_flags); ++ NULL, i->trigger_flags); + +- if (marking) +- bch2_trans_fs_usage_apply(trans, fs_usage); ++ if (marking && trans->fs_usage_deltas) ++ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); + + if (unlikely(c->gc_pos.phase)) + bch2_trans_mark_gc(trans); +@@ -486,7 +483,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + do_btree_insert_one(trans, i->iter, i->k); + err: + if (marking) { +- bch2_fs_usage_scratch_put(c, fs_usage); + percpu_up_read(&c->mark_lock); + } + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 85a1d28599e0..42920f9a2b7b 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -167,37 +167,6 @@ void bch2_fs_usage_initialize(struct bch_fs *c) + percpu_up_write(&c->mark_lock); + } + +-void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) +-{ +- if (fs_usage == c->usage_scratch) +- mutex_unlock(&c->usage_scratch_lock); +- else +- kfree(fs_usage); +-} +- +-struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *c) +-{ +- struct bch_fs_usage_online *ret; +- unsigned bytes = sizeof(struct 
bch_fs_usage_online) + sizeof(u64) * +- READ_ONCE(c->replicas.nr); +- ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); +- if (ret) +- return ret; +- +- if (mutex_trylock(&c->usage_scratch_lock)) +- goto out_pool; +- +- ret = kzalloc(bytes, GFP_NOFS); +- if (ret) +- return ret; +- +- mutex_lock(&c->usage_scratch_lock); +-out_pool: +- ret = c->usage_scratch; +- memset(ret, 0, bytes); +- return ret; +-} +- + static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, + unsigned journal_seq, + bool gc) +@@ -455,6 +424,8 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + percpu_rwsem_assert_held(&c->mark_lock); + + preempt_disable(); ++ if (!fs_usage) ++ fs_usage = fs_usage_ptr(c, journal_seq, gc); + u = dev_usage_ptr(ca, journal_seq, gc); + + if (bucket_type(old)) +@@ -483,22 +454,17 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + bch2_wake_allocator(ca); + } + +-static inline int update_replicas(struct bch_fs *c, +- struct bch_fs_usage *fs_usage, +- struct bch_replicas_entry *r, +- s64 sectors) ++static inline void update_replicas(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct bch_replicas_entry *r, ++ s64 sectors) + { + int idx = bch2_replicas_entry_idx(c, r); + +- if (idx < 0) +- return -1; +- +- if (!fs_usage) +- return 0; ++ BUG_ON(idx < 0); + + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; +- return 0; + } + + static inline void update_cached_sectors(struct bch_fs *c, +@@ -575,55 +541,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, + update_replicas_list(trans, &r.e, sectors); + } + +-static inline struct replicas_delta * +-replicas_delta_next(struct replicas_delta *d) +-{ +- return (void *) d + replicas_entry_bytes(&d->r) + 8; +-} +- +-int bch2_replicas_delta_list_apply(struct bch_fs *c, +- struct bch_fs_usage *fs_usage, +- struct replicas_delta_list *r) +-{ +- struct replicas_delta *d = r->d; +- struct replicas_delta *top = (void *) r->d + r->used; +- unsigned i; +- +- for (d = r->d; d != top; d = replicas_delta_next(d)) +- if (update_replicas(c, fs_usage, &d->r, d->delta)) { +- top = d; +- goto unwind; +- } +- +- if (!fs_usage) +- return 0; +- +- fs_usage->nr_inodes += r->nr_inodes; +- +- for (i = 0; i < BCH_REPLICAS_MAX; i++) { +- fs_usage->reserved += r->persistent_reserved[i]; +- fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; +- } +- +- return 0; +-unwind: +- for (d = r->d; d != top; d = replicas_delta_next(d)) +- update_replicas(c, fs_usage, &d->r, -d->delta); +- return -1; +-} +- +-int bch2_replicas_delta_list_mark(struct bch_fs *c, +- struct replicas_delta_list *r) +-{ +- struct replicas_delta *d = r->d; +- struct replicas_delta *top = (void *) r->d + r->used; +- int ret = 0; +- +- for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) +- ret = bch2_mark_replicas(c, &d->r); +- return ret; +-} +- + #define do_mark_fn(fn, c, pos, flags, ...) 
\ + ({ \ + int gc, ret = 0; \ +@@ -1396,62 +1313,15 @@ int bch2_mark_update(struct btree_trans *trans, + return ret; + } + +-static int bch2_fs_usage_apply(struct bch_fs *c, +- struct bch_fs_usage_online *src, +- struct disk_reservation *disk_res, +- unsigned journal_seq) +-{ +- struct bch_fs_usage *dst; +- s64 added = src->u.data + src->u.reserved; +- s64 should_not_have_added; +- int ret = 0; +- +- percpu_rwsem_assert_held(&c->mark_lock); +- +- /* +- * Not allowed to reduce sectors_available except by getting a +- * reservation: +- */ +- should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); +- if (WARN_ONCE(should_not_have_added > 0, +- "disk usage increased by %lli more than reservation of %llu", +- added, disk_res ? disk_res->sectors : 0)) { +- atomic64_sub(should_not_have_added, &c->sectors_available); +- added -= should_not_have_added; +- ret = -1; +- } +- +- if (added > 0) { +- disk_res->sectors -= added; +- src->online_reserved -= added; +- } +- +- this_cpu_add(*c->online_reserved, src->online_reserved); +- +- preempt_disable(); +- dst = fs_usage_ptr(c, journal_seq, false); +- acc_u64s((u64 *) dst, (u64 *) &src->u, fs_usage_u64s(c)); +- preempt_enable(); +- +- return ret; +-} +- +-void bch2_trans_fs_usage_apply(struct btree_trans *trans, +- struct bch_fs_usage_online *fs_usage) ++static noinline __cold ++void fs_usage_apply_warn(struct btree_trans *trans, ++ unsigned disk_res_sectors) + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; +- static int warned_disk_usage = 0; +- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; + char buf[200]; + +- if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, +- trans->journal_res.seq) || +- warned_disk_usage || +- xchg(&warned_disk_usage, 1)) +- return; +- +- bch_err(c, "disk usage increased more than %llu sectors reserved", ++ bch_err(c, "disk usage increased more than %u sectors reserved", + disk_res_sectors); + + trans_for_each_update(trans, i) { +@@ -1486,6 +1356,65 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + } + } + ++void bch2_trans_fs_usage_apply(struct btree_trans *trans, ++ struct replicas_delta_list *deltas) ++{ ++ struct bch_fs *c = trans->c; ++ static int warned_disk_usage = 0; ++ bool warn = false; ++ unsigned disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; ++ struct replicas_delta *d = deltas->d; ++ struct replicas_delta *top = (void *) deltas->d + deltas->used; ++ struct bch_fs_usage *dst; ++ s64 added = 0, should_not_have_added; ++ unsigned i; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ preempt_disable(); ++ dst = fs_usage_ptr(c, trans->journal_res.seq, false); ++ ++ for (d = deltas->d; d != top; d = replicas_delta_next(d)) { ++ switch (d->r.data_type) { ++ case BCH_DATA_btree: ++ case BCH_DATA_user: ++ case BCH_DATA_parity: ++ added += d->delta; ++ } ++ ++ update_replicas(c, dst, &d->r, d->delta); ++ } ++ ++ dst->nr_inodes += deltas->nr_inodes; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ added += deltas->persistent_reserved[i]; ++ dst->reserved += deltas->persistent_reserved[i]; ++ dst->persistent_reserved[i] += deltas->persistent_reserved[i]; ++ } ++ ++ /* ++ * Not allowed to reduce sectors_available except by getting a ++ * reservation: ++ */ ++ should_not_have_added = added - (s64) disk_res_sectors; ++ if (unlikely(should_not_have_added > 0)) { ++ atomic64_sub(should_not_have_added, &c->sectors_available); ++ added -= should_not_have_added; ++ warn = true; ++ } ++ ++ if (added > 0) { ++ trans->disk_res->sectors -= added; ++ this_cpu_sub(*c->online_reserved, added); ++ } ++ ++ preempt_enable(); ++ ++ if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) ++ fs_usage_apply_warn(trans, disk_res_sectors); ++} ++ + /* trans_mark: */ + + static struct btree_iter *trans_get_update(struct btree_trans *trans, +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 1b83a768ba06..cd81e6aba1b0 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -216,9 +216,6 @@ static inline unsigned dev_usage_u64s(void) + return sizeof(struct bch_dev_usage) / sizeof(u64); + } + +-void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage_online *); +-struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *); +- + u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); + + struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); +@@ -250,16 +247,11 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, + int bch2_mark_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bch_fs_usage *, unsigned); + +-int bch2_replicas_delta_list_apply(struct bch_fs *, +- struct bch_fs_usage *, +- struct replicas_delta_list *); +-int bch2_replicas_delta_list_mark(struct bch_fs *, +- struct replicas_delta_list *); + int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, + unsigned, s64, unsigned); + int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, + struct bkey_i *insert, unsigned); +-void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage_online *); ++void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + + int bch2_trans_mark_metadata_bucket(struct btree_trans *, + struct disk_reservation *, struct bch_dev *, +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index b6ea67506cc2..588b1a72adae 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -96,22 +96,6 @@ struct bch_fs_usage_short { + u64 nr_inodes; + }; + +-struct replicas_delta { +- s64 delta; +- struct bch_replicas_entry r; +-} __packed; +- +-struct replicas_delta_list { +- unsigned size; +- unsigned used; +- +- struct {} memset_start; +- u64 nr_inodes; +- u64 persistent_reserved[BCH_REPLICAS_MAX]; +- struct {} memset_end; +- struct replicas_delta 
d[0]; +-}; +- + /* + * A reservation for space on disk: + */ +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 068fbca1dd54..e47c1073d5ab 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -464,6 +464,36 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + ++/* replicas delta list: */ ++ ++bool bch2_replicas_delta_list_marked(struct bch_fs *c, ++ struct replicas_delta_list *r) ++{ ++ struct replicas_delta *d = r->d; ++ struct replicas_delta *top = (void *) r->d + r->used; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ for (d = r->d; d != top; d = replicas_delta_next(d)) ++ if (bch2_replicas_entry_idx(c, &d->r) < 0) ++ return false; ++ return true; ++} ++ ++int bch2_replicas_delta_list_mark(struct bch_fs *c, ++ struct replicas_delta_list *r) ++{ ++ struct replicas_delta *d = r->d; ++ struct replicas_delta *top = (void *) r->d + r->used; ++ int ret = 0; ++ ++ for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) ++ ret = bch2_mark_replicas(c, &d->r); ++ return ret; ++} ++ ++/* bkey replicas: */ ++ + bool bch2_bkey_replicas_marked(struct bch_fs *c, + struct bkey_s_c k) + { +@@ -475,6 +505,11 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) + return __bch2_mark_bkey_replicas(c, k, false); + } + ++/* ++ * Old replicas_gc mechanism: only used for journal replicas entries now, should ++ * die at some point: ++ */ ++ + int bch2_replicas_gc_end(struct bch_fs *c, int ret) + { + unsigned i; +@@ -568,6 +603,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) + return 0; + } + ++/* New much simpler mechanism for clearing out unneeded replicas entries: */ ++ + int bch2_replicas_gc2(struct bch_fs *c) + { + struct bch_replicas_cpu new = { 0 }; +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +index 8cb1f592f1b6..72ac544f16d8 100644 +--- a/fs/bcachefs/replicas.h ++++ b/fs/bcachefs/replicas.h +@@ -26,6 +26,31 @@ bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); + int bch2_mark_replicas(struct bch_fs *, + struct bch_replicas_entry *); + ++struct replicas_delta { ++ s64 delta; ++ struct bch_replicas_entry r; ++} __packed; ++ ++struct replicas_delta_list { ++ unsigned size; ++ unsigned used; ++ ++ struct {} memset_start; ++ u64 nr_inodes; ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ struct {} memset_end; ++ struct replicas_delta d[0]; ++}; ++ ++static inline struct replicas_delta * ++replicas_delta_next(struct replicas_delta *d) ++{ ++ return (void *) d + replicas_entry_bytes(&d->r) + 8; ++} ++ ++bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *); ++int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); ++ + void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); + bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); + int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); +-- +cgit v1.2.3 + + +From 6d20749952ed11d194d5784ce797b3b6167f8490 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 3 Apr 2021 21:09:13 -0400 +Subject: bcachefs: Drop some memset() calls + +gcc is emitting rep stos here, which is silly (and slow) for an 8 byte +memset. 
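The change that follows replaces two small memset()s with explicit field stores. For a small header it can be cheaper to assign every field (including the padding bytes that still need zeroing) than to clear the whole struct and then overwrite most of it, since the compiler no longer has to emit a rep stos style bulk clear. A standalone illustration with a simplified stand-in for the header, not the actual jset_entry layout:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct hdr {
        uint16_t u64s;
        uint8_t  btree_id, level, type;
        uint8_t  pad[3];
};

static void init_memset(struct hdr *h, uint16_t u64s, uint8_t id,
                        uint8_t level, uint8_t type)
{
        memset(h, 0, sizeof(*h));         /* bulk clear, then overwrite */
        h->u64s     = u64s;
        h->btree_id = id;
        h->level    = level;
        h->type     = type;
}

static void init_fields(struct hdr *h, uint16_t u64s, uint8_t id,
                        uint8_t level, uint8_t type)
{
        h->u64s     = u64s;
        h->btree_id = id;
        h->level    = level;
        h->type     = type;
        h->pad[0] = h->pad[1] = h->pad[2] = 0;   /* every byte still initialized */
}

int main(void)
{
        struct hdr a, b;

        init_memset(&a, 8, 1, 0, 2);
        init_fields(&b, 8, 1, 0, 2);
        printf("%d\n", memcmp(&a, &b, sizeof(a)));   /* 0: identical bytes */
        return 0;
}

Both initializers leave the header byte-for-byte identical; only the code the compiler emits differs.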
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.h | 7 +++++-- + fs/bcachefs/journal.h | 6 ++++-- + 2 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 455f2fe4929c..07d9b6d36e51 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -174,8 +174,11 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + +- bkey_init(&iter->k); +- iter->k.p = iter->pos = new_pos; ++ iter->k.type = KEY_TYPE_deleted; ++ iter->k.p.inode = iter->pos.inode = new_pos.inode; ++ iter->k.p.offset = iter->pos.offset = new_pos.offset; ++ iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; ++ iter->k.size = 0; + } + + /* Sort order for locking btree iterators: */ +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 221f5bb01e95..a0d19fad3bdd 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -213,11 +213,13 @@ static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type + enum btree_id id, unsigned level, + const void *data, unsigned u64s) + { +- memset(entry, 0, sizeof(*entry)); + entry->u64s = cpu_to_le16(u64s); +- entry->type = type; + entry->btree_id = id; + entry->level = level; ++ entry->type = type; ++ entry->pad[0] = 0; ++ entry->pad[1] = 0; ++ entry->pad[2] = 0; + memcpy_u64s_small(entry->_data, data, u64s); + + return jset_u64s(u64s); +-- +cgit v1.2.3 + + +From 7ccc55d0b2e493a9907a176fd5f65f5812db645d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 3 Apr 2021 21:31:02 -0400 +Subject: bcachefs: Eliminate memory barrier from fast path of + journal_preres_put() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 2 +- + fs/bcachefs/journal.h | 39 ++++++++++++++++++++++----------------- + fs/bcachefs/journal_types.h | 5 +++-- + 3 files changed, 26 insertions(+), 20 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 425f8e1719ca..da3c598fab69 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -521,7 +521,7 @@ static bool journal_preres_available(struct journal *j, + unsigned new_u64s, + unsigned flags) + { +- bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); ++ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); + + if (!ret && mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index a0d19fad3bdd..cc497125889f 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -411,7 +411,12 @@ static inline void bch2_journal_preres_put(struct journal *j, + + s.v = atomic64_sub_return(s.v, &j->prereserved.counter); + res->u64s = 0; +- closure_wake_up(&j->preres_wait); ++ ++ if (unlikely(s.waiting)) { ++ clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), ++ (unsigned long *) &j->prereserved.v); ++ closure_wake_up(&j->preres_wait); ++ } + + if (s.reserved <= s.remaining && + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { +@@ -427,32 +432,32 @@ int __bch2_journal_preres_get(struct journal *, + static inline int bch2_journal_preres_get_fast(struct journal *j, + struct journal_preres *res, + unsigned new_u64s, +- unsigned flags) ++ unsigned flags, ++ bool set_waiting) + { + int d = new_u64s - res->u64s; + union journal_preres_state old, new; + u64 v = atomic64_read(&j->prereserved.counter); ++ int ret; + + do { + old.v = new.v = v; +- +- 
new.reserved += d; +- +- /* +- * If we're being called from the journal reclaim path, we have +- * to unconditionally give out the pre-reservation, there's +- * nothing else sensible we can do - otherwise we'd recurse back +- * into the reclaim path and deadlock: +- */ +- +- if (!(flags & JOURNAL_RES_GET_RESERVED) && +- new.reserved > new.remaining) ++ ret = 0; ++ ++ if ((flags & JOURNAL_RES_GET_RESERVED) || ++ new.reserved + d < new.remaining) { ++ new.reserved += d; ++ ret = 1; ++ } else if (set_waiting && !new.waiting) ++ new.waiting = true; ++ else + return 0; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, + old.v, new.v)) != old.v); + +- res->u64s += d; +- return 1; ++ if (ret) ++ res->u64s += d; ++ return ret; + } + + static inline int bch2_journal_preres_get(struct journal *j, +@@ -463,7 +468,7 @@ static inline int bch2_journal_preres_get(struct journal *j, + if (new_u64s <= res->u64s) + return 0; + +- if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) ++ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) + return 0; + + if (flags & JOURNAL_RES_GET_NONBLOCK) +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 98f1a6e222de..aac15bc1b4d6 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -105,8 +105,9 @@ union journal_preres_state { + }; + + struct { +- u32 reserved; +- u32 remaining; ++ u64 waiting:1, ++ reserved:31, ++ remaining:32; + }; + }; + +-- +cgit v1.2.3 + + +From cd42f93b49da5dc5345d2efd4a9005431e9ba3d4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 3 Apr 2021 21:54:14 -0400 +Subject: bcachefs: kill bset_tree->max_key + +Since we now ensure a btree node's max key fits in its packed format, +this isn't needed for the reasons it used to be - and, it was being used +inconsistently. + +Also reorder struct btree a bit for performance, and kill some dead +code. 
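The struct btree reordering mentioned above is a cache-layout tweak rather than a behavioural change: which cache line a field lands on depends on everything declared before it. As a toy illustration only (the layouts below are invented and are not struct btree), offsetof() shows how moving a large member shifts where the frequently accessed fields end up:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct cold_first {
        char     big_key[200];            /* large, rarely touched member first */
        uint64_t hash_val;                /* hot fields pushed out of line 0 */
        uint16_t flags;
};

struct hot_first {
        uint64_t hash_val;                /* hot fields share the first cache line */
        uint16_t flags;
        char     big_key[200];            /* large member moved to the back */
};

int main(void)
{
        printf("cold_first: hash_val at %zu, flags at %zu\n",
               offsetof(struct cold_first, hash_val),
               offsetof(struct cold_first, flags));
        printf("hot_first:  hash_val at %zu, flags at %zu\n",
               offsetof(struct hot_first, hash_val),
               offsetof(struct hot_first, flags));
        return 0;
}

With 64-byte cache lines, the first layout puts the hot fields in the structure's fourth cache line; the second keeps them in the first.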
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 36 +++--------------------------------- + fs/bcachefs/btree_types.h | 12 +++++------- + 2 files changed, 8 insertions(+), 40 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index a8e2ebbe8ace..0a3e3b63828b 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -698,7 +698,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, + if (!bkey_pack_pos(max_key, b->data->max_key, b)) { + k = (void *) max_key; + bkey_init(&k->k); +- k->k.p = t->max_key; ++ k->k.p = b->data->max_key; + } + } + +@@ -782,8 +782,6 @@ retry: + while (k != btree_bkey_last(b, t)) + prev = k, k = bkey_next(k); + +- t->max_key = bkey_unpack_pos(b, prev); +- + if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { + bkey_init(&min_key.k); + min_key.k.p = b->data->min_key; +@@ -791,7 +789,7 @@ retry: + + if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { + bkey_init(&max_key.k); +- max_key.k.p = t->max_key; ++ max_key.k.p = b->data->max_key; + } + + /* Then we build the tree */ +@@ -970,8 +968,6 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, + min_key.u64s = max_key.u64s = 0; + + if (bkey_next(k) == btree_bkey_last(b, t)) { +- t->max_key = bkey_unpack_pos(b, k); +- + for (j = 1; j < t->size; j = j * 2 + 1) + make_bfloat(b, t, j, &min_key, &max_key); + } +@@ -1311,16 +1307,6 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, + case BSET_RW_AUX_TREE: + return bset_search_write_set(b, t, search); + case BSET_RO_AUX_TREE: +- /* +- * Each node in the auxiliary search tree covers a certain range +- * of bits, and keys above and below the set it covers might +- * differ outside those bits - so we have to special case the +- * start and end - handle that here: +- */ +- +- if (bpos_cmp(*search, t->max_key) > 0) +- return btree_bkey_last(b, t); +- + return bset_search_tree(b, t, search, lossy_packed_search); + default: + unreachable(); +@@ -1357,23 +1343,6 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, + return m; + } + +-/* +- * Returns the first key greater than or equal to @search +- */ +-static __always_inline __flatten +-struct bkey_packed *bch2_bset_search(struct btree *b, +- struct bset_tree *t, +- struct bpos *search, +- struct bkey_packed *packed_search, +- const struct bkey_packed *lossy_packed_search) +-{ +- struct bkey_packed *m = __bch2_bset_search(b, t, search, +- lossy_packed_search); +- +- return bch2_bset_search_linear(b, t, search, +- packed_search, lossy_packed_search, m); +-} +- + /* Btree node iterator */ + + static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, +@@ -1469,6 +1438,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, + unsigned i; + + EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); ++ EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0); + bset_aux_tree_verify(b); + + memset(iter, 0, sizeof(*iter)); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index abbc548666e9..10366f6c0619 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -47,8 +47,6 @@ struct bset_tree { + u16 data_offset; + u16 aux_data_offset; + u16 end_offset; +- +- struct bpos max_key; + }; + + struct btree_write { +@@ -98,6 +96,11 @@ struct btree { + u8 byte_order; + u8 unpack_fn_len; + ++ struct btree_write writes[2]; ++ ++ /* Key/pointer for this btree node */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ + /* + * XXX: add a delete sequence number, so when 
bch2_btree_node_relock() + * fails because the lock sequence number has changed - i.e. the +@@ -128,11 +131,6 @@ struct btree { + + /* lru list */ + struct list_head list; +- +- struct btree_write writes[2]; +- +- /* Key/pointer for this btree node */ +- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + }; + + struct btree_cache { +-- +cgit v1.2.3 + + +From 7cd8e5fc10a3bc0938511151f51f03087c554e03 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 4 Apr 2021 22:38:07 -0400 +Subject: bcachefs: Fix an uninitialized variable + +Fortunately it was just used in an error message + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 42920f9a2b7b..243f8610fcec 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -848,7 +848,7 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, + if (g->stripe && g->stripe != k.k->p.offset) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", +- ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EINVAL; + } +-- +cgit v1.2.3 + + +From 1e5a05be83a4bff53696223eb4a3d501e621e0d9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 5 Apr 2021 01:23:55 -0400 +Subject: bcachefs: Fix a startup race + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 14 +++++--------- + 1 file changed, 5 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 215b2e1963e1..833c8fd16a4f 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -690,20 +690,16 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) + { + int ret; + +- c->shrink.seeks = 1; +- c->shrink.count_objects = bch2_btree_key_cache_count; +- c->shrink.scan_objects = bch2_btree_key_cache_scan; +- +- ret = register_shrinker(&c->shrink); +- if (ret) +- return ret; +- + ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params); + if (ret) + return ret; + + c->table_init_done = true; +- return 0; ++ ++ c->shrink.seeks = 1; ++ c->shrink.count_objects = bch2_btree_key_cache_count; ++ c->shrink.scan_objects = bch2_btree_key_cache_scan; ++ return register_shrinker(&c->shrink); + } + + void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +-- +cgit v1.2.3 + + +From d61b29c60c58aceedf1cf17ddff5a5ac8a70d8ac Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 6 Apr 2021 13:43:31 -0400 +Subject: bcachefs: Increase BSET_CACHELINE to 256 bytes + +Linear searches have gotten cheaper relative to binary searches on +modern hardware, due to better branch prediction behaviour. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 506da4e0c911..e42f866cf2ec 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -188,7 +188,7 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree + * gets to the second cacheline. 
+ */ + +-#define BSET_CACHELINE 128 ++#define BSET_CACHELINE 256 + + static inline size_t btree_keys_cachelines(const struct btree *b) + { +-- +cgit v1.2.3 + + +From fae63ddbeb8af444da78beee21df163362433559 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 6 Apr 2021 14:00:56 -0400 +Subject: bcachefs: Eliminate more PAGE_SIZE uses + +In userspace, we don't really have a well defined PAGE_SIZE and shouln't +be relying on it. This is some more incremental work to remove +references to it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/debug.c | 4 ++-- + fs/bcachefs/super-io.c | 32 ++++++++++++++++---------------- + fs/bcachefs/super.c | 3 +-- + fs/bcachefs/super_types.h | 2 +- + fs/bcachefs/util.c | 2 +- + 5 files changed, 21 insertions(+), 22 deletions(-) + +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index acf600387c9f..90364b55aa40 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -150,7 +150,7 @@ struct dump_iter { + struct bch_fs *c; + enum btree_id id; + +- char buf[PAGE_SIZE]; ++ char buf[1 << 12]; + size_t bytes; /* what's currently in buf */ + + char __user *ubuf; /* destination user buffer */ +@@ -230,7 +230,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, + while (k.k && !(err = bkey_err(k))) { + bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); + i->bytes = strlen(i->buf); +- BUG_ON(i->bytes >= PAGE_SIZE); ++ BUG_ON(i->bytes >= sizeof(i->buf)); + i->buf[i->bytes] = '\n'; + i->bytes++; + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index f8f57caa417a..de8d49e3ef02 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -50,8 +50,7 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, + unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; + unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; + +- BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > +- sb->page_order); ++ BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); + + if (!f && !u64s) { + /* nothing to do: */ +@@ -101,18 +100,23 @@ void bch2_free_super(struct bch_sb_handle *sb) + if (!IS_ERR_OR_NULL(sb->bdev)) + blkdev_put(sb->bdev, sb->mode); + +- free_pages((unsigned long) sb->sb, sb->page_order); ++ kfree(sb->sb); + memset(sb, 0, sizeof(*sb)); + } + + int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) + { + size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); +- unsigned order = get_order(new_bytes); ++ size_t new_buffer_size; + struct bch_sb *new_sb; + struct bio *bio; + +- if (sb->sb && sb->page_order >= order) ++ if (sb->bdev) ++ new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); ++ ++ new_buffer_size = roundup_pow_of_two(new_bytes); ++ ++ if (sb->sb && sb->buffer_size >= new_buffer_size) + return 0; + + if (sb->have_layout) { +@@ -127,14 +131,15 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) + } + } + +- if (sb->page_order >= order && sb->sb) ++ if (sb->buffer_size >= new_buffer_size && sb->sb) + return 0; + + if (dynamic_fault("bcachefs:add:super_realloc")) + return -ENOMEM; + + if (sb->have_bio) { +- bio = bio_kmalloc(GFP_KERNEL, 1 << order); ++ bio = bio_kmalloc(GFP_KERNEL, ++ DIV_ROUND_UP(new_buffer_size, PAGE_SIZE)); + if (!bio) + return -ENOMEM; + +@@ -143,17 +148,12 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) + sb->bio = bio; + } + +- new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); ++ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); + if (!new_sb) + return 
-ENOMEM; + +- if (sb->sb) +- memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); +- +- free_pages((unsigned long) sb->sb, sb->page_order); + sb->sb = new_sb; +- +- sb->page_order = order; ++ sb->buffer_size = new_buffer_size; + + return 0; + } +@@ -475,7 +475,7 @@ reread: + bio_set_dev(sb->bio, sb->bdev); + sb->bio->bi_iter.bi_sector = offset; + bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); +- bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order); ++ bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); + + if (submit_bio_wait(sb->bio)) + return "IO error"; +@@ -492,7 +492,7 @@ reread: + if (bytes > 512 << sb->sb->layout.sb_max_size_bits) + return "Bad superblock: too big"; + +- if (get_order(bytes) > sb->page_order) { ++ if (bytes > sb->buffer_size) { + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) + return "cannot allocate memory"; + goto reread; +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 70a4d0dcc395..7ce867e5ff0c 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -502,8 +502,7 @@ static void __bch2_fs_free(struct bch_fs *c) + if (c->wq) + destroy_workqueue(c->wq); + +- free_pages((unsigned long) c->disk_sb.sb, +- c->disk_sb.page_order); ++ bch2_free_super(&c->disk_sb); + kvpfree(c, sizeof(*c)); + module_put(THIS_MODULE); + } +diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h +index 069973a38f12..96023f37afea 100644 +--- a/fs/bcachefs/super_types.h ++++ b/fs/bcachefs/super_types.h +@@ -6,7 +6,7 @@ struct bch_sb_handle { + struct bch_sb *sb; + struct block_device *bdev; + struct bio *bio; +- unsigned page_order; ++ size_t buffer_size; + fmode_t mode; + unsigned have_layout:1; + unsigned have_bio:1; +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index 2709163e02b5..e3ad26e244ab 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -154,7 +154,7 @@ void bch2_flags_to_text(struct printbuf *out, + u64 bch2_read_flag_list(char *opt, const char * const list[]) + { + u64 ret = 0; +- char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); ++ char *p, *s, *d = kstrdup(opt, GFP_KERNEL); + + if (!d) + return -ENOMEM; +-- +cgit v1.2.3 + + +From dbd7dc5c14836084f8d9d2c44122c38630c26399 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 31 Mar 2021 21:44:55 -0400 +Subject: bcachefs: Don't flush btree writes more aggressively because of btree + key cache + +We need to flush the btree key cache when it's too dirty, because +otherwise the shrinker won't be able to reclaim memory - this is done by +journal reclaim. But journal reclaim also kicks btree node writes: this +meant that btree node writes were getting kicked much too often just +because we needed to flush btree key cache keys. + +This patch splits journal pins into two different lists, and teaches +journal reclaim to not flush btree node writes when it only needs to +flush key cache keys. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 ++- + fs/bcachefs/btree_key_cache.c | 7 ++-- + fs/bcachefs/btree_key_cache.h | 12 ++----- + fs/bcachefs/btree_update_interior.c | 11 ++++-- + fs/bcachefs/journal.c | 30 ++++++++-------- + fs/bcachefs/journal_reclaim.c | 68 ++++++++++++++++++++++++------------- + fs/bcachefs/journal_types.h | 1 + + include/trace/events/bcachefs.h | 5 +++ + 8 files changed, 81 insertions(+), 57 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 6d1f7ece2ace..1944a9a99861 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -473,8 +473,10 @@ bool bch2_trans_relock(struct btree_trans *trans) + + trans_for_each_iter(trans, iter) + if (btree_iter_keep(trans, iter) && +- !bch2_btree_iter_relock(iter, true)) ++ !bch2_btree_iter_relock(iter, true)) { ++ trace_trans_restart_relock(trans->ip); + return false; ++ } + return true; + } + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 833c8fd16a4f..53191c99e590 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -444,9 +444,8 @@ out: + return ret; + } + +-static int btree_key_cache_journal_flush(struct journal *j, +- struct journal_entry_pin *pin, +- u64 seq) ++int bch2_btree_key_cache_journal_flush(struct journal *j, ++ struct journal_entry_pin *pin, u64 seq) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bkey_cached *ck = +@@ -527,7 +526,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, + } + + bch2_journal_pin_update(&c->journal, trans->journal_res.seq, +- &ck->journal, btree_key_cache_journal_flush); ++ &ck->journal, bch2_btree_key_cache_journal_flush); + + if (kick_reclaim) + journal_reclaim_kick(&c->journal); +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index 4e1e5a9c7656..7e2b0a08f745 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -1,15 +1,6 @@ + #ifndef _BCACHEFS_BTREE_KEY_CACHE_H + #define _BCACHEFS_BTREE_KEY_CACHE_H + +-static inline size_t bch2_nr_btree_keys_want_flush(struct bch_fs *c) +-{ +- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); +- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); +- size_t max_dirty = nr_keys / 4; +- +- return max_t(ssize_t, 0, nr_dirty - max_dirty); +-} +- + static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) + { + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); +@@ -29,6 +20,9 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) + test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); + } + ++int bch2_btree_key_cache_journal_flush(struct journal *, ++ struct journal_entry_pin *, u64); ++ + struct bkey_cached * + bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 00144707988f..07c925345675 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -974,20 +974,25 @@ retry: + * closure argument + */ + if (flags & BTREE_INSERT_NOUNLOCK) { ++ trace_trans_restart_journal_preres_get(trans->ip); + ret = -EINTR; + goto err; + } + + bch2_trans_unlock(trans); + +- if (flags & BTREE_INSERT_JOURNAL_RECLAIM) +- goto err; ++ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { ++ bch2_btree_update_free(as); ++ return ERR_PTR(ret); ++ } + + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + 
BTREE_UPDATE_JOURNAL_RES, + journal_flags); +- if (ret) ++ if (ret) { ++ trace_trans_restart_journal_preres_get(trans->ip); + goto err; ++ } + + if (!bch2_trans_relock(trans)) { + ret = -EINTR; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index da3c598fab69..14fa3be5626a 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -60,21 +60,23 @@ journal_seq_to_buf(struct journal *j, u64 seq) + return buf; + } + +-static void journal_pin_new_entry(struct journal *j, int count) ++static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) + { +- struct journal_entry_pin_list *p; ++ INIT_LIST_HEAD(&p->list); ++ INIT_LIST_HEAD(&p->key_cache_list); ++ INIT_LIST_HEAD(&p->flushed); ++ atomic_set(&p->count, count); ++ p->devs.nr = 0; ++} + ++static void journal_pin_new_entry(struct journal *j) ++{ + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for journal_last_seq() to be calculated correctly + */ + atomic64_inc(&j->seq); +- p = fifo_push_ref(&j->pin); +- +- INIT_LIST_HEAD(&p->list); +- INIT_LIST_HEAD(&p->flushed); +- atomic_set(&p->count, count); +- p->devs.nr = 0; ++ journal_pin_list_init(fifo_push_ref(&j->pin), 1); + } + + static void bch2_journal_buf_init(struct journal *j) +@@ -193,7 +195,7 @@ static bool __journal_entry_close(struct journal *j) + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + + /* Initialize new buffer: */ +- journal_pin_new_entry(j, 1); ++ journal_pin_new_entry(j); + + bch2_journal_buf_init(j); + +@@ -1031,12 +1033,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + +- fifo_for_each_entry_ptr(p, &j->pin, seq) { +- INIT_LIST_HEAD(&p->list); +- INIT_LIST_HEAD(&p->flushed); +- atomic_set(&p->count, 1); +- p->devs.nr = 0; +- } ++ fifo_for_each_entry_ptr(p, &j->pin, seq) ++ journal_pin_list_init(p, 1); + + list_for_each_entry(i, journal_entries, list) { + unsigned ptr; +@@ -1059,7 +1057,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; + +- journal_pin_new_entry(j, 1); ++ journal_pin_new_entry(j); + + j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index d7ff66b2ccb6..7be6c65c1abe 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -407,7 +407,12 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, + pin->seq = seq; + pin->flush = flush_fn; + +- list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); ++ if (flush_fn == bch2_btree_key_cache_journal_flush) ++ list_add(&pin->list, &pin_list->key_cache_list); ++ else if (flush_fn) ++ list_add(&pin->list, &pin_list->list); ++ else ++ list_add(&pin->list, &pin_list->flushed); + spin_unlock(&j->lock); + + /* +@@ -437,23 +442,40 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) + */ + + static struct journal_entry_pin * +-journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) ++journal_get_next_pin(struct journal *j, ++ bool get_any, ++ bool get_key_cache, ++ u64 max_seq, u64 *seq) + { + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret = NULL; + +- fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) +- if (*seq > max_seq || +- (ret = list_first_entry_or_null(&pin_list->list, +- struct journal_entry_pin, list))) ++ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { ++ if (*seq > max_seq && !get_any && !get_key_cache) + break; + +- return ret; ++ if (*seq <= max_seq || get_any) { ++ ret = list_first_entry_or_null(&pin_list->list, ++ struct journal_entry_pin, list); ++ if (ret) ++ return ret; ++ } ++ ++ if (*seq <= max_seq || get_any || get_key_cache) { ++ ret = list_first_entry_or_null(&pin_list->key_cache_list, ++ struct journal_entry_pin, list); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ return NULL; + } + + /* returns true if we did work */ + static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, +- unsigned min_nr) ++ unsigned min_any, ++ unsigned min_key_cache) + { + struct journal_entry_pin *pin; + size_t nr_flushed = 0; +@@ -472,8 +494,10 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, + j->last_flushed = jiffies; + + spin_lock(&j->lock); +- pin = journal_get_next_pin(j, min_nr +- ? 
U64_MAX : seq_to_flush, &seq); ++ pin = journal_get_next_pin(j, ++ min_any != 0, ++ min_key_cache != 0, ++ seq_to_flush, &seq); + if (pin) { + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; +@@ -485,8 +509,11 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, + if (!pin) + break; + +- if (min_nr) +- min_nr--; ++ if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) ++ min_key_cache--; ++ ++ if (min_any) ++ min_any--; + + err = flush_fn(j, pin, seq); + +@@ -610,18 +637,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + if (j->prereserved.reserved * 2 > j->prereserved.remaining) + min_nr = 1; + +- if (atomic_read(&c->btree_cache.dirty) * 4 > +- c->btree_cache.used * 3) +- min_nr = 1; +- + if (fifo_free(&j->pin) <= 32) + min_nr = 1; + +- min_nr = max(min_nr, bch2_nr_btree_keys_want_flush(c)); +- +- /* Don't do too many without delivering wakeup: */ +- min_nr = min(min_nr, 128UL); +- + trace_journal_reclaim_start(c, + min_nr, + j->prereserved.reserved, +@@ -631,7 +649,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + atomic_long_read(&c->btree_key_cache.nr_dirty), + atomic_long_read(&c->btree_key_cache.nr_keys)); + +- nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); ++ nr_flushed = journal_flush_pins(j, seq_to_flush, ++ min_nr, ++ min(bch2_nr_btree_keys_need_flush(c), 128UL)); + + if (direct) + j->nr_direct_reclaim += nr_flushed; +@@ -641,7 +661,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + + if (nr_flushed) + wake_up(&j->reclaim_wait); +- } while (min_nr && nr_flushed); ++ } while (min_nr && nr_flushed && !direct); + + memalloc_noreclaim_restore(flags); + +@@ -734,7 +754,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, + + mutex_lock(&j->reclaim_lock); + +- *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0; ++ *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0; + + spin_lock(&j->lock); + /* +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index aac15bc1b4d6..c24bc4aa9af2 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -43,6 +43,7 @@ struct journal_buf { + + struct journal_entry_pin_list { + struct list_head list; ++ struct list_head key_cache_list; + struct list_head flushed; + atomic_t count; + struct bch_devs_list devs; +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index cb22db36fc03..0a730e258d95 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -716,6 +716,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, + TP_ARGS(ip) + ); + ++DEFINE_EVENT(transaction_restart, trans_restart_relock, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ + DEFINE_EVENT(transaction_restart, trans_restart_traverse, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +-- +cgit v1.2.3 + + +From 131bab8a6a54012af8cb5ef79412ea5217cdb034 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 6 Apr 2021 15:33:19 -0400 +Subject: bcachefs: Improve bset compaction + +The previous patch that fixed btree nodes being written too aggressively +now meant that we weren't sorting btree node bsets optimally - this +patch fixes that. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 2 +- + fs/bcachefs/btree_io.c | 51 ++++++++++++++++++++++++------------- + fs/bcachefs/btree_io.h | 3 +-- + fs/bcachefs/btree_update_interior.h | 4 ++- + 4 files changed, 39 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 218c3488391b..5cc9e0222a74 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -214,7 +214,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) + if (bch2_verify_btree_ondisk) + bch2_btree_node_write(c, b, SIX_LOCK_intent); + else +- __bch2_btree_node_write(c, b, SIX_LOCK_read); ++ __bch2_btree_node_write(c, b); + + /* wait for any in flight btree write */ + btree_node_wait_on_io(b); +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 7e6858e3af2b..c8d8df9637db 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -241,7 +241,6 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + } + + static void btree_node_sort(struct bch_fs *c, struct btree *b, +- struct btree_iter *iter, + unsigned start_idx, + unsigned end_idx, + bool filter_whiteouts) +@@ -377,8 +376,7 @@ void bch2_btree_sort_into(struct bch_fs *c, + * We're about to add another bset to the btree node, so if there's currently + * too many bsets - sort some of them together: + */ +-static bool btree_node_compact(struct bch_fs *c, struct btree *b, +- struct btree_iter *iter) ++static bool btree_node_compact(struct bch_fs *c, struct btree *b) + { + unsigned unwritten_idx; + bool ret = false; +@@ -390,13 +388,13 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b, + break; + + if (b->nsets - unwritten_idx > 1) { +- btree_node_sort(c, b, iter, unwritten_idx, ++ btree_node_sort(c, b, unwritten_idx, + b->nsets, false); + ret = true; + } + + if (unwritten_idx > 1) { +- btree_node_sort(c, b, iter, 0, unwritten_idx, false); ++ btree_node_sort(c, b, 0, unwritten_idx, false); + ret = true; + } + +@@ -426,12 +424,30 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) + { + struct btree_node_entry *bne; +- bool did_sort; ++ bool reinit_iter = false; + + EBUG_ON(!(b->c.lock.state.seq & 1)); + EBUG_ON(iter && iter->l[b->c.level].b != b); ++ BUG_ON(bset_written(b, bset(b, &b->set[1]))); ++ ++ if (b->nsets == MAX_BSETS) { ++ unsigned log_u64s[] = { ++ ilog2(bset_u64s(&b->set[0])), ++ ilog2(bset_u64s(&b->set[1])), ++ ilog2(bset_u64s(&b->set[2])), ++ }; ++ ++ if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { ++ bch2_btree_node_write(c, b, SIX_LOCK_write); ++ reinit_iter = true; ++ } ++ } ++ ++ if (b->nsets == MAX_BSETS && ++ btree_node_compact(c, b)) ++ reinit_iter = true; + +- did_sort = btree_node_compact(c, b, iter); ++ BUG_ON(b->nsets >= MAX_BSETS); + + bne = want_new_bset(c, b); + if (bne) +@@ -439,7 +455,7 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, + + bch2_btree_build_aux_trees(b); + +- if (iter && did_sort) ++ if (iter && reinit_iter) + bch2_btree_iter_reinit_node(iter, b); + } + +@@ -1324,8 +1340,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + return ret; + } + +-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, +- enum six_lock_type lock_type_held) ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + { + struct btree_write_bio *wbio; + struct bset_tree *t; +@@ -1595,7 +1610,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) + * single 
bset: + */ + if (b->nsets > 1) { +- btree_node_sort(c, b, NULL, 0, b->nsets, true); ++ btree_node_sort(c, b, 0, b->nsets, true); + invalidated_iter = true; + } else { + invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); +@@ -1625,13 +1640,12 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) + * Use this one if the node is intent locked: + */ + void bch2_btree_node_write(struct bch_fs *c, struct btree *b, +- enum six_lock_type lock_type_held) ++ enum six_lock_type lock_type_held) + { +- BUG_ON(lock_type_held == SIX_LOCK_write); +- + if (lock_type_held == SIX_LOCK_intent || +- six_lock_tryupgrade(&b->c.lock)) { +- __bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ (lock_type_held == SIX_LOCK_read && ++ six_lock_tryupgrade(&b->c.lock))) { ++ __bch2_btree_node_write(c, b); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && +@@ -1643,7 +1657,10 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + } else { +- __bch2_btree_node_write(c, b, SIX_LOCK_read); ++ __bch2_btree_node_write(c, b); ++ if (lock_type_held == SIX_LOCK_write && ++ btree_node_just_written(b)) ++ bch2_btree_post_write_cleanup(c, b); + } + } + +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 9c14cd30a09e..95c351611045 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -144,8 +144,7 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *, + struct btree_write *); + void bch2_btree_write_error_work(struct work_struct *); + +-void __bch2_btree_node_write(struct bch_fs *, struct btree *, +- enum six_lock_type); ++void __bch2_btree_node_write(struct bch_fs *, struct btree *); + bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + + void bch2_btree_node_write(struct bch_fs *, struct btree *, +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index f2925b0d7f17..7eef3dbb6ef1 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -256,13 +256,15 @@ static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, + return remaining; + } + ++#define BTREE_WRITE_SET_U64s_BITS 9 ++ + static inline unsigned btree_write_set_buffer(struct btree *b) + { + /* + * Could buffer up larger amounts of keys for btrees with larger keys, + * pending benchmarking: + */ +- return 4 << 10; ++ return 8 << BTREE_WRITE_SET_U64s_BITS; + } + + static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, +-- +cgit v1.2.3 + + +From 3161287ece613915bec5388a205de3ec06b2b86b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 6 Apr 2021 20:11:28 -0400 +Subject: bcachefs: Move some dirent checks to bch2_dirent_invalid() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 18 +++++++++++++----- + fs/bcachefs/fsck.c | 31 ------------------------------- + 2 files changed, 13 insertions(+), 36 deletions(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index cf4ce2e7f29c..ec4666143f23 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -84,16 +84,24 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) + if (!len) + return "empty name"; + +- /* +- * older versions of bcachefs were buggy and creating dirent +- * keys that were bigger than necessary: +- */ +- if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7)) ++ if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) + return "value too big"; + + if (len > 
BCH_NAME_MAX) + return "dirent name too big"; + ++ if (len == 1 && !memcmp(d.v->d_name, ".", 1)) ++ return "invalid name"; ++ ++ if (len == 2 && !memcmp(d.v->d_name, "..", 2)) ++ return "invalid name"; ++ ++ if (memchr(d.v->d_name, '/', len)) ++ return "invalid name"; ++ ++ if (le64_to_cpu(d.v->d_inum) == d.k->p.inode) ++ return "dirent points to own directory"; ++ + return NULL; + } + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index d65b3e100f78..36baff8409cd 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -569,7 +569,6 @@ static int check_dirents(struct bch_fs *c) + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- unsigned name_len; + char buf[200]; + int ret = 0; + +@@ -628,36 +627,6 @@ retry: + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); + +- name_len = bch2_dirent_name_bytes(d); +- +- if (fsck_err_on(!name_len, c, "empty dirent") || +- fsck_err_on(name_len == 1 && +- !memcmp(d.v->d_name, ".", 1), c, +- ". dirent") || +- fsck_err_on(name_len == 2 && +- !memcmp(d.v->d_name, "..", 2), c, +- ".. dirent") || +- fsck_err_on(name_len == 2 && +- !memcmp(d.v->d_name, "..", 2), c, +- ".. dirent") || +- fsck_err_on(memchr(d.v->d_name, '/', name_len), c, +- "dirent name has invalid chars")) { +- ret = remove_dirent(&trans, d); +- if (ret) +- goto err; +- continue; +- } +- +- if (fsck_err_on(d_inum == d.k->p.inode, c, +- "dirent points to own directory:\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, +- k), buf))) { +- ret = remove_dirent(&trans, d); +- if (ret) +- goto err; +- continue; +- } +- + ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0); + if (ret && ret != -ENOENT) + break; +-- +cgit v1.2.3 + + +From fd545b791acc175cdd89551652909834c48f8f8c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 6 Apr 2021 21:19:25 -0400 +Subject: bcachefs: Drop bch2_fsck_inode_nlink() + +We've had BCH_FEATURE_atomic_nlink for quite some time, we can drop this +now. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 9 --------- + fs/bcachefs/fsck.h | 1 - + fs/bcachefs/recovery.c | 33 ++++++++++++++------------------- + 3 files changed, 14 insertions(+), 29 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 36baff8409cd..8fa41b36f72d 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1468,15 +1468,6 @@ int bch2_fsck_full(struct bch_fs *c) + check_inode_nlinks(c, &lostfound_inode); + } + +-int bch2_fsck_inode_nlink(struct bch_fs *c) +-{ +- struct bch_inode_unpacked root_inode, lostfound_inode; +- +- return check_root(c, &root_inode) ?: +- check_lostfound(c, &root_inode, &lostfound_inode) ?: +- check_inode_nlinks(c, &lostfound_inode); +-} +- + int bch2_fsck_walk_inodes_only(struct bch_fs *c) + { + struct btree_trans trans; +diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h +index 9e4af02bde1e..264f2706b12d 100644 +--- a/fs/bcachefs/fsck.h ++++ b/fs/bcachefs/fsck.h +@@ -3,7 +3,6 @@ + #define _BCACHEFS_FSCK_H + + int bch2_fsck_full(struct bch_fs *); +-int bch2_fsck_inode_nlink(struct bch_fs *); + int bch2_fsck_walk_inodes_only(struct bch_fs *); + + #endif /* _BCACHEFS_FSCK_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index a3a6abb88d6f..24c0646913a8 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1005,6 +1005,13 @@ int bch2_fs_recovery(struct bch_fs *c) + + } + ++ if (!c->sb.clean && ++ !(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { ++ bch_info(c, "BCH_FEATURE_atomic_nlink not set and filesystem dirty, fsck required"); ++ c->opts.fsck = true; ++ c->opts.fix_errors = FSCK_OPT_YES; ++ } ++ + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { + bch_info(c, "alloc_v2 feature bit not set, fsck required"); + c->opts.fsck = true; +@@ -1181,25 +1188,6 @@ use_clean: + bch_verbose(c, "alloc write done"); + } + +- if (!c->sb.clean) { +- if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { +- bch_info(c, "checking inode link counts"); +- err = "error in recovery"; +- ret = bch2_fsck_inode_nlink(c); +- if (ret) +- goto err; +- bch_verbose(c, "check inodes done"); +- +- } else { +- bch_verbose(c, "checking for deleted inodes"); +- err = "error in recovery"; +- ret = bch2_fsck_walk_inodes_only(c); +- if (ret) +- goto err; +- bch_verbose(c, "check inodes done"); +- } +- } +- + if (c->opts.fsck) { + bch_info(c, "starting fsck"); + err = "error in fsck"; +@@ -1207,6 +1195,13 @@ use_clean: + if (ret) + goto err; + bch_verbose(c, "fsck done"); ++ } else if (!c->sb.clean) { ++ bch_verbose(c, "checking for deleted inodes"); ++ err = "error in recovery"; ++ ret = bch2_fsck_walk_inodes_only(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); + } + + if (enabled_qtypes(c)) { +-- +cgit v1.2.3 + + +From dbe85c271658c7600d7e1d3f620575fba08cd3fe Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 7 Apr 2021 21:04:04 -0400 +Subject: bcachefs: Don't wait for ALLOC_SCAN_BATCH buckets in allocator + +It used to be necessary for the allocator thread to batch up +invalidating buckets when possible - but since we added the btree key +cache that hasn't been a concern, and now it's causing the allocator +thread to livelock when the filesystem is nearly full. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index a10c1a41e4c9..c47a2098a10c 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1071,7 +1071,7 @@ static int bch2_allocator_thread(void *arg) + + pr_debug("free_inc now empty"); + +- do { ++ while (1) { + cond_resched(); + /* + * Find some buckets that we can invalidate, either +@@ -1095,22 +1095,21 @@ static int bch2_allocator_thread(void *arg) + wake_up_process(c->gc_thread); + } + ++ if (nr) ++ break; ++ + /* + * If we found any buckets, we have to invalidate them + * before we scan for more - but if we didn't find very + * many we may want to wait on more buckets being + * available so we don't spin: + */ +- if (!nr || +- (nr < ALLOC_SCAN_BATCH(ca) && +- !fifo_empty(&ca->free[RESERVE_NONE]))) { +- ret = wait_buckets_available(c, ca); +- if (ret) { +- up_read(&c->gc_lock); +- goto stop; +- } ++ ret = wait_buckets_available(c, ca); ++ if (ret) { ++ up_read(&c->gc_lock); ++ goto stop; + } +- } while (!nr); ++ } + + up_read(&c->gc_lock); + +-- +cgit v1.2.3 + + +From 4f620cc626bd7e8e3d2c011d3a71ffd5ab7e4a25 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 8 Apr 2021 16:15:03 -0400 +Subject: bcachefs: Make sure to kick journal reclaim when we're waiting on it + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 8517c1fe4bd4..8c8a584f82ae 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -639,6 +639,8 @@ static int journal_reclaim_wait_done(struct bch_fs *c) + if (ret) + return ret; + ++ journal_reclaim_kick(&c->journal); ++ + if (mutex_trylock(&c->journal.reclaim_lock)) { + ret = bch2_journal_reclaim(&c->journal); + mutex_unlock(&c->journal.reclaim_lock); +-- +cgit v1.2.3 + + +From 891ae4c83c13341500b599b80b5920a7617e6968 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 9 Apr 2021 15:10:24 -0400 +Subject: bcachefs: Fix bch2_gc_btree_gens() + +Since we're using a NOT_EXTENTS iterator, we shouldn't be setting the +iter pos to the start of the extent. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 53a677894a79..382dd22c196f 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1197,8 +1197,6 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + bch2_bkey_buf_reassemble(&sk, c, k); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + +- bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); +- + bch2_trans_update(&trans, iter, sk.k, 0); + + ret = bch2_trans_commit(&trans, NULL, NULL, +-- +cgit v1.2.3 + + +From 9d4bfc96e746d24a99166b5a691790115e0221db Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 9 Apr 2021 16:52:30 -0400 +Subject: bcachefs: Fix BTREE_ITER_NOT_EXTENTS + +bch2_btree_iter_peek() wasn't properly checking for +BTREE_ITER_IS_EXTENTS when updating iter->pos. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/btree_iter.c | 4 +++- + fs/bcachefs/btree_types.h | 4 ++++ + 3 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 382dd22c196f..d0c06fa5198d 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1243,7 +1243,7 @@ int bch2_gc_gens(struct bch_fs *c) + } + + for (i = 0; i < BTREE_ID_NR; i++) +- if (btree_node_type_needs_gc(i)) { ++ if ((1 << i) & BTREE_ID_HAS_PTRS) { + ret = bch2_gc_btree_gens(c, i); + if (ret) { + bch_err(c, "error recalculating oldest_gen: %i", ret); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 1944a9a99861..2c321a88da8b 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1641,7 +1641,9 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi + * iter->pos should be mononotically increasing, and always be equal to + * the key we just returned - except extents can straddle iter->pos: + */ +- if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ iter->pos = k.k->p; ++ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + + bch2_btree_iter_verify_entry_exit(iter); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 10366f6c0619..0c93547cebae 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -616,6 +616,10 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) + (1U << BTREE_ID_dirents)| \ + (1U << BTREE_ID_xattrs)) + ++#define BTREE_ID_HAS_PTRS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_reflink)) ++ + static inline bool btree_type_has_snapshots(enum btree_id id) + { + return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; +-- +cgit v1.2.3 + + +From d3a07997d8da4f2059046d1521bc94ef9cb0245d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 6 Apr 2021 21:41:48 -0400 +Subject: bcachefs: Check inodes at start of fsck + +This splits out checking inode nlinks from the rest of the inode checks +and moves most of the inode checks to the start of fsck, so that other +fsck passes can depend on it. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 355 ++++++++++++++++++++++++++++------------------------- + 1 file changed, 186 insertions(+), 169 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 8fa41b36f72d..6e1f9194a671 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -413,6 +413,151 @@ err_redo: + goto err; + } + ++static int check_inode(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c_inode inode) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ bool do_update = false; ++ int ret = 0; ++ ++ ret = bch2_inode_unpack(inode, &u); ++ ++ if (bch2_fs_inconsistent_on(ret, c, ++ "error unpacking inode %llu in fsck", ++ inode.k->p.inode)) ++ return ret; ++ ++ if (u.bi_flags & BCH_INODE_UNLINKED && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", ++ u.bi_inum))) { ++ bch_verbose(c, "deleting inode %llu", u.bi_inum); ++ ++ bch2_trans_unlock(trans); ++ bch2_fs_lazy_rw(c); ++ ++ ret = bch2_inode_rm(c, u.bi_inum, false); ++ if (ret) ++ bch_err(c, "error in fsck: error %i while deleting inode", ret); ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", ++ u.bi_inum))) { ++ bch_verbose(c, "truncating inode %llu", u.bi_inum); ++ ++ bch2_trans_unlock(trans); ++ bch2_fs_lazy_rw(c); ++ ++ /* ++ * XXX: need to truncate partial blocks too here - or ideally ++ * just switch units to bytes and that issue goes away ++ */ ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), ++ POS(u.bi_inum, U64_MAX), ++ NULL); ++ if (ret) { ++ bch_err(c, "error in fsck: error %i truncating inode", ret); ++ return ret; ++ } ++ ++ /* ++ * We truncated without our normal sector accounting hook, just ++ * make sure we recalculate it: ++ */ ++ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; ++ ++ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ do_update = true; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", ++ u.bi_inum))) { ++ s64 sectors; ++ ++ bch_verbose(c, "recounting sectors for inode %llu", ++ u.bi_inum); ++ ++ sectors = bch2_count_inode_sectors(trans, u.bi_inum); ++ if (sectors < 0) { ++ bch_err(c, "error in fsck: error %i recounting inode sectors", ++ (int) sectors); ++ return sectors; ++ } ++ ++ u.bi_sectors = sectors; ++ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; ++ do_update = true; ++ } ++ ++ if (!S_ISDIR(u.bi_mode) && ++ u.bi_nlink && ++ !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && ++ (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, ++ "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") || ++ c->opts.version_upgrade)) { ++ u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; ++ do_update = true; ++ } ++ ++ if (do_update) { ++ struct bkey_inode_buf p; ++ ++ bch2_inode_pack(c, &p, &u); ++ p.inode.k.p = iter->pos; ++ ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); ++ if (ret) ++ bch_err(c, "error in fsck: error %i " ++ "updating inode", ret); ++ } ++fsck_err: ++ return ret; ++} ++ ++noinline_for_stack ++static int check_inodes(struct bch_fs *c, bool full) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_inode inode; ++ int ret; ++ ++ 
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ inode = bkey_s_c_to_inode(k); ++ ++ if (full || ++ (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| ++ BCH_INODE_I_SECTORS_DIRTY| ++ BCH_INODE_UNLINKED))) { ++ ret = check_inode(&trans, iter, inode); ++ if (ret) ++ break; ++ } ++ } ++ bch2_trans_iter_put(&trans, iter); ++ ++ BUG_ON(ret == -EINTR); ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ + static int fix_overlapping_extent(struct btree_trans *trans, + struct bkey_s_c k, struct bpos cut_at) + { +@@ -1131,61 +1276,70 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, + return ret; + } + +-static int check_inode_nlink(struct bch_fs *c, ++static int check_inode_nlink(struct btree_trans *trans, + struct bch_inode_unpacked *lostfound_inode, +- struct bch_inode_unpacked *u, +- struct nlink *link, +- bool *do_update) ++ struct btree_iter *iter, ++ struct bkey_s_c_inode inode, ++ struct nlink *link) + { +- u32 i_nlink = bch2_inode_nlink_get(u); +- u32 real_i_nlink = +- link->count * nlink_bias(u->bi_mode) + +- link->dir_count; ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ u32 i_nlink, real_i_nlink; + int ret = 0; + ++ ret = bch2_inode_unpack(inode, &u); ++ /* Should never happen, checked by bch2_inode_invalid: */ ++ if (bch2_fs_inconsistent_on(ret, c, ++ "error unpacking inode %llu in fsck", ++ inode.k->p.inode)) ++ return ret; ++ ++ i_nlink = bch2_inode_nlink_get(&u); ++ real_i_nlink = link->count * nlink_bias(u.bi_mode) + link->dir_count; ++ + /* + * These should have been caught/fixed by earlier passes, we don't + * repair them here: + */ +- if (S_ISDIR(u->bi_mode) && link->count > 1) { ++ if (S_ISDIR(u.bi_mode) && link->count > 1) { + need_fsck_err(c, "directory %llu with multiple hardlinks: %u", +- u->bi_inum, link->count); ++ u.bi_inum, link->count); + return 0; + } + +- if (S_ISDIR(u->bi_mode) && !link->count) { ++ if (S_ISDIR(u.bi_mode) && !link->count) { + need_fsck_err(c, "unreachable directory found (inum %llu)", +- u->bi_inum); ++ u.bi_inum); + return 0; + } + +- if (!S_ISDIR(u->bi_mode) && link->dir_count) { ++ if (!S_ISDIR(u.bi_mode) && link->dir_count) { + need_fsck_err(c, "non directory with subdirectories (inum %llu)", +- u->bi_inum); ++ u.bi_inum); + return 0; + } + + if (!link->count && +- !(u->bi_flags & BCH_INODE_UNLINKED) && ++ !(u.bi_flags & BCH_INODE_UNLINKED) && + (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { + if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", +- u->bi_inum, mode_to_type(u->bi_mode)) == ++ u.bi_inum, mode_to_type(u.bi_mode)) == + FSCK_ERR_IGNORE) + return 0; + +- ret = reattach_inode(c, lostfound_inode, u->bi_inum); ++ ret = reattach_inode(c, lostfound_inode, u.bi_inum); + if (ret) + return ret; + + link->count = 1; +- real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count; ++ real_i_nlink = nlink_bias(u.bi_mode) + link->dir_count; + goto set_i_nlink; + } + + if (i_nlink < link->count) { + if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", +- u->bi_inum, i_nlink, link->count, +- mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) ++ u.bi_inum, i_nlink, link->count, ++ mode_to_type(u.bi_mode)) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; + } +@@ -1195,7 +1349,7 @@ static int check_inode_nlink(struct bch_fs *c, + if (fsck_err(c, "filesystem marked clean, " + "but inode %llu has wrong i_nlink " + "(type %u i_nlink %u, should be %u)", 
+- u->bi_inum, mode_to_type(u->bi_mode), ++ u.bi_inum, mode_to_type(u.bi_mode), + i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; +@@ -1205,7 +1359,7 @@ static int check_inode_nlink(struct bch_fs *c, + (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { + if (fsck_err(c, "inode %llu has wrong i_nlink " + "(type %u i_nlink %u, should be %u)", +- u->bi_inum, mode_to_type(u->bi_mode), ++ u.bi_inum, mode_to_type(u.bi_mode), + i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; +@@ -1213,122 +1367,12 @@ static int check_inode_nlink(struct bch_fs *c, + + if (real_i_nlink && i_nlink != real_i_nlink) + bch_verbose(c, "setting inode %llu nlink from %u to %u", +- u->bi_inum, i_nlink, real_i_nlink); ++ u.bi_inum, i_nlink, real_i_nlink); + set_i_nlink: + if (i_nlink != real_i_nlink) { +- bch2_inode_nlink_set(u, real_i_nlink); +- *do_update = true; +- } +-fsck_err: +- return ret; +-} +- +-static int check_inode(struct btree_trans *trans, +- struct bch_inode_unpacked *lostfound_inode, +- struct btree_iter *iter, +- struct bkey_s_c_inode inode, +- struct nlink *link) +-{ +- struct bch_fs *c = trans->c; +- struct bch_inode_unpacked u; +- bool do_update = false; +- int ret = 0; +- +- ret = bch2_inode_unpack(inode, &u); +- +- bch2_trans_unlock(trans); +- +- if (bch2_fs_inconsistent_on(ret, c, +- "error unpacking inode %llu in fsck", +- inode.k->p.inode)) +- return ret; +- +- if (link) { +- ret = check_inode_nlink(c, lostfound_inode, &u, link, +- &do_update); +- if (ret) +- return ret; +- } +- +- if (u.bi_flags & BCH_INODE_UNLINKED && +- (!c->sb.clean || +- fsck_err(c, "filesystem marked clean, but inode %llu unlinked", +- u.bi_inum))) { +- bch_verbose(c, "deleting inode %llu", u.bi_inum); +- +- bch2_fs_lazy_rw(c); +- +- ret = bch2_inode_rm(c, u.bi_inum, false); +- if (ret) +- bch_err(c, "error in fsck: error %i while deleting inode", ret); +- return ret; +- } +- +- if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && +- (!c->sb.clean || +- fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", +- u.bi_inum))) { +- bch_verbose(c, "truncating inode %llu", u.bi_inum); +- +- bch2_fs_lazy_rw(c); +- +- /* +- * XXX: need to truncate partial blocks too here - or ideally +- * just switch units to bytes and that issue goes away +- */ +- ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, +- POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), +- POS(u.bi_inum, U64_MAX), +- NULL); +- if (ret) { +- bch_err(c, "error in fsck: error %i truncating inode", ret); +- return ret; +- } +- +- /* +- * We truncated without our normal sector accounting hook, just +- * make sure we recalculate it: +- */ +- u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; +- +- u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; +- do_update = true; +- } +- +- if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && +- (!c->sb.clean || +- fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", +- u.bi_inum))) { +- s64 sectors; +- +- bch_verbose(c, "recounting sectors for inode %llu", +- u.bi_inum); +- +- sectors = bch2_count_inode_sectors(trans, u.bi_inum); +- if (sectors < 0) { +- bch_err(c, "error in fsck: error %i recounting inode sectors", +- (int) sectors); +- return sectors; +- } +- +- u.bi_sectors = sectors; +- u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; +- do_update = true; +- } +- +- if (!S_ISDIR(u.bi_mode) && +- u.bi_nlink && +- !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && +- (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, +- "inode missing 
BCH_INODE_BACKPTR_UNTRUSTED flags") || +- c->opts.version_upgrade)) { +- u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; +- do_update = true; +- } +- +- if (do_update) { + struct bkey_inode_buf p; + ++ bch2_inode_nlink_set(&u, real_i_nlink); + bch2_inode_pack(c, &p, &u); + p.inode.k.p = iter->pos; + +@@ -1337,8 +1381,7 @@ static int check_inode(struct btree_trans *trans, + BTREE_INSERT_LAZY_RW, + (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); + if (ret) +- bch_err(c, "error in fsck: error %i " +- "updating inode", ret); ++ bch_err(c, "error in fsck: error %i updating inode", ret); + } + fsck_err: + return ret; +@@ -1387,8 +1430,8 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); + link = &zero_links; + + if (k.k && k.k->type == KEY_TYPE_inode) { +- ret = check_inode(&trans, lostfound_inode, iter, +- bkey_s_c_to_inode(k), link); ++ ret = check_inode_nlink(&trans, lostfound_inode, iter, ++ bkey_s_c_to_inode(k), link); + BUG_ON(ret == -EINTR); + if (ret) + break; +@@ -1416,7 +1459,7 @@ fsck_err: + } + + noinline_for_stack +-static int check_inode_nlinks(struct bch_fs *c, ++static int check_nlinks(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode) + { + nlink_table links; +@@ -1459,43 +1502,17 @@ int bch2_fsck_full(struct bch_fs *c) + { + struct bch_inode_unpacked root_inode, lostfound_inode; + +- return check_extents(c) ?: ++ return check_inodes(c, true) ?: ++ check_extents(c) ?: + check_dirents(c) ?: + check_xattrs(c) ?: + check_root(c, &root_inode) ?: + check_lostfound(c, &root_inode, &lostfound_inode) ?: + check_directory_structure(c, &lostfound_inode) ?: +- check_inode_nlinks(c, &lostfound_inode); ++ check_nlinks(c, &lostfound_inode); + } + + int bch2_fsck_walk_inodes_only(struct bch_fs *c) + { +- struct btree_trans trans; +- struct btree_iter *iter; +- struct bkey_s_c k; +- struct bkey_s_c_inode inode; +- int ret; +- +- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); +- +- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { +- if (k.k->type != KEY_TYPE_inode) +- continue; +- +- inode = bkey_s_c_to_inode(k); +- +- if (inode.v->bi_flags & +- (BCH_INODE_I_SIZE_DIRTY| +- BCH_INODE_I_SECTORS_DIRTY| +- BCH_INODE_UNLINKED)) { +- ret = check_inode(&trans, NULL, iter, inode, NULL); +- if (ret) +- break; +- } +- } +- bch2_trans_iter_put(&trans, iter); +- +- BUG_ON(ret == -EINTR); +- +- return bch2_trans_exit(&trans) ?: ret; ++ return check_inodes(c, false); + } +-- +cgit v1.2.3 + + +From f9e20842930f09f9ce31be6d21c8062d3b30bd74 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 7 Apr 2021 01:55:57 -0400 +Subject: bcachefs: Simplify hash table checks + +Very early on there was a period where we were accidentally generating +dirents with trailing garbage; we've since dropped support for +filesystems that old and the fsck code can be dropped. + +Also, this patch switches to a simpler algorithm for checking hash +tables. It's less efficient on hash collision - but with 64 bit keys, +those are very rare. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 296 ++++++++++++----------------------------------------- + 1 file changed, 65 insertions(+), 231 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 6e1f9194a671..0d27a7a736e0 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -142,42 +142,10 @@ static int walk_inode(struct btree_trans *trans, + return 0; + } + +-struct hash_check { +- struct bch_hash_info info; +- +- /* start of current chain of hash collisions: */ +- struct btree_iter *chain; +- +- /* next offset in current chain of hash collisions: */ +- u64 chain_end; +-}; +- +-static void hash_check_init(struct hash_check *h) +-{ +- h->chain = NULL; +- h->chain_end = 0; +-} +- +-static void hash_stop_chain(struct btree_trans *trans, +- struct hash_check *h) +-{ +- if (h->chain) +- bch2_trans_iter_free(trans, h->chain); +- h->chain = NULL; +-} +- +-static void hash_check_set_inode(struct btree_trans *trans, +- struct hash_check *h, +- const struct bch_inode_unpacked *bi) +-{ +- h->info = bch2_hash_info_init(trans->c, bi); +- hash_stop_chain(trans, h); +-} +- +-static int hash_redo_key(const struct bch_hash_desc desc, +- struct btree_trans *trans, struct hash_check *h, +- struct btree_iter *k_iter, struct bkey_s_c k, +- u64 hashed) ++static int hash_redo_key(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct btree_iter *k_iter, struct bkey_s_c k) + { + struct bkey_i delete; + struct bkey_i *tmp; +@@ -192,7 +160,7 @@ static int hash_redo_key(const struct bch_hash_desc desc, + delete.k.p = k_iter->pos; + bch2_trans_update(trans, k_iter, &delete, 0); + +- return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, ++ return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, + tmp, 0); + } + +@@ -216,201 +184,72 @@ retry: + return ret; + } + +-static int hash_check_duplicates(struct btree_trans *trans, +- const struct bch_hash_desc desc, struct hash_check *h, +- struct btree_iter *k_iter, struct bkey_s_c k) ++static int hash_check_key(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct btree_iter *k_iter, struct bkey_s_c hash_k) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter; +- struct bkey_s_c k2; ++ struct btree_iter *iter = NULL; + char buf[200]; ++ struct bkey_s_c k; ++ u64 hash; + int ret = 0; + +- if (!bkey_cmp(h->chain->pos, k_iter->pos)) ++ if (hash_k.k->type != desc.key_type) + return 0; + +- iter = bch2_trans_copy_iter(trans, h->chain); ++ hash = desc.hash_bkey(hash_info, hash_k); ++ ++ if (likely(hash == hash_k.k->p.offset)) ++ return 0; + +- for_each_btree_key_continue(iter, 0, k2, ret) { +- if (bkey_cmp(k2.k->p, k.k->p) >= 0) ++ if (hash_k.k->p.offset < hash) ++ goto bad_hash; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash), ++ BTREE_ITER_SLOTS, k, ret) { ++ if (!bkey_cmp(k.k->p, hash_k.k->p)) + break; + +- if (fsck_err_on(k2.k->type == desc.key_type && +- !desc.cmp_bkey(k, k2), c, ++ if (fsck_err_on(k.k->type == desc.key_type && ++ !desc.cmp_bkey(k, hash_k), c, + "duplicate hash table keys:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, +- k), buf))) { +- ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); ++ hash_k), buf))) { ++ ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter); + if (ret) + return ret; + ret = 1; + break; + } +- } +-fsck_err: +- bch2_trans_iter_free(trans, iter); +- return ret; +-} +- +-static void hash_set_chain_start(struct 
btree_trans *trans, +- const struct bch_hash_desc desc, +- struct hash_check *h, +- struct btree_iter *k_iter, struct bkey_s_c k) +-{ +- bool hole = (k.k->type != KEY_TYPE_hash_whiteout && +- k.k->type != desc.key_type); +- +- if (hole || k.k->p.offset > h->chain_end + 1) +- hash_stop_chain(trans, h); + +- if (!hole) { +- if (!h->chain) +- h->chain = bch2_trans_copy_iter(trans, k_iter); +- +- h->chain_end = k.k->p.offset; +- } +-} +- +-static bool key_has_correct_hash(struct btree_trans *trans, +- const struct bch_hash_desc desc, +- struct hash_check *h, +- struct btree_iter *k_iter, struct bkey_s_c k) +-{ +- u64 hash; +- +- hash_set_chain_start(trans, desc, h, k_iter, k); +- +- if (k.k->type != desc.key_type) +- return true; +- +- hash = desc.hash_bkey(&h->info, k); +- +- return hash >= h->chain->pos.offset && +- hash <= k.k->p.offset; +-} +- +-static int hash_check_key(struct btree_trans *trans, +- const struct bch_hash_desc desc, struct hash_check *h, +- struct btree_iter *k_iter, struct bkey_s_c k) +-{ +- struct bch_fs *c = trans->c; +- char buf[200]; +- u64 hashed; +- int ret = 0; +- +- hash_set_chain_start(trans, desc, h, k_iter, k); +- +- if (k.k->type != desc.key_type) +- return 0; +- +- hashed = desc.hash_bkey(&h->info, k); +- +- if (fsck_err_on(hashed < h->chain->pos.offset || +- hashed > k.k->p.offset, c, +- "hash table key at wrong offset: btree %u, %llu, " +- "hashed to %llu chain starts at %llu\n%s", +- desc.btree_id, k.k->p.offset, +- hashed, h->chain->pos.offset, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { +- ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, +- hash_redo_key(desc, trans, h, k_iter, k, hashed)); +- if (ret) { +- bch_err(c, "hash_redo_key err %i", ret); +- return ret; ++ if (bkey_deleted(k.k)) { ++ bch2_trans_iter_free(trans, iter); ++ goto bad_hash; + } +- return -EINTR; +- } + +- ret = hash_check_duplicates(trans, desc, h, k_iter, k); +-fsck_err: ++ } ++ bch2_trans_iter_free(trans, iter); + return ret; +-} +- +-static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, +- struct btree_iter *iter, struct bkey_s_c *k) +-{ +- struct bch_fs *c = trans->c; +- struct bkey_i_dirent *d = NULL; +- int ret = -EINVAL; +- char buf[200]; +- unsigned len; +- u64 hash; +- +- if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) ++bad_hash: ++ if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " ++ "hashed to %llu should be at %llu\n%s", ++ desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, ++ hash, iter->pos.offset, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) + return 0; + +- len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); +- BUG_ON(!len); +- +- memcpy(buf, bkey_s_c_to_dirent(*k).v->d_name, len); +- buf[len] = '\0'; +- +- d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); +- if (!d) { +- bch_err(c, "memory allocation failure"); +- return -ENOMEM; +- } +- +- bkey_reassemble(&d->k_i, *k); +- +- do { +- --len; +- if (!len) +- goto err_redo; +- +- d->k.u64s = BKEY_U64s + dirent_val_u64s(len); +- +- BUG_ON(bkey_val_bytes(&d->k) < +- offsetof(struct bch_dirent, d_name) + len); +- +- memset(d->v.d_name + len, 0, +- bkey_val_bytes(&d->k) - +- offsetof(struct bch_dirent, d_name) - len); +- +- hash = bch2_dirent_hash_desc.hash_bkey(&h->info, +- bkey_i_to_s_c(&d->k_i)); +- } while (hash < h->chain->pos.offset || +- hash > k->k->p.offset); +- +- if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", +- buf, strlen(buf), 
d->v.d_name, len)) { +- ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); +- if (ret) +- goto err; +- +- *k = bch2_btree_iter_peek(iter); +- +- BUG_ON(k->k->type != KEY_TYPE_dirent); ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ hash_redo_key(trans, desc, hash_info, k_iter, hash_k)); ++ if (ret) { ++ bch_err(c, "hash_redo_key err %i", ret); ++ return ret; + } +-err: ++ return -EINTR; + fsck_err: +- kfree(d); + return ret; +-err_redo: +- hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); +- +- if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" +- "hash table key at wrong offset: btree %u, offset %llu, " +- "hashed to %llu chain starts at %llu\n%s", +- buf, strlen(buf), BTREE_ID_dirents, +- k->k->p.offset, hash, h->chain->pos.offset, +- (bch2_bkey_val_to_text(&PBUF(buf), c, +- *k), buf))) { +- ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, +- hash_redo_key(bch2_dirent_hash_desc, trans, +- h, iter, *k, hash)); +- if (ret) +- bch_err(c, "hash_redo_key err %i", ret); +- else +- ret = 1; +- } +- +- goto err; + } + + static int check_inode(struct btree_trans *trans, +@@ -710,7 +549,7 @@ noinline_for_stack + static int check_dirents(struct bch_fs *c) + { + struct inode_walker w = inode_walker_init(); +- struct hash_check h; ++ struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +@@ -721,8 +560,6 @@ static int check_dirents(struct bch_fs *c) + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- hash_check_init(&h); +- + iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), 0); + retry: +@@ -749,25 +586,26 @@ retry: + ret = bch2_btree_delete_at(&trans, iter, 0); + if (ret) + goto err; +- continue; ++ goto next; + } + +- if (w.first_this_inode && w.have_inode) +- hash_check_set_inode(&trans, &h, &w.inode); ++ if (!w.have_inode) ++ goto next; + +- ret = check_dirent_hash(&trans, &h, iter, &k); ++ if (w.first_this_inode) ++ hash_info = bch2_hash_info_init(c, &w.inode); ++ ++ ret = hash_check_key(&trans, bch2_dirent_hash_desc, ++ &hash_info, iter, k); + if (ret > 0) { + ret = 0; +- continue; ++ goto next; + } + if (ret) + goto fsck_err; + +- if (ret) +- goto fsck_err; +- + if (k.k->type != KEY_TYPE_dirent) +- continue; ++ goto next; + + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); +@@ -786,9 +624,12 @@ retry: + ret = remove_dirent(&trans, d); + if (ret) + goto err; +- continue; ++ goto next; + } + ++ if (!have_target) ++ goto next; ++ + if (!target.bi_nlink && + !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && + (target.bi_dir != k.k->p.inode || +@@ -822,8 +663,7 @@ retry: + continue; + } + +- if (fsck_err_on(have_target && +- d.v->d_type != ++ if (fsck_err_on(d.v->d_type != + mode_to_type(target.bi_mode), c, + "incorrect d_type: should be %u:\n%s", + mode_to_type(target.bi_mode), +@@ -849,17 +689,14 @@ retry: + goto err; + + } +- ++next: + bch2_btree_iter_advance(iter); + } +- +- hash_stop_chain(&trans, &h); + err: + fsck_err: + if (ret == -EINTR) + goto retry; + +- bch2_trans_iter_put(&trans, h.chain); + bch2_trans_iter_put(&trans, iter); + return bch2_trans_exit(&trans) ?: ret; + } +@@ -871,7 +708,7 @@ noinline_for_stack + static int check_xattrs(struct bch_fs *c) + { + struct inode_walker w = inode_walker_init(); +- struct hash_check h; ++ struct bch_hash_info hash_info; + struct btree_trans 
trans; + struct btree_iter *iter; + struct bkey_s_c k; +@@ -879,8 +716,6 @@ static int check_xattrs(struct bch_fs *c) + + bch_verbose(c, "checking xattrs"); + +- hash_check_init(&h); +- + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, +@@ -902,10 +737,10 @@ retry: + } + + if (w.first_this_inode && w.have_inode) +- hash_check_set_inode(&trans, &h, &w.inode); ++ hash_info = bch2_hash_info_init(c, &w.inode); + + ret = hash_check_key(&trans, bch2_xattr_hash_desc, +- &h, iter, k); ++ &hash_info, iter, k); + if (ret) + break; + +@@ -915,7 +750,6 @@ fsck_err: + if (ret == -EINTR) + goto retry; + +- bch2_trans_iter_put(&trans, h.chain); + bch2_trans_iter_put(&trans, iter); + return bch2_trans_exit(&trans) ?: ret; + } +-- +cgit v1.2.3 + + +From f905f90318cb9e0bfb3633518f649190d62360b1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 6 Apr 2021 20:15:26 -0400 +Subject: bcachefs: Inode backpointers are now required + +This lets us simplify fsck quite a bit, which we need for making fsck +snapshot aware. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 187 +++++++++++++++++++++++++++++++++++++------------ + fs/bcachefs/recovery.c | 7 ++ + 2 files changed, 151 insertions(+), 43 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 0d27a7a736e0..5be86bf60545 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -38,6 +38,49 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) + return ret ?: sectors; + } + ++static int lookup_inode(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode, ++ u32 *snapshot) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, ++ POS(0, inode_nr), 0); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (snapshot) ++ *snapshot = iter->pos.snapshot; ++ ret = k.k->type == KEY_TYPE_inode ++ ? 
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) ++ : -ENOENT; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int write_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ ++ struct btree_iter *inode_iter = ++ bch2_trans_get_iter(trans, BTREE_ID_inodes, ++ SPOS(0, inode->bi_inum, snapshot), ++ BTREE_ITER_INTENT); ++ int ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_inode_write(trans, inode_iter, inode)); ++ bch2_trans_iter_put(trans, inode_iter); ++ if (ret) ++ bch_err(trans->c, "error in fsck: error %i updating inode", ret); ++ return ret; ++} ++ + static int __remove_dirent(struct btree_trans *trans, + struct bkey_s_c_dirent dirent) + { +@@ -58,7 +101,7 @@ static int __remove_dirent(struct btree_trans *trans, + buf[name.len] = '\0'; + name.name = buf; + +- ret = __bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode, 0); ++ ret = lookup_inode(trans, dir_inum, &dir_inode, NULL); + if (ret && ret != -EINTR) + bch_err(c, "remove_dirent: err %i looking up directory inode", ret); + if (ret) +@@ -111,6 +154,7 @@ struct inode_walker { + bool first_this_inode; + bool have_inode; + u64 cur_inum; ++ u32 snapshot; + struct bch_inode_unpacked inode; + }; + +@@ -126,8 +170,7 @@ static int walk_inode(struct btree_trans *trans, + struct inode_walker *w, u64 inum) + { + if (inum != w->cur_inum) { +- int ret = __bch2_inode_find_by_inum_trans(trans, inum, +- &w->inode, 0); ++ int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot); + + if (ret && ret != -ENOENT) + return ret; +@@ -432,6 +475,35 @@ static int fix_overlapping_extent(struct btree_trans *trans, + BTREE_INSERT_LAZY_RW); + } + ++static int inode_backpointer_exists(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, ++ POS(inode->bi_dir, inode->bi_dir_offset), 0); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ if (k.k->type != KEY_TYPE_dirent) ++ goto out; ++ ++ ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; ++out: ++ bch2_trans_iter_free(trans, iter); ++ return ret; ++} ++ ++static bool inode_backpointer_matches(struct bkey_s_c_dirent d, ++ struct bch_inode_unpacked *inode) ++{ ++ return d.k->p.inode == inode->bi_dir && ++ d.k->p.offset == inode->bi_dir_offset; ++} ++ + /* + * Walk extents: verify that extents have a corresponding S_ISREG inode, and + * that i_size an i_sectors are consistent +@@ -466,18 +538,9 @@ retry: + "inode %llu has incorrect i_sectors: got %llu, should be %llu", + w.inode.bi_inum, + w.inode.bi_sectors, i_sectors)) { +- struct btree_iter *inode_iter = +- bch2_trans_get_iter(&trans, BTREE_ID_inodes, +- POS(0, w.cur_inum), +- BTREE_ITER_INTENT); +- + w.inode.bi_sectors = i_sectors; + +- ret = __bch2_trans_do(&trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_inode_write(&trans, inode_iter, &w.inode)); +- bch2_trans_iter_put(&trans, inode_iter); ++ ret = write_inode(&trans, &w.inode, w.snapshot); + if (ret) + break; + } +@@ -554,6 +617,7 @@ static int check_dirents(struct bch_fs *c) + struct btree_iter *iter; + struct bkey_s_c k; + char buf[200]; ++ unsigned nr_subdirs = 0; + int ret = 0; + + bch_verbose(c, "checking dirents"); +@@ -567,13 +631,29 @@ retry: + !(ret = bkey_err(k))) { + struct bkey_s_c_dirent d; + struct bch_inode_unpacked target; ++ u32 target_snapshot; + bool 
have_target; ++ bool backpointer_exists = true; + u64 d_inum; + ++ if (w.have_inode && ++ w.cur_inum != k.k->p.inode && ++ fsck_err_on(w.inode.bi_nlink != nr_subdirs, c, ++ "directory %llu with wrong i_nlink: got %u, should be %u", ++ w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) { ++ w.inode.bi_nlink = nr_subdirs; ++ ret = write_inode(&trans, &w.inode, w.snapshot); ++ if (ret) ++ break; ++ } ++ + ret = walk_inode(&trans, &w, k.k->p.inode); + if (ret) + break; + ++ if (w.first_this_inode) ++ nr_subdirs = 0; ++ + if (fsck_err_on(!w.have_inode, c, + "dirent in nonexisting directory:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, +@@ -610,7 +690,7 @@ retry: + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); + +- ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0); ++ ret = lookup_inode(&trans, d_inum, &target, &target_snapshot); + if (ret && ret != -ENOENT) + break; + +@@ -630,41 +710,60 @@ retry: + if (!have_target) + goto next; + +- if (!target.bi_nlink && +- !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && +- (target.bi_dir != k.k->p.inode || +- target.bi_dir_offset != k.k->p.offset) && +- (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, +- "inode %llu has wrong backpointer:\n" +- "got %llu:%llu\n" +- "should be %llu:%llu", +- d_inum, +- target.bi_dir, +- target.bi_dir_offset, +- k.k->p.inode, +- k.k->p.offset) || +- c->opts.version_upgrade)) { +- struct bkey_inode_buf p; +- +- target.bi_dir = k.k->p.inode; +- target.bi_dir_offset = k.k->p.offset; +- bch2_trans_unlock(&trans); ++ if (!inode_backpointer_matches(d, &target)) { ++ ret = inode_backpointer_exists(&trans, &target); ++ if (ret < 0) ++ goto err; + +- bch2_inode_pack(c, &p, &target); ++ backpointer_exists = ret; ++ ret = 0; ++ } + +- ret = bch2_btree_insert(c, BTREE_ID_inodes, +- &p.inode.k_i, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); +- if (ret) { +- bch_err(c, "error in fsck: error %i updating inode", ret); ++ if (fsck_err_on(S_ISDIR(target.bi_mode) && ++ !inode_backpointer_matches(d, &target) && ++ backpointer_exists, c, ++ "directory %llu with multiple links", ++ target.bi_inum)) { ++ ret = remove_dirent(&trans, d); ++ if (ret) + goto err; +- } + continue; + } + +- if (fsck_err_on(d.v->d_type != +- mode_to_type(target.bi_mode), c, ++ if (!inode_backpointer_matches(d, &target) && ++ (S_ISDIR(target.bi_mode) || !target.bi_nlink)) { ++ if (backpointer_exists) { ++ if (!fsck_err(c, "inode %llu has multiple links but i_nlink 0", ++ d_inum)) ++ goto check_type; ++ ++ target.bi_nlink++; ++ target.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; ++ } else { ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers && ++ !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && ++ !fsck_err(c, "inode %llu has wrong backpointer:\n" ++ "got %llu:%llu\n" ++ "should be %llu:%llu", ++ d_inum, ++ target.bi_dir, ++ target.bi_dir_offset, ++ k.k->p.inode, ++ k.k->p.offset)) ++ goto check_type; ++ ++ target.bi_dir = k.k->p.inode; ++ target.bi_dir_offset = k.k->p.offset; ++ target.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; ++ } ++ ++ ret = write_inode(&trans, &target, target_snapshot); ++ if (ret) ++ goto err; ++ continue; ++ } ++check_type: ++ if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, + "incorrect d_type: should be %u:\n%s", + mode_to_type(target.bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, +@@ -689,6 +788,8 @@ retry: + goto err; + + } ++ ++ nr_subdirs += d.v->d_type == DT_DIR; + next: + bch2_btree_iter_advance(iter); + } +diff --git 
a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 24c0646913a8..012a08574022 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1024,6 +1024,13 @@ int bch2_fs_recovery(struct bch_fs *c) + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + ++ if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { ++ bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); ++ c->opts.version_upgrade = true; ++ c->opts.fsck = true; ++ c->opts.fix_errors = FSCK_OPT_YES; ++ } ++ + ret = bch2_blacklist_table_initialize(c); + if (ret) { + bch_err(c, "error initializing blacklist table"); +-- +cgit v1.2.3 + + +From 4b8e4ba734d0ffbcd0150f017b914f9b6bda9433 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 8 Apr 2021 15:25:29 -0400 +Subject: bcachefs: Redo check_nlink fsck pass + +Now that we have inode backpointers the check_nlink pass only is +concerned with files that have hardlinks, and can be simplified. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 179 ++++++++++++----------------------------------------- + 1 file changed, 41 insertions(+), 138 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 5be86bf60545..6bc3f2f09e36 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1137,14 +1137,12 @@ fsck_err: + + struct nlink { + u32 count; +- u32 dir_count; + }; + + typedef GENRADIX(struct nlink) nlink_table; + + static void inc_link(struct bch_fs *c, nlink_table *links, +- u64 range_start, u64 *range_end, +- u64 inum, bool dir) ++ u64 range_start, u64 *range_end, u64 inum) + { + struct nlink *link; + +@@ -1163,10 +1161,7 @@ static void inc_link(struct bch_fs *c, nlink_table *links, + return; + } + +- if (dir) +- link->dir_count++; +- else +- link->count++; ++ link->count++; + } + + noinline_for_stack +@@ -1177,26 +1172,18 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; +- u64 d_inum; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); +- + for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) { + switch (k.k->type) { + case KEY_TYPE_dirent: + d = bkey_s_c_to_dirent(k); +- d_inum = le64_to_cpu(d.v->d_inum); + +- if (d.v->d_type == DT_DIR) ++ if (d.v->d_type != DT_DIR) + inc_link(c, links, range_start, range_end, +- d.k->p.inode, true); +- +- inc_link(c, links, range_start, range_end, +- d_inum, false); +- ++ le64_to_cpu(d.v->d_inum)); + break; + } + +@@ -1215,99 +1202,48 @@ static int check_inode_nlink(struct btree_trans *trans, + struct bch_inode_unpacked *lostfound_inode, + struct btree_iter *iter, + struct bkey_s_c_inode inode, +- struct nlink *link) ++ unsigned nlink) + { + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; +- u32 i_nlink, real_i_nlink; + int ret = 0; + ++ /* ++ * Backpointer and directory structure checks are sufficient for ++ * directories, since they can't have hardlinks: ++ */ ++ if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) ++ return 0; ++ + ret = bch2_inode_unpack(inode, &u); ++ + /* Should never happen, checked by bch2_inode_invalid: */ + if (bch2_fs_inconsistent_on(ret, c, + "error unpacking inode %llu in fsck", + inode.k->p.inode)) + return ret; + +- i_nlink = bch2_inode_nlink_get(&u); +- real_i_nlink = link->count * nlink_bias(u.bi_mode) + link->dir_count; +- +- /* +- * These should have been caught/fixed by earlier passes, we don't +- * repair them here: 
+- */ +- if (S_ISDIR(u.bi_mode) && link->count > 1) { +- need_fsck_err(c, "directory %llu with multiple hardlinks: %u", +- u.bi_inum, link->count); +- return 0; +- } +- +- if (S_ISDIR(u.bi_mode) && !link->count) { +- need_fsck_err(c, "unreachable directory found (inum %llu)", +- u.bi_inum); +- return 0; +- } +- +- if (!S_ISDIR(u.bi_mode) && link->dir_count) { +- need_fsck_err(c, "non directory with subdirectories (inum %llu)", +- u.bi_inum); +- return 0; +- } +- +- if (!link->count && +- !(u.bi_flags & BCH_INODE_UNLINKED) && +- (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { +- if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", +- u.bi_inum, mode_to_type(u.bi_mode)) == +- FSCK_ERR_IGNORE) +- return 0; +- ++ /* Improved directory structure pass will catch this: */ ++ if (fsck_err_on(!nlink, c, ++ "unreachable inode %llu not marked as unlinked (type %u)", ++ u.bi_inum, mode_to_type(u.bi_mode))) { + ret = reattach_inode(c, lostfound_inode, u.bi_inum); + if (ret) + return ret; + +- link->count = 1; +- real_i_nlink = nlink_bias(u.bi_mode) + link->dir_count; +- goto set_i_nlink; +- } +- +- if (i_nlink < link->count) { +- if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", +- u.bi_inum, i_nlink, link->count, +- mode_to_type(u.bi_mode)) == FSCK_ERR_IGNORE) +- return 0; +- goto set_i_nlink; +- } +- +- if (i_nlink != real_i_nlink && +- c->sb.clean) { +- if (fsck_err(c, "filesystem marked clean, " +- "but inode %llu has wrong i_nlink " +- "(type %u i_nlink %u, should be %u)", +- u.bi_inum, mode_to_type(u.bi_mode), +- i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) +- return 0; +- goto set_i_nlink; +- } +- +- if (i_nlink != real_i_nlink && +- (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { +- if (fsck_err(c, "inode %llu has wrong i_nlink " +- "(type %u i_nlink %u, should be %u)", +- u.bi_inum, mode_to_type(u.bi_mode), +- i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) +- return 0; +- goto set_i_nlink; ++ nlink = 1; + } + +- if (real_i_nlink && i_nlink != real_i_nlink) +- bch_verbose(c, "setting inode %llu nlink from %u to %u", +- u.bi_inum, i_nlink, real_i_nlink); +-set_i_nlink: +- if (i_nlink != real_i_nlink) { ++ if (fsck_err_on(bch2_inode_nlink_get(&u) != nlink, c, ++ "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", ++ u.bi_inum, mode_to_type(u.bi_mode), ++ bch2_inode_nlink_get(&u), nlink)) { + struct bkey_inode_buf p; + +- bch2_inode_nlink_set(&u, real_i_nlink); ++ if (nlink > 1) ++ u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; ++ ++ bch2_inode_nlink_set(&u, nlink); + bch2_inode_pack(c, &p, &u); + p.inode.k.p = iter->pos; + +@@ -1331,66 +1267,33 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- struct nlink *link, zero_links = { 0, 0 }; +- struct genradix_iter nlinks_iter; +- int ret = 0, ret2 = 0; +- u64 nlinks_pos; ++ struct nlink *link; ++ int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, +- POS(0, range_start), 0); +- nlinks_iter = genradix_iter_init(links, 0); +- +- while ((k = bch2_btree_iter_peek(iter)).k && +- !(ret2 = bkey_err(k)) && +- iter->pos.offset < range_end) { +-peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); +- +- if (!link && (!k.k || iter->pos.offset >= range_end)) ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, ++ POS(0, range_start), 0, k, ret) { ++ if (!k.k || k.k->p.offset >= range_end) + break; + +- nlinks_pos = range_start + nlinks_iter.pos; +- +- if 
(link && nlinks_pos < iter->pos.offset) { +- /* Should have been caught by dirents pass: */ +- need_fsck_err_on(link->count, c, +- "missing inode %llu (nlink %u)", +- nlinks_pos, link->count); +- genradix_iter_advance(&nlinks_iter, links); +- goto peek_nlinks; +- } +- +- if (!link || nlinks_pos > iter->pos.offset) +- link = &zero_links; +- +- if (k.k && k.k->type == KEY_TYPE_inode) { +- ret = check_inode_nlink(&trans, lostfound_inode, iter, +- bkey_s_c_to_inode(k), link); +- BUG_ON(ret == -EINTR); +- if (ret) +- break; +- } else { +- /* Should have been caught by dirents pass: */ +- need_fsck_err_on(link->count, c, +- "missing inode %llu (nlink %u)", +- nlinks_pos, link->count); +- } ++ if (k.k->type != KEY_TYPE_inode) ++ continue; + +- if (nlinks_pos == iter->pos.offset) +- genradix_iter_advance(&nlinks_iter, links); ++ link = genradix_ptr(links, k.k->p.offset - range_start); ++ ret = check_inode_nlink(&trans, lostfound_inode, iter, ++ bkey_s_c_to_inode(k), link ? link->count : 0); ++ if (ret) ++ break; + +- bch2_btree_iter_advance(iter); +- bch2_trans_cond_resched(&trans); + } +-fsck_err: + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + +- if (ret2) +- bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); ++ if (ret) ++ bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + +- return ret ?: ret2; ++ return ret; + } + + noinline_for_stack +-- +cgit v1.2.3 + + +From 4e52e51523a571381f380aefde958ac29c69621e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 12 Apr 2021 14:00:07 -0400 +Subject: bcachefs: Fix bch2_trans_relock() + +The patch that changed bch2_trans_relock() to not look at iter->uptodate +also tried to add an optimization by only having it relock +btree_iter_key() iterators (iterators that are live or have been marked +as keep). But, this wasn't thought through - this pops internal iterator +assertions because on transaction restart, when we're traversing +iterators we traverse all iterators marked as linked, and having +bch2_trans_relock() skip some of those mean that it can skil the +iterator that bch2_btree_iter_traverse_one() is currently traversing. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 2c321a88da8b..d4105a9c7650 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -472,8 +472,7 @@ bool bch2_trans_relock(struct btree_trans *trans) + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) +- if (btree_iter_keep(trans, iter) && +- !bch2_btree_iter_relock(iter, true)) { ++ if (!bch2_btree_iter_relock(iter, true)) { + trace_trans_restart_relock(trans->ip); + return false; + } +-- +cgit v1.2.3 + + +From ff0fc8608552f88e30231b2b59f9b5d2251ea787 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 9 Apr 2021 03:25:37 -0400 +Subject: bcachefs: Fix fsck to not use bch2_link_trans() + +bch2_link_trans() uses the btree key cache for inode updates, and fsck +isn't supposed to - also, it's not really what we want for reattaching +unreachable inodes anyways. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 63 +++++++++++++++++++++++++++++++++++++++++++----------- + 1 file changed, 51 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 6bc3f2f09e36..d7d26fb40432 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -128,22 +128,63 @@ static int remove_dirent(struct btree_trans *trans, + __remove_dirent(trans, dirent)); + } + +-static int reattach_inode(struct bch_fs *c, +- struct bch_inode_unpacked *lostfound_inode, +- u64 inum) ++static int __reattach_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *lostfound, ++ u64 inum) + { +- struct bch_inode_unpacked dir_u, inode_u; ++ struct bch_hash_info dir_hash = ++ bch2_hash_info_init(trans->c, lostfound); ++ struct btree_iter *dir_iter = NULL, *inode_iter = NULL; ++ struct bch_inode_unpacked inode_u; + char name_buf[20]; + struct qstr name; ++ u64 dir_offset = 0; + int ret; + + snprintf(name_buf, sizeof(name_buf), "%llu", inum); + name = (struct qstr) QSTR(name_buf); + +- ret = bch2_trans_do(c, NULL, NULL, +- BTREE_INSERT_LAZY_RW, +- bch2_link_trans(&trans, lostfound_inode->bi_inum, +- inum, &dir_u, &inode_u, &name)); ++ inode_iter = bch2_inode_peek(trans, &inode_u, inum, 0); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto err; ++ ++ if (S_ISDIR(inode_u.bi_mode)) { ++ lostfound->bi_nlink++; ++ ++ ret = write_inode(trans, lostfound, U32_MAX); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_dirent_create(trans, lostfound->bi_inum, &dir_hash, ++ mode_to_type(inode_u.bi_mode), ++ &name, inum, &dir_offset, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ goto err; ++ ++ inode_u.bi_dir = lostfound->bi_inum; ++ inode_u.bi_dir_offset = dir_offset; ++ ++ ret = write_inode(trans, &inode_u, U32_MAX); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_put(trans, dir_iter); ++ bch2_trans_iter_put(trans, inode_iter); ++ return ret; ++} ++ ++static int reattach_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *lostfound, ++ u64 inum) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, ++ __reattach_inode(trans, lostfound, inum)); + if (ret) + bch_err(c, "error %i reattaching inode %llu", ret, inum); + +@@ -1105,9 +1146,7 @@ retry: + if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, + "unreachable directory found (inum %llu)", + k.k->p.offset)) { +- bch2_trans_unlock(&trans); +- +- ret = reattach_inode(c, lostfound_inode, k.k->p.offset); ++ ret = reattach_inode(&trans, lostfound_inode, k.k->p.offset); + if (ret) { + goto err; + } +@@ -1227,7 +1266,7 @@ static int check_inode_nlink(struct btree_trans *trans, + if (fsck_err_on(!nlink, c, + "unreachable inode %llu not marked as unlinked (type %u)", + u.bi_inum, mode_to_type(u.bi_mode))) { +- ret = reattach_inode(c, lostfound_inode, u.bi_inum); ++ ret = reattach_inode(trans, lostfound_inode, u.bi_inum); + if (ret) + return ret; + +-- +cgit v1.2.3 + + +From e5a32f3dd1b0d8a7e108c8a645706866118cd50c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 7 Apr 2021 03:11:07 -0400 +Subject: bcachefs: Improved check_directory_structure() + +Now that we have inode backpointers, we can simplify checking directory +structure: instead of doing a DFS from the filesystem root and then +checking if we found everything, we can iterate over every inode and see +if we can go up until we get to the root. + +This patch also has a number of fixes and simplifications for the inode +backpointer checks. 
Also, it turns out we don't actually need the +BCH_INODE_BACKPTR_UNTRUSTED flag. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update.h | 9 +- + fs/bcachefs/fs-common.c | 8 +- + fs/bcachefs/fsck.c | 398 ++++++++++++++++++++------------------------- + fs/bcachefs/inode.c | 31 +--- + fs/bcachefs/inode.h | 4 - + 5 files changed, 193 insertions(+), 257 deletions(-) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 4ce12ae29a55..0c7caa7e91a0 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -103,13 +103,12 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + return __bch2_trans_commit(trans); + } + +-#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++#define lockrestart_do(_trans, _do) \ + ({ \ + int _ret; \ + \ + while (1) { \ +- _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ +- (_journal_seq), (_flags)); \ ++ _ret = (_do); \ + if (_ret != -EINTR) \ + break; \ + bch2_trans_reset(_trans, 0); \ +@@ -118,6 +117,10 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + _ret; \ + }) + ++#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_flags))) ++ + #define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ + ({ \ + struct btree_trans trans; \ +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 281a6135e599..34d69c3f6680 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -110,8 +110,6 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + inode_u->bi_ctime = now; + bch2_inode_nlink_inc(inode_u); + +- inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; +- + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); + ret = PTR_ERR_OR_ZERO(dir_iter); + if (ret) +@@ -175,6 +173,12 @@ int bch2_unlink_trans(struct btree_trans *trans, + if (ret) + goto err; + ++ if (inode_u->bi_dir == k.k->p.inode && ++ inode_u->bi_dir_offset == k.k->p.offset) { ++ inode_u->bi_dir = 0; ++ inode_u->bi_dir_offset = 0; ++ } ++ + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); + bch2_inode_nlink_dec(inode_u); +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index d7d26fb40432..fa1922cb5c87 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -59,7 +59,7 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr, + ? 
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + : -ENOENT; + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_free(trans, iter); + return ret; + } + +@@ -134,27 +134,26 @@ static int __reattach_inode(struct btree_trans *trans, + { + struct bch_hash_info dir_hash = + bch2_hash_info_init(trans->c, lostfound); +- struct btree_iter *dir_iter = NULL, *inode_iter = NULL; + struct bch_inode_unpacked inode_u; + char name_buf[20]; + struct qstr name; + u64 dir_offset = 0; ++ u32 snapshot; + int ret; + + snprintf(name_buf, sizeof(name_buf), "%llu", inum); + name = (struct qstr) QSTR(name_buf); + +- inode_iter = bch2_inode_peek(trans, &inode_u, inum, 0); +- ret = PTR_ERR_OR_ZERO(inode_iter); ++ ret = lookup_inode(trans, inum, &inode_u, &snapshot); + if (ret) +- goto err; ++ return ret; + + if (S_ISDIR(inode_u.bi_mode)) { + lostfound->bi_nlink++; + + ret = write_inode(trans, lostfound, U32_MAX); + if (ret) +- goto err; ++ return ret; + } + + ret = bch2_dirent_create(trans, lostfound->bi_inum, &dir_hash, +@@ -162,18 +161,12 @@ static int __reattach_inode(struct btree_trans *trans, + &name, inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) +- goto err; ++ return ret; + + inode_u.bi_dir = lostfound->bi_inum; + inode_u.bi_dir_offset = dir_offset; + +- ret = write_inode(trans, &inode_u, U32_MAX); +- if (ret) +- goto err; +-err: +- bch2_trans_iter_put(trans, dir_iter); +- bch2_trans_iter_put(trans, inode_iter); +- return ret; ++ return write_inode(trans, &inode_u, U32_MAX); + } + + static int reattach_inode(struct btree_trans *trans, +@@ -191,6 +184,30 @@ static int reattach_inode(struct btree_trans *trans, + return ret; + } + ++static int remove_backpointer(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, ++ POS(inode->bi_dir, inode->bi_dir_offset), 0); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ if (k.k->type != KEY_TYPE_dirent) { ++ ret = -ENOENT; ++ goto out; ++ } ++ ++ ret = remove_dirent(trans, bkey_s_c_to_dirent(k)); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ + struct inode_walker { + bool first_this_inode; + bool have_inode; +@@ -420,26 +437,18 @@ static int check_inode(struct btree_trans *trans, + do_update = true; + } + +- if (!S_ISDIR(u.bi_mode) && +- u.bi_nlink && +- !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && +- (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, +- "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") || +- c->opts.version_upgrade)) { +- u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; ++ if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { ++ u.bi_dir = 0; ++ u.bi_dir_offset = 0; ++ u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; + do_update = true; + } + + if (do_update) { +- struct bkey_inode_buf p; +- +- bch2_inode_pack(c, &p, &u); +- p.inode.k.p = iter->pos; +- + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); ++ bch2_inode_write(trans, iter, &u)); + if (ret) + bch_err(c, "error in fsck: error %i " + "updating inode", ret); +@@ -704,7 +713,8 @@ retry: + mode_to_type(w.inode.bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { +- ret = bch2_btree_delete_at(&trans, iter, 0); ++ ret = lockrestart_do(&trans, ++ bch2_btree_delete_at(&trans, iter, 0)); + if (ret) + goto err; + goto next; +@@ -751,6 +761,16 @@ retry: + if 
(!have_target) + goto next; + ++ if (!target.bi_dir && ++ !target.bi_dir_offset) { ++ target.bi_dir = k.k->p.inode; ++ target.bi_dir_offset = k.k->p.offset; ++ ++ ret = write_inode(&trans, &target, target_snapshot); ++ if (ret) ++ goto err; ++ } ++ + if (!inode_backpointer_matches(d, &target)) { + ret = inode_backpointer_exists(&trans, &target); + if (ret < 0) +@@ -758,52 +778,47 @@ retry: + + backpointer_exists = ret; + ret = 0; +- } + +- if (fsck_err_on(S_ISDIR(target.bi_mode) && +- !inode_backpointer_matches(d, &target) && +- backpointer_exists, c, +- "directory %llu with multiple links", +- target.bi_inum)) { +- ret = remove_dirent(&trans, d); +- if (ret) +- goto err; +- continue; +- } +- +- if (!inode_backpointer_matches(d, &target) && +- (S_ISDIR(target.bi_mode) || !target.bi_nlink)) { +- if (backpointer_exists) { +- if (!fsck_err(c, "inode %llu has multiple links but i_nlink 0", +- d_inum)) +- goto check_type; ++ if (fsck_err_on(S_ISDIR(target.bi_mode) && ++ backpointer_exists, c, ++ "directory %llu with multiple links", ++ target.bi_inum)) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } + ++ if (fsck_err_on(backpointer_exists && ++ !target.bi_nlink, c, ++ "inode %llu has multiple links but i_nlink 0", ++ d_inum)) { + target.bi_nlink++; +- target.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; +- } else { +- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers && +- !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && +- !fsck_err(c, "inode %llu has wrong backpointer:\n" +- "got %llu:%llu\n" +- "should be %llu:%llu", +- d_inum, +- target.bi_dir, +- target.bi_dir_offset, +- k.k->p.inode, +- k.k->p.offset)) +- goto check_type; ++ target.bi_flags &= ~BCH_INODE_UNLINKED; ++ ++ ret = write_inode(&trans, &target, target_snapshot); ++ if (ret) ++ goto err; ++ } + ++ if (fsck_err_on(!backpointer_exists, c, ++ "inode %llu has wrong backpointer:\n" ++ "got %llu:%llu\n" ++ "should be %llu:%llu", ++ d_inum, ++ target.bi_dir, ++ target.bi_dir_offset, ++ k.k->p.inode, ++ k.k->p.offset)) { + target.bi_dir = k.k->p.inode; + target.bi_dir_offset = k.k->p.offset; +- target.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; +- } + +- ret = write_inode(&trans, &target, target_snapshot); +- if (ret) +- goto err; +- continue; ++ ret = write_inode(&trans, &target, target_snapshot); ++ if (ret) ++ goto err; ++ } + } +-check_type: ++ + if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, + "incorrect d_type: should be %u:\n%s", + mode_to_type(target.bi_mode), +@@ -900,13 +915,13 @@ fsck_err: + static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) + { + struct bkey_inode_buf packed; ++ u32 snapshot; + int ret; + + bch_verbose(c, "checking root directory"); + + ret = bch2_trans_do(c, NULL, NULL, 0, +- __bch2_inode_find_by_inum_trans(&trans, BCACHEFS_ROOT_INO, +- root_inode, 0)); ++ lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot)); + if (ret && ret != -ENOENT) + return ret; + +@@ -942,6 +957,7 @@ static int check_lostfound(struct bch_fs *c, + struct bch_hash_info root_hash_info = + bch2_hash_info_init(c, root_inode); + u64 inum; ++ u32 snapshot; + int ret; + + bch_verbose(c, "checking lost+found"); +@@ -954,7 +970,7 @@ static int check_lostfound(struct bch_fs *c, + } + + ret = bch2_trans_do(c, NULL, NULL, 0, +- __bch2_inode_find_by_inum_trans(&trans, inum, lostfound_inode, 0)); ++ lookup_inode(&trans, inum, lostfound_inode, &snapshot)); + if (ret && ret != -ENOENT) + return ret; + +@@ -984,32 +1000,12 @@ create_lostfound: + return 
ret; + } + +-typedef GENRADIX(unsigned long) inode_bitmap; +- +-static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr) +-{ +- unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG); +- return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false; +-} +- +-static inline int inode_bitmap_set(inode_bitmap *b, size_t nr) +-{ +- unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL); +- +- if (!w) +- return -ENOMEM; +- +- *w |= 1UL << (nr & (BITS_PER_LONG - 1)); +- return 0; +-} +- + struct pathbuf { + size_t nr; + size_t size; + + struct pathbuf_entry { + u64 inum; +- u64 offset; + } *entries; + }; + +@@ -1020,8 +1016,9 @@ static int path_down(struct pathbuf *p, u64 inum) + void *n = krealloc(p->entries, + new_size * sizeof(p->entries[0]), + GFP_KERNEL); +- if (!n) ++ if (!n) { + return -ENOMEM; ++ } + + p->entries = n; + p->size = new_size; +@@ -1029,149 +1026,119 @@ static int path_down(struct pathbuf *p, u64 inum) + + p->entries[p->nr++] = (struct pathbuf_entry) { + .inum = inum, +- .offset = 0, + }; + return 0; + } + +-noinline_for_stack +-static int check_directory_structure(struct bch_fs *c, +- struct bch_inode_unpacked *lostfound_inode) ++static int check_path(struct btree_trans *trans, ++ struct bch_inode_unpacked *lostfound, ++ struct pathbuf *p, ++ struct bch_inode_unpacked *inode) + { +- inode_bitmap dirs_done; +- struct pathbuf path = { 0, 0, NULL }; +- struct pathbuf_entry *e; +- struct btree_trans trans; +- struct btree_iter *iter; +- struct bkey_s_c k; +- struct bkey_s_c_dirent dirent; +- bool had_unreachable; +- u64 d_inum; ++ struct bch_fs *c = trans->c; ++ u32 snapshot; ++ size_t i; + int ret = 0; + +- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ p->nr = 0; + +- bch_verbose(c, "checking directory structure"); +- +- /* DFS: */ +-restart_dfs: +- genradix_init(&dirs_done); +- had_unreachable = false; +- +- ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); +- if (ret) { +- bch_err(c, "memory allocation failure in inode_bitmap_set()"); +- goto err; +- } +- +- ret = path_down(&path, BCACHEFS_ROOT_INO); +- if (ret) +- goto err; +- +- while (path.nr) { +-next: +- e = &path.entries[path.nr - 1]; +- +- if (e->offset == U64_MAX) +- goto up; +- +- for_each_btree_key(&trans, iter, BTREE_ID_dirents, +- POS(e->inum, e->offset + 1), 0, k, ret) { +- if (k.k->p.inode != e->inum) +- break; +- +- e->offset = k.k->p.offset; +- +- if (k.k->type != KEY_TYPE_dirent) +- continue; ++ while (inode->bi_inum != BCACHEFS_ROOT_INO) { ++ ret = lockrestart_do(trans, ++ inode_backpointer_exists(trans, inode)); ++ if (ret < 0) ++ break; + +- dirent = bkey_s_c_to_dirent(k); ++ if (!ret) { ++ if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu", ++ inode->bi_inum, ++ mode_to_type(inode->bi_mode), ++ inode->bi_nlink, ++ inode->bi_dir, ++ inode->bi_dir_offset)) ++ ret = reattach_inode(trans, lostfound, inode->bi_inum); ++ break; ++ } ++ ret = 0; + +- if (dirent.v->d_type != DT_DIR) +- continue; ++ if (!S_ISDIR(inode->bi_mode)) ++ break; + +- d_inum = le64_to_cpu(dirent.v->d_inum); ++ ret = path_down(p, inode->bi_inum); ++ if (ret) { ++ bch_err(c, "memory allocation failure"); ++ return ret; ++ } + +- if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, +- "directory %llu has multiple hardlinks", +- d_inum)) { +- ret = remove_dirent(&trans, dirent); +- if (ret) +- goto err; ++ for (i = 0; i < p->nr; i++) { ++ if (inode->bi_dir != p->entries[i].inum) + continue; +- } + +- ret = inode_bitmap_set(&dirs_done, d_inum); +- if (ret) { +- bch_err(c, "memory 
allocation failure in inode_bitmap_set()"); +- goto err; +- } ++ /* XXX print path */ ++ if (!fsck_err(c, "directory structure loop")) ++ return 0; + +- ret = path_down(&path, d_inum); ++ ret = lockrestart_do(trans, ++ remove_backpointer(trans, inode)); + if (ret) { +- goto err; ++ bch_err(c, "error removing dirent: %i", ret); ++ break; + } + +- ret = bch2_trans_iter_free(&trans, iter); +- if (ret) { +- bch_err(c, "btree error %i in fsck", ret); +- goto err; +- } +- goto next; ++ ret = reattach_inode(trans, lostfound, inode->bi_inum); ++ break; + } +- ret = bch2_trans_iter_free(&trans, iter) ?: ret; ++ ++ ret = lockrestart_do(trans, ++ lookup_inode(trans, inode->bi_dir, inode, &snapshot)); + if (ret) { +- bch_err(c, "btree error %i in fsck", ret); +- goto err; ++ /* Should have been caught in dirents pass */ ++ bch_err(c, "error looking up parent directory: %i", ret); ++ break; + } +-up: +- path.nr--; + } ++fsck_err: ++ if (ret) ++ bch_err(c, "%s: err %i", __func__, ret); ++ return ret; ++} + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS_MIN, 0); +-retry: +- for_each_btree_key_continue(iter, 0, k, ret) { +- if (k.k->type != KEY_TYPE_inode) +- continue; ++/* ++ * Check for unreachable inodes, as well as loops in the directory structure: ++ * After check_dirents(), if an inode backpointer doesn't exist that means it's ++ * unreachable: ++ */ ++static int check_directory_structure(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bch_inode_unpacked u; ++ struct pathbuf path = { 0, 0, NULL }; ++ int ret; + +- if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) +- continue; ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- ret = bch2_empty_dir_trans(&trans, k.k->p.inode); +- if (ret == -EINTR) +- goto retry; +- if (!ret) ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) + continue; + +- if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, +- "unreachable directory found (inum %llu)", +- k.k->p.offset)) { +- ret = reattach_inode(&trans, lostfound_inode, k.k->p.offset); +- if (ret) { +- goto err; +- } +- +- had_unreachable = true; ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); ++ if (ret) { ++ /* Should have been caught earlier in fsck: */ ++ bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); ++ break; + } +- } +- bch2_trans_iter_free(&trans, iter); +- if (ret) +- goto err; + +- if (had_unreachable) { +- bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); +- genradix_free(&dirs_done); +- kfree(path.entries); +- memset(&dirs_done, 0, sizeof(dirs_done)); +- memset(&path, 0, sizeof(path)); +- goto restart_dfs; ++ ret = check_path(&trans, lostfound, &path, &u); ++ if (ret) ++ break; + } +-err: +-fsck_err: +- ret = bch2_trans_exit(&trans) ?: ret; +- genradix_free(&dirs_done); +- kfree(path.entries); +- return ret; ++ bch2_trans_iter_put(&trans, iter); ++ ++ BUG_ON(ret == -EINTR); ++ ++ return bch2_trans_exit(&trans) ?: ret; + } + + struct nlink { +@@ -1254,6 +1221,11 @@ static int check_inode_nlink(struct btree_trans *trans, + if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) + return 0; + ++ if (!nlink) { ++ bch_err(c, "no links found to inode %llu", inode.k->p.offset); ++ return -EINVAL; ++ } ++ + ret = bch2_inode_unpack(inode, &u); + + /* Should never happen, checked by bch2_inode_invalid: */ +@@ -1262,34 +1234,16 @@ static int 
check_inode_nlink(struct btree_trans *trans, + inode.k->p.inode)) + return ret; + +- /* Improved directory structure pass will catch this: */ +- if (fsck_err_on(!nlink, c, +- "unreachable inode %llu not marked as unlinked (type %u)", +- u.bi_inum, mode_to_type(u.bi_mode))) { +- ret = reattach_inode(trans, lostfound_inode, u.bi_inum); +- if (ret) +- return ret; +- +- nlink = 1; +- } +- + if (fsck_err_on(bch2_inode_nlink_get(&u) != nlink, c, + "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", + u.bi_inum, mode_to_type(u.bi_mode), + bch2_inode_nlink_get(&u), nlink)) { +- struct bkey_inode_buf p; +- +- if (nlink > 1) +- u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; +- + bch2_inode_nlink_set(&u, nlink); +- bch2_inode_pack(c, &p, &u); +- p.inode.k.p = iter->pos; + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); ++ bch2_inode_write(trans, iter, &u)); + if (ret) + bch_err(c, "error in fsck: error %i updating inode", ret); + } +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index d4c328397156..dfde5ba3f1b7 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -307,7 +307,7 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, + if (ret) + goto err; + +- ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; ++ ret = k.k->type == KEY_TYPE_inode ? 0 : -ENOENT; + if (ret) + goto err; + +@@ -637,39 +637,18 @@ err: + return ret; + } + +-int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, +- struct bch_inode_unpacked *inode, +- unsigned flags) ++static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode) + { + struct btree_iter *iter; +- struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, +- POS(0, inode_nr), flags); +- k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED +- ? bch2_btree_iter_peek_cached(iter) +- : bch2_btree_iter_peek_slot(iter); +- ret = bkey_err(k); +- if (ret) +- goto err; +- +- ret = k.k->type == KEY_TYPE_inode +- ? 
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) +- : -ENOENT; +-err: ++ iter = bch2_inode_peek(trans, inode, inode_nr, 0); ++ ret = PTR_ERR_OR_ZERO(iter); + bch2_trans_iter_put(trans, iter); + return ret; + } + +-int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, +- struct bch_inode_unpacked *inode) +-{ +- return __bch2_inode_find_by_inum_trans(trans, inode_nr, +- inode, BTREE_ITER_CACHED); +- +-} +- + int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, + struct bch_inode_unpacked *inode) + { +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 23c322d9a85b..558d5464095d 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -74,10 +74,6 @@ struct btree_iter *bch2_inode_create(struct btree_trans *, + + int bch2_inode_rm(struct bch_fs *, u64, bool); + +-int __bch2_inode_find_by_inum_trans(struct btree_trans *, u64, +- struct bch_inode_unpacked *, unsigned); +-int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, +- struct bch_inode_unpacked *); + int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); + + static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) +-- +cgit v1.2.3 + + +From 38361e765764a0a16102828b101ac6346b604952 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 13 Apr 2021 10:26:59 -0400 +Subject: bcachefs: BCH_BEATURE_atomic_nlink is obsolete + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 11 ++--------- + 1 file changed, 2 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 012a08574022..9991a4f67163 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1005,13 +1005,6 @@ int bch2_fs_recovery(struct bch_fs *c) + + } + +- if (!c->sb.clean && +- !(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { +- bch_info(c, "BCH_FEATURE_atomic_nlink not set and filesystem dirty, fsck required"); +- c->opts.fsck = true; +- c->opts.fix_errors = FSCK_OPT_YES; +- } +- + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { + bch_info(c, "alloc_v2 feature bit not set, fsck required"); + c->opts.fsck = true; +@@ -1247,8 +1240,8 @@ use_clean: + } + + if (c->opts.fsck && +- !test_bit(BCH_FS_ERROR, &c->flags)) { +- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; ++ !test_bit(BCH_FS_ERROR, &c->flags) && ++ BCH_SB_HAS_ERRORS(c->disk_sb.sb)) { + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); + write_sb = true; + } +-- +cgit v1.2.3 + + +From 32f3b54011d52a1ae8530b39849b161aa5585654 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 13 Apr 2021 10:30:58 -0400 +Subject: bcachefs: Fix heap overrun in bch2_fs_usage_read() + +oops + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 243f8610fcec..cb63fafcfcb1 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -229,7 +229,7 @@ struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) + percpu_down_read(&c->mark_lock); + + ret = kmalloc(sizeof(struct bch_fs_usage_online) + +- sizeof(u64) + c->replicas.nr, GFP_NOFS); ++ sizeof(u64) * c->replicas.nr, GFP_NOFS); + if (unlikely(!ret)) { + percpu_up_read(&c->mark_lock); + return NULL; +-- +cgit v1.2.3 + + +From 98db68deba23ed5d854801aa8dc96e4b88708674 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 13 Apr 2021 15:00:40 -0400 +Subject: bcachefs: Add the status of bucket gen gc to sysfs + +Signed-off-by: Kent 
Overstreet +--- + fs/bcachefs/bcachefs.h | 3 +++ + fs/bcachefs/btree_gc.c | 7 +++++++ + fs/bcachefs/sysfs.c | 14 ++++++++++++++ + 3 files changed, 24 insertions(+) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 13fafa42153d..4fe162a338f6 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -720,6 +720,9 @@ struct bch_fs { + atomic_t kick_gc; + unsigned long gc_count; + ++ enum btree_id gc_gens_btree; ++ struct bpos gc_gens_pos; ++ + /* + * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] + * has been marked by GC. +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index d0c06fa5198d..e40878a646c1 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1193,6 +1193,8 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { ++ c->gc_gens_pos = iter->pos; ++ + if (gc_btree_gens_key(c, k)) { + bch2_bkey_buf_reassemble(&sk, c, k); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); +@@ -1244,6 +1246,8 @@ int bch2_gc_gens(struct bch_fs *c) + + for (i = 0; i < BTREE_ID_NR; i++) + if ((1 << i) & BTREE_ID_HAS_PTRS) { ++ c->gc_gens_btree = i; ++ c->gc_gens_pos = POS_MIN; + ret = bch2_gc_btree_gens(c, i); + if (ret) { + bch_err(c, "error recalculating oldest_gen: %i", ret); +@@ -1260,6 +1264,9 @@ int bch2_gc_gens(struct bch_fs *c) + up_read(&ca->bucket_lock); + } + ++ c->gc_gens_btree = 0; ++ c->gc_gens_pos = POS_MIN; ++ + c->gc_count++; + err: + up_read(&c->gc_lock); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index dd9b54e0d80b..077f3a8cead7 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -136,6 +136,7 @@ write_attribute(trigger_btree_coalesce); + write_attribute(trigger_gc); + write_attribute(prune_cache); + rw_attribute(btree_gc_periodic); ++rw_attribute(gc_gens_pos); + + read_attribute(uuid); + read_attribute(minor); +@@ -312,6 +313,13 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c + return 0; + } + ++void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); ++ bch2_bpos_to_text(out, c->gc_gens_pos); ++ pr_buf(out, "\n"); ++} ++ + SHOW(bch2_fs) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); +@@ -337,6 +345,11 @@ SHOW(bch2_fs) + + sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + ++ if (attr == &sysfs_gc_gens_pos) { ++ bch2_gc_gens_pos_to_text(&out, c); ++ return out.pos - buf; ++ } ++ + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + + sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); +@@ -566,6 +579,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_trigger_journal_flush, + &sysfs_trigger_btree_coalesce, + &sysfs_trigger_gc, ++ &sysfs_gc_gens_pos, + &sysfs_prune_cache, + + &sysfs_copy_gc_enabled, +-- +cgit v1.2.3 + + +From e40db85ca605eb9d9739558792e02742712ce5a5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 13 Apr 2021 15:10:39 -0400 +Subject: bcachefs: Ensure bucket gen gc completes + +We don't want it to block, if it can't allocate it should just continue +instead of possibly deadlocking. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index e40878a646c1..9dbec355809a 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1181,7 +1181,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_buf sk; +- int ret = 0; ++ int ret = 0, commit_err = 0; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); +@@ -1195,18 +1195,18 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + !(ret = bkey_err(k))) { + c->gc_gens_pos = iter->pos; + +- if (gc_btree_gens_key(c, k)) { ++ if (gc_btree_gens_key(c, k) && !commit_err) { + bch2_bkey_buf_reassemble(&sk, c, k); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + + bch2_trans_update(&trans, iter, sk.k, 0); + +- ret = bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_NOFAIL); +- if (ret == -EINTR) ++ commit_err = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_NOFAIL); ++ if (commit_err == -EINTR) { ++ commit_err = 0; + continue; +- if (ret) { +- break; + } + } + +-- +cgit v1.2.3 + + +From 22db04ef65fa2ae752d399f080103b56f99a0f9d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Apr 2021 12:10:17 -0400 +Subject: bcachefs: Add a perf test for multiple updates per commit + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/tests.c | 37 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 37 insertions(+) + +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 7507b6bcc13f..254e3b314204 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -497,6 +497,42 @@ static int rand_insert(struct bch_fs *c, u64 nr) + return ret; + } + ++static int rand_insert_multi(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct bkey_i_cookie k[8]; ++ int ret = 0; ++ unsigned j; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i += ARRAY_SIZE(k)) { ++ for (j = 0; j < ARRAY_SIZE(k); j++) { ++ bkey_cookie_init(&k[j].k_i); ++ k[j].k.p.offset = test_rand(); ++ k[j].k.p.snapshot = U32_MAX; ++ } ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); ++ if (ret) { ++ bch_err(c, "error in rand_insert_multi: %i", ret); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ + static int rand_lookup(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +@@ -765,6 +801,7 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + if (!strcmp(testname, #_test)) j.fn = _test + + perf_test(rand_insert); ++ perf_test(rand_insert_multi); + perf_test(rand_lookup); + perf_test(rand_mixed); + perf_test(rand_delete); +-- +cgit v1.2.3 + + +From e219fe886d138f6aad4e2cbc574b613b498eb5cb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Apr 2021 12:17:41 -0400 +Subject: bcachefs: Drop old style btree node coalescing + +We have foreground btree node merging now, and any future 
btree node +merging improvements are going to be based off of that code. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 340 ---------------------------------------- + fs/bcachefs/btree_gc.h | 2 - + fs/bcachefs/sysfs.c | 5 - + include/trace/events/bcachefs.h | 37 ----- + 4 files changed, 384 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 9dbec355809a..f742ff8a2ca7 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1273,346 +1273,6 @@ err: + return ret; + } + +-/* Btree coalescing */ +- +-static void recalc_packed_keys(struct btree *b) +-{ +- struct bset *i = btree_bset_first(b); +- struct bkey_packed *k; +- +- memset(&b->nr, 0, sizeof(b->nr)); +- +- BUG_ON(b->nsets != 1); +- +- vstruct_for_each(i, k) +- btree_keys_account_key_add(&b->nr, 0, k); +-} +- +-static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, +- struct btree *old_nodes[GC_MERGE_NODES]) +-{ +- struct btree *parent = btree_node_parent(iter, old_nodes[0]); +- unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; +- unsigned blocks = btree_blocks(c) * 2 / 3; +- struct btree *new_nodes[GC_MERGE_NODES]; +- struct btree_update *as; +- struct keylist keylist; +- struct bkey_format_state format_state; +- struct bkey_format new_format; +- +- memset(new_nodes, 0, sizeof(new_nodes)); +- bch2_keylist_init(&keylist, NULL); +- +- /* Count keys that are not deleted */ +- for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) +- u64s += old_nodes[i]->nr.live_u64s; +- +- nr_old_nodes = nr_new_nodes = i; +- +- /* Check if all keys in @old_nodes could fit in one fewer node */ +- if (nr_old_nodes <= 1 || +- __vstruct_blocks(struct btree_node, c->block_bits, +- DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) +- return; +- +- /* Find a format that all keys in @old_nodes can pack into */ +- bch2_bkey_format_init(&format_state); +- +- /* +- * XXX: this won't correctly take it account the new min/max keys: +- */ +- for (i = 0; i < nr_old_nodes; i++) +- __bch2_btree_calc_format(&format_state, old_nodes[i]); +- +- new_format = bch2_bkey_format_done(&format_state); +- +- /* Check if repacking would make any nodes too big to fit */ +- for (i = 0; i < nr_old_nodes; i++) +- if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { +- trace_btree_gc_coalesce_fail(c, +- BTREE_GC_COALESCE_FAIL_FORMAT_FITS); +- return; +- } +- +- if (bch2_keylist_realloc(&keylist, NULL, 0, +- BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) { +- trace_btree_gc_coalesce_fail(c, +- BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); +- return; +- } +- +- as = bch2_btree_update_start(iter, old_nodes[0]->c.level, +- btree_update_reserve_required(c, parent) + nr_old_nodes, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE); +- if (IS_ERR(as)) { +- trace_btree_gc_coalesce_fail(c, +- BTREE_GC_COALESCE_FAIL_RESERVE_GET); +- bch2_keylist_free(&keylist, NULL); +- return; +- } +- +- trace_btree_gc_coalesce(c, old_nodes[0]); +- +- for (i = 0; i < nr_old_nodes; i++) +- bch2_btree_interior_update_will_free_node(as, old_nodes[i]); +- +- /* Repack everything with @new_format and sort down to one bset */ +- for (i = 0; i < nr_old_nodes; i++) +- new_nodes[i] = +- __bch2_btree_node_alloc_replacement(as, old_nodes[i], +- new_format); +- +- /* +- * Conceptually we concatenate the nodes together and slice them +- * up at different boundaries. 
+- */ +- for (i = nr_new_nodes - 1; i > 0; --i) { +- struct btree *n1 = new_nodes[i]; +- struct btree *n2 = new_nodes[i - 1]; +- +- struct bset *s1 = btree_bset_first(n1); +- struct bset *s2 = btree_bset_first(n2); +- struct bkey_packed *k, *last = NULL; +- +- /* Calculate how many keys from @n2 we could fit inside @n1 */ +- u64s = 0; +- +- for (k = s2->start; +- k < vstruct_last(s2) && +- vstruct_blocks_plus(n1->data, c->block_bits, +- u64s + k->u64s) <= blocks; +- k = bkey_next(k)) { +- last = k; +- u64s += k->u64s; +- } +- +- if (u64s == le16_to_cpu(s2->u64s)) { +- /* n2 fits entirely in n1 */ +- n1->key.k.p = n1->data->max_key = n2->data->max_key; +- +- memcpy_u64s(vstruct_last(s1), +- s2->start, +- le16_to_cpu(s2->u64s)); +- le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); +- +- set_btree_bset_end(n1, n1->set); +- +- six_unlock_write(&n2->c.lock); +- bch2_btree_node_free_never_inserted(c, n2); +- six_unlock_intent(&n2->c.lock); +- +- memmove(new_nodes + i - 1, +- new_nodes + i, +- sizeof(new_nodes[0]) * (nr_new_nodes - i)); +- new_nodes[--nr_new_nodes] = NULL; +- } else if (u64s) { +- /* move part of n2 into n1 */ +- n1->key.k.p = n1->data->max_key = +- bkey_unpack_pos(n1, last); +- +- n2->data->min_key = bpos_successor(n1->data->max_key); +- +- memcpy_u64s(vstruct_last(s1), +- s2->start, u64s); +- le16_add_cpu(&s1->u64s, u64s); +- +- memmove(s2->start, +- vstruct_idx(s2, u64s), +- (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); +- s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); +- +- set_btree_bset_end(n1, n1->set); +- set_btree_bset_end(n2, n2->set); +- } +- } +- +- for (i = 0; i < nr_new_nodes; i++) { +- struct btree *n = new_nodes[i]; +- +- recalc_packed_keys(n); +- btree_node_reset_sib_u64s(n); +- +- bch2_btree_build_aux_trees(n); +- +- bch2_btree_update_add_new_node(as, n); +- six_unlock_write(&n->c.lock); +- +- bch2_btree_node_write(c, n, SIX_LOCK_intent); +- } +- +- /* +- * The keys for the old nodes get deleted. We don't want to insert keys +- * that compare equal to the keys for the new nodes we'll also be +- * inserting - we can't because keys on a keylist must be strictly +- * greater than the previous keys, and we also don't need to since the +- * key for the new node will serve the same purpose (overwriting the key +- * for the old node). 
+- */ +- for (i = 0; i < nr_old_nodes; i++) { +- struct bkey_i delete; +- unsigned j; +- +- for (j = 0; j < nr_new_nodes; j++) +- if (!bpos_cmp(old_nodes[i]->key.k.p, +- new_nodes[j]->key.k.p)) +- goto next; +- +- bkey_init(&delete.k); +- delete.k.p = old_nodes[i]->key.k.p; +- bch2_keylist_add_in_order(&keylist, &delete); +-next: +- i = i; +- } +- +- /* +- * Keys for the new nodes get inserted: bch2_btree_insert_keys() only +- * does the lookup once and thus expects the keys to be in sorted order +- * so we have to make sure the new keys are correctly ordered with +- * respect to the deleted keys added in the previous loop +- */ +- for (i = 0; i < nr_new_nodes; i++) +- bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); +- +- /* Insert the newly coalesced nodes */ +- bch2_btree_insert_node(as, parent, iter, &keylist, 0); +- +- BUG_ON(!bch2_keylist_empty(&keylist)); +- +- BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); +- +- bch2_btree_iter_node_replace(iter, new_nodes[0]); +- +- for (i = 0; i < nr_new_nodes; i++) +- bch2_btree_update_get_open_buckets(as, new_nodes[i]); +- +- /* Free the old nodes and update our sliding window */ +- for (i = 0; i < nr_old_nodes; i++) { +- bch2_btree_node_free_inmem(c, old_nodes[i], iter); +- +- /* +- * the index update might have triggered a split, in which case +- * the nodes we coalesced - the new nodes we just created - +- * might not be sibling nodes anymore - don't add them to the +- * sliding window (except the first): +- */ +- if (!i) { +- old_nodes[i] = new_nodes[i]; +- } else { +- old_nodes[i] = NULL; +- } +- } +- +- for (i = 0; i < nr_new_nodes; i++) +- six_unlock_intent(&new_nodes[i]->c.lock); +- +- bch2_btree_update_done(as); +- bch2_keylist_free(&keylist, NULL); +-} +- +-static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) +-{ +- struct btree_trans trans; +- struct btree_iter *iter; +- struct btree *b; +- bool kthread = (current->flags & PF_KTHREAD) != 0; +- unsigned i; +- int ret = 0; +- +- /* Sliding window of adjacent btree nodes */ +- struct btree *merge[GC_MERGE_NODES]; +- u32 lock_seq[GC_MERGE_NODES]; +- +- bch2_trans_init(&trans, c, 0, 0); +- +- /* +- * XXX: We don't have a good way of positively matching on sibling nodes +- * that have the same parent - this code works by handling the cases +- * where they might not have the same parent, and is thus fragile. Ugh. +- * +- * Perhaps redo this to use multiple linked iterators? 
+- */ +- memset(merge, 0, sizeof(merge)); +- +- __for_each_btree_node(&trans, iter, btree_id, POS_MIN, +- BTREE_MAX_DEPTH, 0, +- BTREE_ITER_PREFETCH, b) { +- memmove(merge + 1, merge, +- sizeof(merge) - sizeof(merge[0])); +- memmove(lock_seq + 1, lock_seq, +- sizeof(lock_seq) - sizeof(lock_seq[0])); +- +- merge[0] = b; +- +- for (i = 1; i < GC_MERGE_NODES; i++) { +- if (!merge[i] || +- !six_relock_intent(&merge[i]->c.lock, lock_seq[i])) +- break; +- +- if (merge[i]->c.level != merge[0]->c.level) { +- six_unlock_intent(&merge[i]->c.lock); +- break; +- } +- } +- memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); +- +- bch2_coalesce_nodes(c, iter, merge); +- +- for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { +- lock_seq[i] = merge[i]->c.lock.state.seq; +- six_unlock_intent(&merge[i]->c.lock); +- } +- +- lock_seq[0] = merge[0]->c.lock.state.seq; +- +- if (kthread && kthread_should_stop()) { +- ret = -ESHUTDOWN; +- break; +- } +- +- bch2_trans_cond_resched(&trans); +- +- /* +- * If the parent node wasn't relocked, it might have been split +- * and the nodes in our sliding window might not have the same +- * parent anymore - blow away the sliding window: +- */ +- if (btree_iter_node(iter, iter->level + 1) && +- !btree_node_intent_locked(iter, iter->level + 1)) +- memset(merge + 1, 0, +- (GC_MERGE_NODES - 1) * sizeof(merge[0])); +- } +- bch2_trans_iter_put(&trans, iter); +- +- return bch2_trans_exit(&trans) ?: ret; +-} +- +-/** +- * bch_coalesce - coalesce adjacent nodes with low occupancy +- */ +-void bch2_coalesce(struct bch_fs *c) +-{ +- enum btree_id id; +- +- down_read(&c->gc_lock); +- trace_gc_coalesce_start(c); +- +- for (id = 0; id < BTREE_ID_NR; id++) { +- int ret = c->btree_roots[id].b +- ? bch2_coalesce_btree(c, id) +- : 0; +- +- if (ret) { +- if (ret != -ESHUTDOWN) +- bch_err(c, "btree coalescing failed: %d", ret); +- return; +- } +- } +- +- trace_gc_coalesce_end(c); +- up_read(&c->gc_lock); +-} +- + static int bch2_gc_thread(void *arg) + { + struct bch_fs *c = arg; +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index 44b7d121610f..868723a30b15 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -4,8 +4,6 @@ + + #include "btree_types.h" + +-void bch2_coalesce(struct bch_fs *); +- + int bch2_gc(struct bch_fs *, bool, bool); + int bch2_gc_gens(struct bch_fs *); + void bch2_gc_thread_stop(struct bch_fs *); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 077f3a8cead7..21ef7719cf55 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -132,7 +132,6 @@ do { \ + } while (0) + + write_attribute(trigger_journal_flush); +-write_attribute(trigger_btree_coalesce); + write_attribute(trigger_gc); + write_attribute(prune_cache); + rw_attribute(btree_gc_periodic); +@@ -478,9 +477,6 @@ STORE(bch2_fs) + if (attr == &sysfs_trigger_journal_flush) + bch2_journal_meta(&c->journal); + +- if (attr == &sysfs_trigger_btree_coalesce) +- bch2_coalesce(c); +- + if (attr == &sysfs_trigger_gc) { + /* + * Full gc is currently incompatible with btree key cache: +@@ -577,7 +573,6 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_extent_migrate_raced, + + &sysfs_trigger_journal_flush, +- &sysfs_trigger_btree_coalesce, + &sysfs_trigger_gc, + &sysfs_gc_gens_pos, + &sysfs_prune_cache, +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 0a730e258d95..0018d381abdd 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -353,28 +353,6 @@ DEFINE_EVENT(btree_node, 
btree_set_root, + + /* Garbage collection */ + +-DEFINE_EVENT(btree_node, btree_gc_coalesce, +- TP_PROTO(struct bch_fs *c, struct btree *b), +- TP_ARGS(c, b) +-); +- +-TRACE_EVENT(btree_gc_coalesce_fail, +- TP_PROTO(struct bch_fs *c, int reason), +- TP_ARGS(c, reason), +- +- TP_STRUCT__entry( +- __field(u8, reason ) +- __array(char, uuid, 16 ) +- ), +- +- TP_fast_assign( +- __entry->reason = reason; +- memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); +- ), +- +- TP_printk("%pU: %u", __entry->uuid, __entry->reason) +-); +- + DEFINE_EVENT(btree_node, btree_gc_rewrite_node, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +@@ -395,16 +373,6 @@ DEFINE_EVENT(bch_fs, gc_end, + TP_ARGS(c) + ); + +-DEFINE_EVENT(bch_fs, gc_coalesce_start, +- TP_PROTO(struct bch_fs *c), +- TP_ARGS(c) +-); +- +-DEFINE_EVENT(bch_fs, gc_coalesce_end, +- TP_PROTO(struct bch_fs *c), +- TP_ARGS(c) +-); +- + DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +@@ -453,11 +421,6 @@ TRACE_EVENT(invalidate, + MINOR(__entry->dev), __entry->offset) + ); + +-DEFINE_EVENT(bch_fs, rescale_prios, +- TP_PROTO(struct bch_fs *c), +- TP_ARGS(c) +-); +- + DECLARE_EVENT_CLASS(bucket_alloc, + TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), + TP_ARGS(ca, reserve), +-- +cgit v1.2.3 + + +From c8ff71e1f5e3abedc25ea97c29302274f134baee Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Apr 2021 13:29:34 -0400 +Subject: bcachefs: Better iterator picking + +Avoid cloning iterators if we don't have to. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index d4105a9c7650..590b79f6d6b9 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2017,10 +2017,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + if (iter->btree_id != btree_id) + continue; + +- if (best && +- bkey_cmp(bpos_diff(best->real_pos, pos), +- bpos_diff(iter->real_pos, pos)) < 0) +- continue; ++ if (best) { ++ int cmp = bkey_cmp(bpos_diff(best->real_pos, pos), ++ bpos_diff(iter->real_pos, pos)); ++ ++ if (cmp < 0 || ++ ((cmp == 0 && btree_iter_keep(trans, iter)))) ++ continue; ++ } + + best = iter; + } +-- +cgit v1.2.3 + + +From f26f714ff2abc1f76abcc245bcd3566156bce3cf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Apr 2021 17:45:31 -0400 +Subject: bcachefs: Don't call bch2_btree_iter_traverse() unnecessarily + +If we let bch2_trans_commit() do it, it'll traverse iterators in sorted +order which means we'll get fewer lock restarts. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 8c8a584f82ae..7707979158d7 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1153,8 +1153,7 @@ int __bch2_btree_insert(struct btree_trans *trans, + iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + +- ret = bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(trans, iter, k, 0); ++ ret = bch2_trans_update(trans, iter, k, 0); + bch2_trans_iter_put(trans, iter); + return ret; + } +-- +cgit v1.2.3 + + +From ca8c3defc08854245b16d940b8e88657e3cae2e6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Apr 2021 20:22:10 -0400 +Subject: bcachefs: Fix bch2_gc_done() error messages + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index f742ff8a2ca7..cc17da8b7a1a 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -828,7 +828,7 @@ static int bch2_gc_done(struct bch_fs *c, + if (dst->b[b].mark._f != src->b[b].mark._f) { \ + if (verify) \ + fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ +- ": got %u, should be %u", i, b, \ ++ ": got %u, should be %u", dev, b, \ + dst->b[b].mark.gen, \ + bch2_data_types[dst->b[b].mark.data_type],\ + dst->b[b].mark._f, src->b[b].mark._f); \ +@@ -836,7 +836,7 @@ static int bch2_gc_done(struct bch_fs *c, + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } + #define copy_dev_field(_f, _msg, ...) \ +- copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) ++ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) + #define copy_fs_field(_f, _msg, ...) \ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + +-- +cgit v1.2.3 + + +From e7432f2b48ddb805df7edb03a393902e47f39fea Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Apr 2021 22:15:55 -0400 +Subject: bcachefs: Fix journal_reclaim_wait_done() + +Can't run arbitrary code inside a wait_event() conditional, due to +task state being weird... 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 24 +++++------------------- + 1 file changed, 5 insertions(+), 19 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 7707979158d7..db5702ca6e0c 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -629,25 +629,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + static int journal_reclaim_wait_done(struct bch_fs *c) + { +- int ret; +- +- ret = bch2_journal_error(&c->journal); +- if (ret) +- return ret; +- +- ret = !bch2_btree_key_cache_must_wait(c); +- if (ret) +- return ret; +- +- journal_reclaim_kick(&c->journal); +- +- if (mutex_trylock(&c->journal.reclaim_lock)) { +- ret = bch2_journal_reclaim(&c->journal); +- mutex_unlock(&c->journal.reclaim_lock); +- } ++ int ret = bch2_journal_error(&c->journal) ?: ++ !bch2_btree_key_cache_must_wait(c); + + if (!ret) +- ret = !bch2_btree_key_cache_must_wait(c); ++ journal_reclaim_kick(&c->journal); + return ret; + } + +@@ -735,8 +721,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); + +- wait_event(c->journal.reclaim_wait, +- (ret = journal_reclaim_wait_done(c))); ++ wait_event_freezable(c->journal.reclaim_wait, ++ (ret = journal_reclaim_wait_done(c))); + if (ret < 0) + return ret; + +-- +cgit v1.2.3 + + +From f122c5175f37a45ba95ce6cf39954f6ecdc12a1a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Apr 2021 13:26:15 -0400 +Subject: bcachefs: Improve bch2_btree_iter_traverse_all() + +By changing it to upgrade iterators to intent locks to avoid lock +restarts we can simplify __bch2_btree_node_lock() quite a bit - this +fixes a probable bug where it could potentially drop a lock on an +unrelated error but still succeed instead of causing a transaction +restart. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 88 ++++++++++++++++++----------------------- + fs/bcachefs/btree_iter.h | 2 +- + include/trace/events/bcachefs.h | 44 ++++++++++++++++++--- + 3 files changed, 77 insertions(+), 57 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 590b79f6d6b9..c19f165ffc91 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -268,13 +268,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { +- linked->locks_want = max_t(unsigned, +- linked->locks_want, +- __fls(linked->nodes_locked) + 1); +- if (!btree_iter_get_locks(linked, true, false)) { +- deadlock_iter = linked; +- reason = 1; +- } ++ deadlock_iter = linked; ++ reason = 1; + } + + if (linked->btree_id != iter->btree_id) { +@@ -303,14 +298,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + * we're about to lock, it must have the ancestors locked too: + */ + if (level > __fls(linked->nodes_locked)) { +- linked->locks_want = +- max(level + 1, max_t(unsigned, +- linked->locks_want, +- iter->locks_want)); +- if (!btree_iter_get_locks(linked, true, false)) { +- deadlock_iter = linked; +- reason = 5; +- } ++ deadlock_iter = linked; ++ reason = 5; + } + + /* Must lock btree nodes in key order: */ +@@ -319,27 +308,19 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + btree_iter_type(linked))) <= 0) { + deadlock_iter = linked; + reason = 7; +- } +- +- /* +- * Recheck if this is a node we already have locked - since one +- * of the get_locks() calls might've successfully +- * upgraded/relocked it: +- */ +- if (linked->l[level].b == b && +- btree_node_locked_type(linked, level) >= type) { +- six_lock_increment(&b->c.lock, type); +- return true; ++ BUG_ON(trans->in_traverse_all); + } + } + + if (unlikely(deadlock_iter)) { + trace_trans_restart_would_deadlock(iter->trans->ip, ip, +- reason, ++ trans->in_traverse_all, reason, + deadlock_iter->btree_id, + btree_iter_type(deadlock_iter), ++ &deadlock_iter->real_pos, + iter->btree_id, +- btree_iter_type(iter)); ++ btree_iter_type(iter), ++ &pos); + return false; + } + +@@ -407,29 +388,11 @@ bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) + bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + unsigned new_locks_want) + { +- struct btree_iter *linked; +- + EBUG_ON(iter->locks_want >= new_locks_want); + + iter->locks_want = new_locks_want; + +- if (btree_iter_get_locks(iter, true, true)) +- return true; +- +- /* +- * Ancestor nodes must be locked before child nodes, so set locks_want +- * on iterators that might lock ancestors before us to avoid getting +- * -EINTR later: +- */ +- trans_for_each_iter(iter->trans, linked) +- if (linked != iter && +- linked->btree_id == iter->btree_id && +- linked->locks_want < new_locks_want) { +- linked->locks_want = new_locks_want; +- btree_iter_get_locks(linked, true, false); +- } +- +- return false; ++ return btree_iter_get_locks(iter, true, true); + } + + void __bch2_btree_iter_downgrade(struct btree_iter *iter, +@@ -1192,7 +1155,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) + struct bch_fs *c = trans->c; + struct btree_iter *iter; + u8 sorted[BTREE_ITER_MAX]; +- unsigned i, nr_sorted = 0; ++ int i, nr_sorted = 0; ++ bool relock_fail; + + if (trans->in_traverse_all) + return -EINTR; +@@ -1200,15 +1164,36 @@ static int __btree_iter_traverse_all(struct btree_trans 
*trans, int ret) + trans->in_traverse_all = true; + retry_all: + nr_sorted = 0; ++ relock_fail = false; + +- trans_for_each_iter(trans, iter) ++ trans_for_each_iter(trans, iter) { ++ if (!bch2_btree_iter_relock(iter, true)) ++ relock_fail = true; + sorted[nr_sorted++] = iter->idx; ++ } ++ ++ if (!relock_fail) { ++ trans->in_traverse_all = false; ++ return 0; ++ } + + #define btree_iter_cmp_by_idx(_l, _r) \ + btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) + + bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); + #undef btree_iter_cmp_by_idx ++ ++ for (i = nr_sorted - 2; i >= 0; --i) { ++ struct btree_iter *iter1 = trans->iters + sorted[i]; ++ struct btree_iter *iter2 = trans->iters + sorted[i + 1]; ++ ++ if (iter1->btree_id == iter2->btree_id && ++ iter1->locks_want < iter2->locks_want) ++ __bch2_btree_iter_upgrade(iter1, iter2->locks_want); ++ else if (!iter1->locks_want && iter2->locks_want) ++ __bch2_btree_iter_upgrade(iter1, 1); ++ } ++ + bch2_trans_unlock(trans); + cond_resched(); + +@@ -1258,6 +1243,8 @@ out: + bch2_btree_cache_cannibalize_unlock(c); + + trans->in_traverse_all = false; ++ ++ trace_trans_traverse_all(trans->ip); + return ret; + } + +@@ -2209,7 +2196,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + if (!(flags & TRANS_RESET_NOUNLOCK)) + bch2_trans_cond_resched(trans); + +- if (!(flags & TRANS_RESET_NOTRAVERSE)) ++ if (!(flags & TRANS_RESET_NOTRAVERSE) && ++ trans->iters_linked) + bch2_btree_iter_traverse_all(trans); + } + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 07d9b6d36e51..2f63adb9e420 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -187,7 +187,7 @@ static inline int btree_iter_lock_cmp(const struct btree_iter *l, + { + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: +- bkey_cmp(l->pos, r->pos); ++ bkey_cmp(l->real_pos, r->real_pos); + } + + /* +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 0018d381abdd..30277a547d80 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -561,43 +561,70 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, + TRACE_EVENT(trans_restart_would_deadlock, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, ++ bool in_traverse_all, + unsigned reason, + enum btree_id have_btree_id, + unsigned have_iter_type, ++ struct bpos *have_pos, + enum btree_id want_btree_id, +- unsigned want_iter_type), +- TP_ARGS(trans_ip, caller_ip, reason, +- have_btree_id, have_iter_type, +- want_btree_id, want_iter_type), ++ unsigned want_iter_type, ++ struct bpos *want_pos), ++ TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason, ++ have_btree_id, have_iter_type, have_pos, ++ want_btree_id, want_iter_type, want_pos), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) ++ __field(u8, in_traverse_all ) + __field(u8, reason ) + __field(u8, have_btree_id ) + __field(u8, have_iter_type ) + __field(u8, want_btree_id ) + __field(u8, want_iter_type ) ++ ++ __field(u64, have_pos_inode ) ++ __field(u64, have_pos_offset ) ++ __field(u32, have_pos_snapshot) ++ __field(u32, want_pos_snapshot) ++ __field(u64, want_pos_inode ) ++ __field(u64, want_pos_offset ) + ), + + TP_fast_assign( + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; ++ __entry->in_traverse_all = in_traverse_all; + __entry->reason = reason; + __entry->have_btree_id = have_btree_id; + 
__entry->have_iter_type = have_iter_type; + __entry->want_btree_id = want_btree_id; + __entry->want_iter_type = want_iter_type; ++ ++ __entry->have_pos_inode = have_pos->inode; ++ __entry->have_pos_offset = have_pos->offset; ++ __entry->have_pos_snapshot = have_pos->snapshot; ++ ++ __entry->want_pos_inode = want_pos->inode; ++ __entry->want_pos_offset = want_pos->offset; ++ __entry->want_pos_snapshot = want_pos->snapshot; + ), + +- TP_printk("%ps %pS because %u have %u:%u want %u:%u", ++ TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, ++ __entry->in_traverse_all, + __entry->reason, + __entry->have_btree_id, + __entry->have_iter_type, ++ __entry->have_pos_inode, ++ __entry->have_pos_offset, ++ __entry->have_pos_snapshot, + __entry->want_btree_id, +- __entry->want_iter_type) ++ __entry->want_iter_type, ++ __entry->want_pos_inode, ++ __entry->want_pos_offset, ++ __entry->want_pos_snapshot) + ); + + TRACE_EVENT(trans_restart_iters_realloced, +@@ -689,6 +716,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse, + TP_ARGS(ip) + ); + ++DEFINE_EVENT(transaction_restart, trans_traverse_all, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ + DECLARE_EVENT_CLASS(node_lock_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq), +-- +cgit v1.2.3 + + +From 42c87e47479ade827b82a6475636cf08f0ff9f17 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 15 Apr 2021 12:36:40 -0400 +Subject: bcachefs: Don't downgrade iterators in bch2_trans_get_iter() + +This fixes a livelock with btree node splits. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c19f165ffc91..8deb4438c90e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2039,13 +2039,18 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + + iter->snapshot = pos.snapshot; + +- locks_want = min(locks_want, BTREE_MAX_DEPTH); ++ /* ++ * If the iterator has locks_want greater than requested, we explicitly ++ * do not downgrade it here - on transaction restart because btree node ++ * split needs to upgrade locks, we might be putting/getting the ++ * iterator again. Downgrading iterators only happens via an explicit ++ * bch2_trans_downgrade(). 
++ */ + ++ locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > iter->locks_want) { + iter->locks_want = locks_want; + btree_iter_get_locks(iter, true, false); +- } else if (locks_want < iter->locks_want) { +- __bch2_btree_iter_downgrade(iter, locks_want); + } + + while (iter->level < depth) { +-- +cgit v1.2.3 + + +From 35a46f80a0c5153f4d7b6c1b583e31972386bd4d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 15 Apr 2021 12:50:09 -0400 +Subject: bcachefs: Improve trans_restart_mem_realloced tracepoint + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 27 +++++++++------------------ + include/trace/events/bcachefs.h | 37 +++++++++++++------------------------ + 2 files changed, 22 insertions(+), 42 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8deb4438c90e..5a5533cad514 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2112,11 +2112,14 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, + return iter; + } + +-static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) ++void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) + { +- if (size > trans->mem_bytes) { ++ size_t new_top = trans->mem_top + size; ++ void *p; ++ ++ if (new_top > trans->mem_bytes) { + size_t old_bytes = trans->mem_bytes; +- size_t new_bytes = roundup_pow_of_two(size); ++ size_t new_bytes = roundup_pow_of_two(new_top); + void *new_mem; + + WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); +@@ -2129,29 +2132,17 @@ static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) + } + + if (!new_mem) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + + if (old_bytes) { +- trace_trans_restart_mem_realloced(trans->ip, new_bytes); +- return -EINTR; ++ trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); ++ return ERR_PTR(-EINTR); + } + } + +- return 0; +-} +- +-void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +-{ +- void *p; +- int ret; +- +- ret = bch2_trans_preload_mem(trans, trans->mem_top + size); +- if (ret) +- return ERR_PTR(ret); +- + p = trans->mem + trans->mem_top; + trans->mem_top += size; + return p; +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 30277a547d80..18a05f741bff 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -627,38 +627,27 @@ TRACE_EVENT(trans_restart_would_deadlock, + __entry->want_pos_snapshot) + ); + +-TRACE_EVENT(trans_restart_iters_realloced, +- TP_PROTO(unsigned long ip, unsigned nr), +- TP_ARGS(ip, nr), +- +- TP_STRUCT__entry( +- __field(unsigned long, ip ) +- __field(unsigned, nr ) +- ), +- +- TP_fast_assign( +- __entry->ip = ip; +- __entry->nr = nr; +- ), +- +- TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr) +-); +- + TRACE_EVENT(trans_restart_mem_realloced, +- TP_PROTO(unsigned long ip, unsigned long bytes), +- TP_ARGS(ip, bytes), ++ TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, ++ unsigned long bytes), ++ TP_ARGS(trans_ip, caller_ip, bytes), + + TP_STRUCT__entry( +- __field(unsigned long, ip ) +- __field(unsigned long, bytes ) ++ __field(unsigned long, trans_ip ) ++ __field(unsigned long, caller_ip ) ++ __field(unsigned long, bytes ) + ), + + TP_fast_assign( +- __entry->ip = ip; +- __entry->bytes = bytes; ++ __entry->trans_ip = trans_ip; ++ __entry->caller_ip = caller_ip; ++ __entry->bytes = bytes; + ), + +- TP_printk("%ps bytes %lu", (void *) 
__entry->ip, __entry->bytes) ++ TP_printk("%ps %pS bytes %lu", ++ (void *) __entry->trans_ip, ++ (void *) __entry->caller_ip, ++ __entry->bytes) + ); + + DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, +-- +cgit v1.2.3 + + +From 0c4abf557af354d0fd4702fa01463a45ecdd154e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Apr 2021 20:25:33 -0400 +Subject: bcachefs: Fix bch2_trans_mark_dev_sb() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 6 ++-- + fs/bcachefs/buckets.c | 63 ++++++++++++------------------------------ + fs/bcachefs/buckets.h | 8 ++---- + fs/bcachefs/buckets_types.h | 5 ++++ + fs/bcachefs/journal.c | 2 +- + fs/bcachefs/recovery.c | 10 ++++--- + fs/bcachefs/super.c | 4 +-- + 7 files changed, 38 insertions(+), 60 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index c47a2098a10c..f973ace061d2 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -254,9 +254,9 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + { + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + +- pr_buf(out, "gen %u oldest_gen %u data_type %u", +- u.gen, u.oldest_gen, u.data_type); +-#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name); ++ pr_buf(out, "gen %u oldest_gen %u data_type %s", ++ u.gen, u.oldest_gen, bch2_data_types[u.data_type]); ++#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); + BCH_ALLOC_FIELDS_V2() + #undef x + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index cb63fafcfcb1..5ad66d4409bd 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -2020,22 +2020,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + goto out; + } + +- if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) { +- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n" +- "while marking %s", +- iter->pos.inode, iter->pos.offset, u.gen, +- bch2_data_types[u.data_type ?: type], +- u.dirty_sectors, sectors, ca->mi.bucket_size, +- bch2_data_types[type]); +- ret = -EIO; +- goto out; +- } +- +- if (u.data_type == type && +- u.dirty_sectors == sectors) +- goto out; +- + u.data_type = type; + u.dirty_sectors = sectors; + +@@ -2047,53 +2031,44 @@ out: + } + + int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, +- struct disk_reservation *res, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) + { +- return __bch2_trans_do(trans, res, NULL, 0, +- __bch2_trans_mark_metadata_bucket(trans, ca, b, BCH_DATA_journal, +- ca->mi.bucket_size)); +- ++ return __bch2_trans_do(trans, NULL, NULL, 0, ++ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); + } + + static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, +- struct disk_reservation *res, + struct bch_dev *ca, + u64 start, u64 end, + enum bch_data_type type, + u64 *bucket, unsigned *bucket_sectors) + { +- int ret; +- + do { + u64 b = sector_to_bucket(ca, start); + unsigned sectors = + min_t(u64, bucket_to_sector(ca, b + 1), end) - start; + +- if (b != *bucket) { +- if (*bucket_sectors) { +- ret = bch2_trans_mark_metadata_bucket(trans, res, ca, +- *bucket, type, *bucket_sectors); +- if (ret) +- return ret; +- } ++ if (b != *bucket && *bucket_sectors) { ++ int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, ++ type, *bucket_sectors); ++ if (ret) ++ return ret; + +- *bucket = b; +- 
*bucket_sectors = 0; ++ *bucket_sectors = 0; + } + ++ *bucket = b; + *bucket_sectors += sectors; + start += sectors; +- } while (!ret && start < end); ++ } while (start < end); + + return 0; + } + + static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, +- struct disk_reservation *res, +- struct bch_dev *ca) ++ struct bch_dev *ca) + { + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 bucket = 0; +@@ -2104,14 +2079,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, + u64 offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset == BCH_SB_SECTOR) { +- ret = bch2_trans_mark_metadata_sectors(trans, res, ca, ++ ret = bch2_trans_mark_metadata_sectors(trans, ca, + 0, BCH_SB_SECTOR, + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + +- ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset, ++ ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, + offset + (1 << layout->sb_max_size_bits), + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) +@@ -2119,14 +2094,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, + } + + if (bucket_sectors) { +- ret = bch2_trans_mark_metadata_bucket(trans, res, ca, ++ ret = bch2_trans_mark_metadata_bucket(trans, ca, + bucket, BCH_DATA_sb, bucket_sectors); + if (ret) + return ret; + } + + for (i = 0; i < ca->journal.nr; i++) { +- ret = bch2_trans_mark_metadata_bucket(trans, res, ca, ++ ret = bch2_trans_mark_metadata_bucket(trans, ca, + ca->journal.buckets[i], + BCH_DATA_journal, ca->mi.bucket_size); + if (ret) +@@ -2136,12 +2111,10 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, + return 0; + } + +-int bch2_trans_mark_dev_sb(struct bch_fs *c, +- struct disk_reservation *res, +- struct bch_dev *ca) ++int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) + { +- return bch2_trans_do(c, res, NULL, 0, +- __bch2_trans_mark_dev_sb(&trans, res, ca)); ++ return bch2_trans_do(c, NULL, NULL, 0, ++ __bch2_trans_mark_dev_sb(&trans, ca)); + } + + /* Disk reservations: */ +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index cd81e6aba1b0..794c426e2198 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -253,11 +253,9 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, + struct bkey_i *insert, unsigned); + void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + +-int bch2_trans_mark_metadata_bucket(struct btree_trans *, +- struct disk_reservation *, struct bch_dev *, +- size_t, enum bch_data_type, unsigned); +-int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *, +- struct bch_dev *); ++int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, ++ size_t, enum bch_data_type, unsigned); ++int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); + + /* disk reservations: */ + +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 588b1a72adae..b2de2995c5e7 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -59,6 +59,11 @@ struct bch_dev_usage { + struct { + u64 buckets; + u64 sectors; /* _compressed_ sectors: */ ++ /* ++ * XXX ++ * Why do we have this? Isn't it just buckets * bucket_size - ++ * sectors? 
++ */ + u64 fragmented; + } d[BCH_DATA_NR]; + }; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 14fa3be5626a..ce1a8761e27b 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -865,7 +865,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + + if (c && !new_fs) + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, +- bch2_trans_mark_metadata_bucket(&trans, NULL, ca, ++ bch2_trans_mark_metadata_bucket(&trans, ca, + bucket, BCH_DATA_journal, + ca->mi.bucket_size)); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 9991a4f67163..2dc3dee4efc8 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1333,10 +1333,12 @@ int bch2_fs_initialize(struct bch_fs *c) + * Write out the superblock and journal buckets, now that we can do + * btree updates + */ +- err = "error writing alloc info"; +- ret = bch2_alloc_write(c, 0); +- if (ret) +- goto err; ++ err = "error marking superblock and journal"; ++ for_each_member_device(ca, c, i) { ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) ++ goto err; ++ } + + bch2_inode_init(c, &root_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 7ce867e5ff0c..6c690b4e0918 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1666,7 +1666,7 @@ have_slot: + bch2_dev_usage_journal_reserve(c); + + err = "error marking superblock"; +- ret = bch2_trans_mark_dev_sb(c, NULL, ca); ++ ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) + goto err_late; + +@@ -1726,7 +1726,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) + + ca = bch_dev_locked(c, dev_idx); + +- if (bch2_trans_mark_dev_sb(c, NULL, ca)) { ++ if (bch2_trans_mark_dev_sb(c, ca)) { + err = "bch2_trans_mark_dev_sb() error"; + goto err; + } +-- +cgit v1.2.3 + + +From 4b483d74ef0fb0076ed1bc6055e5d65782cbdceb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Apr 2021 20:23:58 -0400 +Subject: bcachefs: Simplify bch2_set_nr_journal_buckets() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 35 +++++++++++++++-------------------- + 1 file changed, 15 insertions(+), 20 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index ce1a8761e27b..1e88a5f3d0f3 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -787,7 +787,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + * We may be called from the device add path, before the new device has + * actually been added to the running filesystem: + */ +- if (c) ++ if (!new_fs) + spin_lock(&c->journal.lock); + + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); +@@ -795,17 +795,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + +- if (c) ++ if (!new_fs) + spin_unlock(&c->journal.lock); + + while (ja->nr < nr) { + struct open_bucket *ob = NULL; + unsigned pos; +- long bucket; ++ long b; + + if (new_fs) { +- bucket = bch2_bucket_alloc_new_fs(ca); +- if (bucket < 0) { ++ b = bch2_bucket_alloc_new_fs(ca); ++ if (b < 0) { + ret = -ENOSPC; + goto err; + } +@@ -819,10 +819,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + goto err; + } + +- bucket = sector_to_bucket(ca, ob->ptr.offset); +- } ++ b = sector_to_bucket(ca, ob->ptr.offset); + +- if (c) { + percpu_down_read(&c->mark_lock); + spin_lock(&c->journal.lock); + } +@@ -839,9 +837,9 @@ static int 
__bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + __array_insert_item(journal_buckets->buckets, ja->nr, pos); + ja->nr++; + +- ja->buckets[pos] = bucket; ++ ja->buckets[pos] = b; + ja->bucket_seq[pos] = 0; +- journal_buckets->buckets[pos] = cpu_to_le64(bucket); ++ journal_buckets->buckets[pos] = cpu_to_le64(b); + + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; +@@ -852,28 +850,25 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + +- if (!c || new_fs) +- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, ++ if (new_fs) { ++ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + 0); +- +- if (c) { ++ } else { + spin_unlock(&c->journal.lock); + percpu_up_read(&c->mark_lock); +- } + +- if (c && !new_fs) + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_mark_metadata_bucket(&trans, ca, +- bucket, BCH_DATA_journal, ++ b, BCH_DATA_journal, + ca->mi.bucket_size)); + +- if (!new_fs) + bch2_open_bucket_put(c, ob); + +- if (ret) +- goto err; ++ if (ret) ++ goto err; ++ } + } + err: + bch2_sb_resize_journal(&ca->disk_sb, +-- +cgit v1.2.3 + + +From 35345b4ab07496d42530c8e1411218bac0c4a0e1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 15 Apr 2021 18:31:58 -0400 +Subject: bcachefs: Fix an RCU splat + +Writepoints are never deallocated so the rcu_read_lock() isn't really +needed, but we are doing lockless list traversal. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 2e7b19be02b9..499d4c8baf66 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -683,11 +683,14 @@ static struct write_point *__writepoint_find(struct hlist_head *head, + { + struct write_point *wp; + ++ rcu_read_lock(); + hlist_for_each_entry_rcu(wp, head, node) + if (wp->write_point == write_point) +- return wp; +- +- return NULL; ++ goto out; ++ wp = NULL; ++out: ++ rcu_read_unlock(); ++ return wp; + } + + static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) +-- +cgit v1.2.3 + + +From 9aebaeb302b5259b6ec2cbb64d1c391fdd8f6b79 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Apr 2021 12:38:14 -0400 +Subject: bcachefs: Fix journal reclaim loop + +When dirty key cache keys were separated from other journal pins, we +broke the loop conditional in __bch2_journal_reclaim() - it's supposed +to keep looping as long as there's work to do. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 7be6c65c1abe..f117d361d584 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -599,7 +599,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool kthread = (current->flags & PF_KTHREAD) != 0; + u64 seq_to_flush; +- size_t min_nr, nr_flushed; ++ size_t min_nr, min_key_cache, nr_flushed; + unsigned flags; + int ret = 0; + +@@ -649,9 +649,10 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + atomic_long_read(&c->btree_key_cache.nr_dirty), + atomic_long_read(&c->btree_key_cache.nr_keys)); + ++ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL); ++ + nr_flushed = journal_flush_pins(j, seq_to_flush, +- min_nr, +- min(bch2_nr_btree_keys_need_flush(c), 128UL)); ++ min_nr, min_key_cache); + + if (direct) + j->nr_direct_reclaim += nr_flushed; +@@ -661,7 +662,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + + if (nr_flushed) + wake_up(&j->reclaim_wait); +- } while (min_nr && nr_flushed && !direct); ++ } while ((min_nr || min_key_cache) && !direct); + + memalloc_noreclaim_restore(flags); + +-- +cgit v1.2.3 + + +From 6ca24e8911cdc30830ff892330b2b80c7121c5dc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Apr 2021 14:29:26 -0400 +Subject: bcachefs: Fix transaction restarts due to upgrading of cloned + iterators + +This fixes a regression from + 52d86202fd bcachefs: Improve bch2_btree_iter_traverse_all() + +We want to avoid mucking with other iterators in the btree transaction +in operations that are only supposed to be touching individual iterators +- that patch was a cleanup to move lock ordering handling to +bch2_btree_iter_traverse_all(). But it broke upgrading of cloned +iterators. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 35 ++++++++++++++++++++++++++++++++++- + 1 file changed, 34 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5a5533cad514..ac5a8737e9f3 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -388,11 +388,44 @@ bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) + bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + unsigned new_locks_want) + { ++ struct btree_iter *linked; ++ + EBUG_ON(iter->locks_want >= new_locks_want); + + iter->locks_want = new_locks_want; + +- return btree_iter_get_locks(iter, true, true); ++ if (btree_iter_get_locks(iter, true, true)) ++ return true; ++ ++ /* ++ * XXX: this is ugly - we'd prefer to not be mucking with other ++ * iterators in the btree_trans here. ++ * ++ * On failure to upgrade the iterator, setting iter->locks_want and ++ * calling get_locks() is sufficient to make bch2_btree_iter_traverse() ++ * get the locks we want on transaction restart. ++ * ++ * But if this iterator was a clone, on transaction restart what we did ++ * to this iterator isn't going to be preserved. ++ * ++ * Possibly we could add an iterator field for the parent iterator when ++ * an iterator is a copy - for now, we'll just upgrade any other ++ * iterators with the same btree id. ++ * ++ * The code below used to be needed to ensure ancestor nodes get locked ++ * before interior nodes - now that's handled by ++ * bch2_btree_iter_traverse_all(). 
++ */ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked != iter && ++ btree_iter_type(linked) == btree_iter_type(iter) && ++ linked->btree_id == iter->btree_id && ++ linked->locks_want < new_locks_want) { ++ linked->locks_want = new_locks_want; ++ btree_iter_get_locks(linked, true, false); ++ } ++ ++ return false; + } + + void __bch2_btree_iter_downgrade(struct btree_iter *iter, +-- +cgit v1.2.3 + + +From 74227442446f659fd707abb5957d3c288f7dda55 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Apr 2021 14:48:51 -0400 +Subject: bcachefs: Simplify fsck remove_dirent() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 52 +++++++++++++++++++--------------------------------- + 1 file changed, 19 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index fa1922cb5c87..e6036d36e0f9 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -81,51 +81,37 @@ static int write_inode(struct btree_trans *trans, + return ret; + } + +-static int __remove_dirent(struct btree_trans *trans, +- struct bkey_s_c_dirent dirent) ++static int __remove_dirent(struct btree_trans *trans, struct bpos pos) + { + struct bch_fs *c = trans->c; +- struct qstr name; ++ struct btree_iter *iter; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; +- u64 dir_inum = dirent.k->p.inode; + int ret; +- char *buf; +- +- name.len = bch2_dirent_name_bytes(dirent); +- buf = bch2_trans_kmalloc(trans, name.len + 1); +- if (IS_ERR(buf)) +- return PTR_ERR(buf); +- +- memcpy(buf, dirent.v->d_name, name.len); +- buf[name.len] = '\0'; +- name.name = buf; + +- ret = lookup_inode(trans, dir_inum, &dir_inode, NULL); +- if (ret && ret != -EINTR) +- bch_err(c, "remove_dirent: err %i looking up directory inode", ret); ++ ret = lookup_inode(trans, pos.inode, &dir_inode, NULL); + if (ret) + return ret; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + +- ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, +- &dir_hash_info, dir_inum, &name); +- if (ret && ret != -EINTR) +- bch_err(c, "remove_dirent: err %i deleting dirent", ret); +- if (ret) +- return ret; ++ iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + +- return 0; ++ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ &dir_hash_info, iter); ++ bch2_trans_iter_put(trans, iter); ++ return ret; + } + +-static int remove_dirent(struct btree_trans *trans, +- struct bkey_s_c_dirent dirent) ++static int remove_dirent(struct btree_trans *trans, struct bpos pos) + { +- return __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- __remove_dirent(trans, dirent)); ++ int ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ __remove_dirent(trans, pos)); ++ if (ret) ++ bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret); ++ return ret; + } + + static int __reattach_inode(struct btree_trans *trans, +@@ -202,7 +188,7 @@ static int remove_backpointer(struct btree_trans *trans, + goto out; + } + +- ret = remove_dirent(trans, bkey_s_c_to_dirent(k)); ++ ret = remove_dirent(trans, k.k->p); + out: + bch2_trans_iter_put(trans, iter); + return ret; +@@ -752,7 +738,7 @@ retry: + "dirent points to missing inode:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { +- ret = remove_dirent(&trans, d); ++ ret = remove_dirent(&trans, d.k->p); + if (ret) + goto err; + goto next; +@@ -783,7 +769,7 @@ retry: + backpointer_exists, c, + "directory %llu with multiple links", + target.bi_inum)) { 
+- ret = remove_dirent(&trans, d); ++ ret = remove_dirent(&trans, d.k->p); + if (ret) + goto err; + continue; +-- +cgit v1.2.3 + + +From 865aff17f9a43a5c4f5d6a3b309cd20e68c86784 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Apr 2021 17:26:25 -0400 +Subject: bcachefs: Fix some small memory leaks + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 2 ++ + fs/bcachefs/replicas.c | 4 ++-- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index e6036d36e0f9..4a48ef5d1bfb 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1124,6 +1124,8 @@ static int check_directory_structure(struct bch_fs *c, + + BUG_ON(ret == -EINTR); + ++ kfree(path.entries); ++ + return bch2_trans_exit(&trans) ?: ret; + } + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index e47c1073d5ab..8e6cccd39383 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -313,8 +313,8 @@ static int replicas_table_update(struct bch_fs *c, + out: + free_percpu(new_gc); + kfree(new_scratch); +- free_percpu(new_usage[1]); +- free_percpu(new_usage[0]); ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ free_percpu(new_usage[i]); + kfree(new_base); + return ret; + err: +-- +cgit v1.2.3 + + +From dd3c9086d234d94b97dd101c4dac9e6e6ee81e47 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Apr 2021 17:34:53 -0400 +Subject: bcachefs: Fix an unused var warning in userspace + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 7 ++----- + 1 file changed, 2 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 4a48ef5d1bfb..cfe606342032 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -159,13 +159,10 @@ static int reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *lostfound, + u64 inum) + { +- struct bch_fs *c = trans->c; +- int ret; +- +- ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, ++ int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + __reattach_inode(trans, lostfound, inum)); + if (ret) +- bch_err(c, "error %i reattaching inode %llu", ret, inum); ++ bch_err(trans->c, "error %i reattaching inode %llu", ret, inum); + + return ret; + } +-- +cgit v1.2.3 + + +From 66221285b6cc922241b8181c248b87983f52f742 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Apr 2021 20:35:20 -0400 +Subject: bcachefs: Refactor bchfs_fallocate() to not nest btree_trans on stack + +Upcoming patch is going to disallow multiple btree_trans on the stack. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 103 ++++++++++++++++++++++++---------------------------- + 1 file changed, 48 insertions(+), 55 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index fa85ca78460b..0087374c6242 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2627,54 +2627,21 @@ err: + return ret; + } + +-static long bchfs_fallocate(struct bch_inode_info *inode, int mode, +- loff_t offset, loff_t len) ++static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, ++ u64 start_sector, u64 end_sector) + { +- struct address_space *mapping = inode->v.i_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter *iter; +- struct bpos end_pos; +- loff_t end = offset + len; +- loff_t block_start = round_down(offset, block_bytes(c)); +- loff_t block_end = round_up(end, block_bytes(c)); +- unsigned sectors; ++ struct bpos end_pos = POS(inode->v.i_ino, end_sector); + unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; +- int ret; ++ int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- inode_lock(&inode->v); +- inode_dio_wait(&inode->v); +- bch2_pagecache_block_get(&inode->ei_pagecache_lock); +- +- if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { +- ret = inode_newsize_ok(&inode->v, end); +- if (ret) +- goto err; +- } +- +- if (mode & FALLOC_FL_ZERO_RANGE) { +- ret = __bch2_truncate_page(inode, +- offset >> PAGE_SHIFT, +- offset, end); +- +- if (!ret && +- offset >> PAGE_SHIFT != end >> PAGE_SHIFT) +- ret = __bch2_truncate_page(inode, +- end >> PAGE_SHIFT, +- offset, end); +- +- if (unlikely(ret)) +- goto err; +- +- truncate_pagecache_range(&inode->v, offset, end - 1); +- } +- + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, +- POS(inode->v.i_ino, block_start >> 9), ++ POS(inode->v.i_ino, start_sector), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- end_pos = POS(inode->v.i_ino, block_end >> 9); + + while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { + s64 i_sectors_delta = 0; +@@ -2682,6 +2649,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + struct quota_res quota_res = { 0 }; + struct bkey_i_reservation reservation; + struct bkey_s_c k; ++ unsigned sectors; + + bch2_trans_begin(&trans); + +@@ -2742,7 +2710,48 @@ bkey_err: + ret = 0; + } + bch2_trans_iter_put(&trans, iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} + ++static long bchfs_fallocate(struct bch_inode_info *inode, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ loff_t end = offset + len; ++ loff_t block_start = round_down(offset, block_bytes(c)); ++ loff_t block_end = round_up(end, block_bytes(c)); ++ int ret; ++ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ++ ret = inode_newsize_ok(&inode->v, end); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode & FALLOC_FL_ZERO_RANGE) { ++ ret = __bch2_truncate_page(inode, ++ offset >> PAGE_SHIFT, ++ offset, end); ++ ++ if (!ret && ++ offset >> PAGE_SHIFT != end >> PAGE_SHIFT) ++ ret = __bch2_truncate_page(inode, ++ end >> PAGE_SHIFT, ++ offset, end); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ truncate_pagecache_range(&inode->v, offset, end - 1); ++ } ++ ++ ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); + if (ret) + goto err; + +@@ -2756,28 +2765,13 @@ 
bkey_err: + if (end >= inode->v.i_size && + (!(mode & FALLOC_FL_KEEP_SIZE) || + (mode & FALLOC_FL_ZERO_RANGE))) { +- struct btree_iter *inode_iter; +- struct bch_inode_unpacked inode_u; +- +- do { +- bch2_trans_begin(&trans); +- inode_iter = bch2_inode_peek(&trans, &inode_u, +- inode->v.i_ino, 0); +- ret = PTR_ERR_OR_ZERO(inode_iter); +- } while (ret == -EINTR); +- +- bch2_trans_iter_put(&trans, inode_iter); +- bch2_trans_unlock(&trans); +- +- if (ret) +- goto err; + + /* + * Sync existing appends before extending i_size, + * as in bch2_extend(): + */ + ret = filemap_write_and_wait_range(mapping, +- inode_u.bi_size, S64_MAX); ++ inode->ei_inode.bi_size, S64_MAX); + if (ret) + goto err; + +@@ -2791,7 +2785,6 @@ bkey_err: + mutex_unlock(&inode->ei_update_lock); + } + err: +- bch2_trans_exit(&trans); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); + return ret; +-- +cgit v1.2.3 + + +From ca5a31f36dd2739c5643f1b932a1914f85066f2a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Apr 2021 21:34:00 -0400 +Subject: bcachefs: gc shouldn't care about owned_by_allocator + +The owned_by_allocator field is a purely in memory thing, even if/when +we bring back GC at runtime there's no need for it to be recalculating +this field. This is prep work for pulling it out of struct bucket, and +eventually getting rid of the bucket array. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 5 ++- + fs/bcachefs/alloc_foreground.c | 3 +- + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/btree_gc.c | 53 +-------------------------- + fs/bcachefs/btree_gc.h | 8 ----- + fs/bcachefs/buckets.c | 82 +++--------------------------------------- + fs/bcachefs/buckets.h | 3 +- + 7 files changed, 9 insertions(+), 146 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index f973ace061d2..dad921f34815 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -783,7 +783,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + + BUG_ON(m.dirty_sectors); + +- bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); ++ bch2_mark_alloc_bucket(c, ca, b, true); + + spin_lock(&c->freelist_lock); + verify_not_on_freelist(c, ca, b); +@@ -880,8 +880,7 @@ out: + percpu_down_read(&c->mark_lock); + spin_lock(&c->freelist_lock); + +- bch2_mark_alloc_bucket(c, ca, b, false, +- gc_pos_alloc(c, NULL), 0); ++ bch2_mark_alloc_bucket(c, ca, b, false); + + BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); + BUG_ON(b != b2); +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 499d4c8baf66..408a63a4c9bb 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -98,8 +98,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) + percpu_down_read(&c->mark_lock); + spin_lock(&ob->lock); + +- bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), +- false, gc_pos_alloc(c, ob), 0); ++ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false); + ob->valid = false; + ob->type = 0; + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 4fe162a338f6..42bb4d734363 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -380,7 +380,6 @@ enum gc_phase { + GC_PHASE_BTREE_reflink, + + GC_PHASE_PENDING_DELETE, +- GC_PHASE_ALLOC, + }; + + struct gc_pos { +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index cc17da8b7a1a..5c45a693d512 100644 +--- a/fs/bcachefs/btree_gc.c ++++ 
b/fs/bcachefs/btree_gc.c +@@ -730,52 +730,6 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) + } + #endif + +-static void bch2_mark_allocator_buckets(struct bch_fs *c) +-{ +- struct bch_dev *ca; +- struct open_bucket *ob; +- size_t i, j, iter; +- unsigned ci; +- +- percpu_down_read(&c->mark_lock); +- +- spin_lock(&c->freelist_lock); +- gc_pos_set(c, gc_pos_alloc(c, NULL)); +- +- for_each_member_device(ca, c, ci) { +- fifo_for_each_entry(i, &ca->free_inc, iter) +- bch2_mark_alloc_bucket(c, ca, i, true, +- gc_pos_alloc(c, NULL), +- BTREE_TRIGGER_GC); +- +- +- +- for (j = 0; j < RESERVE_NR; j++) +- fifo_for_each_entry(i, &ca->free[j], iter) +- bch2_mark_alloc_bucket(c, ca, i, true, +- gc_pos_alloc(c, NULL), +- BTREE_TRIGGER_GC); +- } +- +- spin_unlock(&c->freelist_lock); +- +- for (ob = c->open_buckets; +- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); +- ob++) { +- spin_lock(&ob->lock); +- if (ob->valid) { +- gc_pos_set(c, gc_pos_alloc(c, ob)); +- ca = bch_dev_bkey_exists(c, ob->ptr.dev); +- bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, +- gc_pos_alloc(c, ob), +- BTREE_TRIGGER_GC); +- } +- spin_unlock(&ob->lock); +- } +- +- percpu_up_read(&c->mark_lock); +-} +- + static void bch2_gc_free(struct bch_fs *c) + { + struct bch_dev *ca; +@@ -880,7 +834,6 @@ static int bch2_gc_done(struct bch_fs *c, + for (b = 0; b < src->nbuckets; b++) { + copy_bucket_field(gen); + copy_bucket_field(data_type); +- copy_bucket_field(owned_by_allocator); + copy_bucket_field(stripe); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); +@@ -1020,10 +973,8 @@ static int bch2_gc_start(struct bch_fs *c, + + if (metadata_only && + (s->mark.data_type == BCH_DATA_user || +- s->mark.data_type == BCH_DATA_cached)) { ++ s->mark.data_type == BCH_DATA_cached)) + d->_mark = s->mark; +- d->_mark.owned_by_allocator = 0; +- } + } + }; + +@@ -1079,8 +1030,6 @@ again: + #if 0 + bch2_mark_pending_btree_node_frees(c); + #endif +- bch2_mark_allocator_buckets(c); +- + c->gc_count++; + + if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index 868723a30b15..e9a87394370a 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -90,14 +90,6 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id) + return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); + } + +-static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) +-{ +- return (struct gc_pos) { +- .phase = GC_PHASE_ALLOC, +- .pos = POS(ob ? ob - c->open_buckets : 0, 0), +- }; +-} +- + static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) + { + unsigned seq; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 5ad66d4409bd..b3fc9a91e20e 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -3,64 +3,6 @@ + * Code for manipulating bucket marks for garbage collection. + * + * Copyright 2014 Datera, Inc. 
+- * +- * Bucket states: +- * - free bucket: mark == 0 +- * The bucket contains no data and will not be read +- * +- * - allocator bucket: owned_by_allocator == 1 +- * The bucket is on a free list, or it is an open bucket +- * +- * - cached bucket: owned_by_allocator == 0 && +- * dirty_sectors == 0 && +- * cached_sectors > 0 +- * The bucket contains data but may be safely discarded as there are +- * enough replicas of the data on other cache devices, or it has been +- * written back to the backing device +- * +- * - dirty bucket: owned_by_allocator == 0 && +- * dirty_sectors > 0 +- * The bucket contains data that we must not discard (either only copy, +- * or one of the 'main copies' for data requiring multiple replicas) +- * +- * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 +- * This is a btree node, journal or gen/prio bucket +- * +- * Lifecycle: +- * +- * bucket invalidated => bucket on freelist => open bucket => +- * [dirty bucket =>] cached bucket => bucket invalidated => ... +- * +- * Note that cache promotion can skip the dirty bucket step, as data +- * is copied from a deeper tier to a shallower tier, onto a cached +- * bucket. +- * Note also that a cached bucket can spontaneously become dirty -- +- * see below. +- * +- * Only a traversal of the key space can determine whether a bucket is +- * truly dirty or cached. +- * +- * Transitions: +- * +- * - free => allocator: bucket was invalidated +- * - cached => allocator: bucket was invalidated +- * +- * - allocator => dirty: open bucket was filled up +- * - allocator => cached: open bucket was filled up +- * - allocator => metadata: metadata was allocated +- * +- * - dirty => cached: dirty sectors were copied to a deeper tier +- * - dirty => free: dirty sectors were overwritten or moved (copy gc) +- * - cached => free: cached sectors were overwritten +- * +- * - metadata => free: metadata was freed +- * +- * Oddities: +- * - cached => dirty: a device was removed so formerly replicated data +- * is no longer sufficiently replicated +- * - free => cached: cannot happen +- * - free => dirty: cannot happen +- * - free => metadata: cannot happen + */ + + #include "bcachefs.h" +@@ -554,33 +496,17 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, + ret; \ + }) + +-static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, +- size_t b, bool owned_by_allocator, +- bool gc) ++void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, bool owned_by_allocator) + { +- struct bucket *g = __bucket(ca, b, gc); ++ struct bucket *g = bucket(ca, b); + struct bucket_mark old, new; + + old = bucket_cmpxchg(g, new, ({ + new.owned_by_allocator = owned_by_allocator; + })); + +- BUG_ON(!gc && +- !owned_by_allocator && !old.owned_by_allocator); +- +- return 0; +-} +- +-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, +- size_t b, bool owned_by_allocator, +- struct gc_pos pos, unsigned flags) +-{ +- preempt_disable(); +- +- do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, +- ca, b, owned_by_allocator); +- +- preempt_enable(); ++ BUG_ON(owned_by_allocator == old.owned_by_allocator); + } + + static int bch2_mark_alloc(struct bch_fs *c, +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 794c426e2198..7463e6420b14 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -235,8 +235,7 @@ bch2_fs_usage_read_short(struct bch_fs *); + void bch2_bucket_seq_cleanup(struct bch_fs *); + void bch2_fs_usage_initialize(struct bch_fs *); + +-void 
bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, +- size_t, bool, struct gc_pos, unsigned); ++void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); + void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); +-- +cgit v1.2.3 + + +From 66bfcbdd288861521aecf0b28433dc0e39158ed5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Apr 2021 21:53:23 -0400 +Subject: bcachefs: Allocator thread doesn't need gc_lock anymore + +Even with runtime gc (which currently isn't supported), runtime gc no +longer clears/recalculates the main set of bucket marks - it allocates +and calculates another set, updating the primary at the end. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 26 +++++--------------------- + 1 file changed, 5 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index dad921f34815..43eb3c515cf3 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -209,7 +209,7 @@ void bch2_alloc_pack(struct bch_fs *c, + bch2_alloc_pack_v2(dst, src); + } + +-static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) ++static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) + { + unsigned i, bytes = offsetof(struct bch_alloc, data); + +@@ -229,7 +229,7 @@ const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) + return "invalid device"; + + /* allow for unknown fields */ +- if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) ++ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) + return "incorrect value size"; + + return NULL; +@@ -293,11 +293,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + { + int ret; + +- down_read(&c->gc_lock); + ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, + NULL, bch2_alloc_read_fn); +- up_read(&c->gc_lock); +- + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); + return ret; +@@ -475,10 +472,8 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) + if (available) + break; + +- up_read(&c->gc_lock); + schedule(); + try_to_freeze(); +- down_read(&c->gc_lock); + } + + __set_current_state(TASK_RUNNING); +@@ -914,7 +909,6 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) + !fifo_full(&ca->free_inc) && + ca->alloc_heap.used) + ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, +- BTREE_INSERT_GC_LOCK_HELD| + (!fifo_empty(&ca->free_inc) + ? 
BTREE_INSERT_NOWAIT : 0)); + +@@ -1055,18 +1049,12 @@ static int bch2_allocator_thread(void *arg) + if (ret) + goto stop; + +- down_read(&c->gc_lock); +- + ret = bch2_invalidate_buckets(c, ca); +- if (ret) { +- up_read(&c->gc_lock); ++ if (ret) + goto stop; +- } + +- if (!fifo_empty(&ca->free_inc)) { +- up_read(&c->gc_lock); ++ if (!fifo_empty(&ca->free_inc)) + continue; +- } + + pr_debug("free_inc now empty"); + +@@ -1104,14 +1092,10 @@ static int bch2_allocator_thread(void *arg) + * available so we don't spin: + */ + ret = wait_buckets_available(c, ca); +- if (ret) { +- up_read(&c->gc_lock); ++ if (ret) + goto stop; +- } + } + +- up_read(&c->gc_lock); +- + pr_debug("%zu buckets to invalidate", nr); + + /* +-- +cgit v1.2.3 + + +From 3d78d2e913f1b75a56b0e1b0175d189a84c74829 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 18 Apr 2021 17:26:34 -0400 +Subject: bcachefs: Handle errors in bch2_trans_mark_update() + +It's not actually the case that iterators are always checked here - +__bch2_trans_commit() checks for that after running triggers. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 ++ + fs/bcachefs/buckets.c | 5 +++-- + 2 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 43eb3c515cf3..8fcc5c8e9181 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -293,8 +293,10 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) + { + int ret; + ++ down_read(&c->gc_lock); + ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, + NULL, bch2_alloc_read_fn); ++ up_read(&c->gc_lock); + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); + return ret; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index b3fc9a91e20e..dd2cfc515879 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1832,10 +1832,11 @@ int bch2_trans_mark_update(struct btree_trans *trans, + return 0; + + if (!btree_node_type_is_extents(iter->btree_id)) { +- /* iterators should be uptodate, shouldn't get errors here: */ + if (btree_iter_type(iter) != BTREE_ITER_CACHED) { + old = bch2_btree_iter_peek_slot(iter); +- BUG_ON(bkey_err(old)); ++ ret = bkey_err(old); ++ if (ret) ++ return ret; + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; + +-- +cgit v1.2.3 + + +From 0a48673250e04b8277dce6c2fd881daeabf8b76e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 17 Apr 2021 23:18:17 -0400 +Subject: bcachefs: Check that keys are in the correct btrees + +We've started seeing bug reports of pointers to btree nodes being +detected in leaf nodes. This should catch that before it's happened, and +it's something we should've been checking anyways. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 39 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 39 insertions(+) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 6fe95b802e13..cf2e054cca2f 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -98,12 +98,51 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) + return bch2_bkey_ops[k.k->type].key_invalid(c, k); + } + ++static unsigned bch2_key_types_allowed[] = { ++ [BKEY_TYPE_extents] = ++ (1U << KEY_TYPE_error)| ++ (1U << KEY_TYPE_cookie)| ++ (1U << KEY_TYPE_extent)| ++ (1U << KEY_TYPE_reservation)| ++ (1U << KEY_TYPE_reflink_p)| ++ (1U << KEY_TYPE_inline_data), ++ [BKEY_TYPE_inodes] = ++ (1U << KEY_TYPE_inode)| ++ (1U << KEY_TYPE_inode_generation), ++ [BKEY_TYPE_dirents] = ++ (1U << KEY_TYPE_hash_whiteout)| ++ (1U << KEY_TYPE_dirent), ++ [BKEY_TYPE_xattrs] = ++ (1U << KEY_TYPE_cookie)| ++ (1U << KEY_TYPE_hash_whiteout)| ++ (1U << KEY_TYPE_xattr), ++ [BKEY_TYPE_alloc] = ++ (1U << KEY_TYPE_alloc)| ++ (1U << KEY_TYPE_alloc_v2), ++ [BKEY_TYPE_quotas] = ++ (1U << KEY_TYPE_quota), ++ [BKEY_TYPE_stripes] = ++ (1U << KEY_TYPE_stripe), ++ [BKEY_TYPE_reflink] = ++ (1U << KEY_TYPE_reflink_v)| ++ (1U << KEY_TYPE_indirect_inline_data), ++ [BKEY_TYPE_btree] = ++ (1U << KEY_TYPE_btree_ptr)| ++ (1U << KEY_TYPE_btree_ptr_v2), ++}; ++ + const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type) + { ++ unsigned key_types_allowed = (1U << KEY_TYPE_deleted)| ++ bch2_key_types_allowed[type] ; ++ + if (k.k->u64s < BKEY_U64s) + return "u64s too small"; + ++ if (!(key_types_allowed & (1U << k.k->type))) ++ return "invalid key type for this btree"; ++ + if (type == BKEY_TYPE_btree && + bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; +-- +cgit v1.2.3 + + +From b3593a55ce6c13528e4eb523466aa77cc49ff13f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 18 Apr 2021 17:44:35 -0400 +Subject: bcachefs: Always check for invalid bkeys in trans commit path + +We check for this prior to metadata being written, but we're seeing some +strange bugs lately, and this will help catch those closer to where they +occur. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 27 ++++++++++++--------------- + 1 file changed, 12 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index db5702ca6e0c..4212326f6a36 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -222,18 +222,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct btree_insert_entry *i) + { +- struct bch_fs *c = trans->c; +- +- if (bch2_debug_check_bkeys) { +- const char *invalid = bch2_bkey_invalid(c, +- bkey_i_to_s_c(i->k), i->bkey_type); +- if (invalid) { +- char buf[200]; +- +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); +- panic("invalid bkey %s on insert: %s\n", buf, invalid); +- } +- } + BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); + BUG_ON(i->level != i->iter->level); + BUG_ON(i->btree_id != i->iter->btree_id); +@@ -592,9 +580,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + } + } + +- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) +- trans_for_each_update2(trans, i) +- btree_insert_entry_checks(trans, i); ++ trans_for_each_update2(trans, i) { ++ const char *invalid = bch2_bkey_invalid(c, ++ bkey_i_to_s_c(i->k), i->bkey_type); ++ if (invalid) { ++ char buf[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); ++ bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid); ++ bch2_fatal_error(c); ++ } ++ btree_insert_entry_checks(trans, i); ++ } + bch2_btree_trans_verify_locks(trans); + + trans_for_each_update2(trans, i) +-- +cgit v1.2.3 + + +From b60a163bddb110630b4e5201e42b4fc94cc127d0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 17 Apr 2021 20:37:04 -0400 +Subject: bcachefs: Allocator refactoring + +This uses the kthread_wait_freezable() macro to simplify a lot of the +allocator thread code, along with cleaning up bch2_invalidate_bucket2(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 382 ++++++++++++++-------------------------- + fs/bcachefs/alloc_foreground.c | 47 +---- + include/trace/events/bcachefs.h | 43 +++-- + 3 files changed, 161 insertions(+), 311 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 8fcc5c8e9181..93ecf7301818 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -441,50 +441,6 @@ out: + * commands to the newly free buckets, then puts them on the various freelists. + */ + +-/** +- * wait_buckets_available - wait on reclaimable buckets +- * +- * If there aren't enough available buckets to fill up free_inc, wait until +- * there are. 
+- */ +-static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) +-{ +- unsigned long gc_count = c->gc_count; +- s64 available; +- int ret = 0; +- +- ca->allocator_state = ALLOCATOR_blocked; +- closure_wake_up(&c->freelist_wait); +- +- while (1) { +- set_current_state(TASK_INTERRUPTIBLE); +- if (kthread_should_stop()) { +- ret = 1; +- break; +- } +- +- if (gc_count != c->gc_count) +- ca->inc_gen_really_needs_gc = 0; +- +- available = dev_buckets_reclaimable(ca); +- available -= ca->inc_gen_really_needs_gc; +- +- available = max(available, 0LL); +- +- if (available) +- break; +- +- schedule(); +- try_to_freeze(); +- } +- +- __set_current_state(TASK_RUNNING); +- ca->allocator_state = ALLOCATOR_running; +- closure_wake_up(&c->freelist_wait); +- +- return ret; +-} +- + static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + struct bucket_mark m) + { +@@ -502,11 +458,8 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + + gc_gen = bucket_gc_gen(bucket(ca, b)); + +- if (gc_gen >= BUCKET_GC_GEN_MAX / 2) +- ca->inc_gen_needs_gc++; +- +- if (gc_gen >= BUCKET_GC_GEN_MAX) +- ca->inc_gen_really_needs_gc++; ++ ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; ++ ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; + + return gc_gen < BUCKET_GC_GEN_MAX; + } +@@ -583,6 +536,8 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) + struct bucket_mark m = READ_ONCE(g->mark); + unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); + ++ cond_resched(); ++ + if (!bch2_can_invalidate_bucket(ca, b, m)) + continue; + +@@ -599,8 +554,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) + .key = key, + }; + } +- +- cond_resched(); + } + + if (e.nr) +@@ -693,6 +646,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) + size_t i, nr = 0; + + ca->inc_gen_needs_gc = 0; ++ ca->inc_gen_really_needs_gc = 0; + + switch (ca->mi.replacement) { + case BCH_CACHE_REPLACEMENT_lru: +@@ -714,25 +668,6 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) + return nr; + } + +-static inline long next_alloc_bucket(struct bch_dev *ca) +-{ +- struct alloc_heap_entry e, *top = ca->alloc_heap.data; +- +- while (ca->alloc_heap.used) { +- if (top->nr) { +- size_t b = top->bucket; +- +- top->bucket++; +- top->nr--; +- return b; +- } +- +- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); +- } +- +- return -1; +-} +- + /* + * returns sequence number of most recent journal entry that updated this + * bucket: +@@ -755,17 +690,56 @@ static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) + } + } + +-static int bch2_invalidate_one_bucket2(struct btree_trans *trans, +- struct bch_dev *ca, +- struct btree_iter *iter, +- u64 *journal_seq, unsigned flags) ++static int bucket_invalidate_btree(struct btree_trans *trans, ++ struct bch_dev *ca, u64 b) + { + struct bch_fs *c = trans->c; +- struct bkey_alloc_buf a; ++ struct bkey_alloc_buf *a; + struct bkey_alloc_unpacked u; + struct bucket *g; + struct bucket_mark m; +- bool invalidating_cached_data; ++ struct btree_iter *iter = ++ bch2_trans_get_iter(trans, BTREE_ID_alloc, ++ POS(ca->dev_idx, b), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ int ret; ++ ++ a = bch2_trans_kmalloc(trans, sizeof(*a)); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto err; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto err; ++ ++ percpu_down_read(&c->mark_lock); ++ g = 
bucket(ca, b); ++ m = READ_ONCE(g->mark); ++ u = alloc_mem_to_key(iter, g, m); ++ percpu_up_read(&c->mark_lock); ++ ++ u.gen++; ++ u.data_type = 0; ++ u.dirty_sectors = 0; ++ u.cached_sectors = 0; ++ u.read_time = atomic64_read(&c->io_clock[READ].now); ++ u.write_time = atomic64_read(&c->io_clock[WRITE].now); ++ ++ bch2_alloc_pack(c, a, u); ++ bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, ++ u64 *journal_seq, unsigned flags) ++{ ++ struct bucket *g; ++ struct bucket_mark m; + size_t b; + int ret = 0; + +@@ -811,48 +785,12 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + goto out; + } + +- bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); +-retry: +- ret = bch2_btree_iter_traverse(iter); +- if (ret) +- return ret; +- +- percpu_down_read(&c->mark_lock); +- g = bucket(ca, iter->pos.offset); +- m = READ_ONCE(g->mark); +- u = alloc_mem_to_key(iter, g, m); +- +- percpu_up_read(&c->mark_lock); +- +- invalidating_cached_data = u.cached_sectors != 0; +- +- u.gen++; +- u.data_type = 0; +- u.dirty_sectors = 0; +- u.cached_sectors = 0; +- u.read_time = atomic64_read(&c->io_clock[READ].now); +- u.write_time = atomic64_read(&c->io_clock[WRITE].now); +- +- bch2_alloc_pack(c, &a, u); +- bch2_trans_update(trans, iter, &a.k, +- BTREE_TRIGGER_BUCKET_INVALIDATE); +- +- /* +- * XXX: +- * when using deferred btree updates, we have journal reclaim doing +- * btree updates and thus requiring the allocator to make forward +- * progress, and here the allocator is requiring space in the journal - +- * so we need a journal pre-reservation: +- */ +- ret = bch2_trans_commit(trans, NULL, +- invalidating_cached_data ? journal_seq : NULL, +- BTREE_INSERT_NOUNLOCK| +- BTREE_INSERT_NOCHECK_RW| +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_JOURNAL_RESERVED| +- flags); +- if (ret == -EINTR) +- goto retry; ++ ret = bch2_trans_do(c, NULL, journal_seq, ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_JOURNAL_RESERVED| ++ flags, ++ bucket_invalidate_btree(&trans, ca, b)); + out: + if (!ret) { + /* remove from alloc_heap: */ +@@ -894,28 +832,23 @@ out: + */ + static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) + { +- struct btree_trans trans; +- struct btree_iter *iter; + u64 journal_seq = 0; + int ret = 0; + +- bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, +- POS(ca->dev_idx, 0), +- BTREE_ITER_CACHED| +- BTREE_ITER_CACHED_NOFILL| +- BTREE_ITER_INTENT); +- + /* Only use nowait if we've already invalidated at least one bucket: */ + while (!ret && + !fifo_full(&ca->free_inc) && +- ca->alloc_heap.used) +- ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, ++ ca->alloc_heap.used) { ++ ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, + (!fifo_empty(&ca->free_inc) + ? 
BTREE_INSERT_NOWAIT : 0)); +- +- bch2_trans_iter_put(&trans, iter); +- bch2_trans_exit(&trans); ++ /* ++ * We only want to batch up invalidates when they're going to ++ * require flushing the journal: ++ */ ++ if (!journal_seq) ++ break; ++ } + + /* If we used NOWAIT, don't return the error: */ + if (!fifo_empty(&ca->free_inc)) +@@ -935,83 +868,72 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) + return 0; + } + +-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) ++static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) ++{ ++ if (ca->allocator_state != new_state) { ++ ca->allocator_state = new_state; ++ closure_wake_up(&ca->fs->freelist_wait); ++ } ++} ++ ++static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) + { + unsigned i; + int ret = 0; + +- while (1) { +- set_current_state(TASK_INTERRUPTIBLE); +- +- spin_lock(&c->freelist_lock); +- for (i = 0; i < RESERVE_NR; i++) { +- +- /* +- * Don't strand buckets on the copygc freelist until +- * after recovery is finished: +- */ +- if (!test_bit(BCH_FS_STARTED, &c->flags) && +- i == RESERVE_MOVINGGC) +- continue; +- +- if (fifo_push(&ca->free[i], bucket)) { +- fifo_pop(&ca->free_inc, bucket); +- +- closure_wake_up(&c->freelist_wait); +- ca->allocator_state = ALLOCATOR_running; +- +- spin_unlock(&c->freelist_lock); +- goto out; +- } +- } +- +- if (ca->allocator_state != ALLOCATOR_blocked_full) { +- ca->allocator_state = ALLOCATOR_blocked_full; +- closure_wake_up(&c->freelist_wait); +- } +- +- spin_unlock(&c->freelist_lock); ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) { ++ /* ++ * Don't strand buckets on the copygc freelist until ++ * after recovery is finished: ++ */ ++ if (i == RESERVE_MOVINGGC && ++ !test_bit(BCH_FS_STARTED, &c->flags)) ++ continue; + +- if ((current->flags & PF_KTHREAD) && +- kthread_should_stop()) { ++ if (fifo_push(&ca->free[i], b)) { ++ fifo_pop(&ca->free_inc, b); + ret = 1; + break; + } +- +- schedule(); +- try_to_freeze(); + } +-out: +- __set_current_state(TASK_RUNNING); ++ spin_unlock(&c->freelist_lock); ++ ++ ca->allocator_state = ret ++ ? ALLOCATOR_running ++ : ALLOCATOR_blocked_full; ++ closure_wake_up(&c->freelist_wait); + return ret; + } + +-/* +- * Pulls buckets off free_inc, discards them (if enabled), then adds them to +- * freelists, waiting until there's room if necessary: +- */ +-static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) ++static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) + { +- while (!fifo_empty(&ca->free_inc)) { +- size_t bucket = fifo_peek(&ca->free_inc); +- +- if (ca->mi.discard && +- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) +- blkdev_issue_discard(ca->disk_sb.bdev, +- bucket_to_sector(ca, bucket), +- ca->mi.bucket_size, GFP_NOIO, 0); +- +- if (push_invalidated_bucket(c, ca, bucket)) +- return 1; +- } ++ if (ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), ++ ca->mi.bucket_size, GFP_NOFS, 0); ++} + +- return 0; ++static bool allocator_thread_running(struct bch_dev *ca) ++{ ++ unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && ++ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) ++ ? 
ALLOCATOR_running ++ : ALLOCATOR_stopped; ++ alloc_thread_set_state(ca, state); ++ return state == ALLOCATOR_running; + } + +-static inline bool allocator_thread_running(struct bch_dev *ca) ++static int buckets_available(struct bch_dev *ca, unsigned long gc_count) + { +- return ca->mi.state == BCH_MEMBER_STATE_rw && +- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); ++ s64 available = dev_buckets_reclaimable(ca) - ++ (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0); ++ bool ret = available > 0; ++ ++ alloc_thread_set_state(ca, ret ++ ? ALLOCATOR_running ++ : ALLOCATOR_blocked); ++ return ret; + } + + /** +@@ -1026,56 +948,29 @@ static int bch2_allocator_thread(void *arg) + { + struct bch_dev *ca = arg; + struct bch_fs *c = ca->fs; ++ unsigned long gc_count = c->gc_count; + size_t nr; + int ret; + + set_freezable(); + + while (1) { +- if (!allocator_thread_running(ca)) { +- ca->allocator_state = ALLOCATOR_stopped; +- if (kthread_wait_freezable(allocator_thread_running(ca))) +- break; +- } +- +- ca->allocator_state = ALLOCATOR_running; +- +- cond_resched(); +- if (kthread_should_stop()) +- break; +- +- pr_debug("discarding %zu invalidated buckets", +- fifo_used(&ca->free_inc)); +- +- ret = discard_invalidated_buckets(c, ca); ++ ret = kthread_wait_freezable(allocator_thread_running(ca)); + if (ret) + goto stop; + +- ret = bch2_invalidate_buckets(c, ca); +- if (ret) +- goto stop; +- +- if (!fifo_empty(&ca->free_inc)) +- continue; +- +- pr_debug("free_inc now empty"); +- +- while (1) { ++ while (!ca->alloc_heap.used) { + cond_resched(); +- /* +- * Find some buckets that we can invalidate, either +- * they're completely unused, or only contain clean data +- * that's been written back to the backing device or +- * another cache tier +- */ + +- pr_debug("scanning for reclaimable buckets"); ++ ret = kthread_wait_freezable(buckets_available(ca, gc_count)); ++ if (ret) ++ goto stop; + ++ gc_count = c->gc_count; + nr = find_reclaimable_buckets(c, ca); + +- pr_debug("found %zu buckets", nr); +- +- trace_alloc_batch(ca, nr, ca->alloc_heap.size); ++ trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, ++ ca->inc_gen_really_needs_gc); + + if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || + ca->inc_gen_really_needs_gc) && +@@ -1083,33 +978,24 @@ static int bch2_allocator_thread(void *arg) + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); + } ++ } + +- if (nr) +- break; ++ ret = bch2_invalidate_buckets(c, ca); ++ if (ret) ++ goto stop; + +- /* +- * If we found any buckets, we have to invalidate them +- * before we scan for more - but if we didn't find very +- * many we may want to wait on more buckets being +- * available so we don't spin: +- */ +- ret = wait_buckets_available(c, ca); ++ while (!fifo_empty(&ca->free_inc)) { ++ u64 b = fifo_peek(&ca->free_inc); ++ ++ discard_one_bucket(c, ca, b); ++ ++ ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); + if (ret) + goto stop; + } +- +- pr_debug("%zu buckets to invalidate", nr); +- +- /* +- * alloc_heap is now full of newly-invalidated buckets: next, +- * write out the new bucket gens: +- */ + } +- + stop: +- pr_debug("alloc thread stopping (ret %i)", ret); +- ca->allocator_state = ALLOCATOR_stopped; +- closure_wake_up(&c->freelist_wait); ++ alloc_thread_set_state(ca, ALLOCATOR_stopped); + return 0; + } + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 408a63a4c9bb..412fed479482 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -1,57 +1,14 @@ + 
// SPDX-License-Identifier: GPL-2.0 + /* +- * Primary bucket allocation code +- * + * Copyright 2012 Google, Inc. + * +- * Allocation in bcache is done in terms of buckets: +- * +- * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in +- * btree pointers - they must match for the pointer to be considered valid. +- * +- * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a +- * bucket simply by incrementing its gen. +- * +- * The gens (along with the priorities; it's really the gens are important but +- * the code is named as if it's the priorities) are written in an arbitrary list +- * of buckets on disk, with a pointer to them in the journal header. +- * +- * When we invalidate a bucket, we have to write its new gen to disk and wait +- * for that write to complete before we use it - otherwise after a crash we +- * could have pointers that appeared to be good but pointed to data that had +- * been overwritten. +- * +- * Since the gens and priorities are all stored contiguously on disk, we can +- * batch this up: We fill up the free_inc list with freshly invalidated buckets, +- * call prio_write(), and when prio_write() finishes we pull buckets off the +- * free_inc list and optionally discard them. +- * +- * free_inc isn't the only freelist - if it was, we'd often have to sleep while +- * priorities and gens were being written before we could allocate. c->free is a +- * smaller freelist, and buckets on that list are always ready to be used. +- * +- * If we've got discards enabled, that happens when a bucket moves from the +- * free_inc list to the free list. +- * +- * It's important to ensure that gens don't wrap around - with respect to +- * either the oldest gen in the btree or the gen on disk. This is quite +- * difficult to do in practice, but we explicitly guard against it anyways - if +- * a bucket is in danger of wrapping around we simply skip invalidating it that +- * time around, and we garbage collect or rewrite the priorities sooner than we +- * would have otherwise. ++ * Foreground allocator code: allocate buckets from freelist, and allocate in ++ * sector granularity from writepoints. + * + * bch2_bucket_alloc() allocates a single bucket from a specific device. + * + * bch2_bucket_alloc_set() allocates one or more buckets from different devices + * in a given filesystem. +- * +- * invalidate_buckets() drives all the processes described above. It's called +- * from bch2_bucket_alloc() and a few other places that need to make sure free +- * buckets are ready. +- * +- * invalidate_buckets_(lru|fifo)() find buckets that are available to be +- * invalidated, and then invalidate them and stick them on the free_inc list - +- * in either lru or fifo order. 
+ */ + + #include "bcachefs.h" +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 18a05f741bff..fd4b2f4ef46b 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -380,24 +380,27 @@ DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, + + /* Allocator */ + +-TRACE_EVENT(alloc_batch, +- TP_PROTO(struct bch_dev *ca, size_t free, size_t total), +- TP_ARGS(ca, free, total), ++TRACE_EVENT(alloc_scan, ++ TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped), ++ TP_ARGS(ca, found, inc_gen, inc_gen_skipped), + + TP_STRUCT__entry( +- __array(char, uuid, 16 ) +- __field(size_t, free ) +- __field(size_t, total ) ++ __field(dev_t, dev ) ++ __field(u64, found ) ++ __field(u64, inc_gen ) ++ __field(u64, inc_gen_skipped ) + ), + + TP_fast_assign( +- memcpy(__entry->uuid, ca->uuid.b, 16); +- __entry->free = free; +- __entry->total = total; ++ __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->found = found; ++ __entry->inc_gen = inc_gen; ++ __entry->inc_gen_skipped = inc_gen_skipped; + ), + +- TP_printk("%pU free %zu total %zu", +- __entry->uuid, __entry->free, __entry->total) ++ TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->found, __entry->inc_gen, __entry->inc_gen_skipped) + ); + + TRACE_EVENT(invalidate, +@@ -417,8 +420,10 @@ TRACE_EVENT(invalidate, + ), + + TP_printk("invalidated %u sectors at %d,%d sector=%llu", +- __entry->sectors, MAJOR(__entry->dev), +- MINOR(__entry->dev), __entry->offset) ++ __entry->sectors, ++ MAJOR(__entry->dev), ++ MINOR(__entry->dev), ++ __entry->offset) + ); + + DECLARE_EVENT_CLASS(bucket_alloc, +@@ -426,16 +431,18 @@ DECLARE_EVENT_CLASS(bucket_alloc, + TP_ARGS(ca, reserve), + + TP_STRUCT__entry( +- __array(char, uuid, 16) +- __field(enum alloc_reserve, reserve ) ++ __field(dev_t, dev ) ++ __field(enum alloc_reserve, reserve ) + ), + + TP_fast_assign( +- memcpy(__entry->uuid, ca->uuid.b, 16); +- __entry->reserve = reserve; ++ __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->reserve = reserve; + ), + +- TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) ++ TP_printk("%d,%d reserve %d", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->reserve) + ); + + DEFINE_EVENT(bucket_alloc, bucket_alloc, +-- +cgit v1.2.3 + + +From 4fbaa096a4cdb0a8dbd189b8814637893f08bbb2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 19 Apr 2021 00:33:05 -0400 +Subject: bcachefs: Preallocate trans mem in bch2_migrate_index_update() + +This will help avoid transaction restarts. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 5b108490d7c4..aa8e8c25402f 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -68,7 +68,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + bch2_bkey_buf_init(&_insert); + bch2_bkey_buf_realloc(&_insert, c, U8_MAX); + +- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, m->btree_id, + bkey_start_pos(&bch2_keylist_front(keys)->k), +-- +cgit v1.2.3 + + +From 42a68cf3155cbbf9ddac0d6b01c97ed59c865330 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 19 Apr 2021 17:07:20 -0400 +Subject: bcachefs: Fix for btree_gc repairing interior btree ptrs + +Using the normal transaction commit path to insert and journal updates +to interior nodes hadn't been done before this repair code was written, +not surprising that there was a bug. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 19 ++++++++++--------- + fs/bcachefs/journal.h | 5 +++-- + 2 files changed, 13 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 4212326f6a36..afdcc98dfb83 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -307,8 +307,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, + } + + static inline void do_btree_insert_one(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) ++ struct btree_insert_entry *i) + { + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; +@@ -317,20 +316,22 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); + +- insert->k.needs_whiteout = false; ++ i->k->k.needs_whiteout = false; + +- did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) +- ? btree_insert_key_leaf(trans, iter, insert) +- : bch2_btree_insert_key_cached(trans, iter, insert); ++ did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED) ++ ? 
btree_insert_key_leaf(trans, i->iter, i->k) ++ : bch2_btree_insert_key_cached(trans, i->iter, i->k); + if (!did_work) + return; + + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + bch2_journal_add_keys(j, &trans->journal_res, +- iter->btree_id, insert); ++ i->btree_id, ++ i->level, ++ i->k); + + bch2_journal_set_has_inode(j, &trans->journal_res, +- insert->k.p.inode); ++ i->k->k.p.inode); + + if (trans->journal_seq) + *trans->journal_seq = trans->journal_res.seq; +@@ -468,7 +469,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + bch2_trans_mark_gc(trans); + + trans_for_each_update2(trans, i) +- do_btree_insert_one(trans, i->iter, i->k); ++ do_btree_insert_one(trans, i); + err: + if (marking) { + percpu_up_read(&c->mark_lock); +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index cc497125889f..1d556790b38e 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -241,10 +241,11 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res + } + + static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, +- enum btree_id id, const struct bkey_i *k) ++ enum btree_id id, unsigned level, ++ const struct bkey_i *k) + { + bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, +- id, 0, k, k->k.u64s); ++ id, level, k, k->k.u64s); + } + + static inline bool journal_entry_empty(struct jset *j) +-- +cgit v1.2.3 + + +From b315ff9182b95e8739a550a2039d6ddfa998138a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 19 Apr 2021 17:17:34 -0400 +Subject: bcachefs: Fix a use after free + +Turns out, we weren't waiting on in flight btree writes when freeing +existing btree nodes. This lead to stray btree writes overwriting newly +allocated buckets, but only started showing itself with some of the +recent allocator work and another patch to move submitting of btree +writes to worqueues. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 07c925345675..6b8d3d0f3d2d 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -887,6 +887,14 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + btree_update_drop_new_node(c, b); + + btree_update_will_delete_key(as, &b->key); ++ ++ /* ++ * XXX: Waiting on io with btree node locks held, we don't want to be ++ * doing this. We can't have btree writes happening after the space has ++ * been freed, but we really only need to block before ++ * btree_update_nodes_written_trans() happens. ++ */ ++ btree_node_wait_on_io(b); + } + + void bch2_btree_update_done(struct btree_update *as) +-- +cgit v1.2.3 + + +From 103774628bb1836d62618bac7cb11033ed1d3277 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 6 Apr 2021 15:28:34 -0400 +Subject: bcachefs: Punt btree writes to workqueue to submit + +We don't want to be submitting IO with btree locks held, and btree +writes usually aren't latency sensitive. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 20 ++++++++++++-------- + fs/bcachefs/btree_io.h | 1 + + 2 files changed, 13 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c8d8df9637db..2de31a6b9661 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1340,6 +1340,13 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + return ret; + } + ++static void btree_write_submit(struct work_struct *work) ++{ ++ struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); ++ ++ bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key); ++} ++ + void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + { + struct btree_write_bio *wbio; +@@ -1347,7 +1354,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + struct bset *i; + struct btree_node *bn = NULL; + struct btree_node_entry *bne = NULL; +- struct bkey_buf k; + struct bch_extent_ptr *ptr; + struct sort_iter sort_iter; + struct nonce nonce; +@@ -1358,8 +1364,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + bool validate_before_checksum = false; + void *data; + +- bch2_bkey_buf_init(&k); +- + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + return; + +@@ -1536,6 +1540,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + wbio_init(&wbio->wbio.bio); + wbio->data = data; + wbio->bytes = bytes; ++ wbio->wbio.c = c; + wbio->wbio.used_mempool = used_mempool; + wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; + wbio->wbio.bio.bi_end_io = btree_node_write_endio; +@@ -1558,9 +1563,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + * just make all btree node writes FUA to keep things sane. 
+ */ + +- bch2_bkey_buf_copy(&k, c, &b->key); ++ bkey_copy(&wbio->key, &b->key); + +- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr) ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr) + ptr->offset += b->written; + + b->written += sectors_to_write; +@@ -1568,9 +1573,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + atomic64_inc(&c->btree_writes_nr); + atomic64_add(sectors_to_write, &c->btree_writes_sectors); + +- /* XXX: submitting IO with btree locks held: */ +- bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); +- bch2_bkey_buf_exit(&k, c); ++ INIT_WORK(&wbio->work, btree_write_submit); ++ schedule_work(&wbio->work); + return; + err: + set_btree_node_noevict(b); +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 95c351611045..c8a8b05a19b0 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -42,6 +42,7 @@ struct btree_read_bio { + + struct btree_write_bio { + struct work_struct work; ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + void *data; + unsigned bytes; + struct bch_write_bio wbio; +-- +cgit v1.2.3 + + +From 6c60e89ef52b2605a29909797f03edaf28c95829 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 20 Apr 2021 20:21:39 -0400 +Subject: bcachefs: Fix two btree iterator leaks + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 1 + + fs/bcachefs/move.c | 6 ++++-- + 2 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index a13d4e138314..e6a14497ea84 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1621,6 +1621,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) + if (ret) + break; + } ++ bch2_trans_iter_put(&trans, iter); + + bch2_trans_exit(&trans); + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index aa8e8c25402f..778ff72cf5b2 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -762,7 +762,7 @@ static int bch2_move_btree(struct bch_fs *c, + id == start_btree_id ? start_pos : POS_MIN, + BTREE_ITER_PREFETCH, b) { + if (kthread && kthread_should_stop()) +- goto out; ++ break; + + if ((cmp_int(id, end_btree_id) ?: + bkey_cmp(b->key.k.p, end_pos)) > 0) +@@ -789,8 +789,10 @@ next: + } + + ret = bch2_trans_iter_free(&trans, iter) ?: ret; ++ if (kthread && kthread_should_stop()) ++ break; + } +-out: ++ + bch2_trans_exit(&trans); + + if (ret) +-- +cgit v1.2.3 + + +From 74e180b81645d62672a885baf12009ea78b923d9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 20 Apr 2021 20:21:12 -0400 +Subject: bcachefs: Update bch2_btree_verify() + +bch2_btree_verify() verifies that the btree node on disk matches what we +have in memory. This patch changes it to verify every replica, and also +fixes it for interior btree nodes - there's a mem_ptr field which is +used as a scratch space and needs to be zeroed out for comparing with +what's on disk. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 12 ++--- + fs/bcachefs/btree_cache.c | 22 +-------- + fs/bcachefs/btree_cache.h | 1 + + fs/bcachefs/debug.c | 120 ++++++++++++++++++++++++++++++---------------- + fs/bcachefs/debug.h | 4 -- + 5 files changed, 87 insertions(+), 72 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 42bb4d734363..7d5dbcc62060 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -260,7 +260,11 @@ do { \ + BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ + "Disables rewriting of btree nodes during mark and sweep")\ + BCH_DEBUG_PARAM(btree_shrinker_disabled, \ +- "Disables the shrinker callback for the btree node cache") ++ "Disables the shrinker callback for the btree node cache")\ ++ BCH_DEBUG_PARAM(verify_btree_ondisk, \ ++ "Reread btree nodes at various points to verify the " \ ++ "mergesort in the read path against modifications " \ ++ "done in memory") + + /* Parameters that should only be compiled in in debug mode: */ + #define BCH_DEBUG_PARAMS_DEBUG() \ +@@ -274,10 +278,6 @@ do { \ + "information) when iterating over keys") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ +- BCH_DEBUG_PARAM(verify_btree_ondisk, \ +- "Reread btree nodes at various points to verify the " \ +- "mergesort in the read path against modifications " \ +- "done in memory") \ + BCH_DEBUG_PARAM(journal_seq_verify, \ + "Store the journal sequence number in the version " \ + "number of every btree key, and verify that btree " \ +@@ -818,11 +818,9 @@ struct bch_fs { + /* DEBUG JUNK */ + struct dentry *debug; + struct btree_debug btree_debug[BTREE_ID_NR]; +-#ifdef CONFIG_BCACHEFS_DEBUG + struct btree *verify_data; + struct btree_node *verify_ondisk; + struct mutex verify_lock; +-#endif + + u64 *unused_inode_hints; + unsigned inode_shard_bits; +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 5cc9e0222a74..61363d44fb20 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -100,7 +100,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c) + return b; + } + +-static struct btree *btree_node_mem_alloc(struct bch_fs *c) ++struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) + { + struct btree_cache *bc = &c->btree_cache; + struct btree *b = __btree_node_mem_alloc(c); +@@ -366,12 +366,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) + flags = memalloc_nofs_save(); + mutex_lock(&bc->lock); + +-#ifdef CONFIG_BCACHEFS_DEBUG + if (c->verify_data) + list_move(&c->verify_data->list, &bc->live); + + kvpfree(c->verify_ondisk, btree_bytes(c)); +-#endif + + for (i = 0; i < BTREE_ID_NR; i++) + if (c->btree_roots[i].b) +@@ -425,31 +423,15 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) + bch2_recalc_btree_reserve(c); + + for (i = 0; i < bc->reserve; i++) +- if (!btree_node_mem_alloc(c)) { ++ if (!__bch2_btree_node_mem_alloc(c)) { + ret = -ENOMEM; + goto out; + } + + list_splice_init(&bc->live, &bc->freeable); + +-#ifdef CONFIG_BCACHEFS_DEBUG + mutex_init(&c->verify_lock); + +- c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); +- if (!c->verify_ondisk) { +- ret = -ENOMEM; +- goto out; +- } +- +- c->verify_data = btree_node_mem_alloc(c); +- if (!c->verify_data) { +- ret = -ENOMEM; +- goto out; +- } +- +- list_del_init(&c->verify_data->list); +-#endif +- + bc->shrink.count_objects = bch2_btree_cache_count; + bc->shrink.scan_objects = bch2_btree_cache_scan; + bc->shrink.seeks = 4; +diff --git 
a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 4791c3b64452..c517cc029454 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -17,6 +17,7 @@ int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, + void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); + int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); + ++struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); + struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); + + struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 90364b55aa40..4215c119e0a2 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -29,40 +29,19 @@ + + static struct dentry *bch_debug; + +-#ifdef CONFIG_BCACHEFS_DEBUG +- +-void __bch2_btree_verify(struct bch_fs *c, struct btree *b) ++static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, ++ struct extent_ptr_decoded pick) + { + struct btree *v = c->verify_data; +- struct btree_node *n_ondisk, *n_sorted, *n_inmemory; +- struct bset *sorted, *inmemory; +- struct extent_ptr_decoded pick; +- struct bch_dev *ca; ++ struct btree_node *n_ondisk = c->verify_ondisk; ++ struct btree_node *n_sorted = c->verify_data->data; ++ struct bset *sorted, *inmemory = &b->data->keys; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct bio *bio; ++ bool failed = false; + +- if (c->opts.nochanges) +- return; +- +- btree_node_io_lock(b); +- mutex_lock(&c->verify_lock); +- +- n_ondisk = c->verify_ondisk; +- n_sorted = c->verify_data->data; +- n_inmemory = b->data; +- +- bkey_copy(&v->key, &b->key); +- v->written = 0; +- v->c.level = b->c.level; +- v->c.btree_id = b->c.btree_id; +- bch2_btree_keys_init(v); +- +- if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), +- NULL, &pick) <= 0) +- return; +- +- ca = bch_dev_bkey_exists(c, pick.ptr.dev); + if (!bch2_dev_get_ioref(ca, READ)) +- return; ++ return false; + + bio = bio_alloc_bioset(GFP_NOIO, + buf_pages(n_sorted, btree_bytes(c)), +@@ -79,12 +58,12 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + + memcpy(n_ondisk, n_sorted, btree_bytes(c)); + ++ v->written = 0; + if (bch2_btree_node_read_done(c, ca, v, false)) +- goto out; ++ return false; + + n_sorted = c->verify_data->data; + sorted = &n_sorted->keys; +- inmemory = &n_inmemory->keys; + + if (inmemory->u64s != sorted->u64s || + memcmp(inmemory->start, +@@ -102,8 +81,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + printk(KERN_ERR "*** read back in:\n"); + bch2_dump_bset(c, v, sorted, 0); + +- while (offset < b->written) { +- if (!offset ) { ++ while (offset < v->written) { ++ if (!offset) { + i = &n_ondisk->keys; + sectors = vstruct_blocks(n_ondisk, c->block_bits) << + c->block_bits; +@@ -122,25 +101,84 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + offset += sectors; + } + +- printk(KERN_ERR "*** block %u/%u not written\n", +- offset >> c->block_bits, btree_blocks(c)); +- + for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) + if (inmemory->_data[j] != sorted->_data[j]) + break; + +- printk(KERN_ERR "b->written %u\n", b->written); +- + console_unlock(); +- panic("verify failed at %u\n", j); ++ bch_err(c, "verify failed at key %u", j); ++ ++ failed = true; ++ } ++ ++ if (v->written != b->written) { ++ bch_err(c, "written wrong: expected %u, got %u", ++ b->written, v->written); ++ failed = true; ++ } ++ ++ return failed; ++} ++ ++void __bch2_btree_verify(struct bch_fs *c, 
struct btree *b) ++{ ++ struct bkey_ptrs_c ptrs; ++ struct extent_ptr_decoded p; ++ const union bch_extent_entry *entry; ++ struct btree *v; ++ struct bset *inmemory = &b->data->keys; ++ struct bkey_packed *k; ++ bool failed = false; ++ ++ if (c->opts.nochanges) ++ return; ++ ++ btree_node_io_lock(b); ++ mutex_lock(&c->verify_lock); ++ ++ if (!c->verify_ondisk) { ++ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); ++ if (!c->verify_ondisk) ++ goto out; ++ } ++ ++ if (!c->verify_data) { ++ c->verify_data = __bch2_btree_node_mem_alloc(c); ++ if (!c->verify_data) ++ goto out; ++ ++ list_del_init(&c->verify_data->list); ++ } ++ ++ BUG_ON(b->nsets != 1); ++ ++ for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k)) ++ if (k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); ++ v->mem_ptr = 0; ++ } ++ ++ v = c->verify_data; ++ bkey_copy(&v->key, &b->key); ++ v->c.level = b->c.level; ++ v->c.btree_id = b->c.btree_id; ++ bch2_btree_keys_init(v); ++ ++ ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); ++ bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) ++ failed |= bch2_btree_verify_replica(c, b, p); ++ ++ if (failed) { ++ char buf[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); ++ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf); + } + out: + mutex_unlock(&c->verify_lock); + btree_node_io_unlock(b); + } + +-#endif +- + #ifdef CONFIG_DEBUG_FS + + /* XXX: bch_fs refcounting */ +diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h +index 7ac1615e9447..0b86736e5e1b 100644 +--- a/fs/bcachefs/debug.h ++++ b/fs/bcachefs/debug.h +@@ -8,11 +8,7 @@ struct bio; + struct btree; + struct bch_fs; + +-#ifdef CONFIG_BCACHEFS_DEBUG + void __bch2_btree_verify(struct bch_fs *, struct btree *); +-#else +-static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} +-#endif + + static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) + { +-- +cgit v1.2.3 + + +From 8fb0ad93a748ef39da5a380c71ede19dbb59d048 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 20 Apr 2021 17:09:25 -0400 +Subject: bcachefs: Fix a deadlock on journal reclaim + +Flushing the btree key cache needs to use allocation reserves - journal +reclaim depends on flushing the btree key cache for making forward +progress, and the allocator and copygc depend on journal reclaim making +forward progress. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 6 ++++++ + fs/bcachefs/journal_reclaim.c | 2 +- + fs/bcachefs/movinggc.c | 13 ++++++++++++- + 3 files changed, 19 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 53191c99e590..8dec32057385 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -385,12 +385,18 @@ retry: + goto evict; + } + ++ /* ++ * Since journal reclaim depends on us making progress here, and the ++ * allocator/copygc depend on journal reclaim making progress, we need ++ * to be using alloc reserves: ++ * */ + ret = bch2_btree_iter_traverse(b_iter) ?: + bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| + (ck->journal.seq == journal_last_seq(j) + ? 
BTREE_INSERT_JOURNAL_RESERVED + : 0)| +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index f117d361d584..24d04e51fb61 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -634,7 +634,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + msecs_to_jiffies(j->reclaim_delay_ms))) + min_nr = 1; + +- if (j->prereserved.reserved * 2 > j->prereserved.remaining) ++ if (j->prereserved.reserved * 4 > j->prereserved.remaining) + min_nr = 1; + + if (fifo_free(&j->pin) <= 32) +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 80772cff0f9d..4ac7e61fb841 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -87,9 +87,20 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + if (i >= 0 && + p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && + p.ptr.gen == h->data[i].gen) { ++ /* ++ * We need to use the journal reserve here, because ++ * - journal reclaim depends on btree key cache ++ * flushing to make forward progress, ++ * - which has to make forward progress when the ++ * journal is pre-reservation full, ++ * - and depends on allocation - meaning allocator and ++ * copygc ++ */ ++ + data_opts->target = io_opts->background_target; + data_opts->nr_replicas = 1; +- data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; ++ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_JOURNAL_RESERVED; + data_opts->rewrite_dev = p.ptr.dev; + + if (p.has_ec) +-- +cgit v1.2.3 + + +From 6a7250a13e735a6074627ab40f936c6a7e0b1002 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 21 Apr 2021 18:08:39 -0400 +Subject: bcachefs: Don't BUG() in update_replicas + +Apparently, we have a bug where in mark and sweep while accounting for a +key, a replicas entry isn't found. Change the code to print out the key +we couldn't mark and halt instead of a BUG_ON(). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 49 ++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 36 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index dd2cfc515879..c3ad0bc85e78 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -396,20 +396,22 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + bch2_wake_allocator(ca); + } + +-static inline void update_replicas(struct bch_fs *c, ++static inline int update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) + { + int idx = bch2_replicas_entry_idx(c, r); + +- BUG_ON(idx < 0); ++ if (idx < 0) ++ return -1; + + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; ++ return 0; + } + +-static inline void update_cached_sectors(struct bch_fs *c, ++static inline int update_cached_sectors(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + unsigned dev, s64 sectors) + { +@@ -417,7 +419,7 @@ static inline void update_cached_sectors(struct bch_fs *c, + + bch2_replicas_entry_cached(&r.e, dev); + +- update_replicas(c, fs_usage, &r.e, sectors); ++ return update_replicas(c, fs_usage, &r.e, sectors); + } + + static struct replicas_delta_list * +@@ -569,8 +571,12 @@ static int bch2_mark_alloc(struct bch_fs *c, + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_m.cached_sectors) { +- update_cached_sectors(c, fs_usage, ca->dev_idx, +- -old_m.cached_sectors); ++ if (update_cached_sectors(c, fs_usage, ca->dev_idx, ++ -old_m.cached_sectors)) { ++ bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); ++ return -1; ++ } ++ + trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), + old_m.cached_sectors); + } +@@ -952,8 +958,12 @@ static int bch2_mark_extent(struct bch_fs *c, + + if (p.ptr.cached) { + if (!stale) +- update_cached_sectors(c, fs_usage, p.ptr.dev, +- disk_sectors); ++ if (update_cached_sectors(c, fs_usage, p.ptr.dev, ++ disk_sectors)) { ++ bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); ++ return -1; ++ ++ } + } else if (!p.has_ec) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; +@@ -972,8 +982,15 @@ static int bch2_mark_extent(struct bch_fs *c, + } + } + +- if (r.e.nr_devs) +- update_replicas(c, fs_usage, &r.e, dirty_sectors); ++ if (r.e.nr_devs) { ++ if (update_replicas(c, fs_usage, &r.e, dirty_sectors)) { ++ char buf[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf); ++ return -1; ++ } ++ } + + return 0; + } +@@ -1047,8 +1064,14 @@ static int bch2_mark_stripe(struct bch_fs *c, + return ret; + } + +- update_replicas(c, fs_usage, &m->r.e, +- ((s64) m->sectors * m->nr_redundant)); ++ if (update_replicas(c, fs_usage, &m->r.e, ++ ((s64) m->sectors * m->nr_redundant))) { ++ char buf[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, new); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf); ++ return -1; ++ } + } + + return 0; +@@ -1308,7 +1331,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + added += d->delta; + } + +- update_replicas(c, dst, &d->r, d->delta); ++ BUG_ON(update_replicas(c, dst, &d->r, d->delta)); + } + + dst->nr_inodes += deltas->nr_inodes; +-- +cgit v1.2.3 + + +From 8063c50181f869cea87c6851e8772136e30b8d02 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 19 Apr 2021 22:19:18 -0400 +Subject: 
bcachefs: Lookup/create lost+found lazily + +This is prep work for subvolumes - each subvolume will have its own +lost+found. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 222 ++++++++++++++++++++++++++--------------------------- + 1 file changed, 111 insertions(+), 111 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index cfe606342032..1ce038846476 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -38,9 +38,9 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) + return ret ?: sectors; + } + +-static int lookup_inode(struct btree_trans *trans, u64 inode_nr, +- struct bch_inode_unpacked *inode, +- u32 *snapshot) ++static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode, ++ u32 *snapshot) + { + struct btree_iter *iter; + struct bkey_s_c k; +@@ -63,19 +63,34 @@ err: + return ret; + } + +-static int write_inode(struct btree_trans *trans, +- struct bch_inode_unpacked *inode, +- u32 snapshot) ++static int lookup_inode(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode, ++ u32 *snapshot) ++{ ++ return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); ++} ++ ++static int __write_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) + { + struct btree_iter *inode_iter = + bch2_trans_get_iter(trans, BTREE_ID_inodes, + SPOS(0, inode->bi_inum, snapshot), + BTREE_ITER_INTENT); ++ int ret = bch2_inode_write(trans, inode_iter, inode); ++ bch2_trans_iter_put(trans, inode_iter); ++ return ret; ++} ++ ++static int write_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ + int ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- bch2_inode_write(trans, inode_iter, inode)); +- bch2_trans_iter_put(trans, inode_iter); ++ __write_inode(trans, inode, snapshot)); + if (ret) + bch_err(trans->c, "error in fsck: error %i updating inode", ret); + return ret; +@@ -114,57 +129,101 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos) + return ret; + } + +-static int __reattach_inode(struct btree_trans *trans, +- struct bch_inode_unpacked *lostfound, +- u64 inum) ++/* Get lost+found, create if it doesn't exist: */ ++static int lookup_lostfound(struct btree_trans *trans, ++ struct bch_inode_unpacked *lostfound) + { +- struct bch_hash_info dir_hash = +- bch2_hash_info_init(trans->c, lostfound); +- struct bch_inode_unpacked inode_u; ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked root; ++ struct bch_hash_info root_hash_info; ++ struct qstr lostfound_str = QSTR("lost+found"); ++ u64 inum; ++ u32 snapshot; ++ int ret; ++ ++ ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ root_hash_info = bch2_hash_info_init(c, &root); ++ inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, ++ &lostfound_str); ++ if (!inum) { ++ bch_notice(c, "creating lost+found"); ++ goto create_lostfound; ++ } ++ ++ ret = lookup_inode(trans, inum, lostfound, &snapshot); ++ if (ret && ret != -ENOENT) { ++ /* ++ * The check_dirents pass has already run, dangling dirents ++ * shouldn't exist here: ++ */ ++ bch_err(c, "error looking up lost+found: %i", ret); ++ return ret; ++ } ++ ++ if (ret == -ENOENT) { ++create_lostfound: ++ bch2_inode_init_early(c, lostfound); ++ ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ 
bch2_create_trans(trans, ++ BCACHEFS_ROOT_INO, &root, ++ lostfound, ++ &lostfound_str, ++ 0, 0, S_IFDIR|0700, 0, NULL, NULL)); ++ if (ret) ++ bch_err(c, "error creating lost+found: %i", ret); ++ } ++ ++ return 0; ++} ++ ++static int reattach_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode) ++{ ++ struct bch_hash_info dir_hash; ++ struct bch_inode_unpacked lostfound; + char name_buf[20]; + struct qstr name; + u64 dir_offset = 0; +- u32 snapshot; + int ret; + +- snprintf(name_buf, sizeof(name_buf), "%llu", inum); +- name = (struct qstr) QSTR(name_buf); +- +- ret = lookup_inode(trans, inum, &inode_u, &snapshot); ++ ret = lookup_lostfound(trans, &lostfound); + if (ret) + return ret; + +- if (S_ISDIR(inode_u.bi_mode)) { +- lostfound->bi_nlink++; ++ if (S_ISDIR(inode->bi_mode)) { ++ lostfound.bi_nlink++; + +- ret = write_inode(trans, lostfound, U32_MAX); ++ ret = write_inode(trans, &lostfound, U32_MAX); + if (ret) + return ret; + } + +- ret = bch2_dirent_create(trans, lostfound->bi_inum, &dir_hash, +- mode_to_type(inode_u.bi_mode), +- &name, inum, &dir_offset, +- BCH_HASH_SET_MUST_CREATE); +- if (ret) +- return ret; ++ dir_hash = bch2_hash_info_init(trans->c, &lostfound); + +- inode_u.bi_dir = lostfound->bi_inum; +- inode_u.bi_dir_offset = dir_offset; ++ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); ++ name = (struct qstr) QSTR(name_buf); + +- return write_inode(trans, &inode_u, U32_MAX); +-} ++ ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash, ++ mode_to_type(inode->bi_mode), ++ &name, inode->bi_inum, &dir_offset, ++ BCH_HASH_SET_MUST_CREATE)); ++ if (ret) { ++ bch_err(trans->c, "error %i reattaching inode %llu", ++ ret, inode->bi_inum); ++ return ret; ++ } + +-static int reattach_inode(struct btree_trans *trans, +- struct bch_inode_unpacked *lostfound, +- u64 inum) +-{ +- int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, +- __reattach_inode(trans, lostfound, inum)); +- if (ret) +- bch_err(trans->c, "error %i reattaching inode %llu", ret, inum); ++ inode->bi_dir = lostfound.bi_inum; ++ inode->bi_dir_offset = dir_offset; + +- return ret; ++ return write_inode(trans, inode, U32_MAX); + } + + static int remove_backpointer(struct btree_trans *trans, +@@ -931,58 +990,6 @@ create_root: + BTREE_INSERT_LAZY_RW); + } + +-/* Get lost+found, create if it doesn't exist: */ +-static int check_lostfound(struct bch_fs *c, +- struct bch_inode_unpacked *root_inode, +- struct bch_inode_unpacked *lostfound_inode) +-{ +- struct qstr lostfound = QSTR("lost+found"); +- struct bch_hash_info root_hash_info = +- bch2_hash_info_init(c, root_inode); +- u64 inum; +- u32 snapshot; +- int ret; +- +- bch_verbose(c, "checking lost+found"); +- +- inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, +- &lostfound); +- if (!inum) { +- bch_notice(c, "creating lost+found"); +- goto create_lostfound; +- } +- +- ret = bch2_trans_do(c, NULL, NULL, 0, +- lookup_inode(&trans, inum, lostfound_inode, &snapshot)); +- if (ret && ret != -ENOENT) +- return ret; +- +- if (fsck_err_on(ret, c, "lost+found missing")) +- goto create_lostfound; +- +- if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, +- "lost+found inode not a directory")) +- goto create_lostfound; +- +- return 0; +-fsck_err: +- return ret; +-create_lostfound: +- bch2_inode_init_early(c, lostfound_inode); +- +- ret = bch2_trans_do(c, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_create_trans(&trans, +- 
BCACHEFS_ROOT_INO, root_inode, +- lostfound_inode, &lostfound, +- 0, 0, S_IFDIR|0700, 0, NULL, NULL)); +- if (ret) +- bch_err(c, "error creating lost+found: %i", ret); +- +- return ret; +-} +- + struct pathbuf { + size_t nr; + size_t size; +@@ -1014,7 +1021,6 @@ static int path_down(struct pathbuf *p, u64 inum) + } + + static int check_path(struct btree_trans *trans, +- struct bch_inode_unpacked *lostfound, + struct pathbuf *p, + struct bch_inode_unpacked *inode) + { +@@ -1038,7 +1044,7 @@ static int check_path(struct btree_trans *trans, + inode->bi_nlink, + inode->bi_dir, + inode->bi_dir_offset)) +- ret = reattach_inode(trans, lostfound, inode->bi_inum); ++ ret = reattach_inode(trans, inode); + break; + } + ret = 0; +@@ -1067,12 +1073,11 @@ static int check_path(struct btree_trans *trans, + break; + } + +- ret = reattach_inode(trans, lostfound, inode->bi_inum); ++ ret = reattach_inode(trans, inode); + break; + } + +- ret = lockrestart_do(trans, +- lookup_inode(trans, inode->bi_dir, inode, &snapshot)); ++ ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + if (ret) { + /* Should have been caught in dirents pass */ + bch_err(c, "error looking up parent directory: %i", ret); +@@ -1090,8 +1095,7 @@ fsck_err: + * After check_dirents(), if an inode backpointer doesn't exist that means it's + * unreachable: + */ +-static int check_directory_structure(struct bch_fs *c, +- struct bch_inode_unpacked *lostfound) ++static int check_directory_structure(struct bch_fs *c) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -1113,7 +1117,7 @@ static int check_directory_structure(struct bch_fs *c, + break; + } + +- ret = check_path(&trans, lostfound, &path, &u); ++ ret = check_path(&trans, &path, &u); + if (ret) + break; + } +@@ -1190,7 +1194,6 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, + } + + static int check_inode_nlink(struct btree_trans *trans, +- struct bch_inode_unpacked *lostfound_inode, + struct btree_iter *iter, + struct bkey_s_c_inode inode, + unsigned nlink) +@@ -1238,7 +1241,6 @@ fsck_err: + + noinline_for_stack + static int bch2_gc_walk_inodes(struct bch_fs *c, +- struct bch_inode_unpacked *lostfound_inode, + nlink_table *links, + u64 range_start, u64 range_end) + { +@@ -1259,7 +1261,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, + continue; + + link = genradix_ptr(links, k.k->p.offset - range_start); +- ret = check_inode_nlink(&trans, lostfound_inode, iter, ++ ret = check_inode_nlink(&trans, iter, + bkey_s_c_to_inode(k), link ? 
link->count : 0); + if (ret) + break; +@@ -1275,8 +1277,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, + } + + noinline_for_stack +-static int check_nlinks(struct bch_fs *c, +- struct bch_inode_unpacked *lostfound_inode) ++static int check_nlinks(struct bch_fs *c) + { + nlink_table links; + u64 this_iter_range_start, next_iter_range_start = 0; +@@ -1296,7 +1297,7 @@ static int check_nlinks(struct bch_fs *c, + if (ret) + break; + +- ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, ++ ret = bch2_gc_walk_inodes(c, &links, + this_iter_range_start, + next_iter_range_start); + if (ret) +@@ -1316,16 +1317,15 @@ static int check_nlinks(struct bch_fs *c, + */ + int bch2_fsck_full(struct bch_fs *c) + { +- struct bch_inode_unpacked root_inode, lostfound_inode; ++ struct bch_inode_unpacked root_inode; + + return check_inodes(c, true) ?: + check_extents(c) ?: + check_dirents(c) ?: + check_xattrs(c) ?: + check_root(c, &root_inode) ?: +- check_lostfound(c, &root_inode, &lostfound_inode) ?: +- check_directory_structure(c, &lostfound_inode) ?: +- check_nlinks(c, &lostfound_inode); ++ check_directory_structure(c) ?: ++ check_nlinks(c); + } + + int bch2_fsck_walk_inodes_only(struct bch_fs *c) +-- +cgit v1.2.3 + + +From 70194433e29d212908675ea6dc03dd772f966420 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 23 Apr 2021 16:18:43 -0400 +Subject: bcachefs: Fix repair leading to replicas not marked + +bch2_check_fix_ptrs() was being called after checking if the replicas +set was marked - but repair could change which replicas set needed to be +marked. Oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 5c45a693d512..604772cb479f 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -330,6 +330,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + BUG_ON(bch2_journal_seq_verify && + k->k->version.lo > journal_cur_seq(&c->journal)); + ++ ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); ++ if (ret) ++ goto err; ++ + if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, + "key version number higher than recorded: %llu > %llu", + k->k->version.lo, +@@ -346,8 +350,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + goto err; + } + } +- +- ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); + } + + ptrs = bch2_bkey_ptrs_c(*k); +-- +cgit v1.2.3 + + +From e53689355d527f34b8d928350aed6828ec659516 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 23 Apr 2021 16:05:49 -0400 +Subject: bcachefs: Don't BUG_ON() btree topology error + +This replaces an assertion in the btree merge path with a +bch2_inconsistent_error() - fsck will fix it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 45 ++++++++++++++++++++++++++++++------- + fs/bcachefs/btree_update_interior.c | 14 +++++++++++- + 2 files changed, 50 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 61363d44fb20..05e04c235ce0 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -685,6 +685,41 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) + return b->hash_val == btree_ptr_hash_val(k) ? 
0 : -1; + } + ++static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) ++{ ++ char buf1[100], buf2[100], buf3[100], buf4[100]; ++ ++ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; ++ ++ bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2 ++ ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key ++ : POS_MIN); ++ bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); ++ ++ bch2_bpos_to_text(&PBUF(buf3), b->key.k.p); ++ bch2_bpos_to_text(&PBUF(buf4), b->data->max_key); ++ bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" ++ "btree: ptr %u header %llu\n" ++ "level: ptr %u header %llu\n" ++ "min ptr %s node header %s\n" ++ "max ptr %s node header %s", ++ b->c.btree_id, BTREE_NODE_ID(b->data), ++ b->c.level, BTREE_NODE_LEVEL(b->data), ++ buf1, buf2, buf3, buf4); ++} ++ ++static inline void btree_check_header(struct bch_fs *c, struct btree *b) ++{ ++ if (b->c.btree_id != BTREE_NODE_ID(b->data) || ++ b->c.level != BTREE_NODE_LEVEL(b->data) || ++ bpos_cmp(b->data->max_key, b->key.k.p) || ++ (b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ bpos_cmp(b->data->min_key, ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) ++ btree_bad_header(c, b); ++} ++ + /** + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. +@@ -802,10 +837,7 @@ lock_node: + + EBUG_ON(b->c.btree_id != iter->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); +- EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); +- EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && +- bpos_cmp(b->data->min_key, +- bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); ++ btree_check_header(c, b); + + return b; + } +@@ -885,10 +917,7 @@ lock_node: + + EBUG_ON(b->c.btree_id != btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); +- EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); +- EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && +- bpos_cmp(b->data->min_key, +- bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); ++ btree_check_header(c, b); + out: + bch2_btree_cache_cannibalize_unlock(c); + return b; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 6b8d3d0f3d2d..5c86e76f5079 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1606,7 +1606,19 @@ retry: + next = m; + } + +- BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)); ++ if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { ++ char buf1[100], buf2[100]; ++ ++ bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); ++ bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); ++ bch2_fs_inconsistent(c, ++ "btree topology error in btree merge:\n" ++ "prev ends at %s\n" ++ "next starts at %s\n", ++ buf1, buf2); ++ ret = -EIO; ++ goto err; ++ } + + bch2_bkey_format_init(&new_s); + bch2_bkey_format_add_pos(&new_s, prev->data->min_key); +-- +cgit v1.2.3 + + +From d24f77812368c0bb78b4cb71bb4fb8665f41f304 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Apr 2021 00:38:16 -0400 +Subject: bcachefs: Use mmap() instead of vmalloc_exec() in userspace + +Calling mmap() directly is much better than malloc() then mprotect(), we +end up with much less address space fragmentation. 
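For readers who only build the userspace tools: the #else branch added below replaces malloc() plus mprotect() with a single anonymous mmap() carrying the final protection bits (the aux_data buffer needs to be executable, matching what vmalloc_exec() provides in-kernel). A minimal standalone sketch of that call, not bcachefs code - it passes -1 for the fd, the conventional portable form with MAP_ANONYMOUS:

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 1 << 16;
		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		memset(buf, 0, len);	/* one mapping, already executable - no mprotect() step */
		munmap(buf, len);
		return 0;
	}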
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 05e04c235ce0..db7052564c7d 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -33,21 +33,21 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc) + return max_t(int, 0, bc->used - bc->reserve); + } + +-static void __btree_node_data_free(struct bch_fs *c, struct btree *b) ++static void btree_node_data_free(struct bch_fs *c, struct btree *b) + { ++ struct btree_cache *bc = &c->btree_cache; ++ + EBUG_ON(btree_node_write_in_flight(b)); + + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; ++#ifdef __KERNEL__ + vfree(b->aux_data); ++#else ++ munmap(b->aux_data, btree_aux_data_bytes(b)); ++#endif + b->aux_data = NULL; +-} + +-static void btree_node_data_free(struct bch_fs *c, struct btree *b) +-{ +- struct btree_cache *bc = &c->btree_cache; +- +- __btree_node_data_free(c, b); + bc->used--; + list_move(&b->list, &bc->freed); + } +@@ -75,8 +75,15 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) + b->data = kvpmalloc(btree_bytes(c), gfp); + if (!b->data) + return -ENOMEM; +- ++#ifdef __KERNEL__ + b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); ++#else ++ b->aux_data = mmap(NULL, btree_aux_data_bytes(b), ++ PROT_READ|PROT_WRITE|PROT_EXEC, ++ MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); ++ if (b->aux_data == MAP_FAILED) ++ b->aux_data = NULL; ++#endif + if (!b->aux_data) { + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; +-- +cgit v1.2.3 + + +From 6178dc8b6d01c8f23f57a7d991ea4f138bcb57ba Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Apr 2021 00:42:02 -0400 +Subject: bcachefs: Fix an out of bounds read + +bch2_varint_decode() can read up to 7 bytes past the end of the buffer, +which means we need to allocate slightly larger key cache buffers. 
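The rule both hunks below apply is easier to see on its own: pad the allocation by one u64, since 8 spare bytes cover the up-to-7-byte overread, then round up to a power of two as before. A small sketch under those assumptions (the helper is written out here; the kernel has its own roundup_pow_of_two()):

	#include <stdio.h>

	static unsigned roundup_pow_of_two(unsigned n)
	{
		unsigned r = 1;

		while (r < n)
			r <<= 1;
		return r;
	}

	/* size (in u64s) to allocate for a cached key of key_u64s u64s */
	static unsigned key_buf_u64s(unsigned key_u64s)
	{
		unsigned padded = key_u64s + 1;	/* decoder may read 7 bytes past the end */

		return roundup_pow_of_two(padded);
	}

	int main(void)
	{
		printf("key of 6 u64s -> buffer of %u u64s\n", key_buf_u64s(6));
		return 0;
	}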
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 10 ++++++++-- + fs/bcachefs/btree_update_leaf.c | 6 ++++++ + 2 files changed, 14 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 8dec32057385..a5181a96397a 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -218,8 +218,14 @@ static int btree_key_cache_fill(struct btree_trans *trans, + goto err; + } + +- if (k.k->u64s > ck->u64s) { +- new_u64s = roundup_pow_of_two(k.k->u64s); ++ /* ++ * bch2_varint_decode can read past the end of the buffer by at ++ * most 7 bytes (it won't be used): ++ */ ++ new_u64s = k.k->u64s + 1; ++ ++ if (new_u64s > ck->u64s) { ++ new_u64s = roundup_pow_of_two(new_u64s); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { + ret = -ENOMEM; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index afdcc98dfb83..b793ab77e452 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -293,6 +293,12 @@ btree_key_can_insert_cached(struct btree_trans *trans, + !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + ++ /* ++ * bch2_varint_decode can read past the end of the buffer by at most 7 ++ * bytes (it won't be used): ++ */ ++ u64s += 1; ++ + if (u64s <= ck->u64s) + return BTREE_INSERT_OK; + +-- +cgit v1.2.3 + + +From a66e1ec34d501011706f2d22498103b20baa5816 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Apr 2021 00:59:29 -0400 +Subject: bcachefs: Fix bch2_verify_keylist_sorted + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/keylist.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c +index 864dfaa67b7a..cda77835b9ea 100644 +--- a/fs/bcachefs/keylist.c ++++ b/fs/bcachefs/keylist.c +@@ -62,6 +62,6 @@ void bch2_verify_keylist_sorted(struct keylist *l) + + for_each_keylist_key(l, k) + BUG_ON(bkey_next(k) != l->top && +- bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); ++ bpos_cmp(k->k.p, bkey_next(k)->k.p) >= 0); + } + #endif +-- +cgit v1.2.3 + + +From de38390234107ff97ece4fb0604f4e7c67c165bd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Apr 2021 02:47:41 -0400 +Subject: bcachefs: Rewrite btree nodes with errors + +This patch adds self healing functionality for btree nodes - if we +notice a problem when reading a btree node, we just rewrite it. 
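A toy model of the control flow added below, with stand-in names rather than the bcachefs API: remember whether any read attempt failed, and if the node turned out to be readable anyway, queue a background rewrite so the bad copy gets replaced on disk:

	#include <stdbool.h>
	#include <stdio.h>

	static int attempts_left = 2;

	static bool try_read(void)      { return --attempts_left == 0; }  /* first replica fails */
	static void rewrite_async(void) { puts("queueing background rewrite"); }

	int main(void)
	{
		bool saw_error = false;

		while (!try_read())
			saw_error = true;	/* retry from another replica */

		if (saw_error)
			rewrite_async();	/* readable in the end: self-heal on disk */
		return 0;
	}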
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 7 ++++++ + fs/bcachefs/btree_update.h | 1 + + fs/bcachefs/btree_update_interior.c | 50 +++++++++++++++++++++++++++++++++++++ + 3 files changed, 58 insertions(+) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 2de31a6b9661..05418b51b8a5 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -986,6 +986,7 @@ static void btree_node_read_work(struct work_struct *work) + struct bch_io_failures failed = { .nr = 0 }; + char buf[200]; + struct printbuf out; ++ bool saw_error = false; + bool can_retry; + + goto start; +@@ -1023,6 +1024,8 @@ start: + !bch2_btree_node_read_done(c, ca, b, can_retry)) + break; + ++ saw_error = true; ++ + if (!can_retry) { + set_btree_node_read_error(b); + break; +@@ -1032,6 +1035,10 @@ start: + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); ++ ++ if (saw_error && !btree_node_read_error(b)) ++ bch2_btree_node_rewrite_async(c, b); ++ + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); + } +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 0c7caa7e91a0..56131ac516ce 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -72,6 +72,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + + int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, + __le64, unsigned); ++void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); + int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, + struct btree *, struct bkey_i *); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 5c86e76f5079..c4dc581ab2ab 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1797,6 +1797,56 @@ out: + return ret; + } + ++struct async_btree_rewrite { ++ struct bch_fs *c; ++ struct work_struct work; ++ enum btree_id btree_id; ++ unsigned level; ++ struct bpos pos; ++ __le64 seq; ++}; ++ ++void async_btree_node_rewrite_work(struct work_struct *work) ++{ ++ struct async_btree_rewrite *a = ++ container_of(work, struct async_btree_rewrite, work); ++ struct bch_fs *c = a->c; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos, ++ BTREE_MAX_DEPTH, a->level, 0); ++ bch2_btree_node_rewrite(c, iter, a->seq, 0); ++ bch2_trans_iter_put(&trans, iter); ++ bch2_trans_exit(&trans); ++ percpu_ref_put(&c->writes); ++ kfree(a); ++} ++ ++void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) ++{ ++ struct async_btree_rewrite *a; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return; ++ ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (!a) { ++ percpu_ref_put(&c->writes); ++ return; ++ } ++ ++ a->c = c; ++ a->btree_id = b->c.btree_id; ++ a->level = b->c.level; ++ a->pos = b->key.k.p; ++ a->seq = b->data->keys.seq; ++ ++ INIT_WORK(&a->work, async_btree_node_rewrite_work); ++ queue_work(system_long_wq, &a->work); ++} ++ + static void __bch2_btree_node_update_key(struct bch_fs *c, + struct btree_update *as, + struct btree_iter *iter, +-- +cgit v1.2.3 + + +From b0254c710dd01adc0c3262522f62951e10511814 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 23 Apr 2021 19:25:27 -0400 +Subject: bcachefs: New helper __bch2_btree_insert_keys_interior() + +Consolidate common parts of bch2_btree_insert_keys_interior() and 
+btree_split_insert_keys() - prep work for adding some new topology +assertions. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 48 ++++++++++++++++++------------------- + 1 file changed, 23 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index c4dc581ab2ab..43d5f09fe2f4 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1154,6 +1154,27 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + set_btree_node_need_write(b); + } + ++static void ++__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys, ++ struct btree_node_iter node_iter) ++{ ++ struct bkey_i *insert = bch2_keylist_front(keys); ++ struct bkey_packed *k; ++ ++ BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); ++ ++ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && ++ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ++ ; ++ ++ while (!bch2_keylist_empty(keys)) { ++ bch2_insert_fixup_btree_ptr(as, b, iter, ++ bch2_keylist_front(keys), &node_iter); ++ bch2_keylist_pop_front(keys); ++ } ++} ++ + /* + * Move keys from n1 (original replacement node, now lower node) to n2 (higher + * node) +@@ -1284,16 +1305,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + struct bkey_packed *src, *dst, *n; + struct bset *i; + +- BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); +- + bch2_btree_node_iter_init(&node_iter, b, &k->k.p); + +- while (!bch2_keylist_empty(keys)) { +- k = bch2_keylist_front(keys); +- +- bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); +- bch2_keylist_pop_front(keys); +- } ++ __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter); + + /* + * We can't tolerate whiteouts here - with whiteouts there can be +@@ -1439,24 +1453,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + struct btree_iter *iter, struct keylist *keys) + { + struct btree_iter *linked; +- struct btree_node_iter node_iter; +- struct bkey_i *insert = bch2_keylist_front(keys); +- struct bkey_packed *k; +- +- /* Don't screw up @iter's position: */ +- node_iter = iter->l[b->c.level].iter; +- +- /* +- * btree_split(), btree_gc_coalesce() will insert keys before +- * the iterator's current position - they know the keys go in +- * the node the iterator points to: +- */ +- while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && +- (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) +- ; + +- for_each_keylist_key(keys, insert) +- bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); ++ __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter); + + btree_update_updated_node(as, b); + +-- +cgit v1.2.3 + + +From 6952b69e6d587c8d4d3139d0b5e95ec579d86b4c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Apr 2021 18:02:59 -0400 +Subject: bcachefs: Fix key cache assertion + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/btree_key_cache.c | 4 +++- + fs/bcachefs/recovery.c | 11 +++++++---- + fs/bcachefs/super.c | 1 + + 4 files changed, 12 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 7d5dbcc62060..c3f7b1401765 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -490,6 +490,7 @@ enum { + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, + BCH_FS_RW, ++ BCH_FS_WAS_RW, + + /* shutdown: */ + BCH_FS_STOPPING, +diff 
--git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index a5181a96397a..a0ff0c3ceb90 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -682,7 +682,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) + kmem_cache_free(bch2_key_cache, ck); + } + +- BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal)); ++ BUG_ON(atomic_long_read(&bc->nr_dirty) && ++ !bch2_journal_error(&c->journal) && ++ test_bit(BCH_FS_WAS_RW, &c->flags)); + BUG_ON(atomic_long_read(&bc->nr_keys)); + + mutex_unlock(&bc->lock); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 2dc3dee4efc8..fe6886e42216 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -973,7 +973,7 @@ int bch2_fs_recovery(struct bch_fs *c) + struct jset *last_journal_entry = NULL; + u64 blacklist_seq, journal_seq; + bool write_sb = false; +- int ret; ++ int ret = 0; + + if (c->sb.clean) + clean = read_superblock_clean(c); +@@ -1253,10 +1253,9 @@ use_clean: + if (c->journal_seq_blacklist_table && + c->journal_seq_blacklist_table->nr > 128) + queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); +-out: ++ + ret = 0; +-err: +-fsck_err: ++out: + set_bit(BCH_FS_FSCK_DONE, &c->flags); + bch2_flush_fsck_errs(c); + +@@ -1270,6 +1269,10 @@ fsck_err: + else + bch_verbose(c, "ret %i", ret); + return ret; ++err: ++fsck_err: ++ bch2_fs_emergency_read_only(c); ++ goto out; + } + + int bch2_fs_initialize(struct bch_fs *c) +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 6c690b4e0918..68be792926da 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -434,6 +434,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + + percpu_ref_reinit(&c->writes); + set_bit(BCH_FS_RW, &c->flags); ++ set_bit(BCH_FS_WAS_RW, &c->flags); + return 0; + err: + __bch2_fs_read_only(c); +-- +cgit v1.2.3 + + +From dd1b17b1b11eac51e0eef26ed00778e9ff578f93 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Apr 2021 16:32:35 -0400 +Subject: bcachefs: New and improved topology repair code + +This splits out btree topology repair into a separate pass, and makes +some improvements: + - When we have to pick which of two overlapping nodes to drop keys + from, we use the btree node header sequence number to preserve the + newer node + + - the gc code has been changed so that it doesn't bail out if we're + continuing/ignoring on fsck error - this way the dump tool can skip + running the repair pass but still walk all reachable metadata + + - add a new superblock flag indicating when a filesystem is known to + have btree topology issues, and the topology repair pass should be + run + + - changing the start/end of a node might mean keys in that node have to + be deleted: this patch handles that better by splitting it out into a + separate function and running it explicitly in the topology repair + code, previously those keys were only being dropped when the btree + node was read in. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 + + fs/bcachefs/bcachefs_format.h | 4 +- + fs/bcachefs/btree_gc.c | 455 +++++++++++++++++++++++++++++------- + fs/bcachefs/btree_io.c | 57 ++++- + fs/bcachefs/btree_io.h | 2 + + fs/bcachefs/btree_update_interior.c | 11 +- + fs/bcachefs/error.c | 18 +- + fs/bcachefs/error.h | 3 + + fs/bcachefs/recovery.c | 3 +- + fs/bcachefs/super-io.c | 7 + + fs/bcachefs/super.c | 5 + + 11 files changed, 470 insertions(+), 98 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index c3f7b1401765..b3a93e4b399f 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -486,6 +486,7 @@ enum { + BCH_FS_ALLOCATOR_RUNNING, + BCH_FS_ALLOCATOR_STOPPING, + BCH_FS_INITIAL_GC_DONE, ++ BCH_FS_INITIAL_GC_UNFIXED, + BCH_FS_BTREE_INTERIOR_REPLAY_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, +@@ -499,7 +500,9 @@ enum { + + /* errors: */ + BCH_FS_ERROR, ++ BCH_FS_TOPOLOGY_ERROR, + BCH_FS_ERRORS_FIXED, ++ BCH_FS_ERRORS_NOT_FIXED, + + /* misc: */ + BCH_FS_NEED_ANOTHER_GC, +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index ead7268bf898..d640a3115adc 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1313,12 +1313,10 @@ LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); + LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); + + LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); ++LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); + +-/* bit 61 was reflink option */ + LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); + +-/* 61-64 unused */ +- + LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); + LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); + LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 604772cb479f..5f58b316b049 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -66,8 +66,6 @@ static int bch2_gc_check_topology(struct bch_fs *c, + ? 
node_start + : bpos_successor(prev->k->k.p); + char buf1[200], buf2[200]; +- bool update_min = false; +- bool update_max = false; + int ret = 0; + + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { +@@ -81,83 +79,341 @@ static int bch2_gc_check_topology(struct bch_fs *c, + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); + } + +- if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c, +- "btree node with incorrect min_key at btree %s level %u:\n" +- " prev %s\n" +- " cur %s", +- bch2_btree_ids[b->c.btree_id], b->c.level, +- buf1, +- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) +- update_min = true; ++ if (bpos_cmp(expected_start, bp->v.min_key)) { ++ bch2_topology_error(c); ++ ++ if (fsck_err(c, "btree node with incorrect min_key at btree %s level %u:\n" ++ " prev %s\n" ++ " cur %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1, ++ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { ++ bch_info(c, "Halting mark and sweep to start topology repair pass"); ++ return FSCK_ERR_START_TOPOLOGY_REPAIR; ++ } else { ++ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); ++ } ++ } ++ } ++ ++ if (is_last && bpos_cmp(cur.k->k.p, node_end)) { ++ bch2_topology_error(c); ++ ++ if (fsck_err(c, "btree node with incorrect max_key at btree %s level %u:\n" ++ " %s\n" ++ " expected %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), ++ (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { ++ bch_info(c, "Halting mark and sweep to start topology repair pass"); ++ return FSCK_ERR_START_TOPOLOGY_REPAIR; ++ } else { ++ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); ++ } ++ } ++ ++ bch2_bkey_buf_copy(prev, c, cur.k); ++fsck_err: ++ return ret; ++} ++ ++static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) ++{ ++ switch (b->key.k.type) { ++ case KEY_TYPE_btree_ptr: { ++ struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key); ++ ++ dst->k.p = src->k.p; ++ dst->v.mem_ptr = 0; ++ dst->v.seq = b->data->keys.seq; ++ dst->v.sectors_written = 0; ++ dst->v.flags = 0; ++ dst->v.min_key = b->data->min_key; ++ set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k)); ++ memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k)); ++ break; ++ } ++ case KEY_TYPE_btree_ptr_v2: ++ bkey_copy(&dst->k_i, &b->key); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) ++{ ++ struct bkey_i_btree_ptr_v2 *new; ++ int ret; ++ ++ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ btree_ptr_to_v2(b, new); ++ b->data->min_key = new_min; ++ new->v.min_key = new_min; ++ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); ++ ++ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ if (ret) { ++ kfree(new); ++ return ret; ++ } ++ ++ bch2_btree_node_drop_keys_outside_node(b); ++ ++ return 0; ++} ++ ++static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) ++{ ++ struct bkey_i_btree_ptr_v2 *new; ++ int ret; ++ ++ ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); ++ if (ret) ++ return ret; ++ ++ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ btree_ptr_to_v2(b, new); ++ b->data->max_key = new_max; ++ new->k.p = new_max; ++ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); ++ ++ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 
1, &new->k_i); ++ if (ret) { ++ kfree(new); ++ return ret; ++ } ++ ++ bch2_btree_node_drop_keys_outside_node(b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, &new->k_i); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ return 0; ++} ++ ++static int btree_repair_node_start(struct bch_fs *c, struct btree *b, ++ struct btree *prev, struct btree *cur) ++{ ++ struct bpos expected_start = !prev ++ ? b->data->min_key ++ : bpos_successor(prev->key.k.p); ++ char buf1[200], buf2[200]; ++ int ret = 0; ++ ++ if (!prev) { ++ struct printbuf out = PBUF(buf1); ++ pr_buf(&out, "start of node: "); ++ bch2_bpos_to_text(&out, b->data->min_key); ++ } else { ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key)); ++ } ++ ++ if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, ++ "btree node with incorrect min_key at btree %s level %u:\n" ++ " prev %s\n" ++ " cur %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1, ++ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)), buf2))) { ++ if (prev && ++ bpos_cmp(expected_start, cur->data->min_key) > 0 && ++ BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) ++ ret = set_node_max(c, prev, ++ bpos_predecessor(cur->data->min_key)); ++ else ++ ret = set_node_min(c, cur, expected_start); ++ if (ret) ++ return ret; + } ++fsck_err: ++ return ret; ++} + +- if (fsck_err_on(is_last && +- bpos_cmp(cur.k->k.p, node_end), c, ++static int btree_repair_node_end(struct bch_fs *c, struct btree *b, ++ struct btree *child) ++{ ++ char buf1[200], buf2[200]; ++ int ret = 0; ++ ++ if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, +- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), +- (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) +- update_max = true; ++ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1), ++ (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) { ++ ret = set_node_max(c, child, b->key.k.p); ++ if (ret) ++ return ret; ++ } ++fsck_err: ++ return ret; ++} + +- bch2_bkey_buf_copy(prev, c, cur.k); ++#define DROP_THIS_NODE 10 + +- if (update_min || update_max) { +- struct bkey_i *new; +- struct bkey_i_btree_ptr_v2 *bp = NULL; +- struct btree *n; ++static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ struct bkey_buf tmp; ++ struct btree *prev = NULL, *cur = NULL; ++ bool have_child, dropped_children = false; ++ char buf[200]; ++ int ret = 0; + +- if (update_max) { ++ if (!b->c.level) ++ return 0; ++again: ++ have_child = dropped_children = false; ++ bch2_bkey_buf_init(&tmp); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ ++ cur = bch2_btree_node_get_noiter(c, tmp.k, ++ b->c.btree_id, b->c.level - 1, ++ false); ++ ret = PTR_ERR_OR_ZERO(cur); ++ ++ if (mustfix_fsck_err_on(ret == -EIO, c, ++ "Unreadable btree node at btree %s level %u:\n" ++ " %s", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level - 1, ++ 
(bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) { + ret = bch2_journal_key_delete(c, b->c.btree_id, +- b->c.level, cur.k->k.p); ++ b->c.level, tmp.k->k.p); + if (ret) +- return ret; ++ goto err; ++ continue; + } + +- new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL); +- if (!new) { +- bch_err(c, "%s: error allocating new key", __func__); +- return -ENOMEM; ++ if (ret) { ++ bch_err(c, "%s: error %i getting btree node", ++ __func__, ret); ++ break; + } + +- bkey_copy(new, cur.k); ++ ret = btree_repair_node_start(c, b, prev, cur); ++ if (prev) ++ six_unlock_read(&prev->c.lock); ++ prev = cur; ++ cur = NULL; ++ ++ if (ret) ++ break; ++ } ++ ++ if (!ret && !IS_ERR_OR_NULL(prev)) { ++ BUG_ON(cur); ++ ret = btree_repair_node_end(c, b, prev); ++ } ++ ++ if (!IS_ERR_OR_NULL(prev)) ++ six_unlock_read(&prev->c.lock); ++ prev = NULL; ++ if (!IS_ERR_OR_NULL(cur)) ++ six_unlock_read(&cur->c.lock); ++ cur = NULL; + +- if (new->k.type == KEY_TYPE_btree_ptr_v2) +- bp = bkey_i_to_btree_ptr_v2(new); ++ if (ret) ++ goto err; ++ ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ bch2_btree_and_journal_iter_advance(&iter); + +- if (update_min) +- bp->v.min_key = expected_start; +- if (update_max) +- new->k.p = node_end; +- if (bp) +- SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true); ++ cur = bch2_btree_node_get_noiter(c, tmp.k, ++ b->c.btree_id, b->c.level - 1, ++ false); ++ ret = PTR_ERR_OR_ZERO(cur); + +- ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new); + if (ret) { +- kfree(new); +- return ret; ++ bch_err(c, "%s: error %i getting btree node", ++ __func__, ret); ++ goto err; + } + +- n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id, +- b->c.level - 1, true); +- if (n) { +- mutex_lock(&c->btree_cache.lock); +- bch2_btree_node_hash_remove(&c->btree_cache, n); +- +- bkey_copy(&n->key, new); +- if (update_min) +- n->data->min_key = expected_start; +- if (update_max) +- n->data->max_key = node_end; +- +- ret = __bch2_btree_node_hash_insert(&c->btree_cache, n); +- BUG_ON(ret); +- mutex_unlock(&c->btree_cache.lock); +- six_unlock_read(&n->c.lock); ++ ret = bch2_btree_repair_topology_recurse(c, cur); ++ six_unlock_read(&cur->c.lock); ++ cur = NULL; ++ ++ if (ret == DROP_THIS_NODE) { ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, tmp.k->k.p); ++ dropped_children = true; + } ++ ++ if (ret) ++ goto err; ++ ++ have_child = true; + } ++ ++ if (mustfix_fsck_err_on(!have_child, c, ++ "empty interior btree node at btree %s level %u\n" ++ " %s", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf))) ++ ret = DROP_THIS_NODE; ++err: + fsck_err: ++ if (!IS_ERR_OR_NULL(prev)) ++ six_unlock_read(&prev->c.lock); ++ if (!IS_ERR_OR_NULL(cur)) ++ six_unlock_read(&cur->c.lock); ++ ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_bkey_buf_exit(&tmp, c); ++ ++ if (!ret && dropped_children) ++ goto again; ++ ++ return ret; ++} ++ ++static int bch2_repair_topology(struct bch_fs *c) ++{ ++ struct btree *b; ++ unsigned i; ++ int ret = 0; ++ ++ for (i = 0; i < BTREE_ID_NR && !ret; i++) { ++ b = c->btree_roots[i].b; ++ if (btree_node_fake(b)) ++ continue; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ ret = bch2_btree_repair_topology_recurse(c, b); ++ six_unlock_read(&b->c.lock); ++ ++ if (ret == DROP_THIS_NODE) { ++ bch_err(c, "empty btree root - repair 
unimplemented"); ++ ret = FSCK_ERR_EXIT; ++ } ++ } ++ + return ret; + } + +@@ -483,6 +739,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + struct bkey_s_c k; + struct bkey_buf cur, prev; + u8 max_stale = 0; ++ char buf[200]; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); +@@ -498,7 +755,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + &k, &max_stale, true); + if (ret) { + bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); +- break; ++ goto fsck_err; + } + + if (b->c.level) { +@@ -511,7 +768,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + &prev, cur, + !bch2_btree_and_journal_iter_peek(&iter).k); + if (ret) +- break; ++ goto fsck_err; + } else { + bch2_btree_and_journal_iter_advance(&iter); + } +@@ -532,18 +789,25 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + false); + ret = PTR_ERR_OR_ZERO(child); + +- if (fsck_err_on(ret == -EIO, c, +- "unreadable btree node")) { +- ret = bch2_journal_key_delete(c, b->c.btree_id, +- b->c.level, cur.k->k.p); +- if (ret) +- return ret; +- +- set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); +- continue; +- } +- +- if (ret) { ++ if (ret == -EIO) { ++ bch2_topology_error(c); ++ ++ if (fsck_err(c, "Unreadable btree node at btree %s level %u:\n" ++ " %s", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level - 1, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) { ++ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ bch_info(c, "Halting mark and sweep to start topology repair pass"); ++ goto fsck_err; ++ } else { ++ /* Continue marking when opted to not ++ * fix the error: */ ++ ret = 0; ++ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); ++ continue; ++ } ++ } else if (ret) { + bch_err(c, "%s: error %i getting btree node", + __func__, ret); + break; +@@ -583,16 +847,20 @@ static int bch2_gc_btree_init(struct bch_fs *c, + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); +- if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, ++ if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, + "btree root with incorrect min_key: %s", + (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { +- BUG(); ++ bch_err(c, "repair unimplemented"); ++ ret = FSCK_ERR_EXIT; ++ goto fsck_err; + } + +- if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, ++ if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, + "btree root with incorrect max_key: %s", + (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { +- BUG(); ++ bch_err(c, "repair unimplemented"); ++ ret = FSCK_ERR_EXIT; ++ goto fsck_err; + } + + if (b->c.level >= target_depth) +@@ -607,7 +875,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, + fsck_err: + six_unlock_read(&b->c.lock); + +- if (ret) ++ if (ret < 0) + bch_err(c, "%s: ret %i", __func__, ret); + return ret; + } +@@ -622,23 +890,20 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) + { + enum btree_id ids[BTREE_ID_NR]; + unsigned i; ++ int ret = 0; + + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); + +- for (i = 0; i < BTREE_ID_NR; i++) { +- enum btree_id id = ids[i]; +- int ret = initial +- ? bch2_gc_btree_init(c, id, metadata_only) +- : bch2_gc_btree(c, id, initial, metadata_only); +- if (ret) { +- bch_err(c, "%s: ret %i", __func__, ret); +- return ret; +- } +- } ++ for (i = 0; i < BTREE_ID_NR && !ret; i++) ++ ret = initial ++ ? 
bch2_gc_btree_init(c, ids[i], metadata_only) ++ : bch2_gc_btree(c, ids[i], initial, metadata_only); + +- return 0; ++ if (ret < 0) ++ bch_err(c, "%s: ret %i", __func__, ret); ++ return ret; + } + + static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, +@@ -1025,7 +1290,27 @@ again: + + bch2_mark_superblocks(c); + ++ if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) && ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && ++ c->opts.fix_errors != FSCK_OPT_NO) { ++ bch_info(c, "starting topology repair pass"); ++ ret = bch2_repair_topology(c); ++ if (ret) ++ goto out; ++ bch_info(c, "topology repair pass done"); ++ } ++ + ret = bch2_gc_btrees(c, initial, metadata_only); ++ ++ if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR && ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ ret = 0; ++ } ++ ++ if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR) ++ ret = FSCK_ERR_EXIT; ++ + if (ret) + goto out; + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 05418b51b8a5..226b19b6e467 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -558,6 +558,54 @@ out: \ + + #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) + ++/* ++ * When btree topology repair changes the start or end of a node, that might ++ * mean we have to drop keys that are no longer inside the node: ++ */ ++void bch2_btree_node_drop_keys_outside_node(struct btree *b) ++{ ++ struct bset_tree *t; ++ struct bkey_s_c k; ++ struct bkey unpacked; ++ struct btree_node_iter iter; ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k; ++ ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) ++ if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) ++ break; ++ ++ if (k != i->start) { ++ unsigned shift = (u64 *) k - (u64 *) i->start; ++ ++ memmove_u64s_down(i->start, k, ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ } ++ ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) ++ if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) ++ break; ++ ++ if (k != vstruct_last(i)) { ++ i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ } ++ } ++ ++ bch2_btree_build_aux_trees(b); ++ ++ for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { ++ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ++ } ++} ++ + static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned sectors, int write, bool have_retry) +@@ -680,6 +728,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + { + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; ++ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + int ret = 0; + + for (k = i->start; +@@ -713,7 +763,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + u = __bkey_disassemble(b, k, &tmp); + + invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: +- bch2_bkey_in_btree_node(b, u.s_c) ?: ++ (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: + (write ? 
bch2_bkey_val_invalid(c, u.s_c) : NULL); + if (invalid) { + char buf[160]; +@@ -770,6 +820,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + struct bch_extent_ptr *ptr; + struct bset *i; + bool used_mempool, blacklisted; ++ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + unsigned u64s; + int ret, retry_read = 0, write = READ; + +@@ -917,6 +969,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + + btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + ++ if (updated_range) ++ bch2_btree_node_drop_keys_outside_node(b); ++ + i = &b->data->keys; + for (k = i->start; k != vstruct_last(i);) { + struct bkey tmp; +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index c8a8b05a19b0..cadcf7f886d7 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -131,6 +131,8 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse + + void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); + ++void bch2_btree_node_drop_keys_outside_node(struct btree *); ++ + void bch2_btree_build_aux_trees(struct btree *); + void bch2_btree_init_next(struct bch_fs *, struct btree *, + struct btree_iter *); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 43d5f09fe2f4..d7a1abf44f41 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1609,11 +1609,12 @@ retry: + + bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); + bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); +- bch2_fs_inconsistent(c, +- "btree topology error in btree merge:\n" +- "prev ends at %s\n" +- "next starts at %s\n", +- buf1, buf2); ++ bch_err(c, ++ "btree topology error in btree merge:\n" ++ " prev ends at %s\n" ++ " next starts at %s", ++ buf1, buf2); ++ bch2_topology_error(c); + ret = -EIO; + goto err; + } +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +index a8ee1db8aa39..90c3b986c264 100644 +--- a/fs/bcachefs/error.c ++++ b/fs/bcachefs/error.c +@@ -25,6 +25,13 @@ bool bch2_inconsistent_error(struct bch_fs *c) + } + } + ++void bch2_topology_error(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); ++ if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ bch2_inconsistent_error(c); ++} ++ + void bch2_fatal_error(struct bch_fs *c) + { + if (bch2_fs_emergency_read_only(c)) +@@ -74,9 +81,13 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, + vprintk(fmt, args); + va_end(args); + +- return bch2_inconsistent_error(c) +- ? FSCK_ERR_EXIT +- : FSCK_ERR_FIX; ++ if (c->opts.errors == BCH_ON_ERROR_continue) { ++ bch_err(c, "fixing"); ++ return FSCK_ERR_FIX; ++ } else { ++ bch2_inconsistent_error(c); ++ return FSCK_ERR_EXIT; ++ } + } + + mutex_lock(&c->fsck_error_lock); +@@ -146,6 +157,7 @@ print: + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + return FSCK_ERR_FIX; + } else { ++ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + set_bit(BCH_FS_ERROR, &c->flags); + return c->opts.fix_errors == FSCK_OPT_EXIT || + !(flags & FSCK_CAN_IGNORE) +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index 0e49fd728e44..d8cd19b3f63c 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -29,6 +29,8 @@ struct work_struct; + + bool bch2_inconsistent_error(struct bch_fs *); + ++void bch2_topology_error(struct bch_fs *); ++ + #define bch2_fs_inconsistent(c, ...) 
\ + ({ \ + bch_err(c, __VA_ARGS__); \ +@@ -88,6 +90,7 @@ enum fsck_err_ret { + FSCK_ERR_IGNORE = 0, + FSCK_ERR_FIX = 1, + FSCK_ERR_EXIT = 2, ++ FSCK_ERR_START_TOPOLOGY_REPAIR = 3, + }; + + struct fsck_err_state { +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index fe6886e42216..a9ccd14effe7 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1241,8 +1241,9 @@ use_clean: + + if (c->opts.fsck && + !test_bit(BCH_FS_ERROR, &c->flags) && +- BCH_SB_HAS_ERRORS(c->disk_sb.sb)) { ++ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); ++ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); + write_sb = true; + } + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index de8d49e3ef02..11d7167b0129 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -433,6 +433,11 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) + + __copy_super(&c->disk_sb, src); + ++ if (BCH_SB_HAS_ERRORS(c->disk_sb.sb)) ++ set_bit(BCH_FS_ERROR, &c->flags); ++ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) ++ set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); ++ + ret = bch2_sb_replicas_to_cpu_replicas(c); + if (ret) + return ret; +@@ -713,6 +718,8 @@ int bch2_write_super(struct bch_fs *c) + + if (test_bit(BCH_FS_ERROR, &c->flags)) + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); ++ if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) ++ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); + + SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 68be792926da..326ff51e4da6 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -381,6 +381,11 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + unsigned i; + int ret; + ++ if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { ++ bch_err(c, "cannot go rw, unfixed btree errors"); ++ return -EROFS; ++ } ++ + if (test_bit(BCH_FS_RW, &c->flags)) + return 0; + +-- +cgit v1.2.3 + + +From 55ca31fe499aaf44b856fcbb17b05fbc1e4707a1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Apr 2021 22:33:25 -0400 +Subject: bcachefs: Fix a null ptr deref + +Fix a few memory safety issues, found by asan in userspace. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 20 +++++++++++--------- + 1 file changed, 11 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 1ce038846476..b867576b3ffd 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -290,21 +290,24 @@ static int hash_redo_key(struct btree_trans *trans, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c k) + { +- struct bkey_i delete; ++ struct bkey_i *delete; + struct bkey_i *tmp; + ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ if (IS_ERR(delete)) ++ return PTR_ERR(delete); ++ + tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + bkey_reassemble(tmp, k); + +- bkey_init(&delete.k); +- delete.k.p = k_iter->pos; +- bch2_trans_update(trans, k_iter, &delete, 0); ++ bkey_init(&delete->k); ++ delete->k.p = k_iter->pos; ++ bch2_trans_update(trans, k_iter, delete, 0); + +- return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, +- tmp, 0); ++ return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); + } + + static int fsck_hash_delete_at(struct btree_trans *trans, +@@ -377,9 +380,8 @@ static int hash_check_key(struct btree_trans *trans, + return ret; + bad_hash: + if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " +- "hashed to %llu should be at %llu\n%s", +- desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, +- hash, iter->pos.offset, ++ "hashed to %llu\n%s", ++ desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash, + (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) + return 0; + +-- +cgit v1.2.3 + + +From c68bac4644b5d53ba5c6f2975502997f82c1ac2b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 21 Apr 2021 21:08:49 -0400 +Subject: bcachefs: New check_nlinks algorithm for snapshots + +With snapshots, using a radix tree for the table of link counts won't +work anymore because we also need to distinguish between inodes with +different snapshot IDs. Instead, this patch builds up a sorted array of +inodes that have hardlinks that we can binary search on - taking +advantage of the fact that with inode backpointers, the check_nlinks() +pass _only_ needs to concern itself with inodes that have hardlinks now. 
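+
+The idea is easy to picture outside the kernel. The following is only an
+illustrative userspace sketch, not code from this patch: the struct and
+function names are borrowed for readability, and the btree walking and
+kvmalloc machinery is left out. Pass one collects {inum, snapshot} pairs for
+hardlinked inodes and sorts them once; pass two binary-searches the table
+while walking dirents.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+struct nlink { uint64_t inum; uint32_t snapshot; uint32_t count; };
+struct nlink_table { size_t nr, size; struct nlink *d; };
+
+static int add_nlink(struct nlink_table *t, uint64_t inum, uint32_t snapshot)
+{
+    if (t->nr == t->size) {
+        size_t new_size = t->size ? t->size * 2 : 128;
+        struct nlink *d = realloc(t->d, new_size * sizeof(*d));
+
+        if (!d)
+            return -1;          /* the kernel version reports -ENOMEM */
+        t->d = d;
+        t->size = new_size;
+    }
+    t->d[t->nr++] = (struct nlink) { .inum = inum, .snapshot = snapshot };
+    return 0;
+}
+
+static int nlink_cmp(const void *_l, const void *_r)
+{
+    const struct nlink *l = _l, *r = _r;
+
+    if (l->inum != r->inum)
+        return l->inum < r->inum ? -1 : 1;
+    if (l->snapshot != r->snapshot)
+        return l->snapshot < r->snapshot ? -1 : 1;
+    return 0;
+}
+
+static void inc_link(struct nlink_table *t, uint64_t inum, uint32_t snapshot)
+{
+    struct nlink key = { .inum = inum, .snapshot = snapshot };
+    struct nlink *l = bsearch(&key, t->d, t->nr, sizeof(*l), nlink_cmp);
+
+    if (l)
+        l->count++;     /* inodes without hardlinks aren't in the table */
+}
+
+int main(void)
+{
+    struct nlink_table t = { 0 };
+
+    add_nlink(&t, 42, 0);       /* pass 1: inodes 42 and 10 have hardlinks */
+    add_nlink(&t, 10, 0);
+    qsort(t.d, t.nr, sizeof(t.d[0]), nlink_cmp);
+
+    inc_link(&t, 42, 0);        /* pass 2: two dirents point at inode 42 */
+    inc_link(&t, 42, 0);
+
+    for (size_t i = 0; i < t.nr; i++)
+        printf("inum %llu: %u links\n",
+               (unsigned long long) t.d[i].inum, t.d[i].count);
+    free(t.d);
+    return 0;
+}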
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 228 ++++++++++++++++++++++++++++++++++------------------- + 1 file changed, 145 insertions(+), 83 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index b867576b3ffd..8ae4e4c30933 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -12,8 +12,8 @@ + #include "super.h" + #include "xattr.h" + ++#include + #include /* struct qstr */ +-#include + + #define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +@@ -1132,38 +1132,120 @@ static int check_directory_structure(struct bch_fs *c) + return bch2_trans_exit(&trans) ?: ret; + } + +-struct nlink { +- u32 count; +-}; ++struct nlink_table { ++ size_t nr; ++ size_t size; + +-typedef GENRADIX(struct nlink) nlink_table; ++ struct nlink { ++ u64 inum; ++ u32 snapshot; ++ u32 count; ++ } *d; ++}; + +-static void inc_link(struct bch_fs *c, nlink_table *links, +- u64 range_start, u64 *range_end, u64 inum) ++static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot) + { +- struct nlink *link; ++ if (t->nr == t->size) { ++ size_t new_size = max_t(size_t, 128UL, t->size * 2); ++ void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); ++ if (!d) { ++ return -ENOMEM; ++ } + +- if (inum < range_start || inum >= *range_end) +- return; ++ memcpy(d, t->d, t->size * sizeof(t->d[0])); ++ kvfree(t->d); + +- if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) { +- *range_end = inum; +- return; ++ t->d = d; ++ t->size = new_size; + } + +- link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); +- if (!link) { +- bch_verbose(c, "allocation failed during fsck - will need another pass"); +- *range_end = inum; ++ ++ t->d[t->nr++] = (struct nlink) { ++ .inum = inum, ++ .snapshot = snapshot, ++ }; ++ ++ return 0; ++} ++ ++static int nlink_cmp(const void *_l, const void *_r) ++{ ++ const struct nlink *l = _l; ++ const struct nlink *r = _r; ++ ++ return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); ++} ++ ++static void inc_link(struct bch_fs *c, struct nlink_table *links, ++ u64 range_start, u64 range_end, u64 inum) ++{ ++ struct nlink *link, key = { ++ .inum = inum, .snapshot = U32_MAX, ++ }; ++ ++ if (inum < range_start || inum >= range_end) + return; ++ ++ link = __inline_bsearch(&key, links->d, links->nr, ++ sizeof(links->d[0]), nlink_cmp); ++ if (link) ++ link->count++; ++} ++ ++noinline_for_stack ++static int check_nlinks_find_hardlinks(struct bch_fs *c, ++ struct nlink_table *t, ++ u64 start, u64 *end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_inode inode; ++ struct bch_inode_unpacked u; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, ++ POS(0, start), 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ inode = bkey_s_c_to_inode(k); ++ ++ /* ++ * Backpointer and directory structure checks are sufficient for ++ * directories, since they can't have hardlinks: ++ */ ++ if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) ++ continue; ++ ++ /* Should never fail, checked by bch2_inode_invalid: */ ++ BUG_ON(bch2_inode_unpack(inode, &u)); ++ ++ if (!u.bi_nlink) ++ continue; ++ ++ ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot); ++ if (ret) { ++ *end = k.k->p.offset; ++ ret = 0; ++ break; ++ } ++ + } ++ bch2_trans_iter_put(&trans, iter); ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + +- link->count++; ++ return ret; + } + 
+ noinline_for_stack +-static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, +- u64 range_start, u64 *range_end) ++static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, ++ u64 range_start, u64 range_end) + { + struct btree_trans trans; + struct btree_iter *iter; +@@ -1195,80 +1277,58 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, + return ret; + } + +-static int check_inode_nlink(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_s_c_inode inode, +- unsigned nlink) +-{ +- struct bch_fs *c = trans->c; +- struct bch_inode_unpacked u; +- int ret = 0; +- +- /* +- * Backpointer and directory structure checks are sufficient for +- * directories, since they can't have hardlinks: +- */ +- if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) +- return 0; +- +- if (!nlink) { +- bch_err(c, "no links found to inode %llu", inode.k->p.offset); +- return -EINVAL; +- } +- +- ret = bch2_inode_unpack(inode, &u); +- +- /* Should never happen, checked by bch2_inode_invalid: */ +- if (bch2_fs_inconsistent_on(ret, c, +- "error unpacking inode %llu in fsck", +- inode.k->p.inode)) +- return ret; +- +- if (fsck_err_on(bch2_inode_nlink_get(&u) != nlink, c, +- "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", +- u.bi_inum, mode_to_type(u.bi_mode), +- bch2_inode_nlink_get(&u), nlink)) { +- bch2_inode_nlink_set(&u, nlink); +- +- ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_inode_write(trans, iter, &u)); +- if (ret) +- bch_err(c, "error in fsck: error %i updating inode", ret); +- } +-fsck_err: +- return ret; +-} +- + noinline_for_stack +-static int bch2_gc_walk_inodes(struct bch_fs *c, +- nlink_table *links, ++static int check_nlinks_update_hardlinks(struct bch_fs *c, ++ struct nlink_table *links, + u64 range_start, u64 range_end) + { + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; +- struct nlink *link; ++ struct bkey_s_c_inode inode; ++ struct bch_inode_unpacked u; ++ struct nlink *link = links->d; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_inodes, + POS(0, range_start), 0, k, ret) { +- if (!k.k || k.k->p.offset >= range_end) ++ if (k.k->p.offset >= range_end) + break; + + if (k.k->type != KEY_TYPE_inode) + continue; + +- link = genradix_ptr(links, k.k->p.offset - range_start); +- ret = check_inode_nlink(&trans, iter, +- bkey_s_c_to_inode(k), link ? 
link->count : 0); +- if (ret) +- break; ++ inode = bkey_s_c_to_inode(k); ++ if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) ++ continue; ++ ++ BUG_ON(bch2_inode_unpack(inode, &u)); + ++ if (!u.bi_nlink) ++ continue; ++ ++ while (link->inum < k.k->p.offset) { ++ link++; ++ BUG_ON(link >= links->d + links->nr); ++ } ++ ++ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, ++ "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", ++ u.bi_inum, mode_to_type(u.bi_mode), ++ bch2_inode_nlink_get(&u), link->count)) { ++ bch2_inode_nlink_set(&u, link->count); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_inode_write(&trans, iter, &u)); ++ if (ret) ++ bch_err(c, "error in fsck: error %i updating inode", ret); ++ } + } ++fsck_err: + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + +@@ -1281,34 +1341,36 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, + noinline_for_stack + static int check_nlinks(struct bch_fs *c) + { +- nlink_table links; ++ struct nlink_table links = { 0 }; + u64 this_iter_range_start, next_iter_range_start = 0; + int ret = 0; + + bch_verbose(c, "checking inode nlinks"); + +- genradix_init(&links); +- + do { + this_iter_range_start = next_iter_range_start; + next_iter_range_start = U64_MAX; + +- ret = bch2_gc_walk_dirents(c, &links, ++ ret = check_nlinks_find_hardlinks(c, &links, ++ this_iter_range_start, ++ &next_iter_range_start); ++ ++ ret = check_nlinks_walk_dirents(c, &links, + this_iter_range_start, +- &next_iter_range_start); ++ next_iter_range_start); + if (ret) + break; + +- ret = bch2_gc_walk_inodes(c, &links, ++ ret = check_nlinks_update_hardlinks(c, &links, + this_iter_range_start, + next_iter_range_start); + if (ret) + break; + +- genradix_free(&links); ++ links.nr = 0; + } while (next_iter_range_start != U64_MAX); + +- genradix_free(&links); ++ kvfree(links.d); + + return ret; + } +-- +cgit v1.2.3 + + +From 2f5f4a55339ead5dd136ff23646a23c6b206c7ef Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 25 Apr 2021 16:24:03 -0400 +Subject: bcachefs: Evict btree nodes we're deleting + +There was a bug that led to duplicate btree node pointers being inserted +at the wrong level. The new topology repair code can fix that, except +that the btree cache code gets confused when we read in a btree node +from the pointer that was at the wrong level. This patch evicts nodes +that we're deleting to, which nicely solves the problem. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 30 ++++++++++++++++++++++++++++++ + fs/bcachefs/btree_cache.h | 2 ++ + fs/bcachefs/btree_gc.c | 2 ++ + 3 files changed, 34 insertions(+) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index db7052564c7d..eaf9da6e50da 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -947,6 +947,36 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, + bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); + } + ++void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ b = btree_cache_find(bc, k); ++ if (!b) ++ return; ++ ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ __bch2_btree_node_write(c, b); ++ ++ /* wait for any in flight btree write */ ++ btree_node_wait_on_io(b); ++ ++ BUG_ON(btree_node_dirty(b)); ++ ++ mutex_lock(&bc->lock); ++ btree_node_data_free(c, b); ++ bch2_btree_node_hash_remove(bc, b); ++ mutex_unlock(&bc->lock); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ + void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) + { +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index c517cc029454..40dd263a7caa 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -30,6 +30,8 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, + void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, enum btree_id, unsigned); + ++void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); ++ + void bch2_fs_btree_cache_exit(struct bch_fs *); + int bch2_fs_btree_cache_init(struct bch_fs *); + void bch2_fs_btree_cache_init_early(struct btree_cache *); +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 5f58b316b049..12b7c35b9ea5 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -298,6 +298,7 @@ again: + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, + (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) { ++ bch2_btree_node_evict(c, tmp.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, tmp.k->k.p); + if (ret) +@@ -359,6 +360,7 @@ again: + cur = NULL; + + if (ret == DROP_THIS_NODE) { ++ bch2_btree_node_evict(c, tmp.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, tmp.k->k.p); + dropped_children = true; +-- +cgit v1.2.3 + + +From 4a4772c15aaf862d4a17c96ea90d65018fca9d9b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Apr 2021 11:12:17 -0400 +Subject: bcachefs: Fix __bch2_trans_get_iter() + +We need to also set iter->uptodate to indicate it needs to be traversed. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ac5a8737e9f3..658f751b6243 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2086,15 +2086,16 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + btree_iter_get_locks(iter, true, false); + } + +- while (iter->level < depth) { ++ while (iter->level != depth) { + btree_node_unlock(iter, iter->level); + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; +- iter->level++; ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ if (iter->level < depth) ++ iter->level++; ++ else ++ iter->level--; + } + +- while (iter->level > depth) +- iter->l[--iter->level].b = BTREE_ITER_NO_NODE_INIT; +- + iter->min_depth = depth; + + bch2_btree_iter_set_pos(iter, pos); +-- +cgit v1.2.3 + + +From b8e1b3235f0595587ad860ff29f1e995d2488060 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 29 Apr 2021 16:56:17 -0400 +Subject: bcachefs: New tracepoint for bch2_trans_get_iter() + +Trying to debug an issue where after traverse_all() we shouldn't have to +traverse any iterators... yet we are + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 32 ++++++++++++++++------- + include/trace/events/bcachefs.h | 56 +++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 79 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 658f751b6243..8cc8214235c0 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2021,6 +2021,13 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + unsigned flags) + { + struct btree_iter *iter, *best = NULL; ++ struct bpos real_pos, pos_min = POS_MIN; ++ ++ if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && ++ btree_node_type_is_extents(btree_id) && ++ !(flags & BTREE_ITER_NOT_EXTENTS) && ++ !(flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ flags |= BTREE_ITER_IS_EXTENTS; + + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + !btree_type_has_snapshots(btree_id)) +@@ -2030,6 +2037,12 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + pos.snapshot = btree_type_has_snapshots(btree_id) + ? U32_MAX : 0; + ++ real_pos = pos; ++ ++ if ((flags & BTREE_ITER_IS_EXTENTS) && ++ bkey_cmp(pos, POS_MAX)) ++ real_pos = bpos_nosnap_successor(pos); ++ + trans_for_each_iter(trans, iter) { + if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) + continue; +@@ -2038,8 +2051,8 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + continue; + + if (best) { +- int cmp = bkey_cmp(bpos_diff(best->real_pos, pos), +- bpos_diff(iter->real_pos, pos)); ++ int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos), ++ bpos_diff(iter->real_pos, real_pos)); + + if (cmp < 0 || + ((cmp == 0 && btree_iter_keep(trans, iter)))) +@@ -2049,6 +2062,13 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + best = iter; + } + ++ trace_trans_get_iter(_RET_IP_, trans->ip, ++ btree_id, ++ &real_pos, locks_want, ++ best ? &best->real_pos : &pos_min, ++ best ? best->locks_want : 0, ++ best ? 
best->uptodate : BTREE_ITER_NEED_TRAVERSE); ++ + if (!best) { + iter = btree_trans_iter_alloc(trans); + bch2_btree_iter_init(trans, iter, btree_id); +@@ -2062,12 +2082,6 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + trans->iters_live |= 1ULL << iter->idx; + trans->iters_touched |= 1ULL << iter->idx; + +- if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && +- btree_node_type_is_extents(btree_id) && +- !(flags & BTREE_ITER_NOT_EXTENTS) && +- !(flags & BTREE_ITER_ALL_SNAPSHOTS)) +- flags |= BTREE_ITER_IS_EXTENTS; +- + iter->flags = flags; + + iter->snapshot = pos.snapshot; +@@ -2099,7 +2113,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + iter->min_depth = depth; + + bch2_btree_iter_set_pos(iter, pos); +- btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); ++ btree_iter_set_search_pos(iter, real_pos); + + return iter; + } +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index fd4b2f4ef46b..2163651c0fe4 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -528,6 +528,62 @@ TRACE_EVENT(copygc, + __entry->buckets_moved, __entry->buckets_not_moved) + ); + ++TRACE_EVENT(trans_get_iter, ++ TP_PROTO(unsigned long caller, unsigned long ip, ++ enum btree_id btree_id, ++ struct bpos *pos_want, ++ unsigned locks_want, ++ struct bpos *pos_found, ++ unsigned locks_found, ++ unsigned uptodate), ++ TP_ARGS(caller, ip, btree_id, ++ pos_want, locks_want, ++ pos_found, locks_found, ++ uptodate), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, caller ) ++ __field(unsigned long, ip ) ++ __field(u8, btree_id ) ++ __field(u8, uptodate ) ++ __field(u8, locks_want ) ++ __field(u8, locks_found ) ++ __field(u64, pos_want_inode ) ++ __field(u64, pos_want_offset ) ++ __field(u32, pos_want_snapshot ) ++ __field(u64, pos_found_inode ) ++ __field(u64, pos_found_offset ) ++ __field(u32, pos_found_snapshot ) ++ ), ++ ++ TP_fast_assign( ++ __entry->caller = caller; ++ __entry->ip = ip; ++ __entry->btree_id = btree_id; ++ __entry->uptodate = uptodate; ++ __entry->pos_want_inode = pos_want->inode; ++ __entry->pos_want_offset = pos_want->offset; ++ __entry->pos_want_snapshot = pos_want->snapshot; ++ __entry->pos_found_inode = pos_found->inode; ++ __entry->pos_found_offset = pos_found->offset; ++ __entry->pos_found_snapshot = pos_found->snapshot; ++ ), ++ ++ TP_printk("%ps %pS btree %u uptodate %u want %llu:%llu:%u locks %u found %llu:%llu:%u locks %u", ++ (void *) __entry->caller, ++ (void *) __entry->ip, ++ __entry->btree_id, ++ __entry->uptodate, ++ __entry->pos_want_inode, ++ __entry->pos_want_offset, ++ __entry->pos_want_snapshot, ++ __entry->locks_want, ++ __entry->pos_found_inode, ++ __entry->pos_found_offset, ++ __entry->pos_found_snapshot, ++ __entry->locks_found) ++); ++ + TRACE_EVENT(transaction_restart_ip, + TP_PROTO(unsigned long caller, unsigned long ip), + TP_ARGS(caller, ip), +-- +cgit v1.2.3 + + +From 7f0126ac46317231acaa4c37573ba83a2865cdbb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 29 Apr 2021 22:32:44 -0400 +Subject: bcachefs: Call bch2_inconsistent_error() on missing stripe/indirect + extent + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 4 ++++ + fs/bcachefs/io.c | 1 + + 2 files changed, 5 insertions(+) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index c3ad0bc85e78..70008603f047 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -898,6 +898,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, + 
spin_unlock(&c->ec_stripes_heap_lock); + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", + (u64) p.idx); ++ bch2_inconsistent_error(c); + return -EIO; + } + +@@ -1015,6 +1016,7 @@ static int bch2_mark_stripe(struct bch_fs *c, + if (!m || (old_s && !m->alive)) { + bch_err_ratelimited(c, "error marking nonexistent stripe %zu", + idx); ++ bch2_inconsistent_error(c); + return -1; + } + +@@ -1499,6 +1501,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + bch2_fs_inconsistent(c, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx); ++ bch2_inconsistent_error(c); + ret = -EIO; + goto out; + } +@@ -1739,6 +1742,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ bch2_inconsistent_error(c); + ret = -EIO; + goto err; + } +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index c484e58acbec..842f065f8141 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1967,6 +1967,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + k.k->type != KEY_TYPE_indirect_inline_data) { + bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, + "pointer to nonexistent indirect extent"); ++ bch2_inconsistent_error(trans->c); + ret = -EIO; + goto err; + } +-- +cgit v1.2.3 + + +From cc72a65b7c4d9f9ea7164185ef9d994d1c6a5403 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Apr 2021 14:02:00 -0400 +Subject: bcachefs: Change bch2_btree_key_cache_count() to exclude dirty keys + +We're seeing livelocks that appear to be due to +bch2_btree_key_cache_scan repeatedly scanning and blocking other tasks +from using the key cache lock - we probably shouldn't be reporting +objects that can't actually be freed yet. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index a0ff0c3ceb90..dfaf5e6df917 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -645,8 +645,10 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; ++ long nr = atomic_long_read(&bc->nr_keys) - ++ atomic_long_read(&bc->nr_dirty); + +- return atomic_long_read(&bc->nr_keys); ++ return max(0L, nr); + } + + void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) +-- +cgit v1.2.3 + + +From 24c981f4649c5b2251dcd1597f33079787200963 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Apr 2021 14:03:13 -0400 +Subject: bcachefs: Change copygc wait amount to be min of per device waits + +We're seeing a filesystem get stuck when all devices but one have no +more reclaimable buckets - because the copygc wait amount is curretly +filesystem wide. + +This patch should fix that, possibly at the expensive of running too +much when only one or a few devices is full and the rebalance thread +needs to move data around. 
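+
+A toy calculation (made-up numbers, not taken from this patch) shows the
+difference between the old filesystem-wide figure and the new per-device
+minimum:
+
+#include <stdint.h>
+#include <stdio.h>
+
+/* "allowed" stands in for half a device's reclaimable capacity,
+ * "fragmented" for its currently fragmented user data, both in sectors. */
+struct dev { int64_t allowed, fragmented; };
+
+int main(void)
+{
+    struct dev devs[] = {
+        { .allowed = 1000, .fragmented = 100 },  /* mostly empty */
+        { .allowed = 1000, .fragmented = 150 },  /* mostly empty */
+        { .allowed =   10, .fragmented = 900 },  /* nearly full  */
+    };
+    int64_t sum_allowed = 0, sum_fragmented = 0, wait = INT64_MAX;
+
+    for (size_t i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
+        int64_t dev_wait = devs[i].allowed - devs[i].fragmented;
+
+        if (dev_wait < 0)
+            dev_wait = 0;
+        if (dev_wait < wait)
+            wait = dev_wait;                 /* new: min of per-device waits */
+
+        sum_allowed    += devs[i].allowed;   /* old: filesystem-wide totals */
+        sum_fragmented += devs[i].fragmented;
+    }
+
+    printf("old filesystem-wide wait: %lld\n",
+           (long long) (sum_allowed > sum_fragmented ?
+                        sum_allowed - sum_fragmented : 0));
+    printf("new min per-device wait:  %lld\n", (long long) wait);
+    /* prints 860 vs 0: the old number hides the device that is out of space */
+    return 0;
+}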
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/movinggc.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 4ac7e61fb841..61c5901f0980 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -293,17 +293,19 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) + { + struct bch_dev *ca; + unsigned dev_idx; +- u64 fragmented_allowed = 0, fragmented = 0; ++ s64 wait = S64_MAX, fragmented_allowed, fragmented; + + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + +- fragmented_allowed += ((__dev_buckets_reclaimable(ca, usage) * ++ fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * + ca->mi.bucket_size) >> 1); +- fragmented += usage.d[BCH_DATA_user].fragmented; ++ fragmented = usage.d[BCH_DATA_user].fragmented; ++ ++ wait = min(wait, max(0LL, fragmented_allowed - fragmented)); + } + +- return max_t(s64, 0, fragmented_allowed - fragmented); ++ return wait; + } + + static int bch2_copygc_thread(void *arg) +-- +cgit v1.2.3 + + +From 8d4e8614db8b5f0ba561aeb5b2ab63c4f27a6aa4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Apr 2021 14:18:22 -0400 +Subject: bcachefs: Enable .splice_write + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 5 +++-- + fs/bcachefs/fs.c | 3 --- + 2 files changed, 3 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 0087374c6242..3bde3f89d959 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1937,8 +1937,9 @@ loop: + i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + +- bio_for_each_segment_all(bv, bio, iter) +- put_page(bv->bv_page); ++ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); + + if (dio->op.error) { + set_bit(EI_INODE_ERROR, &inode->ei_flags); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 095b3109ed29..1b6f5653c44f 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1010,10 +1010,7 @@ static const struct file_operations bch_file_operations = { + .open = generic_file_open, + .fsync = bch2_fsync, + .splice_read = generic_file_splice_read, +-#if 0 +- /* Busted: */ + .splice_write = iter_file_splice_write, +-#endif + .fallocate = bch2_fallocate_dispatch, + .unlocked_ioctl = bch2_fs_file_ioctl, + #ifdef CONFIG_COMPAT +-- +cgit v1.2.3 + + +From 3a870a6a0c15060bdffad3c6ec6de4da7a6c2b33 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 28 Apr 2021 19:36:12 -0400 +Subject: bcachefs: Ensure that fpunch updates inode timestamps + +Fixes xfstests generic/059 + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 14 ++++++++++++++ + fs/bcachefs/fs.c | 2 +- + 2 files changed, 15 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 3bde3f89d959..955fce9e90e0 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2402,6 +2402,15 @@ err: + + /* fallocate: */ + ++static int inode_update_times_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); ++ return 0; ++} ++ + static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; +@@ -2439,6 +2448,11 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len + &i_sectors_delta); + 
i_sectors_acct(c, inode, NULL, i_sectors_delta); + } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_MTIME|ATTR_CTIME) ?: ret; ++ mutex_unlock(&inode->ei_update_lock); + err: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 1b6f5653c44f..2b286987d0a4 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -145,7 +145,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, + struct bch_inode_unpacked inode_u; + int ret; + +- bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_init(&trans, c, 0, 256); + retry: + bch2_trans_begin(&trans); + +-- +cgit v1.2.3 + + +From 4ce9f9afcd07a379323da7f2f1e04ee1cf2ba07e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 28 Apr 2021 22:12:07 -0400 +Subject: bcachefs: Make sure to initialize j->last_flushed + +If the journal reclaim thread makes it to the timeout without ever +initializing j->last_flushed, we could end up sleeping for a very long +time. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 4 ++++ + fs/bcachefs/journal_reclaim.c | 16 +++++++++++----- + fs/bcachefs/journal_reclaim.h | 8 +++----- + fs/bcachefs/journal_types.h | 1 + + 4 files changed, 19 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 1e88a5f3d0f3..33dae09798e9 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1187,6 +1187,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + "nr noflush writes:\t%llu\n" + "nr direct reclaim:\t%llu\n" + "nr background reclaim:\t%llu\n" ++ "reclaim kicked:\t\t%u\n" ++ "reclaim runs in:\t%u ms\n" + "current entry sectors:\t%u\n" + "current entry error:\t%u\n" + "current entry:\t\t", +@@ -1202,6 +1204,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + j->nr_noflush_writes, + j->nr_direct_reclaim, + j->nr_background_reclaim, ++ j->reclaim_kicked, ++ jiffies_to_msecs(j->next_reclaim - jiffies), + j->cur_entry_sectors, + j->cur_entry_error); + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 24d04e51fb61..427be2da1dfc 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -677,13 +677,15 @@ int bch2_journal_reclaim(struct journal *j) + static int bch2_journal_reclaim_thread(void *arg) + { + struct journal *j = arg; +- unsigned long next; ++ unsigned long delay, now; + int ret = 0; + + set_freezable(); + + kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); + ++ j->last_flushed = jiffies; ++ + while (!ret && !kthread_should_stop()) { + j->reclaim_kicked = false; + +@@ -691,7 +693,12 @@ static int bch2_journal_reclaim_thread(void *arg) + ret = __bch2_journal_reclaim(j, false); + mutex_unlock(&j->reclaim_lock); + +- next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); ++ now = jiffies; ++ delay = msecs_to_jiffies(j->reclaim_delay_ms); ++ j->next_reclaim = j->last_flushed + delay; ++ ++ if (!time_in_range(j->next_reclaim, now, now + delay)) ++ j->next_reclaim = now + delay; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); +@@ -699,10 +706,9 @@ static int bch2_journal_reclaim_thread(void *arg) + break; + if (j->reclaim_kicked) + break; +- if (time_after_eq(jiffies, next)) ++ if (time_after_eq(jiffies, j->next_reclaim)) + break; +- schedule_timeout(next - jiffies); +- try_to_freeze(); ++ freezable_schedule_timeout(j->next_reclaim - jiffies); + + } + 
__set_current_state(TASK_RUNNING); +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index adf1f5c981cd..0fd1af120db5 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -8,11 +8,9 @@ static inline void journal_reclaim_kick(struct journal *j) + { + struct task_struct *p = READ_ONCE(j->reclaim_thread); + +- if (p && !j->reclaim_kicked) { +- j->reclaim_kicked = true; +- if (p) +- wake_up_process(p); +- } ++ j->reclaim_kicked = true; ++ if (p) ++ wake_up_process(p); + } + + unsigned bch2_journal_dev_buckets_available(struct journal *, +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index c24bc4aa9af2..a7aa12e919e2 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -248,6 +248,7 @@ struct journal { + wait_queue_head_t reclaim_wait; + struct task_struct *reclaim_thread; + bool reclaim_kicked; ++ unsigned long next_reclaim; + u64 nr_direct_reclaim; + u64 nr_background_reclaim; + +-- +cgit v1.2.3 + + +From 3f34c4d257acf6303f78becd2057f44fd9111ab0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 29 Apr 2021 00:21:54 -0400 +Subject: bcachefs: Add a tracepoint for when we block on journal reclaim + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 2 ++ + include/trace/events/bcachefs.h | 5 +++++ + 2 files changed, 7 insertions(+) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index b793ab77e452..64b6e86cf90a 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -725,6 +725,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); + ++ trace_trans_blocked_journal_reclaim(trans->ip); ++ + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); + if (ret < 0) +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 2163651c0fe4..48dc2377930a 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -621,6 +621,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, + TP_ARGS(ip) + ); + ++DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ + TRACE_EVENT(trans_restart_would_deadlock, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, +-- +cgit v1.2.3 + + +From a1e6af3e2f949f532e62863a5c02955f589754d6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 28 Apr 2021 22:51:42 -0400 +Subject: bcachefs: Fix time handling + +There were some overflows in the time conversion functions - fix this by +converting tv_sec and tv_nsec separately. Also, set sb->time_min and +sb->time_max. + +Fixes xfstest generic/258. 
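+
+The shape of the fix is to split the on-disk value into whole seconds and a
+sub-second remainder instead of multiplying the whole thing into nanoseconds
+first. Roughly, as a userspace sketch only (base offset and superblock fields
+omitted, names chosen to mirror the patch):
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+
+struct ts { int64_t tv_sec; int32_t tv_nsec; };
+
+static struct ts time_to_ts(int64_t time, int64_t time_units_per_sec,
+                            int64_t nsec_per_time_unit)
+{
+    struct ts t;
+    int64_t rem = time % time_units_per_sec;
+
+    t.tv_sec = time / time_units_per_sec;
+    if (rem < 0) {                      /* keep tv_nsec in [0, 1e9) */
+        t.tv_sec--;
+        rem += time_units_per_sec;
+    }
+    /* units_per_sec * nsec_per_unit == 1e9, so this stays below 1e9 */
+    t.tv_nsec = (int32_t) (rem * nsec_per_time_unit);
+    return t;
+}
+
+int main(void)
+{
+    /* 1ns units: 1234567890123456789 time units = 1234567890.123456789s */
+    struct ts t = time_to_ts(1234567890123456789LL, 1000000000, 1);
+
+    printf("%" PRId64 ".%09" PRId32 "\n", t.tv_sec, t.tv_nsec);
+    return 0;
+}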
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 23 ++++++++++++++--------- + fs/bcachefs/fs.c | 4 +++- + fs/bcachefs/super-io.c | 10 ++++++++-- + 3 files changed, 25 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index b3a93e4b399f..40ce8c763396 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -606,11 +606,13 @@ struct bch_fs { + + u64 time_base_lo; + u32 time_base_hi; +- u32 time_precision; ++ unsigned time_units_per_sec; ++ unsigned nsec_per_time_unit; + u64 features; + u64 compat; + } sb; + ++ + struct bch_sb_handle disk_sb; + + unsigned short block_bits; /* ilog2(block_size) */ +@@ -874,19 +876,22 @@ static inline unsigned block_bytes(const struct bch_fs *c) + return c->opts.block_size << 9; + } + +-static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) ++static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time) + { +- return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); ++ struct timespec64 t; ++ s32 rem; ++ ++ time += c->sb.time_base_lo; ++ ++ t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); ++ t.tv_nsec = rem * c->sb.nsec_per_time_unit; ++ return t; + } + + static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) + { +- s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; +- +- if (c->sb.time_precision == 1) +- return ns; +- +- return div_s64(ns, c->sb.time_precision); ++ return (ts.tv_sec * c->sb.time_units_per_sec + ++ (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; + } + + static inline s64 bch2_current_time(struct bch_fs *c) +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 2b286987d0a4..93f3f494448a 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1564,7 +1564,9 @@ got_sb: + #endif + sb->s_xattr = bch2_xattr_handlers; + sb->s_magic = BCACHEFS_STATFS_MAGIC; +- sb->s_time_gran = c->sb.time_precision; ++ sb->s_time_gran = c->sb.nsec_per_time_unit; ++ sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; ++ sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + c->vfs_sb = sb; + strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 11d7167b0129..74a75ced031e 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -367,9 +367,15 @@ static void bch2_sb_update(struct bch_fs *c) + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); + c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); +- c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); ++ ++ c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); ++ c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; ++ ++ /* XXX this is wrong, we need a 96 or 128 bit integer type */ ++ c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), ++ c->sb.nsec_per_time_unit); + c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); +- c->sb.time_precision = le32_to_cpu(src->time_precision); ++ + c->sb.features = le64_to_cpu(src->features[0]); + c->sb.compat = le64_to_cpu(src->compat[0]); + +-- +cgit v1.2.3 + + +From 353c32c1b647b6ce73a7370810b496a3c9a9cabe Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 29 Apr 2021 16:55:26 -0400 +Subject: bcachefs: Mark newly allocated btree nodes as accessed + +This was a major oversight - this means under memory pressure we can end +up reading in a btree node, then having it evicted before we get to use 
+it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index eaf9da6e50da..f3ceb1e5464f 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -589,6 +589,7 @@ out: + b->sib_u64s[1] = 0; + b->whiteout_u64s = 0; + bch2_btree_keys_init(b); ++ set_btree_node_accessed(b); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + start_time); +-- +cgit v1.2.3 + + +From 3eb9053f5dcddd67cde8bf55cb9d02bc32d15f9b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 29 Apr 2021 15:37:47 -0400 +Subject: bcachefs: Clean up bch2_btree_and_journal_walk() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 13 +++++-------- + fs/bcachefs/alloc_background.h | 3 +-- + fs/bcachefs/ec.c | 14 +++++--------- + fs/bcachefs/ec.h | 3 +-- + fs/bcachefs/recovery.c | 36 ++++++++++++------------------------ + fs/bcachefs/recovery.h | 7 ++----- + 6 files changed, 26 insertions(+), 50 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 93ecf7301818..d938e444a216 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -261,16 +261,14 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + #undef x + } + +-static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, +- unsigned level, struct bkey_s_c k) ++static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k) + { + struct bch_dev *ca; + struct bucket *g; + struct bkey_alloc_unpacked u; + +- if (level || +- (k.k->type != KEY_TYPE_alloc && +- k.k->type != KEY_TYPE_alloc_v2)) ++ if (k.k->type != KEY_TYPE_alloc && ++ k.k->type != KEY_TYPE_alloc_v2) + return 0; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); +@@ -289,13 +287,12 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, + return 0; + } + +-int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) ++int bch2_alloc_read(struct bch_fs *c) + { + int ret; + + down_read(&c->gc_lock); +- ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, +- NULL, bch2_alloc_read_fn); ++ ret = bch2_btree_and_journal_walk(c, BTREE_ID_alloc, bch2_alloc_read_fn); + up_read(&c->gc_lock); + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index ad15a80602c0..9cadfdb5b83d 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -91,8 +91,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + .val_to_text = bch2_alloc_to_text, \ + } + +-struct journal_keys; +-int bch2_alloc_read(struct bch_fs *, struct journal_keys *); ++int bch2_alloc_read(struct bch_fs *); + + static inline void bch2_wake_allocator(struct bch_dev *ca) + { +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index e6a14497ea84..5a87d41ff279 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1630,26 +1630,22 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) + return ret; + } + +-static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, +- unsigned level, struct bkey_s_c k) ++static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) + { + int ret = 0; + +- if (k.k->type == KEY_TYPE_stripe) { ++ if (k.k->type == KEY_TYPE_stripe) + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: + bch2_mark_key(c, k, 0, 0, NULL, 0, + BTREE_TRIGGER_NOATOMIC); +- 
if (ret) +- return ret; +- } + + return ret; + } + +-int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) ++int bch2_stripes_read(struct bch_fs *c) + { +- int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_stripes, +- NULL, bch2_stripes_read_fn); ++ int ret = bch2_btree_and_journal_walk(c, BTREE_ID_stripes, ++ bch2_stripes_read_fn); + if (ret) + bch_err(c, "error reading stripes: %i", ret); + +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 744e51eaf327..e79626b59509 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -215,8 +215,7 @@ void bch2_ec_flush_new_stripes(struct bch_fs *); + + void bch2_stripes_heap_start(struct bch_fs *); + +-struct journal_keys; +-int bch2_stripes_read(struct bch_fs *, struct journal_keys *); ++int bch2_stripes_read(struct bch_fs *); + int bch2_stripes_write(struct bch_fs *, unsigned); + + int bch2_ec_mem_alloc(struct bch_fs *, bool); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index a9ccd14effe7..b35b297d4446 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -323,9 +323,7 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, + } + + static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, +- struct journal_keys *journal_keys, + enum btree_id btree_id, +- btree_walk_node_fn node_fn, + btree_walk_key_fn key_fn) + { + struct btree_and_journal_iter iter; +@@ -338,15 +336,9 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { +- ret = key_fn(c, btree_id, b->c.level, k); +- if (ret) +- break; +- + if (b->c.level) { + bch2_bkey_buf_reassemble(&tmp, c, k); + +- bch2_btree_and_journal_iter_advance(&iter); +- + child = bch2_btree_node_get_noiter(c, tmp.k, + b->c.btree_id, b->c.level - 1, + false); +@@ -357,16 +349,17 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + + btree_and_journal_iter_prefetch(c, b, iter); + +- ret = (node_fn ? node_fn(c, b) : 0) ?: +- bch2_btree_and_journal_walk_recurse(c, child, +- journal_keys, btree_id, node_fn, key_fn); ++ ret = bch2_btree_and_journal_walk_recurse(c, child, ++ btree_id, key_fn); + six_unlock_read(&child->c.lock); +- +- if (ret) +- break; + } else { +- bch2_btree_and_journal_iter_advance(&iter); ++ ret = key_fn(c, k); + } ++ ++ if (ret) ++ break; ++ ++ bch2_btree_and_journal_iter_advance(&iter); + } + + bch2_btree_and_journal_iter_exit(&iter); +@@ -374,9 +367,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + return ret; + } + +-int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, +- enum btree_id btree_id, +- btree_walk_node_fn node_fn, ++int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id, + btree_walk_key_fn key_fn) + { + struct btree *b = c->btree_roots[btree_id].b; +@@ -386,10 +377,7 @@ int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_k + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); +- ret = (node_fn ? 
node_fn(c, b) : 0) ?: +- bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, +- node_fn, key_fn) ?: +- key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); ++ ret = bch2_btree_and_journal_walk_recurse(c, b, btree_id, key_fn); + six_unlock_read(&b->c.lock); + + return ret; +@@ -1120,14 +1108,14 @@ use_clean: + + bch_verbose(c, "starting alloc read"); + err = "error reading allocation information"; +- ret = bch2_alloc_read(c, &c->journal_keys); ++ ret = bch2_alloc_read(c); + if (ret) + goto err; + bch_verbose(c, "alloc read done"); + + bch_verbose(c, "starting stripes_read"); + err = "error reading stripes"; +- ret = bch2_stripes_read(c, &c->journal_keys); ++ ret = bch2_stripes_read(c); + if (ret) + goto err; + bch_verbose(c, "stripes_read done"); +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index fa91851b9ed7..e5565e4f335a 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -45,12 +45,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, + struct btree *); + +-typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); +-typedef int (*btree_walk_key_fn)(struct bch_fs *c, enum btree_id id, +- unsigned level, struct bkey_s_c k); ++typedef int (*btree_walk_key_fn)(struct bch_fs *c, struct bkey_s_c k); + +-int bch2_btree_and_journal_walk(struct bch_fs *, struct journal_keys *, enum btree_id, +- btree_walk_node_fn, btree_walk_key_fn); ++int bch2_btree_and_journal_walk(struct bch_fs *, enum btree_id, btree_walk_key_fn); + + void bch2_journal_keys_free(struct journal_keys *); + void bch2_journal_entries_free(struct list_head *); +-- +cgit v1.2.3 + + +From 61b9527714855cf908302891e2765d2e46f1d1c4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 7 May 2021 23:32:26 -0400 +Subject: bcachefs: Fix usage of last_seq + encryption + +jset->last_seq is in the region that's encrypted - on journal write +completion, we were using it and getting garbage. This patch shadows it +to fix. 
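+
+The pattern is the usual one for data that gets encrypted in place: anything
+needed after the transform has to be copied aside first. A toy sketch, where
+the xor is only a stand-in for the real cipher and none of this is code from
+the patch:
+
+#include <stdint.h>
+#include <stdio.h>
+
+struct jset { uint64_t seq, last_seq; };   /* lives in the encrypted region */
+
+struct journal_buf {
+    struct jset data;
+    uint64_t last_seq;                     /* plaintext shadow of data.last_seq */
+};
+
+static void toy_encrypt(void *p, size_t len)   /* stand-in for the real cipher */
+{
+    for (size_t i = 0; i < len; i++)
+        ((uint8_t *) p)[i] ^= 0x5a;
+}
+
+int main(void)
+{
+    struct journal_buf w = { .data = { .seq = 10, .last_seq = 7 } };
+
+    w.last_seq = w.data.last_seq;              /* shadow it before encrypting */
+    toy_encrypt(&w.data, sizeof(w.data));      /* buffer is now ciphertext */
+
+    /* on write completion, read the shadow, not the encrypted field */
+    printf("last_seq on completion: %llu\n", (unsigned long long) w.last_seq);
+    return 0;
+}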
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 3 ++- + fs/bcachefs/journal_io.c | 7 +++---- + fs/bcachefs/journal_types.h | 1 + + 3 files changed, 6 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 33dae09798e9..4af2fc1ad23b 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -190,7 +190,8 @@ static bool __journal_entry_close(struct journal *j) + * Hence, we want update/set last_seq on the current journal entry right + * before we open a new one: + */ +- buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); ++ buf->last_seq = journal_last_seq(j); ++ buf->data->last_seq = cpu_to_le64(buf->last_seq); + + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index c7fa03cfbde6..635cceb4dd21 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1237,7 +1237,7 @@ static void journal_write_done(struct closure *cl) + bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + struct bch_replicas_padded replicas; + union journal_res_state old, new; +- u64 v, seq, last_seq; ++ u64 v, seq; + int err = 0; + + bch2_time_stats_update(j->write_time, j->write_start_time); +@@ -1256,7 +1256,6 @@ static void journal_write_done(struct closure *cl) + + spin_lock(&j->lock); + seq = le64_to_cpu(w->data->seq); +- last_seq = le64_to_cpu(w->data->last_seq); + + if (seq >= j->pin.front) + journal_seq_pin(j, seq)->devs = devs; +@@ -1267,7 +1266,7 @@ static void journal_write_done(struct closure *cl) + + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; +- j->last_seq_ondisk = last_seq; ++ j->last_seq_ondisk = w->last_seq; + } + + /* +@@ -1403,7 +1402,7 @@ void bch2_journal_write(struct closure *cl) + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); +- jset->last_seq = 0; ++ jset->last_seq = w->last_seq = 0; + + j->nr_noflush_writes++; + } else { +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index a7aa12e919e2..cacab22a35c1 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -23,6 +23,7 @@ struct journal_buf { + __BKEY_PADDED(key, BCH_REPLICAS_MAX); + + struct closure_waitlist wait; ++ u64 last_seq; /* copy of data->last_seq */ + + unsigned buf_size; /* size in bytes of @data */ + unsigned sectors; /* maximum size for current entry */ +-- +cgit v1.2.3 + + +From 3b2db6b9a1d4b3b58df2ec271000672ab76b2d90 Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Fri, 7 May 2021 22:29:02 -0400 +Subject: bcachefs: Fix oob write in __bch2_btree_node_write + +Fix a possible out of bounds write in __bch2_btree_node_write when +the data buffer padding is cleared up to the block size. The out of +bounds write is possible if the data buffers size is not a multiple +of the block size. 
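+
+The arithmetic is just a round-up to the next block boundary before the bounce
+buffer is allocated; for a power-of-two block size, as in bcachefs, that is the
+usual mask trick (sketch, not code from the patch):
+
+#include <stdio.h>
+
+#define ROUND_UP_POW2(x, a)  (((x) + (a) - 1) & ~((a) - 1))
+
+int main(void)
+{
+    unsigned block_bytes = 4096;
+    unsigned bytes = 5000 + 8;      /* key bytes plus varint-decode slop */
+
+    /* 5008 is not a multiple of 4096; pad the allocation to 8192 */
+    printf("%u -> %u\n", bytes, ROUND_UP_POW2(bytes, block_bytes));
+    return 0;
+}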
+ +Signed-off-by: Dan Robertson +--- + fs/bcachefs/btree_io.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 226b19b6e467..a0af6d013ef3 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1499,6 +1499,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ + bytes += 8; + ++ /* buffer must be a multiple of the block size */ ++ bytes = round_up(bytes, block_bytes(c)); ++ + data = btree_bounce_alloc(c, bytes, &used_mempool); + + if (!b->written) { +-- +cgit v1.2.3 + + +From 921ab8803e9aad2bdbc1e89da9c3f64d621641eb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 7 May 2021 20:43:43 -0400 +Subject: bcachefs: Fix some refcounting bugs + +We really need debug mode assertions that ca->ref and ca->io_ref are +used correctly. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 +- + fs/bcachefs/btree_gc.c | 6 ++++-- + fs/bcachefs/buckets.c | 2 +- + fs/bcachefs/recovery.c | 4 +++- + fs/bcachefs/super.c | 12 ++++++++---- + fs/bcachefs/super.h | 5 +---- + 6 files changed, 18 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index d938e444a216..4fa052a8f06e 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -371,7 +371,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) + + ret = bch2_alloc_write_key(&trans, iter, flags); + if (ret) { +- percpu_ref_put(&ca->io_ref); ++ percpu_ref_put(&ca->ref); + goto err; + } + bch2_btree_iter_next_slot(iter); +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 12b7c35b9ea5..93a8d8e37505 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1023,7 +1023,7 @@ static void bch2_gc_free(struct bch_fs *c) + static int bch2_gc_done(struct bch_fs *c, + bool initial, bool metadata_only) + { +- struct bch_dev *ca; ++ struct bch_dev *ca = NULL; + bool verify = !metadata_only && (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; +@@ -1169,6 +1169,8 @@ static int bch2_gc_done(struct bch_fs *c, + #undef copy_stripe_field + #undef copy_field + fsck_err: ++ if (ca) ++ percpu_ref_put(&ca->ref); + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); + return ret; +@@ -1177,7 +1179,7 @@ fsck_err: + static int bch2_gc_start(struct bch_fs *c, + bool metadata_only) + { +- struct bch_dev *ca; ++ struct bch_dev *ca = NULL; + unsigned i; + int ret; + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 70008603f047..947e55334245 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -2067,7 +2067,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, + + int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) + { +- return bch2_trans_do(c, NULL, NULL, 0, ++ return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + __bch2_trans_mark_dev_sb(&trans, ca)); + } + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index b35b297d4446..cd538ecc1f3f 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1328,8 +1328,10 @@ int bch2_fs_initialize(struct bch_fs *c) + err = "error marking superblock and journal"; + for_each_member_device(ca, c, i) { + ret = bch2_trans_mark_dev_sb(c, ca); +- if (ret) ++ if (ret) { ++ percpu_ref_put(&ca->ref); + goto err; ++ } + } + + bch2_inode_init(c, &root_inode, 0, 0, +diff --git a/fs/bcachefs/super.c 
b/fs/bcachefs/super.c +index 326ff51e4da6..792d8bb4896c 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -621,9 +621,11 @@ static const char *bch2_fs_online(struct bch_fs *c) + down_write(&c->state_lock); + + err = "error creating sysfs objects"; +- __for_each_member_device(ca, c, i, NULL) +- if (bch2_dev_sysfs_online(c, ca)) ++ for_each_member_device(ca, c, i) ++ if (bch2_dev_sysfs_online(c, ca)) { ++ percpu_ref_put(&ca->ref); + goto err; ++ } + + list_add(&c->list, &bch_fs_list); + err = NULL; +@@ -1835,12 +1837,14 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) + if (ret) + return ERR_PTR(ret); + +- for_each_member_device(ca, c, i) ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, NULL) + if (ca->disk_sb.bdev->bd_dev == dev) + goto found; +- + ca = ERR_PTR(-ENOENT); + found: ++ rcu_read_unlock(); ++ + return ca; + } + +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +index 6cab506150a8..739e8fd18176 100644 +--- a/fs/bcachefs/super.h ++++ b/fs/bcachefs/super.h +@@ -107,11 +107,8 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, + return ca; + } + +-#define __for_each_member_device(ca, c, iter, mask) \ +- for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) +- + #define for_each_member_device_rcu(ca, c, iter, mask) \ +- __for_each_member_device(ca, c, iter, mask) ++ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) + + static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) + { +-- +cgit v1.2.3 + + +From 76f0f43fd235b2c2e4b347fd8f965ab35c252627 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 3 May 2021 20:31:27 -0400 +Subject: bcachefs: Fix reflink trigger + +The trigger for reflink pointers wasn't always incrementing/decrementing +the refcounts correctly - this patch fixes that logic. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 43 +++++++++++++++++++++++++++++++++++++------ + fs/bcachefs/io.c | 5 ++++- + 2 files changed, 41 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 947e55334245..87266179542b 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1706,9 +1706,28 @@ static __le64 *bkey_refcount(struct bkey_i *k) + } + } + ++static bool reflink_p_frag_references(struct bkey_s_c_reflink_p p, ++ u64 start, u64 end, ++ struct bkey_s_c k) ++{ ++ if (start == end) ++ return false; ++ ++ start += le64_to_cpu(p.v->idx); ++ end += le64_to_cpu(p.v->idx); ++ ++ if (end <= bkey_start_offset(k.k)) ++ return false; ++ if (start >= k.k->p.offset) ++ return false; ++ return true; ++} ++ + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, ++ unsigned front_frag, ++ unsigned back_frag, + unsigned flags) + { + struct bch_fs *c = trans->c; +@@ -1716,6 +1735,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c k; + struct bkey_i *n; + __le64 *refcount; ++ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + s64 ret; + + ret = trans_get_key(trans, BTREE_ID_reflink, +@@ -1723,12 +1743,17 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + if (ret < 0) + return ret; + +- if ((flags & BTREE_TRIGGER_OVERWRITE) && +- (bkey_start_offset(k.k) < idx || +- k.k->p.offset > idx + sectors)) ++ if (reflink_p_frag_references(p, 0, front_frag, k) && ++ reflink_p_frag_references(p, back_frag, p.k->size, k)) { ++ BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); ++ add = -add; ++ } else if (reflink_p_frag_references(p, 0, front_frag, k) || ++ reflink_p_frag_references(p, back_frag, p.k->size, k)) { ++ BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); + goto out; ++ } + +- sectors = k.k->p.offset - idx; ++ sectors = min_t(u64, sectors, k.k->p.offset - idx); + + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); +@@ -1747,7 +1772,8 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + goto err; + } + +- le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); ++ BUG_ON(!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)); ++ le64_add_cpu(refcount, add); + + if (!*refcount) { + n->k.type = KEY_TYPE_deleted; +@@ -1768,13 +1794,18 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + s64 sectors, unsigned flags) + { + u64 idx = le64_to_cpu(p.v->idx) + offset; ++ unsigned front_frag, back_frag; + s64 ret = 0; + + sectors = abs(sectors); + BUG_ON(offset + sectors > p.k->size); + ++ front_frag = offset; ++ back_frag = offset + sectors; ++ + while (sectors) { +- ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); ++ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, ++ front_frag, back_frag, flags); + if (ret < 0) + break; + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 842f065f8141..4e5e11d5859d 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1966,7 +1966,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { + bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, +- "pointer to nonexistent indirect extent"); ++ "%llu len %u points to nonexistent indirect extent %llu", ++ orig_k->k->k.p.offset, ++ orig_k->k->k.size, ++ reflink_offset); + bch2_inconsistent_error(trans->c); + ret = -EIO; + goto err; +-- +cgit v1.2.3 + + +From 606c72afca9200c4260e3cc74d2aea0ae437c48a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 29 Apr 2021 21:44:05 -0400 +Subject: bcachefs: Fix bch2_btree_iter_peek_with_updates() + +By not re-fetching the next update we were going into an infinite loop. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8cc8214235c0..10d5f80ae4d5 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1619,16 +1619,17 @@ static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates) + { + struct bpos search_key = btree_iter_search_key(iter); +- struct bkey_i *next_update = with_updates +- ? 
btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) +- : NULL; ++ struct bkey_i *next_update; + struct bkey_s_c k; + int ret; + + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); +- ++start: ++ next_update = with_updates ++ ? btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) ++ : NULL; + btree_iter_set_search_pos(iter, search_key); + + while (1) { +@@ -1644,9 +1645,8 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi + + if (likely(k.k)) { + if (bkey_deleted(k.k)) { +- btree_iter_set_search_pos(iter, +- bkey_successor(iter, k.k->p)); +- continue; ++ search_key = bkey_successor(iter, k.k->p); ++ goto start; + } + + break; +-- +cgit v1.2.3 + + +From e7f8efe37c3ca490c37f6081db9d969bfbd0266a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 14 May 2021 16:56:26 -0400 +Subject: bcachefs: Make sure to use BTREE_ITER_PREFETCH in fsck + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 31 +++++++++++++++++++++++-------- + 1 file changed, 23 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 8ae4e4c30933..fcdcf42f85a4 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -512,7 +512,9 @@ static int check_inodes(struct bch_fs *c, bool full) + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_inode) + continue; + +@@ -621,7 +623,8 @@ static int check_extents(struct bch_fs *c) + + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), +- BTREE_ITER_INTENT); ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH); + retry: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { +@@ -719,7 +722,9 @@ static int check_dirents(struct bch_fs *c) + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, +- POS(BCACHEFS_ROOT_INO, 0), 0); ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH); + retry: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { +@@ -920,7 +925,9 @@ static int check_xattrs(struct bch_fs *c) + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, +- POS(BCACHEFS_ROOT_INO, 0), 0); ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH); + retry: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { +@@ -1108,7 +1115,9 @@ static int check_directory_structure(struct bch_fs *c) + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_inode) + continue; + +@@ -1207,7 +1216,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_inodes, +- POS(0, start), 0, k, ret) { ++ POS(0, start), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_inode) + continue; + +@@ -1255,7 +1266,9 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- 
for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH, k, ret) { + switch (k.k->type) { + case KEY_TYPE_dirent: + d = bkey_s_c_to_dirent(k); +@@ -1293,7 +1306,9 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_inodes, +- POS(0, range_start), 0, k, ret) { ++ POS(0, range_start), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH, k, ret) { + if (k.k->p.offset >= range_end) + break; + +-- +cgit v1.2.3 + + +From 9300151a514ba37c4b49cc1943ecc939df4b0ffc Mon Sep 17 00:00:00 2001 +From: Stijn Tintel +Date: Thu, 13 May 2021 23:08:47 +0300 +Subject: bcachefs: avoid out-of-bounds in split_devs + +Calling mount with an empty source string causes an out-of-bounds error +in split_devs. Check the length of the source string to avoid this. + +Signed-off-by: Stijn Tintel +--- + fs/bcachefs/fs.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 93f3f494448a..671b218095fd 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + #include + + static struct kmem_cache *bch2_inode_cache; +@@ -1323,6 +1324,9 @@ static char **split_devs(const char *_dev_name, unsigned *nr) + char *dev_name = NULL, **devs = NULL, *s; + size_t i, nr_devs = 0; + ++ if (strlen(_dev_name) == 0) ++ return NULL; ++ + dev_name = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) + return NULL; +-- +cgit v1.2.3 + + +From 64459166a3f427d147642481932998db4838d528 Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Sun, 9 May 2021 18:52:23 -0400 +Subject: bcachefs: Fix error in parsing of mount options + +When parsing the mount options duplicate the given options. This is +required as the options are parsed twice and strsep is used in parsing. +The options will be modified into a possibly invalid options set for the +second round of parsing if the options are not duplicated before +parsing. 
+ +Signed-off-by: Dan Robertson +--- + fs/bcachefs/opts.c | 27 ++++++++++++++++++++++----- + 1 file changed, 22 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index 0cfbb56a57c1..64bf5a382d63 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -315,11 +315,20 @@ int bch2_opts_check_may_set(struct bch_fs *c) + int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, + char *options) + { ++ char *copied_opts, *copied_opts_start; + char *opt, *name, *val; + int ret, id; + u64 v; + +- while ((opt = strsep(&options, ",")) != NULL) { ++ if (!options) ++ return 0; ++ ++ copied_opts = kstrdup(options, GFP_KERNEL); ++ if (!copied_opts) ++ return -1; ++ copied_opts_start = copied_opts; ++ ++ while ((opt = strsep(&copied_opts, ",")) != NULL) { + name = strsep(&opt, "="); + val = opt; + +@@ -363,16 +372,24 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, + bch2_opt_set_by_id(opts, id, v); + } + +- return 0; ++ ret = 0; ++ goto out; ++ + bad_opt: + pr_err("Bad mount option %s", name); +- return -1; ++ ret = -1; ++ goto out; + bad_val: + pr_err("Invalid value %s for mount option %s", val, name); +- return -1; ++ ret = -1; ++ goto out; + no_val: + pr_err("Mount option %s requires a value", name); +- return -1; ++ ret = -1; ++ goto out; ++out: ++ kfree(copied_opts_start); ++ return ret; + } + + /* io opts: */ +-- +cgit v1.2.3 + + +From f5ee88fc920c765cd0b088ef0321b8b997b93d81 Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Wed, 12 May 2021 14:07:57 -0400 +Subject: bcachefs: Fix possible null deref on mount + +Ensure that the block device pointer in a superblock handle is not +null before dereferencing it in bch2_dev_to_fs. The block device pointer +may be null when mounting a new bcachefs filesystem given another mounted +bcachefs filesystem exists that has at least one device that is offline. + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 792d8bb4896c..01b246076a0b 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -110,7 +110,7 @@ struct bch_fs *bch2_dev_to_fs(dev_t dev) + + list_for_each_entry(c, &bch_fs_list, list) + for_each_member_device_rcu(ca, c, i, NULL) +- if (ca->disk_sb.bdev->bd_dev == dev) { ++ if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { + closure_get(&c->cl); + goto found; + } +-- +cgit v1.2.3 + + +From 34e1e98819a81036bfa450051652d220a93797dd Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Wed, 12 May 2021 20:54:37 -0400 +Subject: bcachefs: Fix null deref in bch2_ioctl_read_super + +Do not attempt to cleanup the returned value of bch2_device_lookup if +the returned value was an error pointer. We currently check to see if +the returned value is null and run the cleanup otherwise. As a result, +we attempt to run the cleanup on a error pointer. 
+ +Signed-off-by: Dan Robertson +--- + fs/bcachefs/chardev.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index c61601476c0d..ba8873ccde6c 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -523,7 +523,7 @@ static long bch2_ioctl_read_super(struct bch_fs *c, + ret = copy_to_user((void __user *)(unsigned long)arg.sb, + sb, vstruct_bytes(sb)); + err: +- if (ca) ++ if (!IS_ERR_OR_NULL(ca)) + percpu_ref_put(&ca->ref); + mutex_unlock(&c->sb_lock); + return ret; +-- +cgit v1.2.3 + + +From 5cbea813027327125fb19ddc84d9ba51ed267c77 Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Wed, 5 May 2021 07:09:43 -0400 +Subject: bcachefs: Fix out of bounds read in fs usage ioctl + +Fix a possible read out of bounds if bch2_ioctl_fs_usage is called when +replica_entries_bytes is set to a value that is smaller than the size +of bch_replicas_usage. + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/chardev.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index ba8873ccde6c..c29f8272e682 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -414,7 +414,8 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, + struct bch_replicas_entry *src_e = + cpu_replicas_entry(&c->replicas, i); + +- if (replicas_usage_next(dst_e) > dst_end) { ++ /* check that we have enough space for one replicas entry */ ++ if (dst_e + 1 > dst_end) { + ret = -ERANGE; + break; + } +-- +cgit v1.2.3 + + +From 91e61a3db5660257bfff015c531813a9d304238c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 14 May 2021 21:28:37 -0400 +Subject: bcachefs: Repair code for multiple types of data in same bucket + +bch2_check_fix_ptrs() is awkward, we need to find a way to improve it. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 53 +++++++++++++++++++++++++++++++++++++++----------- + fs/bcachefs/extents.h | 24 +++++++++++++++++++++++ + 2 files changed, 66 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 93a8d8e37505..88d1cd0a8f95 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -427,18 +427,38 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; ++ char buf[200]; + int ret = 0; + + bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); + struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); ++ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); ++ ++ if (fsck_err_on(g->mark.data_type && ++ g->mark.data_type != data_type, c, ++ "bucket %u:%zu different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[g->mark.data_type], ++ bch2_data_types[data_type], ++ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ if (data_type == BCH_DATA_btree) { ++ g2->_mark.data_type = g->_mark.data_type = data_type; ++ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ } else { ++ do_update = true; ++ } ++ } + + if (fsck_err_on(!g->gen_valid, c, +- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", ++ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" ++ "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], +- p.ptr.gen)) { ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + if (!p.ptr.cached) { + g2->_mark.gen = g->_mark.gen = p.ptr.gen; + g2->gen_valid = g->gen_valid = true; +@@ -449,10 +469,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + } + + if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, +- "bucket %u:%zu data type %s ptr gen in the future: %u > %u", ++ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" ++ "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], +- p.ptr.gen, g->mark.gen)) { ++ p.ptr.gen, g->mark.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + if (!p.ptr.cached) { + g2->_mark.gen = g->_mark.gen = p.ptr.gen; + g2->gen_valid = g->gen_valid = true; +@@ -468,23 +490,29 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + + if (fsck_err_on(!p.ptr.cached && + gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, +- "bucket %u:%zu data type %s stale dirty ptr: %u < %u", ++ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" ++ "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], +- p.ptr.gen, g->mark.gen)) ++ p.ptr.gen, g->mark.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + do_update = true; + + if (p.has_ec) { + struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, +- "pointer to nonexistent stripe %llu", +- (u64) p.ec.idx)) ++ "pointer to nonexistent stripe %llu\n" ++ "while marking %s", ++ (u64) p.ec.idx, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + do_update = true; + + if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, +- "pointer does not match stripe %llu", +- (u64) p.ec.idx)) ++ "pointer does not match 
stripe %llu\n" ++ "while marking %s", ++ (u64) p.ec.idx, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + do_update = true; + } + } +@@ -525,11 +553,14 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); + + (ptr->cached && + (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!ptr->cached && +- gen_cmp(ptr->gen, g->mark.gen) < 0); ++ gen_cmp(ptr->gen, g->mark.gen) < 0) || ++ (g->mark.data_type && ++ g->mark.data_type != data_type); + })); + again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index ccee43a2019d..9999805f955e 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -529,6 +529,30 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) + return ret; + } + ++static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ return BCH_DATA_btree; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return BCH_DATA_user; ++ case KEY_TYPE_stripe: { ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ ++ BUG_ON(ptr < s.v->ptrs || ++ ptr >= s.v->ptrs + s.v->nr_blocks); ++ ++ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant ++ ? BCH_DATA_parity ++ : BCH_DATA_user; ++ } ++ default: ++ BUG(); ++ } ++} ++ + unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); + unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); + unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); +-- +cgit v1.2.3 + + +From 6a3ce4c4f6cb4dac0300f7ea868c2f3b908c6e84 Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Fri, 14 May 2021 20:02:44 -0400 +Subject: bcachefs: properly initialize used values + + - Ensure the second key value in bch_hash_info is initialized to zero + if the info type is of type BCH_STR_HASH_SIPHASH. + + - Initialize the possibly returned value in bch2_inode_create. Assuming + bch2_btree_iter_peek returns bkey_s_c_null, the uninitialized value + of ret could be returned to the user as an error pointer. 
+ + - Fix compiler warning in initialization of bkey_s_c_stripe + +fs/bcachefs/buckets.c:1646:35: warning: suggest braces around initialization +of subobject [-Wmissing-braces] + struct bkey_s_c_stripe new_s = { NULL }; + ^~~~ + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/buckets.c | 4 ++-- + fs/bcachefs/inode.c | 2 +- + fs/bcachefs/str_hash.h | 17 ++++++++++------- + 3 files changed, 13 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 87266179542b..e497a938d933 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1642,8 +1642,8 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) + { +- struct bkey_s_c_stripe old_s = { NULL }; +- struct bkey_s_c_stripe new_s = { NULL }; ++ struct bkey_s_c_stripe old_s = { .k = NULL }; ++ struct bkey_s_c_stripe new_s = { .k = NULL }; + struct bch_replicas_padded r; + unsigned i; + int ret = 0; +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index dfde5ba3f1b7..c5892e42aaec 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -478,7 +478,7 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter = NULL; + struct bkey_s_c k; + u64 min, max, start, pos, *hint; +- int ret; ++ int ret = 0; + + u64 cpu = raw_smp_processor_id(); + unsigned bits = (c->opts.inodes_32bit +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index b85f895de346..eab669af7032 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -33,10 +33,11 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) + + struct bch_hash_info { + u8 type; +- union { +- __le64 crc_key; +- SIPHASH_KEY siphash_key; +- }; ++ /* ++ * For crc32 or crc64 string hashes the first key value of ++ * the siphash_key (k0) is used as the key. ++ */ ++ SIPHASH_KEY siphash_key; + }; + + static inline struct bch_hash_info +@@ -46,7 +47,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) + struct bch_hash_info info = { + .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & + ~(~0U << INODE_STR_HASH_BITS), +- .crc_key = bi->bi_hash_seed, ++ .siphash_key = { .k0 = bi->bi_hash_seed } + }; + + if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { +@@ -76,10 +77,12 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, + { + switch (info->type) { + case BCH_STR_HASH_CRC32C: +- ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); ++ ctx->crc32c = crc32c(~0, &info->siphash_key.k0, ++ sizeof(info->siphash_key.k0)); + break; + case BCH_STR_HASH_CRC64: +- ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); ++ ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, ++ sizeof(info->siphash_key.k0)); + break; + case BCH_STR_HASH_SIPHASH_OLD: + case BCH_STR_HASH_SIPHASH: +-- +cgit v1.2.3 + + +From 8521cd3d87fd14f915dee04fc43c1a17f1b831a0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 16 May 2021 23:46:08 -0400 +Subject: bcachefs: Fix locking in __bch2_set_nr_journal_buckets() + +We weren't holding mark_lock correctly - it's needed for the new_fs +path. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 4af2fc1ad23b..9c8d408c2a08 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -805,8 +805,11 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + long b; + + if (new_fs) { ++ if (c) ++ percpu_down_read(&c->mark_lock); + b = bch2_bucket_alloc_new_fs(ca); + if (b < 0) { ++ percpu_up_read(&c->mark_lock); + ret = -ENOSPC; + goto err; + } +@@ -821,10 +824,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + } + + b = sector_to_bucket(ca, ob->ptr.offset); ++ } + +- percpu_down_read(&c->mark_lock); ++ if (c) + spin_lock(&c->journal.lock); +- } + + /* + * XXX +@@ -851,15 +854,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ++ if (c) ++ spin_unlock(&c->journal.lock); ++ + if (new_fs) { + bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + 0); ++ if (c) ++ percpu_up_read(&c->mark_lock); + } else { +- spin_unlock(&c->journal.lock); +- percpu_up_read(&c->mark_lock); +- + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_mark_metadata_bucket(&trans, ca, + b, BCH_DATA_journal, +-- +cgit v1.2.3 + + +From dc4450aad77fc234bdb2d2f1b53952d3ee760ced Mon Sep 17 00:00:00 2001 +From: Brett Holman +Date: Sun, 16 May 2021 21:53:55 -0600 +Subject: bcachefs: made changes to support clang, fixed a couple bugs + +fs/bcachefs/bset.c edited prefetch macro to add clang support +fs/bcachefs/btree_iter.c bugfix: initialize iter->real_pos in bch2_btree_iter_init for later use +fs/bcachefs/io.c bugfix: eliminated undefined behavior (negative bitshift) +fs/bcachefs/buckets.c bugfix: invert sign to handle 64bit abs() +--- + fs/bcachefs/bset.c | 2 +- + fs/bcachefs/btree_iter.c | 1 + + fs/bcachefs/buckets.c | 4 +++- + fs/bcachefs/io.c | 2 +- + 4 files changed, 6 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 0a3e3b63828b..61d29cc92079 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -1193,7 +1193,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, + + static inline void prefetch_four_cachelines(void *p) + { +-#ifdef CONFIG_X86_64 ++#if (CONFIG_X86_64 && !defined(__clang__)) + asm(".intel_syntax noprefix;" + "prefetcht0 [%0 - 127 + 64 * 0];" + "prefetcht0 [%0 - 127 + 64 * 1];" +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 10d5f80ae4d5..ac6449e07522 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1899,6 +1899,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, + iter->trans = trans; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + iter->btree_id = btree_id; ++ iter->real_pos = POS_MIN; + iter->level = 0; + iter->min_depth = 0; + iter->locks_want = 0; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index e497a938d933..6e988a28a6be 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1797,7 +1797,9 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + unsigned front_frag, back_frag; + s64 ret = 0; + +- sectors = abs(sectors); ++ if (sectors < 0) ++ sectors = -sectors; ++ + BUG_ON(offset + sectors > p.k->size); + + front_frag = offset; +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 
4e5e11d5859d..83e108bb7a04 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -120,7 +120,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) + * the time: + */ + if (abs((int) (old - io_latency)) < (old >> 1) && +- now & ~(~0 << 5)) ++ now & ~(~0U << 5)) + break; + + new = ewma_add(old, io_latency, 5); +-- +cgit v1.2.3 + + +From 8efb4d8a9403a8cb87f19ae8bbb811c65ddf573a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 17 May 2021 00:08:06 -0400 +Subject: bcachefs: Make sure to pass a disk reservation to + bch2_extent_update() + +It's needed when we split an existing compressed extent - we get a null +ptr deref without it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/reflink.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 0978ad92614c..405a194d10e5 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -2,6 +2,7 @@ + #include "bcachefs.h" + #include "bkey_buf.h" + #include "btree_update.h" ++#include "buckets.h" + #include "extents.h" + #include "inode.h" + #include "io.h" +@@ -224,6 +225,8 @@ s64 bch2_remap_range(struct bch_fs *c, + BTREE_ITER_INTENT); + + while (ret == 0 || ret == -EINTR) { ++ struct disk_reservation disk_res = { 0 }; ++ + bch2_trans_begin(&trans); + + if (fatal_signal_pending(current)) { +@@ -287,8 +290,9 @@ s64 bch2_remap_range(struct bch_fs *c, + dst_end.offset - dst_iter->pos.offset)); + + ret = bch2_extent_update(&trans, dst_iter, new_dst.k, +- NULL, journal_seq, ++ &disk_res, journal_seq, + new_i_size, i_sectors_delta); ++ bch2_disk_reservation_put(c, &disk_res); + if (ret) + continue; + +-- +cgit v1.2.3 + + +From 89f1d8cb5b1196b876d523aeabf8f64fba347eae Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 17 May 2021 00:28:50 -0400 +Subject: bcachefs: Fix bch2_extent_can_insert() call + +It was being skipped when hole punching, leading to problems when +splitting compressed extents. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 6 +++--- + fs/bcachefs/buckets.c | 10 ++++++---- + 2 files changed, 9 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 64b6e86cf90a..ac6a039b3fa4 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -806,13 +806,13 @@ static int extent_update_to_keys(struct btree_trans *trans, + { + int ret; + +- if (bkey_deleted(&n.k->k)) +- return 0; +- + ret = bch2_extent_can_insert(trans, n.iter, n.k); + if (ret) + return ret; + ++ if (bkey_deleted(&n.k->k)) ++ return 0; ++ + n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 6e988a28a6be..151b9db32798 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1266,14 +1266,15 @@ int bch2_mark_update(struct btree_trans *trans, + + static noinline __cold + void fs_usage_apply_warn(struct btree_trans *trans, +- unsigned disk_res_sectors) ++ unsigned disk_res_sectors, ++ s64 should_not_have_added) + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + char buf[200]; + +- bch_err(c, "disk usage increased more than %u sectors reserved", +- disk_res_sectors); ++ bch_err(c, "disk usage increased %lli more than %u sectors reserved", ++ should_not_have_added, disk_res_sectors); + + trans_for_each_update(trans, i) { + pr_err("while inserting"); +@@ -1305,6 +1306,7 @@ void fs_usage_apply_warn(struct btree_trans *trans, + } + } + } ++ __WARN(); + } + + void bch2_trans_fs_usage_apply(struct btree_trans *trans, +@@ -1363,7 +1365,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + preempt_enable(); + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) +- fs_usage_apply_warn(trans, disk_res_sectors); ++ fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); + } + + /* trans_mark: */ +-- +cgit v1.2.3 + + +From e6cf2d133b7038fea03016edd11536aa19619ffd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 17 May 2021 16:10:06 -0400 +Subject: bcachefs: Fix a memcpy call + +Not supposed to pass a null ptr to memcpy (even if the size is 0). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index fcdcf42f85a4..a40459d2b0f0 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1161,7 +1161,8 @@ static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot) + return -ENOMEM; + } + +- memcpy(d, t->d, t->size * sizeof(t->d[0])); ++ if (t->d) ++ memcpy(d, t->d, t->size * sizeof(t->d[0])); + kvfree(t->d); + + t->d = d; +-- +cgit v1.2.3 + + +From 77f80ab94c5c3b9d7d866a2def99cf086a29c2b9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 17 May 2021 16:43:30 -0400 +Subject: bcachefs: Fix for bch2_bkey_pack_pos() not initializing len/version + fields + +This bug led to push_whiteout() generating whiteouts that failed +bch2_bkey_invalid() due to nonzero length fields - oops. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +index 3af56062601f..0053f32c0076 100644 +--- a/fs/bcachefs/bkey.c ++++ b/fs/bcachefs/bkey.c +@@ -443,8 +443,15 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, + struct bpos orig = in; + #endif + bool exact = true; ++ unsigned i; + +- out->_data[0] = 0; ++ /* ++ * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3 ++ * byte header, but pack_pos() won't if the len/version fields are big ++ * enough - we need to make sure to zero them out: ++ */ ++ for (i = 0; i < f->key_u64s; i++) ++ out->_data[i] = 0; + + if (unlikely(in.snapshot < + le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { +-- +cgit v1.2.3 + + +From d53952d917c0fd5aa4d11efce0c8b77a2f57b7d8 Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Tue, 18 May 2021 20:36:20 -0400 +Subject: bcachefs: statfs resports incorrect avail blocks + +The current implementation of bch_statfs does not scale the number of +available blocks provided in f_bavail by the reserve factor. This causes +an allocation of a file of this size to fail. + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/buckets.c | 7 ------- + fs/bcachefs/buckets.h | 7 +++++++ + fs/bcachefs/fs.c | 4 ++-- + 3 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 151b9db32798..cbd295e494bd 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -257,18 +257,11 @@ void bch2_fs_usage_to_text(struct printbuf *out, + } + } + +-#define RESERVE_FACTOR 6 +- + static u64 reserve_factor(u64 r) + { + return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); + } + +-static u64 avail_factor(u64 r) +-{ +- return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); +-} +- + u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) + { + return min(fs_usage->u.hidden + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 7463e6420b14..04a2a9310cdd 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -294,6 +294,13 @@ static inline int bch2_disk_reservation_get(struct bch_fs *c, + return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); + } + ++#define RESERVE_FACTOR 6 ++ ++static inline u64 avail_factor(u64 r) ++{ ++ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); ++} ++ + int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); + void bch2_dev_buckets_free(struct bch_dev *); + int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 671b218095fd..e6e439f39ee2 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1273,8 +1273,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) + buf->f_type = BCACHEFS_STATFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = usage.capacity >> shift; +- buf->f_bfree = (usage.capacity - usage.used) >> shift; +- buf->f_bavail = buf->f_bfree; ++ buf->f_bfree = usage.free >> shift; ++ buf->f_bavail = avail_factor(usage.free) >> shift; + + buf->f_files = usage.nr_inodes + avail_inodes; + buf->f_ffree = avail_inodes; +-- +cgit v1.2.3 + + +From 86d24391ce5d9058cd6c59f8570b54f0448c3b94 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 18 May 2021 23:53:43 -0400 +Subject: bcachefs: Move io_in_flight ratelimiting to fs-io.c + +This fixes a bug where an async 
O_DIRECT write that required multiple +bch2_write calls could deadlock, because bch2_write runs out of the same +workqueue used for index updates and would block on the io_in_flight +semaphore. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 7 +++++++ + fs/bcachefs/io.c | 9 --------- + 2 files changed, 7 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 955fce9e90e0..5b2ad2bc7812 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1026,6 +1026,8 @@ static void bch2_writepage_io_done(struct closure *cl) + struct bio_vec *bvec; + unsigned i; + ++ up(&io->op.c->io_in_flight); ++ + if (io->op.error) { + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + +@@ -1088,6 +1090,8 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) + { + struct bch_writepage_io *io = w->io; + ++ down(&io->op.c->io_in_flight); ++ + w->io = NULL; + closure_call(&io->op.cl, bch2_write, NULL, &io->cl); + continue_at(&io->cl, bch2_writepage_io_done, NULL); +@@ -1825,6 +1829,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) + if (dio->loop) + goto loop; + ++ down(&c->io_in_flight); ++ + while (1) { + iter_count = dio->iter.count; + +@@ -1955,6 +1961,7 @@ loop: + + ret = dio->op.error ?: ((long) dio->written << 9); + err: ++ up(&c->io_in_flight); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_quota_reservation_put(c, inode, &dio->quota_res); + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 83e108bb7a04..fe53b4208b9b 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -537,9 +537,6 @@ static void bch2_write_done(struct closure *cl) + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + +- if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) +- up(&c->io_in_flight); +- + if (op->end_io) { + EBUG_ON(cl->parent); + closure_debug_destroy(cl); +@@ -1322,12 +1319,6 @@ void bch2_write(struct closure *cl) + goto err; + } + +- /* +- * Can't ratelimit copygc - we'd deadlock: +- */ +- if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) +- down(&c->io_in_flight); +- + bch2_increment_clock(c, bio_sectors(bio), WRITE); + + data_len = min_t(u64, bio->bi_iter.bi_size, +-- +cgit v1.2.3 + + +From 6cc4087f815ba7b505902d867694a068ef9c8513 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 18 May 2021 23:17:03 -0400 +Subject: bcachefs: Split extents if necessary in bch2_trans_update() + +Currently, we handle multiple overlapping extents in the same +transaction commit by doing fixups in bch2_trans_update() - this patch +extents that to split updates when necessary. The next patch that +changes the reflink code to not fragment extents when making them +indirect will require this. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 27 ++++++++++++++++++++++++--- + fs/bcachefs/io.c | 5 ++--- + fs/bcachefs/reflink.c | 6 ++++-- + 3 files changed, 30 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index ac6a039b3fa4..b7a5cdb71ee8 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1099,9 +1099,30 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + if (i < trans->updates + trans->nr_updates && + i->btree_id == n.btree_id && + bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) { +- /* We don't handle splitting extents here: */ +- BUG_ON(bkey_cmp(bkey_start_pos(&n.k->k), +- bkey_start_pos(&i->k->k)) > 0); ++ if (bkey_cmp(bkey_start_pos(&n.k->k), ++ bkey_start_pos(&i->k->k)) > 0) { ++ struct btree_insert_entry split = *i; ++ int ret; ++ ++ BUG_ON(trans->nr_updates + 1 >= BTREE_ITER_MAX); ++ ++ split.k = bch2_trans_kmalloc(trans, bkey_bytes(&i->k->k)); ++ ret = PTR_ERR_OR_ZERO(split.k); ++ if (ret) ++ return ret; ++ ++ bkey_copy(split.k, i->k); ++ bch2_cut_back(bkey_start_pos(&n.k->k), split.k); ++ ++ split.iter = bch2_trans_get_iter(trans, split.btree_id, ++ bkey_start_pos(&split.k->k), ++ BTREE_ITER_INTENT); ++ split.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ bch2_trans_iter_put(trans, split.iter); ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, split); ++ i++; ++ } + + /* + * When we have an extent that overwrites the start of another +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index fe53b4208b9b..d498994c6ab8 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -332,9 +332,8 @@ int bch2_extent_update(struct btree_trans *trans, + bch2_trans_iter_put(trans, inode_iter); + } + +- bch2_trans_update(trans, iter, k, 0); +- +- ret = bch2_trans_commit(trans, disk_res, journal_seq, ++ ret = bch2_trans_update(trans, iter, k, 0) ?: ++ bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); + if (ret) +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 405a194d10e5..ec8532b39a49 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -155,7 +155,9 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + *refcount = 0; + memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); + +- bch2_trans_update(trans, reflink_iter, r_v, 0); ++ ret = bch2_trans_update(trans, reflink_iter, r_v, 0); ++ if (ret) ++ goto err; + + r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); + if (IS_ERR(r_p)) { +@@ -168,7 +170,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + +- bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); ++ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); + err: + if (!IS_ERR(reflink_iter)) + c->reflink_hint = reflink_iter->pos.offset; +-- +cgit v1.2.3 + + +From 19d835ae9b53215b0e540cf433c46af75919787c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 19 May 2021 21:21:49 -0400 +Subject: bcachefs: Make bch2_remap_range respect O_SYNC + +Caught by xfstest generic/628 + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 5b2ad2bc7812..077632022718 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2943,6 +2943,11 @@ loff_t 
bch2_remap_file_range(struct file *file_src, loff_t pos_src, + if (pos_dst + ret > dst->v.i_size) + i_size_write(&dst->v, pos_dst + ret); + spin_unlock(&dst->v.i_lock); ++ ++ if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || ++ IS_SYNC(file_inode(file_dst))) && ++ !c->opts.journal_flush_disabled) ++ ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq); + err: + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + +-- +cgit v1.2.3 + + +From 91e865fb2b70c5b3fe70d0b6d3628d3723eb2730 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 20 May 2021 00:09:47 -0400 +Subject: bcachefs: Fix inode backpointers in RENAME_OVERWRITE + +When we delete the dirent an inode points to, we need to zero out the +backpointer fields - this was missed in the RENAME_OVERWRITE case. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 5 ++++- + fs/bcachefs/fs-common.c | 7 +++++++ + 2 files changed, 11 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index ec4666143f23..3bf6379cefe6 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -210,6 +210,8 @@ int bch2_dirent_rename(struct btree_trans *trans, + + if (mode != BCH_RENAME) + *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); ++ if (mode != BCH_RENAME_EXCHANGE) ++ *src_offset = dst_iter->pos.offset; + + /* Lookup src: */ + src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, +@@ -290,7 +292,8 @@ int bch2_dirent_rename(struct btree_trans *trans, + bch2_trans_update(trans, src_iter, &new_src->k_i, 0); + bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); + out_set_offset: +- *src_offset = new_src->k.p.offset; ++ if (mode == BCH_RENAME_EXCHANGE) ++ *src_offset = new_src->k.p.offset; + *dst_offset = new_dst->k.p.offset; + out: + bch2_trans_iter_put(trans, src_iter); +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 34d69c3f6680..08c6af886df7 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -289,6 +289,13 @@ int bch2_rename_trans(struct btree_trans *trans, + dst_inode_u->bi_dir = src_dir_u->bi_inum; + dst_inode_u->bi_dir_offset = src_offset; + } ++ ++ if (mode == BCH_RENAME_OVERWRITE && ++ dst_inode_u->bi_dir == dst_dir_u->bi_inum && ++ dst_inode_u->bi_dir_offset == src_offset) { ++ dst_inode_u->bi_dir = 0; ++ dst_inode_u->bi_dir_offset = 0; ++ } + } + + if (mode == BCH_RENAME_OVERWRITE) { +-- +cgit v1.2.3 + + +From ccb2858856a18b130160c7965e3becabdcc411f5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 20 May 2021 15:49:23 -0400 +Subject: bcachefs: Fix for buffered writes getting -ENOSPC + +Buffered writes may have to increase their disk reservation at btree +update time, due to compression and erasure coding being unpredictable: +O_DIRECT writes should be checking for -ENOSPC, but buffered writes have +already been accepted and should not. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 3 +++ + fs/bcachefs/fs-io.c | 3 ++- + fs/bcachefs/io.c | 26 ++++++++++++++++---------- + fs/bcachefs/io.h | 9 +++++---- + fs/bcachefs/reflink.c | 3 ++- + 5 files changed, 28 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index b7a5cdb71ee8..70d2186e509f 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -690,6 +690,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + } + break; + case BTREE_INSERT_ENOSPC: ++ BUG_ON(flags & BTREE_INSERT_NOFAIL); + ret = -ENOSPC; + break; + case BTREE_INSERT_NEED_MARK_REPLICAS: +@@ -743,6 +744,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, + break; + } + ++ BUG_ON(ret == -ENOSPC && (flags & BTREE_INSERT_NOFAIL)); ++ + return ret; + } + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 077632022718..ece28b1e9901 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1895,6 +1895,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) + if ((req->ki_flags & IOCB_DSYNC) && + !c->opts.journal_flush_disabled) + dio->op.flags |= BCH_WRITE_FLUSH; ++ dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), + dio->op.opts.data_replicas, 0); +@@ -2723,7 +2724,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + + ret = bch2_extent_update(&trans, iter, &reservation.k_i, + &disk_res, &inode->ei_journal_seq, +- 0, &i_sectors_delta); ++ 0, &i_sectors_delta, true); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bkey_err: + bch2_quota_reservation_put(c, inode, "a_res); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index d498994c6ab8..3b510f6ba7ef 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -187,7 +187,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool *maybe_extending, +- bool *should_check_enospc, ++ bool *usage_increasing, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) + { +@@ -199,7 +199,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + int ret = 0; + + *maybe_extending = true; +- *should_check_enospc = false; ++ *usage_increasing = false; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; + +@@ -219,10 +219,10 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + ? 
sectors * bch2_bkey_nr_ptrs_fully_allocated(old) + : 0; + +- if (!*should_check_enospc && ++ if (!*usage_increasing && + (new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) +- *should_check_enospc = true; ++ *usage_increasing = true; + + if (bkey_cmp(old.k->p, new->k.p) >= 0) { + /* +@@ -257,11 +257,12 @@ int bch2_extent_update(struct btree_trans *trans, + struct disk_reservation *disk_res, + u64 *journal_seq, + u64 new_i_size, +- s64 *i_sectors_delta_total) ++ s64 *i_sectors_delta_total, ++ bool check_enospc) + { + /* this must live until after bch2_trans_commit(): */ + struct bkey_inode_buf inode_p; +- bool extending = false, should_check_enospc; ++ bool extending = false, usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + +@@ -271,17 +272,20 @@ int bch2_extent_update(struct btree_trans *trans, + + ret = bch2_sum_sector_overwrites(trans, iter, k, + &extending, +- &should_check_enospc, ++ &usage_increasing, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + return ret; + ++ if (!usage_increasing) ++ check_enospc = false; ++ + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, +- !should_check_enospc ++ !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; +@@ -336,6 +340,7 @@ int bch2_extent_update(struct btree_trans *trans, + bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); ++ BUG_ON(ret == -ENOSPC); + if (ret) + return ret; + +@@ -374,7 +379,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + + ret = bch2_extent_update(trans, iter, &delete, + &disk_res, journal_seq, +- 0, i_sectors_delta); ++ 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); + btree_err: + if (ret == -EINTR) { +@@ -447,7 +452,8 @@ int bch2_write_index_default(struct bch_write_op *op) + + ret = bch2_extent_update(&trans, iter, sk.k, + &op->res, op_journal_seq(op), +- op->new_i_size, &op->i_sectors_delta); ++ op->new_i_size, &op->i_sectors_delta, ++ op->flags & BCH_WRITE_CHECK_ENOSPC); + if (ret == -EINTR) + continue; + if (ret) +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 2ac03c049c92..144dc9346c02 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -34,11 +34,12 @@ enum bch_write_flags { + BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), + BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), + BCH_WRITE_FROM_INTERNAL = (1 << 8), ++ BCH_WRITE_CHECK_ENOSPC = (1 << 9), + + /* Internal: */ +- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), +- BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), +- BCH_WRITE_DONE = (1 << 11), ++ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), ++ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), ++ BCH_WRITE_DONE = (1 << 12), + }; + + static inline u64 *op_journal_seq(struct bch_write_op *op) +@@ -64,7 +65,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, bool *, s64 *, s64 *); + int bch2_extent_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct disk_reservation *, +- u64 *, u64, s64 *); ++ u64 *, u64, s64 *, bool); + int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + struct bpos, u64 *, s64 *); + int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index ec8532b39a49..c624fabe1e1c 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -293,7 
+293,8 @@ s64 bch2_remap_range(struct bch_fs *c, + + ret = bch2_extent_update(&trans, dst_iter, new_dst.k, + &disk_res, journal_seq, +- new_i_size, i_sectors_delta); ++ new_i_size, i_sectors_delta, ++ true); + bch2_disk_reservation_put(c, &disk_res); + if (ret) + continue; +-- +cgit v1.2.3 + + +From 569d3fe79df4456ca5bea77badd4fcd994bf4429 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 20 May 2021 20:47:27 -0400 +Subject: bcachefs: Fix an uninitialized var + +this fixes a valgrind complaint + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index d7a1abf44f41..d31c6bd1ee7f 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -286,6 +286,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + + memset(&b->nr, 0, sizeof(b->nr)); + b->data->magic = cpu_to_le64(bset_magic(c)); ++ memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); + b->data->flags = 0; + SET_BTREE_NODE_ID(b->data, as->btree_id); + SET_BTREE_NODE_LEVEL(b->data, level); +-- +cgit v1.2.3 + + +From 322af6d469f9953d4057e01c2edfa263c129f5a4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 21 May 2021 16:06:54 -0400 +Subject: bcachefs: Don't repair btree nodes until after interior journal + replay is done + +We need the btree to be in a consistent state before we can rewrite +btree nodes. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index d31c6bd1ee7f..5ee191ba495c 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1828,6 +1828,9 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) + { + struct async_btree_rewrite *a; + ++ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) ++ return; ++ + if (!percpu_ref_tryget(&c->writes)) + return; + +-- +cgit v1.2.3 + + +From 74ea964222e5220aca0ecbe10310fd32fd4aafc9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 21 May 2021 23:57:37 -0400 +Subject: bcachefs: Add a debug mode that always reads from every btree replica + +There's a new module parameter, verify_all_btree_replicas, that enables +reading from every btree replica when reading in btree nodes and +comparing them against each other. We've been seeing some strange btree +corruption - this will hopefully aid in tracking it down and catching it +more often. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 5 +- + fs/bcachefs/btree_io.c | 272 +++++++++++++++++++++++++++++++++++++++++++++++-- + fs/bcachefs/btree_io.h | 4 + + 3 files changed, 273 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 40ce8c763396..77928bae446c 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -264,7 +264,10 @@ do { \ + BCH_DEBUG_PARAM(verify_btree_ondisk, \ + "Reread btree nodes at various points to verify the " \ + "mergesort in the read path against modifications " \ +- "done in memory") ++ "done in memory") \ ++ BCH_DEBUG_PARAM(verify_all_btree_replicas, \ ++ "When reading btree nodes, read all replicas and " \ ++ "compare them") + + /* Parameters that should only be compiled in in debug mode: */ + #define BCH_DEBUG_PARAMS_DEBUG() \ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index a0af6d013ef3..52f26e23b859 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -521,7 +521,7 @@ enum btree_validate_ret { + \ + switch (write) { \ + case READ: \ +- bch_err(c, "%s", _buf2); \ ++ bch_err(c, "%s", _buf2); \ + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ +@@ -1035,8 +1035,8 @@ static void btree_node_read_work(struct work_struct *work) + struct btree_read_bio *rb = + container_of(work, struct btree_read_bio, work); + struct bch_fs *c = rb->c; ++ struct btree *b = rb->b; + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); +- struct btree *b = rb->bio.bi_private; + struct bio *bio = &rb->bio; + struct bch_io_failures failed = { .nr = 0 }; + char buf[200]; +@@ -1112,6 +1112,261 @@ static void btree_node_read_endio(struct bio *bio) + queue_work(system_unbound_wq, &rb->work); + } + ++struct btree_node_read_all { ++ struct closure cl; ++ struct bch_fs *c; ++ struct btree *b; ++ unsigned nr; ++ void *buf[BCH_REPLICAS_MAX]; ++ struct bio *bio[BCH_REPLICAS_MAX]; ++ int err[BCH_REPLICAS_MAX]; ++}; ++ ++static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) ++{ ++ struct btree_node *bn = data; ++ struct btree_node_entry *bne; ++ unsigned offset = 0; ++ ++ if (le64_to_cpu(bn->magic) != bset_magic(c)) ++ return 0; ++ ++ while (offset < c->opts.btree_node_size) { ++ if (!offset) { ++ offset += vstruct_sectors(bn, c->block_bits); ++ } else { ++ bne = data + (offset << 9); ++ if (bne->keys.seq != bn->keys.seq) ++ break; ++ offset += vstruct_sectors(bne, c->block_bits); ++ } ++ } ++ ++ return offset; ++} ++ ++static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data) ++{ ++ struct btree_node *bn = data; ++ struct btree_node_entry *bne; ++ ++ if (!offset) ++ return false; ++ ++ while (offset < c->opts.btree_node_size) { ++ bne = data + (offset << 9); ++ if (bne->keys.seq == bn->keys.seq) ++ return true; ++ offset++; ++ } ++ ++ return false; ++ return offset; ++} ++ ++static void btree_node_read_all_replicas_done(struct closure *cl) ++{ ++ struct btree_node_read_all *ra = ++ container_of(cl, struct btree_node_read_all, cl); ++ struct bch_fs *c = ra->c; ++ struct btree *b = ra->b; ++ bool have_good_copy = false; ++ bool dump_bset_maps = false; ++ bool have_retry = false; ++ int ret = 0, write = READ; ++ unsigned i, written, written2; ++ __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ++ ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; ++ ++ for (i = 0; i < ra->nr; i++) { ++ if (ra->err[i]) ++ continue; ++ ++ if (!have_good_copy) { ++ memcpy(b->data, ra->buf[i], btree_bytes(c)); ++ have_good_copy = true; ++ written = btree_node_sectors_written(c, b->data); ++ } ++ ++ /* Try to get the right btree node: */ ++ if (have_good_copy && ++ seq && ++ b->data->keys.seq != seq && ++ ((struct btree_node *) ra->buf[i])->keys.seq == seq) { ++ memcpy(b->data, ra->buf[i], btree_bytes(c)); ++ written = btree_node_sectors_written(c, b->data); ++ } ++ ++ written2 = btree_node_sectors_written(c, ra->buf[i]); ++ if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ "btree node sectors written mismatch: %u != %u", ++ written, written2) || ++ btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), ++ BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ "found bset signature after last bset") || ++ btree_err_on(memcmp(b->data, ra->buf[i], written << 9), ++ BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ "btree node replicas content mismatch")) ++ dump_bset_maps = true; ++ ++ if (written2 > written) { ++ written = written2; ++ memcpy(b->data, ra->buf[i], btree_bytes(c)); ++ } ++ } ++fsck_err: ++ if (dump_bset_maps) { ++ for (i = 0; i < ra->nr; i++) { ++ char buf[200]; ++ struct printbuf out = PBUF(buf); ++ struct btree_node *bn = ra->buf[i]; ++ struct btree_node_entry *bne = NULL; ++ unsigned offset = 0, sectors; ++ bool gap = false; ++ ++ if (ra->err[i]) ++ continue; ++ ++ while (offset < c->opts.btree_node_size) { ++ if (!offset) { ++ sectors = vstruct_sectors(bn, c->block_bits); ++ } else { ++ bne = ra->buf[i] + (offset << 9); ++ if (bne->keys.seq != bn->keys.seq) ++ break; ++ sectors = vstruct_sectors(bne, c->block_bits); ++ } ++ ++ pr_buf(&out, " %u-%u", offset, offset + sectors); ++ if (bne && bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(bne->keys.journal_seq), false)) ++ pr_buf(&out, "*"); ++ offset += sectors; ++ } ++ ++ while (offset < c->opts.btree_node_size) { ++ bne = ra->buf[i] + (offset << 9); ++ if (bne->keys.seq == bn->keys.seq) { ++ if (!gap) ++ pr_buf(&out, " GAP"); ++ gap = true; ++ ++ sectors = vstruct_sectors(bne, c->block_bits); ++ pr_buf(&out, " %u-%u", offset, offset + sectors); ++ if (bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(bne->keys.journal_seq), false)) ++ pr_buf(&out, "*"); ++ } ++ offset++; ++ } ++ ++ bch_err(c, "replica %u:%s", i, buf); ++ } ++ } ++ ++ if (have_good_copy) ++ bch2_btree_node_read_done(c, NULL, b, false); ++ else ++ set_btree_node_read_error(b); ++ ++ for (i = 0; i < ra->nr; i++) { ++ mempool_free(ra->buf[i], &c->btree_bounce_pool); ++ bio_put(ra->bio[i]); ++ } ++ ++ closure_debug_destroy(&ra->cl); ++ kfree(ra); ++ ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++} ++ ++static void btree_node_read_all_replicas_endio(struct bio *bio) ++{ ++ struct btree_read_bio *rb = ++ container_of(bio, struct btree_read_bio, bio); ++ struct bch_fs *c = rb->c; ++ struct btree_node_read_all *ra = rb->ra; ++ ++ if (rb->have_ioref) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ bch2_latency_acct(ca, rb->start_time, READ); ++ } ++ ++ ra->err[rb->idx] = bio->bi_status; ++ closure_put(&ra->cl); ++} ++ ++/* ++ * XXX This allocates multiple times from the same mempools, and can deadlock ++ * under sufficient memory pressure (but is only a debug path) ++ */ ++static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) ++{ ++ struct bkey_s_c k = 
bkey_i_to_s_c(&b->key); ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded pick; ++ struct btree_node_read_all *ra; ++ unsigned i; ++ ++ ra = kzalloc(sizeof(*ra), GFP_NOFS); ++ if (!ra) ++ return -ENOMEM; ++ ++ closure_init(&ra->cl, NULL); ++ ra->c = c; ++ ra->b = b; ++ ra->nr = bch2_bkey_nr_ptrs(k); ++ ++ for (i = 0; i < ra->nr; i++) { ++ ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); ++ ra->bio[i] = bio_alloc_bioset(GFP_NOFS, buf_pages(ra->buf[i], ++ btree_bytes(c)), ++ &c->btree_bio); ++ } ++ ++ i = 0; ++ bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ struct btree_read_bio *rb = ++ container_of(ra->bio[i], struct btree_read_bio, bio); ++ rb->c = c; ++ rb->b = b; ++ rb->ra = ra; ++ rb->start_time = local_clock(); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ rb->idx = i; ++ rb->pick = pick; ++ rb->bio.bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; ++ rb->bio.bi_iter.bi_sector = pick.ptr.offset; ++ rb->bio.bi_end_io = btree_node_read_all_replicas_endio; ++ bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); ++ ++ if (rb->have_ioref) { ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], ++ bio_sectors(&rb->bio)); ++ bio_set_dev(&rb->bio, ca->disk_sb.bdev); ++ ++ closure_get(&ra->cl); ++ submit_bio(&rb->bio); ++ } else { ++ ra->err[i] = BLK_STS_REMOVED; ++ } ++ ++ i++; ++ } ++ ++ if (sync) { ++ closure_sync(&ra->cl); ++ btree_node_read_all_replicas_done(&ra->cl); ++ } else { ++ continue_at(&ra->cl, btree_node_read_all_replicas_done, system_unbound_wq); ++ } ++ ++ return 0; ++} ++ + void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + bool sync) + { +@@ -1125,6 +1380,12 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + btree_pos_to_text(&PBUF(buf), c, b); + trace_btree_read(c, b); + ++ set_btree_node_read_in_flight(b); ++ ++ if (bch2_verify_all_btree_replicas && ++ !btree_node_read_all_replicas(c, b, sync)) ++ return; ++ + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick); + if (bch2_fs_fatal_err_on(ret <= 0, c, +@@ -1141,6 +1402,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + &c->btree_bio); + rb = container_of(bio, struct btree_read_bio, bio); + rb->c = c; ++ rb->b = b; ++ rb->ra = NULL; + rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->pick = pick; +@@ -1148,11 +1411,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; + bio->bi_iter.bi_sector = pick.ptr.offset; + bio->bi_end_io = btree_node_read_endio; +- bio->bi_private = b; + bch2_bio_map(bio, b->data, btree_bytes(c)); + +- set_btree_node_read_in_flight(b); +- + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], + bio_sectors(bio)); +@@ -1161,7 +1421,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + if (sync) { + submit_bio_wait(bio); + +- bio->bi_private = b; + btree_node_read_work(&rb->work); + } else { + submit_bio(bio); +@@ -1173,7 +1432,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + btree_node_read_work(&rb->work); + else + queue_work(system_unbound_wq, &rb->work); +- + } + } + +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index cadcf7f886d7..abbc4675964a 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -13,6 +13,7 @@ struct bch_fs; + struct btree_write; + struct btree; + struct btree_iter; ++struct 
btree_node_read_all; + + static inline bool btree_node_dirty(struct btree *b) + { +@@ -33,8 +34,11 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) + + struct btree_read_bio { + struct bch_fs *c; ++ struct btree *b; ++ struct btree_node_read_all *ra; + u64 start_time; + unsigned have_ioref:1; ++ unsigned idx:7; + struct extent_ptr_decoded pick; + struct work_struct work; + struct bio bio; +-- +cgit v1.2.3 + + +From 05e5b154a4ac51c7aaeb76e2bee9ac948ca3e0de Mon Sep 17 00:00:00 2001 +From: Brett Holman +Date: Fri, 21 May 2021 16:45:38 -0600 +Subject: bcachefs: rewrote prefetch asm in gas syntax for clang compatibility + +--- + fs/bcachefs/bset.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 61d29cc92079..1d170d8a65c8 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -1193,13 +1193,11 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, + + static inline void prefetch_four_cachelines(void *p) + { +-#if (CONFIG_X86_64 && !defined(__clang__)) +- asm(".intel_syntax noprefix;" +- "prefetcht0 [%0 - 127 + 64 * 0];" +- "prefetcht0 [%0 - 127 + 64 * 1];" +- "prefetcht0 [%0 - 127 + 64 * 2];" +- "prefetcht0 [%0 - 127 + 64 * 3];" +- ".att_syntax prefix;" ++#if CONFIG_X86_64 ++ asm("prefetcht0 (-127 + 64 * 0)(%0);" ++ "prefetcht0 (-127 + 64 * 1)(%0);" ++ "prefetcht0 (-127 + 64 * 2)(%0);" ++ "prefetcht0 (-127 + 64 * 3)(%0);" + : + : "r" (p + 127)); + #else +-- +cgit v1.2.3 + + +From d5e2c4be161a607cd953704661ee4689150379d0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 22 May 2021 17:37:25 -0400 +Subject: bcachefs: Add a workqueue for btree io completions + +Also, clean up workqueue usage - we shouldn't be using system +workqueues, pretty much everything we do needs to be on our own +WQ_MEM_RECLAIM workqueues. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 ++- + fs/bcachefs/btree_io.c | 13 +++++++------ + fs/bcachefs/btree_update_interior.c | 5 +++-- + fs/bcachefs/io.c | 9 ++++++++- + fs/bcachefs/io.h | 2 +- + fs/bcachefs/journal.c | 6 ++++-- + fs/bcachefs/journal_io.c | 14 +++++++------- + fs/bcachefs/super.c | 10 +++++++--- + 8 files changed, 39 insertions(+), 23 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 77928bae446c..4d20c52c74a8 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -627,6 +627,7 @@ struct bch_fs { + + /* BTREE CACHE */ + struct bio_set btree_bio; ++ struct workqueue_struct *io_complete_wq; + + struct btree_root btree_roots[BTREE_ID_NR]; + struct mutex btree_root_lock; +@@ -664,7 +665,7 @@ struct bch_fs { + + struct btree_key_cache btree_key_cache; + +- struct workqueue_struct *wq; ++ struct workqueue_struct *btree_update_wq; + /* copygc needs its own workqueue for index updates.. 
*/ + struct workqueue_struct *copygc_wq; + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 52f26e23b859..c1c3ed63ce76 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1109,7 +1109,7 @@ static void btree_node_read_endio(struct bio *bio) + bch2_latency_acct(ca, rb->start_time, READ); + } + +- queue_work(system_unbound_wq, &rb->work); ++ queue_work(c->io_complete_wq, &rb->work); + } + + struct btree_node_read_all { +@@ -1361,7 +1361,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool + closure_sync(&ra->cl); + btree_node_read_all_replicas_done(&ra->cl); + } else { +- continue_at(&ra->cl, btree_node_read_all_replicas_done, system_unbound_wq); ++ continue_at(&ra->cl, btree_node_read_all_replicas_done, ++ c->io_complete_wq); + } + + return 0; +@@ -1431,7 +1432,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + if (sync) + btree_node_read_work(&rb->work); + else +- queue_work(system_unbound_wq, &rb->work); ++ queue_work(c->io_complete_wq, &rb->work); + } + } + +@@ -1598,7 +1599,7 @@ static void btree_node_write_work(struct work_struct *work) + bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + +- queue_work(c->wq, &c->btree_write_error_work); ++ queue_work(c->btree_update_wq, &c->btree_write_error_work); + return; + } + +@@ -1637,7 +1638,7 @@ static void btree_node_write_endio(struct bio *bio) + container_of(orig, struct btree_write_bio, wbio); + + INIT_WORK(&wb->work, btree_node_write_work); +- queue_work(system_unbound_wq, &wb->work); ++ queue_work(c->io_complete_wq, &wb->work); + } + } + +@@ -1897,7 +1898,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + atomic64_add(sectors_to_write, &c->btree_writes_sectors); + + INIT_WORK(&wbio->work, btree_write_submit); +- schedule_work(&wbio->work); ++ queue_work(c->io_complete_wq, &wbio->work); + return; + err: + set_btree_node_noevict(b); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 5ee191ba495c..c55df177d7f2 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -908,7 +908,8 @@ void bch2_btree_update_done(struct btree_update *as) + + bch2_btree_reserve_put(as); + +- continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); ++ continue_at(&as->cl, btree_update_set_nodes_written, ++ as->c->btree_interior_update_worker); + } + + struct btree_update * +@@ -1847,7 +1848,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) + a->seq = b->data->keys.seq; + + INIT_WORK(&a->work, async_btree_node_rewrite_work); +- queue_work(system_long_wq, &a->work); ++ queue_work(c->btree_interior_update_worker, &a->work); + } + + static void __bch2_btree_node_update_key(struct bch_fs *c, +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 3b510f6ba7ef..c928bc8aa03c 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1438,7 +1438,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) + bch2_migrate_read_done(&op->write, rbio); + + closure_init(cl, NULL); +- closure_call(&op->write.op.cl, bch2_write, c->wq, cl); ++ closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl); + closure_return_with_destructor(cl, promote_done); + } + +@@ -1821,6 +1821,13 @@ static void __bch2_read_endio(struct work_struct *work) + if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) + goto csum_err; + ++ /* ++ * XXX ++ * We need to rework the 
narrow_crcs path to deliver the read completion ++ * first, and then punt to a different workqueue, otherwise we're ++ * holding up reads while doing btree updates which is bad for memory ++ * reclaim. ++ */ + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 144dc9346c02..bc0a0bd6f849 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -58,7 +58,7 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) + { + return op->alloc_reserve == RESERVE_MOVINGGC + ? op->c->copygc_wq +- : op->c->wq; ++ : op->c->btree_update_wq; + } + + int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 9c8d408c2a08..ac4071fc4e80 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -118,7 +118,9 @@ void bch2_journal_halt(struct journal *j) + + void __bch2_journal_buf_put(struct journal *j) + { +- closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ ++ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + } + + /* +@@ -304,7 +306,7 @@ static int journal_entry_open(struct journal *j) + j->res_get_blocked_start); + j->res_get_blocked_start = 0; + +- mod_delayed_work(system_freezable_wq, ++ mod_delayed_work(c->io_complete_wq, + &j->write_work, + msecs_to_jiffies(j->write_delay_ms)); + journal_wake(j); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 635cceb4dd21..b40952248a5d 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1296,12 +1296,12 @@ static void journal_write_done(struct closure *cl) + journal_wake(j); + + if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) +- mod_delayed_work(system_freezable_wq, &j->write_work, 0); ++ mod_delayed_work(c->io_complete_wq, &j->write_work, 0); + spin_unlock(&j->lock); + + if (new.unwritten_idx != new.idx && + !journal_state_count(new, new.unwritten_idx)) +- closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); ++ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + } + + static void journal_write_endio(struct bio *bio) +@@ -1370,7 +1370,7 @@ static void do_journal_write(struct closure *cl) + le64_to_cpu(w->data->seq); + } + +- continue_at(cl, journal_write_done, system_highpri_wq); ++ continue_at(cl, journal_write_done, c->io_complete_wq); + return; + } + +@@ -1509,7 +1509,7 @@ retry_alloc: + journal_debug_buf); + kfree(journal_debug_buf); + bch2_fatal_error(c); +- continue_at(cl, journal_write_done, system_highpri_wq); ++ continue_at(cl, journal_write_done, c->io_complete_wq); + return; + } + +@@ -1542,14 +1542,14 @@ retry_alloc: + + bch2_bucket_seq_cleanup(c); + +- continue_at(cl, do_journal_write, system_highpri_wq); ++ continue_at(cl, do_journal_write, c->io_complete_wq); + return; + no_io: + bch2_bucket_seq_cleanup(c); + +- continue_at(cl, journal_write_done, system_highpri_wq); ++ continue_at(cl, journal_write_done, c->io_complete_wq); + return; + err: + bch2_inconsistent_error(c); +- continue_at(cl, journal_write_done, system_highpri_wq); ++ continue_at(cl, journal_write_done, c->io_complete_wq); + } +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 01b246076a0b..ab1a07c82584 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -503,10 +503,12 @@ static void __bch2_fs_free(struct bch_fs *c) + kfree(c->unused_inode_hints); + free_heap(&c->copygc_heap); + ++ if 
(c->io_complete_wq ) ++ destroy_workqueue(c->io_complete_wq ); + if (c->copygc_wq) + destroy_workqueue(c->copygc_wq); +- if (c->wq) +- destroy_workqueue(c->wq); ++ if (c->btree_update_wq) ++ destroy_workqueue(c->btree_update_wq); + + bch2_free_super(&c->disk_sb); + kvpfree(c, sizeof(*c)); +@@ -754,10 +756,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); + +- if (!(c->wq = alloc_workqueue("bcachefs", ++ if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->io_complete_wq = alloc_workqueue("bcachefs_io", ++ WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || + percpu_ref_init(&c->writes, bch2_writes_disabled, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || +-- +cgit v1.2.3 + + +From c6886c9d5e1c25fc0f3e3ff492542958722ccb14 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 22 May 2021 21:13:17 -0400 +Subject: bcachefs: Improve FS_IOC_GOINGDOWN ioctl + +We weren't interpreting the flags argument at all. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-ioctl.c | 60 +++++++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 49 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +index ef2ab3e7dfa5..91a0e761c8e7 100644 +--- a/fs/bcachefs/fs-ioctl.c ++++ b/fs/bcachefs/fs-ioctl.c +@@ -13,6 +13,9 @@ + #include + + #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) ++#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ ++#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ ++#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + + struct flags_set { + unsigned mask; +@@ -247,11 +250,54 @@ err1: + return ret; + } + ++static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) ++{ ++ u32 flags; ++ int ret = 0; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (get_user(flags, arg)) ++ return -EFAULT; ++ ++ bch_notice(c, "shutdown by ioctl type %u", flags); ++ ++ down_write(&c->vfs_sb->s_umount); ++ ++ switch (flags) { ++ case FSOP_GOING_FLAGS_DEFAULT: ++ ret = freeze_bdev(c->vfs_sb->s_bdev); ++ if (ret) ++ goto err; ++ ++ bch2_journal_flush(&c->journal); ++ c->vfs_sb->s_flags |= SB_RDONLY; ++ bch2_fs_emergency_read_only(c); ++ thaw_bdev(c->vfs_sb->s_bdev); ++ break; ++ ++ case FSOP_GOING_FLAGS_LOGFLUSH: ++ bch2_journal_flush(&c->journal); ++ fallthrough; ++ ++ case FSOP_GOING_FLAGS_NOLOGFLUSH: ++ c->vfs_sb->s_flags |= SB_RDONLY; ++ bch2_fs_emergency_read_only(c); ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++err: ++ up_write(&c->vfs_sb->s_umount); ++ return ret; ++} ++ + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) + { + struct bch_inode_info *inode = file_bch_inode(file); +- struct super_block *sb = inode->v.i_sb; +- struct bch_fs *c = sb->s_fs_info; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; + + switch (cmd) { + case FS_IOC_GETFLAGS: +@@ -276,15 +322,7 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) + return -ENOTTY; + + case FS_IOC_GOINGDOWN: +- if (!capable(CAP_SYS_ADMIN)) +- return -EPERM; +- +- down_write(&sb->s_umount); +- sb->s_flags |= SB_RDONLY; +- if (bch2_fs_emergency_read_only(c)) +- bch_err(c, "emergency read only due to ioctl"); +- up_write(&sb->s_umount); +- return 0; ++ 
return bch2_ioc_goingdown(c, (u32 __user *) arg); + + default: + return bch2_fs_ioctl(c, cmd, (void __user *) arg); +-- +cgit v1.2.3 + + +From dffcaa0883e20d83196a95fc2950fedaac543c01 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 22 May 2021 21:43:20 -0400 +Subject: bcachefs: Fix an issue with inconsistent btree writes after unclean + shutdown + +After unclean shutdown, btree writes may have completed on one device +and not others - and this inconsistency could lead us to writing new +bsets with a gap in our btree node in one of our replicas. + +Fortunately, this is only an issue with bsets that are newer than the +most recent journal flush, and we already have a mechanism for detecting +and blacklisting those. We just need to make sure to start new btree +writes after the most recent _non_ blacklisted bset. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 19 ++++++++++++++++++- + 1 file changed, 18 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c1c3ed63ce76..c23f10d25181 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -823,6 +823,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + unsigned u64s; ++ unsigned nonblacklisted_written = 0; + int ret, retry_read = 0, write = READ; + + b->version_ondisk = U16_MAX; +@@ -942,15 +943,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + sort_iter_add(iter, + vstruct_idx(i, whiteout_u64s), + vstruct_last(i)); ++ ++ nonblacklisted_written = b->written; + } + + for (bne = write_block(b); + bset_byte_offset(b, bne) < btree_bytes(c); + bne = (void *) bne + block_bytes(c)) +- btree_err_on(bne->keys.seq == b->data->keys.seq, ++ btree_err_on(bne->keys.seq == b->data->keys.seq && ++ !bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(bne->keys.journal_seq), ++ true), + BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + "found bset signature after last bset"); + ++ /* ++ * Blacklisted bsets are those that were written after the most recent ++ * (flush) journal write. Since there wasn't a flush, they may not have ++ * made it to all devices - which means we shouldn't write new bsets ++ * after them, as that could leave a gap and then reads from that device ++ * wouldn't find all the bsets in that btree node - which means it's ++ * important that we start writing new bsets after the most recent _non_ ++ * blacklisted bset: ++ */ ++ b->written = nonblacklisted_written; ++ + sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted->keys.u64s = 0; + +-- +cgit v1.2.3 + + +From 1022c4709f7a1d0a85dbca373f866026e65ab098 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 23 May 2021 18:42:51 -0400 +Subject: bcachefs: Fix a null ptr deref + +bch2_btree_iter_peek() won't always return a key - whoops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 778ff72cf5b2..2d5c4e9bbf42 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -523,6 +523,11 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, + if (ret) + goto err; + ++ if (!k.k || bkey_cmp(k.k->p, pos)) { ++ ret = -ENOENT; ++ goto err; ++ } ++ + ret = k.k->type == KEY_TYPE_inode ? 
0 : -EIO; + if (ret) + goto err; +-- +cgit v1.2.3 + + +From b52a3f9de5d5150f414d1d564bfcdb8c0d6dfdd8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 25 May 2021 18:42:05 -0400 +Subject: bcachefs: Add a cond_resched call to the copygc main loop + +We seem to have a bug where the copygc thread ends up spinning and +making the system unusable - this will at least prevent it from locking +up the machine, and it's a good thing to have anyways. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/movinggc.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 61c5901f0980..0963c4969421 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -317,6 +317,8 @@ static int bch2_copygc_thread(void *arg) + set_freezable(); + + while (!kthread_should_stop()) { ++ cond_resched(); ++ + if (kthread_wait_freezable(c->copy_gc_enabled)) + break; + +-- +cgit v1.2.3 + + +From 252001889f8cd7168fefeff972fb8f43322bee16 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 26 May 2021 01:03:35 -0400 +Subject: bcachefs: Add a tracepoint for copygc waiting + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/movinggc.c | 1 + + include/trace/events/bcachefs.h | 21 +++++++++++++++++++++ + 2 files changed, 22 insertions(+) + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 0963c4969421..2acca0ddb6fd 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -326,6 +326,7 @@ static int bch2_copygc_thread(void *arg) + wait = bch2_copygc_wait_amount(c); + + if (wait > clock->max_slop) { ++ trace_copygc_wait(c, wait, last + wait); + c->copygc_wait = last + wait; + bch2_kthread_io_clock_wait(clock, last + wait, + MAX_SCHEDULE_TIMEOUT); +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 48dc2377930a..b5f3c3d1498b 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -528,6 +528,27 @@ TRACE_EVENT(copygc, + __entry->buckets_moved, __entry->buckets_not_moved) + ); + ++TRACE_EVENT(copygc_wait, ++ TP_PROTO(struct bch_fs *c, ++ u64 wait_amount, u64 until), ++ TP_ARGS(c, wait_amount, until), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, wait_amount ) ++ __field(u64, until ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->wait_amount = wait_amount; ++ __entry->until = until; ++ ), ++ ++ TP_printk("%pU waiting for %llu sectors until %llu", ++ __entry->uuid, __entry->wait_amount, __entry->until) ++); ++ + TRACE_EVENT(trans_get_iter, + TP_PROTO(unsigned long caller, unsigned long ip, + enum btree_id btree_id, +-- +cgit v1.2.3 + + +From 7f63f5326f0918c6a5d9d8a3d4b5cf74dff573c6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 27 May 2021 19:15:44 -0400 +Subject: bcachefs: Don't use uuid in tracepoints + +%pU for printing out pointers to uuids doesn't work in perf trace + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/fs.c | 2 + + include/trace/events/bcachefs.h | 93 +++++++++++++++++++---------------------- + 3 files changed, 45 insertions(+), 51 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 4d20c52c74a8..32435696841d 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -568,6 +568,7 @@ struct bch_fs { + int minor; + struct device *chardev; + struct super_block *vfs_sb; ++ dev_t dev; + char name[40]; + + /* ro/rw, add/remove/resize devices: */ +diff --git a/fs/bcachefs/fs.c 
b/fs/bcachefs/fs.c +index e6e439f39ee2..2dabf86450c7 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1590,6 +1590,8 @@ got_sb: + break; + } + ++ c->dev = sb->s_dev; ++ + #ifdef CONFIG_BCACHEFS_POSIX_ACL + if (c->opts.acl) + sb->s_flags |= SB_POSIXACL; +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index b5f3c3d1498b..05314cc0f97a 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -49,14 +49,14 @@ DECLARE_EVENT_CLASS(bch_fs, + TP_ARGS(c), + + TP_STRUCT__entry( +- __array(char, uuid, 16 ) ++ __field(dev_t, dev ) + ), + + TP_fast_assign( +- memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->dev = c->dev; + ), + +- TP_printk("%pU", __entry->uuid) ++ TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) + ); + + DECLARE_EVENT_CLASS(bio, +@@ -131,7 +131,7 @@ TRACE_EVENT(journal_reclaim_start, + btree_key_cache_dirty, btree_key_cache_total), + + TP_STRUCT__entry( +- __array(char, uuid, 16 ) ++ __field(dev_t, dev ) + __field(u64, min_nr ) + __field(u64, prereserved ) + __field(u64, prereserved_total ) +@@ -142,7 +142,7 @@ TRACE_EVENT(journal_reclaim_start, + ), + + TP_fast_assign( +- memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->dev = c->dev; + __entry->min_nr = min_nr; + __entry->prereserved = prereserved; + __entry->prereserved_total = prereserved_total; +@@ -152,8 +152,8 @@ TRACE_EVENT(journal_reclaim_start, + __entry->btree_key_cache_total = btree_key_cache_total; + ), + +- TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", +- __entry->uuid, ++ TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->min_nr, + __entry->prereserved, + __entry->prereserved_total, +@@ -168,16 +168,18 @@ TRACE_EVENT(journal_reclaim_finish, + TP_ARGS(c, nr_flushed), + + TP_STRUCT__entry( +- __array(char, uuid, 16 ) +- __field(u64, nr_flushed ) ++ __field(dev_t, dev ) ++ __field(u64, nr_flushed ) + ), + + TP_fast_assign( +- memcpy(__entry->uuid, c->sb.user_uuid.b, 16); +- __entry->nr_flushed = nr_flushed; ++ __entry->dev = c->dev; ++ __entry->nr_flushed = nr_flushed; + ), + +- TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed) ++ TP_printk("%d%d flushed %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->nr_flushed) + ); + + /* bset.c: */ +@@ -194,7 +196,7 @@ DECLARE_EVENT_CLASS(btree_node, + TP_ARGS(c, b), + + TP_STRUCT__entry( +- __array(char, uuid, 16 ) ++ __field(dev_t, dev ) + __field(u8, level ) + __field(u8, id ) + __field(u64, inode ) +@@ -202,15 +204,16 @@ DECLARE_EVENT_CLASS(btree_node, + ), + + TP_fast_assign( +- memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->dev = c->dev; + __entry->level = b->c.level; + __entry->id = b->c.btree_id; + __entry->inode = b->key.k.p.inode; + __entry->offset = b->key.k.p.offset; + ), + +- TP_printk("%pU %u id %u %llu:%llu", +- __entry->uuid, __entry->level, __entry->id, ++ TP_printk("%d,%d %u id %u %llu:%llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->level, __entry->id, + __entry->inode, __entry->offset) + ); + +@@ -254,32 +257,17 @@ DEFINE_EVENT(btree_node, btree_node_reap, + TP_ARGS(c, b) + ); + +-DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, +- TP_PROTO(struct bch_fs *c), +- TP_ARGS(c), +- +- TP_STRUCT__entry( +- __array(char, uuid, 16 ) +- ), +- +- TP_fast_assign( +- memcpy(__entry->uuid, c->sb.user_uuid.b, 16); +- ), +- +- TP_printk("%pU", __entry->uuid) +-); +- 
+-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) + ); + +-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) + ); + +-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, ++DEFINE_EVENT(bch_fs, btree_node_cannibalize, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) + ); +@@ -294,18 +282,19 @@ TRACE_EVENT(btree_reserve_get_fail, + TP_ARGS(c, required, cl), + + TP_STRUCT__entry( +- __array(char, uuid, 16 ) ++ __field(dev_t, dev ) + __field(size_t, required ) + __field(struct closure *, cl ) + ), + + TP_fast_assign( +- memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->dev = c->dev; + __entry->required = required; + __entry->cl = cl; + ), + +- TP_printk("%pU required %zu by %p", __entry->uuid, ++ TP_printk("%d,%d required %zu by %p", ++ MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->required, __entry->cl) + ); + +@@ -483,19 +472,20 @@ TRACE_EVENT(move_data, + TP_ARGS(c, sectors_moved, keys_moved), + + TP_STRUCT__entry( +- __array(char, uuid, 16 ) ++ __field(dev_t, dev ) + __field(u64, sectors_moved ) + __field(u64, keys_moved ) + ), + + TP_fast_assign( +- memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->dev = c->dev; + __entry->sectors_moved = sectors_moved; + __entry->keys_moved = keys_moved; + ), + +- TP_printk("%pU sectors_moved %llu keys_moved %llu", +- __entry->uuid, __entry->sectors_moved, __entry->keys_moved) ++ TP_printk("%d,%d sectors_moved %llu keys_moved %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->sectors_moved, __entry->keys_moved) + ); + + TRACE_EVENT(copygc, +@@ -507,7 +497,7 @@ TRACE_EVENT(copygc, + buckets_moved, buckets_not_moved), + + TP_STRUCT__entry( +- __array(char, uuid, 16 ) ++ __field(dev_t, dev ) + __field(u64, sectors_moved ) + __field(u64, sectors_not_moved ) + __field(u64, buckets_moved ) +@@ -515,17 +505,17 @@ TRACE_EVENT(copygc, + ), + + TP_fast_assign( +- memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->dev = c->dev; + __entry->sectors_moved = sectors_moved; + __entry->sectors_not_moved = sectors_not_moved; + __entry->buckets_moved = buckets_moved; + __entry->buckets_not_moved = buckets_moved; + ), + +- TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", +- __entry->uuid, +- __entry->sectors_moved, __entry->sectors_not_moved, +- __entry->buckets_moved, __entry->buckets_not_moved) ++ TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->sectors_moved, __entry->sectors_not_moved, ++ __entry->buckets_moved, __entry->buckets_not_moved) + ); + + TRACE_EVENT(copygc_wait, +@@ -534,19 +524,20 @@ TRACE_EVENT(copygc_wait, + TP_ARGS(c, wait_amount, until), + + TP_STRUCT__entry( +- __array(char, uuid, 16 ) ++ __field(dev_t, dev ) + __field(u64, wait_amount ) + __field(u64, until ) + ), + + TP_fast_assign( +- memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->dev = c->dev; + __entry->wait_amount = wait_amount; + __entry->until = until; + ), + +- TP_printk("%pU waiting for %llu sectors until %llu", +- __entry->uuid, __entry->wait_amount, __entry->until) ++ TP_printk("%d,%u waiting for %llu sectors until %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->wait_amount, __entry->until) + ); + + TRACE_EVENT(trans_get_iter, +-- +cgit v1.2.3 + + +From 
f887fa15137558872f96281bde1186f5f2263da4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 27 May 2021 20:20:20 -0400 +Subject: bcachefs: Add an option to control sharding new inode numbers + +We're seeing a bug where inode creates end up spinning in +bch2_inode_create - disabling sharding will simplify what we're testing. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 1 + + fs/bcachefs/inode.c | 21 ++++++++++++++------- + fs/bcachefs/opts.h | 7 ++++++- + 3 files changed, 21 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index d640a3115adc..79c0876aab8b 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1344,6 +1344,7 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); + + LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); ++LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); + + /* + * Features: +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index c5892e42aaec..463d647b359e 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -479,16 +479,23 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, + struct bkey_s_c k; + u64 min, max, start, pos, *hint; + int ret = 0; ++ unsigned bits = (c->opts.inodes_32bit ? 31 : 63); + +- u64 cpu = raw_smp_processor_id(); +- unsigned bits = (c->opts.inodes_32bit +- ? 31 : 63) - c->inode_shard_bits; ++ if (c->opts.shard_inode_numbers) { ++ u64 cpu = raw_smp_processor_id(); + +- min = (cpu << bits); +- max = (cpu << bits) | ~(ULLONG_MAX << bits); ++ bits -= c->inode_shard_bits; + +- min = max_t(u64, min, BLOCKDEV_INODE_MAX); +- hint = c->unused_inode_hints + cpu; ++ min = (cpu << bits); ++ max = (cpu << bits) | ~(ULLONG_MAX << bits); ++ ++ min = max_t(u64, min, BLOCKDEV_INODE_MAX); ++ hint = c->unused_inode_hints + cpu; ++ } else { ++ min = BLOCKDEV_INODE_MAX; ++ max = ~(ULLONG_MAX << bits); ++ hint = c->unused_inode_hints; ++ } + + start = READ_ONCE(*hint); + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 001e865c5555..1e2fc5de5ca4 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -165,8 +165,13 @@ enum opt_type { + x(inodes_32bit, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ +- BCH_SB_INODE_32BIT, false, \ ++ BCH_SB_INODE_32BIT, true, \ + NULL, "Constrain inode numbers to 32 bits") \ ++ x(shard_inode_numbers, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_SHARD_INUMS, false, \ ++ NULL, "Shard new inode numbers by CPU id") \ + x(gc_reserve_percent, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(5, 21), \ +-- +cgit v1.2.3 + + +From 944bed7baf88d3894075d1d8341963102d70f5be Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 27 May 2021 21:16:50 -0400 +Subject: bcachefs: Reflink refcount fix + +__bch2_trans_mark_reflink_p wasn't always correctly returning the number +of sectors processed - the new logic is a bit more straightforward +overall too. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index cbd295e494bd..282bca166b16 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1731,6 +1731,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_i *n; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; ++ int frags_referenced; + s64 ret; + + ret = trans_get_key(trans, BTREE_ID_reflink, +@@ -1738,18 +1739,20 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + if (ret < 0) + return ret; + +- if (reflink_p_frag_references(p, 0, front_frag, k) && +- reflink_p_frag_references(p, back_frag, p.k->size, k)) { ++ sectors = min_t(u64, sectors, k.k->p.offset - idx); ++ ++ frags_referenced = ++ reflink_p_frag_references(p, 0, front_frag, k) + ++ reflink_p_frag_references(p, back_frag, p.k->size, k); ++ ++ if (frags_referenced == 2) { + BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); + add = -add; +- } else if (reflink_p_frag_references(p, 0, front_frag, k) || +- reflink_p_frag_references(p, back_frag, p.k->size, k)) { ++ } else if (frags_referenced == 1) { + BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); + goto out; + } + +- sectors = min_t(u64, sectors, k.k->p.offset - idx); +- + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) +@@ -1804,14 +1807,13 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, + front_frag, back_frag, flags); + if (ret < 0) +- break; ++ return ret; + +- idx += ret; +- sectors = max_t(s64, 0LL, sectors - ret); +- ret = 0; ++ idx += ret; ++ sectors -= ret; + } + +- return ret; ++ return 0; + } + + int bch2_trans_mark_key(struct btree_trans *trans, +-- +cgit v1.2.3 + + +From f7ec49a7da80e2ceb9e9ffdf60bac24336ec0809 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 27 May 2021 23:16:25 -0400 +Subject: bcachefs: Fix journal write error path + +Journal write errors were racing with the submission path - potentially +causing writes to other replicas to not get submitted. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 25 +++++++++++-------------- + fs/bcachefs/journal_types.h | 1 + + 2 files changed, 12 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index b40952248a5d..58298d3b3a58 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1233,8 +1233,6 @@ static void journal_write_done(struct closure *cl) + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *w = journal_last_unwritten_buf(j); +- struct bch_devs_list devs = +- bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + struct bch_replicas_padded replicas; + union journal_res_state old, new; + u64 v, seq; +@@ -1242,11 +1240,12 @@ static void journal_write_done(struct closure *cl) + + bch2_time_stats_update(j->write_time, j->write_start_time); + +- if (!devs.nr) { ++ if (!w->devs_written.nr) { + bch_err(c, "unable to write journal to sufficient devices"); + err = -EIO; + } else { +- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, ++ w->devs_written); + if (bch2_mark_replicas(c, &replicas.e)) + err = -EIO; + } +@@ -1258,7 +1257,7 @@ static void journal_write_done(struct closure *cl) + seq = le64_to_cpu(w->data->seq); + + if (seq >= j->pin.front) +- journal_seq_pin(j, seq)->devs = devs; ++ journal_seq_pin(j, seq)->devs = w->devs_written; + + j->seq_ondisk = seq; + if (err && (!j->err_seq || seq < j->err_seq)) +@@ -1308,15 +1307,15 @@ static void journal_write_endio(struct bio *bio) + { + struct bch_dev *ca = bio->bi_private; + struct journal *j = &ca->fs->journal; ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ unsigned long flags; + +- if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s", ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s", ++ le64_to_cpu(w->data->seq), + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("journal")) { +- struct journal_buf *w = journal_last_unwritten_buf(j); +- unsigned long flags; +- + spin_lock_irqsave(&j->err_lock, flags); +- bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); ++ bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); + spin_unlock_irqrestore(&j->err_lock, flags); + } + +@@ -1513,10 +1512,8 @@ retry_alloc: + return; + } + +- /* +- * XXX: we really should just disable the entire journal in nochanges +- * mode +- */ ++ w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); ++ + if (c->opts.nochanges) + goto no_io; + +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index cacab22a35c1..61674ae1ab5f 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -21,6 +21,7 @@ struct journal_buf { + struct jset *data; + + __BKEY_PADDED(key, BCH_REPLICAS_MAX); ++ struct bch_devs_list devs_written; + + struct closure_waitlist wait; + u64 last_seq; /* copy of data->last_seq */ +-- +cgit v1.2.3 + + +From 92481907415aca901aaefcc4a37626c4412b5e0a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 28 May 2021 05:06:18 -0400 +Subject: bcachefs: Fix pathalogical behaviour with inode sharding by cpu ID + +If the transactior restarts on a different CPU, it could end up needing +to read in a different btree node, which makes another transaction +restart more likely... 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-common.c | 3 ++- + fs/bcachefs/inode.c | 4 +--- + fs/bcachefs/inode.h | 2 +- + 3 files changed, 4 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 08c6af886df7..00a63fecb976 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -23,6 +23,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + struct btree_iter *inode_iter = NULL; + struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + u64 now = bch2_current_time(c); ++ u64 cpu = raw_smp_processor_id(); + u64 dir_offset = 0; + int ret; + +@@ -36,7 +37,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + if (!name) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + +- inode_iter = bch2_inode_create(trans, new_inode, U32_MAX); ++ inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto err; +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 463d647b359e..6b43a9716cf0 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -472,7 +472,7 @@ static inline u32 bkey_generation(struct bkey_s_c k) + + struct btree_iter *bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, +- u32 snapshot) ++ u32 snapshot, u64 cpu) + { + struct bch_fs *c = trans->c; + struct btree_iter *iter = NULL; +@@ -482,8 +482,6 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, + unsigned bits = (c->opts.inodes_32bit ? 31 : 63); + + if (c->opts.shard_inode_numbers) { +- u64 cpu = raw_smp_processor_id(); +- + bits -= c->inode_shard_bits; + + min = (cpu << bits); +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 558d5464095d..2cb081ae44d9 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -70,7 +70,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + struct bch_inode_unpacked *); + + struct btree_iter *bch2_inode_create(struct btree_trans *, +- struct bch_inode_unpacked *, u32); ++ struct bch_inode_unpacked *, u32, u64); + + int bch2_inode_rm(struct bch_fs *, u64, bool); + +-- +cgit v1.2.3 + + +From 11914e0528d1b0d212b16cd9c1510ac192cd56ab Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 27 May 2021 21:38:00 -0400 +Subject: bcachefs: Split out btree_error_wq + +We can't use btree_update_wq becuase btree updates may be waiting on +btree writes to complete. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/super.c | 4 ++++ + 3 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 32435696841d..d73302d14820 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -667,6 +667,7 @@ struct bch_fs { + struct btree_key_cache btree_key_cache; + + struct workqueue_struct *btree_update_wq; ++ struct workqueue_struct *btree_error_wq; + /* copygc needs its own workqueue for index updates.. 
*/ + struct workqueue_struct *copygc_wq; + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c23f10d25181..c2d053308c19 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1616,7 +1616,7 @@ static void btree_node_write_work(struct work_struct *work) + bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + +- queue_work(c->btree_update_wq, &c->btree_write_error_work); ++ queue_work(c->btree_error_wq, &c->btree_write_error_work); + return; + } + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index ab1a07c82584..332594b33dde 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -507,6 +507,8 @@ static void __bch2_fs_free(struct bch_fs *c) + destroy_workqueue(c->io_complete_wq ); + if (c->copygc_wq) + destroy_workqueue(c->copygc_wq); ++ if (c->btree_error_wq) ++ destroy_workqueue(c->btree_error_wq); + if (c->btree_update_wq) + destroy_workqueue(c->btree_update_wq); + +@@ -758,6 +760,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->btree_error_wq = alloc_workqueue("bcachefs_error", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->io_complete_wq = alloc_workqueue("bcachefs_io", +-- +cgit v1.2.3 + + +From d0ac8ea1ef8946d4ce246cc5d46f4ca263975971 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 26 May 2021 01:03:48 -0400 +Subject: bcachefs: Fix a deadlock + +Waiting on a btree node write with btree locks held can deadlock, if the +write errors: the write error path has to do do a btree update to drop +the pointer to the replica that errored. + +The interior update path has to wait on in flight btree writes before +freeing nodes on disk. Previously, this was done in +bch2_btree_interior_update_will_free_node(), and could deadlock; now, we +just stash a pointer to the node and do it in +btree_update_nodes_written(), just prior to the transactional part of +the update. +--- + fs/bcachefs/btree_io.c | 4 ++++ + fs/bcachefs/btree_update_interior.c | 26 +++++++++++++++++++------- + fs/bcachefs/btree_update_interior.h | 4 ++++ + 3 files changed, 27 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c2d053308c19..721deee6bcc4 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1725,6 +1725,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + return; + + if (old & (1 << BTREE_NODE_write_in_flight)) { ++ /* ++ * XXX waiting on btree writes with btree locks held - ++ * this can deadlock, and we hit the write error path ++ */ + btree_node_wait_on_io(b); + continue; + } +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index c55df177d7f2..95e6d21dac2a 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -550,6 +550,22 @@ static void btree_update_nodes_written(struct btree_update *as) + + BUG_ON(!journal_pin_active(&as->journal)); + ++ /* ++ * Wait for any in flight writes to finish before we free the old nodes ++ * on disk: ++ */ ++ for (i = 0; i < as->nr_old_nodes; i++) { ++ struct btree *old = as->old_nodes[i]; ++ __le64 seq; ++ ++ six_lock_read(&old->c.lock, NULL, NULL); ++ seq = old->data ? 
old->data->keys.seq : 0; ++ six_unlock_read(&old->c.lock); ++ ++ if (seq == as->old_nodes_seq[i]) ++ btree_node_wait_on_io(old); ++ } ++ + /* + * We did an update to a parent node where the pointers we added pointed + * to child nodes that weren't written yet: now, the child nodes have +@@ -889,13 +905,9 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + + btree_update_will_delete_key(as, &b->key); + +- /* +- * XXX: Waiting on io with btree node locks held, we don't want to be +- * doing this. We can't have btree writes happening after the space has +- * been freed, but we really only need to block before +- * btree_update_nodes_written_trans() happens. +- */ +- btree_node_wait_on_io(b); ++ as->old_nodes[as->nr_old_nodes] = b; ++ as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; ++ as->nr_old_nodes++; + } + + void bch2_btree_update_done(struct btree_update *as) +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 7eef3dbb6ef1..7ed67b47e1b9 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -92,6 +92,10 @@ struct btree_update { + struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; + unsigned nr_new_nodes; + ++ struct btree *old_nodes[BTREE_UPDATE_NODES_MAX]; ++ __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_old_nodes; ++ + open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * + BCH_REPLICAS_MAX]; + open_bucket_idx_t nr_open_buckets; +-- +cgit v1.2.3 + + +From c923c9934d3397454837000a3408c5be9dcf18ed Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 23 May 2021 17:04:13 -0400 +Subject: bcachefs: Assorted endianness fixes + +Found by sparse + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 2 ++ + fs/bcachefs/journal_io.c | 5 +++-- + fs/bcachefs/journal_seq_blacklist.c | 6 ++---- + fs/bcachefs/move.c | 4 ++-- + fs/bcachefs/recovery.c | 18 +++++++++--------- + fs/bcachefs/super-io.c | 14 +++++++------- + fs/bcachefs/super.c | 2 +- + fs/bcachefs/sysfs.c | 2 +- + 8 files changed, 27 insertions(+), 26 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ac6449e07522..bf8cd7542fd4 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2268,6 +2268,7 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + unsigned expected_nr_iters, + size_t expected_mem_bytes) ++ __acquires(&c->btree_trans_barrier) + { + memset(trans, 0, sizeof(*trans)); + trans->c = c; +@@ -2300,6 +2301,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + } + + int bch2_trans_exit(struct btree_trans *trans) ++ __releases(&c->btree_trans_barrier) + { + struct bch_fs *c = trans->c; + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 58298d3b3a58..2da6839fcdc0 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -834,7 +834,7 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + unsigned i; + + for (i = 0; i < j->nr_ptrs; i++) { +- struct bch_dev *ca = c->devs[j->ptrs[i].dev]; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); + u64 offset; + + div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); +@@ -1401,7 +1401,8 @@ void bch2_journal_write(struct closure *cl) + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); +- jset->last_seq = w->last_seq = 0; ++ 
jset->last_seq = 0; ++ w->last_seq = 0; + + j->nr_noflush_writes++; + } else { +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index e1b63f3879f4..f2060f903cbc 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -111,8 +111,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) + bl->start[nr].start = cpu_to_le64(start); + bl->start[nr].end = cpu_to_le64(end); + out_write_sb: +- c->disk_sb.sb->features[0] |= +- 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; ++ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); + + ret = bch2_write_super(c); + out: +@@ -298,8 +297,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) + BUG_ON(new_nr && !bl); + + if (!new_nr) +- c->disk_sb.sb->features[0] &= +- ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); + + bch2_write_super(c); + } +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 2d5c4e9bbf42..2fa763e35392 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -926,8 +926,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) + rewrite_old_nodes_pred, c, stats); + if (!ret) { + mutex_lock(&c->sb_lock); +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + c->disk_sb.sb->version_min = c->disk_sb.sb->version; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index cd538ecc1f3f..9bd6348842e0 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -716,7 +716,7 @@ static int journal_replay_entry_early(struct bch_fs *c, + case BCH_JSET_ENTRY_dev_usage: { + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); +- struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / + sizeof(struct jset_entry_dev_usage_type); +@@ -755,7 +755,7 @@ static int journal_replay_entry_early(struct bch_fs *c, + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + +- atomic64_set(&c->io_clock[clock->rw].now, clock->time); ++ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); + } + } + +@@ -1217,13 +1217,13 @@ use_clean: + + mutex_lock(&c->sb_lock); + if (c->opts.version_upgrade) { +- c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); +- c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + write_sb = true; + } + + if (!test_bit(BCH_FS_ERROR, &c->flags)) { +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); + write_sb = true; + } + +@@ -1278,12 +1278,12 @@ int bch2_fs_initialize(struct bch_fs *c) + bch_notice(c, "initializing new filesystem"); + + mutex_lock(&c->sb_lock); +- 
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + + if (c->opts.version_upgrade) { +- c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); +- c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + bch2_write_super(c); + } + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 74a75ced031e..977885166d55 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -982,7 +982,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); +- c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS; ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +@@ -999,7 +999,7 @@ static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ +- entry->u64s = u64s - 1; ++ entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +@@ -1092,7 +1092,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; +- clock->time = atomic64_read(&c->io_clock[i].now); ++ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); + } + } + +@@ -1109,10 +1109,10 @@ void bch2_fs_mark_clean(struct bch_fs *c) + + SET_BCH_SB_CLEAN(c->disk_sb.sb, true); + +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; +- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata; +- c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); +- c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); + + u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 332594b33dde..53f18c480234 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1439,7 +1439,7 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + + /* Device add/removal: */ + +-int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) ++static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) + { + struct btree_trans trans; + size_t i; +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 21ef7719cf55..84a7acb04d01 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -312,7 +312,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c + return 0; + } + +-void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) ++static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) + { + pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); + bch2_bpos_to_text(out, c->gc_gens_pos); +-- +cgit 
v1.2.3 + + +From fa5ceadb3fc70f96ee309d45bdbee5ea343a0a73 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 23 May 2021 02:31:33 -0400 +Subject: bcachefs: Fsck for reflink refcounts + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 11 +++ + fs/bcachefs/btree_gc.c | 202 ++++++++++++++++++++++++++++++++++++++++++++++++- + fs/bcachefs/buckets.c | 152 ++++++++++++++++++++++++++++++------- + fs/bcachefs/reflink.c | 2 +- + fs/bcachefs/reflink.h | 24 ++++++ + 5 files changed, 359 insertions(+), 32 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index d73302d14820..38963900e41a 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -391,6 +391,14 @@ struct gc_pos { + unsigned level; + }; + ++struct reflink_gc { ++ u64 offset; ++ u32 size; ++ u32 refcount; ++}; ++ ++typedef GENRADIX(struct reflink_gc) reflink_gc_table; ++ + struct io_count { + u64 sectors[2][BCH_DATA_NR]; + }; +@@ -806,6 +814,9 @@ struct bch_fs { + + /* REFLINK */ + u64 reflink_hint; ++ reflink_gc_table reflink_gc_table; ++ size_t reflink_gc_nr; ++ size_t reflink_gc_idx; + + /* VFS IO PATH - fs-io.c */ + struct bio_set writepage_bioset; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 88d1cd0a8f95..dea699935025 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -23,6 +23,7 @@ + #include "keylist.h" + #include "move.h" + #include "recovery.h" ++#include "reflink.h" + #include "replicas.h" + #include "super-io.h" + +@@ -1285,6 +1286,201 @@ static int bch2_gc_start(struct bch_fs *c, + return 0; + } + ++static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct reflink_gc *r; ++ const __le64 *refcount = bkey_refcount_c(k); ++ char buf[200]; ++ int ret = 0; ++ ++ if (!refcount) ++ return 0; ++ ++ r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++); ++ if (!r) ++ return -ENOMEM; ++ ++ if (!r || ++ r->offset != k.k->p.offset || ++ r->size != k.k->size) { ++ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); ++ return -EINVAL; ++ } ++ ++ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, ++ "reflink key has wrong refcount:\n" ++ " %s\n" ++ " should be %u", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ r->refcount)) { ++ struct bkey_i *new; ++ ++ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); ++ if (!new) { ++ ret = -ENOMEM; ++ goto fsck_err; ++ } ++ ++ bkey_reassemble(new, k); ++ ++ if (!r->refcount) { ++ new->k.type = KEY_TYPE_deleted; ++ new->k.size = 0; ++ } else { ++ *bkey_refcount(new) = cpu_to_le64(r->refcount); ++ } ++ ++ ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); ++ if (ret) ++ kfree(new); ++ } ++fsck_err: ++ return ret; ++} ++ ++static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, ++ bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct reflink_gc *r; ++ size_t idx = 0; ++ char buf[200]; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ if (initial) { ++ c->reflink_gc_idx = 0; ++ ++ ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink, ++ bch2_gc_reflink_done_initial_fn); ++ goto out; ++ } ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ const __le64 *refcount = bkey_refcount_c(k); ++ ++ if (!refcount) ++ continue; ++ ++ r = genradix_ptr(&c->reflink_gc_table, idx); ++ if (!r || ++ r->offset != k.k->p.offset || ++ r->size != k.k->size) { ++ bch_err(c, 
"unexpected inconsistency walking reflink table at gc finish"); ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, ++ "reflink key has wrong refcount:\n" ++ " %s\n" ++ " should be %u", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ r->refcount)) { ++ struct bkey_i *new; ++ ++ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); ++ if (!new) { ++ ret = -ENOMEM; ++ break; ++ } ++ ++ bkey_reassemble(new, k); ++ ++ if (!r->refcount) ++ new->k.type = KEY_TYPE_deleted; ++ else ++ *bkey_refcount(new) = cpu_to_le64(r->refcount); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); ++ kfree(new); ++ ++ if (ret) ++ break; ++ } ++ } ++fsck_err: ++ bch2_trans_iter_put(&trans, iter); ++ bch2_trans_exit(&trans); ++out: ++ genradix_free(&c->reflink_gc_table); ++ c->reflink_gc_nr = 0; ++ return ret; ++} ++ ++static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k) ++{ ++ ++ struct reflink_gc *r; ++ const __le64 *refcount = bkey_refcount_c(k); ++ ++ if (!refcount) ++ return 0; ++ ++ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, ++ GFP_KERNEL); ++ if (!r) ++ return -ENOMEM; ++ ++ r->offset = k.k->p.offset; ++ r->size = k.k->size; ++ r->refcount = 0; ++ return 0; ++} ++ ++static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, ++ bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct reflink_gc *r; ++ int ret; ++ ++ if (metadata_only) ++ return 0; ++ ++ genradix_free(&c->reflink_gc_table); ++ c->reflink_gc_nr = 0; ++ ++ if (initial) ++ return bch2_btree_and_journal_walk(c, BTREE_ID_reflink, ++ bch2_gc_reflink_start_initial_fn); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ const __le64 *refcount = bkey_refcount_c(k); ++ ++ if (!refcount) ++ continue; ++ ++ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, ++ GFP_KERNEL); ++ if (!r) { ++ ret = -ENOMEM; ++ break; ++ } ++ ++ r->offset = k.k->p.offset; ++ r->size = k.k->size; ++ r->refcount = 0; ++ } ++ bch2_trans_iter_put(&trans, iter); ++ ++ bch2_trans_exit(&trans); ++ return 0; ++} ++ + /** + * bch2_gc - walk _all_ references to buckets, and recompute them: + * +@@ -1319,7 +1515,8 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + again: +- ret = bch2_gc_start(c, metadata_only); ++ ret = bch2_gc_start(c, metadata_only) ?: ++ bch2_gc_reflink_start(c, initial, metadata_only); + if (ret) + goto out; + +@@ -1381,7 +1578,8 @@ out: + bch2_journal_block(&c->journal); + + percpu_down_write(&c->mark_lock); +- ret = bch2_gc_done(c, initial, metadata_only); ++ ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: ++ bch2_gc_done(c, initial, metadata_only); + + bch2_journal_unblock(&c->journal); + } else { +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 282bca166b16..d07085a2fd1b 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -14,6 +14,7 @@ + #include "ec.h" + #include "error.h" + #include "movinggc.h" ++#include "reflink.h" + #include "replicas.h" + + #include +@@ -1072,6 +1073,124 @@ static int bch2_mark_stripe(struct bch_fs *c, + return 0; + } + ++static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p, ++ u64 p_start, u64 p_end, ++ u64 v_start, u64 v_end) ++{ ++ if (p_start == p_end) 
++ return false; ++ ++ p_start += le64_to_cpu(p.v->idx); ++ p_end += le64_to_cpu(p.v->idx); ++ ++ if (p_end <= v_start) ++ return false; ++ if (p_start >= v_end) ++ return false; ++ return true; ++} ++ ++static int reflink_p_frag_references(struct bkey_s_c_reflink_p p, ++ u64 start, u64 end, ++ struct bkey_s_c k) ++{ ++ return __reflink_p_frag_references(p, start, end, ++ bkey_start_offset(k.k), ++ k.k->p.offset); ++} ++ ++static int __bch2_mark_reflink_p(struct bch_fs *c, ++ struct bkey_s_c_reflink_p p, ++ u64 idx, unsigned sectors, ++ unsigned front_frag, ++ unsigned back_frag, ++ unsigned flags, ++ size_t *r_idx) ++{ ++ struct reflink_gc *r; ++ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; ++ int frags_referenced; ++ ++ while (1) { ++ if (*r_idx >= c->reflink_gc_nr) ++ goto not_found; ++ r = genradix_ptr(&c->reflink_gc_table, *r_idx); ++ BUG_ON(!r); ++ ++ if (r->offset > idx) ++ break; ++ (*r_idx)++; ++ } ++ ++ frags_referenced = ++ __reflink_p_frag_references(p, 0, front_frag, ++ r->offset - r->size, r->offset) + ++ __reflink_p_frag_references(p, back_frag, p.k->size, ++ r->offset - r->size, r->offset); ++ ++ if (frags_referenced == 2) { ++ BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); ++ add = -add; ++ } else if (frags_referenced == 1) { ++ BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); ++ add = 0; ++ } ++ ++ BUG_ON((s64) r->refcount + add < 0); ++ ++ r->refcount += add; ++ return min_t(u64, sectors, r->offset - idx); ++not_found: ++ bch2_fs_inconsistent(c, ++ "%llu:%llu len %u points to nonexistent indirect extent %llu", ++ p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ bch2_inconsistent_error(c); ++ return -EIO; ++} ++ ++static int bch2_mark_reflink_p(struct bch_fs *c, ++ struct bkey_s_c_reflink_p p, unsigned offset, ++ s64 sectors, unsigned flags) ++{ ++ u64 idx = le64_to_cpu(p.v->idx) + offset; ++ struct reflink_gc *ref; ++ size_t l, r, m; ++ unsigned front_frag, back_frag; ++ s64 ret = 0; ++ ++ if (sectors < 0) ++ sectors = -sectors; ++ ++ BUG_ON(offset + sectors > p.k->size); ++ ++ front_frag = offset; ++ back_frag = offset + sectors; ++ ++ l = 0; ++ r = c->reflink_gc_nr; ++ while (l < r) { ++ m = l + (r - l) / 2; ++ ++ ref = genradix_ptr(&c->reflink_gc_table, m); ++ if (ref->offset <= idx) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ while (sectors) { ++ ret = __bch2_mark_reflink_p(c, p, idx, sectors, ++ front_frag, back_frag, flags, &l); ++ if (ret < 0) ++ return ret; ++ ++ idx += ret; ++ sectors -= ret; ++ } ++ ++ return 0; ++} ++ + static int bch2_mark_key_locked(struct bch_fs *c, + struct bkey_s_c old, + struct bkey_s_c new, +@@ -1127,6 +1246,10 @@ static int bch2_mark_key_locked(struct bch_fs *c, + fs_usage->persistent_reserved[replicas - 1] += sectors; + break; + } ++ case KEY_TYPE_reflink_p: ++ ret = bch2_mark_reflink_p(c, bkey_s_c_to_reflink_p(k), ++ offset, sectors, flags); ++ break; + } + + preempt_enable(); +@@ -1689,35 +1812,6 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + return ret; + } + +-static __le64 *bkey_refcount(struct bkey_i *k) +-{ +- switch (k->k.type) { +- case KEY_TYPE_reflink_v: +- return &bkey_i_to_reflink_v(k)->v.refcount; +- case KEY_TYPE_indirect_inline_data: +- return &bkey_i_to_indirect_inline_data(k)->v.refcount; +- default: +- return NULL; +- } +-} +- +-static bool reflink_p_frag_references(struct bkey_s_c_reflink_p p, +- u64 start, u64 end, +- struct bkey_s_c k) +-{ +- if (start == end) +- return false; +- +- start += le64_to_cpu(p.v->idx); +- end += le64_to_cpu(p.v->idx); +- +- if (end <= bkey_start_offset(k.k)) 
+- return false; +- if (start >= k.k->p.offset) +- return false; +- return true; +-} +- + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index c624fabe1e1c..e986b5284d37 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -151,7 +151,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + + set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); + +- refcount = (void *) &r_v->v; ++ refcount = bkey_refcount(r_v); + *refcount = 0; + memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); + +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index 9d5e7dc58f2b..bfc785619ee8 100644 +--- a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -34,6 +34,30 @@ void bch2_indirect_inline_data_to_text(struct printbuf *, + .val_to_text = bch2_indirect_inline_data_to_text, \ + } + ++static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_reflink_v: ++ return &bkey_s_c_to_reflink_v(k).v->refcount; ++ case KEY_TYPE_indirect_inline_data: ++ return &bkey_s_c_to_indirect_inline_data(k).v->refcount; ++ default: ++ return NULL; ++ } ++} ++ ++static inline __le64 *bkey_refcount(struct bkey_i *k) ++{ ++ switch (k->k.type) { ++ case KEY_TYPE_reflink_v: ++ return &bkey_i_to_reflink_v(k)->v.refcount; ++ case KEY_TYPE_indirect_inline_data: ++ return &bkey_i_to_indirect_inline_data(k)->v.refcount; ++ default: ++ return NULL; ++ } ++} ++ + s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, + u64, u64 *, u64, s64 *); + +-- +cgit v1.2.3 + + +From beb434b34d0e10bdae836d4871d60f23ceed92fc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Mar 2021 22:14:10 -0400 +Subject: bcachefs: Don't fragment extents when making them indirect + +This fixes a "disk usage increased without a reservation" bug, when +reflinking compressed extents. Also, there's no good reason for reflink +to be fragmenting extents anyways. 
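+
+As an illustration only (a sketch assembled from the hunks below, not an
+extra change), the remap loop now advances the source iterator in
+lockstep with destination progress and makes the whole source extent
+indirect, rather than cutting it down to the overlap first:
+
+    /* advance the source in lockstep with destination progress: */
+    dst_done = dst_iter->pos.offset - dst_start.offset;
+    src_want = POS(src_start.inode, src_start.offset + dst_done);
+    bch2_btree_iter_set_pos(src_iter, src_want);
+
+    /* make the whole extent indirect; no more bch2_cut_front()/
+     * bch2_cut_back() on the source key: */
+    bch2_btree_iter_set_pos(src_iter, bkey_start_pos(src_k.k));
+    ret = bch2_make_extent_indirect(&trans, src_iter, new_src.k);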
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/reflink.c | 52 ++++++++++++++++++++++----------------------------- + 1 file changed, 22 insertions(+), 30 deletions(-) + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index e986b5284d37..a420729288d4 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -181,18 +181,19 @@ err: + + static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) + { +- struct bkey_s_c k = bch2_btree_iter_peek(iter); ++ struct bkey_s_c k; + int ret; + + for_each_btree_key_continue(iter, 0, k, ret) { + if (bkey_cmp(iter->pos, end) >= 0) +- return bkey_s_c_null; ++ break; + + if (bkey_extent_is_data(k.k)) +- break; ++ return k; + } + +- return k; ++ bch2_btree_iter_set_pos(iter, end); ++ return bkey_s_c_null; + } + + s64 bch2_remap_range(struct bch_fs *c, +@@ -205,8 +206,8 @@ s64 bch2_remap_range(struct bch_fs *c, + struct bkey_s_c src_k; + struct bkey_buf new_dst, new_src; + struct bpos dst_end = dst_start, src_end = src_start; +- struct bpos dst_want, src_want; +- u64 src_done, dst_done; ++ struct bpos src_want; ++ u64 dst_done; + int ret = 0, ret2 = 0; + + if (!percpu_ref_tryget(&c->writes)) +@@ -226,7 +227,8 @@ s64 bch2_remap_range(struct bch_fs *c, + dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, + BTREE_ITER_INTENT); + +- while (ret == 0 || ret == -EINTR) { ++ while ((ret == 0 || ret == -EINTR) && ++ bkey_cmp(dst_iter->pos, dst_end) < 0) { + struct disk_reservation disk_res = { 0 }; + + bch2_trans_begin(&trans); +@@ -236,32 +238,29 @@ s64 bch2_remap_range(struct bch_fs *c, + break; + } + ++ dst_done = dst_iter->pos.offset - dst_start.offset; ++ src_want = POS(src_start.inode, src_start.offset + dst_done); ++ bch2_btree_iter_set_pos(src_iter, src_want); ++ + src_k = get_next_src(src_iter, src_end); + ret = bkey_err(src_k); + if (ret) + continue; + +- src_done = bpos_min(src_iter->pos, src_end).offset - +- src_start.offset; +- dst_want = POS(dst_start.inode, dst_start.offset + src_done); +- +- if (bkey_cmp(dst_iter->pos, dst_want) < 0) { +- ret = bch2_fpunch_at(&trans, dst_iter, dst_want, +- journal_seq, i_sectors_delta); ++ if (bkey_cmp(src_want, src_iter->pos) < 0) { ++ ret = bch2_fpunch_at(&trans, dst_iter, ++ bpos_min(dst_end, ++ POS(dst_iter->pos.inode, dst_iter->pos.offset + ++ src_iter->pos.offset - src_want.offset)), ++ journal_seq, i_sectors_delta); + continue; + } + +- BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); +- +- if (!bkey_cmp(dst_iter->pos, dst_end)) +- break; +- + if (src_k.k->type != KEY_TYPE_reflink_p) { + bch2_bkey_buf_reassemble(&new_src, c, src_k); + src_k = bkey_i_to_s_c(new_src.k); + +- bch2_cut_front(src_iter->pos, new_src.k); +- bch2_cut_back(src_end, new_src.k); ++ bch2_btree_iter_set_pos(src_iter, bkey_start_pos(src_k.k)); + + ret = bch2_make_extent_indirect(&trans, src_iter, + new_src.k); +@@ -278,7 +277,7 @@ s64 bch2_remap_range(struct bch_fs *c, + bkey_reflink_p_init(new_dst.k); + + u64 offset = le64_to_cpu(src_p.v->idx) + +- (src_iter->pos.offset - ++ (src_want.offset - + bkey_start_offset(src_k.k)); + + dst_p->v.idx = cpu_to_le64(offset); +@@ -288,20 +287,13 @@ s64 bch2_remap_range(struct bch_fs *c, + + new_dst.k->k.p = dst_iter->pos; + bch2_key_resize(&new_dst.k->k, +- min(src_k.k->p.offset - src_iter->pos.offset, ++ min(src_k.k->p.offset - src_want.offset, + dst_end.offset - dst_iter->pos.offset)); +- + ret = bch2_extent_update(&trans, dst_iter, new_dst.k, + &disk_res, journal_seq, + new_i_size, i_sectors_delta, + true); + bch2_disk_reservation_put(c, 
&disk_res); +- if (ret) +- continue; +- +- dst_done = dst_iter->pos.offset - dst_start.offset; +- src_want = POS(src_start.inode, src_start.offset + dst_done); +- bch2_btree_iter_set_pos(src_iter, src_want); + } + bch2_trans_iter_put(&trans, dst_iter); + bch2_trans_iter_put(&trans, src_iter); +-- +cgit v1.2.3 + + +From 26068e09fd7ca542bb297642d25d606bbeeabf69 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 31 May 2021 00:13:39 -0400 +Subject: bcachefs: Journal space calculation fix + +When devices have different bucket sizes, we may accumulate a journal +write that doesn't fit on some of our devices - previously, we'd +underflow when calculating space on that device and then everything +would get weird. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 427be2da1dfc..7a0ae5d3431c 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -93,6 +93,10 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, + * until we write it out - thus, account for it here: + */ + while ((unwritten = get_unwritten_sectors(j, &idx))) { ++ /* entry won't fit on this device, skip: */ ++ if (unwritten > ca->mi.bucket_size) ++ continue; ++ + if (unwritten >= sectors) { + if (!buckets) { + sectors = 0; +-- +cgit v1.2.3 + + +From 5ac9f0695021b09a37a71202b952576a20f84287 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 31 May 2021 20:52:39 -0400 +Subject: bcachefs; Check for allocator thread shutdown + +We were missing a kthread_should_stop() check in the loop in +bch2_invalidate_buckets(), very occasionally leading to us getting stuck +while shutting down. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 4fa052a8f06e..408173f12479 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -836,6 +836,11 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) + while (!ret && + !fifo_full(&ca->free_inc) && + ca->alloc_heap.used) { ++ if (kthread_should_stop()) { ++ ret = 1; ++ break; ++ } ++ + ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, + (!fifo_empty(&ca->free_inc) + ? BTREE_INSERT_NOWAIT : 0)); +-- +cgit v1.2.3 + + +From 41db1c0e99cc1f3800bb00d28b0025c36d0226b0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 2 Jun 2021 00:15:07 -0400 +Subject: bcachefs: Check for errors from bch2_trans_update() + +Upcoming refactoring is going to change bch2_trans_update() to start +returning transaction restarts. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 2 +- + fs/bcachefs/alloc_background.c | 9 +++++---- + fs/bcachefs/btree_gc.c | 5 +++-- + fs/bcachefs/btree_update_leaf.c | 10 +++++----- + fs/bcachefs/buckets.c | 4 +++- + fs/bcachefs/ec.c | 15 ++++++--------- + fs/bcachefs/fsck.c | 17 ++++++++--------- + fs/bcachefs/inode.c | 8 +++----- + fs/bcachefs/io.c | 12 ++++++++---- + fs/bcachefs/migrate.c | 5 ++--- + fs/bcachefs/move.c | 5 ++--- + fs/bcachefs/str_hash.h | 14 +++++++------- + fs/bcachefs/tests.c | 2 +- + 13 files changed, 54 insertions(+), 54 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index e7f69cab5a6a..5c365f527dbd 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -387,7 +387,7 @@ int bch2_acl_chmod(struct btree_trans *trans, + } + + new->k.p = iter->pos; +- bch2_trans_update(trans, iter, &new->k_i, 0); ++ ret = bch2_trans_update(trans, iter, &new->k_i, 0); + *new_acl = acl; + acl = NULL; + err: +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 408173f12479..07823a168b4f 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -340,9 +340,9 @@ retry: + return 0; + + bch2_alloc_pack(c, &a, new_u); +- bch2_trans_update(trans, iter, &a.k, +- BTREE_TRIGGER_NORUN); +- ret = bch2_trans_commit(trans, NULL, NULL, ++ ret = bch2_trans_update(trans, iter, &a.k, ++ BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags); + err: + if (ret == -EINTR) +@@ -726,7 +726,8 @@ static int bucket_invalidate_btree(struct btree_trans *trans, + u.write_time = atomic64_read(&c->io_clock[WRITE].now); + + bch2_alloc_pack(c, a, u); +- bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE); ++ ret = bch2_trans_update(trans, iter, &a->k, ++ BTREE_TRIGGER_BUCKET_INVALIDATE); + err: + bch2_trans_iter_put(trans, iter); + return ret; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index dea699935025..b785e6636c0f 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1668,9 +1668,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + bch2_bkey_buf_reassemble(&sk, c, k); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + +- bch2_trans_update(&trans, iter, sk.k, 0); + +- commit_err = bch2_trans_commit(&trans, NULL, NULL, ++ commit_err = ++ bch2_trans_update(&trans, iter, sk.k, 0) ?: ++ bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_NOFAIL); + if (commit_err == -EINTR) { +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 70d2186e509f..cc7acce171a2 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1198,9 +1198,9 @@ int bch2_btree_delete_at(struct btree_trans *trans, + bkey_init(&k.k); + k.k.p = iter->pos; + +- bch2_trans_update(trans, iter, &k, 0); +- return bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL|flags); ++ return bch2_trans_update(trans, iter, &k, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|flags); + } + + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, +@@ -1251,8 +1251,8 @@ retry: + break; + } + +- bch2_trans_update(trans, iter, &delete, 0); +- ret = bch2_trans_commit(trans, NULL, journal_seq, ++ ret = bch2_trans_update(trans, iter, &delete, 0) ?: ++ bch2_trans_commit(trans, NULL, journal_seq, + BTREE_INSERT_NOFAIL); + if (ret) + break; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 
d07085a2fd1b..2b5e1d5c6a29 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1873,7 +1873,9 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + } + + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); +- bch2_trans_update(trans, iter, n, 0); ++ ret = bch2_trans_update(trans, iter, n, 0); ++ if (ret) ++ goto err; + out: + ret = sectors; + err: +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 5a87d41ff279..48f9232e61eb 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -741,9 +741,8 @@ found_slot: + + stripe->k.p = iter->pos; + +- bch2_trans_update(&trans, iter, &stripe->k_i, 0); +- +- ret = bch2_trans_commit(&trans, res, NULL, ++ ret = bch2_trans_update(&trans, iter, &stripe->k_i, 0) ?: ++ bch2_trans_commit(&trans, res, NULL, + BTREE_INSERT_NOFAIL); + err: + bch2_trans_iter_put(&trans, iter); +@@ -791,7 +790,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, + stripe_blockcount_set(&new->v, i, + stripe_blockcount_get(existing, i)); + +- bch2_trans_update(trans, iter, &new->k_i, 0); ++ ret = bch2_trans_update(trans, iter, &new->k_i, 0); + err: + bch2_trans_iter_put(trans, iter); + return ret; +@@ -864,9 +863,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + extent_stripe_ptr_add(e, s, ec_ptr, block); + + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); +- bch2_trans_update(&trans, iter, sk.k, 0); +- +- ret = bch2_trans_commit(&trans, NULL, NULL, ++ ret = bch2_trans_update(&trans, iter, sk.k, 0) ?: ++ bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + ret = 0; +@@ -1588,8 +1586,7 @@ write: + stripe_blockcount_set(&new_key->v, i, + m->block_sectors[i]); + +- bch2_trans_update(trans, iter, &new_key->k_i, 0); +- return 0; ++ return bch2_trans_update(trans, iter, &new_key->k_i, 0); + } + + int bch2_stripes_write(struct bch_fs *c, unsigned flags) +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index a40459d2b0f0..89a130d9c537 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -305,9 +305,8 @@ static int hash_redo_key(struct btree_trans *trans, + + bkey_init(&delete->k); + delete->k.p = k_iter->pos; +- bch2_trans_update(trans, k_iter, delete, 0); +- +- return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); ++ return bch2_trans_update(trans, k_iter, delete, 0) ?: ++ bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); + } + + static int fsck_hash_delete_at(struct btree_trans *trans, +@@ -563,12 +562,12 @@ static int fix_overlapping_extent(struct btree_trans *trans, + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + + BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); +- bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN); ++ ret = bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); + bch2_trans_iter_put(trans, iter); +- +- return bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); ++ return ret; + } + + static int inode_backpointer_exists(struct btree_trans *trans, +@@ -887,7 +886,7 @@ retry: + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); ++ bch2_trans_update(&trans, iter, &n->k_i, 0)); + kfree(n); + if (ret) + goto err; +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 6b43a9716cf0..f77b57490341 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -333,8 +333,7 @@ int 
bch2_inode_write(struct btree_trans *trans, + + bch2_inode_pack(trans->c, inode_p, inode); + inode_p->inode.k.p.snapshot = iter->snapshot; +- bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); +- return 0; ++ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + } + + const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) +@@ -629,9 +628,8 @@ retry: + delete.k.p = iter->pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + +- bch2_trans_update(&trans, iter, &delete.k_i, 0); +- +- ret = bch2_trans_commit(&trans, NULL, NULL, ++ ret = bch2_trans_update(&trans, iter, &delete.k_i, 0) ?: ++ bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + err: + bch2_trans_iter_put(&trans, iter); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index c928bc8aa03c..0fe671646df0 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -301,8 +301,9 @@ int bch2_extent_update(struct btree_trans *trans, + + inode_iter = bch2_inode_peek(trans, &inode_u, + k->k.p.inode, BTREE_ITER_INTENT); +- if (IS_ERR(inode_iter)) +- return PTR_ERR(inode_iter); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ return ret; + + /* + * XXX: +@@ -329,11 +330,14 @@ int bch2_extent_update(struct btree_trans *trans, + + inode_p.inode.k.p.snapshot = iter->snapshot; + +- bch2_trans_update(trans, inode_iter, ++ ret = bch2_trans_update(trans, inode_iter, + &inode_p.inode.k_i, 0); + } + + bch2_trans_iter_put(trans, inode_iter); ++ ++ if (ret) ++ return ret; + } + + ret = bch2_trans_update(trans, iter, k, 0) ?: +@@ -1782,7 +1786,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + +- bch2_trans_update(trans, iter, new, 0); ++ ret = bch2_trans_update(trans, iter, new, 0); + out: + bch2_trans_iter_put(trans, iter); + return ret; +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index ef69a19f494a..6ebe49ba2248 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -73,9 +73,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + +- bch2_trans_update(&trans, iter, sk.k, 0); +- +- ret = bch2_trans_commit(&trans, NULL, NULL, ++ ret = bch2_trans_update(&trans, iter, sk.k, 0) ?: ++ bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + + /* +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 2fa763e35392..91be50812a38 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -163,9 +163,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + goto out; + } + +- bch2_trans_update(&trans, iter, insert, 0); +- +- ret = bch2_trans_commit(&trans, &op->res, ++ ret = bch2_trans_update(&trans, iter, insert, 0) ?: ++ bch2_trans_commit(&trans, &op->res, + op_journal_seq(op), + BTREE_INSERT_NOFAIL| + m->data_opts.btree_insert_flags); +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index eab669af7032..2ff8e5bd2744 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -281,7 +281,7 @@ not_found: + swap(iter, slot); + + insert->k.p = iter->pos; +- bch2_trans_update(trans, iter, insert, 0); ++ ret = bch2_trans_update(trans, iter, insert, 0); + } + + goto out; +@@ -296,20 +296,20 @@ int bch2_hash_delete_at(struct btree_trans *trans, + struct bkey_i *delete; + int ret; + ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ ret = PTR_ERR_OR_ZERO(delete); ++ if (ret) ++ return ret; ++ + ret = bch2_hash_needs_whiteout(trans, desc, info, 
iter); + if (ret < 0) + return ret; + +- delete = bch2_trans_kmalloc(trans, sizeof(*delete)); +- if (IS_ERR(delete)) +- return PTR_ERR(delete); +- + bkey_init(&delete->k); + delete->k.p = iter->pos; + delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; + +- bch2_trans_update(trans, iter, delete, 0); +- return 0; ++ return bch2_trans_update(trans, iter, delete, 0); + } + + static __always_inline +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 254e3b314204..63f4a83ad1de 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -621,7 +621,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) + bkey_init(&delete.k); + delete.k.p = k.k->p; + +- bch2_trans_update(trans, iter, &delete, 0); ++ ret = bch2_trans_update(trans, iter, &delete, 0); + err: + bch2_trans_iter_put(trans, iter); + return ret; +-- +cgit v1.2.3 + + +From 04a3e7c79063319e08f88c9e10f4c47b8dec977e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 2 Jun 2021 23:31:42 -0400 +Subject: bcachefs: Preallocate transaction mem + +This helps avoid transaction restarts. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 4 ++-- + fs/bcachefs/fs.c | 2 +- + fs/bcachefs/inode.c | 2 +- + 3 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index ece28b1e9901..50e0b5af9b24 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2540,7 +2540,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + } + + bch2_bkey_buf_init(©); +- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + src = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inode->v.i_ino, src_start >> 9), + BTREE_ITER_INTENT); +@@ -2660,7 +2660,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; + int ret = 0; + +- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inode->v.i_ino, start_sector), +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 2dabf86450c7..78691b0bb3d5 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -146,7 +146,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, + struct bch_inode_unpacked inode_u; + int ret; + +- bch2_trans_init(&trans, c, 0, 256); ++ bch2_trans_init(&trans, c, 0, 512); + retry: + bch2_trans_begin(&trans); + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index f77b57490341..17d8eb5223cd 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -579,7 +579,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + struct bkey_s_c k; + int ret; + +- bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_init(&trans, c, 0, 1024); + + /* + * If this was a directory, there shouldn't be any real dirents left - +-- +cgit v1.2.3 + + +From b670f12b967faea8ac6488d719a7a3e1651bd6f6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 4 Jun 2021 15:18:10 -0400 +Subject: bcachefs: Improve btree iterator tracepoints + +This patch adds some new tracepoints to the btree iterator code, and +adds new fields to the existing tracepoints - primarily for the iterator +position. 
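+
+For example (schematic, mirroring the bch2_trans_relock() hunk below),
+restart tracepoints now record the caller and the iterator's btree and
+position alongside the transaction ip:
+
+    /* before */
+    trace_trans_restart_relock(trans->ip);
+
+    /* after */
+    trace_trans_restart_relock(trans->ip, _RET_IP_,
+                               iter->btree_id, &iter->real_pos);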
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 5 +- + fs/bcachefs/btree_iter.c | 106 +++++---- + fs/bcachefs/btree_iter.h | 1 - + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update_interior.c | 8 +- + fs/bcachefs/btree_update_leaf.c | 51 ++-- + include/trace/events/bcachefs.h | 455 +++++++++++++++++++++++++----------- + 7 files changed, 429 insertions(+), 198 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index f3ceb1e5464f..62a10a78fe8f 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -815,7 +815,10 @@ lock_node: + if (bch2_btree_node_relock(iter, level + 1)) + goto retry; + +- trace_trans_restart_btree_node_reused(iter->trans->ip); ++ trace_trans_restart_btree_node_reused(iter->trans->ip, ++ trace_ip, ++ iter->btree_id, ++ &iter->real_pos); + return ERR_PTR(-EINTR); + } + } +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index bf8cd7542fd4..712352d35f87 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -178,8 +178,8 @@ success: + return true; + } + +-static inline bool btree_iter_get_locks(struct btree_iter *iter, +- bool upgrade, bool trace) ++static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, ++ unsigned long trace_ip) + { + unsigned l = iter->level; + int fail_idx = -1; +@@ -191,16 +191,17 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, + if (!(upgrade + ? bch2_btree_node_upgrade(iter, l) + : bch2_btree_node_relock(iter, l))) { +- if (trace) +- (upgrade +- ? trace_node_upgrade_fail +- : trace_node_relock_fail)(l, iter->l[l].lock_seq, +- is_btree_node(iter, l) +- ? 0 +- : (unsigned long) iter->l[l].b, +- is_btree_node(iter, l) +- ? iter->l[l].b->c.lock.state.seq +- : 0); ++ (upgrade ++ ? trace_node_upgrade_fail ++ : trace_node_relock_fail)(iter->trans->ip, trace_ip, ++ iter->btree_id, &iter->real_pos, ++ l, iter->l[l].lock_seq, ++ is_btree_node(iter, l) ++ ? 0 ++ : (unsigned long) iter->l[l].b, ++ is_btree_node(iter, l) ++ ? 
iter->l[l].b->c.lock.state.seq ++ : 0); + + fail_idx = l; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +@@ -380,9 +381,9 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} + #endif + + __flatten +-bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) ++static bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) + { +- return btree_iter_get_locks(iter, false, trace); ++ return btree_iter_get_locks(iter, false, trace_ip); + } + + bool __bch2_btree_iter_upgrade(struct btree_iter *iter, +@@ -394,7 +395,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + + iter->locks_want = new_locks_want; + +- if (btree_iter_get_locks(iter, true, true)) ++ if (btree_iter_get_locks(iter, true, _THIS_IP_)) + return true; + + /* +@@ -422,7 +423,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + linked->btree_id == iter->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; +- btree_iter_get_locks(linked, true, false); ++ btree_iter_get_locks(linked, true, _THIS_IP_); + } + + return false; +@@ -468,8 +469,9 @@ bool bch2_trans_relock(struct btree_trans *trans) + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) +- if (!bch2_btree_iter_relock(iter, true)) { +- trace_trans_restart_relock(trans->ip); ++ if (!bch2_btree_iter_relock(iter, _RET_IP_)) { ++ trace_trans_restart_relock(trans->ip, _RET_IP_, ++ iter->btree_id, &iter->real_pos); + return false; + } + return true; +@@ -1183,7 +1185,8 @@ err: + + static int btree_iter_traverse_one(struct btree_iter *, unsigned long); + +-static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) ++static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, ++ unsigned long trace_ip) + { + struct bch_fs *c = trans->c; + struct btree_iter *iter; +@@ -1200,7 +1203,7 @@ retry_all: + relock_fail = false; + + trans_for_each_iter(trans, iter) { +- if (!bch2_btree_iter_relock(iter, true)) ++ if (!bch2_btree_iter_relock(iter, _THIS_IP_)) + relock_fail = true; + sorted[nr_sorted++] = iter->idx; + } +@@ -1277,13 +1280,13 @@ out: + + trans->in_traverse_all = false; + +- trace_trans_traverse_all(trans->ip); ++ trace_trans_traverse_all(trans->ip, trace_ip); + return ret; + } + + int bch2_btree_iter_traverse_all(struct btree_trans *trans) + { +- return __btree_iter_traverse_all(trans, 0); ++ return __btree_iter_traverse_all(trans, 0, _RET_IP_); + } + + static inline bool btree_iter_good_node(struct btree_iter *iter, +@@ -1328,6 +1331,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + unsigned long trace_ip) + { + unsigned depth_want = iter->level; ++ int ret = 0; + + /* + * if we need interior nodes locked, call btree_iter_relock() to make +@@ -1335,16 +1339,18 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + */ + if (iter->uptodate == BTREE_ITER_NEED_RELOCK || + iter->locks_want > 1) +- bch2_btree_iter_relock(iter, false); ++ bch2_btree_iter_relock(iter, _THIS_IP_); + +- if (btree_iter_type(iter) == BTREE_ITER_CACHED) +- return bch2_btree_iter_traverse_cached(iter); ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ ret = bch2_btree_iter_traverse_cached(iter); ++ goto out; ++ } + + if (iter->uptodate < BTREE_ITER_NEED_RELOCK) +- return 0; ++ goto out; + + if (unlikely(iter->level >= BTREE_MAX_DEPTH)) +- return 0; ++ goto out; + + iter->level = btree_iter_up_until_good_node(iter, 0); + +@@ -1355,12 +1361,18 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + * 
btree_iter_lock_root() comes next and that it can't fail + */ + while (iter->level > depth_want) { +- int ret = btree_iter_node(iter, iter->level) ++ ret = btree_iter_node(iter, iter->level) + ? btree_iter_down(iter, trace_ip) + : btree_iter_lock_root(iter, depth_want, trace_ip); + if (unlikely(ret)) { +- if (ret == 1) +- return 0; ++ if (ret == 1) { ++ /* ++ * Got to the end of the btree (in ++ * BTREE_ITER_NODES mode) ++ */ ++ ret = 0; ++ goto out; ++ } + + iter->level = depth_want; + +@@ -1372,14 +1384,16 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + iter->l[iter->level].b = + BTREE_ITER_NO_NODE_DOWN; + } +- return ret; ++ goto out; + } + } + + iter->uptodate = BTREE_ITER_NEED_PEEK; +- ++out: ++ trace_iter_traverse(iter->trans->ip, trace_ip, ++ iter->btree_id, &iter->real_pos, ret); + bch2_btree_iter_verify(iter); +- return 0; ++ return ret; + } + + static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) +@@ -1390,7 +1404,7 @@ static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) + ret = bch2_trans_cond_resched(trans) ?: + btree_iter_traverse_one(iter, _RET_IP_); + if (unlikely(ret)) +- ret = __btree_iter_traverse_all(trans, ret); ++ ret = __btree_iter_traverse_all(trans, ret, _RET_IP_); + + return ret; + } +@@ -1506,6 +1520,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + + static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) + { ++ struct bpos old_pos = iter->real_pos; + int cmp = bpos_cmp(new_pos, iter->real_pos); + unsigned l = iter->level; + +@@ -1516,7 +1531,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p + + if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { + btree_node_unlock(iter, 0); +- iter->l[0].b = BTREE_ITER_NO_NODE_UP; ++ iter->l[0].b = BTREE_ITER_NO_NODE_CACHED; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + return; + } +@@ -1545,6 +1560,11 @@ out: + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + + bch2_btree_iter_verify(iter); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trace_iter_set_search_pos(iter->trans->ip, _RET_IP_, ++ iter->btree_id, ++ &old_pos, &new_pos, l); ++#endif + } + + inline bool bch2_btree_iter_advance(struct btree_iter *iter) +@@ -2063,13 +2083,6 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + best = iter; + } + +- trace_trans_get_iter(_RET_IP_, trans->ip, +- btree_id, +- &real_pos, locks_want, +- best ? &best->real_pos : &pos_min, +- best ? best->locks_want : 0, +- best ? best->uptodate : BTREE_ITER_NEED_TRAVERSE); +- + if (!best) { + iter = btree_trans_iter_alloc(trans); + bch2_btree_iter_init(trans, iter, btree_id); +@@ -2098,7 +2111,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > iter->locks_want) { + iter->locks_want = locks_want; +- btree_iter_get_locks(iter, true, false); ++ btree_iter_get_locks(iter, true, _THIS_IP_); + } + + while (iter->level != depth) { +@@ -2116,6 +2129,13 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + bch2_btree_iter_set_pos(iter, pos); + btree_iter_set_search_pos(iter, real_pos); + ++ trace_trans_get_iter(_RET_IP_, trans->ip, ++ btree_id, ++ &real_pos, locks_want, iter->uptodate, ++ best ? &best->real_pos : &pos_min, ++ best ? best->locks_want : U8_MAX, ++ best ? 
best->uptodate : U8_MAX); ++ + return iter; + } + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 2f63adb9e420..01b834bf79f7 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -111,7 +111,6 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bkey_packed *, + unsigned, unsigned); + +-bool bch2_btree_iter_relock(struct btree_iter *, bool); + bool bch2_trans_relock(struct btree_trans *); + void bch2_trans_unlock(struct btree_trans *); + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 0c93547cebae..902c762739c0 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -230,6 +230,7 @@ enum btree_iter_uptodate { + #define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) + #define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) + #define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) ++#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) + + /* + * @pos - iterator's current position +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 95e6d21dac2a..2d8093d1bf00 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -955,7 +955,9 @@ retry: + * instead of locking/reserving all the way to the root: + */ + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { +- trace_trans_restart_iter_upgrade(trans->ip); ++ trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, ++ iter->btree_id, ++ &iter->real_pos); + return ERR_PTR(-EINTR); + } + +@@ -996,7 +998,7 @@ retry: + * closure argument + */ + if (flags & BTREE_INSERT_NOUNLOCK) { +- trace_trans_restart_journal_preres_get(trans->ip); ++ trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); + ret = -EINTR; + goto err; + } +@@ -1012,7 +1014,7 @@ retry: + BTREE_UPDATE_JOURNAL_RES, + journal_flags); + if (ret) { +- trace_trans_restart_journal_preres_get(trans->ip); ++ trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); + goto err; + } + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index cc7acce171a2..0d566be7455e 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -228,7 +228,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, + } + + static noinline int +-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) ++bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, ++ unsigned long trace_ip) + { + struct bch_fs *c = trans->c; + int ret; +@@ -241,7 +242,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) + return ret; + + if (!bch2_trans_relock(trans)) { +- trace_trans_restart_journal_preres_get(trans->ip); ++ trace_trans_restart_journal_preres_get(trans->ip, trace_ip); + return -EINTR; + } + +@@ -368,7 +369,8 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + + static inline int + bch2_trans_commit_write_locked(struct btree_trans *trans, +- struct btree_insert_entry **stopped_at) ++ struct btree_insert_entry **stopped_at, ++ unsigned long trace_ip) + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; +@@ -378,7 +380,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + int ret; + + if (race_fault()) { +- trace_trans_restart_fault_inject(trans->ip); ++ trace_trans_restart_fault_inject(trans->ip, trace_ip); + return -EINTR; + } + +@@ -525,7 +527,8 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, 
struct btree + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ + static inline int do_bch2_trans_commit(struct btree_trans *trans, +- struct btree_insert_entry **stopped_at) ++ struct btree_insert_entry **stopped_at, ++ unsigned long trace_ip) + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; +@@ -559,7 +562,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + ? JOURNAL_RES_GET_RESERVED : 0)); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, +- trans->journal_preres_u64s); ++ trans->journal_preres_u64s, trace_ip); + if (unlikely(ret)) + return ret; + +@@ -578,7 +581,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + if (iter->nodes_locked != iter->nodes_intent_locked) { + if (btree_iter_keep(trans, iter)) { + if (!bch2_btree_iter_upgrade(iter, 1)) { +- trace_trans_restart_upgrade(trans->ip); ++ trace_trans_restart_upgrade(trans->ip, trace_ip, ++ iter->btree_id, ++ &iter->real_pos); + return -EINTR; + } + } else { +@@ -606,7 +611,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + bch2_btree_node_lock_for_insert(c, + iter_l(i->iter)->b, i->iter); + +- ret = bch2_trans_commit_write_locked(trans, stopped_at); ++ ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + + trans_for_each_update2(trans, i) + if (!same_leaf_as_prev(trans, i)) +@@ -644,7 +649,7 @@ static int journal_reclaim_wait_done(struct bch_fs *c) + static noinline + int bch2_trans_commit_error(struct btree_trans *trans, + struct btree_insert_entry *i, +- int ret) ++ int ret, unsigned long trace_ip) + { + struct bch_fs *c = trans->c; + unsigned flags = trans->flags; +@@ -685,7 +690,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, + if (!ret || + ret == -EINTR || + (flags & BTREE_INSERT_NOUNLOCK)) { +- trace_trans_restart_btree_node_split(trans->ip); ++ trace_trans_restart_btree_node_split(trans->ip, trace_ip, ++ i->iter->btree_id, ++ &i->iter->real_pos); + ret = -EINTR; + } + break; +@@ -703,7 +710,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + if (bch2_trans_relock(trans)) + return 0; + +- trace_trans_restart_mark_replicas(trans->ip); ++ trace_trans_restart_mark_replicas(trans->ip, trace_ip); + ret = -EINTR; + break; + case BTREE_INSERT_NEED_JOURNAL_RES: +@@ -720,13 +727,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, + if (bch2_trans_relock(trans)) + return 0; + +- trace_trans_restart_journal_res_get(trans->ip); ++ trace_trans_restart_journal_res_get(trans->ip, trace_ip); + ret = -EINTR; + break; + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); + +- trace_trans_blocked_journal_reclaim(trans->ip); ++ trace_trans_blocked_journal_reclaim(trans->ip, trace_ip); + + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); +@@ -736,7 +743,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + if (bch2_trans_relock(trans)) + return 0; + +- trace_trans_restart_journal_reclaim(trans->ip); ++ trace_trans_restart_journal_reclaim(trans->ip, trace_ip); + ret = -EINTR; + break; + default: +@@ -950,7 +957,9 @@ int __bch2_trans_commit(struct btree_trans *trans) + i->trigger_flags); + if (unlikely(ret)) { + if (ret == -EINTR) +- trace_trans_restart_mark(trans->ip); ++ trace_trans_restart_mark(trans->ip, _RET_IP_, ++ i->iter->btree_id, ++ &i->iter->pos); + goto out; + } + } +@@ -976,12 +985,16 @@ int __bch2_trans_commit(struct btree_trans *trans) + trans_for_each_update2(trans, i) { + ret = 
bch2_btree_iter_traverse(i->iter); + if (unlikely(ret)) { +- trace_trans_restart_traverse(trans->ip); ++ trace_trans_restart_traverse(trans->ip, _RET_IP_, ++ i->iter->btree_id, ++ &i->iter->pos); + goto out; + } + + if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { +- trace_trans_restart_upgrade(trans->ip); ++ trace_trans_restart_upgrade(trans->ip, _RET_IP_, ++ i->iter->btree_id, ++ &i->iter->pos); + ret = -EINTR; + goto out; + } +@@ -997,7 +1010,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + retry: + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + +- ret = do_bch2_trans_commit(trans, &i); ++ ret = do_bch2_trans_commit(trans, &i, _RET_IP_); + + /* make sure we didn't drop or screw up locks: */ + bch2_btree_trans_verify_locks(trans); +@@ -1023,7 +1036,7 @@ out_reset: + + return ret; + err: +- ret = bch2_trans_commit_error(trans, i, ret); ++ ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_); + if (ret) + goto out; + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 05314cc0f97a..4c0d9b7660ee 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -541,59 +541,66 @@ TRACE_EVENT(copygc_wait, + ); + + TRACE_EVENT(trans_get_iter, +- TP_PROTO(unsigned long caller, unsigned long ip, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, + enum btree_id btree_id, +- struct bpos *pos_want, +- unsigned locks_want, +- struct bpos *pos_found, +- unsigned locks_found, +- unsigned uptodate), +- TP_ARGS(caller, ip, btree_id, +- pos_want, locks_want, +- pos_found, locks_found, +- uptodate), ++ struct bpos *got_pos, ++ unsigned got_locks, ++ unsigned got_uptodate, ++ struct bpos *src_pos, ++ unsigned src_locks, ++ unsigned src_uptodate), ++ TP_ARGS(trans_ip, caller_ip, btree_id, ++ got_pos, got_locks, got_uptodate, ++ src_pos, src_locks, src_uptodate), + + TP_STRUCT__entry( +- __field(unsigned long, caller ) +- __field(unsigned long, ip ) +- __field(u8, btree_id ) +- __field(u8, uptodate ) +- __field(u8, locks_want ) +- __field(u8, locks_found ) +- __field(u64, pos_want_inode ) +- __field(u64, pos_want_offset ) +- __field(u32, pos_want_snapshot ) +- __field(u64, pos_found_inode ) +- __field(u64, pos_found_offset ) +- __field(u32, pos_found_snapshot ) ++ __field(unsigned long, trans_ip ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, got_pos_inode ) ++ __field(u64, got_pos_offset ) ++ __field(u32, got_pos_snapshot ) ++ __field(u8, got_locks ) ++ __field(u8, got_uptodate ) ++ __field(u64, src_pos_inode ) ++ __field(u64, src_pos_offset ) ++ __field(u32, src_pos_snapshot ) ++ __field(u8, src_locks ) ++ __field(u8, src_uptodate ) + ), + + TP_fast_assign( +- __entry->caller = caller; +- __entry->ip = ip; ++ __entry->trans_ip = trans_ip; ++ __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; +- __entry->uptodate = uptodate; +- __entry->pos_want_inode = pos_want->inode; +- __entry->pos_want_offset = pos_want->offset; +- __entry->pos_want_snapshot = pos_want->snapshot; +- __entry->pos_found_inode = pos_found->inode; +- __entry->pos_found_offset = pos_found->offset; +- __entry->pos_found_snapshot = pos_found->snapshot; +- ), +- +- TP_printk("%ps %pS btree %u uptodate %u want %llu:%llu:%u locks %u found %llu:%llu:%u locks %u", +- (void *) __entry->caller, +- (void *) __entry->ip, ++ __entry->got_pos_inode = got_pos->inode; ++ __entry->got_pos_offset = got_pos->offset; ++ __entry->got_pos_snapshot = got_pos->snapshot; ++ __entry->got_locks = got_locks; 
++ __entry->got_uptodate = got_uptodate; ++ __entry->src_pos_inode = src_pos->inode; ++ __entry->src_pos_offset = src_pos->offset; ++ __entry->src_pos_snapshot = src_pos->snapshot; ++ __entry->src_locks = src_locks; ++ __entry->src_uptodate = src_uptodate; ++ ), ++ ++ TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u " ++ "src %llu:%llu:%u l %u u %u", ++ (void *) __entry->trans_ip, ++ (void *) __entry->caller_ip, + __entry->btree_id, +- __entry->uptodate, +- __entry->pos_want_inode, +- __entry->pos_want_offset, +- __entry->pos_want_snapshot, +- __entry->locks_want, +- __entry->pos_found_inode, +- __entry->pos_found_offset, +- __entry->pos_found_snapshot, +- __entry->locks_found) ++ __entry->got_pos_inode, ++ __entry->got_pos_offset, ++ __entry->got_pos_snapshot, ++ __entry->got_locks, ++ __entry->got_uptodate, ++ __entry->src_pos_inode, ++ __entry->src_pos_offset, ++ __entry->src_pos_snapshot, ++ __entry->src_locks, ++ __entry->src_uptodate) + ); + + TRACE_EVENT(transaction_restart_ip, +@@ -614,28 +621,241 @@ TRACE_EVENT(transaction_restart_ip, + ); + + DECLARE_EVENT_CLASS(transaction_restart, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip), ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip), ++ TP_ARGS(trans_ip, caller_ip), + + TP_STRUCT__entry( +- __field(unsigned long, ip ) ++ __field(unsigned long, trans_ip ) ++ __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( +- __entry->ip = ip; ++ __entry->trans_ip = trans_ip; ++ __entry->caller_ip = caller_ip; + ), + +- TP_printk("%ps", (void *) __entry->ip) ++ TP_printk("%ps %pS", ++ (void *) __entry->trans_ip, ++ (void *) __entry->caller_ip) + ); + +-DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) ++DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip), ++ TP_ARGS(trans_ip, caller_ip) + ); + +-DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) ++DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip), ++ TP_ARGS(trans_ip, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip), ++ TP_ARGS(trans_ip, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip), ++ TP_ARGS(trans_ip, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip), ++ TP_ARGS(trans_ip, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_traverse_all, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip), ++ TP_ARGS(trans_ip, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip), ++ TP_ARGS(trans_ip, caller_ip) ++); ++ ++DECLARE_EVENT_CLASS(transaction_restart_iter, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, trans_ip ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, pos_inode ) ++ __field(u64, pos_offset ) ++ __field(u32, pos_snapshot ) ++ ), ++ ++ TP_fast_assign( ++ __entry->trans_ip = trans_ip; ++ 
__entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->pos_inode = pos->inode; ++ __entry->pos_offset = pos->offset; ++ __entry->pos_snapshot = pos->snapshot; ++ ), ++ ++ TP_printk("%ps %pS btree %u pos %llu:%llu:%u", ++ (void *) __entry->trans_ip, ++ (void *) __entry->caller_ip, ++ __entry->btree_id, ++ __entry->pos_inode, ++ __entry->pos_offset, ++ __entry->pos_snapshot) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_mark, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++); ++ ++TRACE_EVENT(iter_traverse, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ int ret), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos, ret), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, trans_ip ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, pos_inode ) ++ __field(u64, pos_offset ) ++ __field(u32, pos_snapshot ) ++ __field(s32, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->trans_ip = trans_ip; ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->pos_inode = pos->inode; ++ __entry->pos_offset = pos->offset; ++ __entry->pos_snapshot = pos->snapshot; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("%ps %pS pos %u %llu:%llu:%u ret %i", ++ (void *) __entry->trans_ip, ++ (void *) __entry->caller_ip, ++ __entry->btree_id, ++ __entry->pos_inode, ++ __entry->pos_offset, ++ __entry->pos_snapshot, ++ __entry->ret) ++); ++ ++TRACE_EVENT(iter_set_search_pos, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *old_pos, ++ struct bpos *new_pos, ++ unsigned good_level), ++ TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, trans_ip ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, old_pos_inode ) ++ __field(u64, old_pos_offset ) ++ __field(u32, old_pos_snapshot ) ++ __field(u64, new_pos_inode ) ++ __field(u64, 
new_pos_offset ) ++ __field(u32, new_pos_snapshot ) ++ __field(u8, good_level ) ++ ), ++ ++ TP_fast_assign( ++ __entry->trans_ip = trans_ip; ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->old_pos_inode = old_pos->inode; ++ __entry->old_pos_offset = old_pos->offset; ++ __entry->old_pos_snapshot = old_pos->snapshot; ++ __entry->new_pos_inode = new_pos->inode; ++ __entry->new_pos_offset = new_pos->offset; ++ __entry->new_pos_snapshot = new_pos->snapshot; ++ __entry->good_level = good_level; ++ ), ++ ++ TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u", ++ (void *) __entry->trans_ip, ++ (void *) __entry->caller_ip, ++ __entry->btree_id, ++ __entry->old_pos_inode, ++ __entry->old_pos_offset, ++ __entry->old_pos_snapshot, ++ __entry->new_pos_inode, ++ __entry->new_pos_offset, ++ __entry->new_pos_snapshot, ++ __entry->good_level) + ); + + TRACE_EVENT(trans_restart_would_deadlock, +@@ -730,97 +950,70 @@ TRACE_EVENT(trans_restart_mem_realloced, + __entry->bytes) + ); + +-DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_restart_mark, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_restart_upgrade, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_restart_relock, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_restart_traverse, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- +-DEFINE_EVENT(transaction_restart, trans_traverse_all, +- TP_PROTO(unsigned long ip), +- TP_ARGS(ip) +-); +- + DECLARE_EVENT_CLASS(node_lock_fail, +- TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), +- TP_ARGS(level, iter_seq, node, node_seq), ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos, ++ level, iter_seq, node, node_seq), + + TP_STRUCT__entry( +- __field(u32, level) +- __field(u32, iter_seq) +- __field(u32, node) +- __field(u32, node_seq) ++ __field(unsigned long, trans_ip ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, pos_inode ) ++ __field(u64, pos_offset ) ++ __field(u32, pos_snapshot ) ++ __field(u32, level ) ++ __field(u32, iter_seq ) ++ __field(u32, node ) ++ __field(u32, node_seq ) + ), + + TP_fast_assign( +- __entry->level = level; +- __entry->iter_seq = iter_seq; +- __entry->node = node; +- __entry->node_seq = node_seq; ++ __entry->trans_ip = trans_ip; ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->pos_inode = pos->inode; ++ 
__entry->pos_offset = pos->offset; ++ __entry->pos_snapshot = pos->snapshot; ++ __entry->level = level; ++ __entry->iter_seq = iter_seq; ++ __entry->node = node; ++ __entry->node_seq = node_seq; + ), + +- TP_printk("level %u iter seq %u node %u node seq %u", ++ TP_printk("%ps %pS btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u", ++ (void *) __entry->trans_ip, ++ (void *) __entry->caller_ip, ++ __entry->btree_id, ++ __entry->pos_inode, ++ __entry->pos_offset, ++ __entry->pos_snapshot, + __entry->level, __entry->iter_seq, + __entry->node, __entry->node_seq) + ); + + DEFINE_EVENT(node_lock_fail, node_upgrade_fail, +- TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), +- TP_ARGS(level, iter_seq, node, node_seq) ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos, ++ level, iter_seq, node, node_seq) + ); + + DEFINE_EVENT(node_lock_fail, node_relock_fail, +- TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), +- TP_ARGS(level, iter_seq, node, node_seq) ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(trans_ip, caller_ip, btree_id, pos, ++ level, iter_seq, node, node_seq) + ); + + #endif /* _TRACE_BCACHE_H */ +-- +cgit v1.2.3 + + +From 0c6fca37db1d3140c873b499f5c3c5e7ba4df459 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 4 Jun 2021 17:17:45 -0400 +Subject: bcachefs: btree_iter->should_be_locked + +Add a field to struct btree_iter for tracking whether it should be +locked - this fixes spurious transaction restarts in +bch2_trans_relock(). 
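A minimal user-space sketch of the idea this flag implements (every toy_* name below is invented for illustration and is not bcachefs API): a relock pass only needs to force a transaction restart for iterators that have handed a key back to the caller and are therefore expected to still hold their node locked; an iterator whose position was just changed can simply be left unlocked.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_iter {
	bool relocked;          /* stand-in for bch2_btree_iter_relock() succeeding */
	bool should_be_locked;  /* set after returning a key, cleared by set_pos()  */
};

/* Return false (forcing a restart) only when an iterator that is expected
 * to hold its locks failed to retake them. */
static bool toy_trans_relock(const struct toy_iter *iters, size_t nr)
{
	size_t i;

	for (i = 0; i < nr; i++)
		if (!iters[i].relocked && iters[i].should_be_locked)
			return false;
	return true;
}

int main(void)
{
	struct toy_iter iters[] = {
		{ .relocked = false, .should_be_locked = false }, /* just repositioned: ignore */
		{ .relocked = true,  .should_be_locked = true  }, /* relocked fine             */
	};

	printf("relock ok: %d\n", toy_trans_relock(iters, 2)); /* prints 1: no restart */
	return 0;
}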
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 28 ++++++++++++++++++++++++++-- + fs/bcachefs/btree_iter.h | 1 + + fs/bcachefs/btree_types.h | 8 +++++++- + 3 files changed, 34 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 712352d35f87..609cfa19c4d6 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -464,12 +464,20 @@ void bch2_trans_downgrade(struct btree_trans *trans) + + /* Btree transaction locking: */ + ++static inline bool btree_iter_should_be_locked(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || ++ iter->should_be_locked; ++} ++ + bool bch2_trans_relock(struct btree_trans *trans) + { + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) +- if (!bch2_btree_iter_relock(iter, _RET_IP_)) { ++ if (!bch2_btree_iter_relock(iter, _RET_IP_) && ++ btree_iter_should_be_locked(trans, iter)) { + trace_trans_restart_relock(trans->ip, _RET_IP_, + iter->btree_id, &iter->real_pos); + return false; +@@ -1428,9 +1436,16 @@ btree_iter_traverse(struct btree_iter *iter) + int __must_check + bch2_btree_iter_traverse(struct btree_iter *iter) + { ++ int ret; ++ + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + +- return btree_iter_traverse(iter); ++ ret = btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ iter->should_be_locked = true; ++ return 0; + } + + /* Iterate across nodes (leaf and interior nodes) */ +@@ -1456,6 +1471,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + iter->pos = iter->real_pos = b->key.k.p; + + bch2_btree_iter_verify(iter); ++ iter->should_be_locked = true; + + return b; + } +@@ -1512,6 +1528,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + iter->pos = iter->real_pos = b->key.k.p; + + bch2_btree_iter_verify(iter); ++ iter->should_be_locked = true; + + return b; + } +@@ -1528,6 +1545,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p + goto out; + + iter->real_pos = new_pos; ++ iter->should_be_locked = false; + + if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { + btree_node_unlock(iter, 0); +@@ -1687,6 +1705,7 @@ start: + + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); ++ iter->should_be_locked = true; + return k; + } + +@@ -1771,6 +1790,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); ++ iter->should_be_locked = true; + return k; + no_key: + /* +@@ -1870,6 +1890,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); ++ iter->should_be_locked = true; ++ + return k; + } + +@@ -1907,6 +1929,8 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) + bkey_cmp(iter->pos, ck->key.pos)); + BUG_ON(!ck->valid); + ++ iter->should_be_locked = true; ++ + return bkey_i_to_s_c(ck->k); + } + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 01b834bf79f7..a2ce711fd61f 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -178,6 +178,7 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos + iter->k.p.offset = iter->pos.offset = new_pos.offset; + iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; + iter->k.size = 0; ++ iter->should_be_locked = false; + } + + /* Sort order for locking btree 
iterators: */ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 902c762739c0..66056d015e7d 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -252,7 +252,13 @@ struct btree_iter { + u8 idx; + + enum btree_id btree_id:4; +- enum btree_iter_uptodate uptodate:4; ++ enum btree_iter_uptodate uptodate:3; ++ /* ++ * True if we've returned a key (and thus are expected to keep it ++ * locked), false after set_pos - for avoiding spurious transaction ++ * restarts in bch2_trans_relock(): ++ */ ++ bool should_be_locked:1; + unsigned level:4, + min_depth:4, + locks_want:4, +-- +cgit v1.2.3 + + +From e95da431ad3cf4034e0424620a1424db6fa3339d Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Sat, 5 Jun 2021 19:03:16 -0400 +Subject: bcachefs: do not compile acl mod on minimal config + +Do not compile the acl.o target if BCACHEFS_POSIX_ACL is not enabled. + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/Makefile | 3 ++- + fs/bcachefs/xattr.c | 2 ++ + 2 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index 2fbf978424ed..ee5e6dbd5ede 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -2,7 +2,6 @@ + obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + + bcachefs-y := \ +- acl.o \ + alloc_background.o \ + alloc_foreground.o \ + bkey.o \ +@@ -58,3 +57,5 @@ bcachefs-y := \ + util.o \ + varint.o \ + xattr.o ++ ++bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 8985a21b122c..8bd7553b9ebd 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -562,8 +562,10 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { + + const struct xattr_handler *bch2_xattr_handlers[] = { + &bch_xattr_user_handler, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, ++#endif + &bch_xattr_trusted_handler, + &bch_xattr_security_handler, + #ifndef NO_BCACHEFS_FS +-- +cgit v1.2.3 + + +From 3d201120bb8df94a47c6c0716ab28b36906be0e7 Mon Sep 17 00:00:00 2001 +From: Brett Holman +Date: Sun, 6 Jun 2021 09:29:42 -0600 +Subject: bcachefs: Fix unitialized use of a value + +--- + fs/bcachefs/replicas.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 8e6cccd39383..dbbbcc6dcec6 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -435,6 +435,8 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, + unsigned i; + int ret; + ++ memset(&search, 0, sizeof(search)); ++ + for (i = 0; i < cached.nr; i++) { + bch2_replicas_entry_cached(&search.e, cached.devs[i]); + +-- +cgit v1.2.3 + + +From a128e6037d0bd1333720ca5e9acc6352199046af Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 8 Jun 2021 16:29:24 -0400 +Subject: bcachefs: Fix a spurious debug mode assertion + +When we switched to using bch2_btree_bset_insert_key() for extents it +turned out it started leaving invalid keys around - of type deleted but +nonzero size - but this is fine (if ugly) because they're never written +out. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 609cfa19c4d6..9d7440d2b9e8 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -847,7 +847,14 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, + + ret = bkey_disassemble(l->b, k, u); + +- if (bch2_debug_check_bkeys) ++ /* ++ * XXX: bch2_btree_bset_insert_key() generates invalid keys when we ++ * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key ++ * being overwritten but doesn't change k->size. But this is ok, because ++ * those keys are never written out, we just have to avoid a spurious ++ * assertion here: ++ */ ++ if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); + + return ret; +-- +cgit v1.2.3 + + +From 5b2de1f5335dce658c65ad2ef057e917b80876a7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 8 Jun 2021 22:50:30 -0400 +Subject: bcachefs: Don't mark superblocks past end of usable space + +bcachefs-tools recently started putting a backup superblock at the end +of the device. This causes a problem if the bucket size doesn't divide +the device size - but we can fix it by just skipping marking that part. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 12 ++++++++++++ + fs/bcachefs/super.c | 5 +++++ + 2 files changed, 17 insertions(+) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 2b5e1d5c6a29..76d15a5dc62f 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -631,6 +631,12 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + BUG_ON(type != BCH_DATA_sb && + type != BCH_DATA_journal); + ++ /* ++ * Backup superblock might be past the end of our normal usable space: ++ */ ++ if (b >= ca->mi.nbuckets) ++ return; ++ + preempt_disable(); + + if (likely(c)) { +@@ -2084,6 +2090,12 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + }; + int ret = 0; + ++ /* ++ * Backup superblock might be past the end of our normal usable space: ++ */ ++ if (b >= ca->mi.nbuckets) ++ return 0; ++ + a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 53f18c480234..13a5ca713e7a 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1820,6 +1820,11 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + goto err; + } + ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ goto err; ++ } ++ + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + mi->nbuckets = cpu_to_le64(nbuckets); +-- +cgit v1.2.3 + + +From 09725775f4aab1f8e75bc3713b1c780cf94ec815 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Jun 2021 13:21:39 -0400 +Subject: bcachefs: Fix a buffer overrun + +In make_extent_indirect(), we were allocating too small of a buffer for +the new indirect extent. 
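A self-contained illustration of why the old allocation was short (the byte counts are assumptions picked only to make the arithmetic concrete): the indirect extent is built from the whole original bkey, header plus value, plus a 64-bit refcount, so sizing the buffer by the value alone under-allocates by the size of the key header.

#include <stdio.h>

/* Assumed sizes, for illustration only: */
#define KEY_HDR_BYTES 40u /* stand-in for sizeof(struct bkey)      */
#define KEY_VAL_BYTES 48u /* stand-in for bkey_val_bytes(&orig->k) */

int main(void)
{
	unsigned refcount  = sizeof(unsigned long long);               /* the __le64 refcount */
	unsigned needed    = refcount + KEY_HDR_BYTES + KEY_VAL_BYTES; /* ~ bkey_bytes()      */
	unsigned old_alloc = refcount + KEY_VAL_BYTES;                 /* ~ bkey_val_bytes()  */

	printf("need %u bytes, old code allocated %u: short by %u\n",
	       needed, old_alloc, needed - old_alloc);
	return 0;
}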
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/reflink.c | 8 +------- + 1 file changed, 1 insertion(+), 7 deletions(-) + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index a420729288d4..6aa37726341d 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -138,7 +138,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + /* rewind iter to start of hole, if necessary: */ + bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); + +- r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_val_bytes(&orig->k)); ++ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) + goto err; +@@ -159,12 +159,6 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + if (ret) + goto err; + +- r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); +- if (IS_ERR(r_p)) { +- ret = PTR_ERR(r_p); +- goto err; +- } +- + orig->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(orig); + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); +-- +cgit v1.2.3 + + +From e993a0e022ec5a4f57be5bb6fb3adfb64773f19e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 7 Jun 2021 13:28:50 -0400 +Subject: bcachefs: More topology repair code + +This improves the handling of overlapping btree nodes; now, we handle +the case where one btree node completely overwrites another. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 129 +++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 93 insertions(+), 36 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index b785e6636c0f..d46d933b393b 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -36,6 +36,9 @@ + #include + #include + ++#define DROP_THIS_NODE 10 ++#define DROP_PREV_NODE 11 ++ + static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) + { + preempt_disable(); +@@ -203,8 +206,8 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) + return 0; + } + +-static int btree_repair_node_start(struct bch_fs *c, struct btree *b, +- struct btree *prev, struct btree *cur) ++static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, ++ struct btree *prev, struct btree *cur) + { + struct bpos expected_start = !prev + ? 
b->data->min_key +@@ -220,22 +223,50 @@ static int btree_repair_node_start(struct bch_fs *c, struct btree *b, + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key)); + } + +- if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, +- "btree node with incorrect min_key at btree %s level %u:\n" +- " prev %s\n" +- " cur %s", +- bch2_btree_ids[b->c.btree_id], b->c.level, +- buf1, +- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)), buf2))) { +- if (prev && +- bpos_cmp(expected_start, cur->data->min_key) > 0 && +- BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) ++ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)); ++ ++ if (prev && ++ bpos_cmp(expected_start, cur->data->min_key) > 0 && ++ BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { ++ /* cur overwrites prev: */ ++ ++ if (mustfix_fsck_err_on(bpos_cmp(prev->data->min_key, ++ cur->data->min_key) >= 0, c, ++ "btree node overwritten by next node at btree %s level %u:\n" ++ " node %s\n" ++ " next %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1, buf2)) ++ return DROP_PREV_NODE; ++ ++ if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, ++ bpos_predecessor(cur->data->min_key)), c, ++ "btree node with incorrect max_key at btree %s level %u:\n" ++ " node %s\n" ++ " next %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1, buf2)) + ret = set_node_max(c, prev, +- bpos_predecessor(cur->data->min_key)); +- else +- ret = set_node_min(c, cur, expected_start); +- if (ret) +- return ret; ++ bpos_predecessor(cur->data->min_key)); ++ } else { ++ /* prev overwrites cur: */ ++ ++ if (mustfix_fsck_err_on(bpos_cmp(expected_start, ++ cur->data->max_key) >= 0, c, ++ "btree node overwritten by prev node at btree %s level %u:\n" ++ " prev %s\n" ++ " node %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1, buf2)) ++ return DROP_THIS_NODE; ++ ++ if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, ++ "btree node with incorrect min_key at btree %s level %u:\n" ++ " prev %s\n" ++ " node %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1, buf2)) ++ ret = set_node_min(c, cur, expected_start); + } + fsck_err: + return ret; +@@ -262,13 +293,11 @@ fsck_err: + return ret; + } + +-#define DROP_THIS_NODE 10 +- + static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) + { + struct btree_and_journal_iter iter; + struct bkey_s_c k; +- struct bkey_buf tmp; ++ struct bkey_buf prev_k, cur_k; + struct btree *prev = NULL, *cur = NULL; + bool have_child, dropped_children = false; + char buf[200]; +@@ -277,8 +306,10 @@ static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) + if (!b->c.level) + return 0; + again: ++ prev = NULL; + have_child = dropped_children = false; +- bch2_bkey_buf_init(&tmp); ++ bch2_bkey_buf_init(&prev_k); ++ bch2_bkey_buf_init(&cur_k); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { +@@ -286,9 +317,9 @@ again: + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + + bch2_btree_and_journal_iter_advance(&iter); +- bch2_bkey_buf_reassemble(&tmp, c, k); ++ bch2_bkey_buf_reassemble(&cur_k, c, k); + +- cur = bch2_btree_node_get_noiter(c, tmp.k, ++ cur = bch2_btree_node_get_noiter(c, cur_k.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(cur); +@@ -298,12 +329,12 @@ again: + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, +- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) { +- 
bch2_btree_node_evict(c, tmp.k); ++ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) { ++ bch2_btree_node_evict(c, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, +- b->c.level, tmp.k->k.p); ++ b->c.level, cur_k.k->k.p); + if (ret) +- goto err; ++ break; + continue; + } + +@@ -313,14 +344,39 @@ again: + break; + } + +- ret = btree_repair_node_start(c, b, prev, cur); ++ ret = btree_repair_node_boundaries(c, b, prev, cur); ++ ++ if (ret == DROP_THIS_NODE) { ++ six_unlock_read(&cur->c.lock); ++ bch2_btree_node_evict(c, cur_k.k); ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, cur_k.k->k.p); ++ if (ret) ++ break; ++ continue; ++ } ++ + if (prev) + six_unlock_read(&prev->c.lock); +- prev = cur; +- cur = NULL; ++ prev = NULL; + +- if (ret) ++ if (ret == DROP_PREV_NODE) { ++ bch2_btree_node_evict(c, prev_k.k); ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, prev_k.k->k.p); ++ if (ret) ++ break; ++ ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_bkey_buf_exit(&prev_k, c); ++ bch2_bkey_buf_exit(&cur_k, c); ++ goto again; ++ } else if (ret) + break; ++ ++ prev = cur; ++ cur = NULL; ++ bch2_bkey_buf_copy(&prev_k, c, cur_k.k); + } + + if (!ret && !IS_ERR_OR_NULL(prev)) { +@@ -342,10 +398,10 @@ again: + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { +- bch2_bkey_buf_reassemble(&tmp, c, k); ++ bch2_bkey_buf_reassemble(&cur_k, c, k); + bch2_btree_and_journal_iter_advance(&iter); + +- cur = bch2_btree_node_get_noiter(c, tmp.k, ++ cur = bch2_btree_node_get_noiter(c, cur_k.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(cur); +@@ -361,9 +417,9 @@ again: + cur = NULL; + + if (ret == DROP_THIS_NODE) { +- bch2_btree_node_evict(c, tmp.k); ++ bch2_btree_node_evict(c, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, +- b->c.level, tmp.k->k.p); ++ b->c.level, cur_k.k->k.p); + dropped_children = true; + } + +@@ -388,7 +444,8 @@ fsck_err: + six_unlock_read(&cur->c.lock); + + bch2_btree_and_journal_iter_exit(&iter); +- bch2_bkey_buf_exit(&tmp, c); ++ bch2_bkey_buf_exit(&prev_k, c); ++ bch2_bkey_buf_exit(&cur_k, c); + + if (!ret && dropped_children) + goto again; +-- +cgit v1.2.3 + + +From b46e327c4fdb7ce7a3f46b184b0f21ad65008c4a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 8 Apr 2021 22:26:53 -0400 +Subject: bcachefs: Drop all btree locks when submitting btree node reads + +As a rule we don't want to be holding btree locks while submitting IO - +this will improve overall filesystem latency. 
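A sketch of the unlock-around-IO pattern adopted here (the toy_* helpers are invented stand-ins, not the real bcachefs calls): drop the transaction's locks, submit the slow read, then try to retake the locks and return a restart error if that fails.

#include <stdbool.h>
#include <errno.h>
#include <stdio.h>

/* Stand-ins for the real transaction and IO primitives: */
static void toy_trans_unlock(void)    { puts("locks dropped"); }
static bool toy_trans_relock(void)    { return true; /* may fail in real life */ }
static void toy_read_btree_node(void) { puts("submitting read, no locks held"); }

/* Returns 0 on success, -EINTR to ask the caller to restart the transaction. */
static int toy_node_fill(void)
{
	toy_trans_unlock();       /* never sleep on IO while holding btree locks */
	toy_read_btree_node();

	if (!toy_trans_relock())  /* somebody may have taken our locks meanwhile */
		return -EINTR;
	return 0;
}

int main(void)
{
	printf("fill returned %d\n", toy_node_fill());
	return 0;
}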
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 39 +++++++++++++++++++++++++++++---------- + fs/bcachefs/btree_iter.c | 4 ++++ + 2 files changed, 33 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 62a10a78fe8f..94448d40c824 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -662,13 +662,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + return NULL; + } + +- /* +- * Unlock before doing IO: +- * +- * XXX: ideally should be dropping all btree node locks here +- */ +- if (iter && btree_node_read_locked(iter, level + 1)) +- btree_node_unlock(iter, level + 1); ++ /* Unlock before doing IO: */ ++ if (iter && sync) ++ bch2_trans_unlock(iter->trans); + + bch2_btree_node_read(c, b, sync); + +@@ -679,6 +675,16 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + return NULL; + } + ++ /* ++ * XXX: this will probably always fail because btree_iter_relock() ++ * currently fails for iterators that aren't pointed at a valid btree ++ * node ++ */ ++ if (iter && !bch2_trans_relock(iter->trans)) { ++ six_unlock_intent(&b->c.lock); ++ return ERR_PTR(-EINTR); ++ } ++ + if (lock_type == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + +@@ -823,9 +829,22 @@ lock_node: + } + } + +- /* XXX: waiting on IO with btree locks held: */ +- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, +- TASK_UNINTERRUPTIBLE); ++ if (unlikely(btree_node_read_in_flight(b))) { ++ six_unlock_type(&b->c.lock, lock_type); ++ bch2_trans_unlock(iter->trans); ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ /* ++ * XXX: check if this always fails - btree_iter_relock() ++ * currently fails for iterators that aren't pointed at a valid ++ * btree node ++ */ ++ if (iter && !bch2_trans_relock(iter->trans)) ++ return ERR_PTR(-EINTR); ++ goto retry; ++ } + + prefetch(b->aux_data); + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 9d7440d2b9e8..9432ff73abb4 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1192,7 +1192,11 @@ static __always_inline int btree_iter_down(struct btree_iter *iter, + if (iter->flags & BTREE_ITER_PREFETCH) + btree_iter_prefetch(iter); + ++ if (btree_node_read_locked(iter, level + 1)) ++ btree_node_unlock(iter, level + 1); + iter->level = level; ++ ++ bch2_btree_iter_verify_locks(iter); + err: + bch2_bkey_buf_exit(&tmp, c); + return ret; +-- +cgit v1.2.3 + + +From 6110f124fd62b1ea0ff5503093a42c6b8191735a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Mar 2021 15:12:05 -0400 +Subject: bcachefs: Child btree iterators + +This adds the ability for btree iterators to own child iterators - to be +used by an upcoming rework of bch2_btree_iter_peek_slot(), so we can +scan forwards while maintaining our current position. 
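Roughly what a child iterator buys, as a sketch with invented types (toy_iter and toy_peek are not the real API): a look-ahead scan runs on a clone owned by the parent, so the parent's position is untouched while the key the clone found can still be copied back.

#include <stdio.h>

struct toy_pos  { unsigned long long inode, offset; };
struct toy_iter { struct toy_pos pos; struct toy_pos found; };

/* Pretend peek: advances the iterator it is called on and records a hit. */
static void toy_peek(struct toy_iter *it)
{
	it->pos.offset += 16;            /* scanning forward moves this iterator */
	it->found = it->pos;
}

int main(void)
{
	struct toy_iter parent = { .pos = { 1, 100 } };
	struct toy_iter child  = parent; /* clone, like btree_iter_copy()        */

	toy_peek(&child);                /* look ahead on the clone              */
	parent.found = child.found;      /* keep the result                      */

	printf("parent still at %llu:%llu, found %llu:%llu\n",
	       parent.pos.inode, parent.pos.offset,
	       parent.found.inode, parent.found.offset);
	return 0;
}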
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 67 ++++++++++++++++++++++++++++++++++++++++------- + fs/bcachefs/btree_iter.h | 6 +++++ + fs/bcachefs/btree_types.h | 18 ++++++++----- + 3 files changed, 74 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 9432ff73abb4..041ce260d91e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -18,6 +18,9 @@ + #include + + static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); ++static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long); ++static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *); ++static void btree_iter_copy(struct btree_iter *, struct btree_iter *); + + static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) + { +@@ -1968,9 +1971,39 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, + + /* new transactional stuff: */ + ++static void btree_iter_child_free(struct btree_iter *iter) ++{ ++ struct btree_iter *child = btree_iter_child(iter); ++ ++ if (child) { ++ bch2_trans_iter_free(iter->trans, child); ++ iter->child_idx = U8_MAX; ++ } ++} ++ ++static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, ++ unsigned long ip) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_iter *child = btree_iter_child(iter); ++ ++ if (!child) { ++ child = btree_trans_iter_alloc(trans); ++ child->ip_allocated = ip; ++ iter->child_idx = child->idx; ++ ++ trans->iters_live |= 1ULL << child->idx; ++ trans->iters_touched |= 1ULL << child->idx; ++ } ++ ++ return child; ++} ++ + static inline void __bch2_trans_iter_free(struct btree_trans *trans, + unsigned idx) + { ++ btree_iter_child_free(&trans->iters[idx]); ++ + __bch2_btree_iter_unlock(&trans->iters[idx]); + trans->iters_linked &= ~(1ULL << idx); + trans->iters_live &= ~(1ULL << idx); +@@ -2038,6 +2071,7 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) + + static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) + { ++ struct btree_iter *iter; + unsigned idx; + + if (unlikely(trans->iters_linked == +@@ -2045,21 +2079,27 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) + btree_trans_iter_alloc_fail(trans); + + idx = __ffs64(~trans->iters_linked); +- ++ iter = &trans->iters[idx]; ++ ++ iter->trans = trans; ++ iter->idx = idx; ++ iter->child_idx = U8_MAX; ++ iter->flags = 0; ++ iter->nodes_locked = 0; ++ iter->nodes_intent_locked = 0; + trans->iters_linked |= 1ULL << idx; +- trans->iters[idx].idx = idx; +- trans->iters[idx].flags = 0; +- return &trans->iters[idx]; ++ return iter; + } + +-static inline void btree_iter_copy(struct btree_iter *dst, +- struct btree_iter *src) ++static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) + { +- unsigned i, idx = dst->idx; ++ unsigned i; + +- *dst = *src; +- dst->idx = idx; +- dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ __bch2_btree_iter_unlock(dst); ++ btree_iter_child_free(dst); ++ ++ memcpy(&dst->flags, &src->flags, ++ sizeof(struct btree_iter) - offsetof(struct btree_iter, flags)); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(dst, i)) +@@ -2363,6 +2403,13 @@ int bch2_trans_exit(struct btree_trans *trans) + bch2_trans_unlock(trans); + + #ifdef CONFIG_BCACHEFS_DEBUG ++ if (trans->iters_live) { ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ btree_iter_child_free(iter); ++ } ++ + if (trans->iters_live) { + 
struct btree_iter *iter; + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index a2ce711fd61f..18732ca531ec 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -181,6 +181,12 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos + iter->should_be_locked = false; + } + ++static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) ++{ ++ return iter->child_idx == U8_MAX ? NULL ++ : iter->trans->iters + iter->child_idx; ++} ++ + /* Sort order for locking btree iterators: */ + static inline int btree_iter_lock_cmp(const struct btree_iter *l, + const struct btree_iter *r) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 66056d015e7d..3d360b722822 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -241,15 +241,20 @@ enum btree_iter_uptodate { + */ + struct btree_iter { + struct btree_trans *trans; +- struct bpos pos; +- /* what we're searching for/what the iterator actually points to: */ +- struct bpos real_pos; +- struct bpos pos_after_commit; ++ unsigned long ip_allocated; ++ ++ u8 idx; ++ u8 child_idx; ++ ++ /* btree_iter_copy starts here: */ ++ u16 flags; ++ + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; + +- u16 flags; +- u8 idx; ++ struct bpos pos; ++ struct bpos real_pos; ++ struct bpos pos_after_commit; + + enum btree_id btree_id:4; + enum btree_iter_uptodate uptodate:3; +@@ -276,7 +281,6 @@ struct btree_iter { + * bch2_btree_iter_next_slot() can correctly advance pos. + */ + struct bkey k; +- unsigned long ip_allocated; + }; + + static inline enum btree_iter_type +-- +cgit v1.2.3 + + +From b8a4c890bbf5a13fdc3343ba47067642e528bbce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 4 Jun 2021 00:29:49 -0400 +Subject: bcachefs: BTREE_ITER_WITH_UPDATES + +This drops bch2_btree_iter_peek_with_updates() and replaces it with a +new flag, BTREE_ITER_WITH_UPDATES, and also reworks +bch2_btree_iter_peek_slot() to respect it too. 
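The flag's effect can be modelled in a few lines (a sketch under obvious simplifications; toy_peek is not the real interface): with the flag set, peek returns whichever comes first of the next committed btree key and the next key pending in the transaction's update list.

#include <stdbool.h>
#include <stdio.h>

/* Keys are reduced to plain offsets; -1 means "nothing there". */
static long long toy_peek(long long next_btree_key, long long next_pending_update,
			  bool with_updates)
{
	if (with_updates && next_pending_update >= 0 &&
	    (next_btree_key < 0 || next_pending_update <= next_btree_key))
		return next_pending_update;   /* uncommitted update wins          */
	return next_btree_key;                /* otherwise what's already on disk */
}

int main(void)
{
	printf("%lld\n", toy_peek(20, 10, true));  /* 10: sees the pending update  */
	printf("%lld\n", toy_peek(20, 10, false)); /* 20: flag off, updates hidden */
	return 0;
}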
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 177 ++++++++++++++++++++-------------------- + fs/bcachefs/btree_iter.h | 3 - + fs/bcachefs/btree_types.h | 13 +-- + fs/bcachefs/btree_update_leaf.c | 12 +-- + 4 files changed, 99 insertions(+), 106 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 041ce260d91e..a0eba5270e32 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -865,10 +865,9 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, + + /* peek_all() doesn't skip deleted keys */ + static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, +- struct btree_iter_level *l, +- struct bkey *u) ++ struct btree_iter_level *l) + { +- return __btree_iter_unpack(iter, l, u, ++ return __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek_all(&l->iter, l->b)); + } + +@@ -1652,15 +1651,18 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + return ret; + } + +-static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, +- enum btree_id btree_id, struct bpos pos) ++static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter, ++ struct bpos pos) + { + struct btree_insert_entry *i; + +- trans_for_each_update2(trans, i) +- if ((cmp_int(btree_id, i->iter->btree_id) ?: +- bkey_cmp(pos, i->k->k.p)) <= 0) { +- if (btree_id == i->iter->btree_id) ++ if (!(iter->flags & BTREE_ITER_WITH_UPDATES)) ++ return NULL; ++ ++ trans_for_each_update2(iter->trans, i) ++ if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: ++ bkey_cmp(pos, i->k->k.p)) <= 0) { ++ if (iter->btree_id == i->iter->btree_id) + return i->k; + break; + } +@@ -1668,7 +1670,11 @@ static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + return NULL; + } + +-static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates) ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + { + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_i *next_update; +@@ -1679,9 +1685,7 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + start: +- next_update = with_updates +- ? 
btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) +- : NULL; ++ next_update = btree_trans_peek_updates(iter, search_key); + btree_iter_set_search_pos(iter, search_key); + + while (1) { +@@ -1723,15 +1727,6 @@ start: + return k; + } + +-/** +- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's +- * current position +- */ +-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +-{ +- return __btree_iter_peek(iter, false); +-} +- + /** + * bch2_btree_iter_next: returns first key greater than iterator's current + * position +@@ -1744,19 +1739,6 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + return bch2_btree_iter_peek(iter); + } + +-struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) +-{ +- return __btree_iter_peek(iter, true); +-} +- +-struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) +-{ +- if (!bch2_btree_iter_advance(iter)) +- return bkey_s_c_null; +- +- return bch2_btree_iter_peek_with_updates(iter); +-} +- + /** + * bch2_btree_iter_peek_prev: returns first key less than or equal to + * iterator's current position +@@ -1768,6 +1750,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + int ret; + + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + +@@ -1829,52 +1812,9 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) + return bch2_btree_iter_peek_prev(iter); + } + +-static inline struct bkey_s_c +-__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) +-{ +- struct bkey_s_c k; +- struct bpos pos, next_start; +- +- /* keys & holes can't span inode numbers: */ +- if (iter->pos.offset == KEY_OFFSET_MAX) { +- if (iter->pos.inode == KEY_INODE_MAX) +- return bkey_s_c_null; +- +- bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos)); +- } +- +- pos = iter->pos; +- k = bch2_btree_iter_peek(iter); +- iter->pos = pos; +- +- if (bkey_err(k)) +- return k; +- +- if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) +- return k; +- +- next_start = k.k ? bkey_start_pos(k.k) : POS_MAX; +- +- bkey_init(&iter->k); +- iter->k.p = iter->pos; +- bch2_key_resize(&iter->k, +- min_t(u64, KEY_SIZE_MAX, +- (next_start.inode == iter->pos.inode +- ? 
next_start.offset +- : KEY_OFFSET_MAX) - +- iter->pos.offset)); +- +- EBUG_ON(!iter->k.size); +- +- bch2_btree_iter_verify_entry_exit(iter); +- bch2_btree_iter_verify(iter); +- +- return (struct bkey_s_c) { &iter->k, NULL }; +-} +- + struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + { +- struct btree_iter_level *l = &iter->l[0]; ++ struct bpos search_key = btree_iter_search_key(iter); + struct bkey_s_c k; + int ret; + +@@ -1882,24 +1822,78 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + +- btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); ++ btree_iter_set_search_pos(iter, search_key); + +- if (iter->flags & BTREE_ITER_IS_EXTENTS) +- return __bch2_btree_iter_peek_slot_extents(iter); ++ /* extents can't span inode numbers: */ ++ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ iter->pos.offset == KEY_OFFSET_MAX) { ++ if (iter->pos.inode == KEY_INODE_MAX) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); ++ } + + ret = btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + +- k = btree_iter_level_peek_all(iter, l, &iter->k); ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { ++ struct bkey_i *next_update = btree_trans_peek_updates(iter, search_key); + +- EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); ++ k = btree_iter_level_peek_all(iter, &iter->l[0]); ++ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); + +- if (!k.k || bkey_cmp(iter->pos, k.k->p)) { +- /* hole */ +- bkey_init(&iter->k); +- iter->k.p = iter->pos; +- k = (struct bkey_s_c) { &iter->k, NULL }; ++ if (next_update && ++ (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { ++ iter->k = next_update->k; ++ k = bkey_i_to_s_c(next_update); ++ } ++ } else { ++ if ((iter->flags & BTREE_ITER_INTENT)) { ++ struct btree_iter *child = ++ btree_iter_child_alloc(iter, _THIS_IP_); ++ ++ btree_iter_copy(child, iter); ++ k = bch2_btree_iter_peek(child); ++ ++ if (k.k && !bkey_err(k)) ++ iter->k = child->k; ++ } else { ++ struct bpos pos = iter->pos; ++ ++ k = bch2_btree_iter_peek(iter); ++ iter->pos = pos; ++ } ++ ++ if (unlikely(bkey_err(k))) ++ return k; ++ } ++ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { ++ if (!k.k || ++ ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS) ++ ? bpos_cmp(iter->pos, k.k->p) ++ : bkey_cmp(iter->pos, k.k->p))) { ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos; ++ k = (struct bkey_s_c) { &iter->k, NULL }; ++ } ++ } else { ++ struct bpos next = k.k ? bkey_start_pos(k.k) : POS_MAX; ++ ++ if (bkey_cmp(iter->pos, next) < 0) { ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos; ++ bch2_key_resize(&iter->k, ++ min_t(u64, KEY_SIZE_MAX, ++ (next.inode == iter->pos.inode ++ ? 
next.offset ++ : KEY_OFFSET_MAX) - ++ iter->pos.offset)); ++ ++ k = (struct bkey_s_c) { &iter->k, NULL }; ++ EBUG_ON(!k.k->size); ++ } + } + + bch2_btree_iter_verify_entry_exit(iter); +@@ -1927,12 +1921,17 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) + + struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) + { ++ struct bkey_i *next_update; + struct bkey_cached *ck; + int ret; + + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); + bch2_btree_iter_verify(iter); + ++ next_update = btree_trans_peek_updates(iter, iter->pos); ++ if (next_update && !bpos_cmp(next_update->k.p, iter->pos)) ++ return bkey_i_to_s_c(next_update); ++ + ret = btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 18732ca531ec..ba98cfea4d60 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -153,9 +153,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + +-struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); +-struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); +- + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 3d360b722822..4997ce22e5e3 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -209,12 +209,13 @@ enum btree_iter_type { + * @pos or the first key strictly greater than @pos + */ + #define BTREE_ITER_IS_EXTENTS (1 << 6) +-#define BTREE_ITER_ERROR (1 << 7) +-#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) +-#define BTREE_ITER_CACHED_NOFILL (1 << 9) +-#define BTREE_ITER_CACHED_NOCREATE (1 << 10) +-#define BTREE_ITER_NOT_EXTENTS (1 << 11) +-#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) ++#define BTREE_ITER_NOT_EXTENTS (1 << 7) ++#define BTREE_ITER_ERROR (1 << 8) ++#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 9) ++#define BTREE_ITER_CACHED_NOFILL (1 << 10) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 11) ++#define BTREE_ITER_WITH_UPDATES (1 << 12) ++#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) + + enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 0d566be7455e..6557dbc0b64e 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -841,13 +841,11 @@ static int extent_handle_overwrites(struct btree_trans *trans, + struct bpos start = bkey_start_pos(&insert->k); + struct bkey_i *update; + struct bkey_s_c k; +- int ret = 0; +- +- iter = bch2_trans_get_iter(trans, btree_id, start, +- BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_with_updates(iter); ++ int ret; + +- while (k.k && !(ret = bkey_err(k))) { ++ for_each_btree_key(trans, iter, btree_id, start, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES, k, ret) { + if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) + break; + +@@ -898,8 +896,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bch2_trans_iter_put(trans, update_iter); + break; + } +- +- k = bch2_btree_iter_next_with_updates(iter); + } + bch2_trans_iter_put(trans, iter); + +-- +cgit v1.2.3 + + +From ebce12ab734e98259dadbd3093e29e53413c099a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 2 Jun 2021 00:18:34 -0400 +Subject: bcachefs: Move 
extent_handle_overwrites() to bch2_trans_update() + +This lifts handling of overlapping extents out of __bch2_trans_commit() +and moves it to where we first do the update - which means that +BTREE_ITER_WITH_UPDATES can now work correctly in extents mode. + +Also, this patch reworks how extent triggers work: previously, on +partial extent overwrite we would pass this information to the trigger, +telling it what part of the extent was being overwritten. But, this +approach has had too many subtle corner cases - now, we only mark whole +extents, meaning on partial extent overwrite we unmark the old extent +and mark the new extent. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 6 +- + fs/bcachefs/btree_update_leaf.c | 157 +++++++++++----------------------------- + fs/bcachefs/buckets.c | 145 +++++++++---------------------------- + 3 files changed, 80 insertions(+), 228 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index a0eba5270e32..5c57a6d26335 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1659,7 +1659,7 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter, + if (!(iter->flags & BTREE_ITER_WITH_UPDATES)) + return NULL; + +- trans_for_each_update2(iter->trans, i) ++ trans_for_each_update(iter->trans, i) + if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: + bkey_cmp(pos, i->k->k.p)) <= 0) { + if (iter->btree_id == i->iter->btree_id) +@@ -1696,8 +1696,10 @@ start: + k = btree_iter_level_peek(iter, &iter->l[0]); + + if (next_update && +- bpos_cmp(next_update->k.p, iter->real_pos) <= 0) ++ bpos_cmp(next_update->k.p, iter->real_pos) <= 0) { ++ iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); ++ } + + if (likely(k.k)) { + if (bkey_deleted(k.k)) { +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 6557dbc0b64e..6976e470df6d 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -775,7 +775,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + +-static void __bch2_trans_update2(struct btree_trans *trans, ++static void bch2_trans_update2(struct btree_trans *trans, + struct btree_insert_entry n) + { + struct btree_insert_entry *i; +@@ -798,44 +798,23 @@ static void __bch2_trans_update2(struct btree_trans *trans, + i - trans->updates2, n); + } + +-static void bch2_trans_update2(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) +-{ +- __bch2_trans_update2(trans, (struct btree_insert_entry) { +- .bkey_type = __btree_node_type(iter->level, iter->btree_id), +- .btree_id = iter->btree_id, +- .level = iter->level, +- .iter = iter, +- .k = insert, +- }); +-} +- + static int extent_update_to_keys(struct btree_trans *trans, + struct btree_insert_entry n) + { +- int ret; +- +- ret = bch2_extent_can_insert(trans, n.iter, n.k); +- if (ret) +- return ret; +- +- if (bkey_deleted(&n.k->k)) +- return 0; +- + n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); + n.is_extent = false; + +- __bch2_trans_update2(trans, n); ++ bch2_trans_update2(trans, n); + bch2_trans_iter_put(trans, n.iter); + return 0; + } + + static int extent_handle_overwrites(struct btree_trans *trans, + enum btree_id btree_id, +- struct bkey_i *insert) ++ struct bkey_i *insert, ++ unsigned trigger_flags) + { + struct btree_iter *iter, *update_iter; + struct bpos start = bkey_start_pos(&insert->k); +@@ -861,7 +840,8 @@ static int 
extent_handle_overwrites(struct btree_trans *trans, + update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); +- bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_update(trans, update_iter, update, ++ trigger_flags); + bch2_trans_iter_put(trans, update_iter); + } + +@@ -877,7 +857,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, + update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); +- bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_update(trans, update_iter, update, ++ trigger_flags); + bch2_trans_iter_put(trans, update_iter); + } + +@@ -892,7 +873,8 @@ static int extent_handle_overwrites(struct btree_trans *trans, + update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); +- bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_update(trans, update_iter, update, ++ trigger_flags); + bch2_trans_iter_put(trans, update_iter); + break; + } +@@ -962,18 +944,10 @@ int __bch2_trans_commit(struct btree_trans *trans) + } + } while (trans_trigger_run); + +- /* Turn extents updates into keys: */ +- trans_for_each_update(trans, i) +- if (i->is_extent) { +- ret = extent_handle_overwrites(trans, i->btree_id, i->k); +- if (unlikely(ret)) +- goto out; +- } +- + trans_for_each_update(trans, i) { + ret = i->is_extent + ? extent_update_to_keys(trans, *i) +- : (__bch2_trans_update2(trans, *i), 0); ++ : (bch2_trans_update2(trans, *i), 0); + if (unlikely(ret)) + goto out; + } +@@ -1051,6 +1025,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + .iter = iter, + .k = k + }; ++ int ret = 0; + + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + +@@ -1067,97 +1042,47 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + } + #endif + +- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; +- + if (n.is_extent) { ++ ret = bch2_extent_can_insert(trans, n.iter, n.k); ++ if (ret) ++ return ret; ++ ++ ret = extent_handle_overwrites(trans, n.btree_id, n.k, flags); ++ if (ret) ++ return ret; ++ + iter->pos_after_commit = k->k.p; + iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; ++ ++ if (bkey_deleted(&n.k->k)) ++ return 0; ++ ++ n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_NOT_EXTENTS); ++ bch2_trans_iter_put(trans, n.iter); ++ n.is_extent = false; + } + ++ BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS); ++ ++ n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: + */ +- if (!n.is_extent) { +- trans_for_each_update(trans, i) +- if (btree_insert_entry_cmp(&n, i) <= 0) +- break; +- +- if (i < trans->updates + trans->nr_updates && +- !btree_insert_entry_cmp(&n, i)) +- *i = n; +- else +- array_insert_item(trans->updates, trans->nr_updates, +- i - trans->updates, n); +- } else { +- trans_for_each_update(trans, i) +- if (btree_insert_entry_cmp(&n, i) < 0) +- break; +- +- while (i > trans->updates && +- i[-1].btree_id == n.btree_id && +- bkey_cmp(bkey_start_pos(&n.k->k), +- bkey_start_pos(&i[-1].k->k)) <= 0) { +- --i; +- array_remove_item(trans->updates, trans->nr_updates, +- i - trans->updates); +- } +- +- if (i > trans->updates && +- i[-1].btree_id == n.btree_id && +- bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0) +- bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k); +- +- if 
(i < trans->updates + trans->nr_updates && +- i->btree_id == n.btree_id && +- bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) { +- if (bkey_cmp(bkey_start_pos(&n.k->k), +- bkey_start_pos(&i->k->k)) > 0) { +- struct btree_insert_entry split = *i; +- int ret; +- +- BUG_ON(trans->nr_updates + 1 >= BTREE_ITER_MAX); +- +- split.k = bch2_trans_kmalloc(trans, bkey_bytes(&i->k->k)); +- ret = PTR_ERR_OR_ZERO(split.k); +- if (ret) +- return ret; +- +- bkey_copy(split.k, i->k); +- bch2_cut_back(bkey_start_pos(&n.k->k), split.k); +- +- split.iter = bch2_trans_get_iter(trans, split.btree_id, +- bkey_start_pos(&split.k->k), +- BTREE_ITER_INTENT); +- split.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; +- bch2_trans_iter_put(trans, split.iter); +- array_insert_item(trans->updates, trans->nr_updates, +- i - trans->updates, split); +- i++; +- } +- +- /* +- * When we have an extent that overwrites the start of another +- * update, trimming that extent will mean the iterator's +- * position has to change since the iterator position has to +- * match the extent's start pos - but we don't want to change +- * the iterator pos if some other code is using it, so we may +- * need to clone it: +- */ +- if (btree_iter_live(trans, i->iter)) { +- i->iter = bch2_trans_copy_iter(trans, i->iter); +- +- i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; +- bch2_trans_iter_put(trans, i->iter); +- } +- +- bch2_cut_front(n.k->k.p, i->k); +- bch2_btree_iter_set_pos(i->iter, n.k->k.p); +- } ++ trans_for_each_update(trans, i) ++ if (btree_insert_entry_cmp(&n, i) <= 0) ++ break; + ++ if (i < trans->updates + trans->nr_updates && ++ !btree_insert_entry_cmp(&n, i)) { ++ BUG_ON(i->trans_triggers_run); ++ *i = n; ++ } else + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); +- } + + return 0; + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 76d15a5dc62f..7672752b57d6 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1515,29 +1515,6 @@ static struct btree_iter *trans_get_update(struct btree_trans *trans, + return NULL; + } + +-static int trans_get_key(struct btree_trans *trans, +- enum btree_id btree_id, struct bpos pos, +- struct btree_iter **iter, +- struct bkey_s_c *k) +-{ +- unsigned flags = btree_id != BTREE_ID_alloc +- ? 
BTREE_ITER_SLOTS +- : BTREE_ITER_CACHED; +- int ret; +- +- *iter = trans_get_update(trans, btree_id, pos, k); +- if (*iter) +- return 1; +- +- *iter = bch2_trans_get_iter(trans, btree_id, pos, +- flags|BTREE_ITER_INTENT); +- *k = __bch2_btree_iter_peek(*iter, flags); +- ret = bkey_err(*k); +- if (ret) +- bch2_trans_iter_put(trans, *iter); +- return ret; +-} +- + static struct bkey_alloc_buf * + bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, + const struct bch_extent_ptr *ptr, +@@ -1617,9 +1594,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + struct bch_replicas_padded r; + int ret = 0; + +- ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k); +- if (ret < 0) +- return ret; ++ iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, POS(0, p.ec.idx), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; + + if (k.k->type != KEY_TYPE_stripe) { + bch2_fs_inconsistent(c, +@@ -1627,7 +1608,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + (u64) p.ec.idx); + bch2_inconsistent_error(c); + ret = -EIO; +- goto out; ++ goto err; + } + + if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { +@@ -1635,13 +1616,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); + ret = -EIO; +- goto out; ++ goto err; + } + + s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(s); + if (ret) +- goto out; ++ goto err; + + bkey_reassemble(&s->k_i, k); + stripe_blockcount_set(&s->v, p.ec.block, +@@ -1652,7 +1633,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; + update_replicas_list(trans, &r.e, sectors); +-out: ++err: + bch2_trans_iter_put(trans, iter); + return ret; + } +@@ -1834,10 +1815,13 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + int frags_referenced; + s64 ret; + +- ret = trans_get_key(trans, BTREE_ID_reflink, +- POS(0, idx), &iter, &k); +- if (ret < 0) +- return ret; ++ iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; + + sectors = min_t(u64, sectors, k.k->p.offset - idx); + +@@ -1990,86 +1974,27 @@ int bch2_trans_mark_update(struct btree_trans *trans, + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + +- if (!btree_node_type_is_extents(iter->btree_id)) { +- if (btree_iter_type(iter) != BTREE_ITER_CACHED) { +- old = bch2_btree_iter_peek_slot(iter); +- ret = bkey_err(old); +- if (ret) +- return ret; +- } else { +- struct bkey_cached *ck = (void *) iter->l[0].b; +- +- BUG_ON(!ck->valid); +- old = bkey_i_to_s_c(ck->k); +- } +- +- if (old.k->type == new->k.type) { +- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, +- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); +- } else { +- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, +- BTREE_TRIGGER_INSERT|flags) ?: +- bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, +- BTREE_TRIGGER_OVERWRITE|flags); +- } +- } else { +- struct btree_iter *copy; +- struct bkey _old; +- +- EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); +- +- bkey_init(&_old); +- old = (struct bkey_s_c) { &_old, NULL }; +- +- ret = bch2_trans_mark_key(trans, old, 
bkey_i_to_s_c(new), +- 0, new->k.size, +- BTREE_TRIGGER_INSERT); ++ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { ++ old = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(old); + if (ret) + return ret; ++ } else { ++ struct bkey_cached *ck = (void *) iter->l[0].b; + +- copy = bch2_trans_copy_iter(trans, iter); +- +- for_each_btree_key_continue(copy, 0, old, ret) { +- unsigned offset = 0; +- s64 sectors = -((s64) old.k->size); +- +- flags |= BTREE_TRIGGER_OVERWRITE; +- +- if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) +- break; +- +- switch (bch2_extent_overlap(&new->k, old.k)) { +- case BCH_EXTENT_OVERLAP_ALL: +- offset = 0; +- sectors = -((s64) old.k->size); +- break; +- case BCH_EXTENT_OVERLAP_BACK: +- offset = bkey_start_offset(&new->k) - +- bkey_start_offset(old.k); +- sectors = bkey_start_offset(&new->k) - +- old.k->p.offset; +- break; +- case BCH_EXTENT_OVERLAP_FRONT: +- offset = 0; +- sectors = bkey_start_offset(old.k) - +- new->k.p.offset; +- break; +- case BCH_EXTENT_OVERLAP_MIDDLE: +- offset = bkey_start_offset(&new->k) - +- bkey_start_offset(old.k); +- sectors = -((s64) new->k.size); +- flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; +- break; +- } +- +- BUG_ON(sectors >= 0); ++ BUG_ON(!ck->valid); ++ old = bkey_i_to_s_c(ck->k); ++ } + +- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), +- offset, sectors, flags); +- if (ret) +- break; +- } +- bch2_trans_iter_put(trans, copy); ++ if (old.k->type == new->k.type && ++ !btree_node_type_is_extents(iter->btree_id)) { ++ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ } else { ++ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, new->k.size, ++ BTREE_TRIGGER_INSERT|flags) ?: ++ bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, -((s64) old.k->size), ++ BTREE_TRIGGER_OVERWRITE|flags); + } + + return ret; +-- +cgit v1.2.3 + + +From bd4c4adc632b63508ebb6d72614232465cc0413f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 7 Jun 2021 13:39:21 -0400 +Subject: bcachefs: Simplify reflink trigger + +Now that we only mark entire extents, we can ditch the +"reflink_p_frag_references" code. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 67 +++------------------------------------------------ + 1 file changed, 3 insertions(+), 64 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 7672752b57d6..20862a4a77f2 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1079,32 +1079,6 @@ static int bch2_mark_stripe(struct bch_fs *c, + return 0; + } + +-static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p, +- u64 p_start, u64 p_end, +- u64 v_start, u64 v_end) +-{ +- if (p_start == p_end) +- return false; +- +- p_start += le64_to_cpu(p.v->idx); +- p_end += le64_to_cpu(p.v->idx); +- +- if (p_end <= v_start) +- return false; +- if (p_start >= v_end) +- return false; +- return true; +-} +- +-static int reflink_p_frag_references(struct bkey_s_c_reflink_p p, +- u64 start, u64 end, +- struct bkey_s_c k) +-{ +- return __reflink_p_frag_references(p, start, end, +- bkey_start_offset(k.k), +- k.k->p.offset); +-} +- + static int __bch2_mark_reflink_p(struct bch_fs *c, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, +@@ -1115,7 +1089,6 @@ static int __bch2_mark_reflink_p(struct bch_fs *c, + { + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; +- int frags_referenced; + + while (1) { + if (*r_idx >= c->reflink_gc_nr) +@@ -1128,20 +1101,6 @@ static int __bch2_mark_reflink_p(struct bch_fs *c, + (*r_idx)++; + } + +- frags_referenced = +- __reflink_p_frag_references(p, 0, front_frag, +- r->offset - r->size, r->offset) + +- __reflink_p_frag_references(p, back_frag, p.k->size, +- r->offset - r->size, r->offset); +- +- if (frags_referenced == 2) { +- BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); +- add = -add; +- } else if (frags_referenced == 1) { +- BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); +- add = 0; +- } +- + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; +@@ -1802,8 +1761,6 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, +- unsigned front_frag, +- unsigned back_frag, + unsigned flags) + { + struct bch_fs *c = trans->c; +@@ -1812,7 +1769,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_i *n; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; +- int frags_referenced; + s64 ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx), +@@ -1825,18 +1781,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + + sectors = min_t(u64, sectors, k.k->p.offset - idx); + +- frags_referenced = +- reflink_p_frag_references(p, 0, front_frag, k) + +- reflink_p_frag_references(p, back_frag, p.k->size, k); +- +- if (frags_referenced == 2) { +- BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); +- add = -add; +- } else if (frags_referenced == 1) { +- BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); +- goto out; +- } +- + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) +@@ -1866,7 +1810,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + ret = bch2_trans_update(trans, iter, n, 0); + if (ret) + goto err; +-out: ++ + ret = sectors; + err: + bch2_trans_iter_put(trans, iter); +@@ -1878,20 +1822,15 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + s64 sectors, unsigned flags) + { + u64 idx = le64_to_cpu(p.v->idx) + offset; +- unsigned front_frag, back_frag; + s64 ret = 0; + + if (sectors < 0) + sectors = -sectors; + +- BUG_ON(offset + sectors > p.k->size); +- +- front_frag = offset; +- back_frag = offset + sectors; ++ BUG_ON(offset || sectors != p.k->size); + + while (sectors) { +- ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, +- front_frag, back_frag, flags); ++ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); + if (ret < 0) + return ret; + +-- +cgit v1.2.3 + + +From 422813ef9716258c8aee389ee10c8e50d8a771f6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 7 Jun 2021 14:54:56 -0400 +Subject: bcachefs: Kill trans->updates2 + +Now that extent handling has been lifted to bch2_trans_update(), we +don't need to keep two different lists of updates. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 3 -- + fs/bcachefs/btree_types.h | 3 -- + fs/bcachefs/btree_update.h | 5 --- + fs/bcachefs/btree_update_leaf.c | 78 +++++++++-------------------------------- + 4 files changed, 16 insertions(+), 73 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5c57a6d26335..ee1f388f8775 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2320,7 +2320,6 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + trans->iters_touched &= trans->iters_live; + + trans->nr_updates = 0; +- trans->nr_updates2 = 0; + trans->mem_top = 0; + + trans->hooks = NULL; +@@ -2358,7 +2357,6 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) + + trans->iters = p; p += iters_bytes; + trans->updates = p; p += updates_bytes; +- trans->updates2 = p; p += updates_bytes; + } + + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, +@@ -2562,7 +2560,6 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) + return init_srcu_struct(&c->btree_trans_barrier) ?: + mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * nr + +- sizeof(struct btree_insert_entry) * nr + + sizeof(struct btree_insert_entry) * nr) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, + BTREE_TRANS_MEM_MAX); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 4997ce22e5e3..bcc8e6126c46 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -345,7 +345,6 @@ struct btree_insert_entry { + enum btree_id btree_id:8; + u8 level; + unsigned trans_triggers_run:1; +- unsigned is_extent:1; + struct bkey_i *k; + struct btree_iter *iter; + }; +@@ -381,7 +380,6 @@ struct btree_trans { + int srcu_idx; + + u8 nr_updates; +- u8 nr_updates2; + unsigned used_mempool:1; + unsigned error:1; + unsigned in_traverse_all:1; +@@ -396,7 +394,6 @@ struct btree_trans { + + struct btree_iter *iters; + struct btree_insert_entry *updates; +- struct btree_insert_entry *updates2; + + /* update path: */ + struct btree_trans_commit_hook *hooks; +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 56131ac516ce..cbfc8544def4 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -140,9 +140,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + (_i) < (_trans)->updates + (_trans)->nr_updates; \ + (_i)++) + +-#define trans_for_each_update2(_trans, _i) \ +- for ((_i) = (_trans)->updates2; \ +- (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ +- (_i)++) +- + #endif /* _BCACHEFS_BTREE_UPDATE_H */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 6976e470df6d..30a08f5e992b 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -32,7 +32,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) + { +- return i != trans->updates2 && ++ return i != trans->updates && + iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; + } + +@@ -222,7 +222,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct btree_insert_entry *i) + { +- BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); ++ BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos)); + BUG_ON(i->level != i->iter->level); + BUG_ON(i->btree_id != 
i->iter->btree_id); + } +@@ -400,7 +400,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + h = h->next; + } + +- trans_for_each_update2(trans, i) { ++ trans_for_each_update(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; +@@ -458,10 +458,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (bch2_journal_seq_verify) +- trans_for_each_update2(trans, i) ++ trans_for_each_update(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if (bch2_inject_invalid_keys) +- trans_for_each_update2(trans, i) ++ trans_for_each_update(trans, i) + i->k->k.version = MAX_VERSION; + } + +@@ -476,7 +476,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + if (unlikely(c->gc_pos.phase)) + bch2_trans_mark_gc(trans); + +- trans_for_each_update2(trans, i) ++ trans_for_each_update(trans, i) + do_btree_insert_one(trans, i); + err: + if (marking) { +@@ -504,7 +504,7 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree + + BUG_ON(iter->level); + +- trans_for_each_update2(trans, i) { ++ trans_for_each_update(trans, i) { + if (iter_l(i->iter)->b != b) + continue; + +@@ -535,7 +535,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + struct btree_iter *iter; + int ret; + +- trans_for_each_update2(trans, i) { ++ trans_for_each_update(trans, i) { + struct btree *b; + + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); +@@ -552,7 +552,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + } + } + +- trans_for_each_update2(trans, i) ++ trans_for_each_update(trans, i) + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + + ret = bch2_journal_preres_get(&c->journal, +@@ -592,7 +592,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + } + } + +- trans_for_each_update2(trans, i) { ++ trans_for_each_update(trans, i) { + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { +@@ -606,14 +606,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + } + bch2_btree_trans_verify_locks(trans); + +- trans_for_each_update2(trans, i) ++ trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_lock_for_insert(c, + iter_l(i->iter)->b, i->iter); + + ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + +- trans_for_each_update2(trans, i) ++ trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, + i->iter); +@@ -775,42 +775,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + +-static void bch2_trans_update2(struct btree_trans *trans, +- struct btree_insert_entry n) +-{ +- struct btree_insert_entry *i; +- +- btree_insert_entry_checks(trans, &n); +- +- EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); +- +- n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; +- +- trans_for_each_update2(trans, i) +- if (btree_insert_entry_cmp(&n, i) <= 0) +- break; +- +- if (i < trans->updates2 + trans->nr_updates2 && +- !btree_insert_entry_cmp(&n, i)) +- *i = n; +- else +- array_insert_item(trans->updates2, trans->nr_updates2, +- i - trans->updates2, n); +-} +- +-static int extent_update_to_keys(struct btree_trans *trans, +- struct btree_insert_entry n) +-{ +- n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, +- BTREE_ITER_INTENT| +- BTREE_ITER_NOT_EXTENTS); +- n.is_extent = false; +- +- 
bch2_trans_update2(trans, n); +- bch2_trans_iter_put(trans, n.iter); +- return 0; +-} +- + static int extent_handle_overwrites(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_i *insert, +@@ -945,14 +909,6 @@ int __bch2_trans_commit(struct btree_trans *trans) + } while (trans_trigger_run); + + trans_for_each_update(trans, i) { +- ret = i->is_extent +- ? extent_update_to_keys(trans, *i) +- : (bch2_trans_update2(trans, *i), 0); +- if (unlikely(ret)) +- goto out; +- } +- +- trans_for_each_update2(trans, i) { + ret = bch2_btree_iter_traverse(i->iter); + if (unlikely(ret)) { + trace_trans_restart_traverse(trans->ip, _RET_IP_, +@@ -1021,28 +977,27 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + .bkey_type = __btree_node_type(iter->level, iter->btree_id), + .btree_id = iter->btree_id, + .level = iter->level, +- .is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0, + .iter = iter, + .k = k + }; ++ bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0; + int ret = 0; + + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + + #ifdef CONFIG_BCACHEFS_DEBUG + BUG_ON(bkey_cmp(iter->pos, +- n.is_extent ? bkey_start_pos(&k->k) : k->k.p)); ++ is_extent ? bkey_start_pos(&k->k) : k->k.p)); + + trans_for_each_update(trans, i) { +- BUG_ON(bkey_cmp(i->iter->pos, +- i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p)); ++ BUG_ON(bkey_cmp(i->iter->pos, i->k->k.p)); + + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); + } + #endif + +- if (n.is_extent) { ++ if (is_extent) { + ret = bch2_extent_can_insert(trans, n.iter, n.k); + if (ret) + return ret; +@@ -1061,7 +1016,6 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); + bch2_trans_iter_put(trans, n.iter); +- n.is_extent = false; + } + + BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS); +-- +cgit v1.2.3 + + +From 1e7659957289a2c33245d7f9e601bff9131f3c37 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 28 Apr 2021 23:49:30 -0400 +Subject: bcachefs: Clean up key merging + +This patch simplifies the key merging code by getting rid of partial +merges - it's simpler and saner if we just don't merge extents when +they'd overflow k->size. 
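
For reference, the new all-or-nothing precondition (bch2_bkey_maybe_mergable, added further down in this patch) boils down to: same key type and version, the right key starting exactly where the left one ends, and a combined size that still fits in a key. A standalone sketch of that check over a simplified key type follows - the struct and the size limit are stand-ins, not the real bcachefs definitions, and the real helper additionally requires the type to have a key_merge op and merging to be enabled:

/* Simplified stand-ins, for illustration only - not the real bcachefs types. */
#include <stdbool.h>
#include <stdint.h>

#define TOY_KEY_SIZE_MAX ((1U << 20) - 1)   /* placeholder limit, not the kernel's value */

struct toy_key {
        uint8_t  type;
        uint64_t version;
        uint64_t offset;        /* end position of the extent */
        uint32_t size;          /* length, so start = offset - size */
};

/* Merge only when r starts exactly where l ends, the keys agree on type and
 * version, and the combined size cannot overflow - no partial merges. */
static bool toy_maybe_mergeable(const struct toy_key *l, const struct toy_key *r)
{
        return l->type == r->type &&
               l->version == r->version &&
               l->offset == r->offset - r->size &&
               (uint64_t) l->size + r->size <= TOY_KEY_SIZE_MAX;
}

int main(void)
{
        struct toy_key l = { .type = 1, .version = 7, .offset = 100, .size = 50 };
        struct toy_key r = { .type = 1, .version = 7, .offset = 150, .size = 50 };

        return toy_maybe_mergeable(&l, &r) ? 0 : 1;  /* adjacent, same type/version: mergeable */
}
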
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 19 +++--------------- + fs/bcachefs/bkey_methods.h | 29 +++++++++++++-------------- + fs/bcachefs/extents.c | 50 +++++++++++++++++++--------------------------- + fs/bcachefs/extents.h | 6 ++---- + fs/bcachefs/reflink.c | 22 ++++++++++---------- + fs/bcachefs/reflink.h | 3 +-- + 6 files changed, 50 insertions(+), 79 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index cf2e054cca2f..ff9d770aabea 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -84,7 +84,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + .val_to_text = key_type_inline_data_to_text, \ + } + +-static const struct bkey_ops bch2_bkey_ops[] = { ++const struct bkey_ops bch2_bkey_ops[] = { + #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, + BCH_BKEY_TYPES() + #undef x +@@ -292,24 +292,11 @@ bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) + : false; + } + +-enum merge_result bch2_bkey_merge(struct bch_fs *c, +- struct bkey_s l, struct bkey_s r) ++bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) + { + const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; +- enum merge_result ret; + +- if (bch2_key_merging_disabled || +- !ops->key_merge || +- l.k->type != r.k->type || +- bversion_cmp(l.k->version, r.k->version) || +- bpos_cmp(l.k->p, bkey_start_pos(r.k))) +- return BCH_MERGE_NOMERGE; +- +- ret = ops->key_merge(c, l, r); +- +- if (ret != BCH_MERGE_NOMERGE) +- l.k->needs_whiteout |= r.k->needs_whiteout; +- return ret; ++ return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r); + } + + static const struct old_bkey_type { +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +index bfa6f112aeed..3012035db1a3 100644 +--- a/fs/bcachefs/bkey_methods.h ++++ b/fs/bcachefs/bkey_methods.h +@@ -11,17 +11,6 @@ enum btree_node_type; + + extern const char * const bch2_bkey_types[]; + +-enum merge_result { +- BCH_MERGE_NOMERGE, +- +- /* +- * The keys were mergeable, but would have overflowed size - so instead +- * l was changed to the maximum size, and both keys were modified: +- */ +- BCH_MERGE_PARTIAL, +- BCH_MERGE_MERGE, +-}; +- + struct bkey_ops { + /* Returns reason for being invalid if invalid, else NULL: */ + const char * (*key_invalid)(const struct bch_fs *, +@@ -30,13 +19,14 @@ struct bkey_ops { + struct bkey_s_c); + void (*swab)(struct bkey_s); + bool (*key_normalize)(struct bch_fs *, struct bkey_s); +- enum merge_result (*key_merge)(struct bch_fs *, +- struct bkey_s, struct bkey_s); ++ bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); + void (*compat)(enum btree_id id, unsigned version, + unsigned big_endian, int write, + struct bkey_s); + }; + ++extern const struct bkey_ops bch2_bkey_ops[]; ++ + const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); + const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type); +@@ -57,8 +47,17 @@ void bch2_bkey_swab_val(struct bkey_s); + + bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); + +-enum merge_result bch2_bkey_merge(struct bch_fs *, +- struct bkey_s, struct bkey_s); ++static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) ++{ ++ return l->type == r->type && ++ !bversion_cmp(l->version, r->version) && ++ !bpos_cmp(l->p, bkey_start_pos(r)) && ++ (u64) l->size + r->size <= KEY_SIZE_MAX && ++ bch2_bkey_ops[l->type].key_merge && ++ 
!bch2_key_merging_disabled; ++} ++ ++bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + + void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index b07d39555eb6..5b9aaa568371 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -230,17 +230,16 @@ void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, + bch2_bkey_ptrs_to_text(out, c, k); + } + +-enum merge_result bch2_extent_merge(struct bch_fs *c, +- struct bkey_s _l, struct bkey_s _r) ++bool bch2_extent_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) + { + struct bkey_s_extent l = bkey_s_to_extent(_l); +- struct bkey_s_extent r = bkey_s_to_extent(_r); ++ struct bkey_s_c_extent r = bkey_s_c_to_extent(_r); + union bch_extent_entry *en_l = l.v->start; +- union bch_extent_entry *en_r = r.v->start; ++ const union bch_extent_entry *en_r = r.v->start; + struct bch_extent_crc_unpacked crc_l, crc_r; + + if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) +- return BCH_MERGE_NOMERGE; ++ return false; + + crc_l = bch2_extent_crc_unpack(l.k, NULL); + +@@ -248,7 +247,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + + if (extent_entry_type(en_l) != extent_entry_type(en_r)) +- return BCH_MERGE_NOMERGE; ++ return false; + + switch (extent_entry_type(en_l)) { + case BCH_EXTENT_ENTRY_ptr: { +@@ -259,20 +258,20 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, + if (lp->offset + crc_l.compressed_size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) +- return BCH_MERGE_NOMERGE; ++ return false; + + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); + + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) +- return BCH_MERGE_NOMERGE; ++ return false; + + break; + } + case BCH_EXTENT_ENTRY_stripe_ptr: + if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || + en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) +- return BCH_MERGE_NOMERGE; ++ return false; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: +@@ -283,30 +282,30 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, + if (crc_l.csum_type != crc_r.csum_type || + crc_l.compression_type != crc_r.compression_type || + crc_l.nonce != crc_r.nonce) +- return BCH_MERGE_NOMERGE; ++ return false; + + if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || + crc_r.offset) +- return BCH_MERGE_NOMERGE; ++ return false; + + if (!bch2_checksum_mergeable(crc_l.csum_type)) +- return BCH_MERGE_NOMERGE; ++ return false; + + if (crc_is_compressed(crc_l)) +- return BCH_MERGE_NOMERGE; ++ return false; + + if (crc_l.csum_type && + crc_l.uncompressed_size + + crc_r.uncompressed_size > c->sb.encoded_extent_max) +- return BCH_MERGE_NOMERGE; ++ return false; + + if (crc_l.uncompressed_size + crc_r.uncompressed_size > + bch2_crc_field_size_max[extent_entry_type(en_l)]) +- return BCH_MERGE_NOMERGE; ++ return false; + + break; + default: +- return BCH_MERGE_NOMERGE; ++ return false; + } + } + +@@ -334,8 +333,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, + } + + bch2_key_resize(l.k, l.k->size + r.k->size); +- +- return BCH_MERGE_MERGE; ++ return true; + } + + /* KEY_TYPE_reservation: */ +@@ -363,25 +361,17 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + r.v->nr_replicas); + } + +-enum merge_result bch2_reservation_merge(struct bch_fs *c, +- struct bkey_s _l, struct bkey_s _r) ++bool 
bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) + { + struct bkey_s_reservation l = bkey_s_to_reservation(_l); +- struct bkey_s_reservation r = bkey_s_to_reservation(_r); ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); + + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) +- return BCH_MERGE_NOMERGE; +- +- if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { +- bch2_key_resize(l.k, KEY_SIZE_MAX); +- bch2_cut_front_s(l.k->p, r.s); +- return BCH_MERGE_PARTIAL; +- } ++ return false; + + bch2_key_resize(l.k, l.k->size + r.k->size); +- +- return BCH_MERGE_MERGE; ++ return true; + } + + /* Extent checksum entries: */ +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 9999805f955e..3f6224f75ce8 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -394,8 +394,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + + const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +-enum merge_result bch2_extent_merge(struct bch_fs *, +- struct bkey_s, struct bkey_s); ++bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + + #define bch2_bkey_ops_extent (struct bkey_ops) { \ + .key_invalid = bch2_extent_invalid, \ +@@ -409,8 +408,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *, + + const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +-enum merge_result bch2_reservation_merge(struct bch_fs *, +- struct bkey_s, struct bkey_s); ++bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + + #define bch2_bkey_ops_reservation (struct bkey_ops) { \ + .key_invalid = bch2_reservation_invalid, \ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 6aa37726341d..ead31f9e31aa 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -42,24 +42,22 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); + } + +-enum merge_result bch2_reflink_p_merge(struct bch_fs *c, +- struct bkey_s _l, struct bkey_s _r) ++bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) + { + struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); +- struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); ++ struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); + +- if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) +- return BCH_MERGE_NOMERGE; ++ /* ++ * Disabled for now, the triggers code needs to be reworked for merging ++ * of reflink pointers to work: ++ */ ++ return false; + +- if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { +- bch2_key_resize(l.k, KEY_SIZE_MAX); +- bch2_cut_front_s(l.k->p, _r); +- return BCH_MERGE_PARTIAL; +- } ++ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) ++ return false; + + bch2_key_resize(l.k, l.k->size + r.k->size); +- +- return BCH_MERGE_MERGE; ++ return true; + } + + /* indirect extents */ +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index bfc785619ee8..68c5cb5a2780 100644 +--- a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -5,8 +5,7 @@ + const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +-enum merge_result bch2_reflink_p_merge(struct bch_fs *, +- struct 
bkey_s, struct bkey_s); ++bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + + #define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ +-- +cgit v1.2.3 + + +From c9396e45daa2bcd2f2ab5a7e469f7277c10f07e7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 28 Apr 2021 23:52:19 -0400 +Subject: bcachefs: Re-implement extent merging in transaction commit path + +We haven't had extent merging in quite some time. It used to be done by +the btree code when sorting btree nodes, but that was eliminated as part +of the work to separate extent handling from core btree code. + +This patch re-implements extent merging in the transaction commit path. +We don't currently have the ability to merge reflink pointers, we need +to do some work on the triggers code to be able to do that without +ending up with incorrect refcounts. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 117 +++++++++++++++++++++++++++------------- + 1 file changed, 80 insertions(+), 37 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 30a08f5e992b..482d583e9b6e 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -775,74 +775,117 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + ++static int __btree_delete_at(struct btree_trans *trans, enum btree_id btree_id, ++ struct bpos pos, unsigned trigger_flags) ++{ ++ struct btree_iter *iter; ++ struct bkey_i *update; ++ int ret; ++ ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ return ret; ++ ++ bkey_init(&update->k); ++ update->k.p = pos; ++ ++ iter = bch2_trans_get_iter(trans, btree_id, pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ bch2_trans_update(trans, iter, update, trigger_flags); ++ bch2_trans_iter_put(trans, iter); ++ return 0; ++} ++ + static int extent_handle_overwrites(struct btree_trans *trans, +- enum btree_id btree_id, +- struct bkey_i *insert, +- unsigned trigger_flags) ++ struct btree_insert_entry *i) + { ++ struct bch_fs *c = trans->c; + struct btree_iter *iter, *update_iter; +- struct bpos start = bkey_start_pos(&insert->k); ++ struct bpos start = bkey_start_pos(&i->k->k); + struct bkey_i *update; + struct bkey_s_c k; +- int ret; ++ int ret = 0; + +- for_each_btree_key(trans, iter, btree_id, start, +- BTREE_ITER_INTENT| +- BTREE_ITER_WITH_UPDATES, k, ret) { +- if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) +- break; ++ iter = bch2_trans_get_iter(trans, i->btree_id, start, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_NOT_EXTENTS); ++ k = bch2_btree_iter_peek(iter); ++ if (!k.k || (ret = bkey_err(k))) ++ goto out; + ++ if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) { ++ struct bpos l_pos = k.k->p; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto out; ++ ++ bkey_reassemble(update, k); ++ ++ if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) { ++ ret = __btree_delete_at(trans, i->btree_id, l_pos, ++ i->trigger_flags); ++ if (ret) ++ goto out; ++ ++ i->k = update; ++ goto next; ++ } ++ } ++ ++ if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k))) ++ goto next; ++ ++ while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) { + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) +- break; ++ goto out; + + 
bkey_reassemble(update, k); + + bch2_cut_back(start, update); + +- update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, ++ update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); +- bch2_trans_update(trans, update_iter, update, +- trigger_flags); ++ bch2_trans_update(trans, update_iter, update, i->trigger_flags); + bch2_trans_iter_put(trans, update_iter); + } + +- if (bkey_cmp(k.k->p, insert->k.p) < 0 || +- (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) { +- update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); +- if ((ret = PTR_ERR_OR_ZERO(update))) +- break; +- +- bkey_init(&update->k); +- update->k.p = k.k->p; +- +- update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, +- BTREE_ITER_NOT_EXTENTS| +- BTREE_ITER_INTENT); +- bch2_trans_update(trans, update_iter, update, +- trigger_flags); +- bch2_trans_iter_put(trans, update_iter); ++ if (bkey_cmp(k.k->p, i->k->k.p) <= 0) { ++ ret = __btree_delete_at(trans, i->btree_id, k.k->p, ++ i->trigger_flags); ++ if (ret) ++ goto out; + } + +- if (bkey_cmp(k.k->p, insert->k.p) > 0) { ++ if (bkey_cmp(k.k->p, i->k->k.p) > 0) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) +- break; ++ goto out; + + bkey_reassemble(update, k); +- bch2_cut_front(insert->k.p, update); ++ bch2_cut_front(i->k->k.p, update); + +- update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, ++ update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + bch2_trans_update(trans, update_iter, update, +- trigger_flags); ++ i->trigger_flags); + bch2_trans_iter_put(trans, update_iter); +- break; ++ goto out; + } ++next: ++ k = bch2_btree_iter_next(iter); ++ if (!k.k || (ret = bkey_err(k))) ++ goto out; + } ++ ++ bch2_bkey_merge(c, bkey_i_to_s(i->k), k); ++out: + bch2_trans_iter_put(trans, iter); + + return ret; +@@ -1002,7 +1045,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + if (ret) + return ret; + +- ret = extent_handle_overwrites(trans, n.btree_id, n.k, flags); ++ ret = extent_handle_overwrites(trans, &n); + if (ret) + return ret; + +@@ -1012,7 +1055,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + if (bkey_deleted(&n.k->k)) + return 0; + +- n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, ++ n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); + bch2_trans_iter_put(trans, n.iter); +-- +cgit v1.2.3 + + +From bbb3bc00e2313635d401cd195f549d8d9ca29a23 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 15 May 2021 00:37:37 -0400 +Subject: bcachefs: Improved extent merging + +Previously, checksummed extents could only be merged when the checksum +covered only the currently live data. + +xfstest generic/064 creates a test file, then uses finsert calls to +split the extent, then collapse calls to see if they get merged. But +without any reads to trigger the narrow_crcs path, each of the split +extents will still have a checksum for the entire original extent. + +This patch improves the extent merge path so that if either of the +extents we're attempting to merge has a checksum that covers the entire +merged extent, we just use that checksum. 
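
Per checksum entry, the merge path therefore picks one of three strategies, mirroring the conditions visible in the diff below. A standalone sketch of that decision - the crc struct is a simplified stand-in for bch_extent_crc_unpacked, illustration only:

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-in for bch_extent_crc_unpacked - illustration only. */
struct toy_crc {
        uint32_t offset;            /* start of live data within the checksummed region */
        uint32_t live_size;         /* sectors currently referenced by the key */
        uint32_t uncompressed_size; /* sectors covered by the checksum */
};

enum merge_strategy { USE_LEFT_CRC, USE_RIGHT_CRC, MERGE_CRCS };

/* Which checksum survives when the left and right extents are merged? */
static enum merge_strategy pick_crc(struct toy_crc l, struct toy_crc r)
{
        if (l.offset + l.live_size + r.live_size <= l.uncompressed_size)
                return USE_LEFT_CRC;   /* left checksum already covers the merged range */
        if (l.live_size <= r.offset)
                return USE_RIGHT_CRC;  /* right checksum region already covers the merged live data */
        return MERGE_CRCS;             /* fall back to merging the two checksums */
}

int main(void)
{
        /* The generic/064 case from the message: a 128-sector extent split in two,
         * both halves still carrying the original whole-extent checksum. */
        struct toy_crc l = { .offset = 0,  .live_size = 64, .uncompressed_size = 128 };
        struct toy_crc r = { .offset = 64, .live_size = 64, .uncompressed_size = 128 };

        printf("%d\n", pick_crc(l, r));  /* prints 0 (USE_LEFT_CRC) */
        return 0;
}
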
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 139 ++++++++++++++++++++++++++++---------------------- + 1 file changed, 79 insertions(+), 60 deletions(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 5b9aaa568371..704d6dc46ad5 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -234,102 +234,121 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) + { + struct bkey_s_extent l = bkey_s_to_extent(_l); + struct bkey_s_c_extent r = bkey_s_c_to_extent(_r); +- union bch_extent_entry *en_l = l.v->start; +- const union bch_extent_entry *en_r = r.v->start; +- struct bch_extent_crc_unpacked crc_l, crc_r; ++ union bch_extent_entry *en_l; ++ const union bch_extent_entry *en_r; ++ struct extent_ptr_decoded lp, rp; ++ bool use_right_ptr; ++ struct bch_dev *ca; + + if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) + return false; + +- crc_l = bch2_extent_crc_unpack(l.k, NULL); +- + extent_for_each_entry(l, en_l) { + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return false; ++ } + +- switch (extent_entry_type(en_l)) { +- case BCH_EXTENT_ENTRY_ptr: { +- const struct bch_extent_ptr *lp = &en_l->ptr; +- const struct bch_extent_ptr *rp = &en_r->ptr; +- struct bch_dev *ca; +- +- if (lp->offset + crc_l.compressed_size != rp->offset || +- lp->dev != rp->dev || +- lp->gen != rp->gen) +- return false; +- +- /* We don't allow extents to straddle buckets: */ +- ca = bch_dev_bkey_exists(c, lp->dev); +- +- if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) +- return false; ++ en_l = l.v->start; ++ en_r = r.v->start; ++ lp.crc = bch2_extent_crc_unpack(l.k, NULL); ++ rp.crc = bch2_extent_crc_unpack(r.k, NULL); ++ ++ while (__bkey_ptr_next_decode(l.k, extent_entry_last(l), lp, en_l) && ++ __bkey_ptr_next_decode(r.k, extent_entry_last(r), rp, en_r)) { ++ if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != ++ rp.ptr.offset + rp.crc.offset || ++ lp.ptr.dev != rp.ptr.dev || ++ lp.ptr.gen != rp.ptr.gen || ++ lp.has_ec != rp.has_ec) ++ return false; + +- break; +- } +- case BCH_EXTENT_ENTRY_stripe_ptr: +- if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || +- en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) +- return false; +- break; +- case BCH_EXTENT_ENTRY_crc32: +- case BCH_EXTENT_ENTRY_crc64: +- case BCH_EXTENT_ENTRY_crc128: +- crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); +- crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ /* Extents may not straddle buckets: */ ++ ca = bch_dev_bkey_exists(c, lp.ptr.dev); ++ if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) ++ return false; + +- if (crc_l.csum_type != crc_r.csum_type || +- crc_l.compression_type != crc_r.compression_type || +- crc_l.nonce != crc_r.nonce) +- return false; ++ if (lp.has_ec != rp.has_ec || ++ (lp.has_ec && ++ (lp.ec.block != rp.ec.block || ++ lp.ec.redundancy != rp.ec.redundancy || ++ lp.ec.idx != rp.ec.idx))) ++ return false; + +- if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || +- crc_r.offset) +- return false; ++ if (lp.crc.compression_type != rp.crc.compression_type || ++ lp.crc.nonce != rp.crc.nonce) ++ return false; + +- if (!bch2_checksum_mergeable(crc_l.csum_type)) ++ if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= ++ lp.crc.uncompressed_size) { ++ /* can use left extent's crc entry */ ++ } else if (lp.crc.live_size <= rp.crc.offset ) { ++ /* can use right extent's crc entry */ ++ } else { ++ /* check if checksums can be merged: */ ++ 
if (lp.crc.csum_type != rp.crc.csum_type || ++ lp.crc.nonce != rp.crc.nonce || ++ crc_is_compressed(lp.crc) || ++ !bch2_checksum_mergeable(lp.crc.csum_type)) + return false; + +- if (crc_is_compressed(crc_l)) ++ if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || ++ rp.crc.offset) + return false; + +- if (crc_l.csum_type && +- crc_l.uncompressed_size + +- crc_r.uncompressed_size > c->sb.encoded_extent_max) ++ if (lp.crc.csum_type && ++ lp.crc.uncompressed_size + ++ rp.crc.uncompressed_size > c->sb.encoded_extent_max) + return false; + +- if (crc_l.uncompressed_size + crc_r.uncompressed_size > ++ if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return false; +- +- break; +- default: +- return false; + } ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); + } + ++ use_right_ptr = false; + extent_for_each_entry(l, en_l) { + struct bch_extent_crc_unpacked crc_l, crc_r; + + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + ++ if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && ++ use_right_ptr) ++ en_l->ptr = en_r->ptr; ++ + if (!extent_entry_is_crc(en_l)) + continue; + ++ use_right_ptr = false; ++ + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + +- crc_l.csum = bch2_checksum_merge(crc_l.csum_type, +- crc_l.csum, +- crc_r.csum, +- crc_r.uncompressed_size << 9); +- +- crc_l.uncompressed_size += crc_r.uncompressed_size; +- crc_l.compressed_size += crc_r.compressed_size; +- +- bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, +- extent_entry_type(en_l)); ++ if (crc_l.offset + crc_l.live_size + crc_r.live_size <= ++ crc_l.uncompressed_size) { ++ /* can use left extent's crc entry */ ++ } else if (crc_l.live_size <= crc_r.offset ) { ++ /* can use right extent's crc entry */ ++ crc_r.offset -= crc_l.live_size; ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, ++ extent_entry_type(en_l)); ++ use_right_ptr = true; ++ } else { ++ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, ++ crc_l.csum, ++ crc_r.csum, ++ crc_r.uncompressed_size << 9); ++ ++ crc_l.uncompressed_size += crc_r.uncompressed_size; ++ crc_l.compressed_size += crc_r.compressed_size; ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, ++ extent_entry_type(en_l)); ++ } + } + + bch2_key_resize(l.k, l.k->size + r.k->size); +-- +cgit v1.2.3 + + +From 38edd7556513fa2a048ae5fa61b6b4234c788e45 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 15 May 2021 15:04:08 -0400 +Subject: bcachefs: Merging for indirect extents + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 95 +++++++++++++++++++++++++++------------------------ + fs/bcachefs/reflink.c | 8 +++++ + 2 files changed, 58 insertions(+), 45 deletions(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 704d6dc46ad5..3968f1fd7d27 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -230,33 +230,36 @@ void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, + bch2_bkey_ptrs_to_text(out, c, k); + } + +-bool bch2_extent_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) ++bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) + { +- struct bkey_s_extent l = bkey_s_to_extent(_l); +- struct bkey_s_c_extent r = bkey_s_c_to_extent(_r); ++ struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); ++ struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); + union bch_extent_entry *en_l; + const union bch_extent_entry *en_r; + struct extent_ptr_decoded 
lp, rp; + bool use_right_ptr; + struct bch_dev *ca; + +- if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) +- return false; +- +- extent_for_each_entry(l, en_l) { +- en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); +- ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; ++ while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return false; ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); + } + +- en_l = l.v->start; +- en_r = r.v->start; ++ if (en_l < l_ptrs.end || en_r < r_ptrs.end) ++ return false; ++ ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; + lp.crc = bch2_extent_crc_unpack(l.k, NULL); + rp.crc = bch2_extent_crc_unpack(r.k, NULL); + +- while (__bkey_ptr_next_decode(l.k, extent_entry_last(l), lp, en_l) && +- __bkey_ptr_next_decode(r.k, extent_entry_last(r), rp, en_r)) { ++ while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && ++ __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { + if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != + rp.ptr.offset + rp.crc.offset || + lp.ptr.dev != rp.ptr.dev || +@@ -312,43 +315,45 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) + } + + use_right_ptr = false; +- extent_for_each_entry(l, en_l) { +- struct bch_extent_crc_unpacked crc_l, crc_r; +- +- en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); +- ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; ++ while (en_l < l_ptrs.end) { + if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && + use_right_ptr) + en_l->ptr = en_r->ptr; + +- if (!extent_entry_is_crc(en_l)) +- continue; +- +- use_right_ptr = false; +- +- crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); +- crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); +- +- if (crc_l.offset + crc_l.live_size + crc_r.live_size <= +- crc_l.uncompressed_size) { +- /* can use left extent's crc entry */ +- } else if (crc_l.live_size <= crc_r.offset ) { +- /* can use right extent's crc entry */ +- crc_r.offset -= crc_l.live_size; +- bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, +- extent_entry_type(en_l)); +- use_right_ptr = true; +- } else { +- crc_l.csum = bch2_checksum_merge(crc_l.csum_type, +- crc_l.csum, +- crc_r.csum, +- crc_r.uncompressed_size << 9); +- +- crc_l.uncompressed_size += crc_r.uncompressed_size; +- crc_l.compressed_size += crc_r.compressed_size; +- bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, +- extent_entry_type(en_l)); ++ if (extent_entry_is_crc(en_l)) { ++ struct bch_extent_crc_unpacked crc_l = ++ bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ struct bch_extent_crc_unpacked crc_r = ++ bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ use_right_ptr = false; ++ ++ if (crc_l.offset + crc_l.live_size + crc_r.live_size <= ++ crc_l.uncompressed_size) { ++ /* can use left extent's crc entry */ ++ } else if (crc_l.live_size <= crc_r.offset ) { ++ /* can use right extent's crc entry */ ++ crc_r.offset -= crc_l.live_size; ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, ++ extent_entry_type(en_l)); ++ use_right_ptr = true; ++ } else { ++ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, ++ crc_l.csum, ++ crc_r.csum, ++ crc_r.uncompressed_size << 9); ++ ++ crc_l.uncompressed_size += crc_r.uncompressed_size; ++ crc_l.compressed_size += crc_r.compressed_size; ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, ++ extent_entry_type(en_l)); ++ } + } ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); + } + + bch2_key_resize(l.k, l.k->size + r.k->size); +diff --git a/fs/bcachefs/reflink.c 
b/fs/bcachefs/reflink.c +index ead31f9e31aa..ba700810a4be 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -82,6 +82,14 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + bch2_bkey_ptrs_to_text(out, c, k); + } + ++bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); ++ ++ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); ++} ++ + /* indirect inline data */ + + const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, +-- +cgit v1.2.3 + + +From 87abead71e56c97c5be906519cdaed3fc1ffe03e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 7 Jun 2021 16:50:30 -0400 +Subject: bcachefs: Always zero memory from bch2_trans_kmalloc() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ee1f388f8775..8305ff9994d7 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2290,6 +2290,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) + + p = trans->mem + trans->mem_top; + trans->mem_top += size; ++ memset(p, 0, size); + return p; + } + +-- +cgit v1.2.3 + + +From b4de8bc14e07c034aa7235baa597294285c46a83 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Jun 2021 23:51:09 -0400 +Subject: bcachefs: Fix overflow in journal_replay_entry_early + +If filesystem on disk was used by a version with a larger BCH_DATA_NR +thas the currently running version, we don't want this to cause a buffer +overrun. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 9bd6348842e0..f32414171aab 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -725,7 +725,7 @@ static int journal_replay_entry_early(struct bch_fs *c, + ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); + ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); + +- for (i = 0; i < nr_types; i++) { ++ for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { + ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); + ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); + ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); +-- +cgit v1.2.3 + + +From 6afb31efaedcb0fc506f9c020d3a3e0407bc4d65 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 12 Jun 2021 15:45:56 -0400 +Subject: bcachefs: Fix null ptr deref when splitting compressed extents + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 1 + + fs/bcachefs/btree_types.h | 6 +++++- + fs/bcachefs/btree_update_leaf.c | 37 +++++++++++++++++++++---------------- + fs/bcachefs/extent_update.c | 35 ----------------------------------- + fs/bcachefs/extent_update.h | 4 ---- + 5 files changed, 27 insertions(+), 56 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8305ff9994d7..6d9ab1ec08db 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2320,6 +2320,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + + trans->iters_touched &= trans->iters_live; + ++ trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->mem_top = 0; + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 
bcc8e6126c46..39130ab6d739 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -383,6 +383,11 @@ struct btree_trans { + unsigned used_mempool:1; + unsigned error:1; + unsigned in_traverse_all:1; ++ /* ++ * For when bch2_trans_update notices we'll be splitting a compressed ++ * extent: ++ */ ++ unsigned extra_journal_res; + + u64 iters_linked; + u64 iters_live; +@@ -680,7 +685,6 @@ enum btree_insert_ret { + BTREE_INSERT_OK, + /* leaf node needs to be split */ + BTREE_INSERT_BTREE_NODE_FULL, +- BTREE_INSERT_ENOSPC, + BTREE_INSERT_NEED_MARK_REPLICAS, + BTREE_INSERT_NEED_JOURNAL_RES, + BTREE_INSERT_NEED_JOURNAL_RECLAIM, +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 482d583e9b6e..7939fbbb1863 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -696,10 +696,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, + ret = -EINTR; + } + break; +- case BTREE_INSERT_ENOSPC: +- BUG_ON(flags & BTREE_INSERT_NOFAIL); +- ret = -ENOSPC; +- break; + case BTREE_INSERT_NEED_MARK_REPLICAS: + bch2_trans_unlock(trans); + +@@ -805,7 +801,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, + struct bpos start = bkey_start_pos(&i->k->k); + struct bkey_i *update; + struct bkey_s_c k; +- int ret = 0; ++ int ret = 0, compressed_sectors; + + iter = bch2_trans_get_iter(trans, i->btree_id, start, + BTREE_ITER_INTENT| +@@ -839,6 +835,16 @@ static int extent_handle_overwrites(struct btree_trans *trans, + goto next; + + while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) { ++ /* ++ * If we're going to be splitting a compressed extent, note it ++ * so that __bch2_trans_commit() can increase our disk ++ * reservation: ++ */ ++ if (bkey_cmp(bkey_start_pos(k.k), start) < 0 && ++ bkey_cmp(k.k->p, i->k->k.p) > 0 && ++ (compressed_sectors = bch2_bkey_sectors_compressed(k))) ++ trans->extra_journal_res += compressed_sectors; ++ + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) +@@ -976,6 +982,15 @@ int __bch2_trans_commit(struct btree_trans *trans) + trans->journal_preres_u64s += u64s; + trans->journal_u64s += u64s; + } ++ ++ if (trans->extra_journal_res) { ++ ret = bch2_disk_reservation_add(trans->c, trans->disk_res, ++ trans->extra_journal_res, ++ (trans->flags & BTREE_INSERT_NOFAIL) ++ ? BCH_DISK_RESERVATION_NOFAIL : 0); ++ if (ret) ++ goto err; ++ } + retry: + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + +@@ -1029,22 +1044,12 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + + #ifdef CONFIG_BCACHEFS_DEBUG +- BUG_ON(bkey_cmp(iter->pos, +- is_extent ? 
bkey_start_pos(&k->k) : k->k.p)); +- +- trans_for_each_update(trans, i) { +- BUG_ON(bkey_cmp(i->iter->pos, i->k->k.p)); +- ++ trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); +- } + #endif + + if (is_extent) { +- ret = bch2_extent_can_insert(trans, n.iter, n.k); +- if (ret) +- return ret; +- + ret = extent_handle_overwrites(trans, &n); + if (ret) + return ret; +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index bb4b2b4352e0..ef4aaf1c30ed 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -173,38 +173,3 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) + + return !bkey_cmp(end, k->k.p); + } +- +-enum btree_insert_ret +-bch2_extent_can_insert(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) +-{ +- struct bkey_s_c k; +- int ret, sectors; +- +- k = bch2_btree_iter_peek_slot(iter); +- ret = bkey_err(k); +- if (ret) +- return ret; +- +- /* Check if we're splitting a compressed extent: */ +- +- if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && +- bkey_cmp(insert->k.p, k.k->p) < 0 && +- (sectors = bch2_bkey_sectors_compressed(k))) { +- int flags = trans->flags & BTREE_INSERT_NOFAIL +- ? BCH_DISK_RESERVATION_NOFAIL : 0; +- +- switch (bch2_disk_reservation_add(trans->c, trans->disk_res, +- sectors, flags)) { +- case 0: +- break; +- case -ENOSPC: +- return BTREE_INSERT_ENOSPC; +- default: +- BUG(); +- } +- } +- +- return BTREE_INSERT_OK; +-} +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +index 38dc084627d2..2fa4602967e0 100644 +--- a/fs/bcachefs/extent_update.h ++++ b/fs/bcachefs/extent_update.h +@@ -9,8 +9,4 @@ int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, + int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); + int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); + +-enum btree_insert_ret +-bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, +- struct bkey_i *); +- + #endif /* _BCACHEFS_EXTENT_UPDATE_H */ +-- +cgit v1.2.3 + + +From 53fb1cfc12e0df9471179f958d8ed339fbb92f2d Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Thu, 10 Jun 2021 07:52:42 -0400 +Subject: bcachefs: mount: fix null deref with null devname + + - Fix null deref on mount when given a null device name. + - Move the dev_name checks to return EINVAL when it is invalid. 
+ +Signed-off-by: Dan Robertson +--- + fs/bcachefs/fs.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 78691b0bb3d5..0c3112619677 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1324,9 +1324,6 @@ static char **split_devs(const char *_dev_name, unsigned *nr) + char *dev_name = NULL, **devs = NULL, *s; + size_t i, nr_devs = 0; + +- if (strlen(_dev_name) == 0) +- return NULL; +- + dev_name = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) + return NULL; +@@ -1502,6 +1499,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, + if (ret) + return ERR_PTR(ret); + ++ if (!dev_name || strlen(dev_name) == 0) ++ return ERR_PTR(-EINVAL); ++ + devs = split_devs(dev_name, &nr_devs); + if (!devs) + return ERR_PTR(-ENOMEM); +-- +cgit v1.2.3 + + +From 2f687654710f4c6864e1d737b4b1727ffc208473 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 12 Jun 2021 17:20:02 -0400 +Subject: bcachefs: Allow shorter JSET_ENTRY_dev_usage entries + +If the last entry(ies) would be all zeros, there's no need to write them +out - the read path already handles that. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 2da6839fcdc0..66a0e267b3f4 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -450,7 +450,7 @@ static int journal_entry_validate_dev_usage(struct bch_fs *c, + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); +- unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */ ++ unsigned expected = sizeof(*u); + unsigned dev; + int ret = 0; + +-- +cgit v1.2.3 + + +From 3b23ef39c2c732203ffb70b0c0cebfc094122472 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Jun 2021 20:15:50 -0400 +Subject: bcachefs: Kill bch2_btree_iter_peek_cached() + +It's now been rolled into bch2_btree_iter_peek_slot() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 65 +++++++++++++++++++++--------------------------- + fs/bcachefs/btree_iter.h | 11 +++----- + fs/bcachefs/buckets.c | 15 +++-------- + fs/bcachefs/inode.c | 17 ++++++------- + 4 files changed, 42 insertions(+), 66 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 6d9ab1ec08db..a5982c569767 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1816,35 +1816,54 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) + + struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + { +- struct bpos search_key = btree_iter_search_key(iter); ++ struct bpos search_key; + struct bkey_s_c k; + int ret; + +- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS && ++ btree_iter_type(iter) != BTREE_ITER_CACHED); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + +- btree_iter_set_search_pos(iter, search_key); +- + /* extents can't span inode numbers: */ + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && +- iter->pos.offset == KEY_OFFSET_MAX) { ++ unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + } + ++ search_key = btree_iter_search_key(iter); ++ 
btree_iter_set_search_pos(iter, search_key); ++ + ret = btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + +- if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { +- struct bkey_i *next_update = btree_trans_peek_updates(iter, search_key); ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED || ++ !(iter->flags & BTREE_ITER_IS_EXTENTS)) { ++ struct bkey_i *next_update; ++ struct bkey_cached *ck; + +- k = btree_iter_level_peek_all(iter, &iter->l[0]); +- EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); ++ switch (btree_iter_type(iter)) { ++ case BTREE_ITER_KEYS: ++ k = btree_iter_level_peek_all(iter, &iter->l[0]); ++ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); ++ break; ++ case BTREE_ITER_CACHED: ++ ck = (void *) iter->l[0].b; ++ EBUG_ON(iter->btree_id != ck->key.btree_id || ++ bkey_cmp(iter->pos, ck->key.pos)); ++ BUG_ON(!ck->valid); + ++ k = bkey_i_to_s_c(ck->k); ++ break; ++ case BTREE_ITER_NODES: ++ BUG(); ++ } ++ ++ next_update = btree_trans_peek_updates(iter, search_key); + if (next_update && + (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { + iter->k = next_update->k; +@@ -1921,34 +1940,6 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) + return bch2_btree_iter_peek_slot(iter); + } + +-struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) +-{ +- struct bkey_i *next_update; +- struct bkey_cached *ck; +- int ret; +- +- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); +- bch2_btree_iter_verify(iter); +- +- next_update = btree_trans_peek_updates(iter, iter->pos); +- if (next_update && !bpos_cmp(next_update->k.p, iter->pos)) +- return bkey_i_to_s_c(next_update); +- +- ret = btree_iter_traverse(iter); +- if (unlikely(ret)) +- return bkey_s_c_err(ret); +- +- ck = (void *) iter->l[0].b; +- +- EBUG_ON(iter->btree_id != ck->key.btree_id || +- bkey_cmp(iter->pos, ck->key.pos)); +- BUG_ON(!ck->valid); +- +- iter->should_be_locked = true; +- +- return bkey_i_to_s_c(ck->k); +-} +- + static inline void bch2_btree_iter_init(struct btree_trans *trans, + struct btree_iter *iter, enum btree_id btree_id) + { +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index ba98cfea4d60..27c685a482ec 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -160,8 +160,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); + +-struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); +- + bool bch2_btree_iter_advance(struct btree_iter *); + bool bch2_btree_iter_rewind(struct btree_iter *); + +@@ -224,12 +222,9 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) + static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, + unsigned flags) + { +- if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) +- return bch2_btree_iter_peek_cached(iter); +- else +- return flags & BTREE_ITER_SLOTS +- ? bch2_btree_iter_peek_slot(iter) +- : bch2_btree_iter_peek(iter); ++ return flags & BTREE_ITER_SLOTS ++ ? 
bch2_btree_iter_peek_slot(iter) ++ : bch2_btree_iter_peek(iter); + } + + static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 20862a4a77f2..6d5fe398007a 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1913,17 +1913,10 @@ int bch2_trans_mark_update(struct btree_trans *trans, + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + +- if (btree_iter_type(iter) != BTREE_ITER_CACHED) { +- old = bch2_btree_iter_peek_slot(iter); +- ret = bkey_err(old); +- if (ret) +- return ret; +- } else { +- struct bkey_cached *ck = (void *) iter->l[0].b; +- +- BUG_ON(!ck->valid); +- old = bkey_i_to_s_c(ck->k); +- } ++ old = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(old); ++ if (ret) ++ return ret; + + if (old.k->type == new->k.type && + !btree_node_type_is_extents(iter->btree_id)) { +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 17d8eb5223cd..59edb4cea5f1 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -302,7 +302,7 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, + + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_CACHED|flags); +- k = bch2_btree_iter_peek_cached(iter); ++ k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -600,15 +600,12 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + retry: + bch2_trans_begin(&trans); + +- if (cached) { +- iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), +- BTREE_ITER_CACHED|BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_cached(iter); +- } else { +- iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_slot(iter); +- } ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), ++ (cached ++ ? BTREE_ITER_CACHED ++ : BTREE_ITER_SLOTS)| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); + + ret = bkey_err(k); + if (ret) +-- +cgit v1.2.3 + + +From 68156540104302814f64cd0f3129bb7e5804a771 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Jun 2021 23:33:27 -0400 +Subject: bcachefs: Don't underflow c->sectors_available + +This rarely used error path should've been checking for underflow - +oops. 
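
The fix replaces the plain atomic subtract with a compare-and-exchange loop that clamps the counter at zero. A minimal userspace sketch of that pattern using C11 atomics - not the kernel code, just the shape of the loop:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Subtract delta from *v, but never let the counter go below zero - the same
 * clamp-at-zero cmpxchg loop the fix applies to c->sectors_available. */
static void sub_clamped(_Atomic int64_t *v, int64_t delta)
{
        int64_t old = atomic_load(v);
        int64_t new;

        do {
                new = old - delta;
                if (new < 0)
                        new = 0;
        } while (!atomic_compare_exchange_weak(v, &old, new));
}

int main(void)
{
        _Atomic int64_t sectors_available = 10;

        sub_clamped(&sectors_available, 25);  /* would underflow; clamps to 0 instead */
        printf("%lld\n", (long long) atomic_load(&sectors_available));  /* prints 0 */
        return 0;
}
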
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 6d5fe398007a..84f280b8525c 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1433,7 +1433,14 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + */ + should_not_have_added = added - (s64) disk_res_sectors; + if (unlikely(should_not_have_added > 0)) { +- atomic64_sub(should_not_have_added, &c->sectors_available); ++ u64 old, new, v = atomic64_read(&c->sectors_available); ++ ++ do { ++ old = v; ++ new = max_t(s64, 0, old - should_not_have_added); ++ } while ((v = atomic64_cmpxchg(&c->sectors_available, ++ old, new)) != old); ++ + added -= should_not_have_added; + warn = true; + } +-- +cgit v1.2.3 + + +From 26cc4c48d979189fd2d2a596493766215f96ed62 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 12 Jun 2021 22:33:53 -0400 +Subject: bcachefs: Clear iter->should_be_locked in bch2_trans_reset + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index a5982c569767..c8f503190ef5 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2303,9 +2303,11 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + { + struct btree_iter *iter; + +- trans_for_each_iter(trans, iter) ++ trans_for_each_iter(trans, iter) { + iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| + BTREE_ITER_SET_POS_AFTER_COMMIT); ++ iter->should_be_locked = false; ++ } + + bch2_trans_unlink_iters(trans); + +-- +cgit v1.2.3 + + +From 2596ffb9415f492d006480a8fbacd79e74e7c4cb Mon Sep 17 00:00:00 2001 +From: jpsollie +Date: Sun, 13 Jun 2021 22:01:08 +0200 +Subject: bcachefs: fix a possible bcachefs checksum mapping error opt-checksum + enum to type-checksum enum + +This fixes some rare cases where the metadata checksum option specified may map to the wrong actual checksum type. + +Signed-Off By: Janpieter Sollie +--- + fs/bcachefs/super-io.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 977885166d55..c771b92d9496 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -680,7 +680,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) + + sb->offset = sb->layout.sb_offset[idx]; + +- SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); ++ SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); + sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), + null_nonce(), sb); + +-- +cgit v1.2.3 + + +From dd7c301da98fd04688420c066ef53b33a9931b29 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Jun 2021 14:47:26 -0400 +Subject: bcachefs: Fix a memory leak in dio write path + +Commit c42bca92be928ce7dece5fc04cf68d0e37ee6718 "bio: don't copy bvec +for direct IO" changed bio_iov_iter_get_pages() to point bio->bi_iovec +at the incoming biovec, meaning if we already allocated one, it'll be +leaked. 
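+
+As a sketch of the idea (the calls mirror the hunk below; nr_vecs is just
+a local name added here for readability, this is not extra code in the
+patch): when the iov_iter is already backed by a bvec array, the bio
+needs no preallocated vecs at all, because bio_iov_iter_get_pages() will
+point the bio at the caller's array:
+
+    unsigned nr_vecs = iov_iter_is_bvec(iter)
+            ? 0
+            : iov_iter_npages(iter, BIO_MAX_VECS);
+
+    bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, &c->dio_write_bioset);
+
+Preallocating vecs in the bvec case would leave that allocation orphaned
+once bio_iov_iter_get_pages() swaps in the caller's biovec, which is the
+leak being fixed here.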
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 50e0b5af9b24..ae35c6042bdf 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2035,7 +2035,9 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) + } + + bio = bio_alloc_bioset(GFP_KERNEL, +- iov_iter_npages(iter, BIO_MAX_VECS), ++ iov_iter_is_bvec(iter) ++ ? 0 ++ : iov_iter_npages(iter, BIO_MAX_VECS), + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); + init_completion(&dio->done); +-- +cgit v1.2.3 + + +From 2f526b04873c449b6953d71a7ba4a5fbd9e1333e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Jun 2021 16:32:44 -0400 +Subject: bcachefs: Make sure bch2_trans_mark_update uses correct iter flags + +Now that bch2_btree_iter_peek_with_updates() has been removed in favor +of BTREE_ITER_WITH_UPDATES, we need to make sure it's not used where we +don't want it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 84f280b8525c..0b4fb05760bf 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1912,7 +1912,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, + unsigned flags) + { + struct bkey_s_c old; +- int ret; ++ int iter_flags, ret; + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; +@@ -1920,7 +1920,13 @@ int bch2_trans_mark_update(struct btree_trans *trans, + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + ++ iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; ++ iter->flags &= ~BTREE_ITER_WITH_UPDATES; ++ + old = bch2_btree_iter_peek_slot(iter); ++ ++ iter->flags |= iter_flags; ++ + ret = bkey_err(old); + if (ret) + return ret; +-- +cgit v1.2.3 + + +From 47ff0a4a59c88c00200eaa4d86c7a00930c3a767 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Jun 2021 16:35:03 -0400 +Subject: bcachefs: Kill __btree_delete_at() + +With trans->updates2 gone, we can now drop this helper and use +bch2_btree_delete_at() instead. 
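+
+Usage sketch, mirroring the callers updated below rather than new API:
+deleting a key now means queueing a whiteout through the regular update
+path and letting the caller commit, e.g.
+
+    ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                          bch2_btree_delete_at(&trans, iter, 0));
+
+bch2_btree_delete_at() itself now just allocates a zero-size key at
+iter->pos with bch2_trans_kmalloc()/bkey_init() and hands it to
+bch2_trans_update(); it no longer commits on its own.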
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 60 +++++++++++++---------------------------- + fs/bcachefs/fsck.c | 2 +- + fs/bcachefs/tests.c | 9 ++++--- + 3 files changed, 26 insertions(+), 45 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 7939fbbb1863..03fd8d00e642 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -771,28 +771,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + +-static int __btree_delete_at(struct btree_trans *trans, enum btree_id btree_id, +- struct bpos pos, unsigned trigger_flags) +-{ +- struct btree_iter *iter; +- struct bkey_i *update; +- int ret; +- +- update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); +- if ((ret = PTR_ERR_OR_ZERO(update))) +- return ret; +- +- bkey_init(&update->k); +- update->k.p = pos; +- +- iter = bch2_trans_get_iter(trans, btree_id, pos, +- BTREE_ITER_NOT_EXTENTS| +- BTREE_ITER_INTENT); +- bch2_trans_update(trans, iter, update, trigger_flags); +- bch2_trans_iter_put(trans, iter); +- return 0; +-} +- + static int extent_handle_overwrites(struct btree_trans *trans, + struct btree_insert_entry *i) + { +@@ -812,8 +790,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, + goto out; + + if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) { +- struct bpos l_pos = k.k->p; +- + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto out; +@@ -821,8 +797,11 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bkey_reassemble(update, k); + + if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) { +- ret = __btree_delete_at(trans, i->btree_id, l_pos, +- i->trigger_flags); ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ ret = bch2_btree_delete_at(trans, update_iter, ++ i->trigger_flags); ++ bch2_trans_iter_put(trans, update_iter); ++ + if (ret) + goto out; + +@@ -862,8 +841,11 @@ static int extent_handle_overwrites(struct btree_trans *trans, + } + + if (bkey_cmp(k.k->p, i->k->k.p) <= 0) { +- ret = __btree_delete_at(trans, i->btree_id, k.k->p, +- i->trigger_flags); ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ ret = bch2_btree_delete_at(trans, update_iter, ++ i->trigger_flags); ++ bch2_trans_iter_put(trans, update_iter); ++ + if (ret) + goto out; + } +@@ -876,12 +858,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bkey_reassemble(update, k); + bch2_cut_front(i->k->k.p, update); + +- update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, +- BTREE_ITER_NOT_EXTENTS| +- BTREE_ITER_INTENT); +- bch2_trans_update(trans, update_iter, update, +- i->trigger_flags); +- bch2_trans_iter_put(trans, update_iter); ++ bch2_trans_update(trans, iter, update, i->trigger_flags); + goto out; + } + next: +@@ -1127,16 +1104,17 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, + } + + int bch2_btree_delete_at(struct btree_trans *trans, +- struct btree_iter *iter, unsigned flags) ++ struct btree_iter *iter, unsigned trigger_flags) + { +- struct bkey_i k; ++ struct bkey_i *k; + +- bkey_init(&k.k); +- k.k.p = iter->pos; ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); + +- return bch2_trans_update(trans, iter, &k, 0) ?: +- bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL|flags); ++ bkey_init(&k->k); ++ k->k.p = iter->pos; ++ return bch2_trans_update(trans, iter, k, trigger_flags); + } + + int bch2_btree_delete_range_trans(struct btree_trans 
*trans, enum btree_id id, +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 89a130d9c537..1bb595f4003a 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -761,7 +761,7 @@ retry: + mode_to_type(w.inode.bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { +- ret = lockrestart_do(&trans, ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_delete_at(&trans, iter, 0)); + if (ret) + goto err; +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 63f4a83ad1de..59f34b40fd5b 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -54,14 +54,16 @@ static int test_delete(struct bch_fs *c, u64 nr) + } + + pr_info("deleting once"); +- ret = bch2_btree_delete_at(&trans, iter, 0); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_btree_delete_at(&trans, iter, 0)); + if (ret) { + bch_err(c, "delete error (first) in test_delete: %i", ret); + goto err; + } + + pr_info("deleting twice"); +- ret = bch2_btree_delete_at(&trans, iter, 0); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_btree_delete_at(&trans, iter, 0)); + if (ret) { + bch_err(c, "delete error (second) in test_delete: %i", ret); + goto err; +@@ -101,7 +103,8 @@ static int test_delete_written(struct bch_fs *c, u64 nr) + + bch2_journal_flush_all_pins(&c->journal); + +- ret = bch2_btree_delete_at(&trans, iter, 0); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_btree_delete_at(&trans, iter, 0)); + if (ret) { + bch_err(c, "delete error in test_delete_written: %i", ret); + goto err; +-- +cgit v1.2.3 + + +From b72351fda3e3b6bf6a01ee83a91099d284b1c268 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Jun 2021 18:16:10 -0400 +Subject: bcachefs: Improve iter->should_be_locked + +Adding iter->should_be_locked introduced a regression where it ended up +not being set on the iterator passed to bch2_btree_update_start(), which +is definitely not what we want. + +This patch requires it to be set when calling bch2_trans_update(), and +adds various fixups to make that happen. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.h | 6 ++++++ + fs/bcachefs/btree_update_interior.c | 2 ++ + fs/bcachefs/btree_update_leaf.c | 13 +++++++++---- + fs/bcachefs/buckets.c | 2 +- + fs/bcachefs/extent_update.c | 4 ++++ + fs/bcachefs/fs-common.c | 3 ++- + fs/bcachefs/fs-io.c | 3 ++- + fs/bcachefs/fsck.c | 12 +++++++++--- + fs/bcachefs/recovery.c | 15 ++++----------- + fs/bcachefs/reflink.c | 6 +++--- + fs/bcachefs/tests.c | 16 ++++++++-------- + 11 files changed, 50 insertions(+), 32 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 27c685a482ec..6efea281d87f 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -176,6 +176,12 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos + iter->should_be_locked = false; + } + ++static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) ++{ ++ BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); ++ iter->pos = bkey_start_pos(&iter->k); ++} ++ + static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) + { + return iter->child_idx == U8_MAX ? 
NULL +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 2d8093d1bf00..89011f9f89ed 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -937,6 +937,8 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level, + int journal_flags = 0; + int ret = 0; + ++ BUG_ON(!iter->should_be_locked); ++ + if (flags & BTREE_INSERT_JOURNAL_RESERVED) + journal_flags |= JOURNAL_RES_GET_RESERVED; + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 03fd8d00e642..4bf386640779 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -836,6 +836,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, + update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(update_iter); ++ if (ret) ++ goto out; ++ + bch2_trans_update(trans, update_iter, update, i->trigger_flags); + bch2_trans_iter_put(trans, update_iter); + } +@@ -1019,6 +1023,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + int ret = 0; + + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); ++ BUG_ON(!iter->should_be_locked); + + #ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) +@@ -1082,7 +1087,8 @@ int __bch2_btree_insert(struct btree_trans *trans, + iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + +- ret = bch2_trans_update(trans, iter, k, 0); ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, k, 0); + bch2_trans_iter_put(trans, iter); + return ret; + } +@@ -1127,13 +1133,12 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + + iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); + retry: +- while ((k = bch2_btree_iter_peek(iter)).k && ++ while ((bch2_trans_begin(trans), ++ (k = bch2_btree_iter_peek(iter)).k) && + !(ret = bkey_err(k)) && + bkey_cmp(iter->pos, end) < 0) { + struct bkey_i delete; + +- bch2_trans_begin(trans); +- + bkey_init(&delete.k); + + /* +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 0b4fb05760bf..b06105aea216 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1813,7 +1813,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + set_bkey_val_u64s(&n->k, 0); + } + +- bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); ++ bch2_btree_iter_set_pos_to_extent_start(iter); + ret = bch2_trans_update(trans, iter, n, 0); + if (ret) + goto err; +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index ef4aaf1c30ed..4a8dd085f7fb 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -104,6 +104,10 @@ int bch2_extent_atomic_end(struct btree_iter *iter, + unsigned nr_iters = 0; + int ret; + ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ + *end = insert->k.p; + + /* extent_update_to_keys(): */ +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 00a63fecb976..60c54438074e 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -85,7 +85,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + inode_iter->snapshot = U32_MAX; + bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + +- ret = bch2_inode_write(trans, inode_iter, new_inode); ++ ret = bch2_btree_iter_traverse(inode_iter) ?: ++ bch2_inode_write(trans, inode_iter, new_inode); + err: + 
bch2_trans_iter_put(trans, inode_iter); + bch2_trans_iter_put(trans, dir_iter); +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index ae35c6042bdf..b306fd59bbf1 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2620,7 +2620,8 @@ reassemble: + BUG_ON(ret); + } + +- ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?: ++ ret = bch2_btree_iter_traverse(del) ?: ++ bch2_trans_update(&trans, del, &delete, trigger_flags) ?: + bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: + bch2_trans_commit(&trans, &disk_res, + &inode->ei_journal_seq, +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 1bb595f4003a..7ea1a41ac637 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -78,7 +78,8 @@ static int __write_inode(struct btree_trans *trans, + bch2_trans_get_iter(trans, BTREE_ID_inodes, + SPOS(0, inode->bi_inum, snapshot), + BTREE_ITER_INTENT); +- int ret = bch2_inode_write(trans, inode_iter, inode); ++ int ret = bch2_btree_iter_traverse(inode_iter) ?: ++ bch2_inode_write(trans, inode_iter, inode); + bch2_trans_iter_put(trans, inode_iter); + return ret; + } +@@ -305,7 +306,8 @@ static int hash_redo_key(struct btree_trans *trans, + + bkey_init(&delete->k); + delete->k.p = k_iter->pos; +- return bch2_trans_update(trans, k_iter, delete, 0) ?: ++ return bch2_btree_iter_traverse(k_iter) ?: ++ bch2_trans_update(trans, k_iter, delete, 0) ?: + bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); + } + +@@ -491,6 +493,7 @@ static int check_inode(struct btree_trans *trans, + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, ++ bch2_btree_iter_traverse(iter) ?: + bch2_inode_write(trans, iter, &u)); + if (ret) + bch_err(c, "error in fsck: error %i " +@@ -562,7 +565,8 @@ static int fix_overlapping_extent(struct btree_trans *trans, + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + + BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); +- ret = bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?: ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); +@@ -886,6 +890,7 @@ retry: + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, ++ bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(&trans, iter, &n->k_i, 0)); + kfree(n); + if (ret) +@@ -1338,6 +1343,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, ++ bch2_btree_iter_traverse(iter) ?: + bch2_inode_write(&trans, iter, &u)); + if (ret) + bch_err(c, "error in fsck: error %i updating inode", ret); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index f32414171aab..c6fa4ca31ae9 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -509,16 +509,8 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + + iter = bch2_trans_get_node_iter(trans, id, k->k.p, + BTREE_MAX_DEPTH, level, +- BTREE_ITER_INTENT); +- +- /* +- * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run +- * extent_handle_overwrites() and extent_update_to_keys() - but we don't +- * want that here, journal replay is supposed to treat extents like +- * regular keys: +- */ +- BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); +- ++ BTREE_ITER_INTENT| ++ BTREE_ITER_NOT_EXTENTS); + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + 
bch2_trans_iter_put(trans, iter); +@@ -546,7 +538,8 @@ static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); +- ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(trans, iter); + return ret; + } +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index ba700810a4be..ebf391245470 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -142,7 +142,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + goto err; + + /* rewind iter to start of hole, if necessary: */ +- bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); ++ bch2_btree_iter_set_pos_to_extent_start(reflink_iter); + + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); + ret = PTR_ERR_OR_ZERO(r_v); +@@ -257,11 +257,11 @@ s64 bch2_remap_range(struct bch_fs *c, + } + + if (src_k.k->type != KEY_TYPE_reflink_p) { ++ bch2_btree_iter_set_pos_to_extent_start(src_iter); ++ + bch2_bkey_buf_reassemble(&new_src, c, src_k); + src_k = bkey_i_to_s_c(new_src.k); + +- bch2_btree_iter_set_pos(src_iter, bkey_start_pos(src_k.k)); +- + ret = bch2_make_extent_indirect(&trans, src_iter, + new_src.k); + if (ret) +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 59f34b40fd5b..d099358e43d6 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -40,13 +40,8 @@ static int test_delete(struct bch_fs *c, u64 nr) + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + +- ret = bch2_btree_iter_traverse(iter); +- if (ret) { +- bch_err(c, "lookup error in test_delete: %i", ret); +- goto err; +- } +- + ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(&trans, iter, &k.k_i, 0)); + if (ret) { + bch_err(c, "update error in test_delete: %i", ret); +@@ -55,7 +50,8 @@ static int test_delete(struct bch_fs *c, u64 nr) + + pr_info("deleting once"); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_delete_at(&trans, iter, 0)); ++ bch2_btree_iter_traverse(iter) ?: ++ bch2_btree_delete_at(&trans, iter, 0)); + if (ret) { + bch_err(c, "delete error (first) in test_delete: %i", ret); + goto err; +@@ -63,7 +59,8 @@ static int test_delete(struct bch_fs *c, u64 nr) + + pr_info("deleting twice"); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_delete_at(&trans, iter, 0)); ++ bch2_btree_iter_traverse(iter) ?: ++ bch2_btree_delete_at(&trans, iter, 0)); + if (ret) { + bch_err(c, "delete error (second) in test_delete: %i", ret); + goto err; +@@ -591,6 +588,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr) + k.k.p = iter->pos; + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(&trans, iter, &k.k_i, 0)); + if (ret) { + bch_err(c, "update error in rand_mixed: %i", ret); +@@ -671,6 +669,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) + insert.k.p = iter->pos; + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(&trans, iter, &insert.k_i, 0)); + if (ret) { + bch_err(c, "error in seq_insert: %i", ret); +@@ -719,6 +718,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) + bkey_reassemble(&u.k_i, k); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(&trans, iter, &u.k_i, 0)); + if (ret) { + bch_err(c, 
"error in seq_overwrite: %i", ret); +-- +cgit v1.2.3 + + +From 4a6a99f85d3b42424527890c1201468549c8bf2a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Jun 2021 22:29:54 -0400 +Subject: bcachefs: fix truncate with ATTR_MODE + +After the v5.12 rebase, we started oopsing when truncate was passed +ATTR_MODE, due to not passing mnt_userns to setattr_copy(). This +refactors things so that truncate/extend finish by using +bch2_setattr_nonsize(), which solves the problem. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 43 +++++++++++++++++++++++-------------------- + fs/bcachefs/fs-io.h | 3 ++- + fs/bcachefs/fs.c | 19 +++++++++++-------- + fs/bcachefs/fs.h | 4 ++++ + 4 files changed, 40 insertions(+), 29 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index b306fd59bbf1..93e1845523fa 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2261,11 +2261,11 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) + from, round_up(from, PAGE_SIZE)); + } + +-static int bch2_extend(struct bch_inode_info *inode, ++static int bch2_extend(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *inode_u, + struct iattr *iattr) + { +- struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + int ret; + +@@ -2279,25 +2279,15 @@ static int bch2_extend(struct bch_inode_info *inode, + return ret; + + truncate_setsize(&inode->v, iattr->ia_size); +- /* ATTR_MODE will never be set here, ns argument isn't needed: */ +- setattr_copy(NULL, &inode->v, iattr); +- +- mutex_lock(&inode->ei_update_lock); +- ret = bch2_write_inode_size(c, inode, inode->v.i_size, +- ATTR_MTIME|ATTR_CTIME); +- mutex_unlock(&inode->ei_update_lock); + +- return ret; ++ return bch2_setattr_nonsize(mnt_userns, inode, iattr); + } + + static int bch2_truncate_finish_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) + { +- struct bch_fs *c = inode->v.i_sb->s_fs_info; +- + bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; +- bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); + return 0; + } + +@@ -2311,7 +2301,8 @@ static int bch2_truncate_start_fn(struct bch_inode_info *inode, + return 0; + } + +-int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ++int bch2_truncate(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, struct iattr *iattr) + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; +@@ -2322,6 +2313,18 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) + s64 i_sectors_delta = 0; + int ret = 0; + ++ /* ++ * Don't update timestamps if we're not doing anything: ++ */ ++ if (iattr->ia_size == inode->v.i_size) ++ return 0; ++ ++ if (!(iattr->ia_valid & ATTR_MTIME)) ++ ktime_get_coarse_real_ts64(&iattr->ia_mtime); ++ if (!(iattr->ia_valid & ATTR_CTIME)) ++ ktime_get_coarse_real_ts64(&iattr->ia_ctime); ++ iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; ++ + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + +@@ -2351,10 +2354,12 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) + inode->v.i_size < inode_u.bi_size); + + if (iattr->ia_size > inode->v.i_size) { +- ret = bch2_extend(inode, &inode_u, iattr); ++ ret = bch2_extend(mnt_userns, inode, &inode_u, iattr); + goto err; + } + ++ iattr->ia_valid &= ~ATTR_SIZE; ++ + ret = bch2_truncate_page(inode, iattr->ia_size); + if (unlikely(ret)) + goto err; +@@ -2398,13 
+2403,11 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) + if (unlikely(ret)) + goto err; + +- /* ATTR_MODE will never be set here, ns argument isn't needed: */ +- setattr_copy(NULL, &inode->v, iattr); +- + mutex_lock(&inode->ei_update_lock); +- ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, +- ATTR_MTIME|ATTR_CTIME); ++ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); + mutex_unlock(&inode->ei_update_lock); ++ ++ ret = bch2_setattr_nonsize(mnt_userns, inode, iattr); + err: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + return ret; +diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h +index 2537a3d25ede..b24efeaf343e 100644 +--- a/fs/bcachefs/fs-io.h ++++ b/fs/bcachefs/fs-io.h +@@ -31,7 +31,8 @@ ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); + + int bch2_fsync(struct file *, loff_t, loff_t, int); + +-int bch2_truncate(struct bch_inode_info *, struct iattr *); ++int bch2_truncate(struct user_namespace *, ++ struct bch_inode_info *, struct iattr *); + long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); + + loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 0c3112619677..20907e554dd3 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -649,10 +649,10 @@ err: + return ret; + } + +-void bch2_setattr_copy(struct user_namespace *mnt_userns, +- struct bch_inode_info *inode, +- struct bch_inode_unpacked *bi, +- struct iattr *attr) ++static void bch2_setattr_copy(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ struct iattr *attr) + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; + unsigned int ia_valid = attr->ia_valid; +@@ -662,6 +662,9 @@ void bch2_setattr_copy(struct user_namespace *mnt_userns, + if (ia_valid & ATTR_GID) + bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid); + ++ if (ia_valid & ATTR_SIZE) ++ bi->bi_size = attr->ia_size; ++ + if (ia_valid & ATTR_ATIME) + bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); + if (ia_valid & ATTR_MTIME) +@@ -682,9 +685,9 @@ void bch2_setattr_copy(struct user_namespace *mnt_userns, + } + } + +-static int bch2_setattr_nonsize(struct user_namespace *mnt_userns, +- struct bch_inode_info *inode, +- struct iattr *attr) ++int bch2_setattr_nonsize(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, ++ struct iattr *attr) + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_qid qid; +@@ -808,7 +811,7 @@ static int bch2_setattr(struct user_namespace *mnt_userns, + return ret; + + return iattr->ia_valid & ATTR_SIZE +- ? bch2_truncate(inode, iattr) ++ ? 
bch2_truncate(mnt_userns, inode, iattr) + : bch2_setattr_nonsize(mnt_userns, inode, iattr); + } + +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index 2d82ed7dd740..36cc6ba2d644 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -167,6 +167,10 @@ void bch2_inode_update_after_write(struct bch_fs *, + int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + inode_set_fn, void *, unsigned); + ++int bch2_setattr_nonsize(struct user_namespace *, ++ struct bch_inode_info *, ++ struct iattr *); ++ + void bch2_vfs_exit(void); + int bch2_vfs_init(void); + +-- +cgit v1.2.3 + + +From 98199500b3a0b0de33e6904d255fd000078dd624 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Jun 2021 21:44:27 -0400 +Subject: bcachefs: Extensive triggers cleanups + + - We no longer mark subsets of extents, they're marked like regular + keys now - which means we can drop the offset & sectors arguments + to trigger functions + - Drop other arguments that are no longer needed anymore in various + places - fs_usage + - Drop the logic for handling extents in bch2_mark_update() that isn't + needed anymore, to match bch2_trans_mark_update() + - Better logic for hanlding the BTREE_ITER_CACHED_NOFILL case, where we + don't have an old key to mark + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 6 +- + fs/bcachefs/btree_types.h | 6 +- + fs/bcachefs/btree_update_interior.c | 4 +- + fs/bcachefs/btree_update_leaf.c | 4 +- + fs/bcachefs/buckets.c | 573 ++++++++++++++++-------------------- + fs/bcachefs/buckets.h | 23 +- + fs/bcachefs/ec.c | 3 +- + fs/bcachefs/extents.h | 11 + + 8 files changed, 284 insertions(+), 346 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index d46d933b393b..95d31fc221d1 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -669,6 +669,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + unsigned flags = ++ BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_GC| + (initial ? 
BTREE_TRIGGER_NOATOMIC : 0); + int ret = 0; +@@ -710,7 +711,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + *max_stale = max(*max_stale, ptr_stale(ca, ptr)); + } + +- bch2_mark_key(c, *k, 0, k->k->size, NULL, 0, flags); ++ bch2_mark_key(c, *k, flags); + fsck_err: + err: + if (ret) +@@ -1081,8 +1082,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) + for_each_pending_btree_node_free(c, as, d) + if (d->index_update_done) + bch2_mark_key(c, bkey_i_to_s_c(&d->key), +- 0, 0, NULL, 0, +- BTREE_TRIGGER_GC); ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_GC); + + mutex_unlock(&c->btree_interior_update_lock); + } +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 39130ab6d739..ec5195daead4 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -644,7 +644,6 @@ enum btree_trigger_flags { + + __BTREE_TRIGGER_INSERT, + __BTREE_TRIGGER_OVERWRITE, +- __BTREE_TRIGGER_OVERWRITE_SPLIT, + + __BTREE_TRIGGER_GC, + __BTREE_TRIGGER_BUCKET_INVALIDATE, +@@ -655,12 +654,15 @@ enum btree_trigger_flags { + + #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) + #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) +-#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) + + #define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) + #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) + #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + ++#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ ++ ((1U << KEY_TYPE_stripe)| \ ++ (1U << KEY_TYPE_inode)) ++ + static inline bool btree_node_type_needs_gc(enum btree_node_type type) + { + return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 89011f9f89ed..cd214599a03f 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -511,7 +511,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, + ret = bch2_trans_mark_key(trans, + bkey_s_c_null, + bkey_i_to_s_c(k), +- 0, 0, BTREE_TRIGGER_INSERT); ++ BTREE_TRIGGER_INSERT); + if (ret) + return ret; + } +@@ -520,7 +520,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, + ret = bch2_trans_mark_key(trans, + bkey_i_to_s_c(k), + bkey_s_c_null, +- 0, 0, BTREE_TRIGGER_OVERWRITE); ++ BTREE_TRIGGER_OVERWRITE); + if (ret) + return ret; + } +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 4bf386640779..fa08470b6ca3 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -362,7 +362,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); + + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) +- bch2_mark_update(trans, i->iter, i->k, NULL, ++ bch2_mark_update(trans, i->iter, i->k, + i->trigger_flags|BTREE_TRIGGER_GC); + } + } +@@ -468,7 +468,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) + bch2_mark_update(trans, i->iter, i->k, +- NULL, i->trigger_flags); ++ i->trigger_flags); + + if (marking && trans->fs_usage_deltas) + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index b06105aea216..76945e50e4b1 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -351,17 +351,16 @@ 
static inline void account_bucket(struct bch_fs_usage *fs_usage, + } + + static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, +- struct bch_fs_usage *fs_usage, + struct bucket_mark old, struct bucket_mark new, + u64 journal_seq, bool gc) + { ++ struct bch_fs_usage *fs_usage; + struct bch_dev_usage *u; + + percpu_rwsem_assert_held(&c->mark_lock); + + preempt_disable(); +- if (!fs_usage) +- fs_usage = fs_usage_ptr(c, journal_seq, gc); ++ fs_usage = fs_usage_ptr(c, journal_seq, gc); + u = dev_usage_ptr(ca, journal_seq, gc); + + if (bucket_type(old)) +@@ -390,30 +389,48 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + bch2_wake_allocator(ca); + } + ++static inline int __update_replicas(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ int idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) ++ return -1; ++ ++ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); ++ fs_usage->replicas[idx] += sectors; ++ return 0; ++} ++ + static inline int update_replicas(struct bch_fs *c, +- struct bch_fs_usage *fs_usage, +- struct bch_replicas_entry *r, +- s64 sectors) ++ struct bch_replicas_entry *r, s64 sectors, ++ unsigned journal_seq, bool gc) + { ++ struct bch_fs_usage __percpu *fs_usage; + int idx = bch2_replicas_entry_idx(c, r); + + if (idx < 0) + return -1; + ++ preempt_disable(); ++ fs_usage = fs_usage_ptr(c, journal_seq, gc); + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; ++ preempt_enable(); + return 0; + } + + static inline int update_cached_sectors(struct bch_fs *c, +- struct bch_fs_usage *fs_usage, +- unsigned dev, s64 sectors) ++ unsigned dev, s64 sectors, ++ unsigned journal_seq, bool gc) + { + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + +- return update_replicas(c, fs_usage, &r.e, sectors); ++ return update_replicas(c, &r.e, sectors, journal_seq, gc); + } + + static struct replicas_delta_list * +@@ -507,7 +524,6 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + + static int bch2_mark_alloc(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, +- struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; +@@ -549,7 +565,7 @@ static int bch2_mark_alloc(struct bch_fs *c, + } + })); + +- bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc); ++ bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); + + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; +@@ -565,8 +581,8 @@ static int bch2_mark_alloc(struct bch_fs *c, + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_m.cached_sectors) { +- if (update_cached_sectors(c, fs_usage, ca->dev_idx, +- -old_m.cached_sectors)) { ++ if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, ++ journal_seq, gc)) { + bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); + return -1; + } +@@ -617,8 +633,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + old.dirty_sectors, sectors); + + if (c) +- bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), +- old, new, 0, gc); ++ bch2_dev_usage_update(c, ca, old, new, 0, gc); + + return 0; + } +@@ -637,54 +652,20 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + if (b >= ca->mi.nbuckets) + return; + +- preempt_disable(); +- + if (likely(c)) { + do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, + ca, 
b, type, sectors); + } else { + __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); + } +- +- preempt_enable(); + } + +-static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) ++static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) + { +- return DIV_ROUND_UP(sectors * n, d); +-} +- +-static s64 __ptr_disk_sectors_delta(unsigned old_size, +- unsigned offset, s64 delta, +- unsigned flags, +- unsigned n, unsigned d) +-{ +- BUG_ON(!n || !d); +- +- if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { +- BUG_ON(offset + -delta > old_size); +- +- return -disk_sectors_scaled(n, d, old_size) + +- disk_sectors_scaled(n, d, offset) + +- disk_sectors_scaled(n, d, old_size - offset + delta); +- } else if (flags & BTREE_TRIGGER_OVERWRITE) { +- BUG_ON(offset + -delta > old_size); +- +- return -disk_sectors_scaled(n, d, old_size) + +- disk_sectors_scaled(n, d, old_size + delta); +- } else { +- return disk_sectors_scaled(n, d, delta); +- } +-} +- +-static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, +- unsigned offset, s64 delta, +- unsigned flags) +-{ +- return __ptr_disk_sectors_delta(p.crc.live_size, +- offset, delta, flags, +- p.crc.compressed_size, +- p.crc.uncompressed_size); ++ return p.crc.compression_type ++ ? DIV_ROUND_UP(sectors * p.crc.compressed_size, ++ p.crc.uncompressed_size) ++ : sectors; + } + + static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, +@@ -763,7 +744,6 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, + + static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, + unsigned ptr_idx, +- struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; +@@ -805,7 +785,7 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + +- bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); ++ bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); + return 0; + } + +@@ -834,7 +814,6 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, + static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type, +- struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; +@@ -872,7 +851,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, + old.v.counter, + new.v.counter)) != old.v.counter); + +- bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); ++ bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); + + BUG_ON(!gc && bucket_became_unavailable(old, new)); + +@@ -882,8 +861,8 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, + static int bch2_mark_stripe_ptr(struct bch_fs *c, + struct bch_extent_stripe_ptr p, + enum bch_data_type data_type, +- struct bch_fs_usage *fs_usage, +- s64 sectors, unsigned flags) ++ s64 sectors, ++ unsigned journal_seq, unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; + struct bch_replicas_padded r; +@@ -918,40 +897,46 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, + spin_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; +- update_replicas(c, fs_usage, &r.e, sectors); ++ update_replicas(c, &r.e, sectors, journal_seq, gc); + + return 0; + } + + static int bch2_mark_extent(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, +- unsigned offset, s64 sectors, +- enum bch_data_type data_type, +- struct 
bch_fs_usage *fs_usage, + unsigned journal_seq, unsigned flags) + { ++ bool gc = flags & BTREE_TRIGGER_GC; + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_replicas_padded r; ++ enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ++ ? BCH_DATA_btree ++ : BCH_DATA_user; ++ s64 sectors = bkey_is_btree_ptr(k.k) ++ ? c->opts.btree_node_size ++ : k.k->size; + s64 dirty_sectors = 0; + bool stale; + int ret; + ++ BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == ++ (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ sectors = -sectors; ++ + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + +- BUG_ON(!sectors); +- + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- s64 disk_sectors = data_type == BCH_DATA_btree +- ? sectors +- : ptr_disk_sectors_delta(p, offset, sectors, flags); ++ s64 disk_sectors = ptr_disk_sectors(sectors, p); + + ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, +- fs_usage, journal_seq, flags); ++ journal_seq, flags); + if (ret < 0) + return ret; + +@@ -959,8 +944,8 @@ static int bch2_mark_extent(struct bch_fs *c, + + if (p.ptr.cached) { + if (!stale) +- if (update_cached_sectors(c, fs_usage, p.ptr.dev, +- disk_sectors)) { ++ if (update_cached_sectors(c, p.ptr.dev, disk_sectors, ++ journal_seq, gc)) { + bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); + return -1; + +@@ -970,7 +955,7 @@ static int bch2_mark_extent(struct bch_fs *c, + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { + ret = bch2_mark_stripe_ptr(c, p.ec, data_type, +- fs_usage, disk_sectors, flags); ++ disk_sectors, journal_seq, flags); + if (ret) + return ret; + +@@ -984,7 +969,7 @@ static int bch2_mark_extent(struct bch_fs *c, + } + + if (r.e.nr_devs) { +- if (update_replicas(c, fs_usage, &r.e, dirty_sectors)) { ++ if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); +@@ -997,9 +982,8 @@ static int bch2_mark_extent(struct bch_fs *c, + } + + static int bch2_mark_stripe(struct bch_fs *c, +- struct bkey_s_c old, struct bkey_s_c new, +- struct bch_fs_usage *fs_usage, +- u64 journal_seq, unsigned flags) ++ struct bkey_s_c old, struct bkey_s_c new, ++ u64 journal_seq, unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; + size_t idx = new.k->p.offset; +@@ -1060,14 +1044,14 @@ static int bch2_mark_stripe(struct bch_fs *c, + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { +- ret = mark_stripe_bucket(c, new, i, fs_usage, +- journal_seq, flags); ++ ret = mark_stripe_bucket(c, new, i, journal_seq, flags); + if (ret) + return ret; + } + +- if (update_replicas(c, fs_usage, &m->r.e, +- ((s64) m->sectors * m->nr_redundant))) { ++ if (update_replicas(c, &m->r.e, ++ ((s64) m->sectors * m->nr_redundant), ++ journal_seq, gc)) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, new); +@@ -1079,13 +1063,47 @@ static int bch2_mark_stripe(struct bch_fs *c, + return 0; + } + +-static int __bch2_mark_reflink_p(struct bch_fs *c, +- struct bkey_s_c_reflink_p p, +- u64 idx, unsigned sectors, +- unsigned front_frag, +- unsigned back_frag, +- unsigned flags, +- size_t *r_idx) ++static int bch2_mark_inode(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, ++ u64 journal_seq, unsigned flags) ++{ ++ struct bch_fs_usage __percpu *fs_usage; ++ ++ 
preempt_disable(); ++ fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); ++ fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; ++ fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; ++ preempt_enable(); ++ return 0; ++} ++ ++static int bch2_mark_reservation(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, ++ u64 journal_seq, unsigned flags) ++{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; ++ struct bch_fs_usage __percpu *fs_usage; ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ s64 sectors = (s64) k.k->size; ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ sectors = -sectors; ++ sectors *= replicas; ++ ++ preempt_disable(); ++ fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(fs_usage->persistent_reserved)); ++ ++ fs_usage->reserved += sectors; ++ fs_usage->persistent_reserved[replicas - 1] += sectors; ++ preempt_enable(); ++ ++ return 0; ++} ++ ++static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, ++ u64 idx, unsigned flags, size_t *r_idx) + { + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; +@@ -1096,7 +1114,7 @@ static int __bch2_mark_reflink_p(struct bch_fs *c, + r = genradix_ptr(&c->reflink_gc_table, *r_idx); + BUG_ON(!r); + +- if (r->offset > idx) ++ if (idx < r->offset) + break; + (*r_idx)++; + } +@@ -1104,7 +1122,7 @@ static int __bch2_mark_reflink_p(struct bch_fs *c, + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; +- return min_t(u64, sectors, r->offset - idx); ++ return r->offset - idx; + not_found: + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", +@@ -1114,22 +1132,19 @@ not_found: + } + + static int bch2_mark_reflink_p(struct bch_fs *c, +- struct bkey_s_c_reflink_p p, unsigned offset, +- s64 sectors, unsigned flags) ++ struct bkey_s_c old, struct bkey_s_c new, ++ u64 journal_seq, unsigned flags) + { +- u64 idx = le64_to_cpu(p.v->idx) + offset; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; +- unsigned front_frag, back_frag; ++ u64 idx = le64_to_cpu(p.v->idx); ++ unsigned sectors = p.k->size; + s64 ret = 0; + +- if (sectors < 0) +- sectors = -sectors; +- +- BUG_ON(offset + sectors > p.k->size); +- +- front_frag = offset; +- back_frag = offset + sectors; ++ BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == ++ (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); + + l = 0; + r = c->reflink_gc_nr; +@@ -1144,11 +1159,11 @@ static int bch2_mark_reflink_p(struct bch_fs *c, + } + + while (sectors) { +- ret = __bch2_mark_reflink_p(c, p, idx, sectors, +- front_frag, back_frag, flags, &l); ++ ret = __bch2_mark_reflink_p(c, p, idx, flags, &l); + if (ret < 0) + return ret; + ++ ret = min_t(s64, ret, sectors); + idx += ret; + sectors -= ret; + } +@@ -1159,99 +1174,55 @@ static int bch2_mark_reflink_p(struct bch_fs *c, + static int bch2_mark_key_locked(struct bch_fs *c, + struct bkey_s_c old, + struct bkey_s_c new, +- unsigned offset, s64 sectors, +- struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) + { + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; +- int ret = 0; + + BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + +- preempt_disable(); +- +- if (!fs_usage || (flags & BTREE_TRIGGER_GC)) +- fs_usage = fs_usage_ptr(c, journal_seq, +- flags & BTREE_TRIGGER_GC); +- + switch (k.k->type) { + case KEY_TYPE_alloc: + case KEY_TYPE_alloc_v2: +- ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); +- break; ++ return bch2_mark_alloc(c, old, new, journal_seq, flags); + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: +- sectors = !(flags & BTREE_TRIGGER_OVERWRITE) +- ? c->opts.btree_node_size +- : -c->opts.btree_node_size; +- +- ret = bch2_mark_extent(c, old, new, offset, sectors, +- BCH_DATA_btree, fs_usage, journal_seq, flags); +- break; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: +- ret = bch2_mark_extent(c, old, new, offset, sectors, +- BCH_DATA_user, fs_usage, journal_seq, flags); +- break; ++ return bch2_mark_extent(c, old, new, journal_seq, flags); + case KEY_TYPE_stripe: +- ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); +- break; ++ return bch2_mark_stripe(c, old, new, journal_seq, flags); + case KEY_TYPE_inode: +- fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; +- fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; +- break; +- case KEY_TYPE_reservation: { +- unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; +- +- sectors *= replicas; +- replicas = clamp_t(unsigned, replicas, 1, +- ARRAY_SIZE(fs_usage->persistent_reserved)); +- +- fs_usage->reserved += sectors; +- fs_usage->persistent_reserved[replicas - 1] += sectors; +- break; +- } ++ return bch2_mark_inode(c, old, new, journal_seq, flags); ++ case KEY_TYPE_reservation: ++ return bch2_mark_reservation(c, old, new, journal_seq, flags); + case KEY_TYPE_reflink_p: +- ret = bch2_mark_reflink_p(c, bkey_s_c_to_reflink_p(k), +- offset, sectors, flags); +- break; ++ return bch2_mark_reflink_p(c, old, new, journal_seq, flags); ++ default: ++ return 0; + } +- +- preempt_enable(); +- +- return ret; + } + +-int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, +- unsigned offset, s64 sectors, +- struct bch_fs_usage *fs_usage, +- u64 journal_seq, unsigned flags) ++int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) + { +- struct bkey deleted; ++ struct bkey deleted = KEY(0, 0, 0); + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + int ret; + +- bkey_init(&deleted); +- + percpu_down_read(&c->mark_lock); +- ret = bch2_mark_key_locked(c, old, new, offset, sectors, +- fs_usage, journal_seq, +- BTREE_TRIGGER_INSERT|flags); ++ ret = bch2_mark_key_locked(c, old, new, 0, flags); + percpu_up_read(&c->mark_lock); + + return ret; + } + +-int bch2_mark_update(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *new, +- struct bch_fs_usage *fs_usage, +- unsigned flags) ++int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *new, unsigned flags) + { + struct bch_fs *c = trans->c; ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + struct bkey_s_c old; +- struct bkey unpacked; +- int ret = 0; ++ int iter_flags, ret; + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; +@@ -1259,87 +1230,36 @@ int bch2_mark_update(struct btree_trans *trans, + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + +- bkey_init(&unpacked); +- old = (struct bkey_s_c) { &unpacked, NULL }; ++ if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) { ++ iter_flags = iter->flags 
& BTREE_ITER_WITH_UPDATES; ++ iter->flags &= ~BTREE_ITER_WITH_UPDATES; + +- if (!btree_node_type_is_extents(iter->btree_id)) { +- /* iterators should be uptodate, shouldn't get errors here: */ +- if (btree_iter_type(iter) != BTREE_ITER_CACHED) { +- old = bch2_btree_iter_peek_slot(iter); +- BUG_ON(bkey_err(old)); +- } else { +- struct bkey_cached *ck = (void *) iter->l[0].b; ++ old = bch2_btree_iter_peek_slot(iter); ++ iter->flags |= iter_flags; + +- if (ck->valid) +- old = bkey_i_to_s_c(ck->k); +- } ++ ret = bkey_err(old); ++ if (ret) ++ return ret; ++ } else { ++ /* ++ * If BTREE_ITER_CACHED_NOFILL was used, we better not be ++ * running triggers that do anything on removal (alloc btree): ++ */ ++ old = deleted; ++ } + +- if (old.k->type == new->k.type) { +- bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, +- fs_usage, trans->journal_res.seq, ++ if (old.k->type == new->k.type && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), ++ trans->journal_res.seq, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); +- +- } else { +- bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, +- fs_usage, trans->journal_res.seq, +- BTREE_TRIGGER_INSERT|flags); +- bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, +- fs_usage, trans->journal_res.seq, +- BTREE_TRIGGER_OVERWRITE|flags); +- } + } else { +- struct btree_iter *copy; +- +- BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); +- bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), +- 0, new->k.size, +- fs_usage, trans->journal_res.seq, +- BTREE_TRIGGER_INSERT|flags); +- +- copy = bch2_trans_copy_iter(trans, iter); +- +- for_each_btree_key_continue(copy, 0, old, ret) { +- unsigned offset = 0; +- s64 sectors = -((s64) old.k->size); +- +- flags |= BTREE_TRIGGER_OVERWRITE; +- +- if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) +- break; +- +- switch (bch2_extent_overlap(&new->k, old.k)) { +- case BCH_EXTENT_OVERLAP_ALL: +- offset = 0; +- sectors = -((s64) old.k->size); +- break; +- case BCH_EXTENT_OVERLAP_BACK: +- offset = bkey_start_offset(&new->k) - +- bkey_start_offset(old.k); +- sectors = bkey_start_offset(&new->k) - +- old.k->p.offset; +- break; +- case BCH_EXTENT_OVERLAP_FRONT: +- offset = 0; +- sectors = bkey_start_offset(old.k) - +- new->k.p.offset; +- break; +- case BCH_EXTENT_OVERLAP_MIDDLE: +- offset = bkey_start_offset(&new->k) - +- bkey_start_offset(old.k); +- sectors = -((s64) new->k.size); +- flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; +- break; +- } +- +- BUG_ON(sectors >= 0); +- +- ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), +- offset, sectors, fs_usage, +- trans->journal_res.seq, flags) ?: 1; +- if (ret <= 0) +- break; +- } +- bch2_trans_iter_put(trans, copy); ++ ret = bch2_mark_key_locked(c, deleted, bkey_i_to_s_c(new), ++ trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|flags) ?: ++ bch2_mark_key_locked(c, old, deleted, ++ trans->journal_res.seq, ++ BTREE_TRIGGER_OVERWRITE|flags); + } + + return ret; +@@ -1416,7 +1336,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + added += d->delta; + } + +- BUG_ON(update_replicas(c, dst, &d->r, d->delta)); ++ BUG_ON(__update_replicas(c, dst, &d->r, d->delta)); + } + + dst->nr_inodes += deltas->nr_inodes; +@@ -1605,31 +1525,38 @@ err: + } + + static int bch2_trans_mark_extent(struct btree_trans *trans, +- struct bkey_s_c k, unsigned offset, +- s64 sectors, unsigned flags, +- enum bch_data_type data_type) ++ struct bkey_s_c k, unsigned flags) + { ++ struct bch_fs *c = trans->c; + struct 
bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_replicas_padded r; ++ enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ++ ? BCH_DATA_btree ++ : BCH_DATA_user; ++ s64 sectors = bkey_is_btree_ptr(k.k) ++ ? c->opts.btree_node_size ++ : k.k->size; + s64 dirty_sectors = 0; + bool stale; + int ret; + ++ BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == ++ (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ sectors = -sectors; ++ + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + +- BUG_ON(!sectors); +- + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- s64 disk_sectors = data_type == BCH_DATA_btree +- ? sectors +- : ptr_disk_sectors_delta(p, offset, sectors, flags); ++ s64 disk_sectors = ptr_disk_sectors(sectors, p); + +- ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, +- data_type); ++ ret = bch2_trans_mark_pointer(trans, k, p, ++ disk_sectors, data_type); + if (ret < 0) + return ret; + +@@ -1765,10 +1692,49 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + return ret; + } + ++static int bch2_trans_mark_inode(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_s_c new, ++ unsigned flags) ++{ ++ int nr = (new.k->type == KEY_TYPE_inode) - ++ (old.k->type == KEY_TYPE_inode); ++ ++ if (nr) { ++ struct replicas_delta_list *d = ++ replicas_deltas_realloc(trans, 0); ++ d->nr_inodes += nr; ++ } ++ ++ return 0; ++} ++ ++static int bch2_trans_mark_reservation(struct btree_trans *trans, ++ struct bkey_s_c k, unsigned flags) ++{ ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ s64 sectors = (s64) k.k->size; ++ struct replicas_delta_list *d; ++ ++ BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == ++ (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ sectors = -sectors; ++ sectors *= replicas; ++ ++ d = replicas_deltas_realloc(trans, 0); ++ ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(d->persistent_reserved)); ++ ++ d->persistent_reserved[replicas - 1] += sectors; ++ return 0; ++} ++ + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, +- u64 idx, unsigned sectors, +- unsigned flags) ++ u64 idx, unsigned flags) + { + struct bch_fs *c = trans->c; + struct btree_iter *iter; +@@ -1786,8 +1752,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + if (ret) + goto err; + +- sectors = min_t(u64, sectors, k.k->p.offset - idx); +- + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) +@@ -1818,29 +1782,26 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + if (ret) + goto err; + +- ret = sectors; ++ ret = k.k->p.offset - idx; + err: + bch2_trans_iter_put(trans, iter); + return ret; + } + + static int bch2_trans_mark_reflink_p(struct btree_trans *trans, +- struct bkey_s_c_reflink_p p, unsigned offset, +- s64 sectors, unsigned flags) ++ struct bkey_s_c k, unsigned flags) + { +- u64 idx = le64_to_cpu(p.v->idx) + offset; ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx = le64_to_cpu(p.v->idx); ++ unsigned sectors = p.k->size; + s64 ret = 0; + +- if (sectors < 0) +- sectors = -sectors; +- +- BUG_ON(offset || sectors != p.k->size); +- + while (sectors) { +- ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); ++ ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags); + if (ret < 
0) + return ret; + ++ ret = min_t(s64, ret, sectors); + idx += ret; + sectors -= ret; + } +@@ -1848,59 +1809,27 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + return 0; + } + +-int bch2_trans_mark_key(struct btree_trans *trans, +- struct bkey_s_c old, +- struct bkey_s_c new, +- unsigned offset, s64 sectors, unsigned flags) ++int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, ++ struct bkey_s_c new, unsigned flags) + { +- struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; +- struct replicas_delta_list *d; + + BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: +- sectors = !(flags & BTREE_TRIGGER_OVERWRITE) +- ? c->opts.btree_node_size +- : -c->opts.btree_node_size; +- +- return bch2_trans_mark_extent(trans, k, offset, sectors, +- flags, BCH_DATA_btree); + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: +- return bch2_trans_mark_extent(trans, k, offset, sectors, +- flags, BCH_DATA_user); ++ return bch2_trans_mark_extent(trans, k, flags); + case KEY_TYPE_stripe: + return bch2_trans_mark_stripe(trans, old, new, flags); +- case KEY_TYPE_inode: { +- int nr = (new.k->type == KEY_TYPE_inode) - +- (old.k->type == KEY_TYPE_inode); +- +- if (nr) { +- d = replicas_deltas_realloc(trans, 0); +- d->nr_inodes += nr; +- } +- +- return 0; +- } +- case KEY_TYPE_reservation: { +- unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; +- +- d = replicas_deltas_realloc(trans, 0); +- +- sectors *= replicas; +- replicas = clamp_t(unsigned, replicas, 1, +- ARRAY_SIZE(d->persistent_reserved)); +- +- d->persistent_reserved[replicas - 1] += sectors; +- return 0; +- } ++ case KEY_TYPE_inode: ++ return bch2_trans_mark_inode(trans, old, new, flags); ++ case KEY_TYPE_reservation: ++ return bch2_trans_mark_reservation(trans, k, flags); + case KEY_TYPE_reflink_p: +- return bch2_trans_mark_reflink_p(trans, +- bkey_s_c_to_reflink_p(k), +- offset, sectors, flags); ++ return bch2_trans_mark_reflink_p(trans, k, flags); + default: + return 0; + } +@@ -1911,7 +1840,9 @@ int bch2_trans_mark_update(struct btree_trans *trans, + struct bkey_i *new, + unsigned flags) + { +- struct bkey_s_c old; ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; ++ struct bkey_s_c old; + int iter_flags, ret; + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) +@@ -1920,25 +1851,33 @@ int bch2_trans_mark_update(struct btree_trans *trans, + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + +- iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; +- iter->flags &= ~BTREE_ITER_WITH_UPDATES; + +- old = bch2_btree_iter_peek_slot(iter); ++ if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) { ++ iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; ++ iter->flags &= ~BTREE_ITER_WITH_UPDATES; + +- iter->flags |= iter_flags; ++ old = bch2_btree_iter_peek_slot(iter); ++ iter->flags |= iter_flags; + +- ret = bkey_err(old); +- if (ret) +- return ret; ++ ret = bkey_err(old); ++ if (ret) ++ return ret; ++ } else { ++ /* ++ * If BTREE_ITER_CACHED_NOFILL was used, we better not be ++ * running triggers that do anything on removal (alloc btree): ++ */ ++ old = deleted; ++ } + + if (old.k->type == new->k.type && +- !btree_node_type_is_extents(iter->btree_id)) { +- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ ret = 
bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { +- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, new->k.size, ++ ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: +- bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, -((s64) old.k->size), ++ bch2_trans_mark_key(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 04a2a9310cdd..0f544b62fc90 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -125,20 +125,6 @@ static inline u8 ptr_stale(struct bch_dev *ca, + return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); + } + +-static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, +- unsigned live_size) +-{ +- return live_size && p.crc.compression_type +- ? max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, +- p.crc.uncompressed_size)) +- : live_size; +-} +- +-static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) +-{ +- return __ptr_disk_sectors(p, p.crc.live_size); +-} +- + /* bucket gc marks */ + + static inline unsigned bucket_sectors_used(struct bucket_mark mark) +@@ -240,14 +226,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + +-int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, +- s64, struct bch_fs_usage *, u64, unsigned); ++int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned); + + int bch2_mark_update(struct btree_trans *, struct btree_iter *, +- struct bkey_i *, struct bch_fs_usage *, unsigned); ++ struct bkey_i *, unsigned); + +-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, +- unsigned, s64, unsigned); ++int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, ++ struct bkey_s_c, unsigned); + int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, + struct bkey_i *insert, unsigned); + void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 48f9232e61eb..fcf93c3d3b57 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1633,7 +1633,8 @@ static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) + + if (k.k->type == KEY_TYPE_stripe) + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: +- bch2_mark_key(c, k, 0, 0, NULL, 0, ++ bch2_mark_key(c, k, ++ BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_NOATOMIC); + + return ret; +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 3f6224f75ce8..43cef0a3bdf3 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -426,6 +426,17 @@ void bch2_extent_crc_append(struct bkey_i *, + + /* Generic code for keys with pointers: */ + ++static inline bool bkey_is_btree_ptr(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ return true; ++ default: ++ return false; ++ } ++} ++ + static inline bool bkey_extent_is_direct_data(const struct bkey *k) + { + switch (k->type) { +-- +cgit v1.2.3 + + +From 34916111e9faffbd4551ed0fc2751463dd315de0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Jun 2021 23:34:02 -0400 +Subject: bcachefs: Don't disable preemption unnecessarily + +Small improvements to some percpu utility code. 
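Editor's note: the hunks that follow drop explicit preempt_disable()/preempt_enable() pairs because this_cpu_write() is already preemption-safe as a single per-CPU operation, and bch2_acc_percpu_u64s() walks every possible CPU anyway, so it does not matter which CPU's pointer it starts from. A minimal, hypothetical sketch of the same idiom (the demo_counter_* names are invented, not bcachefs code):

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(u64, demo_counter);

/* Set the logical value: zero every CPU's slot, then store the value in
 * the local slot. this_cpu_write() is a single per-CPU operation, so no
 * preempt_disable()/preempt_enable() pair is needed around it. */
static void demo_counter_set(u64 v)
{
	int cpu;

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(&demo_counter, cpu) = 0;

	this_cpu_write(demo_counter, v);
}

/* Read back the logical value by summing all CPUs' slots; since every
 * slot is visited, it is irrelevant which CPU this happens to run on. */
static u64 demo_counter_read(void)
{
	u64 sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(&demo_counter, cpu);

	return sum;
}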
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/util.c | 6 +----- + fs/bcachefs/util.h | 5 +---- + 2 files changed, 2 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index e3ad26e244ab..463260c04585 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -887,13 +887,9 @@ void eytzinger0_find_test(void) + */ + u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) + { +- u64 *ret; ++ u64 *ret = this_cpu_ptr(p); + int cpu; + +- preempt_disable(); +- ret = this_cpu_ptr(p); +- preempt_enable(); +- + for_each_possible_cpu(cpu) { + u64 *i = per_cpu_ptr(p, cpu); + +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 2cf8568e630b..bec84d8aabed 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -711,10 +711,7 @@ static inline void percpu_u64_set(u64 __percpu *dst, u64 src) + + for_each_possible_cpu(cpu) + *per_cpu_ptr(dst, cpu) = 0; +- +- preempt_disable(); +- *this_cpu_ptr(dst) = src; +- preempt_enable(); ++ this_cpu_write(*dst, src); + } + + static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) +-- +cgit v1.2.3 + + +From b53727b2be5b0875c6ad282ad62199b6f066a641 Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Wed, 16 Jun 2021 23:21:23 -0400 +Subject: bcachefs: ensure iter->should_be_locked is set + +Ensure that iter->should_be_locked value is set to true before we +call bch2_trans_update in ec_stripe_update_ptrs. + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/ec.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index fcf93c3d3b57..328e0429b5d7 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -863,7 +863,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + extent_stripe_ptr_add(e, s, ec_ptr, block); + + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); +- ret = bch2_trans_update(&trans, iter, sk.k, 0) ?: ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(&trans, iter, sk.k, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) +-- +cgit v1.2.3 + + +From 5ab16e7f4996d09a6e19bee06152075e346ebb72 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 21 Jun 2021 16:28:43 -0400 +Subject: bcachefs: Don't ratelimit certain fsck errors + +It's unhelpful if we see "Halting mark and sweep to start topology +repair" but we don't see the error that triggered it. 
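Editor's note: the change below threads a new FSCK_NO_RATELIMIT flag through __fsck_err() so that errors which immediately trigger topology repair are never suppressed. A simplified, hypothetical sketch of that flag-gated ratelimiting pattern (all names invented, not the bcachefs code):

#include <linux/types.h>

#define DEMO_ERR_NO_RATELIMIT	(1U << 0)	/* never suppress this error */
#define DEMO_ERR_RATELIMIT_NR	10

struct demo_err_state {
	unsigned	nr;	/* how many times this error has fired */
};

/* Decide whether to print: ordinary errors are suppressed after a few
 * occurrences, but callers that are about to take drastic action pass
 * DEMO_ERR_NO_RATELIMIT so the triggering message stays visible. */
static bool demo_err_should_print(struct demo_err_state *s, unsigned flags)
{
	s->nr++;

	if (flags & DEMO_ERR_NO_RATELIMIT)
		return true;

	return s->nr < DEMO_ERR_RATELIMIT_NR;
}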
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 46 +++++++++++++++++++++++++++++----------------- + fs/bcachefs/error.c | 1 + + fs/bcachefs/error.h | 1 + + 3 files changed, 31 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 95d31fc221d1..cf174f48000a 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -86,12 +86,16 @@ static int bch2_gc_check_topology(struct bch_fs *c, + if (bpos_cmp(expected_start, bp->v.min_key)) { + bch2_topology_error(c); + +- if (fsck_err(c, "btree node with incorrect min_key at btree %s level %u:\n" +- " prev %s\n" +- " cur %s", +- bch2_btree_ids[b->c.btree_id], b->c.level, +- buf1, +- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { ++ if (__fsck_err(c, ++ FSCK_CAN_FIX| ++ FSCK_CAN_IGNORE| ++ FSCK_NO_RATELIMIT, ++ "btree node with incorrect min_key at btree %s level %u:\n" ++ " prev %s\n" ++ " cur %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1, ++ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + return FSCK_ERR_START_TOPOLOGY_REPAIR; + } else { +@@ -103,12 +107,16 @@ static int bch2_gc_check_topology(struct bch_fs *c, + if (is_last && bpos_cmp(cur.k->k.p, node_end)) { + bch2_topology_error(c); + +- if (fsck_err(c, "btree node with incorrect max_key at btree %s level %u:\n" +- " %s\n" +- " expected %s", +- bch2_btree_ids[b->c.btree_id], b->c.level, +- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), +- (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { ++ if (__fsck_err(c, ++ FSCK_CAN_FIX| ++ FSCK_CAN_IGNORE| ++ FSCK_NO_RATELIMIT, ++ "btree node with incorrect max_key at btree %s level %u:\n" ++ " %s\n" ++ " expected %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), ++ (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + return FSCK_ERR_START_TOPOLOGY_REPAIR; + } else { +@@ -884,11 +892,15 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + if (ret == -EIO) { + bch2_topology_error(c); + +- if (fsck_err(c, "Unreadable btree node at btree %s level %u:\n" +- " %s", +- bch2_btree_ids[b->c.btree_id], +- b->c.level - 1, +- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) { ++ if (__fsck_err(c, ++ FSCK_CAN_FIX| ++ FSCK_CAN_IGNORE| ++ FSCK_NO_RATELIMIT, ++ "Unreadable btree node at btree %s level %u:\n" ++ " %s", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level - 1, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) { + ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + bch_info(c, "Halting mark and sweep to start topology repair pass"); + goto fsck_err; +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +index 90c3b986c264..2cea694575e9 100644 +--- a/fs/bcachefs/error.c ++++ b/fs/bcachefs/error.c +@@ -111,6 +111,7 @@ found: + list_move(&s->list, &c->fsck_errors); + s->nr++; + if (c->opts.ratelimit_errors && ++ !(flags & FSCK_NO_RATELIMIT) && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + suppressing = true; +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index d8cd19b3f63c..986938298adc 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -104,6 +104,7 @@ struct fsck_err_state { + #define FSCK_CAN_FIX (1 << 0) + #define FSCK_CAN_IGNORE (1 << 1) + #define FSCK_NEED_FSCK (1 << 2) 
++#define FSCK_NO_RATELIMIT (1 << 3) + + __printf(3, 4) __cold + enum fsck_err_ret bch2_fsck_err(struct bch_fs *, +-- +cgit v1.2.3 + + +From 019373ec8d593db19fdf70f75a0feb3ebd4a27b0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 22 Jun 2021 20:44:54 -0400 +Subject: bcachefs: Don't loop into topology repair + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/btree_gc.c | 12 +++++++++--- + 2 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 38963900e41a..90844d6532f6 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -498,6 +498,7 @@ enum { + BCH_FS_ALLOCATOR_STOPPING, + BCH_FS_INITIAL_GC_DONE, + BCH_FS_INITIAL_GC_UNFIXED, ++ BCH_FS_TOPOLOGY_REPAIR_DONE, + BCH_FS_BTREE_INTERIOR_REPLAY_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index cf174f48000a..039677a55f58 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -95,7 +95,8 @@ static int bch2_gc_check_topology(struct bch_fs *c, + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, +- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) { ++ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + return FSCK_ERR_START_TOPOLOGY_REPAIR; + } else { +@@ -116,7 +117,8 @@ static int bch2_gc_check_topology(struct bch_fs *c, + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), +- (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) { ++ (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + return FSCK_ERR_START_TOPOLOGY_REPAIR; + } else { +@@ -900,7 +902,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, +- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) { ++ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + bch_info(c, "Halting mark and sweep to start topology repair pass"); + goto fsck_err; +@@ -1599,11 +1602,14 @@ again: + if (ret) + goto out; + bch_info(c, "topology repair pass done"); ++ ++ set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags); + } + + ret = bch2_gc_btrees(c, initial, metadata_only); + + if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + ret = 0; +-- +cgit v1.2.3 + + +From 7a98cac626751b8cde58d3e69c7e96e5e2fd6ca8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 22 Jun 2021 21:51:17 -0400 +Subject: bcachefs: Fix btree_node_read_all_replicas() error handling + +We weren't checking bch2_btree_node_read_done() for errors, oops. 
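Editor's note: the rewrite below keeps only the index of the best replica while scanning, copies that buffer into b->data once at the end, and finally propagates the return value of bch2_btree_node_read_done() instead of discarding it. A stripped-down, hypothetical sketch of that "pick best candidate, validate once, check the result" shape (all names invented):

#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>

struct demo_replica {
	int		err;	/* nonzero if this copy failed to read */
	unsigned	score;	/* e.g. sectors written */
	const void	*buf;
};

/* Choose the best usable replica, copy it once, then validate it and
 * return the validation result rather than silently ignoring it. */
static int demo_pick_and_validate(void *dst, size_t bytes,
				  const struct demo_replica *r, unsigned nr,
				  int (*validate)(void *, size_t))
{
	int best = -1;
	unsigned i;

	for (i = 0; i < nr; i++) {
		if (r[i].err)
			continue;
		if (best < 0 || r[i].score > r[best].score)
			best = i;
	}

	if (best < 0)
		return -EIO;		/* no usable replica at all */

	memcpy(dst, r[best].buf, bytes);
	return validate(dst, bytes);	/* do not drop this error */
}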
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 39 ++++++++++++++++++++------------------- + 1 file changed, 20 insertions(+), 19 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 721deee6bcc4..c354dd1aefb9 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1187,31 +1187,27 @@ static void btree_node_read_all_replicas_done(struct closure *cl) + container_of(cl, struct btree_node_read_all, cl); + struct bch_fs *c = ra->c; + struct btree *b = ra->b; +- bool have_good_copy = false; + bool dump_bset_maps = false; + bool have_retry = false; +- int ret = 0, write = READ; ++ int ret = 0, best = -1, write = READ; + unsigned i, written, written2; + __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + + for (i = 0; i < ra->nr; i++) { ++ struct btree_node *bn = ra->buf[i]; ++ + if (ra->err[i]) + continue; + +- if (!have_good_copy) { +- memcpy(b->data, ra->buf[i], btree_bytes(c)); +- have_good_copy = true; +- written = btree_node_sectors_written(c, b->data); +- } ++ if (le64_to_cpu(bn->magic) != bset_magic(c) || ++ (seq && seq != bn->keys.seq)) ++ continue; + +- /* Try to get the right btree node: */ +- if (have_good_copy && +- seq && +- b->data->keys.seq != seq && +- ((struct btree_node *) ra->buf[i])->keys.seq == seq) { +- memcpy(b->data, ra->buf[i], btree_bytes(c)); +- written = btree_node_sectors_written(c, b->data); ++ if (best < 0) { ++ best = i; ++ written = btree_node_sectors_written(c, bn); ++ continue; + } + + written2 = btree_node_sectors_written(c, ra->buf[i]); +@@ -1221,14 +1217,14 @@ static void btree_node_read_all_replicas_done(struct closure *cl) + btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), + BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "found bset signature after last bset") || +- btree_err_on(memcmp(b->data, ra->buf[i], written << 9), ++ btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), + BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "btree node replicas content mismatch")) + dump_bset_maps = true; + + if (written2 > written) { + written = written2; +- memcpy(b->data, ra->buf[i], btree_bytes(c)); ++ best = i; + } + } + fsck_err: +@@ -1281,9 +1277,14 @@ fsck_err: + } + } + +- if (have_good_copy) +- bch2_btree_node_read_done(c, NULL, b, false); +- else ++ if (best >= 0) { ++ memcpy(b->data, ra->buf[best], btree_bytes(c)); ++ ret = bch2_btree_node_read_done(c, NULL, b, false); ++ } else { ++ ret = -1; ++ } ++ ++ if (ret) + set_btree_node_read_error(b); + + for (i = 0; i < ra->nr; i++) { +-- +cgit v1.2.3 + + +From 580a1f51a85c8ab44e1bd68c00de98bb55c9c60c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 30 Jun 2021 15:44:11 -0400 +Subject: bcachefs: Use memalloc_nofs_save() in bch2_read_endio() + +This solves a problematic memory allocation in bch2_bio_uncompress() -> +vmap(). 
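Editor's note: the fix wraps the read completion path in memalloc_nofs_save()/memalloc_nofs_restore(), so any allocation made further down — including the vmap() inside decompression — implicitly behaves as GFP_NOFS and cannot recurse back into the filesystem. A minimal sketch of that scoping idiom (the wrapper function itself is hypothetical):

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/vmalloc.h>

/* Map pages while in filesystem context: everything allocated between
 * save and restore is treated as GFP_NOFS, even deep inside code we do
 * not control, such as vmap()'s internal allocations. */
static void *demo_vmap_nofs(struct page **pages, unsigned int nr_pages)
{
	unsigned nofs_flags;
	void *p;

	nofs_flags = memalloc_nofs_save();
	p = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
	memalloc_nofs_restore(nofs_flags);

	return p;
}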
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 0fe671646df0..1345befd1a09 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1810,8 +1810,11 @@ static void __bch2_read_endio(struct work_struct *work) + struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); ++ unsigned nofs_flags; + struct bch_csum csum; + ++ nofs_flags = memalloc_nofs_save(); ++ + /* Reset iterator for checksumming and copying bounced data: */ + if (rbio->bounce) { + src->bi_iter.bi_size = crc.compressed_size << 9; +@@ -1876,6 +1879,8 @@ nodecode: + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } ++out: ++ memalloc_nofs_restore(nofs_flags); + return; + csum_err: + /* +@@ -1886,7 +1891,7 @@ csum_err: + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); +- return; ++ goto out; + } + + bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, +@@ -1894,12 +1899,12 @@ csum_err: + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, crc.csum_type); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); +- return; ++ goto out; + decompression_err: + bch_err_inum_ratelimited(c, rbio->read_pos.inode, + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); +- return; ++ goto out; + } + + static void bch2_read_endio(struct bio *bio) +-- +cgit v1.2.3 + + +From a0ec5fc51d4ca8cdda26e7c9c383983f3bd0977a Mon Sep 17 00:00:00 2001 +From: Christopher James Halse Rogers +Date: Fri, 25 Jun 2021 11:45:19 +1000 +Subject: bcachefs: Fix unused variable warning when !BCACHEFS_DEBUG + +Signed-off-by: Christopher James Halse Rogers +--- + fs/bcachefs/btree_iter.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c8f503190ef5..c9a369571989 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1550,7 +1550,9 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + + static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) + { ++#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos old_pos = iter->real_pos; ++#endif + int cmp = bpos_cmp(new_pos, iter->real_pos); + unsigned l = iter->level; + +-- +cgit v1.2.3 + + +From 55f6bc59623d7fb30acdbbbf5b87d884d88f931b Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Tue, 29 Jun 2021 18:52:13 -0400 +Subject: bcachefs: ensure iter->should_be_locked is set + +Ensure that iter->should_be_locked is set to true before we +call bch2_trans_update in __bch2_dev_usrdata_drop. 
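Editor's note: as in the earlier ec.c fix, the rule is that after repositioning an iterator it must be re-traversed before the update is queued. The `a ?: b` chaining used in these hunks is the GNU C extension that yields the first nonzero (error) value; a self-contained sketch of that error-chaining idiom (the step_* functions are stand-ins for the real calls):

/* Stand-ins for bch2_btree_iter_traverse(), bch2_trans_update() and
 * bch2_trans_commit(); each returns 0 on success or a negative error. */
static int step_traverse(void) { return 0; }
static int step_update(void)   { return 0; }
static int step_commit(void)   { return 0; }

/* `x ?: y` evaluates to x if x is nonzero, otherwise y, so the chain
 * stops at the first failing step. Traversal must come first so the
 * iterator is positioned and locked before the update is queued. */
static int demo_do_update(void)
{
	return step_traverse() ?:
	       step_update()   ?:
	       step_commit();
}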
+ +Signed-off-by: Dan Robertson +--- + fs/bcachefs/migrate.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 6ebe49ba2248..91a9f584dd6d 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -73,7 +73,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + +- ret = bch2_trans_update(&trans, iter, sk.k, 0) ?: ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(&trans, iter, sk.k, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + +-- +cgit v1.2.3 + + +From d16984124656db557fbfdacde32bf0854175b18f Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Fri, 2 Jul 2021 21:22:06 -0400 +Subject: bcachefs: fix ifdef for x86_64 asm + +The implementation of prefetch_four_cachelines should use ifdef +CONFIG_X86_64 to conditionally compile x86_64 asm. + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/bset.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 1d170d8a65c8..8baada315cae 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -1193,7 +1193,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, + + static inline void prefetch_four_cachelines(void *p) + { +-#if CONFIG_X86_64 ++#ifdef CONFIG_X86_64 + asm("prefetcht0 (-127 + 64 * 0)(%0);" + "prefetcht0 (-127 + 64 * 1)(%0);" + "prefetcht0 (-127 + 64 * 2)(%0);" +-- +cgit v1.2.3 + + +From 9b9cb6deded3b2ea53f3c2eb76bb6d04e4ab05c7 Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Sun, 27 Jun 2021 20:54:34 -0400 +Subject: bcachefs: fix truncate without a size change + +Do not attempt to shortcut a truncate when the given new size is +the same as the current size. There may be blocks allocated to the +file that extend beyond the i_size. The ctime and mtime should +not be updated in this case. + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/fs-io.c | 19 ++++++++++--------- + 1 file changed, 10 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 93e1845523fa..3fca1f0c6597 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2314,16 +2314,17 @@ int bch2_truncate(struct user_namespace *mnt_userns, + int ret = 0; + + /* +- * Don't update timestamps if we're not doing anything: ++ * If the truncate call with change the size of the file, the ++ * cmtimes should be updated. If the size will not change, we ++ * do not need to update the cmtimes. 
+ */ +- if (iattr->ia_size == inode->v.i_size) +- return 0; +- +- if (!(iattr->ia_valid & ATTR_MTIME)) +- ktime_get_coarse_real_ts64(&iattr->ia_mtime); +- if (!(iattr->ia_valid & ATTR_CTIME)) +- ktime_get_coarse_real_ts64(&iattr->ia_ctime); +- iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; ++ if (iattr->ia_size != inode->v.i_size) { ++ if (!(iattr->ia_valid & ATTR_MTIME)) ++ ktime_get_coarse_real_ts64(&iattr->ia_mtime); ++ if (!(iattr->ia_valid & ATTR_CTIME)) ++ ktime_get_coarse_real_ts64(&iattr->ia_ctime); ++ iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; ++ } + + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); +-- +cgit v1.2.3 + + +From 7747799f9774b07a3aafa84c5fb1ff382ccc5d58 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 24 Jun 2021 13:19:25 -0400 +Subject: bcachefs: Fix shift-by-64 in bch2_bkey_format_validate() + +We need to ensure that packed formats can't represent fields larger than +the unpacked format, which is a bit tricky since the calculations can +also overflow a u64. This patch fixes a shift and simplifies the overall +calculations. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +index 0053f32c0076..946dd27f09fc 100644 +--- a/fs/bcachefs/bkey.c ++++ b/fs/bcachefs/bkey.c +@@ -620,22 +620,22 @@ const char *bch2_bkey_format_validate(struct bkey_format *f) + if (f->nr_fields != BKEY_NR_FIELDS) + return "incorrect number of fields"; + ++ /* ++ * Verify that the packed format can't represent fields larger than the ++ * unpacked format: ++ */ + for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; +- u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 packed_max = f->bits_per_field[i] ++ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) ++ : 0; + u64 field_offset = le64_to_cpu(f->field_offset[i]); + +- if (f->bits_per_field[i] > unpacked_bits) ++ if (packed_max + field_offset < packed_max || ++ packed_max + field_offset > unpacked_max) + return "field too large"; + +- if ((f->bits_per_field[i] == unpacked_bits) && field_offset) +- return "offset + bits overflow"; +- +- if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & +- unpacked_mask) < +- field_offset) +- return "offset + bits overflow"; +- + bits += f->bits_per_field[i]; + } + +-- +cgit v1.2.3 + + +From 3f885d9fc6759a70828035bb73defd2c6b0a34c8 Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Wed, 23 Jun 2021 19:25:00 -0400 +Subject: bcachefs: statfs bfree and bavail should be the same + +The value of f_bfree and f_bavail should be the same. The value of +f_bfree is not currently scaled by the availability factor. 
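Editor's note: after this change both f_bfree and f_bavail come from the same availability-scaled figure. A hedged sketch of the resulting statfs fill-in (demo_avail_factor() is a stand-in for bcachefs' internal reserve scaling, not its real formula):

#include <linux/types.h>
#include <linux/statfs.h>

/* Stand-in: assume some fixed fraction of free space is reserved. */
static u64 demo_avail_factor(u64 sectors)
{
	return sectors - sectors / 16;
}

/* Report free space already scaled by the internal reserve, and make
 * f_bfree and f_bavail identical by construction. */
static void demo_fill_statfs(struct kstatfs *buf, u64 capacity, u64 free,
			     unsigned shift)
{
	buf->f_blocks = capacity >> shift;
	buf->f_bfree  = demo_avail_factor(free) >> shift;
	buf->f_bavail = buf->f_bfree;
}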
+ +Signed-off-by: Dan Robertson +--- + fs/bcachefs/fs.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 20907e554dd3..2ae1aed69445 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1276,8 +1276,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) + buf->f_type = BCACHEFS_STATFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = usage.capacity >> shift; +- buf->f_bfree = usage.free >> shift; +- buf->f_bavail = avail_factor(usage.free) >> shift; ++ buf->f_bfree = avail_factor(usage.free) >> shift; ++ buf->f_bavail = buf->f_bfree; + + buf->f_files = usage.nr_inodes + avail_inodes; + buf->f_ffree = avail_inodes; +-- +cgit v1.2.3 + + +From 378e601c993cfed611da03618d21c3b22468e0eb Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Wed, 23 Jun 2021 21:52:41 -0400 +Subject: bcachefs: Fix bch2_acl_chmod() cleanup on error + +Avoid calling kfree on the returned error pointer if +bch2_acl_from_disk fails. + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/acl.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index 5c365f527dbd..e8d0eb92c782 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -373,7 +373,7 @@ int bch2_acl_chmod(struct btree_trans *trans, + acl = bch2_acl_from_disk(xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + ret = PTR_ERR_OR_ZERO(acl); +- if (ret || !acl) ++ if (IS_ERR_OR_NULL(acl)) + goto err; + + ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); +@@ -392,7 +392,8 @@ int bch2_acl_chmod(struct btree_trans *trans, + acl = NULL; + err: + bch2_trans_iter_put(trans, iter); +- kfree(acl); ++ if (!IS_ERR_OR_NULL(acl)) ++ kfree(acl); + return ret; + } + +-- +cgit v1.2.3 + + +From b011f34f0c801c3ce4f0c167b59de4bedc2944d3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 3 Jul 2021 23:57:09 -0400 +Subject: bcachefs: Fix bch2_btree_iter_peek_prev() + +In !BTREE_ITER_IS_EXTENTS mode, we shouldn't be looking at k->size, i.e. +we shouldn't use bkey_start_pos(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c9a369571989..4e3c24266dfb 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1771,7 +1771,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 +- : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)) ++ : bkey_cmp(k.k->p, iter->pos) > 0)) + k = btree_iter_level_prev(iter, l); + + if (likely(k.k)) +-- +cgit v1.2.3 + + +From 821112e81aa734f0f7023ef766f04b995a404af6 Mon Sep 17 00:00:00 2001 +From: Tobias Geerinckx-Rice +Date: Sun, 4 Jul 2021 21:35:32 +0200 +Subject: bcachefs: Enforce SYS_CAP_ADMIN within ioctls + +bch2_fs_ioctl() didn't distinguish between unsupported ioctls and those +which the current user is unauthorised to perform. That kept the code +simple but meant that, for example, an unprivileged TIOCGWINSZ ioctl on +a bcachefs file would return -EPERM instead of the expected -ENOTTY. +The same call made by a privileged user would correctly return -ENOTTY. + +Fix this discrepancy by moving the check for CAP_SYS_ADMIN into each +privileged ioctl function. 
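Editor's note: the restructuring below moves capable(CAP_SYS_ADMIN) into each privileged handler, so the dispatcher's default case can return -ENOTTY for unknown ioctls regardless of privilege. A hypothetical, minimal sketch of that dispatch shape (demo_* names invented):

#include <linux/capability.h>
#include <linux/errno.h>

/* Privileged operation: the capability check lives in the handler. */
static long demo_ioctl_privileged(void)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	/* ... privileged work ... */
	return 0;
}

/* Unprivileged query: no capability check at all. */
static long demo_ioctl_query(void)
{
	return 0;
}

/* The dispatcher never gates on CAP_SYS_ADMIN itself, so an unsupported
 * command is reported as -ENOTTY (unknown ioctl) rather than -EPERM,
 * even for unprivileged callers. */
static long demo_dispatch(unsigned int cmd)
{
	switch (cmd) {
	case 1:  return demo_ioctl_query();
	case 2:  return demo_ioctl_privileged();
	default: return -ENOTTY;
	}
}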
+ +Signed-off-by: Tobias Geerinckx-Rice +--- + fs/bcachefs/chardev.c | 44 ++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 36 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index c29f8272e682..9ac34cc36c07 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -157,6 +157,9 @@ static long bch2_ioctl_query_uuid(struct bch_fs *c, + #if 0 + static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) + { ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + if (arg.flags || arg.pad) + return -EINVAL; + +@@ -165,6 +168,9 @@ static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) + + static long bch2_ioctl_stop(struct bch_fs *c) + { ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + bch2_fs_stop(c); + return 0; + } +@@ -175,6 +181,9 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) + char *path; + int ret; + ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + if (arg.flags || arg.pad) + return -EINVAL; + +@@ -192,6 +201,9 @@ static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) + { + struct bch_dev *ca; + ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| +@@ -211,6 +223,9 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) + char *path; + int ret; + ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + if (arg.flags || arg.pad) + return -EINVAL; + +@@ -228,6 +243,9 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) + struct bch_dev *ca; + int ret; + ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| +@@ -250,6 +268,9 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, + struct bch_dev *ca; + int ret; + ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| +@@ -331,6 +352,9 @@ static long bch2_ioctl_data(struct bch_fs *c, + unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; + int ret, fd = -1; + ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + if (arg.op >= BCH_DATA_OP_NR || arg.flags) + return -EINVAL; + +@@ -497,6 +521,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c, + struct bch_sb *sb; + int ret = 0; + ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || + arg.pad) + return -EINVAL; +@@ -537,6 +564,9 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, + struct bch_dev *ca; + unsigned i; + ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + for_each_online_member(ca, c, i) + if (ca->disk_sb.bdev->bd_dev == dev) { + percpu_ref_put(&ca->io_ref); +@@ -552,6 +582,9 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, + struct bch_dev *ca; + int ret; + ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; +@@ -572,6 +605,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, + struct bch_dev *ca; + int ret; + ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; +@@ -597,7 +633,6 @@ do { \ + + long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) + { +- /* ioctls that don't require admin cap: */ + switch (cmd) { + case 
BCH_IOCTL_QUERY_UUID: + return bch2_ioctl_query_uuid(c, arg); +@@ -605,12 +640,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) + return bch2_ioctl_fs_usage(c, arg); + case BCH_IOCTL_DEV_USAGE: + return bch2_ioctl_dev_usage(c, arg); +- } +- +- if (!capable(CAP_SYS_ADMIN)) +- return -EPERM; +- +- switch (cmd) { + #if 0 + case BCH_IOCTL_START: + BCH_IOCTL(start, struct bch_ioctl_start); +@@ -626,7 +655,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + +- /* ioctls that do require admin cap: */ + switch (cmd) { + case BCH_IOCTL_DISK_ADD: + BCH_IOCTL(disk_add, struct bch_ioctl_disk); +-- +cgit v1.2.3 + + +From c076782bf6e25810a95fabd0ebcf09b1520f47c5 Mon Sep 17 00:00:00 2001 +From: jpsollie +Date: Thu, 17 Jun 2021 11:29:59 +0200 +Subject: bcachefs: Prepare checksums for more advanced algorithms + +Perform abstraction of hash calculation for advanced checksum algorithms. +Algorithms like xxhash do not store their state as a u64 int. + +Signed-off-by: jpsollie +--- + fs/bcachefs/checksum.c | 95 +++++++++++++++++++++++++++++++------------------- + 1 file changed, 59 insertions(+), 36 deletions(-) + +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +index 3d88719ba86c..6c23a9073dbf 100644 +--- a/fs/bcachefs/checksum.c ++++ b/fs/bcachefs/checksum.c +@@ -16,53 +16,68 @@ + #include + #include + +-static u64 bch2_checksum_init(unsigned type) ++/* ++ * bch2_checksum state is an abstraction of the checksum state calculated over different pages. ++ * it features page merging without having the checksum algorithm lose its state. ++ * for native checksum aglorithms (like crc), a default seed value will do. ++ * for hash-like algorithms, a state needs to be stored ++ */ ++ ++struct bch2_checksum_state { ++ union { ++ u64 seed; ++ }; ++ unsigned int type; ++}; ++ ++static void bch2_checksum_init(struct bch2_checksum_state *state) + { +- switch (type) { ++ switch (state->type) { + case BCH_CSUM_NONE: +- return 0; +- case BCH_CSUM_CRC32C_NONZERO: +- return U32_MAX; +- case BCH_CSUM_CRC64_NONZERO: +- return U64_MAX; + case BCH_CSUM_CRC32C: +- return 0; + case BCH_CSUM_CRC64: +- return 0; ++ state->seed = 0; ++ break; ++ case BCH_CSUM_CRC32C_NONZERO: ++ state->seed = U32_MAX; ++ break; ++ case BCH_CSUM_CRC64_NONZERO: ++ state->seed = U64_MAX; ++ break; + default: + BUG(); + } + } + +-static u64 bch2_checksum_final(unsigned type, u64 crc) ++static u64 bch2_checksum_final(const struct bch2_checksum_state *state) + { +- switch (type) { ++ switch (state->type) { + case BCH_CSUM_NONE: +- return 0; +- case BCH_CSUM_CRC32C_NONZERO: +- return crc ^ U32_MAX; +- case BCH_CSUM_CRC64_NONZERO: +- return crc ^ U64_MAX; + case BCH_CSUM_CRC32C: +- return crc; + case BCH_CSUM_CRC64: +- return crc; ++ return state->seed; ++ case BCH_CSUM_CRC32C_NONZERO: ++ return state->seed ^ U32_MAX; ++ case BCH_CSUM_CRC64_NONZERO: ++ return state->seed ^ U64_MAX; + default: + BUG(); + } + } + +-static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) ++static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) + { +- switch (type) { ++ switch (state->type) { + case BCH_CSUM_NONE: +- return 0; ++ return; + case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_CRC32C: +- return crc32c(crc, data, len); ++ state->seed = crc32c(state->seed, data, len); ++ break; + case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_CRC64: +- return crc64_be(crc, data, len); ++ state->seed = 
crc64_be(state->seed, data, len); ++ break; + default: + BUG(); + } +@@ -141,12 +156,14 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, + case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: { +- u64 crc = bch2_checksum_init(type); ++ struct bch2_checksum_state state; + +- crc = bch2_checksum_update(type, crc, data, len); +- crc = bch2_checksum_final(type, crc); ++ state.type = type; + +- return (struct bch_csum) { .lo = cpu_to_le64(crc) }; ++ bch2_checksum_init(&state); ++ bch2_checksum_update(&state, data, len); ++ ++ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; + } + + case BCH_CSUM_CHACHA20_POLY1305_80: +@@ -190,23 +207,23 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, + case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: { +- u64 crc = bch2_checksum_init(type); ++ struct bch2_checksum_state state; ++ ++ state.type = type; ++ bch2_checksum_init(&state); + + #ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { + void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; +- crc = bch2_checksum_update(type, +- crc, p, bv.bv_len); ++ bch2_checksum_update(&state, p, bv.bv_len); + kunmap_atomic(p); + } + #else + __bio_for_each_bvec(bv, bio, *iter, *iter) +- crc = bch2_checksum_update(type, crc, +- page_address(bv.bv_page) + bv.bv_offset, ++ bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); + #endif +- crc = bch2_checksum_final(type, crc); +- return (struct bch_csum) { .lo = cpu_to_le64(crc) }; ++ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; + } + + case BCH_CSUM_CHACHA20_POLY1305_80: +@@ -284,16 +301,22 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, + struct bch_csum b, size_t b_len) + { ++ struct bch2_checksum_state state; ++ ++ state.type = type; ++ bch2_checksum_init(&state); ++ state.seed = a.lo; ++ + BUG_ON(!bch2_checksum_mergeable(type)); + + while (b_len) { + unsigned b = min_t(unsigned, b_len, PAGE_SIZE); + +- a.lo = bch2_checksum_update(type, a.lo, ++ bch2_checksum_update(&state, + page_address(ZERO_PAGE(0)), b); + b_len -= b; + } +- ++ a.lo = bch2_checksum_final(&state); + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; +-- +cgit v1.2.3 + + +From c6104a683adbcc9bd9e9e4d2d61dee20b74b2c74 Mon Sep 17 00:00:00 2001 +From: jpsollie +Date: Thu, 17 Jun 2021 13:42:09 +0200 +Subject: bcachefs: add bcachefs xxhash support + +xxhash is a much faster algorithm compared to crc32. +could be used to speed up checksum calculation. +xxhash 64-bit only, as it is much faster on 64-bit CPUs compared to xxh32. 
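Editor's note: the new checksum type is built on the kernel's streaming xxh64 API (<linux/xxhash.h>), which is what lets the bch2_checksum_state wrapper carry state across pages. A minimal sketch of that API in isolation (the helper name is invented):

#include <linux/types.h>
#include <linux/xxhash.h>

/* Stream two buffers through one xxh64 state and return the digest;
 * seed 0 matches what the patch uses in bch2_checksum_init(). */
static u64 demo_xxh64_two_buffers(const void *a, size_t a_len,
				  const void *b, size_t b_len)
{
	struct xxh64_state state;

	xxh64_reset(&state, 0);
	xxh64_update(&state, a, a_len);
	xxh64_update(&state, b, b_len);

	return xxh64_digest(&state);
}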
+ +Signed-off-by: jpsollie +--- + fs/bcachefs/Kconfig | 1 + + fs/bcachefs/bcachefs_format.h | 7 +++++-- + fs/bcachefs/checksum.c | 12 ++++++++++++ + fs/bcachefs/checksum.h | 2 ++ + 4 files changed, 20 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +index 57c5d58c2d87..27742ce276cd 100644 +--- a/fs/bcachefs/Kconfig ++++ b/fs/bcachefs/Kconfig +@@ -20,6 +20,7 @@ config BCACHEFS_FS + select SIXLOCKS + select RAID6_PQ + select XOR_BLOCKS ++ select XXHASH + select SRCU + help + The bcachefs filesystem - a modern, copy on write filesystem, with +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 79c0876aab8b..633fe71fcc8f 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1456,7 +1456,8 @@ enum bch_csum_type { + BCH_CSUM_CHACHA20_POLY1305_128 = 4, + BCH_CSUM_CRC32C = 5, + BCH_CSUM_CRC64 = 6, +- BCH_CSUM_NR = 7, ++ BCH_CSUM_XXHASH = 7, ++ BCH_CSUM_NR = 8, + }; + + static const unsigned bch_crc_bytes[] = { +@@ -1465,6 +1466,7 @@ static const unsigned bch_crc_bytes[] = { + [BCH_CSUM_CRC32C] = 4, + [BCH_CSUM_CRC64_NONZERO] = 8, + [BCH_CSUM_CRC64] = 8, ++ [BCH_CSUM_XXHASH] = 8, + [BCH_CSUM_CHACHA20_POLY1305_80] = 10, + [BCH_CSUM_CHACHA20_POLY1305_128] = 16, + }; +@@ -1483,7 +1485,8 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) + #define BCH_CSUM_OPTS() \ + x(none, 0) \ + x(crc32c, 1) \ +- x(crc64, 2) ++ x(crc64, 2) \ ++ x(xxhash, 3) + + enum bch_csum_opts { + #define x(t, n) BCH_CSUM_OPT_##t = n, +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +index 6c23a9073dbf..d20924e579bf 100644 +--- a/fs/bcachefs/checksum.c ++++ b/fs/bcachefs/checksum.c +@@ -6,6 +6,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -26,6 +27,7 @@ + struct bch2_checksum_state { + union { + u64 seed; ++ struct xxh64_state h64state; + }; + unsigned int type; + }; +@@ -44,6 +46,9 @@ static void bch2_checksum_init(struct bch2_checksum_state *state) + case BCH_CSUM_CRC64_NONZERO: + state->seed = U64_MAX; + break; ++ case BCH_CSUM_XXHASH: ++ xxh64_reset(&state->h64state, 0); ++ break; + default: + BUG(); + } +@@ -60,6 +65,8 @@ static u64 bch2_checksum_final(const struct bch2_checksum_state *state) + return state->seed ^ U32_MAX; + case BCH_CSUM_CRC64_NONZERO: + return state->seed ^ U64_MAX; ++ case BCH_CSUM_XXHASH: ++ return xxh64_digest(&state->h64state); + default: + BUG(); + } +@@ -78,6 +85,9 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * + case BCH_CSUM_CRC64: + state->seed = crc64_be(state->seed, data, len); + break; ++ case BCH_CSUM_XXHASH: ++ xxh64_update(&state->h64state, data, len); ++ break; + default: + BUG(); + } +@@ -155,6 +165,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, + case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_CRC32C: ++ case BCH_CSUM_XXHASH: + case BCH_CSUM_CRC64: { + struct bch2_checksum_state state; + +@@ -206,6 +217,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, + case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_CRC32C: ++ case BCH_CSUM_XXHASH: + case BCH_CSUM_CRC64: { + struct bch2_checksum_state state; + +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +index 728b7ef1a149..6841fb16568a 100644 +--- a/fs/bcachefs/checksum.h ++++ b/fs/bcachefs/checksum.h +@@ -83,6 +83,8 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, + return data ? 
BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; + case BCH_CSUM_OPT_crc64: + return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; ++ case BCH_CSUM_OPT_xxhash: ++ return BCH_CSUM_XXHASH; + default: + BUG(); + } +-- +cgit v1.2.3 + + +From 2286151a00a80e2a3ed1e86c542028dc7eed5d65 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 5 Jul 2021 22:02:07 -0400 +Subject: bcachefs: Split out SPOS_MAX + +Internal btree code really wants a POS_MAX with all fields ~0; external +code more likely wants the snapshot field to be 0, because when we're +passing it to bch2_trans_get_iter() it's used for the snapshot we're +operating in, which should be 0 for most btrees that don't use +snapshots. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 3 ++- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/btree_gc.h | 2 +- + fs/bcachefs/btree_iter.c | 8 ++++---- + fs/bcachefs/btree_update_interior.c | 8 ++++---- + fs/bcachefs/debug.c | 4 ++-- + fs/bcachefs/move.c | 4 ++-- + 7 files changed, 16 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 633fe71fcc8f..94273d5161f2 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -148,7 +148,8 @@ static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) + } + + #define POS_MIN SPOS(0, 0, 0) +-#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) ++#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0) ++#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) + #define POS(_inode, _offset) SPOS(_inode, _offset, 0) + + /* Empty placeholder struct, for container_of() */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 039677a55f58..b1b31164c512 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -962,7 +962,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, + goto fsck_err; + } + +- if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, ++ if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, + "btree root with incorrect max_key: %s", + (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { + bch_err(c, "repair unimplemented"); +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index e9a87394370a..59dfb069e699 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -87,7 +87,7 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b) + */ + static inline struct gc_pos gc_pos_btree_root(enum btree_id id) + { +- return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); ++ return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); + } + + static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 4e3c24266dfb..4ba7cc923421 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1082,7 +1082,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, + } + + lock_type = __btree_lock_want(iter, iter->level); +- if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, ++ if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level, + iter, lock_type, + lock_root_check_fn, rootp, + trace_ip))) +@@ -1603,7 +1603,7 @@ out: + inline bool bch2_btree_iter_advance(struct btree_iter *iter) + { + struct bpos pos = iter->k.p; +- bool ret = bpos_cmp(pos, POS_MAX) != 0; ++ bool ret = bpos_cmp(pos, SPOS_MAX) != 0; + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); +@@ -1625,7 +1625,7 @@ inline bool 
bch2_btree_iter_rewind(struct btree_iter *iter) + static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) + { + struct bpos next_pos = iter->l[0].b->key.k.p; +- bool ret = bpos_cmp(next_pos, POS_MAX) != 0; ++ bool ret = bpos_cmp(next_pos, SPOS_MAX) != 0; + + /* + * Typically, we don't want to modify iter->pos here, since that +@@ -1635,7 +1635,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) + if (ret) + btree_iter_set_search_pos(iter, bpos_successor(next_pos)); + else +- bch2_btree_iter_set_pos(iter, POS_MAX); ++ bch2_btree_iter_set_pos(iter, SPOS_MAX); + + return ret; + } +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index cd214599a03f..d3c6b562a749 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -367,7 +367,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) + struct btree *b = bch2_btree_node_alloc(as, level); + + btree_set_min(b, POS_MIN); +- btree_set_max(b, POS_MAX); ++ btree_set_max(b, SPOS_MAX); + b->data->format = bch2_btree_calc_format(b); + + btree_node_set_format(b, b->data->format); +@@ -1590,7 +1590,7 @@ retry: + b = iter->l[level].b; + + if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || +- (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) { ++ (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { + b->sib_u64s[sib] = U16_MAX; + goto out; + } +@@ -2014,7 +2014,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) + b->c.btree_id = id; + + bkey_btree_ptr_init(&b->key); +- b->key.k.p = POS_MAX; ++ b->key.k.p = SPOS_MAX; + *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; + + bch2_bset_init_first(b, &b->data->keys); +@@ -2022,7 +2022,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) + + b->data->flags = 0; + btree_set_min(b, POS_MIN); +- btree_set_max(b, POS_MAX); ++ btree_set_max(b, SPOS_MAX); + b->data->format = bch2_btree_calc_format(b); + btree_node_set_format(b, b->data->format); + +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 4215c119e0a2..92e970bc1332 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -313,7 +313,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + if (err) + return err; + +- if (!i->size || !bpos_cmp(POS_MAX, i->from)) ++ if (!i->size || !bpos_cmp(SPOS_MAX, i->from)) + return i->ret; + + bch2_trans_init(&trans, i->c, 0, 0); +@@ -329,7 +329,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + * can't easily correctly restart a btree node traversal across + * all nodes, meh + */ +- i->from = bpos_cmp(POS_MAX, b->key.k.p) ++ i->from = bpos_cmp(SPOS_MAX, b->key.k.p) + ? 
bpos_successor(b->key.k.p) + : b->key.k.p; + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 91be50812a38..c15e3145348a 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -769,7 +769,7 @@ static int bch2_move_btree(struct bch_fs *c, + break; + + if ((cmp_int(id, end_btree_id) ?: +- bkey_cmp(b->key.k.p, end_pos)) > 0) ++ bpos_cmp(b->key.k.p, end_pos)) > 0) + break; + + stats->pos = iter->pos; +@@ -921,7 +921,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) + + ret = bch2_move_btree(c, + 0, POS_MIN, +- BTREE_ID_NR, POS_MAX, ++ BTREE_ID_NR, SPOS_MAX, + rewrite_old_nodes_pred, c, stats); + if (!ret) { + mutex_lock(&c->sb_lock); +-- +cgit v1.2.3 + + +From 5fcd8610d506fff7c6ce68f0b8047a7a73824505 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 5 Jul 2021 22:08:28 -0400 +Subject: bcachefs: Fix bch2_btree_iter_peek_slot() assertion + +This assertion is checking that what the iterator points to is +consistent with iter->real_pos, and since it's an internal btree +ordering property it should be using bpos_cmp. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 4ba7cc923421..f47c9912f2b8 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1851,7 +1851,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + switch (btree_iter_type(iter)) { + case BTREE_ITER_KEYS: + k = btree_iter_level_peek_all(iter, &iter->l[0]); +- EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); ++ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0); + break; + case BTREE_ITER_CACHED: + ck = (void *) iter->l[0].b; +-- +cgit v1.2.3 + + +From 1a0e1f53d371050b799a74e8edacbdd5c6d87ea0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 5 Jul 2021 22:18:07 -0400 +Subject: bcachefs: bch2_d_types[] + +Add readable names for d_type, and use it in dirent_to_text(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 2 +- + fs/bcachefs/opts.c | 12 ++++++++++++ + fs/bcachefs/opts.h | 1 + + 3 files changed, 14 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 3bf6379cefe6..d5883ab7de21 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -112,7 +112,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + + bch_scnmemcpy(out, d.v->d_name, + bch2_dirent_name_bytes(d)); +- pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); ++ pr_buf(out, " -> %llu type %s", d.v->d_inum, bch2_d_types[d.v->d_type]); + } + + static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index 64bf5a382d63..fd3f7cddb9ab 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -63,6 +63,18 @@ const char * const bch2_member_states[] = { + + #undef x + ++const char * const bch2_d_types[] = { ++ [DT_UNKNOWN] = "unknown", ++ [DT_FIFO] = "fifo", ++ [DT_CHR] = "chr", ++ [DT_DIR] = "dir", ++ [DT_BLK] = "blk", ++ [DT_REG] = "reg", ++ [DT_LNK] = "lnk", ++ [DT_SOCK] = "sock", ++ [DT_WHT] = "whiteout", ++}; ++ + void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) + { + #define x(_name, ...) 
\ +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 1e2fc5de5ca4..c331535b0063 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -18,6 +18,7 @@ extern const char * const bch2_str_hash_types[]; + extern const char * const bch2_data_types[]; + extern const char * const bch2_cache_replacement_policies[]; + extern const char * const bch2_member_states[]; ++extern const char * const bch2_d_types[]; + + /* + * Mount options; we also store defaults in the superblock. +-- +cgit v1.2.3 + + +From f6bbe2945f3df37b2a9e3dba59c362fb3e8edc71 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 5 Jul 2021 22:16:02 -0400 +Subject: bcachefs: BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE + +Add a new flag to control assertions about updating to internal snapshot +nodes, that normally should not be written to - to be used in an +upcoming patch. + +Also do some renaming - trigger_flags is now update_flags. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 4 +++- + fs/bcachefs/btree_types.h | 8 ++++++-- + fs/bcachefs/btree_update.h | 2 +- + fs/bcachefs/btree_update_leaf.c | 29 +++++++++++++++-------------- + 4 files changed, 25 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index dfaf5e6df917..0f8ff4aa76e5 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -397,7 +397,9 @@ retry: + * to be using alloc reserves: + * */ + ret = bch2_btree_iter_traverse(b_iter) ?: +- bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_update(trans, b_iter, ck->k, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOCHECK_RW| +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index ec5195daead4..a2581500b791 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -340,7 +340,7 @@ struct bkey_cached { + }; + + struct btree_insert_entry { +- unsigned trigger_flags; ++ unsigned flags; + u8 bkey_type; + enum btree_id btree_id:8; + u8 level; +@@ -639,7 +639,9 @@ static inline bool btree_type_has_snapshots(enum btree_id id) + return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; + } + +-enum btree_trigger_flags { ++enum btree_update_flags { ++ __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, ++ + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + + __BTREE_TRIGGER_INSERT, +@@ -650,6 +652,8 @@ enum btree_trigger_flags { + __BTREE_TRIGGER_NOATOMIC, + }; + ++#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ++ + #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) + + #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index cbfc8544def4..1c085a28b832 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -77,7 +77,7 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, + struct btree *, struct bkey_i *); + + int bch2_trans_update(struct btree_trans *, struct btree_iter *, +- struct bkey_i *, enum btree_trigger_flags); ++ struct bkey_i *, enum btree_update_flags); + void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); + int __bch2_trans_commit(struct btree_trans *); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index fa08470b6ca3..90ae7b380753 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c 
+@@ -363,7 +363,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i->iter, i->k, +- i->trigger_flags|BTREE_TRIGGER_GC); ++ i->flags|BTREE_TRIGGER_GC); + } + } + +@@ -468,7 +468,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) + bch2_mark_update(trans, i->iter, i->k, +- i->trigger_flags); ++ i->flags); + + if (marking && trans->fs_usage_deltas) + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); +@@ -798,8 +798,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, + + if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) { + update_iter = bch2_trans_copy_iter(trans, iter); +- ret = bch2_btree_delete_at(trans, update_iter, +- i->trigger_flags); ++ ret = bch2_btree_delete_at(trans, update_iter, i->flags); + bch2_trans_iter_put(trans, update_iter); + + if (ret) +@@ -840,14 +839,16 @@ static int extent_handle_overwrites(struct btree_trans *trans, + if (ret) + goto out; + +- bch2_trans_update(trans, update_iter, update, i->trigger_flags); ++ bch2_trans_update(trans, update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ i->flags); + bch2_trans_iter_put(trans, update_iter); + } + + if (bkey_cmp(k.k->p, i->k->k.p) <= 0) { + update_iter = bch2_trans_copy_iter(trans, iter); + ret = bch2_btree_delete_at(trans, update_iter, +- i->trigger_flags); ++ i->flags); + bch2_trans_iter_put(trans, update_iter); + + if (ret) +@@ -862,7 +863,7 @@ static int extent_handle_overwrites(struct btree_trans *trans, + bkey_reassemble(update, k); + bch2_cut_front(i->k->k.p, update); + +- bch2_trans_update(trans, iter, update, i->trigger_flags); ++ bch2_trans_update(trans, iter, update, i->flags); + goto out; + } + next: +@@ -907,7 +908,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + #ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && +- !(i->trigger_flags & BTREE_TRIGGER_NORUN)) ++ !(i->flags & BTREE_TRIGGER_NORUN)) + bch2_btree_key_cache_verify_clean(trans, + i->btree_id, i->k->k.p); + #endif +@@ -925,8 +926,8 @@ int __bch2_trans_commit(struct btree_trans *trans) + i->trans_triggers_run = true; + trans_trigger_run = true; + +- ret = bch2_trans_mark_update(trans, i->iter, i->k, +- i->trigger_flags); ++ ret = bch2_trans_mark_update(trans, i->iter, ++ i->k, i->flags); + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip, _RET_IP_, +@@ -1009,10 +1010,10 @@ err: + } + + int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, +- struct bkey_i *k, enum btree_trigger_flags flags) ++ struct bkey_i *k, enum btree_update_flags flags) + { + struct btree_insert_entry *i, n = (struct btree_insert_entry) { +- .trigger_flags = flags, ++ .flags = flags, + .bkey_type = __btree_node_type(iter->level, iter->btree_id), + .btree_id = iter->btree_id, + .level = iter->level, +@@ -1110,7 +1111,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, + } + + int bch2_btree_delete_at(struct btree_trans *trans, +- struct btree_iter *iter, unsigned trigger_flags) ++ struct btree_iter *iter, unsigned update_flags) + { + struct bkey_i *k; + +@@ -1120,7 +1121,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, + + bkey_init(&k->k); + k->k.p = iter->pos; +- return bch2_trans_update(trans, iter, k, trigger_flags); ++ return bch2_trans_update(trans, iter, k, 
update_flags); + } + + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, +-- +cgit v1.2.3 + + +From 00fdfb6bb0067f3cab635b7debbcb7a284bc542c Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Thu, 8 Jul 2021 18:15:38 -0400 +Subject: bcachefs: set disk state should check new_state + +A new device state that is not a valid state should return -EINVAL +in the disk set state ioctl. + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/chardev.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index 9ac34cc36c07..db68a78276cf 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -275,7 +275,8 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| + BCH_BY_INDEX)) || +- arg.pad[0] || arg.pad[1] || arg.pad[2]) ++ arg.pad[0] || arg.pad[1] || arg.pad[2] || ++ arg.new_state >= BCH_MEMBER_STATE_NR) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); +-- +cgit v1.2.3 + + +From 53fd8a48d1e89efaa8a0c786df86079936eba175 Mon Sep 17 00:00:00 2001 +From: Dan Robertson +Date: Wed, 7 Jul 2021 22:31:36 -0400 +Subject: bcachefs: docs: add docs for bch2_trans_reset + +Add basic kernel docs for bch2_trans_reset and bch2_trans_begin. + +Signed-off-by: Dan Robertson +--- + fs/bcachefs/btree_iter.c | 16 ++++++++++++++++ + fs/bcachefs/btree_iter.h | 7 +++++++ + 2 files changed, 23 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f47c9912f2b8..5a810fdaeec1 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2301,6 +2301,22 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans) + } + } + ++/** ++ * bch2_trans_reset() - reset a transaction after a interrupted attempt ++ * @trans: transaction to reset ++ * @flags: transaction reset flags. ++ * ++ * While iterating over nodes or updating nodes a attempt to lock a btree ++ * node may return EINTR when the trylock fails. When this occurs ++ * bch2_trans_reset() or bch2_trans_begin() should be called and the ++ * transaction retried. ++ * ++ * Transaction reset flags include: ++ * ++ * - TRANS_RESET_NOUNLOCK - Do not attempt to unlock and reschedule the ++ * transaction. ++ * - TRANS_RESET_NOTRAVERSE - Do not traverse all linked iters. ++ */ + void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + { + struct btree_iter *iter; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 6efea281d87f..31175cf00c0a 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -319,6 +319,13 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr + + void bch2_trans_reset(struct btree_trans *, unsigned); + ++/** ++ * bch2_trans_begin() - ensure lock consistency of transaction on retry ++ * @trans: transaction to prepare ++ * ++ * Ensure lock ordering is correct before potentially retrying a transaction ++ * after a failed trylock. ++ */ + static inline void bch2_trans_begin(struct btree_trans *trans) + { + return bch2_trans_reset(trans, 0); +-- +cgit v1.2.3 + + +From df142aeccf854f5d0692feba2ddb25997614f5e9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 10 Jul 2021 23:22:06 -0400 +Subject: bcachefs: Regularize argument passing of btree_trans + +btree_trans should always be passed when we have one - iter->trans is +disfavoured. This mainly updates old code in btree_update_interior.c, +some of which predates btree_trans. 
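For illustration, the convention change amounts to the following (a minimal sketch; both prototypes and the caller are taken from the hunks in this patch, nothing here is new API):

    /* old convention: pass bch_fs, the callee reaches the transaction via iter->trans */
    ret = bch2_btree_node_rewrite(c, iter, b->data->keys.seq, 0);

    /* new convention: pass the btree_trans explicitly; the callee derives the
     * fs pointer itself with: struct bch_fs *c = trans->c;
     */
    ret = bch2_btree_node_rewrite(&trans, iter, b->data->keys.seq, 0);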
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 4 +-- + fs/bcachefs/btree_io.c | 8 +++-- + fs/bcachefs/btree_io.h | 4 +-- + fs/bcachefs/btree_update.h | 8 ++--- + fs/bcachefs/btree_update_interior.c | 60 +++++++++++++++++++++++-------------- + fs/bcachefs/btree_update_interior.h | 25 +++++++--------- + fs/bcachefs/btree_update_leaf.c | 17 ++++++----- + fs/bcachefs/migrate.c | 2 +- + fs/bcachefs/move.c | 2 +- + 9 files changed, 73 insertions(+), 57 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index b1b31164c512..3dd1094d10c9 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -800,13 +800,13 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + + if (!initial) { + if (max_stale > 64) +- bch2_btree_node_rewrite(c, iter, ++ bch2_btree_node_rewrite(&trans, iter, + b->data->keys.seq, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + else if (!bch2_btree_gc_rewrite_disabled && + (bch2_btree_gc_always_rewrite || max_stale > 16)) +- bch2_btree_node_rewrite(c, iter, ++ bch2_btree_node_rewrite(&trans, iter, + b->data->keys.seq, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c354dd1aefb9..98fcd6a9f97a 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -420,9 +420,11 @@ void bch2_btree_build_aux_trees(struct btree *b) + * + * Returns true if we sorted (i.e. invalidated iterators + */ +-void bch2_btree_init_next(struct bch_fs *c, struct btree *b, +- struct btree_iter *iter) ++void bch2_btree_init_next(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b) + { ++ struct bch_fs *c = trans->c; + struct btree_node_entry *bne; + bool reinit_iter = false; + +@@ -1561,7 +1563,7 @@ retry: + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k))) + goto err; + +- ret = bch2_btree_node_update_key(c, iter, b, k.k); ++ ret = bch2_btree_node_update_key(&trans, iter, b, k.k); + if (ret == -EINTR) + goto retry; + if (ret) +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index abbc4675964a..fae67622c127 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -138,8 +138,8 @@ void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); + void bch2_btree_node_drop_keys_outside_node(struct btree *); + + void bch2_btree_build_aux_trees(struct btree *); +-void bch2_btree_init_next(struct bch_fs *, struct btree *, +- struct btree_iter *); ++void bch2_btree_init_next(struct btree_trans *, struct btree_iter *, ++ struct btree *); + + int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, + struct btree *, bool); +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 1c085a28b832..12065bba82dd 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -8,8 +8,8 @@ + struct bch_fs; + struct btree; + +-void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, +- struct btree_iter *); ++void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_iter *, ++ struct btree *); + bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bkey_i *); + void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); +@@ -70,10 +70,10 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + struct bpos, struct bpos, u64 *); + +-int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, 
++int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + __le64, unsigned); + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); +-int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, ++int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + struct btree *, struct bkey_i *); + + int bch2_trans_update(struct btree_trans *, struct btree_iter *, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index d3c6b562a749..191e5d306700 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -22,6 +22,10 @@ + #include + #include + ++static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, ++ struct btree_iter *, struct btree *, ++ struct keylist *, unsigned); ++ + /* Debug code: */ + + /* +@@ -1355,8 +1359,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + btree_node_interior_verify(as->c, b); + } + +-static void btree_split(struct btree_update *as, struct btree *b, +- struct btree_iter *iter, struct keylist *keys, ++static void btree_split(struct btree_update *as, ++ struct btree_trans *trans, struct btree_iter *iter, ++ struct btree *b, struct keylist *keys, + unsigned flags) + { + struct bch_fs *c = as->c; +@@ -1422,7 +1427,7 @@ static void btree_split(struct btree_update *as, struct btree *b, + + if (parent) { + /* Split a non root node */ +- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); + } else if (n3) { + bch2_btree_set_root(as, n3, iter); + } else { +@@ -1460,7 +1465,7 @@ static void btree_split(struct btree_update *as, struct btree *b, + six_unlock_intent(&n2->c.lock); + six_unlock_intent(&n1->c.lock); + +- bch2_btree_trans_verify_locks(iter->trans); ++ bch2_btree_trans_verify_locks(trans); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], + start_time); +@@ -1494,9 +1499,10 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + * If a split occurred, this function will return early. This can only happen + * for leaf nodes -- inserts into interior nodes have to be atomic. 
+ */ +-void bch2_btree_insert_node(struct btree_update *as, struct btree *b, +- struct btree_iter *iter, struct keylist *keys, +- unsigned flags) ++static void bch2_btree_insert_node(struct btree_update *as, ++ struct btree_trans *trans, struct btree_iter *iter, ++ struct btree *b, struct keylist *keys, ++ unsigned flags) + { + struct bch_fs *c = as->c; + int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); +@@ -1509,7 +1515,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + +- bch2_btree_node_lock_for_insert(c, b, iter); ++ bch2_btree_node_lock_for_insert(trans, iter, b); + + if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + bch2_btree_node_unlock_write(b, iter); +@@ -1537,12 +1543,14 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + btree_node_interior_verify(c, b); + return; + split: +- btree_split(as, b, iter, keys, flags); ++ btree_split(as, trans, iter, b, keys, flags); + } + +-int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, ++int bch2_btree_split_leaf(struct btree_trans *trans, ++ struct btree_iter *iter, + unsigned flags) + { ++ struct bch_fs *c = trans->c; + struct btree *b = iter_l(iter)->b; + struct btree_update *as; + unsigned l; +@@ -1553,22 +1561,22 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + if (IS_ERR(as)) + return PTR_ERR(as); + +- btree_split(as, b, iter, NULL, flags); ++ btree_split(as, trans, iter, b, NULL, flags); + bch2_btree_update_done(as); + + for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) +- ret = bch2_foreground_maybe_merge(c, iter, l, flags); ++ ret = bch2_foreground_maybe_merge(trans, iter, l, flags); + + return ret; + } + +-int __bch2_foreground_maybe_merge(struct bch_fs *c, ++int __bch2_foreground_maybe_merge(struct btree_trans *trans, + struct btree_iter *iter, + unsigned level, + unsigned flags, + enum btree_node_sibling sib) + { +- struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; + struct btree_iter *sib_iter = NULL; + struct btree_update *as; + struct bkey_format_state new_s; +@@ -1697,7 +1705,7 @@ retry: + + bch2_btree_node_write(c, n, SIX_LOCK_intent); + +- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); + + bch2_btree_update_get_open_buckets(as, n); + +@@ -1750,9 +1758,11 @@ err: + /** + * bch_btree_node_rewrite - Rewrite/move a btree node + */ +-int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ++int bch2_btree_node_rewrite(struct btree_trans *trans, ++ struct btree_iter *iter, + __le64 seq, unsigned flags) + { ++ struct bch_fs *c = trans->c; + struct btree *b, *n, *parent; + struct btree_update *as; + int ret; +@@ -1795,7 +1805,8 @@ retry: + + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); +- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ bch2_btree_insert_node(as, trans, iter, parent, ++ &as->parent_keys, flags); + } else { + bch2_btree_set_root(as, n, iter); + } +@@ -1834,7 +1845,7 @@ void async_btree_node_rewrite_work(struct work_struct *work) + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos, + BTREE_MAX_DEPTH, a->level, 0); +- bch2_btree_node_rewrite(c, iter, a->seq, 0); ++ bch2_btree_node_rewrite(&trans, iter, a->seq, 0); + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); +@@ -1867,12 
+1878,13 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) + queue_work(c->btree_interior_update_worker, &a->work); + } + +-static void __bch2_btree_node_update_key(struct bch_fs *c, +- struct btree_update *as, ++static void __bch2_btree_node_update_key(struct btree_update *as, ++ struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, struct btree *new_hash, + struct bkey_i *new_key) + { ++ struct bch_fs *c = as->c; + struct btree *parent; + int ret; + +@@ -1889,7 +1901,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + } + + bch2_keylist_add(&as->parent_keys, new_key); +- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); ++ bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, 0); + + if (new_hash) { + mutex_lock(&c->btree_cache.lock); +@@ -1926,10 +1938,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, + bch2_btree_update_done(as); + } + +-int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, ++int bch2_btree_node_update_key(struct btree_trans *trans, ++ struct btree_iter *iter, + struct btree *b, + struct bkey_i *new_key) + { ++ struct bch_fs *c = trans->c; + struct btree *parent = btree_node_parent(iter, b); + struct btree_update *as = NULL; + struct btree *new_hash = NULL; +@@ -1962,7 +1976,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, + goto err; + } + +- __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); ++ __bch2_btree_node_update_key(as, trans, iter, b, new_hash, new_key); + + bch2_btree_iter_downgrade(iter); + err: +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 7ed67b47e1b9..e88e737ee813 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -131,15 +131,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *, + struct btree *); + void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); + +-void bch2_btree_insert_node(struct btree_update *, struct btree *, +- struct btree_iter *, struct keylist *, +- unsigned); +-int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); ++int bch2_btree_split_leaf(struct btree_trans *, struct btree_iter *, unsigned); + +-int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, ++int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_iter *, + unsigned, unsigned, enum btree_node_sibling); + +-static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, ++static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, + struct btree_iter *iter, + unsigned level, unsigned flags, + enum btree_node_sibling sib) +@@ -153,20 +150,20 @@ static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, + return 0; + + b = iter->l[level].b; +- if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) ++ if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) + return 0; + +- return __bch2_foreground_maybe_merge(c, iter, level, flags, sib); ++ return __bch2_foreground_maybe_merge(trans, iter, level, flags, sib); + } + +-static inline int bch2_foreground_maybe_merge(struct bch_fs *c, +- struct btree_iter *iter, +- unsigned level, +- unsigned flags) ++static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned level, ++ unsigned flags) + { +- return bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ return 
bch2_foreground_maybe_merge_sibling(trans, iter, level, flags, + btree_prev_sib) ?: +- bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ bch2_foreground_maybe_merge_sibling(trans, iter, level, flags, + btree_next_sib); + } + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 90ae7b380753..0843e2c395aa 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -36,9 +36,12 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, + iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; + } + +-inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, +- struct btree_iter *iter) ++inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b) + { ++ struct bch_fs *c = trans->c; ++ + bch2_btree_node_lock_write(b, iter); + + if (btree_iter_type(iter) == BTREE_ITER_CACHED) +@@ -53,7 +56,7 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, + * a new bset to insert into: + */ + if (want_new_bset(c, b)) +- bch2_btree_init_next(c, b, iter); ++ bch2_btree_init_next(trans, iter, b); + } + + /* Inserting into a given leaf node (last stage of insert): */ +@@ -518,7 +521,7 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree + } + + return u64s_delta <= 0 +- ? (bch2_foreground_maybe_merge(trans->c, iter, iter->level, ++ ? (bch2_foreground_maybe_merge(trans, iter, iter->level, + trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR) + : 0; + } +@@ -608,8 +611,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) +- bch2_btree_node_lock_for_insert(c, +- iter_l(i->iter)->b, i->iter); ++ bch2_btree_node_lock_for_insert(trans, i->iter, ++ iter_l(i->iter)->b); + + ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + +@@ -662,7 +665,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + + switch (ret) { + case BTREE_INSERT_BTREE_NODE_FULL: +- ret = bch2_btree_split_leaf(c, i->iter, flags); ++ ret = bch2_btree_split_leaf(trans, i->iter, flags); + + /* + * if the split succeeded without dropping locks the insert will +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 91a9f584dd6d..aacd6385db1f 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -139,7 +139,7 @@ retry: + break; + } + +- ret = bch2_btree_node_update_key(c, iter, b, k.k); ++ ret = bch2_btree_node_update_key(&trans, iter, b, k.k); + if (ret == -EINTR) { + b = bch2_btree_iter_peek_node(iter); + ret = 0; +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index c15e3145348a..80a54e17760f 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -786,7 +786,7 @@ static int bch2_move_btree(struct bch_fs *c, + BUG(); + } + +- ret = bch2_btree_node_rewrite(c, iter, ++ ret = bch2_btree_node_rewrite(&trans, iter, + b->data->keys.seq, 0) ?: ret; + next: + bch2_trans_cond_resched(&trans); +-- +cgit v1.2.3 + + +From 54915351a7161d304cb36058f22c35bb6f6b6844 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 31 Mar 2021 14:27:05 -0400 +Subject: bcachefs: Assert that btree node locks aren't being leaked + +This asserts (when lockdep is enabled) that btree locks aren't held when +exiting a btree_trans. 
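Reduced to its essentials, the mechanism is: give every btree node six lock the same lockdep class, then assert against that class when the transaction drops its locks. A condensed sketch of the three fragments added below:

    /* one shared lockdep class key for all btree node locks */
    struct lock_class_key bch2_btree_node_lock_key;

    /* every node's lock is initialized against that class */
    __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);

    /* so "no btree node lock is held" becomes a single check, done here in
     * bch2_trans_unlock() */
    BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));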
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 4 +++- + fs/bcachefs/btree_cache.h | 2 ++ + fs/bcachefs/btree_iter.c | 2 ++ + 3 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 94448d40c824..e2fa9cc93d0b 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -13,6 +13,8 @@ + #include + #include + ++struct lock_class_key bch2_btree_node_lock_key; ++ + void bch2_recalc_btree_reserve(struct bch_fs *c) + { + unsigned i, reserve = 16; +@@ -100,7 +102,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c) + return NULL; + + bkey_btree_ptr_init(&b->key); +- six_lock_init(&b->c.lock); ++ __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key); + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->write_blocked); + b->byte_order = ilog2(btree_bytes(c)); +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 40dd263a7caa..fd5026c9f9c3 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -5,6 +5,8 @@ + #include "bcachefs.h" + #include "btree_types.h" + ++extern struct lock_class_key bch2_btree_node_lock_key; ++ + struct btree_iter; + + void bch2_recalc_btree_reserve(struct bch_fs *); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5a810fdaeec1..f9f8f5588901 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -494,6 +494,8 @@ void bch2_trans_unlock(struct btree_trans *trans) + + trans_for_each_iter(trans, iter) + __bch2_btree_iter_unlock(iter); ++ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + } + + /* Btree iterator: */ +-- +cgit v1.2.3 + + +From 5bf747ed9128670bcf58ba1fc1ae133ab217bb28 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 10 Jul 2021 23:03:15 -0400 +Subject: bcachefs: Really don't hold btree locks while btree IOs are in flight + +This is something we've attempted to stick to for quite some time, as it +helps guarantee filesystem latency - but there's a few remaining paths +that this patch fixes. + +We also add asserts that we're not holding btree locks when waiting on +btree reads or writes. + +This is also necessary for an upcoming patch to update btree pointers +after every btree write - since the btree write completion path will now +be doing btree operations. 
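The idiom these hunks converge on: record the six lock's sequence number, drop all btree locks, wait on the IO, then try to relock and bail out if anything was taken in the meantime. A condensed sketch assembled from the bch2_btree_node_fill()/lock_node paths below, not a verbatim excerpt:

    u32 seq = b->c.lock.state.seq;      /* remember the lock's sequence number */

    six_unlock_type(&b->c.lock, lock_type);
    bch2_trans_unlock(iter->trans);     /* no btree locks held across the wait */

    bch2_btree_node_wait_on_read(b);    /* asserts no btree node lock is held */

    if (!bch2_trans_relock(iter->trans) ||
        !six_relock_type(&b->c.lock, lock_type, seq))
            return ERR_PTR(-EINTR);     /* caller restarts the transaction */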
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 95 +++++++++++++++++++++++-------------- + fs/bcachefs/btree_io.c | 57 ++++++++++++++++++++-- + fs/bcachefs/btree_io.h | 26 +++------- + fs/bcachefs/btree_update_interior.c | 2 +- + fs/bcachefs/debug.c | 4 +- + 5 files changed, 122 insertions(+), 62 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index e2fa9cc93d0b..45bccf1060a4 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -188,6 +188,17 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) + int ret = 0; + + lockdep_assert_held(&bc->lock); ++wait_on_io: ++ if (b->flags & ((1U << BTREE_NODE_dirty)| ++ (1U << BTREE_NODE_read_in_flight)| ++ (1U << BTREE_NODE_write_in_flight))) { ++ if (!flush) ++ return -ENOMEM; ++ ++ /* XXX: waiting on IO with btree cache lock held */ ++ bch2_btree_node_wait_on_read(b); ++ bch2_btree_node_wait_on_write(b); ++ } + + if (!six_trylock_intent(&b->c.lock)) + return -ENOMEM; +@@ -195,25 +206,26 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) + if (!six_trylock_write(&b->c.lock)) + goto out_unlock_intent; + ++ /* recheck under lock */ ++ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| ++ (1U << BTREE_NODE_write_in_flight))) { ++ if (!flush) ++ goto out_unlock; ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ goto wait_on_io; ++ } ++ + if (btree_node_noevict(b)) + goto out_unlock; + + if (!btree_node_may_write(b)) + goto out_unlock; + +- if (btree_node_dirty(b) && +- test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) +- goto out_unlock; +- +- if (btree_node_dirty(b) || +- btree_node_write_in_flight(b) || +- btree_node_read_in_flight(b)) { +- if (!flush) ++ if (btree_node_dirty(b)) { ++ if (!flush || ++ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + goto out_unlock; +- +- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, +- TASK_UNINTERRUPTIBLE); +- + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted +@@ -225,8 +237,9 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) + else + __bch2_btree_node_write(c, b); + +- /* wait for any in flight btree write */ +- btree_node_wait_on_io(b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ goto wait_on_io; + } + out: + if (b->hash_val && !ret) +@@ -582,6 +595,7 @@ got_node: + } + + BUG_ON(btree_node_hashed(b)); ++ BUG_ON(btree_node_dirty(b)); + BUG_ON(btree_node_write_in_flight(b)); + out: + b->flags = 0; +@@ -635,6 +649,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + { + struct btree_cache *bc = &c->btree_cache; + struct btree *b; ++ u32 seq; + + BUG_ON(level + 1 >= BTREE_MAX_DEPTH); + /* +@@ -664,31 +679,31 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + return NULL; + } + ++ set_btree_node_read_in_flight(b); ++ ++ six_unlock_write(&b->c.lock); ++ seq = b->c.lock.state.seq; ++ six_unlock_intent(&b->c.lock); ++ + /* Unlock before doing IO: */ + if (iter && sync) + bch2_trans_unlock(iter->trans); + + bch2_btree_node_read(c, b, sync); + +- six_unlock_write(&b->c.lock); +- +- if (!sync) { +- six_unlock_intent(&b->c.lock); ++ if (!sync) + return NULL; +- } + + /* + * XXX: this will probably always fail because btree_iter_relock() + * currently fails for iterators that aren't pointed at a valid btree + * node + */ +- if (iter && !bch2_trans_relock(iter->trans)) { +- 
six_unlock_intent(&b->c.lock); ++ if (iter && !bch2_trans_relock(iter->trans)) + return ERR_PTR(-EINTR); +- } + +- if (lock_type == SIX_LOCK_read) +- six_lock_downgrade(&b->c.lock); ++ if (!six_relock_type(&b->c.lock, lock_type, seq)) ++ return ERR_PTR(-EINTR); + + return b; + } +@@ -832,11 +847,12 @@ lock_node: + } + + if (unlikely(btree_node_read_in_flight(b))) { ++ u32 seq = b->c.lock.state.seq; ++ + six_unlock_type(&b->c.lock, lock_type); + bch2_trans_unlock(iter->trans); + +- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, +- TASK_UNINTERRUPTIBLE); ++ bch2_btree_node_wait_on_read(b); + + /* + * XXX: check if this always fails - btree_iter_relock() +@@ -845,7 +861,9 @@ lock_node: + */ + if (iter && !bch2_trans_relock(iter->trans)) + return ERR_PTR(-EINTR); +- goto retry; ++ ++ if (!six_relock_type(&b->c.lock, lock_type, seq)) ++ goto retry; + } + + prefetch(b->aux_data); +@@ -924,8 +942,7 @@ lock_node: + } + + /* XXX: waiting on IO with btree locks held: */ +- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, +- TASK_UNINTERRUPTIBLE); ++ __bch2_btree_node_wait_on_read(b); + + prefetch(b->aux_data); + +@@ -980,16 +997,24 @@ void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) + b = btree_cache_find(bc, k); + if (!b) + return; ++wait_on_io: ++ /* not allowed to wait on io with btree locks held: */ ++ ++ /* XXX we're called from btree_gc which will be holding other btree ++ * nodes locked ++ * */ ++ __bch2_btree_node_wait_on_read(b); ++ __bch2_btree_node_wait_on_write(b); + + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); + +- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, +- TASK_UNINTERRUPTIBLE); +- __bch2_btree_node_write(c, b); +- +- /* wait for any in flight btree write */ +- btree_node_wait_on_io(b); ++ if (btree_node_dirty(b)) { ++ __bch2_btree_node_write(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ goto wait_on_io; ++ } + + BUG_ON(btree_node_dirty(b)); + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 98fcd6a9f97a..12894f8959bf 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -22,6 +22,50 @@ + #include + #include + ++void bch2_btree_node_io_unlock(struct btree *b) ++{ ++ EBUG_ON(!btree_node_write_in_flight(b)); ++ ++ clear_btree_node_write_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} ++ ++void bch2_btree_node_io_lock(struct btree *b) ++{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void __bch2_btree_node_wait_on_read(struct btree *b) ++{ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void __bch2_btree_node_wait_on_write(struct btree *b) ++{ ++ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void bch2_btree_node_wait_on_read(struct btree *b) ++{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void bch2_btree_node_wait_on_write(struct btree *b) ++{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ + static void verify_no_dups(struct btree *b, + struct bkey_packed *start, + struct bkey_packed *end) +@@ -432,7 +476,8 @@ void bch2_btree_init_next(struct btree_trans *trans, + EBUG_ON(iter && iter->l[b->c.level].b != b); + 
BUG_ON(bset_written(b, bset(b, &b->set[1]))); + +- if (b->nsets == MAX_BSETS) { ++ if (b->nsets == MAX_BSETS && ++ !btree_node_write_in_flight(b)) { + unsigned log_u64s[] = { + ilog2(bset_u64s(&b->set[0])), + ilog2(bset_u64s(&b->set[1])), +@@ -1401,8 +1446,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + btree_pos_to_text(&PBUF(buf), c, b); + trace_btree_read(c, b); + +- set_btree_node_read_in_flight(b); +- + if (bch2_verify_all_btree_replicas && + !btree_node_read_all_replicas(c, b, sync)) + return; +@@ -1478,6 +1521,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + bkey_copy(&b->key, k); + BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); + ++ set_btree_node_read_in_flight(b); ++ + bch2_btree_node_read(c, b, true); + + if (btree_node_read_error(b)) { +@@ -1523,7 +1568,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) + struct btree_write *w = btree_prev_write(b); + + bch2_btree_complete_write(c, b, w); +- btree_node_io_unlock(b); ++ bch2_btree_node_io_unlock(b); + } + + static void bch2_btree_node_write_error(struct bch_fs *c, +@@ -1705,6 +1750,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + bool validate_before_checksum = false; + void *data; + ++ BUG_ON(btree_node_write_in_flight(b)); ++ + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + return; + +@@ -1732,7 +1779,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + * XXX waiting on btree writes with btree locks held - + * this can deadlock, and we hit the write error path + */ +- btree_node_wait_on_io(b); ++ bch2_btree_node_wait_on_write(b); + continue; + } + +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index fae67622c127..89fd4aba5218 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -52,24 +52,12 @@ struct btree_write_bio { + struct bch_write_bio wbio; + }; + +-static inline void btree_node_io_unlock(struct btree *b) +-{ +- EBUG_ON(!btree_node_write_in_flight(b)); +- clear_btree_node_write_in_flight(b); +- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +-} +- +-static inline void btree_node_io_lock(struct btree *b) +-{ +- wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, +- TASK_UNINTERRUPTIBLE); +-} +- +-static inline void btree_node_wait_on_io(struct btree *b) +-{ +- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, +- TASK_UNINTERRUPTIBLE); +-} ++void bch2_btree_node_io_unlock(struct btree *); ++void bch2_btree_node_io_lock(struct btree *); ++void __bch2_btree_node_wait_on_read(struct btree *); ++void __bch2_btree_node_wait_on_write(struct btree *); ++void bch2_btree_node_wait_on_read(struct btree *); ++void bch2_btree_node_wait_on_write(struct btree *); + + static inline bool btree_node_may_write(struct btree *b) + { +@@ -169,7 +157,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + } + + six_unlock_type(&b->c.lock, lock_held); +- btree_node_wait_on_io(b); ++ bch2_btree_node_wait_on_write(b); + btree_node_lock_type(c, b, lock_held); + } + } +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 191e5d306700..6b55a4108425 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -567,7 +567,7 @@ static void btree_update_nodes_written(struct btree_update *as) + six_unlock_read(&old->c.lock); + + if (seq == as->old_nodes_seq[i]) +- btree_node_wait_on_io(old); ++ bch2_btree_node_wait_on_write(old); + } + + /* +diff --git a/fs/bcachefs/debug.c 
b/fs/bcachefs/debug.c +index 92e970bc1332..b0a8eb58a7a7 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -133,7 +133,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + if (c->opts.nochanges) + return; + +- btree_node_io_lock(b); ++ bch2_btree_node_io_lock(b); + mutex_lock(&c->verify_lock); + + if (!c->verify_ondisk) { +@@ -176,7 +176,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + } + out: + mutex_unlock(&c->verify_lock); +- btree_node_io_unlock(b); ++ bch2_btree_node_io_unlock(b); + } + + #ifdef CONFIG_DEBUG_FS +-- +cgit v1.2.3 + + +From ff7f4c553adbea2b2a565674fdd45a435a70f43e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 11 Jul 2021 13:54:07 -0400 +Subject: bcachefs: Mask out unknown compat features when going read-write + +Compat features should be cleared if the filesystem was touched by a +version that doesn't support them. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index c771b92d9496..3903b730bba3 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -983,6 +983,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); ++ c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +-- +cgit v1.2.3 + + +From 9079263568c3d8eb0a10033ae4b43cbe21e8584b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 11 Jul 2021 16:41:14 -0400 +Subject: bcachefs: Kick off btree node writes from write completions + +This is a performance improvement by removing the need to wait for the +in flight btree write to complete before kicking one off, which is going +to be needed to avoid a performance regression with the upcoming patch +to update btree ptrs after every btree write. 
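The handoff is a lock-free transition on b->flags in the write completion: if another write was requested while this one was in flight, the completion keeps write_in_flight set and starts the next write itself rather than waking a waiter. Roughly, condensed from btree_node_write_done() below:

    v = READ_ONCE(b->flags);
    do {
            old = new = v;

            if (old & (1U << BTREE_NODE_need_write))
                    goto do_write;              /* start the next write from here */

            new &= ~(1U << BTREE_NODE_write_in_flight);
    } while ((v = cmpxchg(&b->flags, old, new)) != old);

    wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
    return;
    do_write:
            /* take a read lock on the node, move dirty/need_write into
             * write_in_flight, and call __bch2_btree_node_write(c, b, true) */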
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 4 ++-- + fs/bcachefs/btree_io.c | 61 +++++++++++++++++++++++++++++++++++++---------- + fs/bcachefs/btree_io.h | 19 +++++---------- + 3 files changed, 56 insertions(+), 28 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 45bccf1060a4..19909ce3bc18 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -235,7 +235,7 @@ wait_on_io: + if (bch2_verify_btree_ondisk) + bch2_btree_node_write(c, b, SIX_LOCK_intent); + else +- __bch2_btree_node_write(c, b); ++ __bch2_btree_node_write(c, b, false); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +@@ -1010,7 +1010,7 @@ wait_on_io: + six_lock_write(&b->c.lock, NULL, NULL); + + if (btree_node_dirty(b)) { +- __bch2_btree_node_write(c, b); ++ __bch2_btree_node_write(c, b, false); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 12894f8959bf..957a6a9a1559 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1566,9 +1566,47 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + static void btree_node_write_done(struct bch_fs *c, struct btree *b) + { + struct btree_write *w = btree_prev_write(b); ++ unsigned long old, new, v; + + bch2_btree_complete_write(c, b, w); +- bch2_btree_node_io_unlock(b); ++ ++ v = READ_ONCE(b->flags); ++ do { ++ old = new = v; ++ ++ if (old & (1U << BTREE_NODE_need_write)) ++ goto do_write; ++ ++ new &= ~(1U << BTREE_NODE_write_in_flight); ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++ return; ++ ++do_write: ++ six_lock_read(&b->c.lock, NULL, NULL); ++ v = READ_ONCE(b->flags); ++ do { ++ old = new = v; ++ ++ if ((old & (1U << BTREE_NODE_dirty)) && ++ (old & (1U << BTREE_NODE_need_write)) && ++ !(old & (1U << BTREE_NODE_never_write)) && ++ btree_node_may_write(b)) { ++ new &= ~(1U << BTREE_NODE_dirty); ++ new &= ~(1U << BTREE_NODE_need_write); ++ new |= (1U << BTREE_NODE_write_in_flight); ++ new |= (1U << BTREE_NODE_just_written); ++ new ^= (1U << BTREE_NODE_write_idx); ++ } else { ++ new &= ~(1U << BTREE_NODE_write_in_flight); ++ } ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ if (new & (1U << BTREE_NODE_write_in_flight)) ++ __bch2_btree_node_write(c, b, true); ++ ++ six_unlock_read(&b->c.lock); + } + + static void bch2_btree_node_write_error(struct bch_fs *c, +@@ -1733,7 +1771,7 @@ static void btree_write_submit(struct work_struct *work) + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key); + } + +-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started) + { + struct btree_write_bio *wbio; + struct bset_tree *t; +@@ -1750,7 +1788,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + bool validate_before_checksum = false; + void *data; + +- BUG_ON(btree_node_write_in_flight(b)); ++ if (already_started) ++ goto do_write; + + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + return; +@@ -1774,14 +1813,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + if (old & (1 << BTREE_NODE_never_write)) + return; + +- if (old & (1 << BTREE_NODE_write_in_flight)) { +- /* +- * XXX waiting on btree writes with btree locks held - +- * this can deadlock, and we hit the write error path +- */ +- 
bch2_btree_node_wait_on_write(b); +- continue; +- } ++ BUG_ON(old & (1 << BTREE_NODE_write_in_flight)); + + new &= ~(1 << BTREE_NODE_dirty); + new &= ~(1 << BTREE_NODE_need_write); +@@ -1790,6 +1822,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) + new ^= (1 << BTREE_NODE_write_idx); + } while (cmpxchg_acquire(&b->flags, old, new) != old); + ++ if (new & (1U << BTREE_NODE_need_write)) ++ return; ++do_write: + atomic_dec(&c->btree_cache.dirty); + + BUG_ON(btree_node_fake(b)); +@@ -2044,7 +2079,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, + if (lock_type_held == SIX_LOCK_intent || + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { +- __bch2_btree_node_write(c, b); ++ __bch2_btree_node_write(c, b, false); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && +@@ -2056,7 +2091,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + } else { +- __bch2_btree_node_write(c, b); ++ __bch2_btree_node_write(c, b, false); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 89fd4aba5218..3732d135de8d 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -139,7 +139,7 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *, + struct btree_write *); + void bch2_btree_write_error_work(struct work_struct *); + +-void __bch2_btree_node_write(struct bch_fs *, struct btree *); ++void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); + bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + + void bch2_btree_node_write(struct bch_fs *, struct btree *, +@@ -148,18 +148,11 @@ void bch2_btree_node_write(struct bch_fs *, struct btree *, + static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_held) + { +- while (b->written && +- btree_node_need_write(b) && +- btree_node_may_write(b)) { +- if (!btree_node_write_in_flight(b)) { +- bch2_btree_node_write(c, b, lock_held); +- break; +- } +- +- six_unlock_type(&b->c.lock, lock_held); +- bch2_btree_node_wait_on_write(b); +- btree_node_lock_type(c, b, lock_held); +- } ++ if (b->written && ++ btree_node_need_write(b) && ++ btree_node_may_write(b) && ++ !btree_node_write_in_flight(b)) ++ bch2_btree_node_write(c, b, lock_held); + } + + #define bch2_btree_node_write_cond(_c, _b, cond) \ +-- +cgit v1.2.3 + + +From df8e90b12f0d376d85060b74f2a393cf0706a6ef Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 12 Jul 2021 23:17:15 -0400 +Subject: bcachefs: Ensure bad d_type doesn't oops in bch2_dirent_to_text() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 5 ++++- + fs/bcachefs/opts.c | 2 +- + 2 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index d5883ab7de21..a95165b8eddf 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -112,7 +112,10 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + + bch_scnmemcpy(out, d.v->d_name, + bch2_dirent_name_bytes(d)); +- pr_buf(out, " -> %llu type %s", d.v->d_inum, bch2_d_types[d.v->d_type]); ++ pr_buf(out, " -> %llu type %s", d.v->d_inum, ++ d.v->d_type < DT_MAX ++ ? 
bch2_d_types[d.v->d_type] ++ : "(bad d_type)"); + } + + static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index fd3f7cddb9ab..5de296078219 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -63,7 +63,7 @@ const char * const bch2_member_states[] = { + + #undef x + +-const char * const bch2_d_types[] = { ++const char * const bch2_d_types[DT_MAX] = { + [DT_UNKNOWN] = "unknown", + [DT_FIFO] = "fifo", + [DT_CHR] = "chr", +-- +cgit v1.2.3 + + +From b5788c3fe8a6226a8c4a36f2e3e7bc449ad9aa3e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 12 Jul 2021 23:52:49 -0400 +Subject: bcachefs: Add open_buckets to sysfs + +This is to help debug a rare shutdown deadlock in the allocator code - +the btree code is leaking open_buckets. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 19 +++++++++++++++++++ + fs/bcachefs/alloc_background.h | 2 ++ + fs/bcachefs/sysfs.c | 7 +++++++ + 3 files changed, 28 insertions(+) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 07823a168b4f..083e51465bee 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1232,3 +1232,22 @@ void bch2_fs_allocator_background_init(struct bch_fs *c) + { + spin_lock_init(&c->freelist_lock); + } ++ ++void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list) { ++ pr_buf(out, "%zu ref %u type %s\n", ++ ob - c->open_buckets, ++ atomic_read(&ob->pin), ++ bch2_data_types[ob->type]); ++ } ++ spin_unlock(&ob->lock); ++ } ++ ++} +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 9cadfdb5b83d..a4f6bf56b18f 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -132,4 +132,6 @@ int bch2_dev_allocator_start(struct bch_dev *); + int bch2_alloc_write(struct bch_fs *, unsigned); + void bch2_fs_allocator_background_init(struct bch_fs *); + ++void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); ++ + #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 84a7acb04d01..9b1ffbf96e14 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -171,6 +171,7 @@ read_attribute(btree_cache); + read_attribute(btree_key_cache); + read_attribute(btree_transactions); + read_attribute(stripes_heap); ++read_attribute(open_buckets); + + read_attribute(internal_uuid); + +@@ -409,6 +410,11 @@ SHOW(bch2_fs) + return out.pos - buf; + } + ++ if (attr == &sysfs_open_buckets) { ++ bch2_open_buckets_to_text(&out, c); ++ return out.pos - buf; ++ } ++ + if (attr == &sysfs_compression_stats) { + bch2_compression_stats_to_text(&out, c); + return out.pos - buf; +@@ -567,6 +573,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_btree_key_cache, + &sysfs_btree_transactions, + &sysfs_stripes_heap, ++ &sysfs_open_buckets, + + &sysfs_read_realloc_races, + &sysfs_extent_migrate_done, +-- +cgit v1.2.3 + + +From ee1b4864da62a43a3aefb2a373d1e0663e79c511 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 13 Jul 2021 16:03:51 -0400 +Subject: bcachefs: Add safe versions of varint encode/decode + +This adds safe versions of bch2_varint_(encode|decode) that don't read +or write past the end of the buffer, or varint being encoded. 
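A usage sketch based on the signatures and kernel-doc added below (the buffer size here is just the documented worst case):

    u8 buf[9];          /* any single varint encodes to at most 9 bytes */
    u64 out;
    int n, m;

    /* safe versions: bounded by the buffer handed to them */
    n = bch2_varint_encode(buf, 1234);          /* returns bytes written */
    m = bch2_varint_decode(buf, buf + n, &out); /* returns bytes read, or -1 if the
                                                 * varint would run past 'end' */

    /* fast versions: same encoding, but encode may store a full 8 bytes even for
     * a small value, and decode may read up to 8 bytes past 'end' - only usable
     * where the caller guarantees that slack, as the alloc/inode packing code
     * switched over below does */
    n = bch2_varint_encode_fast(buf, 1234);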
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 4 +-- + fs/bcachefs/inode.c | 6 ++-- + fs/bcachefs/varint.c | 73 +++++++++++++++++++++++++++++++++++++++++- + fs/bcachefs/varint.h | 3 ++ + 4 files changed, 80 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 083e51465bee..82e6ee8117b5 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -130,7 +130,7 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, + + #define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ +- ret = bch2_varint_decode(in, end, &v); \ ++ ret = bch2_varint_decode_fast(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ +@@ -166,7 +166,7 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, + nr_fields++; \ + \ + if (src._name) { \ +- out += bch2_varint_encode(out, src._name); \ ++ out += bch2_varint_encode_fast(out, src._name); \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 59edb4cea5f1..46f32f978dc9 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -137,7 +137,7 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, + nr_fields++; \ + \ + if (inode->_name) { \ +- ret = bch2_varint_encode(out, inode->_name); \ ++ ret = bch2_varint_encode_fast(out, inode->_name); \ + out += ret; \ + \ + if (_bits > 64) \ +@@ -246,13 +246,13 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, + + #define x(_name, _bits) \ + if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ +- ret = bch2_varint_decode(in, end, &v[0]); \ ++ ret = bch2_varint_decode_fast(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ +- ret = bch2_varint_decode(in, end, &v[1]); \ ++ ret = bch2_varint_decode_fast(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ +diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c +index a3d252c741c8..e6a041541792 100644 +--- a/fs/bcachefs/varint.c ++++ b/fs/bcachefs/varint.c +@@ -1,10 +1,18 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include ++#include + #include + + #include "varint.h" + ++/** ++ * bch2_varint_encode - encode a variable length integer ++ * @out - destination to encode to ++ * @v - unsigned integer to encode ++ * ++ * Returns the size in bytes of the encoded integer - at most 9 bytes ++ */ + int bch2_varint_encode(u8 *out, u64 v) + { + unsigned bits = fls64(v|1); +@@ -13,16 +21,79 @@ int bch2_varint_encode(u8 *out, u64 v) + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); ++ v = cpu_to_le64(v); ++ memcpy(out, &v, bytes); + } else { + *out++ = 255; + bytes = 9; ++ put_unaligned_le64(v, out); + } + +- put_unaligned_le64(v, out); + return bytes; + } + ++/** ++ * bch2_varint_decode - encode a variable length integer ++ * @in - varint to decode ++ * @end - end of buffer to decode from ++ * @out - on success, decoded integer ++ * ++ * Returns the size in bytes of the decoded integer - or -1 on failure (would ++ * have read past the end of the buffer) ++ */ + int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) ++{ ++ unsigned bytes = likely(in < end) ++ ? 
ffz(*in & 255) + 1 ++ : 1; ++ u64 v; ++ ++ if (unlikely(in + bytes > end)) ++ return -1; ++ ++ if (likely(bytes < 9)) { ++ v = 0; ++ memcpy(&v, in, bytes); ++ v = le64_to_cpu(v); ++ v >>= bytes; ++ } else { ++ v = get_unaligned_le64(++in); ++ } ++ ++ *out = v; ++ return bytes; ++} ++ ++/** ++ * bch2_varint_encode_fast - fast version of bch2_varint_encode ++ * ++ * This version assumes it's always safe to write 8 bytes to @out, even if the ++ * encoded integer would be smaller. ++ */ ++int bch2_varint_encode_fast(u8 *out, u64 v) ++{ ++ unsigned bits = fls64(v|1); ++ unsigned bytes = DIV_ROUND_UP(bits, 7); ++ ++ if (likely(bytes < 9)) { ++ v <<= bytes; ++ v |= ~(~0 << (bytes - 1)); ++ } else { ++ *out++ = 255; ++ bytes = 9; ++ } ++ ++ put_unaligned_le64(v, out); ++ return bytes; ++} ++ ++/** ++ * bch2_varint_decode_fast - fast version of bch2_varint_decode ++ * ++ * This version assumes that it is safe to read at most 8 bytes past the end of ++ * @end (we still return an error if the varint extends past @end). ++ */ ++int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) + { + u64 v = get_unaligned_le64(in); + unsigned bytes = ffz(v & 255) + 1; +diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h +index 8daf813576b7..92a182fb3d7a 100644 +--- a/fs/bcachefs/varint.h ++++ b/fs/bcachefs/varint.h +@@ -5,4 +5,7 @@ + int bch2_varint_encode(u8 *, u64); + int bch2_varint_decode(const u8 *, const u8 *, u64 *); + ++int bch2_varint_encode_fast(u8 *, u64); ++int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); ++ + #endif /* _BCACHEFS_VARINT_H */ +-- +cgit v1.2.3 + + +From 78a4e637126c030fc29076646bf9008a05dfd98a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 13 Jul 2021 16:12:00 -0400 +Subject: bcachefs: Fix an allocator shutdown deadlock + +On fstest generic/388, we were seeing sporadic deadlocks in the +emergency shutdown, where we'd get stuck shutting down the allocator +because bch2_btree_update_start() -> bch2_btree_reserve_get() allocated +and then deallocated some btree nodes, putting them back on the +btree_reserve_cache, after the allocator shutdown code had already +cleared out that cache. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 27 ++++++++++++++++----------- + 1 file changed, 16 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 6b55a4108425..0b4e4056e1d9 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -948,13 +948,6 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level, + + closure_init_stack(&cl); + retry: +- /* +- * This check isn't necessary for correctness - it's just to potentially +- * prevent us from doing a lot of work that'll end up being wasted: +- */ +- ret = bch2_journal_error(&c->journal); +- if (ret) +- return ERR_PTR(ret); + + /* + * XXX: figure out how far we might need to split, +@@ -995,6 +988,22 @@ retry: + bch2_keylist_init(&as->new_keys, as->_new_keys); + bch2_keylist_init(&as->parent_keys, as->inline_keys); + ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->list, &c->btree_interior_update_list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * We don't want to allocate if we're in an error state, that can cause ++ * deadlock on emergency shutdown due to open buckets getting stuck in ++ * the btree_reserve_cache after allocator shutdown has cleared it out. 
++ * This check needs to come after adding us to the btree_interior_update ++ * list but before calling bch2_btree_reserve_get, to synchronize with ++ * __bch2_fs_read_only(). ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, + journal_flags|JOURNAL_RES_GET_NONBLOCK); +@@ -1046,10 +1055,6 @@ retry: + atomic64_read(&c->journal.seq), + &as->journal, NULL); + +- mutex_lock(&c->btree_interior_update_lock); +- list_add_tail(&as->list, &c->btree_interior_update_list); +- mutex_unlock(&c->btree_interior_update_lock); +- + return as; + err: + bch2_btree_update_free(as); +-- +cgit v1.2.3 + + +From 2659cd0a69be5aceafa17b53d473305b66bc814d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Jun 2021 17:07:18 -0400 +Subject: bcachefs: Add an option for whether inodes use the key cache + +We probably don't ever want to flip this off in production, but it may +be useful for certain kinds of testing. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 1 + + fs/bcachefs/inode.c | 17 ++++++++++------- + fs/bcachefs/opts.h | 5 +++++ + 3 files changed, 16 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 94273d5161f2..8a89ab0d8e85 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1346,6 +1346,7 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); + LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); + LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); ++LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); + + /* + * Features: +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 46f32f978dc9..67983ff4fb2c 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -300,8 +300,10 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), +- BTREE_ITER_CACHED|flags); ++ if (trans->c->opts.inodes_use_key_cache) ++ flags |= BTREE_ITER_CACHED; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), flags); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) +@@ -577,8 +579,12 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + struct bpos end = POS(inode_nr + 1, 0); + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; ++ unsigned iter_flags = BTREE_ITER_INTENT; + int ret; + ++ if (cached && c->opts.inodes_use_key_cache) ++ iter_flags |= BTREE_ITER_CACHED; ++ + bch2_trans_init(&trans, c, 0, 1024); + + /* +@@ -600,11 +606,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + retry: + bch2_trans_begin(&trans); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), +- (cached +- ? 
BTREE_ITER_CACHED +- : BTREE_ITER_SLOTS)| +- BTREE_ITER_INTENT); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, ++ POS(0, inode_nr), iter_flags); + k = bch2_btree_iter_peek_slot(iter); + + ret = bkey_err(k); +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index c331535b0063..ed505857bc9e 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -173,6 +173,11 @@ enum opt_type { + OPT_BOOL(), \ + BCH_SB_SHARD_INUMS, false, \ + NULL, "Shard new inode numbers by CPU id") \ ++ x(inodes_use_key_cache, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_INODES_USE_KEY_CACHE, true, \ ++ NULL, "Use the btree key cache for the inodes btree") \ + x(gc_reserve_percent, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(5, 21), \ +-- +cgit v1.2.3 + + +From f01fb05f697f99cf8113245d1b608c5d8ce2e441 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Jul 2021 00:14:45 -0400 +Subject: bcachefs: Fix a memory leak in the dio write path + +There were some error paths where we were leaking page refs - oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 3fca1f0c6597..f29ffe420a33 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1878,8 +1878,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) + * bio_iov_iter_get_pages was only able to get < + * blocksize worth of pages: + */ +- bio_for_each_segment_all(bv, bio, iter) +- put_page(bv->bv_page); + ret = -EFAULT; + goto err; + } +@@ -1947,6 +1945,7 @@ loop: + if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); ++ bio->bi_vcnt = 0; + + if (dio->op.error) { + set_bit(EI_INODE_ERROR, &inode->ei_flags); +@@ -1969,6 +1968,9 @@ err: + if (dio->free_iov) + kfree(dio->iter.iov); + ++ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); + bio_put(bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ +-- +cgit v1.2.3 + + +From e36027e0e3cd450a04db1fe730303aa2bc07478b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Jul 2021 15:13:27 -0400 +Subject: bcachefs: Tighten up btree_iter locking assertions + +We weren't correctly verifying that we had interior node intent locks - +this patch also fixes bugs uncovered by the new assertions. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 8 ++++++-- + fs/bcachefs/btree_iter.c | 41 +++++++++++++++++++++++++---------------- + fs/bcachefs/btree_iter.h | 2 ++ + fs/bcachefs/btree_key_cache.c | 4 +++- + 4 files changed, 36 insertions(+), 19 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 19909ce3bc18..b3e90d3f1ef2 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -699,7 +699,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + * currently fails for iterators that aren't pointed at a valid btree + * node + */ +- if (iter && !bch2_trans_relock(iter->trans)) ++ if (iter && ++ (!bch2_trans_relock(iter->trans) || ++ !bch2_btree_iter_relock(iter, _THIS_IP_))) + return ERR_PTR(-EINTR); + + if (!six_relock_type(&b->c.lock, lock_type, seq)) +@@ -859,7 +861,9 @@ lock_node: + * currently fails for iterators that aren't pointed at a valid + * btree node + */ +- if (iter && !bch2_trans_relock(iter->trans)) ++ if (iter && ++ (!bch2_trans_relock(iter->trans) || ++ !bch2_btree_iter_relock(iter, _THIS_IP_))) + return ERR_PTR(-EINTR); + + if (!six_relock_type(&b->c.lock, lock_type, seq)) +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f9f8f5588901..8a67d8568302 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -362,7 +362,7 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter) + return; + } + +- for (l = 0; is_btree_node(iter, l); l++) { ++ for (l = 0; btree_iter_node(iter, l); l++) { + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && + !btree_node_locked(iter, l)) + continue; +@@ -384,7 +384,7 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} + #endif + + __flatten +-static bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) ++bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) + { + return btree_iter_get_locks(iter, false, trace_ip); + } +@@ -610,6 +610,8 @@ err: + + static void bch2_btree_iter_verify(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; + enum btree_iter_type type = btree_iter_type(iter); + unsigned i; + +@@ -628,10 +630,16 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + +- bch2_btree_iter_verify_locks(iter); ++ for (i = 0; i < (type != BTREE_ITER_CACHED ? 
BTREE_MAX_DEPTH : 1); i++) { ++ if (!iter->l[i].b) { ++ BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i); ++ break; ++ } + +- for (i = 0; i < BTREE_MAX_DEPTH; i++) + bch2_btree_iter_verify_level(iter, i); ++ } ++ ++ bch2_btree_iter_verify_locks(iter); + } + + static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +@@ -1353,30 +1361,30 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, + static int btree_iter_traverse_one(struct btree_iter *iter, + unsigned long trace_ip) + { +- unsigned depth_want = iter->level; ++ unsigned l, depth_want = iter->level; + int ret = 0; + +- /* +- * if we need interior nodes locked, call btree_iter_relock() to make +- * sure we walk back up enough that we lock them: +- */ +- if (iter->uptodate == BTREE_ITER_NEED_RELOCK || +- iter->locks_want > 1) +- bch2_btree_iter_relock(iter, _THIS_IP_); +- + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + ret = bch2_btree_iter_traverse_cached(iter); + goto out; + } + +- if (iter->uptodate < BTREE_ITER_NEED_RELOCK) +- goto out; +- + if (unlikely(iter->level >= BTREE_MAX_DEPTH)) + goto out; + + iter->level = btree_iter_up_until_good_node(iter, 0); + ++ /* If we need intent locks, take them too: */ ++ for (l = iter->level + 1; ++ l < iter->locks_want && btree_iter_node(iter, l); ++ l++) ++ if (!bch2_btree_node_relock(iter, l)) ++ while (iter->level <= l) { ++ btree_node_unlock(iter, iter->level); ++ iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; ++ iter->level++; ++ } ++ + /* + * Note: iter->nodes[iter->level] may be temporarily NULL here - that + * would indicate to other code that we got to the end of the btree, +@@ -1397,6 +1405,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + goto out; + } + ++ __bch2_btree_iter_unlock(iter); + iter->level = depth_want; + + if (ret == -EIO) { +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 31175cf00c0a..58f15b716d49 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -111,6 +111,8 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bkey_packed *, + unsigned, unsigned); + ++bool bch2_btree_iter_relock(struct btree_iter *, unsigned long); ++ + bool bch2_trans_relock(struct btree_trans *); + void bch2_trans_unlock(struct btree_trans *); + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 0f8ff4aa76e5..6a2984e97b1f 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -270,7 +270,9 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter) + + BUG_ON(iter->level); + +- if (btree_node_locked(iter, 0)) { ++ iter->l[1].b = NULL; ++ ++ if (bch2_btree_node_relock(iter, 0)) { + ck = (void *) iter->l[0].b; + goto fill; + } +-- +cgit v1.2.3 + + +From 7daa7fa75c78db12c0433ef3fe76bbb80bd86fce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Jul 2021 20:28:27 -0400 +Subject: bcachefs: Improvements to fsck check_dirents() + +The fsck code handles transaction restarts in a very ad hoc way, and not +always correctly. This patch makes some improvements to check_dirents(), +but more work needs to be done to figure out how this kind of code +should be structured. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 345 +++++++++++++++++++++++++++-------------------------- + 1 file changed, 178 insertions(+), 167 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 7ea1a41ac637..bedfd34803ce 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -267,11 +267,11 @@ static struct inode_walker inode_walker_init(void) + }; + } + +-static int walk_inode(struct btree_trans *trans, +- struct inode_walker *w, u64 inum) ++static int __walk_inode(struct btree_trans *trans, ++ struct inode_walker *w, u64 inum) + { + if (inum != w->cur_inum) { +- int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot); ++ int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot); + + if (ret && ret != -ENOENT) + return ret; +@@ -286,6 +286,12 @@ static int walk_inode(struct btree_trans *trans, + return 0; + } + ++static int walk_inode(struct btree_trans *trans, ++ struct inode_walker *w, u64 inum) ++{ ++ return lockrestart_do(trans, __walk_inode(trans, w, inum)); ++} ++ + static int hash_redo_key(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, +@@ -704,210 +710,215 @@ fsck_err: + return bch2_trans_exit(&trans) ?: ret; + } + +-/* +- * Walk dirents: verify that they all have a corresponding S_ISDIR inode, +- * validate d_type +- */ +-noinline_for_stack +-static int check_dirents(struct bch_fs *c) ++static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ++ struct bch_hash_info *hash_info, ++ struct inode_walker *w, unsigned *nr_subdirs) + { +- struct inode_walker w = inode_walker_init(); +- struct bch_hash_info hash_info; +- struct btree_trans trans; +- struct btree_iter *iter; ++ struct bch_fs *c = trans->c; + struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ struct bch_inode_unpacked target; ++ u32 target_snapshot; ++ bool have_target; ++ bool backpointer_exists = true; ++ u64 d_inum; + char buf[200]; +- unsigned nr_subdirs = 0; +- int ret = 0; ++ int ret; + +- bch_verbose(c, "checking dirents"); ++ k = bch2_btree_iter_peek(iter); ++ if (!k.k) ++ return 1; + +- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, +- POS(BCACHEFS_ROOT_INO, 0), +- BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH); +-retry: +- while ((k = bch2_btree_iter_peek(iter)).k && +- !(ret = bkey_err(k))) { +- struct bkey_s_c_dirent d; +- struct bch_inode_unpacked target; +- u32 target_snapshot; +- bool have_target; +- bool backpointer_exists = true; +- u64 d_inum; ++ if (w->have_inode && ++ w->cur_inum != k.k->p.inode && ++ fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c, ++ "directory %llu with wrong i_nlink: got %u, should be %u", ++ w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) { ++ w->inode.bi_nlink = *nr_subdirs; ++ ret = write_inode(trans, &w->inode, w->snapshot); ++ return ret ?: -EINTR; ++ } + +- if (w.have_inode && +- w.cur_inum != k.k->p.inode && +- fsck_err_on(w.inode.bi_nlink != nr_subdirs, c, +- "directory %llu with wrong i_nlink: got %u, should be %u", +- w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) { +- w.inode.bi_nlink = nr_subdirs; +- ret = write_inode(&trans, &w.inode, w.snapshot); +- if (ret) +- break; +- } ++ ret = __walk_inode(trans, w, k.k->p.inode); ++ if (ret) ++ return ret; + +- ret = walk_inode(&trans, &w, k.k->p.inode); +- if (ret) +- break; ++ if (w->first_this_inode) ++ *nr_subdirs = 0; + +- if (w.first_this_inode) +- nr_subdirs = 0; ++ if 
(fsck_err_on(!w->have_inode, c, ++ "dirent in nonexisting directory:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) || ++ fsck_err_on(!S_ISDIR(w->inode.bi_mode), c, ++ "dirent in non directory inode type %u:\n%s", ++ mode_to_type(w->inode.bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) ++ return __bch2_trans_do(trans, NULL, NULL, 0, ++ bch2_btree_delete_at(trans, iter, 0)); + +- if (fsck_err_on(!w.have_inode, c, +- "dirent in nonexisting directory:\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, +- k), buf)) || +- fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, +- "dirent in non directory inode type %u:\n%s", +- mode_to_type(w.inode.bi_mode), +- (bch2_bkey_val_to_text(&PBUF(buf), c, +- k), buf))) { +- ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_delete_at(&trans, iter, 0)); +- if (ret) +- goto err; +- goto next; +- } ++ if (!w->have_inode) ++ return 0; + +- if (!w.have_inode) +- goto next; ++ if (w->first_this_inode) ++ *hash_info = bch2_hash_info_init(c, &w->inode); + +- if (w.first_this_inode) +- hash_info = bch2_hash_info_init(c, &w.inode); ++ ret = hash_check_key(trans, bch2_dirent_hash_desc, ++ hash_info, iter, k); ++ if (ret < 0) ++ return ret; ++ if (ret) /* dirent has been deleted */ ++ return 0; + +- ret = hash_check_key(&trans, bch2_dirent_hash_desc, +- &hash_info, iter, k); +- if (ret > 0) { +- ret = 0; +- goto next; +- } +- if (ret) +- goto fsck_err; ++ if (k.k->type != KEY_TYPE_dirent) ++ return 0; ++ ++ d = bkey_s_c_to_dirent(k); ++ d_inum = le64_to_cpu(d.v->d_inum); + +- if (k.k->type != KEY_TYPE_dirent) +- goto next; ++ ret = __lookup_inode(trans, d_inum, &target, &target_snapshot); ++ if (ret && ret != -ENOENT) ++ return ret; + +- d = bkey_s_c_to_dirent(k); +- d_inum = le64_to_cpu(d.v->d_inum); ++ have_target = !ret; ++ ret = 0; + +- ret = lookup_inode(&trans, d_inum, &target, &target_snapshot); +- if (ret && ret != -ENOENT) +- break; ++ if (fsck_err_on(!have_target, c, ++ "dirent points to missing inode:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) ++ return remove_dirent(trans, d.k->p); + +- have_target = !ret; ++ if (!have_target) ++ return 0; ++ ++ if (!target.bi_dir && ++ !target.bi_dir_offset) { ++ target.bi_dir = k.k->p.inode; ++ target.bi_dir_offset = k.k->p.offset; ++ ++ ret = __write_inode(trans, &target, target_snapshot) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOUNLOCK); ++ if (ret) ++ return ret; ++ return -EINTR; ++ } ++ ++ if (!inode_backpointer_matches(d, &target)) { ++ ret = inode_backpointer_exists(trans, &target); ++ if (ret < 0) ++ return ret; ++ ++ backpointer_exists = ret; + ret = 0; + +- if (fsck_err_on(!have_target, c, +- "dirent points to missing inode:\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, +- k), buf))) { +- ret = remove_dirent(&trans, d.k->p); +- if (ret) +- goto err; +- goto next; ++ if (fsck_err_on(S_ISDIR(target.bi_mode) && ++ backpointer_exists, c, ++ "directory %llu with multiple links", ++ target.bi_inum)) ++ return remove_dirent(trans, d.k->p); ++ ++ if (fsck_err_on(backpointer_exists && ++ !target.bi_nlink, c, ++ "inode %llu has multiple links but i_nlink 0", ++ d_inum)) { ++ target.bi_nlink++; ++ target.bi_flags &= ~BCH_INODE_UNLINKED; ++ ++ ret = write_inode(trans, &target, target_snapshot); ++ return ret ?: -EINTR; + } + +- if (!have_target) +- goto next; +- +- if (!target.bi_dir && +- !target.bi_dir_offset) { ++ if (fsck_err_on(!backpointer_exists, c, ++ "inode %llu has wrong backpointer:\n" ++ "got %llu:%llu\n" 
++ "should be %llu:%llu", ++ d_inum, ++ target.bi_dir, ++ target.bi_dir_offset, ++ k.k->p.inode, ++ k.k->p.offset)) { + target.bi_dir = k.k->p.inode; + target.bi_dir_offset = k.k->p.offset; + +- ret = write_inode(&trans, &target, target_snapshot); +- if (ret) +- goto err; ++ ret = write_inode(trans, &target, target_snapshot); ++ return ret ?: -EINTR; + } ++ } + +- if (!inode_backpointer_matches(d, &target)) { +- ret = inode_backpointer_exists(&trans, &target); +- if (ret < 0) +- goto err; +- +- backpointer_exists = ret; +- ret = 0; ++ if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, ++ "incorrect d_type: should be %u:\n%s", ++ mode_to_type(target.bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ struct bkey_i_dirent *n; + +- if (fsck_err_on(S_ISDIR(target.bi_mode) && +- backpointer_exists, c, +- "directory %llu with multiple links", +- target.bi_inum)) { +- ret = remove_dirent(&trans, d.k->p); +- if (ret) +- goto err; +- continue; +- } ++ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; + +- if (fsck_err_on(backpointer_exists && +- !target.bi_nlink, c, +- "inode %llu has multiple links but i_nlink 0", +- d_inum)) { +- target.bi_nlink++; +- target.bi_flags &= ~BCH_INODE_UNLINKED; ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_type = mode_to_type(target.bi_mode); + +- ret = write_inode(&trans, &target, target_snapshot); +- if (ret) +- goto err; +- } ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, &n->k_i, 0)); ++ kfree(n); ++ return ret ?: -EINTR; ++ } + +- if (fsck_err_on(!backpointer_exists, c, +- "inode %llu has wrong backpointer:\n" +- "got %llu:%llu\n" +- "should be %llu:%llu", +- d_inum, +- target.bi_dir, +- target.bi_dir_offset, +- k.k->p.inode, +- k.k->p.offset)) { +- target.bi_dir = k.k->p.inode; +- target.bi_dir_offset = k.k->p.offset; +- +- ret = write_inode(&trans, &target, target_snapshot); +- if (ret) +- goto err; +- } +- } ++ *nr_subdirs += d.v->d_type == DT_DIR; ++ return 0; ++fsck_err: ++ return ret; ++} + +- if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, +- "incorrect d_type: should be %u:\n%s", +- mode_to_type(target.bi_mode), +- (bch2_bkey_val_to_text(&PBUF(buf), c, +- k), buf))) { +- struct bkey_i_dirent *n; ++/* ++ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, ++ * validate d_type ++ */ ++noinline_for_stack ++static int check_dirents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct bch_hash_info hash_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ unsigned nr_subdirs = 0; ++ int ret = 0; + +- n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); +- if (!n) { +- ret = -ENOMEM; +- goto err; +- } ++ bch_verbose(c, "checking dirents"); + +- bkey_reassemble(&n->k_i, d.s_c); +- n->v.d_type = mode_to_type(target.bi_mode); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- ret = __bch2_trans_do(&trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(&trans, iter, &n->k_i, 0)); +- kfree(n); +- if (ret) +- goto err; ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH); + ++ while (1) { ++ ret = lockrestart_do(&trans, ++ check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs)); ++ if (ret == 1) { ++ /* at end */ ++ ret = 0; ++ break; + } ++ if (ret) ++ break; + +- nr_subdirs += 
d.v->d_type == DT_DIR; +-next: + bch2_btree_iter_advance(iter); + } +-err: +-fsck_err: +- if (ret == -EINTR) +- goto retry; +- + bch2_trans_iter_put(&trans, iter); ++ + return bch2_trans_exit(&trans) ?: ret; + } + +-- +cgit v1.2.3 + + +From 363b05bb64d11eb52445b2fcca4ccf47cd1bf420 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Jul 2021 23:35:11 -0400 +Subject: bcachefs: Fix bch2_btree_iter_rewind() + +We'd hit a BUG() when rewinding at the start of the btree on btrees with +snapshots. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8a67d8568302..b0d5681f2a5b 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1625,7 +1625,9 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) + inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + { + struct bpos pos = bkey_start_pos(&iter->k); +- bool ret = bpos_cmp(pos, POS_MIN) != 0; ++ bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS ++ ? bpos_cmp(pos, POS_MIN) ++ : bkey_cmp(pos, POS_MIN)) != 0; + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_predecessor(iter, pos); +-- +cgit v1.2.3 + + +From f760b44c4b5a3164dec635b25d62d9717b225941 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 14 Jul 2021 21:25:55 -0400 +Subject: bcachefs: Fixes for unit tests + +The unit tests hadn't been updated for various recent btree changes - +this patch makes them work again. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/tests.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index d099358e43d6..4d8d50fd7642 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -34,6 +34,7 @@ static int test_delete(struct bch_fs *c, u64 nr) + int ret; + + bkey_cookie_init(&k.k_i); ++ k.k.p.snapshot = U32_MAX; + + bch2_trans_init(&trans, c, 0, 0); + +@@ -79,29 +80,27 @@ static int test_delete_written(struct bch_fs *c, u64 nr) + int ret; + + bkey_cookie_init(&k.k_i); ++ k.k.p.snapshot = U32_MAX; + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + +- ret = bch2_btree_iter_traverse(iter); +- if (ret) { +- bch_err(c, "lookup error in test_delete_written: %i", ret); +- goto err; +- } +- + ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(&trans, iter, &k.k_i, 0)); + if (ret) { + bch_err(c, "update error in test_delete_written: %i", ret); + goto err; + } + ++ bch2_trans_unlock(&trans); + bch2_journal_flush_all_pins(&c->journal); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_delete_at(&trans, iter, 0)); ++ bch2_btree_iter_traverse(iter) ?: ++ bch2_btree_delete_at(&trans, iter, 0)); + if (ret) { + bch_err(c, "delete error in test_delete_written: %i", ret); + goto err; +@@ -131,6 +130,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i; ++ k.k.p.snapshot = U32_MAX; + + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); +@@ -185,6 +185,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i + 8; ++ k.k.p.snapshot = U32_MAX; + k.k.size = 8; + + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, +@@ -240,6 +241,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + + 
bkey_cookie_init(&k.k_i); + k.k.p.offset = i * 2; ++ k.k.p.snapshot = U32_MAX; + + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); +@@ -303,6 +305,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i + 16; ++ k.k.p.snapshot = U32_MAX; + k.k.size = 8; + + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, +@@ -410,6 +413,7 @@ static int insert_test_extent(struct bch_fs *c, + + bkey_cookie_init(&k.k_i); + k.k_i.k.p.offset = end; ++ k.k_i.k.p.snapshot = U32_MAX; + k.k_i.k.size = end - start; + k.k_i.k.version.lo = test_version++; + +-- +cgit v1.2.3 + + +From 49d4b5c9762e1a219c43608c7b1a9eea5d5f855a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 15 Jul 2021 13:42:43 -0400 +Subject: bcachefs: Improve btree_bad_header() error message + +We should always print out the full btree node ptr. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 25 ++++++++++++------------- + fs/bcachefs/extents.c | 5 +++-- + 2 files changed, 15 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index b3e90d3f1ef2..3b975f4dceed 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -720,26 +720,25 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) + + static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) + { +- char buf1[100], buf2[100], buf3[100], buf4[100]; ++ char buf1[200], buf2[100], buf3[100]; + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + +- bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2 +- ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key +- : POS_MIN); ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key)); + bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); ++ bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); + +- bch2_bpos_to_text(&PBUF(buf3), b->key.k.p); +- bch2_bpos_to_text(&PBUF(buf4), b->data->max_key); + bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" +- "btree: ptr %u header %llu\n" +- "level: ptr %u header %llu\n" +- "min ptr %s node header %s\n" +- "max ptr %s node header %s", +- b->c.btree_id, BTREE_NODE_ID(b->data), +- b->c.level, BTREE_NODE_LEVEL(b->data), +- buf1, buf2, buf3, buf4); ++ "btree %s level %u\n" ++ "ptr: %s\n" ++ "header: btree %s level %llu\n" ++ "min %s max %s\n", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1, ++ bch2_btree_ids[BTREE_NODE_ID(b->data)], ++ BTREE_NODE_LEVEL(b->data), ++ buf2, buf3); + } + + static inline void btree_check_header(struct bch_fs *c, struct btree *b) +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 3968f1fd7d27..563e13057f5f 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -192,9 +192,10 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + +- pr_buf(out, "seq %llx written %u min_key ", ++ pr_buf(out, "seq %llx written %u min_key %s", + le64_to_cpu(bp.v->seq), +- le16_to_cpu(bp.v->sectors_written)); ++ le16_to_cpu(bp.v->sectors_written), ++ BTREE_PTR_RANGE_UPDATED(bp.v) ? 
"R " : ""); + + bch2_bpos_to_text(out, bp.v->min_key); + pr_buf(out, " "); +-- +cgit v1.2.3 + + +From 45166945cfe1285eebd3f7709a9e588fa9c642bd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 10 Jul 2021 13:44:42 -0400 +Subject: bcachefs: Update btree ptrs after every write + +This closes a significant hole (and last known hole) in our ability to +verify metadata. Previously, since btree nodes are log structured, we +couldn't detect lost btree writes that weren't the first write to a +given node. Additionally, this seems to have lead to some significant +metadata corruption on multi device filesystems with metadata +replication: since a write may have made it to one device and not +another, if we read that btree node back from the replica that did have +that write and started appending after that point, the other replica +would have a gap in the bset entries and reading from that replica +wouldn't find the rest of the bsets. + +But, since updates to interior btree nodes are now journalled, we can +close this hole by updating pointers to btree nodes after every write +with the currently written number of sectors, without negatively +affecting performance. This means we will always detect lost or corrupt +metadata - it also means that our btree is now a curious hybrid of COW +and non COW btrees, with all the benefits of both (excluding +complexity). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 4 +- + fs/bcachefs/bcachefs_format.h | 3 +- + fs/bcachefs/btree_io.c | 222 +++++++++++++++--------------------- + fs/bcachefs/btree_io.h | 11 +- + fs/bcachefs/btree_iter.h | 2 +- + fs/bcachefs/btree_types.h | 2 + + fs/bcachefs/btree_update.h | 4 +- + fs/bcachefs/btree_update_interior.c | 194 ++++++++++++++++++++----------- + fs/bcachefs/btree_update_leaf.c | 3 +- + fs/bcachefs/io_types.h | 3 +- + fs/bcachefs/migrate.c | 2 +- + fs/bcachefs/recovery.c | 5 + + fs/bcachefs/super.c | 9 +- + 13 files changed, 250 insertions(+), 214 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 90844d6532f6..04210df26af7 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -676,7 +676,7 @@ struct bch_fs { + struct btree_key_cache btree_key_cache; + + struct workqueue_struct *btree_update_wq; +- struct workqueue_struct *btree_error_wq; ++ struct workqueue_struct *btree_io_complete_wq; + /* copygc needs its own workqueue for index updates.. 
*/ + struct workqueue_struct *copygc_wq; + +@@ -827,8 +827,6 @@ struct bch_fs { + + atomic64_t btree_writes_nr; + atomic64_t btree_writes_sectors; +- struct bio_list btree_write_error_list; +- struct work_struct btree_write_error_work; + spinlock_t btree_write_error_lock; + + /* ERRORS */ +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 8a89ab0d8e85..ee958f598195 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1210,7 +1210,8 @@ enum bcachefs_metadata_version { + bcachefs_metadata_version_inode_btree_change = 11, + bcachefs_metadata_version_snapshot = 12, + bcachefs_metadata_version_inode_backpointers = 13, +- bcachefs_metadata_version_max = 14, ++ bcachefs_metadata_version_btree_ptr_sectors_written = 14, ++ bcachefs_metadata_version_max = 15, + }; + + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 957a6a9a1559..25f6a689633e 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -26,6 +26,7 @@ void bch2_btree_node_io_unlock(struct btree *b) + { + EBUG_ON(!btree_node_write_in_flight(b)); + ++ clear_btree_node_write_in_flight_inner(b); + clear_btree_node_write_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); + } +@@ -870,7 +871,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + unsigned u64s; +- unsigned nonblacklisted_written = 0; ++ unsigned blacklisted_written, nonblacklisted_written = 0; ++ unsigned ptr_written = btree_ptr_sectors_written(&b->key); + int ret, retry_read = 0, write = READ; + + b->version_ondisk = U16_MAX; +@@ -901,7 +903,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + b->data->keys.seq, bp->seq); + } + +- while (b->written < c->opts.btree_node_size) { ++ while (b->written < (ptr_written ?: c->opts.btree_node_size)) { + unsigned sectors, whiteout_u64s = 0; + struct nonce nonce; + struct bch_csum csum; +@@ -981,6 +983,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + btree_err_on(blacklisted && first, + BTREE_ERR_FIXABLE, c, ca, b, i, + "first btree node bset has blacklisted journal seq"); ++ ++ btree_err_on(blacklisted && ptr_written, ++ BTREE_ERR_FIXABLE, c, ca, b, i, ++ "found blacklisted bset in btree node with sectors_written"); + if (blacklisted && !first) + continue; + +@@ -994,26 +1000,34 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + nonblacklisted_written = b->written; + } + +- for (bne = write_block(b); +- bset_byte_offset(b, bne) < btree_bytes(c); +- bne = (void *) bne + block_bytes(c)) +- btree_err_on(bne->keys.seq == b->data->keys.seq && +- !bch2_journal_seq_is_blacklisted(c, +- le64_to_cpu(bne->keys.journal_seq), +- true), ++ if (ptr_written) { ++ btree_err_on(b->written < ptr_written, + BTREE_ERR_WANT_RETRY, c, ca, b, NULL, +- "found bset signature after last bset"); ++ "btree node data missing: expected %u sectors, found %u", ++ ptr_written, b->written); ++ } else { ++ for (bne = write_block(b); ++ bset_byte_offset(b, bne) < btree_bytes(c); ++ bne = (void *) bne + block_bytes(c)) ++ btree_err_on(bne->keys.seq == b->data->keys.seq && ++ !bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(bne->keys.journal_seq), ++ true), ++ BTREE_ERR_WANT_RETRY, c, ca, b, NULL, ++ "found bset signature after last bset"); + +- /* +- * Blacklisted 
bsets are those that were written after the most recent +- * (flush) journal write. Since there wasn't a flush, they may not have +- * made it to all devices - which means we shouldn't write new bsets +- * after them, as that could leave a gap and then reads from that device +- * wouldn't find all the bsets in that btree node - which means it's +- * important that we start writing new bsets after the most recent _non_ +- * blacklisted bset: +- */ +- b->written = nonblacklisted_written; ++ /* ++ * Blacklisted bsets are those that were written after the most recent ++ * (flush) journal write. Since there wasn't a flush, they may not have ++ * made it to all devices - which means we shouldn't write new bsets ++ * after them, as that could leave a gap and then reads from that device ++ * wouldn't find all the bsets in that btree node - which means it's ++ * important that we start writing new bsets after the most recent _non_ ++ * blacklisted bset: ++ */ ++ blacklisted_written = b->written; ++ b->written = nonblacklisted_written; ++ } + + sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted->keys.u64s = 0; +@@ -1081,6 +1095,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + if (ca->mi.state != BCH_MEMBER_STATE_rw) + set_btree_node_need_rewrite(b); + } ++ ++ if (!ptr_written) ++ set_btree_node_need_rewrite(b); + out: + mempool_free(iter, &c->fill_iter); + return retry_read; +@@ -1578,6 +1595,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) + goto do_write; + + new &= ~(1U << BTREE_NODE_write_in_flight); ++ new &= ~(1U << BTREE_NODE_write_in_flight_inner); + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +@@ -1596,10 +1614,12 @@ do_write: + new &= ~(1U << BTREE_NODE_dirty); + new &= ~(1U << BTREE_NODE_need_write); + new |= (1U << BTREE_NODE_write_in_flight); ++ new |= (1U << BTREE_NODE_write_in_flight_inner); + new |= (1U << BTREE_NODE_just_written); + new ^= (1U << BTREE_NODE_write_idx); + } else { + new &= ~(1U << BTREE_NODE_write_in_flight); ++ new &= ~(1U << BTREE_NODE_write_in_flight_inner); + } + } while ((v = cmpxchg(&b->flags, old, new)) != old); + +@@ -1609,52 +1629,38 @@ do_write: + six_unlock_read(&b->c.lock); + } + +-static void bch2_btree_node_write_error(struct bch_fs *c, +- struct btree_write_bio *wbio) ++static void btree_node_write_work(struct work_struct *work) + { ++ struct btree_write_bio *wbio = ++ container_of(work, struct btree_write_bio, work); ++ struct bch_fs *c = wbio->wbio.c; + struct btree *b = wbio->wbio.bio.bi_private; +- struct bkey_buf k; + struct bch_extent_ptr *ptr; +- struct btree_trans trans; +- struct btree_iter *iter; + int ret; + +- bch2_bkey_buf_init(&k); +- bch2_trans_init(&trans, c, 0, 0); +- +- iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, +- BTREE_MAX_DEPTH, b->c.level, 0); +-retry: +- ret = bch2_btree_iter_traverse(iter); +- if (ret) +- goto err; +- +- /* has node been freed? 
*/ +- if (iter->l[b->c.level].b != b) { +- /* node has been freed: */ +- BUG_ON(!btree_node_dying(b)); +- goto out; +- } +- +- BUG_ON(!btree_node_hashed(b)); +- +- bch2_bkey_buf_copy(&k, c, &b->key); ++ btree_bounce_free(c, ++ wbio->data_bytes, ++ wbio->wbio.used_mempool, ++ wbio->data); + +- bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr, ++ bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, + bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); + +- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k))) ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) + goto err; + +- ret = bch2_btree_node_update_key(&trans, iter, b, k.k); +- if (ret == -EINTR) +- goto retry; +- if (ret) +- goto err; ++ if (wbio->wbio.first_btree_write) { ++ if (wbio->wbio.failed.nr) { ++ ++ } ++ } else { ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, ++ !wbio->wbio.failed.nr)); ++ if (ret) ++ goto err; ++ } + out: +- bch2_trans_iter_put(&trans, iter); +- bch2_trans_exit(&trans); +- bch2_bkey_buf_exit(&k, c); + bio_put(&wbio->wbio.bio); + btree_node_write_done(c, b); + return; +@@ -1664,58 +1670,14 @@ err: + goto out; + } + +-void bch2_btree_write_error_work(struct work_struct *work) +-{ +- struct bch_fs *c = container_of(work, struct bch_fs, +- btree_write_error_work); +- struct bio *bio; +- +- while (1) { +- spin_lock_irq(&c->btree_write_error_lock); +- bio = bio_list_pop(&c->btree_write_error_list); +- spin_unlock_irq(&c->btree_write_error_lock); +- +- if (!bio) +- break; +- +- bch2_btree_node_write_error(c, +- container_of(bio, struct btree_write_bio, wbio.bio)); +- } +-} +- +-static void btree_node_write_work(struct work_struct *work) +-{ +- struct btree_write_bio *wbio = +- container_of(work, struct btree_write_bio, work); +- struct bch_fs *c = wbio->wbio.c; +- struct btree *b = wbio->wbio.bio.bi_private; +- +- btree_bounce_free(c, +- wbio->bytes, +- wbio->wbio.used_mempool, +- wbio->data); +- +- if (wbio->wbio.failed.nr) { +- unsigned long flags; +- +- spin_lock_irqsave(&c->btree_write_error_lock, flags); +- bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); +- spin_unlock_irqrestore(&c->btree_write_error_lock, flags); +- +- queue_work(c->btree_error_wq, &c->btree_write_error_work); +- return; +- } +- +- bio_put(&wbio->wbio.bio); +- btree_node_write_done(c, b); +-} +- + static void btree_node_write_endio(struct bio *bio) + { + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; + struct bch_write_bio *orig = parent ?: wbio; ++ struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); + struct bch_fs *c = wbio->c; ++ struct btree *b = wbio->bio.bi_private; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + unsigned long flags; + +@@ -1736,13 +1698,13 @@ static void btree_node_write_endio(struct bio *bio) + if (parent) { + bio_put(bio); + bio_endio(&parent->bio); +- } else { +- struct btree_write_bio *wb = +- container_of(orig, struct btree_write_bio, wbio); +- +- INIT_WORK(&wb->work, btree_node_write_work); +- queue_work(c->io_complete_wq, &wb->work); ++ return; + } ++ ++ clear_btree_node_write_in_flight_inner(b); ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); ++ INIT_WORK(&wb->work, btree_node_write_work); ++ queue_work(c->btree_io_complete_wq, &wb->work); + } + + static int validate_bset_for_write(struct bch_fs *c, struct btree *b, +@@ -1767,8 +1729,15 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + static void btree_write_submit(struct work_struct *work) + { + struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); ++ struct bch_extent_ptr *ptr; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ ++ bkey_copy(&tmp.k, &wbio->key); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) ++ ptr->offset += wbio->sector_offset; + +- bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key); ++ bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); + } + + void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started) +@@ -1778,7 +1747,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta + struct bset *i; + struct btree_node *bn = NULL; + struct btree_node_entry *bne = NULL; +- struct bch_extent_ptr *ptr; + struct sort_iter sort_iter; + struct nonce nonce; + unsigned bytes_to_write, sectors_to_write, bytes, u64s; +@@ -1818,6 +1786,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta + new &= ~(1 << BTREE_NODE_dirty); + new &= ~(1 << BTREE_NODE_need_write); + new |= (1 << BTREE_NODE_write_in_flight); ++ new |= (1 << BTREE_NODE_write_in_flight_inner); + new |= (1 << BTREE_NODE_just_written); + new ^= (1 << BTREE_NODE_write_idx); + } while (cmpxchg_acquire(&b->flags, old, new) != old); +@@ -1969,37 +1938,30 @@ do_write: + struct btree_write_bio, wbio.bio); + wbio_init(&wbio->wbio.bio); + wbio->data = data; +- wbio->bytes = bytes; ++ wbio->data_bytes = bytes; ++ wbio->sector_offset = b->written; + wbio->wbio.c = c; + wbio->wbio.used_mempool = used_mempool; ++ wbio->wbio.first_btree_write = !b->written; + wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; + wbio->wbio.bio.bi_end_io = btree_node_write_endio; + wbio->wbio.bio.bi_private = b; + + bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); + +- /* +- * If we're appending to a leaf node, we don't technically need FUA - +- * this write just needs to be persisted before the next journal write, +- * which will be marked FLUSH|FUA. +- * +- * Similarly if we're writing a new btree root - the pointer is going to +- * be in the next journal entry. +- * +- * But if we're writing a new btree node (that isn't a root) or +- * appending to a non leaf btree node, we need either FUA or a flush +- * when we write the parent with the new pointer. 
FUA is cheaper than a +- * flush, and writes appending to leaf nodes aren't blocking anything so +- * just make all btree node writes FUA to keep things sane. +- */ +- + bkey_copy(&wbio->key, &b->key); + +- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr) +- ptr->offset += b->written; +- + b->written += sectors_to_write; + ++ if (wbio->wbio.first_btree_write && ++ b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = ++ cpu_to_le16(b->written); ++ ++ if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = ++ cpu_to_le16(b->written); ++ + atomic64_inc(&c->btree_writes_nr); + atomic64_add(sectors_to_write, &c->btree_writes_sectors); + +@@ -2008,6 +1970,10 @@ do_write: + return; + err: + set_btree_node_noevict(b); ++ if (!b->written && ++ b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = ++ cpu_to_le16(sectors_to_write); + b->written += sectors_to_write; + nowrite: + btree_bounce_free(c, bytes, used_mempool, data); +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 3732d135de8d..7fdcf879c7d4 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -32,6 +32,13 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) + atomic_dec(&c->btree_cache.dirty); + } + ++static inline unsigned btree_ptr_sectors_written(struct bkey_i *k) ++{ ++ return k->k.type == KEY_TYPE_btree_ptr_v2 ++ ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written) ++ : 0; ++} ++ + struct btree_read_bio { + struct bch_fs *c; + struct btree *b; +@@ -48,7 +55,8 @@ struct btree_write_bio { + struct work_struct work; + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + void *data; +- unsigned bytes; ++ unsigned data_bytes; ++ unsigned sector_offset; + struct bch_write_bio wbio; + }; + +@@ -137,7 +145,6 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id, + + void bch2_btree_complete_write(struct bch_fs *, struct btree *, + struct btree_write *); +-void bch2_btree_write_error_work(struct work_struct *); + + void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); + bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 58f15b716d49..7385cca43f8b 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -132,7 +132,7 @@ void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); + + static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) + { +- unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); ++ unsigned new_locks_want = iter->level + !!(iter->flags & BTREE_ITER_INTENT); + + if (iter->locks_want > new_locks_want) + __bch2_btree_iter_downgrade(iter, new_locks_want); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index a2581500b791..07c9ba4ea475 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -435,6 +435,7 @@ enum btree_flags { + BTREE_NODE_write_idx, + BTREE_NODE_accessed, + BTREE_NODE_write_in_flight, ++ BTREE_NODE_write_in_flight_inner, + BTREE_NODE_just_written, + BTREE_NODE_dying, + BTREE_NODE_fake, +@@ -449,6 +450,7 @@ BTREE_FLAG(noevict); + BTREE_FLAG(write_idx); + BTREE_FLAG(accessed); + BTREE_FLAG(write_in_flight); ++BTREE_FLAG(write_in_flight_inner); + BTREE_FLAG(just_written); + BTREE_FLAG(dying); + BTREE_FLAG(fake); +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 12065bba82dd..bab135fae0b0 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -74,7 +74,9 @@ int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + __le64, unsigned); + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); + int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, +- struct btree *, struct bkey_i *); ++ struct btree *, struct bkey_i *, bool); ++int bch2_btree_node_update_key_get_iter(struct btree_trans *, ++ struct btree *, struct bkey_i *, bool); + + int bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 0b4e4056e1d9..a254240868a5 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -246,11 +246,7 @@ retry: + goto retry; + } + +- if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) +- bkey_btree_ptr_v2_init(&tmp.k); +- else +- bkey_btree_ptr_init(&tmp.k); +- ++ bkey_btree_ptr_v2_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); + + bch2_open_bucket_get(c, wp, &ob); +@@ -567,7 +563,8 @@ static void btree_update_nodes_written(struct btree_update *as) + six_unlock_read(&old->c.lock); + + if (seq == as->old_nodes_seq[i]) +- bch2_btree_node_wait_on_write(old); ++ wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner, ++ TASK_UNINTERRUPTIBLE); + } + + /* +@@ -1153,6 +1150,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + struct bkey_packed *k; + const char *invalid; + ++ BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && ++ !btree_ptr_sectors_written(insert)); ++ + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); + if (invalid) { +@@ -1395,6 +1395,7 @@ static void btree_split(struct btree_update *as, + six_unlock_write(&n2->c.lock); + six_unlock_write(&n1->c.lock); + ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent); + bch2_btree_node_write(c, n2, SIX_LOCK_intent); + + /* +@@ -1422,12 +1423,12 @@ static void btree_split(struct btree_update *as, + bch2_btree_build_aux_trees(n1); + six_unlock_write(&n1->c.lock); + ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent); ++ + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); + } + +- bch2_btree_node_write(c, n1, SIX_LOCK_intent); +- + /* New nodes all written, now make them visible: */ + + if (parent) { +@@ -1703,13 +1704,13 @@ retry: + bch2_btree_build_aux_trees(n); + six_unlock_write(&n->c.lock); + ++ bch2_btree_node_write(c, n, 
SIX_LOCK_intent); ++ + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; + bch2_keylist_add(&as->parent_keys, &delete); + bch2_keylist_add(&as->parent_keys, &n->key); + +- bch2_btree_node_write(c, n, SIX_LOCK_intent); +- + bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); + + bch2_btree_update_get_open_buckets(as, n); +@@ -1883,74 +1884,109 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) + queue_work(c->btree_interior_update_worker, &a->work); + } + +-static void __bch2_btree_node_update_key(struct btree_update *as, +- struct btree_trans *trans, +- struct btree_iter *iter, +- struct btree *b, struct btree *new_hash, +- struct bkey_i *new_key) ++static int __bch2_btree_node_update_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b, struct btree *new_hash, ++ struct bkey_i *new_key, ++ bool skip_triggers) + { +- struct bch_fs *c = as->c; ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter2 = NULL; + struct btree *parent; ++ u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; + int ret; + +- btree_update_will_delete_key(as, &b->key); +- btree_update_will_add_key(as, new_key); ++ if (!skip_triggers) { ++ ret = bch2_trans_mark_key(trans, ++ bkey_s_c_null, ++ bkey_i_to_s_c(new_key), ++ BTREE_TRIGGER_INSERT); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_mark_key(trans, ++ bkey_i_to_s_c(&b->key), ++ bkey_s_c_null, ++ BTREE_TRIGGER_OVERWRITE); ++ if (ret) ++ return ret; ++ } ++ ++ if (new_hash) { ++ bkey_copy(&new_hash->key, new_key); ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, ++ new_hash, b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ } + + parent = btree_node_parent(iter, b); + if (parent) { +- if (new_hash) { +- bkey_copy(&new_hash->key, new_key); +- ret = bch2_btree_node_hash_insert(&c->btree_cache, +- new_hash, b->c.level, b->c.btree_id); +- BUG_ON(ret); +- } ++ iter2 = bch2_trans_copy_iter(trans, iter); + +- bch2_keylist_add(&as->parent_keys, new_key); +- bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, 0); ++ BUG_ON(iter2->level != b->c.level); ++ BUG_ON(bpos_cmp(iter2->pos, new_key->k.p)); + +- if (new_hash) { +- mutex_lock(&c->btree_cache.lock); +- bch2_btree_node_hash_remove(&c->btree_cache, new_hash); ++ btree_node_unlock(iter2, iter2->level); ++ iter2->l[iter2->level].b = BTREE_ITER_NO_NODE_UP; ++ iter2->level++; + +- bch2_btree_node_hash_remove(&c->btree_cache, b); +- +- bkey_copy(&b->key, new_key); +- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); +- BUG_ON(ret); +- mutex_unlock(&c->btree_cache.lock); +- } else { +- bkey_copy(&b->key, new_key); +- } ++ ret = bch2_btree_iter_traverse(iter2) ?: ++ bch2_trans_update(trans, iter2, new_key, BTREE_TRIGGER_NORUN); ++ if (ret) ++ goto err; + } else { + BUG_ON(btree_node_root(c, b) != b); + +- bch2_btree_node_lock_write(b, iter); +- bkey_copy(&b->key, new_key); ++ trans->extra_journal_entries = (void *) &journal_entries[0]; ++ trans->extra_journal_entry_u64s = ++ journal_entry_set((void *) &journal_entries[0], ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ new_key, new_key->k.u64s); ++ } + +- if (btree_ptr_hash_val(&b->key) != b->hash_val) { +- mutex_lock(&c->btree_cache.lock); +- bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_JOURNAL_RECLAIM| ++ BTREE_INSERT_JOURNAL_RESERVED| ++ BTREE_INSERT_NOUNLOCK); ++ if (ret) ++ goto err; + +- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); +- 
BUG_ON(ret); +- mutex_unlock(&c->btree_cache.lock); +- } ++ bch2_btree_node_lock_write(b, iter); + +- btree_update_updated_root(as, b); +- bch2_btree_node_unlock_write(b, iter); ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new_key); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ } else { ++ bkey_copy(&b->key, new_key); + } + +- bch2_btree_update_done(as); ++ bch2_btree_node_unlock_write(b, iter); ++out: ++ bch2_trans_iter_put(trans, iter2); ++ return ret; ++err: ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ mutex_unlock(&c->btree_cache.lock); ++ } ++ goto out; + } + +-int bch2_btree_node_update_key(struct btree_trans *trans, +- struct btree_iter *iter, +- struct btree *b, +- struct bkey_i *new_key) ++int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, ++ struct btree *b, struct bkey_i *new_key, ++ bool skip_triggers) + { + struct bch_fs *c = trans->c; +- struct btree *parent = btree_node_parent(iter, b); +- struct btree_update *as = NULL; + struct btree *new_hash = NULL; + struct closure cl; + int ret = 0; +@@ -1964,27 +2000,18 @@ int bch2_btree_node_update_key(struct btree_trans *trans, + if (btree_ptr_hash_val(new_key) != b->hash_val) { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + if (ret) { +- bch2_trans_unlock(iter->trans); ++ bch2_trans_unlock(trans); + closure_sync(&cl); +- if (!bch2_trans_relock(iter->trans)) ++ if (!bch2_trans_relock(trans)) + return -EINTR; + } + + new_hash = bch2_btree_node_mem_alloc(c); + } + +- as = bch2_btree_update_start(iter, b->c.level, +- parent ? btree_update_reserve_required(c, parent) : 0, +- BTREE_INSERT_NOFAIL); +- if (IS_ERR(as)) { +- ret = PTR_ERR(as); +- goto err; +- } +- +- __bch2_btree_node_update_key(as, trans, iter, b, new_hash, new_key); ++ ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, ++ new_key, skip_triggers); + +- bch2_btree_iter_downgrade(iter); +-err: + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + list_move(&new_hash->list, &c->btree_cache.freeable); +@@ -1998,6 +2025,35 @@ err: + return ret; + } + ++int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, ++ struct btree *b, struct bkey_i *new_key, ++ bool skip_triggers) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_node_iter(trans, b->c.btree_id, b->key.k.p, ++ BTREE_MAX_DEPTH, b->c.level, ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto out; ++ ++ /* has node been freed? 
*/ ++ if (iter->l[b->c.level].b != b) { ++ /* node has been freed: */ ++ BUG_ON(!btree_node_dying(b)); ++ goto out; ++ } ++ ++ BUG_ON(!btree_node_hashed(b)); ++ ++ ret = bch2_btree_node_update_key(trans, iter, b, new_key, skip_triggers); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ + /* Init code: */ + + /* +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 0843e2c395aa..d319e27aed9f 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -890,7 +890,8 @@ int __bch2_trans_commit(struct btree_trans *trans) + unsigned u64s, reset_flags = 0; + int ret = 0; + +- if (!trans->nr_updates) ++ if (!trans->nr_updates && ++ !trans->extra_journal_entry_u64s) + goto out_reset; + + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +index e7aca7c9823a..0aab77951c4c 100644 +--- a/fs/bcachefs/io_types.h ++++ b/fs/bcachefs/io_types.h +@@ -94,7 +94,8 @@ struct bch_write_bio { + bounce:1, + put_bio:1, + have_ioref:1, +- used_mempool:1; ++ used_mempool:1, ++ first_btree_write:1; + + struct bio bio; + }; +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index aacd6385db1f..1f65eca48c6e 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -139,7 +139,7 @@ retry: + break; + } + +- ret = bch2_btree_node_update_key(&trans, iter, b, k.k); ++ ret = bch2_btree_node_update_key(&trans, iter, b, k.k, false); + if (ret == -EINTR) { + b = bch2_btree_iter_peek_node(iter); + ret = 0; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index c6fa4ca31ae9..84e224fb0d01 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1005,6 +1005,11 @@ int bch2_fs_recovery(struct bch_fs *c) + c->opts.fix_errors = FSCK_OPT_YES; + } + ++ if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { ++ bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required"); ++ c->opts.version_upgrade = true; ++ } ++ + ret = bch2_blacklist_table_initialize(c); + if (ret) { + bch_err(c, "error initializing blacklist table"); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 13a5ca713e7a..ce8e5d4843d0 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -507,8 +507,8 @@ static void __bch2_fs_free(struct bch_fs *c) + destroy_workqueue(c->io_complete_wq ); + if (c->copygc_wq) + destroy_workqueue(c->copygc_wq); +- if (c->btree_error_wq) +- destroy_workqueue(c->btree_error_wq); ++ if (c->btree_io_complete_wq) ++ destroy_workqueue(c->btree_io_complete_wq); + if (c->btree_update_wq) + destroy_workqueue(c->btree_update_wq); + +@@ -560,7 +560,6 @@ void __bch2_fs_stop(struct bch_fs *c) + for_each_member_device(ca, c, i) + cancel_work_sync(&ca->io_error_work); + +- cancel_work_sync(&c->btree_write_error_work); + cancel_work_sync(&c->read_only_work); + + for (i = 0; i < c->sb.nr_devices; i++) +@@ -688,9 +687,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + mutex_init(&c->bio_bounce_pages_lock); + +- bio_list_init(&c->btree_write_error_list); + spin_lock_init(&c->btree_write_error_lock); +- INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); + + INIT_WORK(&c->journal_seq_blacklist_gc_work, + bch2_blacklist_entries_gc); +@@ -760,7 +757,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || +- !(c->btree_error_wq = 
alloc_workqueue("bcachefs_error", ++ !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || +-- +cgit v1.2.3 + + +From bcacb4ae7b79d70ca3bfe4e8dce0dcd752d9c428 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Jul 2021 01:44:26 -0400 +Subject: Revert "bcachefs: statfs bfree and bavail should be the same" + +This reverts commit 664f9847bec525d396d62d2db094ca9020289ae0. +--- + fs/bcachefs/fs.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 2ae1aed69445..20907e554dd3 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1276,8 +1276,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) + buf->f_type = BCACHEFS_STATFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = usage.capacity >> shift; +- buf->f_bfree = avail_factor(usage.free) >> shift; +- buf->f_bavail = buf->f_bfree; ++ buf->f_bfree = usage.free >> shift; ++ buf->f_bavail = avail_factor(usage.free) >> shift; + + buf->f_files = usage.nr_inodes + avail_inodes; + buf->f_ffree = avail_inodes; +-- +cgit v1.2.3 + + +From a55c377e0c511b1670b9b60af2528499488953d7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 16 Jul 2021 12:57:27 -0400 +Subject: bcachefs: BSET_OFFSET() + +Add a field to struct bset for the sector offset within the btree node +where it was written. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 3 +++ + fs/bcachefs/btree_io.c | 19 +++++++++++++------ + 2 files changed, 16 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index ee958f598195..98779e46bbd0 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1742,6 +1742,9 @@ LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); + LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, + struct bset, flags, 5, 6); + ++/* Sector offset within the btree node: */ ++LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); ++ + struct btree_node { + struct bch_csum csum; + __le64 magic; +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 25f6a689633e..40fa0111a3f6 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -656,7 +656,8 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) + + static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, struct bset *i, +- unsigned sectors, int write, bool have_retry) ++ unsigned offset, unsigned sectors, ++ int write, bool have_retry) + { + unsigned version = le16_to_cpu(i->version); + const char *err; +@@ -694,18 +695,23 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + BTREE_ERR_FATAL, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + +- if (btree_err_on(b->written + sectors > c->opts.btree_node_size, ++ if (btree_err_on(offset + sectors > c->opts.btree_node_size, + BTREE_ERR_FIXABLE, c, ca, b, i, + "bset past end of btree node")) { + i->u64s = 0; + return 0; + } + +- btree_err_on(b->written && !i->u64s, ++ btree_err_on(offset && !i->u64s, + BTREE_ERR_FIXABLE, c, ca, b, i, + "empty bset"); + +- if (!b->written) { ++ btree_err_on(BSET_OFFSET(i) && ++ BSET_OFFSET(i) != offset, ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "bset at wrong sector offset"); ++ ++ if (!offset) { + struct btree_node *bn = + container_of(i, struct btree_node, keys); + /* These 
indicate that we read the wrong btree node: */ +@@ -959,7 +965,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + b->version_ondisk = min(b->version_ondisk, + le16_to_cpu(i->version)); + +- ret = validate_bset(c, ca, b, i, sectors, ++ ret = validate_bset(c, ca, b, i, b->written, sectors, + READ, have_retry); + if (ret) + goto fsck_err; +@@ -1717,7 +1723,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + return -1; + + ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: +- validate_bset(c, NULL, b, i, sectors, WRITE, false); ++ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); + if (ret) { + bch2_inconsistent_error(c); + dump_stack(); +@@ -1880,6 +1886,7 @@ do_write: + i->version = c->sb.version < bcachefs_metadata_version_new_versioning + ? cpu_to_le16(BCH_BSET_VERSION_OLD) + : cpu_to_le16(c->sb.version); ++ SET_BSET_OFFSET(i, b->written); + SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); + + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) +-- +cgit v1.2.3 + + +From 83901f9be0e87964a0114a0eacc9ae01f48347ab Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 20 Jul 2021 20:14:44 -0400 +Subject: bcachefs: Don't downgrade in traverse() + +Downgrading of btree iterators is something that should only happen +explicitly. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 10 ++++------ + 1 file changed, 4 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 6a2984e97b1f..b9e304a3267d 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -340,12 +340,10 @@ fill: + + iter->uptodate = BTREE_ITER_NEED_PEEK; + +- if (!(iter->flags & BTREE_ITER_INTENT)) +- bch2_btree_iter_downgrade(iter); +- else if (!iter->locks_want) { +- if (!__bch2_btree_iter_upgrade(iter, 1)) +- ret = -EINTR; +- } ++ if ((iter->flags & BTREE_ITER_INTENT) && ++ !iter->locks_want && ++ __bch2_btree_iter_upgrade(iter, 1)) ++ ret = -EINTR; + + return ret; + err: +-- +cgit v1.2.3 + + +From 9ce5fdf1dac151ec247663fb4bfd8c080bcacb32 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 20 Jul 2021 21:07:21 -0400 +Subject: bcachefs: Handle lock restarts in bch2_xattr_get() + +Snapshots add another btree lookup, thus we need to handle lock +restarts. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/xattr.c | 23 ++++++++++++----------- + 1 file changed, 12 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 8bd7553b9ebd..05afbac97b6b 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -118,18 +118,15 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, + le16_to_cpu(xattr.v->x_val_len)); + } + +-int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, +- const char *name, void *buffer, size_t size, int type) ++static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, ++ const char *name, void *buffer, size_t size, int type) + { +- struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); +- struct btree_trans trans; ++ struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); + struct btree_iter *iter; + struct bkey_s_c_xattr xattr; + int ret; + +- bch2_trans_init(&trans, c, 0, 0); +- +- iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash, ++ iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash, + inode->v.i_ino, + &X_SEARCH(type, name, strlen(name)), + 0); +@@ -145,14 +142,18 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, + else + memcpy(buffer, xattr_val(xattr.v), ret); + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_put(trans, iter); + err: +- bch2_trans_exit(&trans); +- +- BUG_ON(ret == -EINTR); + return ret == -ENOENT ? -ENODATA : ret; + } + ++int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, ++ const char *name, void *buffer, size_t size, int type) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); ++} ++ + int bch2_xattr_set(struct btree_trans *trans, u64 inum, + const struct bch_hash_info *hash_info, + const char *name, const void *value, size_t size, +-- +cgit v1.2.3 + + +From c714cad9514991ed9a892045662f1f7d463a6c05 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 20 Jul 2021 21:18:16 -0400 +Subject: bcachefs: Use bch2_inode_find_by_inum() in truncate + +This is needed for snapshots because we need to start handling lock +restarts even when just calling bch2_inode_peek(). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 13 +------------ + 1 file changed, 1 insertion(+), 12 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index f29ffe420a33..a5aaf2ca1950 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2309,8 +2309,6 @@ int bch2_truncate(struct user_namespace *mnt_userns, + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + struct bch_inode_unpacked inode_u; +- struct btree_trans trans; +- struct btree_iter *iter; + u64 new_i_size = iattr->ia_size; + s64 i_sectors_delta = 0; + int ret = 0; +@@ -2331,16 +2329,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + +- /* +- * fetch current on disk i_size: inode is locked, i_size can only +- * increase underneath us: +- */ +- bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); +- ret = PTR_ERR_OR_ZERO(iter); +- bch2_trans_iter_put(&trans, iter); +- bch2_trans_exit(&trans); +- ++ ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u); + if (ret) + goto err; + +-- +cgit v1.2.3 + + +From a52e63fd8d002490eb8cf87336dbb94e5f1f5d2b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 21 Jul 2021 13:23:50 -0400 +Subject: bcachefs: Don't squash return code in check_dirents() + +We were squashing BCH_FSCK_ERRORS_NOT_FIXED. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 13 +++---------- + 1 file changed, 3 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index bedfd34803ce..63d42542c194 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -727,7 +727,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + + k = bch2_btree_iter_peek(iter); + if (!k.k) +- return 1; ++ return 0; + + ret = bkey_err(k); + if (ret) +@@ -904,19 +904,12 @@ static int check_dirents(struct bch_fs *c) + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); + +- while (1) { ++ do { + ret = lockrestart_do(&trans, + check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs)); +- if (ret == 1) { +- /* at end */ +- ret = 0; +- break; +- } + if (ret) + break; +- +- bch2_btree_iter_advance(iter); +- } ++ } while (bch2_btree_iter_advance(iter)); + bch2_trans_iter_put(&trans, iter); + + return bch2_trans_exit(&trans) ?: ret; +-- +cgit v1.2.3 + + +From 8548623e59112406dc776085f3ebbfd13221efc3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 21 Jul 2021 13:55:51 -0400 +Subject: bcachefs: Pretty-ify bch2_bkey_val_to_text() + +Don't print out the ": " when there isn't a value to print. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index ff9d770aabea..f8adbf437276 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -269,7 +269,7 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, + { + bch2_bkey_to_text(out, k.k); + +- if (k.k) { ++ if (bkey_val_bytes(k.k)) { + pr_buf(out, ": "); + bch2_val_to_text(out, c, k); + } +-- +cgit v1.2.3 + + +From 9c6b6c4f5841a527acec2e0a6242acf0adb8c293 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Jul 2021 20:20:02 -0400 +Subject: bcachefs: Fix a btree iterator leak + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index d319e27aed9f..9bdf1f64d0e5 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -839,8 +839,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(update_iter); +- if (ret) ++ if (ret) { ++ bch2_trans_iter_put(trans, update_iter); + goto out; ++ } + + bch2_trans_update(trans, update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| +-- +cgit v1.2.3 + + +From 8177f78561f705f6961f0849eb9bf58cca19fbb4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 23 Jul 2021 18:26:38 -0400 +Subject: bcachefs: Use bch2_trans_do() in bch2_btree_key_cache_journal_flush() + +We're working to standardize handling of transaction restarts. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 22 +++++++--------------- + 1 file changed, 7 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index b9e304a3267d..e93bf675f19a 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -375,10 +375,9 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_CACHED_NOCREATE| + BTREE_ITER_INTENT); +-retry: + ret = bch2_btree_iter_traverse(c_iter); + if (ret) +- goto err; ++ goto out; + + ck = (void *) c_iter->l[0].b; + if (!ck || +@@ -409,15 +408,10 @@ retry: + ? 
BTREE_INSERT_JOURNAL_RESERVED + : 0)| + commit_flags); +-err: +- if (ret == -EINTR) +- goto retry; +- +- if (ret == -EAGAIN) +- goto out; +- + if (ret) { +- bch2_fs_fatal_err_on(!bch2_journal_error(j), c, ++ bch2_fs_fatal_err_on(ret != -EINTR && ++ ret != -EAGAIN && ++ !bch2_journal_error(j), c, + "error flushing key cache: %i", ret); + goto out; + } +@@ -465,7 +459,6 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, + struct bkey_cached *ck = + container_of(pin, struct bkey_cached, journal); + struct bkey_cached_key key; +- struct btree_trans trans; + int ret = 0; + + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); +@@ -480,10 +473,9 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, + } + six_unlock_read(&ck->c.lock); + +- bch2_trans_init(&trans, c, 0, 0); +- ret = btree_key_cache_flush_pos(&trans, key, seq, +- BTREE_INSERT_JOURNAL_RECLAIM, false); +- bch2_trans_exit(&trans); ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ btree_key_cache_flush_pos(&trans, key, seq, ++ BTREE_INSERT_JOURNAL_RECLAIM, false)); + unlock: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + +-- +cgit v1.2.3 + + +From 49d70e67a01ba0e7314cd1891ebaeb14fb4b8ecb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Jul 2021 17:12:51 -0400 +Subject: bcachefs: bch2_btree_iter_relock_intent() + +This adds a new helper for btree_cache.c that does what we want where +the iterator is still being traverse - and also eliminates some +unnecessary transaction restarts. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 23 +++++++++-------------- + fs/bcachefs/btree_cache.h | 2 +- + fs/bcachefs/btree_iter.c | 34 +++++++++++++++++++++++++++++++--- + fs/bcachefs/btree_iter.h | 1 + + 4 files changed, 42 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 3b975f4dceed..b07e0204996d 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -694,14 +694,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + if (!sync) + return NULL; + +- /* +- * XXX: this will probably always fail because btree_iter_relock() +- * currently fails for iterators that aren't pointed at a valid btree +- * node +- */ + if (iter && + (!bch2_trans_relock(iter->trans) || +- !bch2_btree_iter_relock(iter, _THIS_IP_))) ++ !bch2_btree_iter_relock_intent(iter))) + return ERR_PTR(-EINTR); + + if (!six_relock_type(&b->c.lock, lock_type, seq)) +@@ -761,11 +756,12 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b) + * The btree node will have either a read or a write lock held, depending on + * the @write parameter. 
+ */ +-struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, ++struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter *iter, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) + { ++ struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; +@@ -839,7 +835,7 @@ lock_node: + if (bch2_btree_node_relock(iter, level + 1)) + goto retry; + +- trace_trans_restart_btree_node_reused(iter->trans->ip, ++ trace_trans_restart_btree_node_reused(trans->ip, + trace_ip, + iter->btree_id, + &iter->real_pos); +@@ -851,18 +847,17 @@ lock_node: + u32 seq = b->c.lock.state.seq; + + six_unlock_type(&b->c.lock, lock_type); +- bch2_trans_unlock(iter->trans); ++ bch2_trans_unlock(trans); + + bch2_btree_node_wait_on_read(b); + + /* +- * XXX: check if this always fails - btree_iter_relock() +- * currently fails for iterators that aren't pointed at a valid +- * btree node ++ * should_be_locked is not set on this iterator yet, so we need ++ * to relock it specifically: + */ + if (iter && +- (!bch2_trans_relock(iter->trans) || +- !bch2_btree_iter_relock(iter, _THIS_IP_))) ++ (!bch2_trans_relock(trans) || ++ !bch2_btree_iter_relock_intent(iter))) + return ERR_PTR(-EINTR); + + if (!six_relock_type(&b->c.lock, lock_type, seq)) +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index fd5026c9f9c3..3dbfd6201d28 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -22,7 +22,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); + struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); + struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); + +-struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, ++struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *, + const struct bkey_i *, unsigned, + enum six_lock_type, unsigned long); + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index b0d5681f2a5b..8ea9e881a1b4 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -205,7 +205,6 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, + is_btree_node(iter, l) + ? iter->l[l].b->c.lock.state.seq + : 0); +- + fail_idx = l; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + } +@@ -383,6 +382,34 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) + static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} + #endif + ++/* ++ * Only for btree_cache.c - only relocks intent locks ++ */ ++bool bch2_btree_iter_relock_intent(struct btree_iter *iter) ++{ ++ unsigned l; ++ ++ for (l = iter->level; ++ l < iter->locks_want && btree_iter_node(iter, l); ++ l++) { ++ if (!bch2_btree_node_relock(iter, l)) { ++ trace_node_relock_fail(iter->trans->ip, _RET_IP_, ++ iter->btree_id, &iter->real_pos, ++ l, iter->l[l].lock_seq, ++ is_btree_node(iter, l) ++ ? 0 ++ : (unsigned long) iter->l[l].b, ++ is_btree_node(iter, l) ++ ? 
iter->l[l].b->c.lock.state.seq ++ : 0); ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ return false; ++ } ++ } ++ ++ return true; ++} ++ + __flatten + bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) + { +@@ -1175,7 +1202,8 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, + static __always_inline int btree_iter_down(struct btree_iter *iter, + unsigned long trace_ip) + { +- struct bch_fs *c = iter->trans->c; ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; + struct btree_iter_level *l = &iter->l[iter->level]; + struct btree *b; + unsigned level = iter->level - 1; +@@ -1189,7 +1217,7 @@ static __always_inline int btree_iter_down(struct btree_iter *iter, + bch2_bkey_buf_unpack(&tmp, c, l->b, + bch2_btree_node_iter_peek(&l->iter, l->b)); + +- b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip); ++ b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); + if (unlikely(ret)) + goto err; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 7385cca43f8b..3889683e16f8 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -111,6 +111,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bkey_packed *, + unsigned, unsigned); + ++bool bch2_btree_iter_relock_intent(struct btree_iter *); + bool bch2_btree_iter_relock(struct btree_iter *, unsigned long); + + bool bch2_trans_relock(struct btree_trans *); +-- +cgit v1.2.3 + + +From 44b1deb736b5e3d80276789faab9a4184f50b467 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Jul 2021 17:31:25 -0400 +Subject: bcachefs: Minor tracepoint improvements + +Btree iterator tracepoints should print whether they're for the key +cache. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 6 +++++- + include/trace/events/bcachefs.h | 22 ++++++++++++++++------ + 2 files changed, 21 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8ea9e881a1b4..a8734b6f1835 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -197,6 +197,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, + (upgrade + ? 
trace_node_upgrade_fail + : trace_node_relock_fail)(iter->trans->ip, trace_ip, ++ btree_iter_type(iter) == BTREE_ITER_CACHED, + iter->btree_id, &iter->real_pos, + l, iter->l[l].lock_seq, + is_btree_node(iter, l) +@@ -394,6 +395,7 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) + l++) { + if (!bch2_btree_node_relock(iter, l)) { + trace_node_relock_fail(iter->trans->ip, _RET_IP_, ++ btree_iter_type(iter) == BTREE_ITER_CACHED, + iter->btree_id, &iter->real_pos, + l, iter->l[l].lock_seq, + is_btree_node(iter, l) +@@ -1389,6 +1391,7 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, + static int btree_iter_traverse_one(struct btree_iter *iter, + unsigned long trace_ip) + { ++ struct btree_trans *trans = iter->trans; + unsigned l, depth_want = iter->level; + int ret = 0; + +@@ -1450,7 +1453,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + + iter->uptodate = BTREE_ITER_NEED_PEEK; + out: +- trace_iter_traverse(iter->trans->ip, trace_ip, ++ trace_iter_traverse(trans->ip, trace_ip, ++ btree_iter_type(iter) == BTREE_ITER_CACHED, + iter->btree_id, &iter->real_pos, ret); + bch2_btree_iter_verify(iter); + return ret; +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 4c0d9b7660ee..a11bb5f7180e 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -775,14 +775,16 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, + TRACE_EVENT(iter_traverse, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, ++ bool key_cache, + enum btree_id btree_id, + struct bpos *pos, + int ret), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos, ret), ++ TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, ret), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) ++ __field(u8, key_cache ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) +@@ -793,6 +795,7 @@ TRACE_EVENT(iter_traverse, + TP_fast_assign( + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; ++ __entry->key_cache = key_cache; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; +@@ -800,9 +803,10 @@ TRACE_EVENT(iter_traverse, + __entry->ret = ret; + ), + +- TP_printk("%ps %pS pos %u %llu:%llu:%u ret %i", ++ TP_printk("%ps %pS key cache %u btree %u %llu:%llu:%u ret %i", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, ++ __entry->key_cache, + __entry->btree_id, + __entry->pos_inode, + __entry->pos_offset, +@@ -953,15 +957,17 @@ TRACE_EVENT(trans_restart_mem_realloced, + DECLARE_EVENT_CLASS(node_lock_fail, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, ++ bool key_cache, + enum btree_id btree_id, + struct bpos *pos, + unsigned level, u32 iter_seq, unsigned node, u32 node_seq), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos, ++ TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, + level, iter_seq, node, node_seq), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) ++ __field(u8, key_cache ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) +@@ -975,6 +981,7 @@ DECLARE_EVENT_CLASS(node_lock_fail, + TP_fast_assign( + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; ++ __entry->key_cache = key_cache; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; +@@ -985,9 +992,10 @@ DECLARE_EVENT_CLASS(node_lock_fail, + 
__entry->node_seq = node_seq; + ), + +- TP_printk("%ps %pS btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u", ++ TP_printk("%ps %pS key cache %u btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, ++ __entry->key_cache, + __entry->btree_id, + __entry->pos_inode, + __entry->pos_offset, +@@ -999,20 +1007,22 @@ DECLARE_EVENT_CLASS(node_lock_fail, + DEFINE_EVENT(node_lock_fail, node_upgrade_fail, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, ++ bool key_cache, + enum btree_id btree_id, + struct bpos *pos, + unsigned level, u32 iter_seq, unsigned node, u32 node_seq), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos, ++ TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, + level, iter_seq, node, node_seq) + ); + + DEFINE_EVENT(node_lock_fail, node_relock_fail, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, ++ bool key_cache, + enum btree_id btree_id, + struct bpos *pos, + unsigned level, u32 iter_seq, unsigned node, u32 node_seq), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos, ++ TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, + level, iter_seq, node, node_seq) + ); + +-- +cgit v1.2.3 + + +From 2d11e65e496d9b4a8025e24bf7dbc105ea5a82a7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 26 Jul 2021 15:52:41 -0400 +Subject: bcachefs: Add an option for btree node mem ptr optimization + +bch2_btree_node_ptr_v2 has a field for stashing a pointer to the in +memory btree node; this is safe because we clear this field when reading +in nodes from disk and we never free in memory btree nodes - but, we +have bug reports that indicate something might be faulty with this +optimization, so let's add an option for it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 16 ++++++++++------ + fs/bcachefs/opts.h | 5 +++++ + 2 files changed, 15 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index b07e0204996d..c2919b874d4e 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -768,9 +768,11 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter * + + EBUG_ON(level >= BTREE_MAX_DEPTH); + +- b = btree_node_mem_ptr(k); +- if (b) +- goto lock_node; ++ if (c->opts.btree_node_mem_ptr_optimization) { ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++ } + retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { +@@ -903,9 +905,11 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, + + EBUG_ON(level >= BTREE_MAX_DEPTH); + +- b = btree_node_mem_ptr(k); +- if (b) +- goto lock_node; ++ if (c->opts.btree_node_mem_ptr_optimization) { ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++ } + retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index ed505857bc9e..003c00f25037 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -178,6 +178,11 @@ enum opt_type { + OPT_BOOL(), \ + BCH_SB_INODES_USE_KEY_CACHE, true, \ + NULL, "Use the btree key cache for the inodes btree") \ ++ x(btree_node_mem_ptr_optimization, u8, \ ++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, true, \ ++ NULL, "Stash pointer to in memory btree node in btree ptr")\ + x(gc_reserve_percent, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(5, 21), \ +-- +cgit v1.2.3 + + +From 35f24bc679c53dd4ec7a0d67d9b17f60ee8bad03 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 25 Jul 2021 
14:20:43 -0400 +Subject: bcachefs: Don't traverse iterators in __bch2_trans_commit() + +They should already be traversed, and we're asserting that since the +introduction of iter->should_be_locked + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 9bdf1f64d0e5..6196fc4c8799 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -946,13 +946,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + } while (trans_trigger_run); + + trans_for_each_update(trans, i) { +- ret = bch2_btree_iter_traverse(i->iter); +- if (unlikely(ret)) { +- trace_trans_restart_traverse(trans->ip, _RET_IP_, +- i->iter->btree_id, +- &i->iter->pos); +- goto out; +- } ++ BUG_ON(!i->iter->should_be_locked); + + if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { + trace_trans_restart_upgrade(trans->ip, _RET_IP_, +@@ -1052,7 +1046,11 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); ++ ret = bch2_btree_iter_traverse(n.iter); + bch2_trans_iter_put(trans, n.iter); ++ ++ if (ret) ++ return ret; + } + + BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS); +-- +cgit v1.2.3 + + +From 2bc3d7db0faf4172ec6845c66a8a5706113409e7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Jul 2021 17:58:58 -0400 +Subject: bcachefs: bch2_trans_relock() only relocks iters that should be + locked + +This avoids unexpected lock restarts in bch2_btree_iter_traverse_all(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index a8734b6f1835..53a2e66fa40a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -496,8 +496,7 @@ void bch2_trans_downgrade(struct btree_trans *trans) + + /* Btree transaction locking: */ + +-static inline bool btree_iter_should_be_locked(struct btree_trans *trans, +- struct btree_iter *iter) ++static inline bool btree_iter_should_be_locked(struct btree_iter *iter) + { + return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || + iter->should_be_locked; +@@ -508,8 +507,8 @@ bool bch2_trans_relock(struct btree_trans *trans) + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) +- if (!bch2_btree_iter_relock(iter, _RET_IP_) && +- btree_iter_should_be_locked(trans, iter)) { ++ if (btree_iter_should_be_locked(iter) && ++ !bch2_btree_iter_relock(iter, _RET_IP_)) { + trace_trans_restart_relock(trans->ip, _RET_IP_, + iter->btree_id, &iter->real_pos); + return false; +-- +cgit v1.2.3 + + +From 3ea61c07e959afc0f4312d2764af88c8ef656998 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Jul 2021 17:43:35 -0400 +Subject: bcachefs: traverse_all() is responsible for clearing should_be_locked + +bch2_btree_iter_traverse_all() may loop, and it needs to clear +iter->should_be_locked on every iteration. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 14 ++------------ + 1 file changed, 2 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 53a2e66fa40a..aaf505f1a61e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1252,7 +1252,6 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, + struct btree_iter *iter; + u8 sorted[BTREE_ITER_MAX]; + int i, nr_sorted = 0; +- bool relock_fail; + + if (trans->in_traverse_all) + return -EINTR; +@@ -1260,17 +1259,10 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, + trans->in_traverse_all = true; + retry_all: + nr_sorted = 0; +- relock_fail = false; + + trans_for_each_iter(trans, iter) { +- if (!bch2_btree_iter_relock(iter, _THIS_IP_)) +- relock_fail = true; + sorted[nr_sorted++] = iter->idx; +- } +- +- if (!relock_fail) { +- trans->in_traverse_all = false; +- return 0; ++ iter->should_be_locked = false; + } + + #define btree_iter_cmp_by_idx(_l, _r) \ +@@ -2365,11 +2357,9 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + { + struct btree_iter *iter; + +- trans_for_each_iter(trans, iter) { ++ trans_for_each_iter(trans, iter) + iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| + BTREE_ITER_SET_POS_AFTER_COMMIT); +- iter->should_be_locked = false; +- } + + bch2_trans_unlink_iters(trans); + +-- +cgit v1.2.3 + + +From 962e4f76c3361bdd327bd62fe3ec90f921a557af Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Jul 2021 19:50:40 -0400 +Subject: bcachefs: Always check for transaction restarts + +On transaction restart iterators won't be locked anymore - make sure +we're always checking for errors. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 18 ++++++++++++++++-- + fs/bcachefs/alloc_background.c | 2 +- + fs/bcachefs/btree_cache.c | 11 ++++++----- + fs/bcachefs/btree_cache.h | 4 ++-- + fs/bcachefs/btree_iter.c | 12 +++++++----- + fs/bcachefs/dirent.c | 18 +++++++++++++++--- + fs/bcachefs/fs-common.c | 4 ++++ + fs/bcachefs/fs-io.c | 4 ++-- + fs/bcachefs/inode.c | 2 +- + fs/bcachefs/io.c | 6 +++++- + fs/bcachefs/move.c | 2 +- + fs/bcachefs/reflink.c | 5 +++-- + fs/bcachefs/str_hash.h | 2 +- + fs/bcachefs/xattr.c | 8 +++++++- + 14 files changed, 71 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index e8d0eb92c782..8371a20ac310 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -221,6 +221,8 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) + struct btree_iter *iter; + struct bkey_s_c_xattr xattr; + struct posix_acl *acl = NULL; ++ struct bkey_s_c k; ++ int ret; + + if (rcu) + return ERR_PTR(-ECHILD); +@@ -242,7 +244,14 @@ retry: + goto out; + } + +- xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ acl = ERR_PTR(ret); ++ goto out; ++ } ++ ++ xattr = bkey_s_c_to_xattr(k); + acl = bch2_acl_from_disk(xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + +@@ -359,6 +368,7 @@ int bch2_acl_chmod(struct btree_trans *trans, + struct bkey_s_c_xattr xattr; + struct bkey_i_xattr *new; + struct posix_acl *acl; ++ struct bkey_s_c k; + int ret; + + iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, +@@ -369,7 +379,11 @@ int bch2_acl_chmod(struct btree_trans *trans, + if (ret) + return ret == -ENOENT ? 
0 : ret; + +- xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ k = bch2_btree_iter_peek_slot(iter); ++ xattr = bkey_s_c_to_xattr(k); ++ if (ret) ++ goto err; ++ + acl = bch2_acl_from_disk(xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + ret = PTR_ERR_OR_ZERO(acl); +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 82e6ee8117b5..b553b6c93568 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -374,7 +374,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) + percpu_ref_put(&ca->ref); + goto err; + } +- bch2_btree_iter_next_slot(iter); ++ bch2_btree_iter_advance(iter); + } + } + err: +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index c2919b874d4e..b7b9468a590e 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -974,9 +974,9 @@ out: + return b; + } + +-void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, +- const struct bkey_i *k, +- enum btree_id btree_id, unsigned level) ++int bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, ++ const struct bkey_i *k, ++ enum btree_id btree_id, unsigned level) + { + struct btree_cache *bc = &c->btree_cache; + struct btree *b; +@@ -986,9 +986,10 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, + + b = btree_cache_find(bc, k); + if (b) +- return; ++ return 0; + +- bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); ++ b = bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); ++ return PTR_ERR_OR_ZERO(b); + } + + void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 3dbfd6201d28..5032293e8628 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -29,8 +29,8 @@ struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *, + struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, + enum btree_id, unsigned, bool); + +-void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, +- const struct bkey_i *, enum btree_id, unsigned); ++int bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, ++ const struct bkey_i *, enum btree_id, unsigned); + + void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index aaf505f1a61e..5a287e8f4a6a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1145,7 +1145,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, + } + + noinline +-static void btree_iter_prefetch(struct btree_iter *iter) ++static int btree_iter_prefetch(struct btree_iter *iter) + { + struct bch_fs *c = iter->trans->c; + struct btree_iter_level *l = &iter->l[iter->level]; +@@ -1156,10 +1156,11 @@ static void btree_iter_prefetch(struct btree_iter *iter) + ? (iter->level > 1 ? 0 : 2) + : (iter->level > 1 ? 
1 : 16); + bool was_locked = btree_node_locked(iter, iter->level); ++ int ret = 0; + + bch2_bkey_buf_init(&tmp); + +- while (nr) { ++ while (nr && !ret) { + if (!bch2_btree_node_relock(iter, iter->level)) + break; + +@@ -1169,14 +1170,15 @@ static void btree_iter_prefetch(struct btree_iter *iter) + break; + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); +- bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, +- iter->level - 1); ++ ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, ++ iter->level - 1); + } + + if (!was_locked) + btree_node_unlock(iter, iter->level); + + bch2_bkey_buf_exit(&tmp, c); ++ return ret; + } + + static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, +@@ -1231,7 +1233,7 @@ static __always_inline int btree_iter_down(struct btree_iter *iter, + btree_node_mem_ptr_set(iter, level + 1, b); + + if (iter->flags & BTREE_ITER_PREFETCH) +- btree_iter_prefetch(iter); ++ ret = btree_iter_prefetch(iter); + + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index a95165b8eddf..02b29681f695 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -210,6 +210,9 @@ int bch2_dirent_rename(struct btree_trans *trans, + goto out; + + old_dst = bch2_btree_iter_peek_slot(dst_iter); ++ ret = bkey_err(old_dst); ++ if (ret) ++ goto out; + + if (mode != BCH_RENAME) + *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); +@@ -225,6 +228,10 @@ int bch2_dirent_rename(struct btree_trans *trans, + goto out; + + old_src = bch2_btree_iter_peek_slot(src_iter); ++ ret = bkey_err(old_src); ++ if (ret) ++ goto out; ++ + *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); + + /* Create new dst key: */ +@@ -329,20 +336,25 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, + struct btree_iter *iter; + struct bkey_s_c k; + u64 inum = 0; ++ int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + iter = __bch2_dirent_lookup_trans(&trans, dir_inum, + hash_info, name, 0); +- if (IS_ERR(iter)) { +- BUG_ON(PTR_ERR(iter) == -EINTR); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) + goto out; +- } + + k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + bch2_trans_iter_put(&trans, iter); + out: ++ BUG_ON(ret == -EINTR); + bch2_trans_exit(&trans); + return inum; + } +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 60c54438074e..2189a11ccad8 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -168,6 +168,10 @@ int bch2_unlink_trans(struct btree_trans *trans, + goto err; + + k = bch2_btree_iter_peek_slot(dirent_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + + inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index a5aaf2ca1950..e8b47ced177d 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2681,13 +2681,13 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + /* already reserved */ + if (k.k->type == KEY_TYPE_reservation && + bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { +- bch2_btree_iter_next_slot(iter); ++ bch2_btree_iter_advance(iter); + continue; + } + + if (bkey_extent_is_data(k.k) && + !(mode & FALLOC_FL_ZERO_RANGE)) { +- bch2_btree_iter_next_slot(iter); ++ bch2_btree_iter_advance(iter); + continue; + } + +diff --git a/fs/bcachefs/inode.c 
b/fs/bcachefs/inode.c +index 67983ff4fb2c..25607b5848be 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -519,7 +519,7 @@ again: + if (k.k->p.snapshot == snapshot && + k.k->type != KEY_TYPE_inode && + !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { +- bch2_btree_iter_next(iter); ++ bch2_btree_iter_advance(iter); + continue; + } + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 1345befd1a09..e48bdaa15c7b 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -235,8 +235,12 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + * writing to, because i_size could be up to one block + * less: + */ +- if (!bkey_cmp(old.k->p, new->k.p)) ++ if (!bkey_cmp(old.k->p, new->k.p)) { + old = bch2_btree_iter_next(iter); ++ ret = bkey_err(old); ++ if (ret) ++ break; ++ } + + if (old.k && !bkey_err(old) && + old.k->p.inode == extent_iter->pos.inode && +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 80a54e17760f..e9533131e795 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -191,7 +191,7 @@ nomatch: + } + atomic_long_inc(&c->extent_migrate_raced); + trace_move_race(&new->k); +- bch2_btree_iter_next_slot(iter); ++ bch2_btree_iter_advance(iter); + goto next; + } + out: +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index ebf391245470..8b168246ca38 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -192,8 +192,9 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) + return k; + } + +- bch2_btree_iter_set_pos(iter, end); +- return bkey_s_c_null; ++ if (bkey_cmp(iter->pos, end) >= 0) ++ bch2_btree_iter_set_pos(iter, end); ++ return ret ? bkey_s_c_err(ret) : bkey_s_c_null; + } + + s64 bch2_remap_range(struct bch_fs *c, +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 2ff8e5bd2744..236023494191 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -209,7 +209,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, + + iter = bch2_trans_copy_iter(trans, start); + +- bch2_btree_iter_next_slot(iter); ++ bch2_btree_iter_advance(iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { + if (k.k->type != desc.key_type && +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 05afbac97b6b..e4d400b16dba 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -124,6 +124,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info + struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); + struct btree_iter *iter; + struct bkey_s_c_xattr xattr; ++ struct bkey_s_c k; + int ret; + + iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash, +@@ -134,7 +135,12 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info + if (ret) + goto err; + +- xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ xattr = bkey_s_c_to_xattr(k); + ret = le16_to_cpu(xattr.v->x_val_len); + if (buffer) { + if (ret > size) +-- +cgit v1.2.3 + + +From b9d8cd1c6f980507c9894fc08eec81fd130addf5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Jul 2021 20:24:10 -0400 +Subject: bcachefs: Use bch2_trans_begin() more consistently + +Upcoming patch will require that a transaction restart is always +immediately followed by bch2_trans_begin(). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update.h | 2 +- + fs/bcachefs/fs-io.c | 4 ++++ + fs/bcachefs/fs.c | 2 ++ + fs/bcachefs/io.c | 17 +++++++++-------- + fs/bcachefs/move.c | 7 ++++--- + fs/bcachefs/reflink.c | 4 ++-- + 6 files changed, 22 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index bab135fae0b0..b5f35a419004 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -114,7 +114,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + _ret = (_do); \ + if (_ret != -EINTR) \ + break; \ +- bch2_trans_reset(_trans, 0); \ ++ bch2_trans_begin(_trans); \ + } \ + \ + _ret; \ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index e8b47ced177d..35d213791ec1 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -810,6 +810,8 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, + + bch2_bkey_buf_init(&sk); + retry: ++ bch2_trans_begin(trans); ++ + while (1) { + struct bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; +@@ -2554,6 +2556,8 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + struct bpos atomic_end; + unsigned trigger_flags = 0; + ++ bch2_trans_begin(&trans); ++ + k = insert + ? bch2_btree_iter_peek_prev(src) + : bch2_btree_iter_peek(src); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 20907e554dd3..2029c7745a7a 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -910,6 +910,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(ei->v.i_ino, start >> 9), 0); + retry: ++ bch2_trans_begin(&trans); ++ + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(iter->pos, end) < 0) { +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index e48bdaa15c7b..e090424fa380 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -366,14 +366,13 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k; + int ret = 0, ret2 = 0; + +- while ((k = bch2_btree_iter_peek(iter)).k && ++ while ((bch2_trans_begin(trans), ++ (k = bch2_btree_iter_peek(iter)).k) && + bkey_cmp(iter->pos, end) < 0) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + +- bch2_trans_begin(trans); +- + ret = bkey_err(k); + if (ret) + goto btree_err; +@@ -2273,12 +2272,13 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); +-retry: +- bch2_trans_begin(&trans); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS); ++retry: ++ bch2_trans_begin(&trans); ++ + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; +@@ -2334,19 +2334,20 @@ retry: + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + } +- bch2_trans_iter_put(&trans, iter); + + if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) + goto retry; + ++ bch2_trans_iter_put(&trans, iter); ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); ++ + if (ret) { + bch_err_inum_ratelimited(c, inode, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } +- bch2_trans_exit(&trans); +- bch2_bkey_buf_exit(&sk, c); + } + + void bch2_fs_io_exit(struct bch_fs *c) +diff --git a/fs/bcachefs/move.c 
b/fs/bcachefs/move.c +index e9533131e795..ee0f155fda6c 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -84,7 +84,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + bool extending = false, should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + +- bch2_trans_reset(&trans, 0); ++ bch2_trans_begin(&trans); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); +@@ -597,6 +597,8 @@ static int __bch2_move_data(struct bch_fs *c, + } + } while (delay); + ++ bch2_trans_begin(&trans); ++ + k = bch2_btree_iter_peek(iter); + + stats->pos = iter->pos; +@@ -652,8 +654,7 @@ static int __bch2_move_data(struct bch_fs *c, + data_cmd, data_opts); + if (ret2) { + if (ret2 == -EINTR) { +- bch2_trans_reset(&trans, 0); +- bch2_trans_cond_resched(&trans); ++ bch2_trans_begin(&trans); + continue; + } + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 8b168246ca38..3d9c5c5b0eba 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -305,12 +305,12 @@ s64 bch2_remap_range(struct bch_fs *c, + dst_done = dst_iter->pos.offset - dst_start.offset; + new_i_size = min(dst_iter->pos.offset << 9, new_i_size); + +- bch2_trans_begin(&trans); +- + do { + struct bch_inode_unpacked inode_u; + struct btree_iter *inode_iter; + ++ bch2_trans_begin(&trans); ++ + inode_iter = bch2_inode_peek(&trans, &inode_u, + dst_start.inode, BTREE_ITER_INTENT); + ret2 = PTR_ERR_OR_ZERO(inode_iter); +-- +cgit v1.2.3 + + +From b990def82e0194eb6eadccce346afa3ea5a3cee2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Jul 2021 17:38:15 -0400 +Subject: bcachefs: Clean up interior update paths + +Btree node merging now happens prior to transaction commit, not after, +so we don't need to pay attention to BTREE_INSERT_NOUNLOCK. + +Also, foreground_maybe_merge shouldn't be calling +bch2_btree_iter_traverse_all() - this is becoming private to the btree +iterator code and should only be called by bch2_trans_begin(). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 2 +- + fs/bcachefs/btree_iter.h | 2 -- + fs/bcachefs/btree_update_interior.c | 29 +++-------------------------- + 3 files changed, 4 insertions(+), 29 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5a287e8f4a6a..46b57603e197 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1338,7 +1338,7 @@ out: + return ret; + } + +-int bch2_btree_iter_traverse_all(struct btree_trans *trans) ++static int bch2_btree_iter_traverse_all(struct btree_trans *trans) + { + return __btree_iter_traverse_all(trans, 0, _RET_IP_); + } +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 3889683e16f8..bcb8f0ebbdf4 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -148,8 +148,6 @@ void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); + + int __must_check bch2_btree_iter_traverse(struct btree_iter *); + +-int bch2_btree_iter_traverse_all(struct btree_trans *); +- + struct btree *bch2_btree_iter_peek_node(struct btree_iter *); + struct btree *bch2_btree_iter_next_node(struct btree_iter *); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index a254240868a5..53f0ece281c4 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -960,9 +960,6 @@ retry: + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + else if (!down_read_trylock(&c->gc_lock)) { +- if (flags & BTREE_INSERT_NOUNLOCK) +- return ERR_PTR(-EINTR); +- + bch2_trans_unlock(trans); + down_read(&c->gc_lock); + if (!bch2_trans_relock(trans)) { +@@ -1005,16 +1002,6 @@ retry: + BTREE_UPDATE_JOURNAL_RES, + journal_flags|JOURNAL_RES_GET_NONBLOCK); + if (ret == -EAGAIN) { +- /* +- * this would be cleaner if bch2_journal_preres_get() took a +- * closure argument +- */ +- if (flags & BTREE_INSERT_NOUNLOCK) { +- trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); +- ret = -EINTR; +- goto err; +- } +- + bch2_trans_unlock(trans); + + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { +@@ -1043,8 +1030,7 @@ retry: + if (ret) + goto err; + +- ret = bch2_btree_reserve_get(as, nr_nodes, flags, +- !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); ++ ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); + if (ret) + goto err; + +@@ -1057,8 +1043,6 @@ err: + bch2_btree_update_free(as); + + if (ret == -EAGAIN) { +- BUG_ON(flags & BTREE_INSERT_NOUNLOCK); +- + bch2_trans_unlock(trans); + closure_sync(&cl); + ret = -EINTR; +@@ -1593,12 +1577,12 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + size_t sib_u64s; + int ret = 0, ret2 = 0; + +- BUG_ON(!btree_node_locked(iter, level)); + retry: + ret = bch2_btree_iter_traverse(iter); + if (ret) +- goto err; ++ return ret; + ++ BUG_ON(!iter->should_be_locked); + BUG_ON(!btree_node_locked(iter, level)); + + b = iter->l[level].b; +@@ -1751,13 +1735,6 @@ err: + if (ret == -EINTR && bch2_trans_relock(trans)) + goto retry; + +- if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { +- ret2 = ret; +- ret = bch2_btree_iter_traverse_all(trans); +- if (!ret) +- goto retry; +- } +- + goto out; + } + +-- +cgit v1.2.3 + + +From 86df452404e0f696263dfa3d44be951bf12c775c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Jul 2021 18:01:52 -0400 +Subject: bcachefs: bch2_btree_iter_traverse() shouldn't normally call + traverse_all() + +If there's more than one iterator in the btree_trans, it's requried to +call bch2_trans_begin() to handle transaction restarts. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 46b57603e197..5ade731973c0 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1460,8 +1460,10 @@ static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) + + ret = bch2_trans_cond_resched(trans) ?: + btree_iter_traverse_one(iter, _RET_IP_); +- if (unlikely(ret)) ++ if (unlikely(ret) && hweight64(trans->iters_linked) == 1) { + ret = __btree_iter_traverse_all(trans, ret, _RET_IP_); ++ BUG_ON(ret == -EINTR); ++ } + + return ret; + } +-- +cgit v1.2.3 + + +From 2566169a2bce30041f17c018d5e4d520e7e7f664 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 22 Jul 2021 12:39:11 -0400 +Subject: bcachefs: Ensure btree_iter_traverse() obeys iter->should_be_locked + +iter->should_be_locked means that if bch2_btree_iter_relock() fails, we +need to restart the transaction. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5ade731973c0..6b4c86b8bd36 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1388,6 +1388,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + unsigned l, depth_want = iter->level; + int ret = 0; + ++ /* ++ * Ensure we obey iter->should_be_locked: if it's set, we can't unlock ++ * and re-traverse the iterator without a transaction restart: ++ */ ++ if (iter->should_be_locked) { ++ ret = bch2_btree_iter_relock(iter, trace_ip) ? 0 : -EINTR; ++ goto out; ++ } ++ + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + ret = bch2_btree_iter_traverse_cached(iter); + goto out; +-- +cgit v1.2.3 + + +From 8b5b8403ced9b20fe7c6ab83ccd84603567098a0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 25 Jul 2021 17:19:52 -0400 +Subject: bcachefs: trans->restarted + +Start tracking when btree transactions have been restarted - and assert +that we're always calling bch2_trans_begin() immediately after +transaction restart. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 19 ++++++++++---- + fs/bcachefs/btree_gc.c | 3 ++- + fs/bcachefs/btree_iter.c | 50 ++++++++++++++++++++++++++++--------- + fs/bcachefs/btree_iter.h | 8 ++++++ + fs/bcachefs/btree_key_cache.c | 23 ++++++++++------- + fs/bcachefs/btree_types.h | 7 +++--- + fs/bcachefs/btree_update_interior.c | 1 + + fs/bcachefs/btree_update_leaf.c | 34 ++++++++++++++++++------- + 8 files changed, 106 insertions(+), 39 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index b7b9468a590e..3dfb0dca445a 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -656,8 +656,10 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + * Parent node must be locked, else we could read in a btree node that's + * been freed: + */ +- if (iter && !bch2_btree_node_relock(iter, level + 1)) ++ if (iter && !bch2_btree_node_relock(iter, level + 1)) { ++ btree_trans_restart(iter->trans); + return ERR_PTR(-EINTR); ++ } + + b = bch2_btree_node_mem_alloc(c); + if (IS_ERR(b)) +@@ -696,11 +698,15 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + + if (iter && + (!bch2_trans_relock(iter->trans) || +- !bch2_btree_iter_relock_intent(iter))) ++ !bch2_btree_iter_relock_intent(iter))) { ++ BUG_ON(!iter->trans->restarted); + return ERR_PTR(-EINTR); ++ } + +- if (!six_relock_type(&b->c.lock, lock_type, seq)) ++ if (!six_relock_type(&b->c.lock, lock_type, seq)) { ++ btree_trans_restart(iter->trans); + return ERR_PTR(-EINTR); ++ } + + return b; + } +@@ -825,7 +831,7 @@ lock_node: + + if (!btree_node_lock(b, k->k.p, level, iter, lock_type, + lock_node_check_fn, (void *) k, trace_ip)) { +- if (b->hash_val != btree_ptr_hash_val(k)) ++ if (!trans->restarted) + goto retry; + return ERR_PTR(-EINTR); + } +@@ -841,6 +847,7 @@ lock_node: + trace_ip, + iter->btree_id, + &iter->real_pos); ++ btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } + } +@@ -859,8 +866,10 @@ lock_node: + */ + if (iter && + (!bch2_trans_relock(trans) || +- !bch2_btree_iter_relock_intent(iter))) ++ !bch2_btree_iter_relock_intent(iter))) { ++ BUG_ON(!trans->restarted); + return ERR_PTR(-EINTR); ++ } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) + goto retry; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 3dd1094d10c9..91f6a2ada44e 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1735,7 +1735,8 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + +- while ((k = bch2_btree_iter_peek(iter)).k && ++ while ((bch2_trans_begin(&trans), ++ k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { + c->gc_gens_pos = iter->pos; + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 6b4c86b8bd36..1519e417a1f9 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -317,7 +317,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + } + + if (unlikely(deadlock_iter)) { +- trace_trans_restart_would_deadlock(iter->trans->ip, ip, ++ trace_trans_restart_would_deadlock(trans->ip, ip, + trans->in_traverse_all, reason, + deadlock_iter->btree_id, + btree_iter_type(deadlock_iter), +@@ -325,6 +325,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + iter->btree_id, + btree_iter_type(iter), + &pos); ++ btree_trans_restart(trans); + return false; + } + +@@ -405,6 +406,7 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) + ? 
iter->l[l].b->c.lock.state.seq + : 0); + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ btree_trans_restart(iter->trans); + return false; + } + } +@@ -415,7 +417,11 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) + __flatten + bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) + { +- return btree_iter_get_locks(iter, false, trace_ip); ++ bool ret = btree_iter_get_locks(iter, false, trace_ip); ++ ++ if (!ret) ++ btree_trans_restart(iter->trans); ++ return ret; + } + + bool __bch2_btree_iter_upgrade(struct btree_iter *iter, +@@ -458,6 +464,8 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + btree_iter_get_locks(linked, true, _THIS_IP_); + } + ++ if (iter->should_be_locked) ++ btree_trans_restart(iter->trans); + return false; + } + +@@ -506,11 +514,15 @@ bool bch2_trans_relock(struct btree_trans *trans) + { + struct btree_iter *iter; + ++ if (unlikely(trans->restarted)) ++ return false; ++ + trans_for_each_iter(trans, iter) + if (btree_iter_should_be_locked(iter) && + !bch2_btree_iter_relock(iter, _RET_IP_)) { + trace_trans_restart_relock(trans->ip, _RET_IP_, + iter->btree_id, &iter->real_pos); ++ BUG_ON(!trans->restarted); + return false; + } + return true; +@@ -1091,11 +1103,12 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) + return b == *rootp ? 0 : -1; + } + +-static inline int btree_iter_lock_root(struct btree_iter *iter, ++static inline int btree_iter_lock_root(struct btree_trans *trans, ++ struct btree_iter *iter, + unsigned depth_want, + unsigned long trace_ip) + { +- struct bch_fs *c = iter->trans->c; ++ struct bch_fs *c = trans->c; + struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; + enum six_lock_type lock_type; + unsigned i; +@@ -1123,8 +1136,11 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, + if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level, + iter, lock_type, + lock_root_check_fn, rootp, +- trace_ip))) +- return -EINTR; ++ trace_ip))) { ++ if (trans->restarted) ++ return -EINTR; ++ continue; ++ } + + if (likely(b == READ_ONCE(*rootp) && + b->c.level == iter->level && +@@ -1202,10 +1218,10 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, + btree_node_unlock(iter, plevel); + } + +-static __always_inline int btree_iter_down(struct btree_iter *iter, ++static __always_inline int btree_iter_down(struct btree_trans *trans, ++ struct btree_iter *iter, + unsigned long trace_ip) + { +- struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct btree_iter_level *l = &iter->l[iter->level]; + struct btree *b; +@@ -1260,6 +1276,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, + + trans->in_traverse_all = true; + retry_all: ++ trans->restarted = false; ++ + nr_sorted = 0; + + trans_for_each_iter(trans, iter) { +@@ -1322,11 +1340,11 @@ retry_all: + } + + if (hweight64(trans->iters_live) > 1) +- ret = -EINTR; ++ ret = btree_trans_restart(trans); + else + trans_for_each_iter(trans, iter) + if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { +- ret = -EINTR; ++ ret = btree_trans_restart(trans); + break; + } + out: +@@ -1426,8 +1444,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + */ + while (iter->level > depth_want) { + ret = btree_iter_node(iter, iter->level) +- ? btree_iter_down(iter, trace_ip) +- : btree_iter_lock_root(iter, depth_want, trace_ip); ++ ? 
btree_iter_down(trans, iter, trace_ip) ++ : btree_iter_lock_root(trans, iter, depth_want, trace_ip); + if (unlikely(ret)) { + if (ret == 1) { + /* +@@ -1455,6 +1473,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + + iter->uptodate = BTREE_ITER_NEED_PEEK; + out: ++ BUG_ON((ret == -EINTR) != !!trans->restarted); + trace_iter_traverse(trans->ip, trace_ip, + btree_iter_type(iter) == BTREE_ITER_CACHED, + iter->btree_id, &iter->real_pos, ret); +@@ -1603,6 +1622,8 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p + int cmp = bpos_cmp(new_pos, iter->real_pos); + unsigned l = iter->level; + ++ EBUG_ON(iter->trans->restarted); ++ + if (!cmp) + goto out; + +@@ -2162,6 +2183,8 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + struct btree_iter *iter, *best = NULL; + struct bpos real_pos, pos_min = POS_MIN; + ++ EBUG_ON(trans->restarted); ++ + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + btree_node_type_is_extents(btree_id) && + !(flags & BTREE_ITER_NOT_EXTENTS) && +@@ -2326,6 +2349,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) + + if (old_bytes) { + trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); ++ btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } + } +@@ -2399,6 +2423,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + if (!(flags & TRANS_RESET_NOTRAVERSE) && + trans->iters_linked) + bch2_btree_iter_traverse_all(trans); ++ ++ trans->restarted = false; + } + + static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index bcb8f0ebbdf4..243f65f0b7ad 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -117,6 +117,14 @@ bool bch2_btree_iter_relock(struct btree_iter *, unsigned long); + bool bch2_trans_relock(struct btree_trans *); + void bch2_trans_unlock(struct btree_trans *); + ++__always_inline ++static inline int btree_trans_restart(struct btree_trans *trans) ++{ ++ trans->restarted = true; ++ bch2_trans_unlock(trans); ++ return -EINTR; ++} ++ + bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); + + static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index e93bf675f19a..742d096e91b7 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -214,7 +214,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, + + if (!bch2_btree_node_relock(ck_iter, 0)) { + trace_transaction_restart_ip(trans->ip, _THIS_IP_); +- ret = -EINTR; ++ ret = btree_trans_restart(trans); + goto err; + } + +@@ -233,6 +233,10 @@ static int btree_key_cache_fill(struct btree_trans *trans, + } + } + ++ /* ++ * XXX: not allowed to be holding read locks when we take a write lock, ++ * currently ++ */ + bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); + if (new_k) { + kfree(ck->k); +@@ -299,10 +303,8 @@ retry: + + if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, + bkey_cached_check_fn, iter, _THIS_IP_)) { +- if (ck->key.btree_id != iter->btree_id || +- bpos_cmp(ck->key.pos, iter->pos)) { ++ if (!trans->restarted) + goto retry; +- } + + trace_transaction_restart_ip(trans->ip, _THIS_IP_); + ret = -EINTR; +@@ -322,10 +324,10 @@ retry: + iter->l[0].b = (void *) ck; + fill: + if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { +- if (!btree_node_intent_locked(iter, 0)) +- bch2_btree_iter_upgrade(iter, 1); +- 
if (!btree_node_intent_locked(iter, 0)) { ++ if (!iter->locks_want && ++ !!__bch2_btree_iter_upgrade(iter, 1)) { + trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ BUG_ON(!trans->restarted); + ret = -EINTR; + goto err; + } +@@ -341,9 +343,12 @@ fill: + iter->uptodate = BTREE_ITER_NEED_PEEK; + + if ((iter->flags & BTREE_ITER_INTENT) && +- !iter->locks_want && +- __bch2_btree_iter_upgrade(iter, 1)) ++ !bch2_btree_iter_upgrade(iter, 1)) { ++ BUG_ON(!trans->restarted); + ret = -EINTR; ++ } ++ ++ BUG_ON(!ret && !btree_node_locked(iter, 0)); + + return ret; + err: +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 07c9ba4ea475..6882873d149a 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -380,9 +380,10 @@ struct btree_trans { + int srcu_idx; + + u8 nr_updates; +- unsigned used_mempool:1; +- unsigned error:1; +- unsigned in_traverse_all:1; ++ bool used_mempool:1; ++ bool error:1; ++ bool in_traverse_all:1; ++ bool restarted:1; + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 53f0ece281c4..c08d5e90cb4c 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1006,6 +1006,7 @@ retry: + + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { + bch2_btree_update_free(as); ++ btree_trans_restart(trans); + return ERR_PTR(ret); + } + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 6196fc4c8799..79287496d586 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -384,6 +384,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + if (race_fault()) { + trace_trans_restart_fault_inject(trans->ip, trace_ip); ++ trans->restarted = true; + return -EINTR; + } + +@@ -520,10 +521,17 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree + u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; + } + +- return u64s_delta <= 0 +- ? 
(bch2_foreground_maybe_merge(trans, iter, iter->level, +- trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR) +- : 0; ++ if (u64s_delta > 0) ++ return 0; ++ ++ ret = bch2_foreground_maybe_merge(trans, iter, iter->level, ++ trans->flags & ~BTREE_INSERT_NOUNLOCK); ++ if (!ret) { ++ ret = -EINTR; ++ trans->restarted = true; ++ } ++ ++ return ret; + } + + /* +@@ -587,6 +595,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + trace_trans_restart_upgrade(trans->ip, trace_ip, + iter->btree_id, + &iter->real_pos); ++ trans->restarted = true; + return -EINTR; + } + } else { +@@ -696,6 +705,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + trace_trans_restart_btree_node_split(trans->ip, trace_ip, + i->iter->btree_id, + &i->iter->real_pos); ++ trans->restarted = true; + ret = -EINTR; + } + break; +@@ -704,7 +714,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + + ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); + if (ret) +- return ret; ++ break; + + if (bch2_trans_relock(trans)) + return 0; +@@ -716,12 +726,15 @@ int bch2_trans_commit_error(struct btree_trans *trans, + bch2_trans_unlock(trans); + + if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && +- !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) +- return -EAGAIN; ++ !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) { ++ trans->restarted = true; ++ ret = -EAGAIN; ++ break; ++ } + + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); + if (ret) +- return ret; ++ break; + + if (bch2_trans_relock(trans)) + return 0; +@@ -737,7 +750,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); + if (ret < 0) +- return ret; ++ break; + + if (bch2_trans_relock(trans)) + return 0; +@@ -750,6 +763,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + break; + } + ++ BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); + BUG_ON(ret == -ENOSPC && (flags & BTREE_INSERT_NOFAIL)); + + return ret; +@@ -952,6 +966,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + trace_trans_restart_upgrade(trans->ip, _RET_IP_, + i->iter->btree_id, + &i->iter->pos); ++ trans->restarted = true; + ret = -EINTR; + goto out; + } +@@ -974,6 +989,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + goto err; + } + retry: ++ BUG_ON(trans->restarted); + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + + ret = do_bch2_trans_commit(trans, &i, _RET_IP_); +-- +cgit v1.2.3 + + +From 8a677f53aa9888518e6d44c704ace656472469d5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Jul 2021 23:57:28 -0400 +Subject: bcachefs: __bch2_trans_commit() no longer calls bch2_trans_reset() + +It's now the caller's responsibility to call bch2_trans_begin. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 25 +++++++++---------------- + fs/bcachefs/btree_iter.h | 17 +---------------- + fs/bcachefs/btree_update_leaf.c | 19 +++++++++++++------ + 3 files changed, 23 insertions(+), 38 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 1519e417a1f9..28a3ba1a0d62 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2375,22 +2375,14 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans) + } + + /** +- * bch2_trans_reset() - reset a transaction after a interrupted attempt ++ * bch2_trans_begin() - reset a transaction after a interrupted attempt + * @trans: transaction to reset +- * @flags: transaction reset flags. 
+ * + * While iterating over nodes or updating nodes a attempt to lock a btree + * node may return EINTR when the trylock fails. When this occurs +- * bch2_trans_reset() or bch2_trans_begin() should be called and the +- * transaction retried. +- * +- * Transaction reset flags include: +- * +- * - TRANS_RESET_NOUNLOCK - Do not attempt to unlock and reschedule the +- * transaction. +- * - TRANS_RESET_NOTRAVERSE - Do not traverse all linked iters. ++ * bch2_trans_begin() should be called and the transaction retried. + */ +-void bch2_trans_reset(struct btree_trans *trans, unsigned flags) ++void bch2_trans_begin(struct btree_trans *trans) + { + struct btree_iter *iter; + +@@ -2398,8 +2390,11 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| + BTREE_ITER_SET_POS_AFTER_COMMIT); + ++ /* ++ * XXX: we shouldn't be doing this if the transaction was restarted, but ++ * currently we still overflow transaction iterators if we do that ++ * */ + bch2_trans_unlink_iters(trans); +- + trans->iters_touched &= trans->iters_live; + + trans->extra_journal_res = 0; +@@ -2417,11 +2412,9 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) + (void *) &trans->fs_usage_deltas->memset_start); + } + +- if (!(flags & TRANS_RESET_NOUNLOCK)) +- bch2_trans_cond_resched(trans); ++ bch2_trans_cond_resched(trans); + +- if (!(flags & TRANS_RESET_NOTRAVERSE) && +- trans->iters_linked) ++ if (trans->restarted) + bch2_btree_iter_traverse_all(trans); + + trans->restarted = false; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 243f65f0b7ad..aeabc07d2c9c 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -323,22 +323,7 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr + trans->iters_touched &= ~(1ULL << iter->idx); + } + +-#define TRANS_RESET_NOTRAVERSE (1 << 0) +-#define TRANS_RESET_NOUNLOCK (1 << 1) +- +-void bch2_trans_reset(struct btree_trans *, unsigned); +- +-/** +- * bch2_trans_begin() - ensure lock consistency of transaction on retry +- * @trans: transaction to prepare +- * +- * Ensure lock ordering is correct before potentially retrying a transaction +- * after a failed trylock. 
+- */ +-static inline void bch2_trans_begin(struct btree_trans *trans) +-{ +- return bch2_trans_reset(trans, 0); +-} ++void bch2_trans_begin(struct btree_trans *); + + void *bch2_trans_kmalloc(struct btree_trans *, size_t); + void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 79287496d586..da0941ca5b35 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -903,7 +903,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + struct btree_insert_entry *i = NULL; + struct btree_iter *iter; + bool trans_trigger_run; +- unsigned u64s, reset_flags = 0; ++ unsigned u64s; + int ret = 0; + + if (!trans->nr_updates && +@@ -1010,11 +1010,18 @@ out: + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); + out_reset: +- if (!ret) +- reset_flags |= TRANS_RESET_NOTRAVERSE; +- if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK)) +- reset_flags |= TRANS_RESET_NOUNLOCK; +- bch2_trans_reset(trans, reset_flags); ++ trans->extra_journal_res = 0; ++ trans->nr_updates = 0; ++ trans->hooks = NULL; ++ trans->extra_journal_entries = NULL; ++ trans->extra_journal_entry_u64s = 0; ++ ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset(&trans->fs_usage_deltas->memset_start, 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } + + return ret; + err: +-- +cgit v1.2.3 + + +From 4536916e5d08d28a7b25fd5dacf4666702da0fab Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 24 Jul 2021 14:25:01 -0400 +Subject: bcachefs: Btree splits no longer automatically cause a transaction + restart + +With the new and improved handling of transaction restarts, this should +finally be safe. 
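+
+A rough before/after sketch of the node-full path (simplified from the hunk
+below; illustrative, not the exact code):
+
+  /* before: a split, even one that kept all its locks, surfaced as -EINTR */
+  ret = bch2_btree_split_leaf(trans, iter, flags);
+  if (!ret || ret == -EINTR)
+          ret = -EINTR;
+
+  /* after: a split that succeeds simply lets the commit retry in place */
+  ret = bch2_btree_split_leaf(trans, iter, trans->flags);
+  if (!ret)
+          return 0;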
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 49 +++++------------------------------------ + 1 file changed, 5 insertions(+), 44 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index da0941ca5b35..5e25e3c51ec0 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -524,14 +524,8 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree + if (u64s_delta > 0) + return 0; + +- ret = bch2_foreground_maybe_merge(trans, iter, iter->level, ++ return bch2_foreground_maybe_merge(trans, iter, iter->level, + trans->flags & ~BTREE_INSERT_NOUNLOCK); +- if (!ret) { +- ret = -EINTR; +- trans->restarted = true; +- } +- +- return ret; + } + + /* +@@ -664,50 +658,17 @@ int bch2_trans_commit_error(struct btree_trans *trans, + int ret, unsigned long trace_ip) + { + struct bch_fs *c = trans->c; +- unsigned flags = trans->flags; +- +- /* +- * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree +- * update; if we haven't done anything yet it doesn't apply +- */ +- flags &= ~BTREE_INSERT_NOUNLOCK; + + switch (ret) { + case BTREE_INSERT_BTREE_NODE_FULL: +- ret = bch2_btree_split_leaf(trans, i->iter, flags); +- +- /* +- * if the split succeeded without dropping locks the insert will +- * still be atomic (what the caller peeked() and is overwriting +- * won't have changed) +- */ +-#if 0 +- /* +- * XXX: +- * split -> btree node merging (of parent node) might still drop +- * locks when we're not passing it BTREE_INSERT_NOUNLOCK +- * +- * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that +- * will inhibit merging - but we don't have a reliable way yet +- * (do we?) of checking if we dropped locks in this path +- */ ++ ret = bch2_btree_split_leaf(trans, i->iter, trans->flags); + if (!ret) +- goto retry; +-#endif ++ return 0; + +- /* +- * don't care if we got ENOSPC because we told split it +- * couldn't block: +- */ +- if (!ret || +- ret == -EINTR || +- (flags & BTREE_INSERT_NOUNLOCK)) { ++ if (ret == -EINTR) + trace_trans_restart_btree_node_split(trans->ip, trace_ip, + i->iter->btree_id, + &i->iter->real_pos); +- trans->restarted = true; +- ret = -EINTR; +- } + break; + case BTREE_INSERT_NEED_MARK_REPLICAS: + bch2_trans_unlock(trans); +@@ -764,7 +725,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + } + + BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); +- BUG_ON(ret == -ENOSPC && (flags & BTREE_INSERT_NOFAIL)); ++ BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL)); + + return ret; + } +-- +cgit v1.2.3 + + +From ddfbdcbd67b3b5dee870ba855b5e271e833e51ae Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Jul 2021 22:15:04 -0400 +Subject: bcachefs: Kill BTREE_INSERT_NOUNLOCK + +With the recent transaction restart changes, it's no longer needed - all +transaction commits have BTREE_INSERT_NOUNLOCK semantics. 
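+
+In practice this means a successful commit keeps its btree locks, so callers
+no longer need to pass anything special to keep their iterators usable
+afterwards. A minimal sketch (illustrative call site, not taken from the
+tree):
+
+  ret = bch2_trans_commit(&trans, NULL, &journal_seq, 0);
+  if (!ret) {
+          /*
+           * Locks are still held here on success; iterators from the
+           * same transaction remain valid to keep using.
+           */
+  }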
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 3 +-- + fs/bcachefs/btree_key_cache.c | 1 - + fs/bcachefs/btree_update.h | 6 ------ + fs/bcachefs/btree_update_interior.c | 3 +-- + fs/bcachefs/btree_update_leaf.c | 4 ++-- + fs/bcachefs/fs.c | 12 +++--------- + fs/bcachefs/fsck.c | 3 +-- + fs/bcachefs/quota.c | 2 +- + 8 files changed, 9 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index 8371a20ac310..1d3887306eb0 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -337,8 +337,7 @@ retry: + + ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, +- &inode->ei_journal_seq, +- BTREE_INSERT_NOUNLOCK); ++ &inode->ei_journal_seq, 0); + btree_err: + bch2_trans_iter_put(&trans, inode_iter); + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 742d096e91b7..e327ef39d432 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -405,7 +405,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index b5f35a419004..4bcfbc029b36 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -15,7 +15,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, + void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); + + enum btree_insert_flags { +- __BTREE_INSERT_NOUNLOCK, + __BTREE_INSERT_NOFAIL, + __BTREE_INSERT_NOCHECK_RW, + __BTREE_INSERT_LAZY_RW, +@@ -29,11 +28,6 @@ enum btree_insert_flags { + __BCH_HASH_SET_MUST_REPLACE, + }; + +-/* +- * Don't drop locks _after_ successfully updating btree: +- */ +-#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) +- + /* Don't check for -ENOSPC: */ + #define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index c08d5e90cb4c..c8c3382f48c7 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1927,8 +1927,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| +- BTREE_INSERT_JOURNAL_RESERVED| +- BTREE_INSERT_NOUNLOCK); ++ BTREE_INSERT_JOURNAL_RESERVED); + if (ret) + goto err; + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5e25e3c51ec0..441727234974 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -524,8 +524,8 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree + if (u64s_delta > 0) + return 0; + +- return bch2_foreground_maybe_merge(trans, iter, iter->level, +- trans->flags & ~BTREE_INSERT_NOUNLOCK); ++ return bch2_foreground_maybe_merge(trans, iter, ++ iter->level, trans->flags); + } + + /* +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 2029c7745a7a..b9fc813087f0 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -157,7 +157,6 @@ retry: + bch2_inode_write(&trans, iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, +- BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + + /* +@@ -295,8 +294,7 @@ retry: + if (unlikely(ret)) + goto err_before_quota; + +- ret = bch2_trans_commit(&trans, 
NULL, &journal_seq, +- BTREE_INSERT_NOUNLOCK); ++ ret = bch2_trans_commit(&trans, NULL, &journal_seq, 0); + if (unlikely(ret)) { + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +@@ -417,8 +415,7 @@ static int __bch2_link(struct bch_fs *c, + mutex_lock(&inode->ei_update_lock); + bch2_trans_init(&trans, c, 4, 1024); + +- ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, +- BTREE_INSERT_NOUNLOCK, ++ ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0, + bch2_link_trans(&trans, + dir->v.i_ino, + inode->v.i_ino, &dir_u, &inode_u, +@@ -470,7 +467,6 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) + bch2_trans_init(&trans, c, 4, 1024); + + ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, +- BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, + dir->v.i_ino, &dir_u, +@@ -591,8 +587,7 @@ static int bch2_rename2(struct user_namespace *mnt_userns, + goto err; + } + +- ret = __bch2_trans_do(&trans, NULL, &journal_seq, +- BTREE_INSERT_NOUNLOCK, ++ ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0, + bch2_rename_trans(&trans, + src_dir->v.i_ino, &src_dir_u, + dst_dir->v.i_ino, &dst_dir_u, +@@ -735,7 +730,6 @@ retry: + ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, +- BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + btree_err: + bch2_trans_iter_put(&trans, inode_iter); +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 63d42542c194..36eba46d566e 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -803,8 +803,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + ret = __write_inode(trans, &target, target_snapshot) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_NOUNLOCK); ++ BTREE_INSERT_LAZY_RW); + if (ret) + return ret; + return -EINTR; +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 35b409e0f366..7861781a4a7f 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -760,7 +760,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, + bkey_quota_init(&new_quota.k_i); + new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); + +- ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, ++ ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: + __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); + +-- +cgit v1.2.3 + + +From 22c5cbbf6461842d3d2d57f2e23cec585fe4621b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Jul 2021 22:32:05 -0400 +Subject: bcachefs: traverse_all() shouldn't be restarting the transaction + +We're only called by bch2_trans_begin() now. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 9 --------- + 1 file changed, 9 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 28a3ba1a0d62..ddaeac2d2e55 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1338,15 +1338,6 @@ retry_all: + if (ret) + goto retry_all; + } +- +- if (hweight64(trans->iters_live) > 1) +- ret = btree_trans_restart(trans); +- else +- trans_for_each_iter(trans, iter) +- if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { +- ret = btree_trans_restart(trans); +- break; +- } + out: + bch2_btree_cache_cannibalize_unlock(c); + +-- +cgit v1.2.3 + + +From aaed5ee42cbf99dc1dc9df81c3cfb4c04f4c809d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 27 Jul 2021 22:28:39 -0400 +Subject: bcachefs: Don't drop read locks at transaction commit time + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 27 ++++++++------------------- + 1 file changed, 8 insertions(+), 19 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 441727234974..e9e542260bff 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -348,11 +348,6 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + } + } + +-static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) +-{ +- __bch2_btree_iter_unlock(iter); +-} +- + static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + { + struct bch_fs *c = trans->c; +@@ -582,21 +577,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + * or anything else that might call bch2_trans_relock(), since that + * would just retake the read locks: + */ +- trans_for_each_iter(trans, iter) { +- if (iter->nodes_locked != iter->nodes_intent_locked) { +- if (btree_iter_keep(trans, iter)) { +- if (!bch2_btree_iter_upgrade(iter, 1)) { +- trace_trans_restart_upgrade(trans->ip, trace_ip, +- iter->btree_id, +- &iter->real_pos); +- trans->restarted = true; +- return -EINTR; +- } +- } else { +- bch2_btree_iter_unlock_noinline(iter); +- } ++ trans_for_each_iter(trans, iter) ++ if (iter->nodes_locked != iter->nodes_intent_locked && ++ !bch2_btree_iter_upgrade(iter, 1)) { ++ trace_trans_restart_upgrade(trans->ip, trace_ip, ++ iter->btree_id, ++ &iter->real_pos); ++ trans->restarted = true; ++ return -EINTR; + } +- } + + trans_for_each_update(trans, i) { + const char *invalid = bch2_bkey_invalid(c, +-- +cgit v1.2.3 + + +From 7131dd0b77bef1513e417837631d2e61ee4c7673 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 28 Jul 2021 16:17:10 -0400 +Subject: bcachefs: Change lockrestart_do() to always call bch2_trans_begin() + +More consistent behaviour means less likely to trip over ourselves in +silly ways. 
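+
+With this, every attempt - including the first - runs against a freshly
+begun transaction. A minimal usage sketch (demo_update() is a hypothetical
+helper, not something in the tree):
+
+  ret = lockrestart_do(&trans, demo_update(&trans, inum));
+
+  /*
+   * demo_update() may return -EINTR freely; lockrestart_do() re-begins
+   * the transaction and retries until it gets some other result.
+   */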
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update.h | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 4bcfbc029b36..217b52e1a168 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -104,12 +104,10 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + ({ \ + int _ret; \ + \ +- while (1) { \ +- _ret = (_do); \ +- if (_ret != -EINTR) \ +- break; \ ++ do { \ + bch2_trans_begin(_trans); \ +- } \ ++ _ret = (_do); \ ++ } while (_ret == -EINTR); \ + \ + _ret; \ + }) +-- +cgit v1.2.3 + + +From e74ccf992cb66278eff91a97165feb0eca37449f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 30 Jul 2021 14:33:06 -0400 +Subject: bcachefs: Zero out mem_ptr field in btree ptr keys from journal + replay + +This fixes a bad ptr deref on recovery from unclean shutdown in +bch2_btree_node_get_noiter(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 84e224fb0d01..afb72648fe54 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -39,6 +39,20 @@ static void drop_alloc_keys(struct journal_keys *keys) + keys->nr = dst; + } + ++/* ++ * Btree node pointers have a field to stack a pointer to the in memory btree ++ * node; we need to zero out this field when reading in btree nodes, or when ++ * reading in keys from the journal: ++ */ ++static void zero_out_btree_mem_ptr(struct journal_keys *keys) ++{ ++ struct journal_key *i; ++ ++ for (i = keys->d; i < keys->d + keys->nr; i++) ++ if (i->k->k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; ++} ++ + /* iterate over keys read from the journal: */ + + static int __journal_key_cmp(enum btree_id l_btree_id, +@@ -1072,6 +1086,8 @@ use_clean: + drop_alloc_keys(&c->journal_keys); + } + ++ zero_out_btree_mem_ptr(&c->journal_keys); ++ + ret = journal_replay_early(c, clean, &c->journal_entries); + if (ret) + goto err; +-- +cgit v1.2.3 + + +From 53c75e114818b9fbf2e40717e74bc10325516191 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 12 Jun 2021 15:45:45 -0400 +Subject: bcachefs: Keep a sorted list of btree iterators + +This will be used to make other operations on btree iterators within a +transaction more efficient, and enable some other improvements to how we +manage btree iterators. 
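+
+The invariant being maintained, roughly (a sketch mirroring the verify
+helpers added below, not the exact code):
+
+  /*
+   * trans->sorted[] is a permutation of the live iterator indices, kept
+   * ordered by btree_iter_cmp() (btree id, cached-ness, then real_pos),
+   * and every iterator records its own slot in sorted_idx:
+   */
+  unsigned i;
+
+  for (i = 0; i < trans->nr_sorted; i++) {
+          struct btree_iter *iter = trans->iters + trans->sorted[i];
+
+          BUG_ON(iter->sorted_idx != i);
+          BUG_ON(i && btree_iter_cmp(trans->iters + trans->sorted[i - 1],
+                                     iter) > 0);
+  }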
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 231 +++++++++++++++++++++++++++++++++++++++------- + fs/bcachefs/btree_iter.h | 37 ++++++-- + fs/bcachefs/btree_types.h | 3 + + 3 files changed, 229 insertions(+), 42 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ddaeac2d2e55..43954a9bd80b 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -18,10 +18,21 @@ + #include + + static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); ++static void btree_trans_sort_iters(struct btree_trans *); ++static void btree_iter_check_sort(struct btree_trans *, struct btree_iter *); + static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long); +-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *); ++static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *, ++ struct btree_iter *); + static void btree_iter_copy(struct btree_iter *, struct btree_iter *); + ++static inline int btree_iter_cmp(const struct btree_iter *l, ++ const struct btree_iter *r) ++{ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: ++ bkey_cmp(l->real_pos, r->real_pos); ++} ++ + static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) + { + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); +@@ -1268,8 +1279,7 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, + { + struct bch_fs *c = trans->c; + struct btree_iter *iter; +- u8 sorted[BTREE_ITER_MAX]; +- int i, nr_sorted = 0; ++ int i; + + if (trans->in_traverse_all) + return -EINTR; +@@ -1278,22 +1288,14 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, + retry_all: + trans->restarted = false; + +- nr_sorted = 0; +- +- trans_for_each_iter(trans, iter) { +- sorted[nr_sorted++] = iter->idx; ++ trans_for_each_iter(trans, iter) + iter->should_be_locked = false; +- } +- +-#define btree_iter_cmp_by_idx(_l, _r) \ +- btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) + +- bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); +-#undef btree_iter_cmp_by_idx ++ btree_trans_sort_iters(trans); + +- for (i = nr_sorted - 2; i >= 0; --i) { +- struct btree_iter *iter1 = trans->iters + sorted[i]; +- struct btree_iter *iter2 = trans->iters + sorted[i + 1]; ++ for (i = trans->nr_sorted - 2; i >= 0; --i) { ++ struct btree_iter *iter1 = trans->iters + trans->sorted[i]; ++ struct btree_iter *iter2 = trans->iters + trans->sorted[i + 1]; + + if (iter1->btree_id == iter2->btree_id && + iter1->locks_want < iter2->locks_want) +@@ -1324,20 +1326,23 @@ retry_all: + BUG_ON(ret && ret != -EINTR); + + /* Now, redo traversals in correct order: */ +- for (i = 0; i < nr_sorted; i++) { +- unsigned idx = sorted[i]; +- +- /* +- * sucessfully traversing one iterator can cause another to be +- * unlinked, in btree_key_cache_fill() +- */ +- if (!(trans->iters_linked & (1ULL << idx))) +- continue; ++ trans_for_each_iter_inorder(trans, iter) { ++ EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + +- ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_); ++ ret = btree_iter_traverse_one(iter, _THIS_IP_); + if (ret) + goto retry_all; ++ ++ EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + } ++ ++ /* ++ * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() ++ * and relock(), relock() won't relock since iter->should_be_locked ++ * isn't set yet, which is all fine ++ */ ++ trans_for_each_iter(trans, iter) 
++ BUG_ON(iter->uptodate >= BTREE_ITER_NEED_TRAVERSE); + out: + bch2_btree_cache_cannibalize_unlock(c); + +@@ -1621,6 +1626,8 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p + iter->real_pos = new_pos; + iter->should_be_locked = false; + ++ btree_iter_check_sort(iter->trans, iter); ++ + if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { + btree_node_unlock(iter, 0); + iter->l[0].b = BTREE_ITER_NO_NODE_CACHED; +@@ -2026,6 +2033,151 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, + + /* new transactional stuff: */ + ++static inline void btree_iter_verify_sorted_ref(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ EBUG_ON(iter->sorted_idx >= trans->nr_sorted); ++ EBUG_ON(trans->sorted[iter->sorted_idx] != iter->idx); ++ EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); ++} ++ ++static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned i; ++ ++ for (i = 0; i < trans->nr_sorted; i++) ++ btree_iter_verify_sorted_ref(trans, trans->iters + trans->sorted[i]); ++#endif ++} ++ ++static inline void btree_trans_verify_sorted(struct btree_trans *trans) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree_iter *iter, *prev = NULL; ++ ++ trans_for_each_iter_inorder(trans, iter) ++ BUG_ON(prev && btree_iter_cmp(prev, iter) > 0); ++#endif ++} ++ ++static inline void btree_iter_swap(struct btree_trans *trans, ++ struct btree_iter *l, struct btree_iter *r) ++{ ++ swap(l->sorted_idx, r->sorted_idx); ++ swap(trans->sorted[l->sorted_idx], ++ trans->sorted[r->sorted_idx]); ++ ++ btree_iter_verify_sorted_ref(trans, l); ++ btree_iter_verify_sorted_ref(trans, r); ++} ++ ++static void btree_trans_sort_iters(struct btree_trans *trans) ++{ ++ bool swapped = false; ++ int i, l = 0, r = trans->nr_sorted; ++ ++ while (1) { ++ for (i = l; i + 1 < r; i++) { ++ if (btree_iter_cmp(trans->iters + trans->sorted[i], ++ trans->iters + trans->sorted[i + 1]) > 0) { ++ swap(trans->sorted[i], trans->sorted[i + 1]); ++ trans->iters[trans->sorted[i]].sorted_idx = i; ++ trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1; ++ swapped = true; ++ } ++ } ++ ++ if (!swapped) ++ break; ++ ++ r--; ++ swapped = false; ++ ++ for (i = r - 2; i >= l; --i) { ++ if (btree_iter_cmp(trans->iters + trans->sorted[i], ++ trans->iters + trans->sorted[i + 1]) > 0) { ++ swap(trans->sorted[i], ++ trans->sorted[i + 1]); ++ trans->iters[trans->sorted[i]].sorted_idx = i; ++ trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1; ++ swapped = true; ++ } ++ } ++ ++ if (!swapped) ++ break; ++ ++ l++; ++ swapped = false; ++ } ++ ++ btree_trans_verify_sorted_refs(trans); ++ btree_trans_verify_sorted(trans); ++} ++ ++static void btree_iter_check_sort(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ struct btree_iter *n; ++ ++ EBUG_ON(iter->sorted_idx == U8_MAX); ++ ++ n = next_btree_iter(trans, iter); ++ if (n && btree_iter_cmp(iter, n) > 0) { ++ do { ++ btree_iter_swap(trans, iter, n); ++ n = next_btree_iter(trans, iter); ++ } while (n && btree_iter_cmp(iter, n) > 0); ++ ++ return; ++ } ++ ++ n = prev_btree_iter(trans, iter); ++ if (n && btree_iter_cmp(n, iter) > 0) { ++ do { ++ btree_iter_swap(trans, n, iter); ++ n = prev_btree_iter(trans, iter); ++ } while (n && btree_iter_cmp(n, iter) > 0); ++ } ++ ++ btree_trans_verify_sorted(trans); ++} ++ ++static inline void btree_iter_list_remove(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ unsigned i; ++ ++ EBUG_ON(iter->sorted_idx >= 
trans->nr_sorted); ++ ++ array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx); ++ ++ for (i = iter->sorted_idx; i < trans->nr_sorted; i++) ++ trans->iters[trans->sorted[i]].sorted_idx = i; ++ ++ iter->sorted_idx = U8_MAX; ++ ++ btree_trans_verify_sorted_refs(trans); ++} ++ ++static inline void btree_iter_list_add(struct btree_trans *trans, ++ struct btree_iter *pos, ++ struct btree_iter *iter) ++{ ++ unsigned i; ++ ++ btree_trans_verify_sorted_refs(trans); ++ ++ iter->sorted_idx = pos ? pos->sorted_idx : trans->nr_sorted; ++ ++ array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx); ++ ++ for (i = iter->sorted_idx; i < trans->nr_sorted; i++) ++ trans->iters[trans->sorted[i]].sorted_idx = i; ++ ++ btree_trans_verify_sorted_refs(trans); ++} ++ + static void btree_iter_child_free(struct btree_iter *iter) + { + struct btree_iter *child = btree_iter_child(iter); +@@ -2043,7 +2195,7 @@ static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, + struct btree_iter *child = btree_iter_child(iter); + + if (!child) { +- child = btree_trans_iter_alloc(trans); ++ child = btree_trans_iter_alloc(trans, iter); + child->ip_allocated = ip; + iter->child_idx = child->idx; + +@@ -2059,6 +2211,8 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, + { + btree_iter_child_free(&trans->iters[idx]); + ++ btree_iter_list_remove(trans, &trans->iters[idx]); ++ + __bch2_btree_iter_unlock(&trans->iters[idx]); + trans->iters_linked &= ~(1ULL << idx); + trans->iters_live &= ~(1ULL << idx); +@@ -2105,10 +2259,12 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) + struct btree_insert_entry *i; + char buf[100]; + +- trans_for_each_iter(trans, iter) ++ btree_trans_sort_iters(trans); ++ ++ trans_for_each_iter_inorder(trans, iter) + printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n", + bch2_btree_ids[iter->btree_id], +- (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf), ++ (bch2_bpos_to_text(&PBUF(buf), iter->real_pos), buf), + btree_iter_live(trans, iter) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", +@@ -2124,7 +2280,8 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) + panic("trans iter oveflow\n"); + } + +-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) ++static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, ++ struct btree_iter *pos) + { + struct btree_iter *iter; + unsigned idx; +@@ -2139,10 +2296,13 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) + iter->trans = trans; + iter->idx = idx; + iter->child_idx = U8_MAX; ++ iter->sorted_idx = U8_MAX; + iter->flags = 0; + iter->nodes_locked = 0; + iter->nodes_intent_locked = 0; + trans->iters_linked |= 1ULL << idx; ++ ++ btree_iter_list_add(trans, pos, iter); + return iter; + } + +@@ -2163,6 +2323,8 @@ static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) + + dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; ++ ++ btree_iter_check_sort(dst->trans, dst); + } + + struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, +@@ -2216,10 +2378,10 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + } + + if (!best) { +- iter = btree_trans_iter_alloc(trans); ++ iter = btree_trans_iter_alloc(trans, NULL); + bch2_btree_iter_init(trans, iter, btree_id); + } else if (btree_iter_keep(trans, best)) { +- iter = btree_trans_iter_alloc(trans); ++ iter = btree_trans_iter_alloc(trans, best); + btree_iter_copy(iter, best); + } else { + iter = best; +@@ -2300,7 +2462,7 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, + { + struct btree_iter *iter; + +- iter = btree_trans_iter_alloc(trans); ++ iter = btree_trans_iter_alloc(trans, src); + btree_iter_copy(iter, src); + + trans->iters_live |= 1ULL << iter->idx; +@@ -2415,6 +2577,7 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) + { + size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX; + size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; ++ size_t sorted_bytes = sizeof(u8) * BTREE_ITER_MAX; + void *p = NULL; + + BUG_ON(trans->used_mempool); +@@ -2427,6 +2590,7 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) + + trans->iters = p; p += iters_bytes; + trans->updates = p; p += updates_bytes; ++ trans->sorted = p; p += sorted_bytes; + } + + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, +@@ -2629,6 +2793,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) + + return init_srcu_struct(&c->btree_trans_barrier) ?: + mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, ++ sizeof(u8) * nr + + sizeof(struct btree_iter) * nr + + sizeof(struct btree_insert_entry) * nr) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index aeabc07d2c9c..39124e68e488 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -71,6 +71,30 @@ __trans_next_iter(struct btree_trans *trans, unsigned idx) + (_iter); \ + _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) + ++static inline struct btree_iter *next_btree_iter(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ unsigned idx = iter ? iter->sorted_idx + 1 : 0; ++ ++ EBUG_ON(idx > trans->nr_sorted); ++ ++ return idx < trans->nr_sorted ++ ? 
trans->iters + trans->sorted[idx] ++ : NULL; ++} ++ ++static inline struct btree_iter *prev_btree_iter(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ EBUG_ON(iter->sorted_idx >= trans->nr_sorted); ++ return iter->sorted_idx ++ ? trans->iters + trans->sorted[iter->sorted_idx - 1] ++ : NULL; ++} ++ ++#define trans_for_each_iter_inorder(_trans, _iter) \ ++ for (_iter = next_btree_iter(trans, NULL); \ ++ (_iter); \ ++ _iter = next_btree_iter((_trans), (_iter))) ++ + static inline bool __iter_has_node(const struct btree_iter *iter, + const struct btree *b) + { +@@ -191,19 +215,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it + iter->pos = bkey_start_pos(&iter->k); + } + +-static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) ++static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, unsigned idx) + { +- return iter->child_idx == U8_MAX ? NULL +- : iter->trans->iters + iter->child_idx; ++ return idx != U8_MAX ? trans->iters + idx : NULL; + } + +-/* Sort order for locking btree iterators: */ +-static inline int btree_iter_lock_cmp(const struct btree_iter *l, +- const struct btree_iter *r) ++static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) + { +- return cmp_int(l->btree_id, r->btree_id) ?: +- -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: +- bkey_cmp(l->real_pos, r->real_pos); ++ return idx_to_btree_iter(iter->trans, iter->child_idx); + } + + /* +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 6882873d149a..a1e5debf19f3 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -246,6 +246,7 @@ struct btree_iter { + + u8 idx; + u8 child_idx; ++ u8 sorted_idx; + + /* btree_iter_copy starts here: */ + u16 flags; +@@ -379,6 +380,7 @@ struct btree_trans { + unsigned long ip; + int srcu_idx; + ++ u8 nr_sorted; + u8 nr_updates; + bool used_mempool:1; + bool error:1; +@@ -398,6 +400,7 @@ struct btree_trans { + unsigned mem_bytes; + void *mem; + ++ u8 *sorted; + struct btree_iter *iters; + struct btree_insert_entry *updates; + +-- +cgit v1.2.3 + + +From 25aa2740fb11f94d798b4668f4570a7708a59a88 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 30 Jul 2021 17:59:37 -0400 +Subject: bcachefs: Add flags field to bch2_inode_to_text() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/inode.c | 23 +++++++++++++++++------ + fs/bcachefs/inode.h | 2 ++ + 2 files changed, 19 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 25607b5848be..3b671082cd1e 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -371,6 +371,22 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) + return NULL; + } + ++static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) ++{ ++ pr_buf(out, "mode %o flags %x ", inode->bi_mode, inode->bi_flags); ++ ++#define x(_name, _bits) \ ++ pr_buf(out, #_name " %llu ", (u64) inode->_name); ++ BCH_INODE_FIELDS() ++#undef x ++} ++ ++void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) ++{ ++ pr_buf(out, "inum: %llu ", inode->bi_inum); ++ __bch2_inode_unpacked_to_text(out, inode); ++} ++ + void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) + { +@@ -382,12 +398,7 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, + return; + } + +- pr_buf(out, "mode: %o ", unpacked.bi_mode); +- +-#define x(_name, _bits) \ +- 
pr_buf(out, #_name ": %llu ", (u64) unpacked._name); +- BCH_INODE_FIELDS() +-#undef x ++ __bch2_inode_unpacked_to_text(out, &unpacked); + } + + const char *bch2_inode_generation_invalid(const struct bch_fs *c, +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 2cb081ae44d9..d67af4f56f05 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -55,6 +55,8 @@ void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, + const struct bch_inode_unpacked *); + int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); + ++void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); ++ + struct btree_iter *bch2_inode_peek(struct btree_trans *, + struct bch_inode_unpacked *, u64, unsigned); + int bch2_inode_write(struct btree_trans *, struct btree_iter *, +-- +cgit v1.2.3 + + +From f63c126f6158c8b72e513e70590cb0e0ddc835d9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 30 Jul 2021 18:01:33 -0400 +Subject: bcachefs: Ensure that new inodes hit underlying btree + +Inode creation is done with non-cached btree iterators, but then in the +same transaction the inode may be updated again with a cached iterator - +it makes cache coherency easier if new inodes always land in the +underlying btree. + +This patch adds a check to bch2_trans_update() - if the same key is +updated multiple times in the same transaction with both cached and non +cache iterators, use the non cached iterator. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e9e542260bff..7e9909e2dcaf 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1041,7 +1041,19 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + if (i < trans->updates + trans->nr_updates && + !btree_insert_entry_cmp(&n, i)) { + BUG_ON(i->trans_triggers_run); +- *i = n; ++ ++ /* ++ * This is a hack to ensure that inode creates update the btree, ++ * not the key cache, which helps with cache coherency issues in ++ * other areas: ++ */ ++ if (btree_iter_type(n.iter) == BTREE_ITER_CACHED && ++ btree_iter_type(i->iter) != BTREE_ITER_CACHED) { ++ i->k = n.k; ++ i->flags = n.flags; ++ } else { ++ *i = n; ++ } + } else + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); +-- +cgit v1.2.3 + + +From 5615140fe436f596c50503d8586519201f824922 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 5 Aug 2021 13:02:39 -0400 +Subject: bcachefs: Fix an unhandled transaction restart + +__bch2_read() -> __bch2_read_extent() -> bch2_bucket_io_time_reset() may +cause a transaction restart, which we don't return an error for because +it doesn't prevent us from making forward progress on the read we're +submitting. + +Instead, change __bch2_read() and bchfs_read() to check for transaction +restarts. 
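+
+The check works because bch2_trans_relock() now refuses to relock a
+transaction whose restarted flag is set, so a restart that never surfaced
+as an error is still caught at the top of the loop. Sketch of the pattern
+(simplified):
+
+  while (1) {
+          if (!bch2_trans_relock(&trans)) {
+                  ret = -EINTR;
+                  break;
+          }
+
+          /* do one unit of work; helpers called here may restart the
+           * transaction internally without returning an error */
+  }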
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 9 +++++++++ + fs/bcachefs/io.c | 9 +++++++++ + 2 files changed, 18 insertions(+) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 35d213791ec1..42927a9ea8e6 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -817,6 +817,15 @@ retry: + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + ++ /* ++ * read_extent -> io_time_reset may cause a transaction restart ++ * without returning an error, we need to check for that here: ++ */ ++ if (!bch2_trans_relock(trans)) { ++ ret = -EINTR; ++ break; ++ } ++ + bch2_btree_iter_set_pos(iter, + POS(inum, rbio->bio.bi_iter.bi_sector)); + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index e090424fa380..a4d659a0ddaa 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -2283,6 +2283,15 @@ retry: + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + ++ /* ++ * read_extent -> io_time_reset may cause a transaction restart ++ * without returning an error, we need to check for that here: ++ */ ++ if (!bch2_trans_relock(&trans)) { ++ ret = -EINTR; ++ break; ++ } ++ + bch2_btree_iter_set_pos(iter, + POS(inode, bvec_iter.bi_sector)); + +-- +cgit v1.2.3 + + +From 147cf28feee84af0b26001a63b253c04507ccd04 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 7 Aug 2021 18:21:35 -0400 +Subject: bcachefs: Fix btree_trans_peek_updates() + +Should have been using bpos_cmp(), not bkey_cmp(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 16 ++++++++++------ + 1 file changed, 10 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 43954a9bd80b..8cb3bd14948e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1721,8 +1721,7 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) + return ret; + } + +-static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter, +- struct bpos pos) ++static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) + { + struct btree_insert_entry *i; + +@@ -1731,7 +1730,7 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter, + + trans_for_each_update(iter->trans, i) + if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: +- bkey_cmp(pos, i->k->k.p)) <= 0) { ++ bpos_cmp(iter->real_pos, i->k->k.p)) <= 0) { + if (iter->btree_id == i->iter->btree_id) + return i->k; + break; +@@ -1755,7 +1754,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + start: +- next_update = btree_trans_peek_updates(iter, search_key); + btree_iter_set_search_pos(iter, search_key); + + while (1) { +@@ -1763,8 +1761,13 @@ start: + if (unlikely(ret)) + return bkey_s_c_err(ret); + ++ /* ++ * btree_iter_level_peek() mutates iter->real_pos, which ++ * btree_trans_peek_updates() checks against, so we have to call ++ * them in this order: ++ */ ++ next_update = btree_trans_peek_updates(iter); + k = btree_iter_level_peek(iter, &iter->l[0]); +- + if (next_update && + bpos_cmp(next_update->k.p, iter->real_pos) <= 0) { + iter->k = next_update->k; +@@ -1916,6 +1919,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + struct bkey_i *next_update; + struct bkey_cached *ck; + ++ next_update = btree_trans_peek_updates(iter); ++ + switch (btree_iter_type(iter)) { + case BTREE_ITER_KEYS: + k = 
btree_iter_level_peek_all(iter, &iter->l[0]); +@@ -1933,7 +1938,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + BUG(); + } + +- next_update = btree_trans_peek_updates(iter, search_key); + if (next_update && + (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { + iter->k = next_update->k; +-- +cgit v1.2.3 + + +From 16631055f726a9626f078f343da7f292f36ff58a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 7 Aug 2021 18:19:33 -0400 +Subject: bcachefs: Minor btree iter refactoring + +This makes the flow control in bch2_btree_iter_peek() and +bch2_btree_iter_peek_prev() a bit cleaner. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 59 ++++++++++++++++-------------------------------- + 1 file changed, 20 insertions(+), 39 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8cb3bd14948e..b01ee51edaf5 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1690,37 +1690,6 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + return ret; + } + +-static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) +-{ +- struct bpos next_pos = iter->l[0].b->key.k.p; +- bool ret = bpos_cmp(next_pos, SPOS_MAX) != 0; +- +- /* +- * Typically, we don't want to modify iter->pos here, since that +- * indicates where we searched from - unless we got to the end of the +- * btree, in that case we want iter->pos to reflect that: +- */ +- if (ret) +- btree_iter_set_search_pos(iter, bpos_successor(next_pos)); +- else +- bch2_btree_iter_set_pos(iter, SPOS_MAX); +- +- return ret; +-} +- +-static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) +-{ +- struct bpos next_pos = iter->l[0].b->data->min_key; +- bool ret = bpos_cmp(next_pos, POS_MIN) != 0; +- +- if (ret) +- btree_iter_set_search_pos(iter, bpos_predecessor(next_pos)); +- else +- bch2_btree_iter_set_pos(iter, POS_MIN); +- +- return ret; +-} +- + static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) + { + struct btree_insert_entry *i; +@@ -1753,10 +1722,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); +-start: +- btree_iter_set_search_pos(iter, search_key); + + while (1) { ++ btree_iter_set_search_pos(iter, search_key); ++ + ret = btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); +@@ -1777,14 +1746,20 @@ start: + if (likely(k.k)) { + if (bkey_deleted(k.k)) { + search_key = bkey_successor(iter, k.k->p); +- goto start; ++ continue; + } + + break; + } + +- if (!btree_iter_set_pos_to_next_leaf(iter)) +- return bkey_s_c_null; ++ if (unlikely(!bpos_cmp(iter->l[0].b->key.k.p, SPOS_MAX))) { ++ bch2_btree_iter_set_pos(iter, SPOS_MAX); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ ++ /* Advance to next leaf node: */ ++ search_key = bpos_successor(iter->l[0].b->key.k.p); + } + + /* +@@ -1796,6 +1771,7 @@ start: + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + ++out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + iter->should_be_locked = true; +@@ -1820,6 +1796,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + { ++ struct bpos search_key = iter->pos; + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + int ret; +@@ -1829,9 +1806,9 @@ struct bkey_s_c 
bch2_btree_iter_peek_prev(struct btree_iter *iter) + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + +- btree_iter_set_search_pos(iter, iter->pos); +- + while (1) { ++ btree_iter_set_search_pos(iter, search_key); ++ + ret = btree_iter_traverse(iter); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); +@@ -1848,10 +1825,14 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + if (likely(k.k)) + break; + +- if (!btree_iter_set_pos_to_prev_leaf(iter)) { ++ if (unlikely(!bpos_cmp(iter->l[0].b->data->min_key, POS_MIN))) { ++ bch2_btree_iter_set_pos(iter, POS_MIN); + k = bkey_s_c_null; + goto no_key; + } ++ ++ /* Advance to previous leaf node: */ ++ search_key = bpos_predecessor(iter->l[0].b->data->min_key); + } + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); +-- +cgit v1.2.3 + + +From 6491fcf3be59d3b8572f830864c9cee89071dd68 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 17 Aug 2021 15:03:53 -0400 +Subject: bcachefs: Fix a valgrind conditional jump + +Valgrind was complaining about a jump depending on uninitialized memory +- we weren't, but this change makes the code less confusing for valgrind +to follow. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/varint.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c +index e6a041541792..752179b26a1e 100644 +--- a/fs/bcachefs/varint.c ++++ b/fs/bcachefs/varint.c +@@ -96,7 +96,7 @@ int bch2_varint_encode_fast(u8 *out, u64 v) + int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) + { + u64 v = get_unaligned_le64(in); +- unsigned bytes = ffz(v & 255) + 1; ++ unsigned bytes = ffz(*in) + 1; + + if (unlikely(in + bytes > end)) + return -1; +-- +cgit v1.2.3 + + +From f89358f1b4a8bc38078b074bf7cfdeef9714bc6f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 17 Aug 2021 15:29:21 -0400 +Subject: bcachefs: Disk space accounting fix + +DIV_ROUND_UP() wasn't doing what we wanted when passing it negative +numbers - fix it by just not passing it negative numbers anymore. + +Also, no need to do the scaling by compression ratio for incompressible +data. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 17 ++++++++++------- + 1 file changed, 10 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 76945e50e4b1..00aaed3f545f 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -662,7 +662,10 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + + static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) + { +- return p.crc.compression_type ++ EBUG_ON(sectors < 0); ++ ++ return p.crc.compression_type && ++ p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible + ? 
DIV_ROUND_UP(sectors * p.crc.compressed_size, + p.crc.uncompressed_size) + : sectors; +@@ -925,9 +928,6 @@ static int bch2_mark_extent(struct bch_fs *c, + BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == + (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); + +- if (flags & BTREE_TRIGGER_OVERWRITE) +- sectors = -sectors; +- + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; +@@ -935,6 +935,9 @@ static int bch2_mark_extent(struct bch_fs *c, + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = ptr_disk_sectors(sectors, p); + ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ disk_sectors = -disk_sectors; ++ + ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, + journal_seq, flags); + if (ret < 0) +@@ -1545,9 +1548,6 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == + (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); + +- if (flags & BTREE_TRIGGER_OVERWRITE) +- sectors = -sectors; +- + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; +@@ -1555,6 +1555,9 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = ptr_disk_sectors(sectors, p); + ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ disk_sectors = -disk_sectors; ++ + ret = bch2_trans_mark_pointer(trans, k, p, + disk_sectors, data_type); + if (ret < 0) +-- +cgit v1.2.3 + + +From 2bce2a90228d6e6b47a189decb9c50de34e338d5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 18 Aug 2021 16:19:28 -0400 +Subject: bcachefs: Be sure to check ptr->dev in copygc pred function + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/movinggc.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 2acca0ddb6fd..0495711b88ce 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -85,6 +85,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + BUG_ON(i != j); + #endif + if (i >= 0 && ++ p.ptr.dev == h->data[i].dev && + p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && + p.ptr.gen == h->data[i].gen) { + /* +-- +cgit v1.2.3 + + +From a0a623b7dfd47e2d78327e86f42ef736a030b7c5 Mon Sep 17 00:00:00 2001 +From: Brett Holman +Date: Tue, 17 Aug 2021 17:14:26 -0600 +Subject: bcachefs: Fix 32 bit build failures + +This fix replaces multiple 64 bit divisions with do_div() equivalents. + +Signed-off-by: Brett Holman +--- + fs/bcachefs/buckets.c | 2 +- + fs/bcachefs/tests.c | 8 ++++---- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 00aaed3f545f..c951c1dc10dd 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -666,7 +666,7 @@ static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) + + return p.crc.compression_type && + p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible +- ? DIV_ROUND_UP(sectors * p.crc.compressed_size, ++ ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, + p.crc.uncompressed_size) + : sectors; + } +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 4d8d50fd7642..44b812dc4053 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -778,7 +778,7 @@ static int btree_perf_test_thread(void *data) + wait_event(j->ready_wait, !atomic_read(&j->ready)); + } + +- ret = j->fn(j->c, j->nr / j->nr_threads); ++ ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); + if (ret) + j->ret = ret; + +@@ -854,11 +854,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + + scnprintf(name_buf, sizeof(name_buf), "%s:", testname); + bch2_hprint(&PBUF(nr_buf), nr); +- bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); ++ bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time)); + printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", + name_buf, nr_buf, nr_threads, +- time / NSEC_PER_SEC, +- time * nr_threads / nr, ++ div_u64(time, NSEC_PER_SEC), ++ div_u64(time * nr_threads, nr), + per_sec_buf); + return j.ret; + } +-- +cgit v1.2.3 + + +From 894f11c431f895a2f98ee6f62982c3f3e8bba8a3 Mon Sep 17 00:00:00 2001 +From: Brett Holman +Date: Fri, 23 Jul 2021 13:57:19 -0600 +Subject: bcachefs: add progress stats to sysfs + +This adds progress stats to sysfs for copygc, rebalance, recovery, and the +cmd_job ioctls. + +Signed-off-by: Brett Holman +--- + fs/bcachefs/bcachefs.h | 4 ++++ + fs/bcachefs/move.c | 31 +++++++++++++++++++++++++++++++ + fs/bcachefs/move.h | 4 ++++ + fs/bcachefs/move_types.h | 2 ++ + fs/bcachefs/movinggc.c | 3 ++- + fs/bcachefs/rebalance.c | 11 +++++------ + fs/bcachefs/rebalance_types.h | 1 - + fs/bcachefs/recovery.c | 4 +++- + fs/bcachefs/super.c | 3 +++ + fs/bcachefs/sysfs.c | 40 ++++++++++++++++++++++++++++++++++++++++ + 10 files changed, 94 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 04210df26af7..f7d64eb8b0b8 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -792,6 +792,10 @@ struct bch_fs { + struct write_point copygc_write_point; + s64 copygc_wait; + ++ /* DATA PROGRESS STATS */ ++ struct list_head data_progress_list; ++ struct mutex data_progress_lock; ++ + /* STRIPES: */ + GENRADIX(struct stripe) stripes[2]; + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index ee0f155fda6c..befb198a77fd 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -686,6 +686,30 @@ out: + return ret; + } + ++inline void bch_move_stats_init(struct bch_move_stats *stats, char *name) ++{ ++ memset(stats, 0, sizeof(*stats)); ++ ++ scnprintf(stats->name, sizeof(stats->name), ++ "%s", name); ++} ++ ++static inline void progress_list_add(struct bch_fs *c, ++ struct bch_move_stats *stats) ++{ ++ mutex_lock(&c->data_progress_lock); ++ list_add(&stats->list, &c->data_progress_list); ++ mutex_unlock(&c->data_progress_lock); ++} ++ ++static inline void progress_list_del(struct bch_fs *c, ++ struct bch_move_stats *stats) ++{ ++ mutex_lock(&c->data_progress_lock); ++ list_del(&stats->list); ++ mutex_unlock(&c->data_progress_lock); ++} ++ + int bch2_move_data(struct bch_fs *c, + enum btree_id start_btree_id, struct bpos start_pos, + enum btree_id end_btree_id, struct bpos end_pos, +@@ -698,6 +722,7 @@ int bch2_move_data(struct bch_fs *c, + enum btree_id id; + int ret; + ++ progress_list_add(c, stats); + closure_init_stack(&ctxt.cl); + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); +@@ -731,6 +756,7 @@ int bch2_move_data(struct 
bch_fs *c, + atomic64_read(&stats->sectors_moved), + atomic64_read(&stats->keys_moved)); + ++ progress_list_del(c, stats); + return ret; + } + +@@ -755,6 +781,7 @@ static int bch2_move_btree(struct bch_fs *c, + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); ++ progress_list_add(c, stats); + + stats->data_type = BCH_DATA_btree; + +@@ -803,6 +830,7 @@ next: + if (ret) + bch_err(c, "error %i in bch2_move_btree", ret); + ++ progress_list_del(c, stats); + return ret; + } + +@@ -944,6 +972,7 @@ int bch2_data_job(struct bch_fs *c, + + switch (op.op) { + case BCH_DATA_OP_REREPLICATE: ++ bch_move_stats_init(stats, "rereplicate"); + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, -1); + +@@ -968,6 +997,7 @@ int bch2_data_job(struct bch_fs *c, + if (op.migrate.dev >= c->sb.nr_devices) + return -EINVAL; + ++ bch_move_stats_init(stats, "migrate"); + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); + +@@ -985,6 +1015,7 @@ int bch2_data_job(struct bch_fs *c, + ret = bch2_replicas_gc2(c) ?: ret; + break; + case BCH_DATA_OP_REWRITE_OLD_NODES: ++ bch_move_stats_init(stats, "rewrite_old_nodes"); + ret = bch2_scan_old_btree_nodes(c, stats); + break; + default: +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +index 5076153689d1..2a789a1158ca 100644 +--- a/fs/bcachefs/move.h ++++ b/fs/bcachefs/move.h +@@ -66,4 +66,8 @@ int bch2_data_job(struct bch_fs *, + struct bch_move_stats *, + struct bch_ioctl_data); + ++inline void bch_move_stats_init(struct bch_move_stats *stats, ++ char *name); ++ ++ + #endif /* _BCACHEFS_MOVE_H */ +diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h +index fc0de165af9f..9df6d18137a5 100644 +--- a/fs/bcachefs/move_types.h ++++ b/fs/bcachefs/move_types.h +@@ -6,6 +6,8 @@ struct bch_move_stats { + enum bch_data_type data_type; + enum btree_id btree_id; + struct bpos pos; ++ struct list_head list; ++ char name[32]; + + atomic64_t keys_moved; + atomic64_t keys_raced; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 0495711b88ce..5c9eafc026c9 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -147,7 +147,8 @@ static int bch2_copygc(struct bch_fs *c) + size_t b, heap_size = 0; + int ret; + +- memset(&move_stats, 0, sizeof(move_stats)); ++ bch_move_stats_init(&move_stats, "copygc"); ++ + /* + * Find buckets with lowest sector counts, skipping completely + * empty buckets, by building a maxheap sorted by sector count, +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index a0dbf41d1d37..a573fede05b1 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -166,6 +166,7 @@ static int bch2_rebalance_thread(void *arg) + struct bch_fs_rebalance *r = &c->rebalance; + struct io_clock *clock = &c->io_clock[WRITE]; + struct rebalance_work w, p; ++ struct bch_move_stats move_stats; + unsigned long start, prev_start; + unsigned long prev_run_time, prev_run_cputime; + unsigned long cputime, prev_cputime; +@@ -179,6 +180,7 @@ static int bch2_rebalance_thread(void *arg) + prev_start = jiffies; + prev_cputime = curr_cputime(); + ++ bch_move_stats_init(&move_stats, "rebalance"); + while (!kthread_wait_freezable(r->enabled)) { + cond_resched(); + +@@ -235,7 +237,7 @@ static int bch2_rebalance_thread(void *arg) + prev_cputime = cputime; + + r->state = REBALANCE_RUNNING; +- memset(&r->move_stats, 0, sizeof(r->move_stats)); ++ memset(&move_stats, 0, sizeof(move_stats)); + rebalance_work_reset(c); + + bch2_move_data(c, +@@ 
-245,7 +247,7 @@ static int bch2_rebalance_thread(void *arg) + NULL, /* &r->pd.rate, */ + writepoint_ptr(&c->rebalance_write_point), + rebalance_pred, NULL, +- &r->move_stats); ++ &move_stats); + } + + return 0; +@@ -281,10 +283,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) + h1); + break; + case REBALANCE_RUNNING: +- pr_buf(out, "running\n" +- "pos "); +- bch2_bpos_to_text(out, r->move_stats.pos); +- pr_buf(out, "\n"); ++ pr_buf(out, "running\n"); + break; + } + } +diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h +index 2f62a643c39f..7462a92e9598 100644 +--- a/fs/bcachefs/rebalance_types.h ++++ b/fs/bcachefs/rebalance_types.h +@@ -19,7 +19,6 @@ struct bch_fs_rebalance { + enum rebalance_state state; + u64 throttled_until_iotime; + unsigned long throttled_until_cputime; +- struct bch_move_stats move_stats; + + unsigned enabled:1; + }; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index afb72648fe54..b02af94f4037 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1216,7 +1216,9 @@ use_clean: + + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { +- struct bch_move_stats stats = { 0 }; ++ struct bch_move_stats stats; ++ ++ bch_move_stats_init(&stats, "recovery"); + + bch_info(c, "scanning for old btree nodes"); + ret = bch2_fs_read_write(c); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index ce8e5d4843d0..c1c3cf8f5a56 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -704,6 +704,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + INIT_LIST_HEAD(&c->ec_stripe_new_list); + mutex_init(&c->ec_stripe_new_lock); + ++ INIT_LIST_HEAD(&c->data_progress_list); ++ mutex_init(&c->data_progress_lock); ++ + spin_lock_init(&c->ec_stripes_heap_lock); + + seqcount_init(&c->gc_pos_lock); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 9b1ffbf96e14..b5ce336f00ca 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -203,6 +203,8 @@ read_attribute(new_stripes); + read_attribute(io_timers_read); + read_attribute(io_timers_write); + ++read_attribute(data_op_data_progress); ++ + #ifdef CONFIG_BCACHEFS_TESTS + write_attribute(perf_test); + #endif /* CONFIG_BCACHEFS_TESTS */ +@@ -239,6 +241,37 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c) + return nr ? 
div64_u64(sectors, nr) : 0; + } + ++static long stats_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bch_move_stats *stats) ++{ ++ pr_buf(out, "%s: data type %s btree_id %s position: ", ++ stats->name, ++ bch2_data_types[stats->data_type], ++ bch2_btree_ids[stats->btree_id]); ++ bch2_bpos_to_text(out, stats->pos); ++ pr_buf(out, "%s", "\n"); ++ ++ return 0; ++} ++ ++static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ long ret = 0; ++ struct bch_move_stats *iter; ++ ++ mutex_lock(&c->data_progress_lock); ++ ++ if (list_empty(&c->data_progress_list)) ++ pr_buf(out, "%s", "no progress to report\n"); ++ else ++ list_for_each_entry(iter, &c->data_progress_list, list) { ++ stats_to_text(out, c, iter); ++ } ++ ++ mutex_unlock(&c->data_progress_lock); ++ return ret; ++} ++ + static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) + { + struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); +@@ -434,6 +467,11 @@ SHOW(bch2_fs) + return out.pos - buf; + } + ++ if (attr == &sysfs_data_op_data_progress) { ++ data_progress_to_text(&out, c); ++ return out.pos - buf; ++ } ++ + return 0; + } + +@@ -596,6 +634,8 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_io_timers_read, + &sysfs_io_timers_write, + ++ &sysfs_data_op_data_progress, ++ + &sysfs_internal_uuid, + NULL + }; +-- +cgit v1.2.3 + + +From 8dbf1040e7ae9c02df0ca7c8834324771c8e75f7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 22 Aug 2021 12:56:56 -0400 +Subject: bcachefs: Fix unhandled transaction restart in bch2_gc_btree_gens() + +This fixes https://github.com/koverstreet/bcachefs/issues/305 + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 91f6a2ada44e..5757b4a2ace5 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1736,8 +1736,14 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + BTREE_ITER_ALL_SNAPSHOTS); + + while ((bch2_trans_begin(&trans), +- k = bch2_btree_iter_peek(iter)).k && +- !(ret = bkey_err(k))) { ++ k = bch2_btree_iter_peek(iter)).k) { ++ ret = bkey_err(k); ++ ++ if (ret == -EINTR) ++ continue; ++ if (ret) ++ break; ++ + c->gc_gens_pos = iter->pos; + + if (gc_btree_gens_key(c, k) && !commit_err) { +-- +cgit v1.2.3 + + +From 617a84cf0d1becf7198bdcd8e411ce98f0a66d59 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 23 Aug 2021 17:19:17 -0400 +Subject: bcachefs: Free iterator if we have duplicate + +This helps - but does not fully fix - the outstanding "transaction +iterator overflow" bugs. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 21 +++++++++++++++++++-- + fs/bcachefs/btree_update_leaf.c | 1 + + 2 files changed, 20 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index b01ee51edaf5..c913ca0777dc 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2204,6 +2204,22 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, + trans->iters_touched &= ~(1ULL << idx); + } + ++static bool have_iter_at_pos(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ struct btree_iter *n; ++ ++ n = prev_btree_iter(trans, iter); ++ if (n && !btree_iter_cmp(n, iter)) ++ return true; ++ ++ n = next_btree_iter(trans, iter); ++ if (n && !btree_iter_cmp(n, iter)) ++ return true; ++ ++ return false; ++} ++ + int bch2_trans_iter_put(struct btree_trans *trans, + struct btree_iter *iter) + { +@@ -2217,8 +2233,9 @@ int bch2_trans_iter_put(struct btree_trans *trans, + + ret = btree_iter_err(iter); + +- if (!(trans->iters_touched & (1ULL << iter->idx)) && +- !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) ++ if (!(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) && ++ (!(trans->iters_touched & (1ULL << iter->idx)) || ++ have_iter_at_pos(trans, iter))) + __bch2_trans_iter_free(trans, iter->idx); + + trans->iters_live &= ~(1ULL << iter->idx); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 7e9909e2dcaf..bfb568025a2a 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1019,6 +1019,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); ++ n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + ret = bch2_btree_iter_traverse(n.iter); + bch2_trans_iter_put(trans, n.iter); + +-- +cgit v1.2.3 + + +From 9ece87027d457c685d1a7ce3cb1f87cdb08b0cd4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 24 Aug 2021 20:31:44 -0400 +Subject: bcachefs: Add SPOS_MAX to bpos_to_text() + +Better pretty printing ftw + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index f8adbf437276..a03b5514a802 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -215,6 +215,8 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) + pr_buf(out, "POS_MIN"); + else if (!bpos_cmp(pos, POS_MAX)) + pr_buf(out, "POS_MAX"); ++ else if (!bpos_cmp(pos, SPOS_MAX)) ++ pr_buf(out, "SPOS_MAX"); + else { + if (pos.inode == U64_MAX) + pr_buf(out, "U64_MAX"); +-- +cgit v1.2.3 + + +From 28b1a6378769d4e3a169c00d2096dcb251b5e49e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 24 Aug 2021 16:54:36 -0400 +Subject: bcachefs: Ensure iter->real_pos is consistent with key returned + +iter->real_pos needs to match the key returned or bad things will happen +when we go to update the key at that position. When we returned a +pending update from btree_trans_peek_updates(), this wasn't necessarily +the case. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 84 +++++++++++++++++++++++++----------------------- + 1 file changed, 43 insertions(+), 41 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c913ca0777dc..2ed4ff414d3e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1714,6 +1714,7 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + { ++ struct btree_iter_level *l = &iter->l[0]; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_i *next_update; + struct bkey_s_c k; +@@ -1727,39 +1728,47 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + btree_iter_set_search_pos(iter, search_key); + + ret = btree_iter_traverse(iter); +- if (unlikely(ret)) +- return bkey_s_c_err(ret); ++ if (unlikely(ret)) { ++ /* ensure that iter->k is consistent with iter->pos: */ ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ goto out; ++ } + +- /* +- * btree_iter_level_peek() mutates iter->real_pos, which +- * btree_trans_peek_updates() checks against, so we have to call +- * them in this order: +- */ + next_update = btree_trans_peek_updates(iter); +- k = btree_iter_level_peek(iter, &iter->l[0]); ++ k = btree_iter_level_peek_all(iter, l); ++ ++ /* * In the btree, deleted keys sort before non deleted: */ ++ if (k.k && bkey_deleted(k.k) && ++ (!next_update || ++ bpos_cmp(k.k->p, next_update->k.p) <= 0)) { ++ search_key = k.k->p; ++ continue; ++ } ++ + if (next_update && +- bpos_cmp(next_update->k.p, iter->real_pos) <= 0) { ++ bpos_cmp(next_update->k.p, ++ k.k ? k.k->p : l->b->key.k.p) <= 0) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } + + if (likely(k.k)) { +- if (bkey_deleted(k.k)) { +- search_key = bkey_successor(iter, k.k->p); +- continue; +- } +- +- break; +- } +- +- if (unlikely(!bpos_cmp(iter->l[0].b->key.k.p, SPOS_MAX))) { ++ if (likely(!bkey_deleted(k.k))) ++ break; ++ ++ /* Advance to next key: */ ++ search_key = bkey_successor(iter, k.k->p); ++ } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { ++ /* Advance to next leaf node: */ ++ search_key = bpos_successor(l->b->key.k.p); ++ } else { ++ /* End of btree: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); ++ iter->real_pos = SPOS_MAX; + k = bkey_s_c_null; + goto out; + } +- +- /* Advance to next leaf node: */ +- search_key = bpos_successor(iter->l[0].b->key.k.p); + } + + /* +@@ -1770,11 +1779,11 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + iter->pos = k.k->p; + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); +- ++ iter->real_pos = k.k->p; + out: ++ iter->should_be_locked = true; + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); +- iter->should_be_locked = true; + return k; + } + +@@ -1811,8 +1820,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + + ret = btree_iter_traverse(iter); + if (unlikely(ret)) { ++ /* ensure that iter->k is consistent with iter->pos: */ ++ bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); +- goto no_key; ++ goto out; + } + + k = btree_iter_level_peek(iter, l); +@@ -1822,17 +1833,17 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + : bkey_cmp(k.k->p, iter->pos) > 0)) + k = btree_iter_level_prev(iter, l); + +- if (likely(k.k)) ++ if (likely(k.k)) { + break; +- +- if (unlikely(!bpos_cmp(iter->l[0].b->data->min_key, POS_MIN))) 
{ ++ } else if (likely(bpos_cmp(l->b->data->min_key, POS_MIN))) { ++ /* Advance to previous leaf node: */ ++ search_key = bpos_predecessor(l->b->data->min_key); ++ } else { ++ /* Start of btree: */ + bch2_btree_iter_set_pos(iter, POS_MIN); + k = bkey_s_c_null; +- goto no_key; ++ goto out; + } +- +- /* Advance to previous leaf node: */ +- search_key = bpos_predecessor(iter->l[0].b->data->min_key); + } + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); +@@ -1841,19 +1852,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + if (bkey_cmp(k.k->p, iter->pos) < 0) + iter->pos = k.k->p; + out: ++ iter->should_be_locked = true; + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); +- iter->should_be_locked = true; + return k; +-no_key: +- /* +- * btree_iter_level_peek() may have set iter->k to a key we didn't want, and +- * then we errored going to the previous leaf - make sure it's +- * consistent with iter->pos: +- */ +- bkey_init(&iter->k); +- iter->k.p = iter->pos; +- goto out; + } + + /** +-- +cgit v1.2.3 + + +From c7a700f6ba7e8058f1fbf2baeec3270e548becb9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 24 Aug 2021 21:26:43 -0400 +Subject: bcachefs: bch2_dump_trans_iters_updates() + +This factors out bch2_dump_trans_iters_updates() from the iter alloc +overflow path, and makes some small improvements to what it prints. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 29 ++++++++++++++--------------- + fs/bcachefs/btree_iter.h | 2 ++ + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update_leaf.c | 3 ++- + 4 files changed, 19 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 2ed4ff414d3e..5d7fe953bd16 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2256,32 +2256,29 @@ int bch2_trans_iter_free(struct btree_trans *trans, + } + + noinline __cold +-static void btree_trans_iter_alloc_fail(struct btree_trans *trans) ++void bch2_dump_trans_iters_updates(struct btree_trans *trans) + { +- + struct btree_iter *iter; + struct btree_insert_entry *i; +- char buf[100]; ++ char buf1[300], buf2[100]; + + btree_trans_sort_iters(trans); + + trans_for_each_iter_inorder(trans, iter) +- printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n", ++ printk(KERN_ERR "iter: btree %s pos %s real_pos %s%s%s%s %pS\n", + bch2_btree_ids[iter->btree_id], +- (bch2_bpos_to_text(&PBUF(buf), iter->real_pos), buf), ++ (bch2_bpos_to_text(&PBUF(buf1), iter->pos), buf1), ++ (bch2_bpos_to_text(&PBUF(buf2), iter->real_pos), buf2), + btree_iter_live(trans, iter) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", + (void *) iter->ip_allocated); + +- trans_for_each_update(trans, i) { +- char buf[300]; +- +- bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)); +- printk(KERN_ERR "update: btree %s %s\n", +- bch2_btree_ids[i->iter->btree_id], buf); +- } +- panic("trans iter oveflow\n"); ++ trans_for_each_update(trans, i) ++ printk(KERN_ERR "update: btree %s %s %pS\n", ++ bch2_btree_ids[i->btree_id], ++ (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, bkey_i_to_s_c(i->k)), buf1), ++ (void *) i->ip_allocated); + } + + static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, +@@ -2291,8 +2288,10 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, + unsigned idx; + + if (unlikely(trans->iters_linked == +- ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) +- btree_trans_iter_alloc_fail(trans); ++ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { ++ bch2_dump_trans_iters_updates(trans); ++ panic("trans iter oveflow\n"); ++ } + + idx = __ffs64(~trans->iters_linked); + iter = &trans->iters[idx]; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 39124e68e488..fc7172a8f3f9 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -289,6 +289,8 @@ static inline int bkey_err(struct bkey_s_c k) + + /* new multiple iterator interface: */ + ++void bch2_dump_trans_iters_updates(struct btree_trans *); ++ + int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); + int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index a1e5debf19f3..0a59e4b6e7a7 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -348,6 +348,7 @@ struct btree_insert_entry { + unsigned trans_triggers_run:1; + struct bkey_i *k; + struct btree_iter *iter; ++ unsigned long ip_allocated; + }; + + #ifndef CONFIG_LOCKDEP +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index bfb568025a2a..91524da9404c 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -991,7 +991,8 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + .btree_id = iter->btree_id, + .level = iter->level, + .iter = iter, +- .k = k ++ .k = k, ++ .ip_allocated = _RET_IP_, + }; + bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0; + int ret = 0; +-- +cgit v1.2.3 + + +From 3ad5a21c6f6b05c5a38e50dd89b03bb203be3b0e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 24 Aug 2021 21:30:06 -0400 +Subject: bcachefs: Reduce iter->trans usage + +Disfavoured, and should go away. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/btree_iter.c | 160 ++++++++++++++++++++---------------- + fs/bcachefs/btree_iter.h | 27 +++--- + fs/bcachefs/btree_key_cache.c | 4 +- + fs/bcachefs/btree_locking.h | 17 ++-- + fs/bcachefs/btree_update.h | 5 +- + fs/bcachefs/btree_update_interior.c | 146 ++++++++++++++++---------------- + fs/bcachefs/btree_update_interior.h | 14 ---- + fs/bcachefs/btree_update_leaf.c | 27 +++--- + fs/bcachefs/ec.c | 10 +-- + fs/bcachefs/extent_update.c | 22 ++--- + fs/bcachefs/extent_update.h | 8 +- + fs/bcachefs/fs-io.c | 2 +- + fs/bcachefs/io.c | 2 +- + 14 files changed, 226 insertions(+), 220 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 40fa0111a3f6..455c24734a2b 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -504,7 +504,7 @@ void bch2_btree_init_next(struct btree_trans *trans, + bch2_btree_build_aux_trees(b); + + if (iter && reinit_iter) +- bch2_btree_iter_reinit_node(iter, b); ++ bch2_btree_iter_reinit_node(trans, iter, b); + } + + static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 5d7fe953bd16..89c061e74b45 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -20,10 +20,11 @@ + static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); + static void btree_trans_sort_iters(struct btree_trans *); + static void btree_iter_check_sort(struct btree_trans *, struct btree_iter *); +-static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long); ++static struct btree_iter *btree_iter_child_alloc(struct btree_trans *, ++ struct btree_iter *, unsigned long); + static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *, + struct btree_iter *); +-static void btree_iter_copy(struct btree_iter *, struct btree_iter *); ++static void btree_iter_copy(struct btree_trans *, struct btree_iter *, struct btree_iter *); + + static inline int btree_iter_cmp(const struct btree_iter *l, + const struct btree_iter *r) +@@ -101,19 +102,21 @@ static inline bool btree_iter_pos_in_node(struct btree_iter *iter, + + /* Btree node locking: */ + +-void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) ++void bch2_btree_node_unlock_write(struct btree_trans *trans, ++ struct btree_iter *iter, struct btree *b) + { +- bch2_btree_node_unlock_write_inlined(b, iter); ++ bch2_btree_node_unlock_write_inlined(trans, iter, b); + } + +-void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) ++void __bch2_btree_node_lock_write(struct btree_trans *trans, ++ struct btree_iter *iter, struct btree *b) + { + struct btree_iter *linked; + unsigned readers = 0; + + EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); + +- trans_for_each_iter(iter->trans, linked) ++ trans_for_each_iter(trans, linked) + if (linked->l[b->c.level].b == b && + btree_node_read_locked(linked, b->c.level)) + readers++; +@@ -130,7 +133,7 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) + else + this_cpu_sub(*b->c.lock.readers, readers); + +- btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); ++ btree_node_lock_type(trans->c, b, SIX_LOCK_write); + + if (!b->c.lock.readers) + atomic64_add(__SIX_VAL(read_lock, readers), +@@ -192,8 +195,9 @@ success: + return true; + } + +-static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, +- unsigned long trace_ip) ++static inline bool 
btree_iter_get_locks(struct btree_trans *trans, ++ struct btree_iter *iter, ++ bool upgrade, unsigned long trace_ip) + { + unsigned l = iter->level; + int fail_idx = -1; +@@ -207,7 +211,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, + : bch2_btree_node_relock(iter, l))) { + (upgrade + ? trace_node_upgrade_fail +- : trace_node_relock_fail)(iter->trans->ip, trace_ip, ++ : trace_node_relock_fail)(trans->ip, trace_ip, + btree_iter_type(iter) == BTREE_ITER_CACHED, + iter->btree_id, &iter->real_pos, + l, iter->l[l].lock_seq, +@@ -238,7 +242,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) + iter->uptodate = BTREE_ITER_NEED_PEEK; + +- bch2_btree_trans_verify_locks(iter->trans); ++ bch2_btree_trans_verify_locks(trans); + + return iter->uptodate < BTREE_ITER_NEED_RELOCK; + } +@@ -365,11 +369,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + /* Btree iterator locking: */ + + #ifdef CONFIG_BCACHEFS_DEBUG +-static void bch2_btree_iter_verify_locks(struct btree_iter *iter) ++static void bch2_btree_iter_verify_locks(struct btree_trans *trans, ++ struct btree_iter *iter) + { + unsigned l; + +- if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { ++ if (!(trans->iters_linked & (1ULL << iter->idx))) { + BUG_ON(iter->nodes_locked); + return; + } +@@ -389,10 +394,11 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) +- bch2_btree_iter_verify_locks(iter); ++ bch2_btree_iter_verify_locks(trans, iter); + } + #else +-static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} ++static inline void bch2_btree_iter_verify_locks(struct btree_trans *trans, ++ struct btree_iter *iter) {} + #endif + + /* +@@ -400,13 +406,14 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} + */ + bool bch2_btree_iter_relock_intent(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; + unsigned l; + + for (l = iter->level; + l < iter->locks_want && btree_iter_node(iter, l); + l++) { + if (!bch2_btree_node_relock(iter, l)) { +- trace_node_relock_fail(iter->trans->ip, _RET_IP_, ++ trace_node_relock_fail(trans->ip, _RET_IP_, + btree_iter_type(iter) == BTREE_ITER_CACHED, + iter->btree_id, &iter->real_pos, + l, iter->l[l].lock_seq, +@@ -417,7 +424,7 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) + ? 
iter->l[l].b->c.lock.state.seq + : 0); + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- btree_trans_restart(iter->trans); ++ btree_trans_restart(trans); + return false; + } + } +@@ -426,25 +433,27 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) + } + + __flatten +-bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) ++static bool bch2_btree_iter_relock(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned long trace_ip) + { +- bool ret = btree_iter_get_locks(iter, false, trace_ip); ++ bool ret = btree_iter_get_locks(trans, iter, false, trace_ip); + + if (!ret) +- btree_trans_restart(iter->trans); ++ btree_trans_restart(trans); + return ret; + } + + bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + unsigned new_locks_want) + { ++ struct btree_trans *trans = iter->trans; + struct btree_iter *linked; + + EBUG_ON(iter->locks_want >= new_locks_want); + + iter->locks_want = new_locks_want; + +- if (btree_iter_get_locks(iter, true, _THIS_IP_)) ++ if (btree_iter_get_locks(trans, iter, true, _THIS_IP_)) + return true; + + /* +@@ -466,17 +475,17 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + * before interior nodes - now that's handled by + * bch2_btree_iter_traverse_all(). + */ +- trans_for_each_iter(iter->trans, linked) ++ trans_for_each_iter(trans, linked) + if (linked != iter && + btree_iter_type(linked) == btree_iter_type(iter) && + linked->btree_id == iter->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; +- btree_iter_get_locks(linked, true, _THIS_IP_); ++ btree_iter_get_locks(trans, linked, true, _THIS_IP_); + } + + if (iter->should_be_locked) +- btree_trans_restart(iter->trans); ++ btree_trans_restart(trans); + return false; + } + +@@ -530,7 +539,7 @@ bool bch2_trans_relock(struct btree_trans *trans) + + trans_for_each_iter(trans, iter) + if (btree_iter_should_be_locked(iter) && +- !bch2_btree_iter_relock(iter, _RET_IP_)) { ++ !bch2_btree_iter_relock(trans, iter, _RET_IP_)) { + trace_trans_restart_relock(trans->ip, _RET_IP_, + iter->btree_id, &iter->real_pos); + BUG_ON(!trans->restarted); +@@ -690,7 +699,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + bch2_btree_iter_verify_level(iter, i); + } + +- bch2_btree_iter_verify_locks(iter); ++ bch2_btree_iter_verify_locks(trans, iter); + } + + static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +@@ -757,13 +766,14 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } + +-void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, ++void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, ++ struct btree_iter *iter, + struct btree *b, + struct bkey_packed *where) + { + struct btree_iter *linked; + +- trans_for_each_iter_with_node(iter->trans, b, linked) { ++ trans_for_each_iter_with_node(trans, b, linked) { + __bch2_btree_iter_fix_key_modified(linked, b, where); + bch2_btree_iter_verify_level(linked, b->c.level); + } +@@ -867,7 +877,8 @@ fixup_done: + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } + +-void bch2_btree_node_iter_fix(struct btree_iter *iter, ++void bch2_btree_node_iter_fix(struct btree_trans *trans, ++ struct btree_iter *iter, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_packed *where, +@@ -885,7 +896,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, + bch2_btree_node_iter_verify(node_iter, b); + } + +- trans_for_each_iter_with_node(iter->trans, b, 
linked) { ++ trans_for_each_iter_with_node(trans, b, linked) { + __bch2_btree_node_iter_fix(linked, b, + &linked->l[b->c.level].iter, t, + where, clobber_u64s, new_u64s); +@@ -1057,12 +1068,13 @@ static inline void btree_iter_node_set(struct btree_iter *iter, + * A btree node is being replaced - update the iterator to point to the new + * node: + */ +-void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) ++void bch2_btree_iter_node_replace(struct btree_trans *trans, ++ struct btree_iter *iter, struct btree *b) + { + enum btree_node_locked_type t; + struct btree_iter *linked; + +- trans_for_each_iter(iter->trans, linked) ++ trans_for_each_iter(trans, linked) + if (btree_iter_type(linked) != BTREE_ITER_CACHED && + btree_iter_pos_in_node(linked, b)) { + /* +@@ -1082,12 +1094,13 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) + } + } + +-void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) ++void bch2_btree_iter_node_drop(struct btree_trans *trans, ++ struct btree_iter *iter, struct btree *b) + { + struct btree_iter *linked; + unsigned level = b->c.level; + +- trans_for_each_iter(iter->trans, linked) ++ trans_for_each_iter(trans, linked) + if (linked->l[level].b == b) { + btree_node_unlock(linked, level); + linked->l[level].b = BTREE_ITER_NO_NODE_DROP; +@@ -1098,11 +1111,12 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) + * A btree node has been modified in such a way as to invalidate iterators - fix + * them: + */ +-void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) ++void bch2_btree_iter_reinit_node(struct btree_trans *trans, ++ struct btree_iter *iter, struct btree *b) + { + struct btree_iter *linked; + +- trans_for_each_iter_with_node(iter->trans, b, linked) ++ trans_for_each_iter_with_node(trans, b, linked) + __btree_iter_init(linked, b->c.level); + } + +@@ -1172,9 +1186,9 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, + } + + noinline +-static int btree_iter_prefetch(struct btree_iter *iter) ++static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *iter) + { +- struct bch_fs *c = iter->trans->c; ++ struct bch_fs *c = trans->c; + struct btree_iter_level *l = &iter->l[iter->level]; + struct btree_node_iter node_iter = l->iter; + struct bkey_packed *k; +@@ -1260,19 +1274,20 @@ static __always_inline int btree_iter_down(struct btree_trans *trans, + btree_node_mem_ptr_set(iter, level + 1, b); + + if (iter->flags & BTREE_ITER_PREFETCH) +- ret = btree_iter_prefetch(iter); ++ ret = btree_iter_prefetch(trans, iter); + + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); + iter->level = level; + +- bch2_btree_iter_verify_locks(iter); ++ bch2_btree_iter_verify_locks(trans, iter); + err: + bch2_bkey_buf_exit(&tmp, c); + return ret; + } + +-static int btree_iter_traverse_one(struct btree_iter *, unsigned long); ++static int btree_iter_traverse_one(struct btree_trans *, ++ struct btree_iter *, unsigned long); + + static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, + unsigned long trace_ip) +@@ -1329,7 +1344,7 @@ retry_all: + trans_for_each_iter_inorder(trans, iter) { + EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + +- ret = btree_iter_traverse_one(iter, _THIS_IP_); ++ ret = btree_iter_traverse_one(trans, iter, _THIS_IP_); + if (ret) + goto retry_all; + +@@ -1395,10 +1410,10 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, + * On error, caller 
(peek_node()/peek_key()) must return NULL; the error is + * stashed in the iterator and returned from bch2_trans_exit(). + */ +-static int btree_iter_traverse_one(struct btree_iter *iter, ++static int btree_iter_traverse_one(struct btree_trans *trans, ++ struct btree_iter *iter, + unsigned long trace_ip) + { +- struct btree_trans *trans = iter->trans; + unsigned l, depth_want = iter->level; + int ret = 0; + +@@ -1407,7 +1422,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, + * and re-traverse the iterator without a transaction restart: + */ + if (iter->should_be_locked) { +- ret = bch2_btree_iter_relock(iter, trace_ip) ? 0 : -EINTR; ++ ret = bch2_btree_iter_relock(trans, iter, trace_ip) ? 0 : -EINTR; + goto out; + } + +@@ -1483,7 +1498,7 @@ static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) + int ret; + + ret = bch2_trans_cond_resched(trans) ?: +- btree_iter_traverse_one(iter, _RET_IP_); ++ btree_iter_traverse_one(trans, iter, _RET_IP_); + if (unlikely(ret) && hweight64(trans->iters_linked) == 1) { + ret = __btree_iter_traverse_all(trans, ret, _RET_IP_); + BUG_ON(ret == -EINTR); +@@ -1612,13 +1627,14 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + + static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) + { ++ struct btree_trans *trans = iter->trans; + #ifdef CONFIG_BCACHEFS_DEBUG + struct bpos old_pos = iter->real_pos; + #endif + int cmp = bpos_cmp(new_pos, iter->real_pos); + unsigned l = iter->level; + +- EBUG_ON(iter->trans->restarted); ++ EBUG_ON(trans->restarted); + + if (!cmp) + goto out; +@@ -1626,7 +1642,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p + iter->real_pos = new_pos; + iter->should_be_locked = false; + +- btree_iter_check_sort(iter->trans, iter); ++ btree_iter_check_sort(trans, iter); + + if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { + btree_node_unlock(iter, 0); +@@ -1660,7 +1676,7 @@ out: + + bch2_btree_iter_verify(iter); + #ifdef CONFIG_BCACHEFS_DEBUG +- trace_iter_set_search_pos(iter->trans->ip, _RET_IP_, ++ trace_iter_set_search_pos(trans->ip, _RET_IP_, + iter->btree_id, + &old_pos, &new_pos, l); + #endif +@@ -1690,14 +1706,15 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + return ret; + } + +-static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) ++static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, ++ struct btree_iter *iter) + { + struct btree_insert_entry *i; + + if (!(iter->flags & BTREE_ITER_WITH_UPDATES)) + return NULL; + +- trans_for_each_update(iter->trans, i) ++ trans_for_each_update(trans, i) + if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: + bpos_cmp(iter->real_pos, i->k->k.p)) <= 0) { + if (iter->btree_id == i->iter->btree_id) +@@ -1714,6 +1731,7 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; + struct btree_iter_level *l = &iter->l[0]; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_i *next_update; +@@ -1735,7 +1753,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + goto out; + } + +- next_update = btree_trans_peek_updates(iter); ++ next_update = btree_trans_peek_updates(trans, iter); + k = btree_iter_level_peek_all(iter, l); + + /* * In the btree, deleted keys sort before non deleted: */ +@@ -1872,6 +1890,7 @@ struct bkey_s_c 
bch2_btree_iter_prev(struct btree_iter *iter) + + struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; + struct bpos search_key; + struct bkey_s_c k; + int ret; +@@ -1902,7 +1921,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + struct bkey_i *next_update; + struct bkey_cached *ck; + +- next_update = btree_trans_peek_updates(iter); ++ next_update = btree_trans_peek_updates(trans, iter); + + switch (btree_iter_type(iter)) { + case BTREE_ITER_KEYS: +@@ -1929,9 +1948,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + } else { + if ((iter->flags & BTREE_ITER_INTENT)) { + struct btree_iter *child = +- btree_iter_child_alloc(iter, _THIS_IP_); ++ btree_iter_child_alloc(trans, iter, _THIS_IP_); + +- btree_iter_copy(child, iter); ++ btree_iter_copy(trans, child, iter); + k = bch2_btree_iter_peek(child); + + if (k.k && !bkey_err(k)) +@@ -2165,21 +2184,21 @@ static inline void btree_iter_list_add(struct btree_trans *trans, + btree_trans_verify_sorted_refs(trans); + } + +-static void btree_iter_child_free(struct btree_iter *iter) ++static void btree_iter_child_free(struct btree_trans *trans, struct btree_iter *iter) + { +- struct btree_iter *child = btree_iter_child(iter); ++ struct btree_iter *child = btree_iter_child(trans, iter); + + if (child) { +- bch2_trans_iter_free(iter->trans, child); ++ bch2_trans_iter_free(trans, child); + iter->child_idx = U8_MAX; + } + } + +-static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, ++static struct btree_iter *btree_iter_child_alloc(struct btree_trans *trans, ++ struct btree_iter *iter, + unsigned long ip) + { +- struct btree_trans *trans = iter->trans; +- struct btree_iter *child = btree_iter_child(iter); ++ struct btree_iter *child = btree_iter_child(trans, iter); + + if (!child) { + child = btree_trans_iter_alloc(trans, iter); +@@ -2196,7 +2215,7 @@ static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, + static inline void __bch2_trans_iter_free(struct btree_trans *trans, + unsigned idx) + { +- btree_iter_child_free(&trans->iters[idx]); ++ btree_iter_child_free(trans, &trans->iters[idx]); + + btree_iter_list_remove(trans, &trans->iters[idx]); + +@@ -2309,12 +2328,13 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, + return iter; + } + +-static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) ++static void btree_iter_copy(struct btree_trans *trans, struct btree_iter *dst, ++ struct btree_iter *src) + { + unsigned i; + + __bch2_btree_iter_unlock(dst); +- btree_iter_child_free(dst); ++ btree_iter_child_free(trans, dst); + + memcpy(&dst->flags, &src->flags, + sizeof(struct btree_iter) - offsetof(struct btree_iter, flags)); +@@ -2327,7 +2347,7 @@ static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) + dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; + +- btree_iter_check_sort(dst->trans, dst); ++ btree_iter_check_sort(trans, dst); + } + + struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, +@@ -2385,7 +2405,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + bch2_btree_iter_init(trans, iter, btree_id); + } else if (btree_iter_keep(trans, best)) { + iter = btree_trans_iter_alloc(trans, best); +- btree_iter_copy(iter, best); ++ btree_iter_copy(trans, iter, best); + } else { + iter = best; + } +@@ -2408,7 +2428,7 @@ struct btree_iter *__bch2_trans_get_iter(struct 
btree_trans *trans, + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > iter->locks_want) { + iter->locks_want = locks_want; +- btree_iter_get_locks(iter, true, _THIS_IP_); ++ btree_iter_get_locks(trans, iter, true, _THIS_IP_); + } + + while (iter->level != depth) { +@@ -2461,12 +2481,12 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + } + + struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, +- struct btree_iter *src) ++ struct btree_iter *src) + { + struct btree_iter *iter; + + iter = btree_trans_iter_alloc(trans, src); +- btree_iter_copy(iter, src); ++ btree_iter_copy(trans, iter, src); + + trans->iters_live |= 1ULL << iter->idx; + /* +@@ -2643,7 +2663,7 @@ int bch2_trans_exit(struct btree_trans *trans) + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) +- btree_iter_child_free(iter); ++ btree_iter_child_free(trans, iter); + } + + if (trans->iters_live) { +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index fc7172a8f3f9..6dad6f1a2d9b 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -129,14 +129,13 @@ static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, + static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} + #endif + +-void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, +- struct bkey_packed *); +-void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, +- struct btree_node_iter *, struct bkey_packed *, +- unsigned, unsigned); ++void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, struct btree_iter *, ++ struct btree *, struct bkey_packed *); ++void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_iter *, ++ struct btree *, struct btree_node_iter *, ++ struct bkey_packed *, unsigned, unsigned); + + bool bch2_btree_iter_relock_intent(struct btree_iter *); +-bool bch2_btree_iter_relock(struct btree_iter *, unsigned long); + + bool bch2_trans_relock(struct btree_trans *); + void bch2_trans_unlock(struct btree_trans *); +@@ -173,10 +172,13 @@ static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) + + void bch2_trans_downgrade(struct btree_trans *); + +-void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); +-void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); ++void bch2_btree_iter_node_replace(struct btree_trans *trans, ++ struct btree_iter *, struct btree *); ++void bch2_btree_iter_node_drop(struct btree_trans *, ++ struct btree_iter *, struct btree *); + +-void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); ++void bch2_btree_iter_reinit_node(struct btree_trans *, ++ struct btree_iter *, struct btree *); + + int __must_check bch2_btree_iter_traverse(struct btree_iter *); + +@@ -220,9 +222,10 @@ static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, un + return idx != U8_MAX ? 
trans->iters + idx : NULL; + } + +-static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) ++static inline struct btree_iter *btree_iter_child(struct btree_trans *trans, ++ struct btree_iter *iter) + { +- return idx_to_btree_iter(iter->trans, iter->child_idx); ++ return idx_to_btree_iter(trans, iter->child_idx); + } + + /* +@@ -313,7 +316,7 @@ bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, + } + + struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, +- struct btree_iter *); ++ struct btree_iter *); + static inline struct btree_iter * + bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) + { +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index e327ef39d432..6bc20813d00d 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -237,7 +237,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, + * XXX: not allowed to be holding read locks when we take a write lock, + * currently + */ +- bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); ++ bch2_btree_node_lock_write(trans, ck_iter, ck_iter->l[0].b); + if (new_k) { + kfree(ck->k); + ck->u64s = new_u64s; +@@ -246,7 +246,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, + + bkey_reassemble(ck->k, k); + ck->valid = true; +- bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); ++ bch2_btree_node_unlock_write(trans, ck_iter, ck_iter->l[0].b); + + /* We're not likely to need this iterator again: */ + set_btree_iter_dontneed(trans, iter); +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index 7532bcdef967..869c498e3f78 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -208,30 +208,35 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, + * succeed: + */ + static inline void +-bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) ++bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_iter *iter, ++ struct btree *b) + { + struct btree_iter *linked; + + EBUG_ON(iter->l[b->c.level].b != b); + EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + +- trans_for_each_iter_with_node(iter->trans, b, linked) ++ trans_for_each_iter_with_node(trans, b, linked) + linked->l[b->c.level].lock_seq += 2; + + six_unlock_write(&b->c.lock); + } + +-void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); ++void bch2_btree_node_unlock_write(struct btree_trans *, ++ struct btree_iter *, struct btree *); + +-void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); ++void __bch2_btree_node_lock_write(struct btree_trans *, ++ struct btree_iter *, struct btree *); + +-static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) ++static inline void bch2_btree_node_lock_write(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b) + { + EBUG_ON(iter->l[b->c.level].b != b); + EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); + + if (unlikely(!six_trylock_write(&b->c.lock))) +- __bch2_btree_node_lock_write(b, iter); ++ __bch2_btree_node_lock_write(trans, iter, b); + } + + #endif /* _BCACHEFS_BTREE_LOCKING_H */ +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 217b52e1a168..5707baf10262 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -10,8 +10,9 @@ struct btree; + + void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_iter *, + 
struct btree *); +-bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, +- struct btree_node_iter *, struct bkey_i *); ++bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_iter *, ++ struct btree *, struct btree_node_iter *, ++ struct bkey_i *); + void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); + + enum btree_insert_flags { +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index c8c3382f48c7..3d7c47712b74 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -25,6 +25,7 @@ + static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, + struct btree_iter *, struct btree *, + struct keylist *, unsigned); ++static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); + + /* Debug code: */ + +@@ -159,27 +160,14 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) + mutex_unlock(&c->btree_cache.lock); + } + +-void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) +-{ +- struct open_buckets ob = b->ob; +- +- b->ob.nr = 0; +- +- clear_btree_node_dirty(c, b); +- +- btree_node_lock_type(c, b, SIX_LOCK_write); +- __btree_node_free(c, b); +- six_unlock_write(&b->c.lock); +- +- bch2_open_buckets_put(c, &ob); +-} +- +-void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, +- struct btree_iter *iter) ++static void bch2_btree_node_free_inmem(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b) + { ++ struct bch_fs *c = trans->c; + struct btree_iter *linked; + +- trans_for_each_iter(iter->trans, linked) ++ trans_for_each_iter(trans, linked) + BUG_ON(linked->l[b->c.level].b == b); + + six_lock_write(&b->c.lock, NULL, NULL); +@@ -773,7 +761,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) + * And it adds @b to the list of @as's new nodes, so that we can update sector + * counts in bch2_btree_update_nodes_written: + */ +-void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) ++static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) + { + struct bch_fs *c = as->c; + +@@ -827,7 +815,7 @@ found: + closure_put(&as->cl); + } + +-void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) ++static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) + { + while (b->ob.nr) + as->open_buckets[as->nr_open_buckets++] = +@@ -839,7 +827,7 @@ void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b + * nodes and thus outstanding btree_updates - redirect @b's + * btree_updates to point to this btree_update: + */ +-void bch2_btree_interior_update_will_free_node(struct btree_update *as, ++static void bch2_btree_interior_update_will_free_node(struct btree_update *as, + struct btree *b) + { + struct bch_fs *c = as->c; +@@ -911,7 +899,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, + as->nr_old_nodes++; + } + +-void bch2_btree_update_done(struct btree_update *as) ++static void bch2_btree_update_done(struct btree_update *as) + { + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + +@@ -925,11 +913,10 @@ void bch2_btree_update_done(struct btree_update *as) + as->c->btree_interior_update_worker); + } + +-struct btree_update * +-bch2_btree_update_start(struct btree_iter *iter, unsigned level, +- unsigned nr_nodes, unsigned flags) ++static struct btree_update * ++bch2_btree_update_start(struct 
btree_trans *trans, struct btree_iter *iter, ++ unsigned level, unsigned nr_nodes, unsigned flags) + { +- struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct btree_update *as; + struct closure cl; +@@ -1092,8 +1079,10 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) + * is nothing new to be done. This just guarantees that there is a + * journal write. + */ +-static void bch2_btree_set_root(struct btree_update *as, struct btree *b, +- struct btree_iter *iter) ++static void bch2_btree_set_root(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b) + { + struct bch_fs *c = as->c; + struct btree *old; +@@ -1108,7 +1097,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, + * Ensure no one is using the old root while we switch to the + * new root: + */ +- bch2_btree_node_lock_write(old, iter); ++ bch2_btree_node_lock_write(trans, iter, old); + + bch2_btree_set_root_inmem(c, b); + +@@ -1121,15 +1110,17 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, + * an intent lock on the new root, and any updates that would + * depend on the new root would have to update the new root. + */ +- bch2_btree_node_unlock_write(old, iter); ++ bch2_btree_node_unlock_write(trans, iter, old); + } + + /* Interior node updates: */ + +-static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, ++static void bch2_insert_fixup_btree_ptr(struct btree_update *as, ++ struct btree_trans *trans, + struct btree_iter *iter, +- struct bkey_i *insert, +- struct btree_node_iter *node_iter) ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) + { + struct bch_fs *c = as->c; + struct bkey_packed *k; +@@ -1161,15 +1152,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b + bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) + bch2_btree_node_iter_advance(node_iter, b); + +- bch2_btree_bset_insert_key(iter, b, node_iter, insert); ++ bch2_btree_bset_insert_key(trans, iter, b, node_iter, insert); + set_btree_node_dirty(c, b); + set_btree_node_need_write(b); + } + + static void +-__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, +- struct btree_iter *iter, struct keylist *keys, +- struct btree_node_iter node_iter) ++__bch2_btree_insert_keys_interior(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter node_iter, ++ struct keylist *keys) + { + struct bkey_i *insert = bch2_keylist_front(keys); + struct bkey_packed *k; +@@ -1181,8 +1175,8 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + ; + + while (!bch2_keylist_empty(keys)) { +- bch2_insert_fixup_btree_ptr(as, b, iter, +- bch2_keylist_front(keys), &node_iter); ++ bch2_insert_fixup_btree_ptr(as, trans, iter, b, ++ &node_iter, bch2_keylist_front(keys)); + bch2_keylist_pop_front(keys); + } + } +@@ -1308,8 +1302,10 @@ static struct btree *__btree_split_node(struct btree_update *as, + * nodes that were coalesced, and thus in the middle of a child node post + * coalescing: + */ +-static void btree_split_insert_keys(struct btree_update *as, struct btree *b, ++static void btree_split_insert_keys(struct btree_update *as, ++ struct btree_trans *trans, + struct btree_iter *iter, ++ struct btree *b, + struct keylist *keys) + { + struct btree_node_iter node_iter; +@@ -1319,7 +1315,7 @@ static void btree_split_insert_keys(struct 
btree_update *as, struct btree *b, + + bch2_btree_node_iter_init(&node_iter, b, &k->k.p); + +- __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter); ++ __bch2_btree_insert_keys_interior(as, trans, iter, b, node_iter, keys); + + /* + * We can't tolerate whiteouts here - with whiteouts there can be +@@ -1368,7 +1364,7 @@ static void btree_split(struct btree_update *as, + bch2_btree_update_add_new_node(as, n1); + + if (keys) +- btree_split_insert_keys(as, n1, iter, keys); ++ btree_split_insert_keys(as, trans, iter, n1, keys); + + if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { + trace_btree_split(c, b); +@@ -1398,7 +1394,7 @@ static void btree_split(struct btree_update *as, + n3->sib_u64s[0] = U16_MAX; + n3->sib_u64s[1] = U16_MAX; + +- btree_split_insert_keys(as, n3, iter, &as->parent_keys); ++ btree_split_insert_keys(as, trans, iter, n3, &as->parent_keys); + + bch2_btree_node_write(c, n3, SIX_LOCK_intent); + } +@@ -1420,10 +1416,10 @@ static void btree_split(struct btree_update *as, + /* Split a non root node */ + bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); + } else if (n3) { +- bch2_btree_set_root(as, n3, iter); ++ bch2_btree_set_root(as, trans, iter, n3); + } else { + /* Root filled up but didn't need to be split */ +- bch2_btree_set_root(as, n1, iter); ++ bch2_btree_set_root(as, trans, iter, n1); + } + + bch2_btree_update_get_open_buckets(as, n1); +@@ -1435,12 +1431,12 @@ static void btree_split(struct btree_update *as, + /* Successful split, update the iterator to point to the new nodes: */ + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); +- bch2_btree_iter_node_drop(iter, b); ++ bch2_btree_iter_node_drop(trans, iter, b); + if (n3) +- bch2_btree_iter_node_replace(iter, n3); ++ bch2_btree_iter_node_replace(trans, iter, n3); + if (n2) +- bch2_btree_iter_node_replace(iter, n2); +- bch2_btree_iter_node_replace(iter, n1); ++ bch2_btree_iter_node_replace(trans, iter, n2); ++ bch2_btree_iter_node_replace(trans, iter, n1); + + /* + * The old node must be freed (in memory) _before_ unlocking the new +@@ -1448,7 +1444,7 @@ static void btree_split(struct btree_update *as, + * node after another thread has locked and updated the new node, thus + * seeing stale data: + */ +- bch2_btree_node_free_inmem(c, b, iter); ++ bch2_btree_node_free_inmem(trans, iter, b); + + if (n3) + six_unlock_intent(&n3->c.lock); +@@ -1463,19 +1459,23 @@ static void btree_split(struct btree_update *as, + } + + static void +-bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, +- struct btree_iter *iter, struct keylist *keys) ++bch2_btree_insert_keys_interior(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b, ++ struct keylist *keys) + { + struct btree_iter *linked; + +- __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter); ++ __bch2_btree_insert_keys_interior(as, trans, iter, b, ++ iter->l[b->c.level].iter, keys); + + btree_update_updated_node(as, b); + +- trans_for_each_iter_with_node(iter->trans, b, linked) ++ trans_for_each_iter_with_node(trans, b, linked) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); + +- bch2_btree_trans_verify_iters(iter->trans, b); ++ bch2_btree_trans_verify_iters(trans, b); + } + + /** +@@ -1509,13 +1509,13 @@ static void bch2_btree_insert_node(struct btree_update *as, + bch2_btree_node_lock_for_insert(trans, iter, b); + + if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { +- bch2_btree_node_unlock_write(b, iter); ++ 
bch2_btree_node_unlock_write(trans, iter, b); + goto split; + } + + btree_node_interior_verify(c, b); + +- bch2_btree_insert_keys_interior(as, b, iter, keys); ++ bch2_btree_insert_keys_interior(as, trans, iter, b, keys); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; +@@ -1527,9 +1527,9 @@ static void bch2_btree_insert_node(struct btree_update *as, + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) +- bch2_btree_iter_reinit_node(iter, b); ++ bch2_btree_iter_reinit_node(trans, iter, b); + +- bch2_btree_node_unlock_write(b, iter); ++ bch2_btree_node_unlock_write(trans, iter, b); + + btree_node_interior_verify(c, b); + return; +@@ -1547,7 +1547,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, + unsigned l; + int ret = 0; + +- as = bch2_btree_update_start(iter, iter->level, ++ as = bch2_btree_update_start(trans, iter, iter->level, + btree_update_reserve_required(c, b), flags); + if (IS_ERR(as)) + return PTR_ERR(as); +@@ -1660,7 +1660,7 @@ retry: + goto out; + + parent = btree_node_parent(iter, b); +- as = bch2_btree_update_start(iter, level, ++ as = bch2_btree_update_start(trans, iter, level, + btree_update_reserve_required(c, parent) + 1, + flags| + BTREE_INSERT_NOFAIL| +@@ -1702,15 +1702,15 @@ retry: + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); + six_lock_increment(&m->c.lock, SIX_LOCK_intent); +- bch2_btree_iter_node_drop(iter, b); +- bch2_btree_iter_node_drop(iter, m); ++ bch2_btree_iter_node_drop(trans, iter, b); ++ bch2_btree_iter_node_drop(trans, iter, m); + +- bch2_btree_iter_node_replace(iter, n); ++ bch2_btree_iter_node_replace(trans, iter, n); + + bch2_btree_trans_verify_iters(trans, n); + +- bch2_btree_node_free_inmem(c, b, iter); +- bch2_btree_node_free_inmem(c, m, iter); ++ bch2_btree_node_free_inmem(trans, iter, b); ++ bch2_btree_node_free_inmem(trans, iter, m); + + six_unlock_intent(&n->c.lock); + +@@ -1762,7 +1762,7 @@ retry: + goto out; + + parent = btree_node_parent(iter, b); +- as = bch2_btree_update_start(iter, b->c.level, ++ as = bch2_btree_update_start(trans, iter, b->c.level, + (parent + ? 
btree_update_reserve_required(c, parent) + : 0) + 1, +@@ -1792,15 +1792,15 @@ retry: + bch2_btree_insert_node(as, trans, iter, parent, + &as->parent_keys, flags); + } else { +- bch2_btree_set_root(as, n, iter); ++ bch2_btree_set_root(as, trans, iter, n); + } + + bch2_btree_update_get_open_buckets(as, n); + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); +- bch2_btree_iter_node_drop(iter, b); +- bch2_btree_iter_node_replace(iter, n); +- bch2_btree_node_free_inmem(c, b, iter); ++ bch2_btree_iter_node_drop(trans, iter, b); ++ bch2_btree_iter_node_replace(trans, iter, n); ++ bch2_btree_node_free_inmem(trans, iter, b); + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as); +@@ -1931,7 +1931,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + if (ret) + goto err; + +- bch2_btree_node_lock_write(b, iter); ++ bch2_btree_node_lock_write(trans, iter, b); + + if (new_hash) { + mutex_lock(&c->btree_cache.lock); +@@ -1946,7 +1946,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + bkey_copy(&b->key, new_key); + } + +- bch2_btree_node_unlock_write(b, iter); ++ bch2_btree_node_unlock_write(trans, iter, b); + out: + bch2_trans_iter_put(trans, iter2); + return ret; +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index e88e737ee813..07046dab614b 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -113,24 +113,10 @@ struct btree_update { + u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; + }; + +-void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, +- struct btree_iter *); +-void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); +- +-void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); +- + struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + struct btree *, + struct bkey_format); + +-void bch2_btree_update_done(struct btree_update *); +-struct btree_update * +-bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned); +- +-void bch2_btree_interior_update_will_free_node(struct btree_update *, +- struct btree *); +-void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); +- + int bch2_btree_split_leaf(struct btree_trans *, struct btree_iter *, unsigned); + + int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_iter *, +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 91524da9404c..daf8e73de90d 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -42,14 +42,14 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + +- bch2_btree_node_lock_write(b, iter); ++ bch2_btree_node_lock_write(trans, iter, b); + + if (btree_iter_type(iter) == BTREE_ITER_CACHED) + return; + + if (unlikely(btree_node_just_written(b)) && + bch2_btree_post_write_cleanup(c, b)) +- bch2_btree_iter_reinit_node(iter, b); ++ bch2_btree_iter_reinit_node(trans, iter, b); + + /* + * If the last bset has been written, or if it's gotten too big - start +@@ -62,7 +62,8 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + /* Inserting into a given leaf node (last stage of insert): */ + + /* Handle overwrites and do insert, for non extents: */ +-bool bch2_btree_bset_insert_key(struct btree_iter *iter, ++bool bch2_btree_bset_insert_key(struct btree_trans *trans, ++ struct btree_iter *iter, + struct btree *b, + struct btree_node_iter 
*node_iter, + struct bkey_i *insert) +@@ -76,7 +77,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(insert->k.u64s > +- bch_btree_keys_u64s_remaining(iter->trans->c, b)); ++ bch_btree_keys_u64s_remaining(trans->c, b)); + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + + k = bch2_btree_node_iter_peek_all(node_iter, b); +@@ -96,7 +97,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + k->type = KEY_TYPE_deleted; + + if (k->needs_whiteout) +- push_whiteout(iter->trans->c, b, insert->k.p); ++ push_whiteout(trans->c, b, insert->k.p); + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { +@@ -104,7 +105,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + bch2_bset_delete(b, k, clobber_u64s); + goto fix_iter; + } else { +- bch2_btree_iter_fix_key_modified(iter, b, k); ++ bch2_btree_iter_fix_key_modified(trans, iter, b, k); + } + + return true; +@@ -122,7 +123,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, + clobber_u64s = k->u64s; + goto overwrite; + } else { +- bch2_btree_iter_fix_key_modified(iter, b, k); ++ bch2_btree_iter_fix_key_modified(trans, iter, b, k); + } + } + +@@ -132,7 +133,7 @@ overwrite: + new_u64s = k->u64s; + fix_iter: + if (clobber_u64s != new_u64s) +- bch2_btree_node_iter_fix(iter, b, node_iter, k, ++ bch2_btree_node_iter_fix(trans, iter, b, node_iter, k, + clobber_u64s, new_u64s); + return true; + } +@@ -190,7 +191,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + EBUG_ON(!iter->level && + !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); + +- if (unlikely(!bch2_btree_bset_insert_key(iter, b, ++ if (unlikely(!bch2_btree_bset_insert_key(trans, iter, b, + &iter_l(iter)->iter, insert))) + return false; + +@@ -212,7 +213,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) +- bch2_btree_iter_reinit_node(iter, b); ++ bch2_btree_iter_reinit_node(trans, iter, b); + + trace_btree_insert_key(c, b, insert); + return true; +@@ -610,8 +611,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) +- bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, +- i->iter); ++ bch2_btree_node_unlock_write_inlined(trans, i->iter, ++ iter_l(i->iter)->b); + + if (!ret && trans->journal_pin) + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, +@@ -1157,7 +1158,7 @@ retry: + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete); + +- ret = bch2_extent_trim_atomic(&delete, iter); ++ ret = bch2_extent_trim_atomic(trans, iter, &delete); + if (ret) + break; + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 328e0429b5d7..12458a19949f 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -552,19 +552,19 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) + return 0; + } + +-static int ec_stripe_mem_alloc(struct bch_fs *c, ++static int ec_stripe_mem_alloc(struct btree_trans *trans, + struct btree_iter *iter) + { + size_t idx = iter->pos.offset; + int ret = 0; + +- if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) ++ if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) + return ret; + +- bch2_trans_unlock(iter->trans); ++ bch2_trans_unlock(trans); + ret = -EINTR; + +- if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) ++ if 
(!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL)) + return ret; + + return -ENOMEM; +@@ -735,7 +735,7 @@ retry: + found_slot: + start_pos = iter->pos; + +- ret = ec_stripe_mem_alloc(c, iter); ++ ret = ec_stripe_mem_alloc(&trans, iter); + if (ret) + goto err; + +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 4a8dd085f7fb..93d55f46233f 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -94,11 +94,11 @@ static int count_iters_for_insert(struct btree_trans *trans, + + #define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) + +-int bch2_extent_atomic_end(struct btree_iter *iter, ++int bch2_extent_atomic_end(struct btree_trans *trans, ++ struct btree_iter *iter, + struct bkey_i *insert, + struct bpos *end) + { +- struct btree_trans *trans = iter->trans; + struct btree_iter *copy; + struct bkey_s_c k; + unsigned nr_iters = 0; +@@ -153,27 +153,17 @@ int bch2_extent_atomic_end(struct btree_iter *iter, + return ret < 0 ? ret : 0; + } + +-int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) ++int bch2_extent_trim_atomic(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *k) + { + struct bpos end; + int ret; + +- ret = bch2_extent_atomic_end(iter, k, &end); ++ ret = bch2_extent_atomic_end(trans, iter, k, &end); + if (ret) + return ret; + + bch2_cut_back(end, k); + return 0; + } +- +-int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) +-{ +- struct bpos end; +- int ret; +- +- ret = bch2_extent_atomic_end(iter, k, &end); +- if (ret) +- return ret; +- +- return !bkey_cmp(end, k->k.p); +-} +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +index 2fa4602967e0..6f5cf449361a 100644 +--- a/fs/bcachefs/extent_update.h ++++ b/fs/bcachefs/extent_update.h +@@ -4,9 +4,9 @@ + + #include "bcachefs.h" + +-int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, +- struct bpos *); +-int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); +-int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); ++int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct bpos *); ++int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *); + + #endif /* _BCACHEFS_EXTENT_UPDATE_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 42927a9ea8e6..55bd5140a17d 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2589,7 +2589,7 @@ reassemble: + copy.k->k.p.offset += shift >> 9; + bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); + +- ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); ++ ret = bch2_extent_atomic_end(&trans, dst, copy.k, &atomic_end); + if (ret) + continue; + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index a4d659a0ddaa..823830ccaf45 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -270,7 +270,7 @@ int bch2_extent_update(struct btree_trans *trans, + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + +- ret = bch2_extent_trim_atomic(k, iter); ++ ret = bch2_extent_trim_atomic(trans, iter, k); + if (ret) + return ret; + +-- +cgit v1.2.3 + + +From effb735dd26bcee343ce01ad31a9acf354890b34 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 25 Aug 2021 01:03:25 -0400 +Subject: bcachefs: Refactor bch2_trans_update_extent() + +This consolidates the code for doing extent updates, and makes the btree +iterator usage a bit cleaner and more efficient. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 269 ++++++++++++++++++++-------------------- + 1 file changed, 132 insertions(+), 137 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index daf8e73de90d..fbd40b1bab8f 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -739,116 +739,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + +-static int extent_handle_overwrites(struct btree_trans *trans, +- struct btree_insert_entry *i) +-{ +- struct bch_fs *c = trans->c; +- struct btree_iter *iter, *update_iter; +- struct bpos start = bkey_start_pos(&i->k->k); +- struct bkey_i *update; +- struct bkey_s_c k; +- int ret = 0, compressed_sectors; +- +- iter = bch2_trans_get_iter(trans, i->btree_id, start, +- BTREE_ITER_INTENT| +- BTREE_ITER_WITH_UPDATES| +- BTREE_ITER_NOT_EXTENTS); +- k = bch2_btree_iter_peek(iter); +- if (!k.k || (ret = bkey_err(k))) +- goto out; +- +- if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) { +- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); +- if ((ret = PTR_ERR_OR_ZERO(update))) +- goto out; +- +- bkey_reassemble(update, k); +- +- if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) { +- update_iter = bch2_trans_copy_iter(trans, iter); +- ret = bch2_btree_delete_at(trans, update_iter, i->flags); +- bch2_trans_iter_put(trans, update_iter); +- +- if (ret) +- goto out; +- +- i->k = update; +- goto next; +- } +- } +- +- if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k))) +- goto next; +- +- while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) { +- /* +- * If we're going to be splitting a compressed extent, note it +- * so that __bch2_trans_commit() can increase our disk +- * reservation: +- */ +- if (bkey_cmp(bkey_start_pos(k.k), start) < 0 && +- bkey_cmp(k.k->p, i->k->k.p) > 0 && +- (compressed_sectors = bch2_bkey_sectors_compressed(k))) +- trans->extra_journal_res += compressed_sectors; +- +- if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { +- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); +- if ((ret = PTR_ERR_OR_ZERO(update))) +- goto out; +- +- bkey_reassemble(update, k); +- +- bch2_cut_back(start, update); +- +- update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, +- BTREE_ITER_NOT_EXTENTS| +- BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(update_iter); +- if (ret) { +- bch2_trans_iter_put(trans, update_iter); +- goto out; +- } +- +- bch2_trans_update(trans, update_iter, update, +- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| +- i->flags); +- bch2_trans_iter_put(trans, update_iter); +- } +- +- if (bkey_cmp(k.k->p, i->k->k.p) <= 0) { +- update_iter = bch2_trans_copy_iter(trans, iter); +- ret = bch2_btree_delete_at(trans, update_iter, +- i->flags); +- bch2_trans_iter_put(trans, update_iter); +- +- if (ret) +- goto out; +- } +- +- if (bkey_cmp(k.k->p, i->k->k.p) > 0) { +- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); +- if ((ret = PTR_ERR_OR_ZERO(update))) +- goto out; +- +- bkey_reassemble(update, k); +- bch2_cut_front(i->k->k.p, update); +- +- bch2_trans_update(trans, iter, update, i->flags); +- goto out; +- } +-next: +- k = bch2_btree_iter_next(iter); +- if (!k.k || (ret = bkey_err(k))) +- goto out; +- } +- +- bch2_bkey_merge(c, bkey_i_to_s(i->k), k); +-out: +- bch2_trans_iter_put(trans, iter); +- +- return ret; +-} +- + int __bch2_trans_commit(struct btree_trans *trans) + { + struct btree_insert_entry *i = NULL; +@@ -983,6 +873,133 @@ err: + goto retry; + } + ++static int 
bch2_trans_update_extent(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert, ++ enum btree_update_flags flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter, *update_iter; ++ struct bpos start = bkey_start_pos(&insert->k); ++ struct bkey_i *update; ++ struct bkey_s_c k; ++ enum btree_id btree_id = orig_iter->btree_id; ++ int ret = 0, compressed_sectors; ++ ++ orig_iter->pos_after_commit = insert->k.p; ++ orig_iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; ++ ++ iter = bch2_trans_get_iter(trans, btree_id, start, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_NOT_EXTENTS); ++ k = bch2_btree_iter_peek(iter); ++ if ((ret = bkey_err(k))) ++ goto err; ++ if (!k.k) ++ goto out; ++ ++ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ ++ if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ ret = bch2_btree_delete_at(trans, update_iter, flags); ++ bch2_trans_iter_put(trans, update_iter); ++ ++ if (ret) ++ goto err; ++ ++ insert = update; ++ goto next; ++ } ++ } ++ ++ if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k))) ++ goto next; ++ ++ while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { ++ /* ++ * If we're going to be splitting a compressed extent, note it ++ * so that __bch2_trans_commit() can increase our disk ++ * reservation: ++ */ ++ if (bkey_cmp(bkey_start_pos(k.k), start) < 0 && ++ bkey_cmp(k.k->p, insert->k.p) > 0 && ++ (compressed_sectors = bch2_bkey_sectors_compressed(k))) ++ trans->extra_journal_res += compressed_sectors; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ ++ bch2_cut_back(start, update); ++ ++ update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(update_iter) ?: ++ bch2_trans_update(trans, update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ bch2_trans_iter_put(trans, update_iter); ++ if (ret) ++ goto err; ++ } ++ ++ if (bkey_cmp(k.k->p, insert->k.p) <= 0) { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ ret = bch2_btree_delete_at(trans, update_iter, ++ flags); ++ bch2_trans_iter_put(trans, update_iter); ++ ++ if (ret) ++ goto err; ++ } ++ ++ if (bkey_cmp(k.k->p, insert->k.p) > 0) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ bch2_cut_front(insert->k.p, update); ++ ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ bch2_trans_update(trans, update_iter, update, flags); ++ bch2_trans_iter_put(trans, update_iter); ++ goto out; ++ } ++next: ++ k = bch2_btree_iter_next(iter); ++ if ((ret = bkey_err(k))) ++ goto err; ++ if (!k.k) ++ goto out; ++ } ++ ++ bch2_bkey_merge(c, bkey_i_to_s(insert), k); ++out: ++ if (!bkey_deleted(&insert->k)) { ++ bch2_btree_iter_set_pos(iter, insert->k.p); ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, insert, flags); ++ } else { ++ set_btree_iter_dontneed(trans, iter); ++ } ++err: ++ bch2_trans_iter_put(trans, iter); ++ ++ return ret; ++} ++ + int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags 
flags) + { +@@ -995,41 +1012,19 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + .k = k, + .ip_allocated = _RET_IP_, + }; +- bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0; +- int ret = 0; + +- BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + BUG_ON(!iter->should_be_locked); + ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return bch2_trans_update_extent(trans, iter, k, flags); ++ + #ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); + #endif +- +- if (is_extent) { +- ret = extent_handle_overwrites(trans, &n); +- if (ret) +- return ret; +- +- iter->pos_after_commit = k->k.p; +- iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; +- +- if (bkey_deleted(&n.k->k)) +- return 0; +- +- n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p, +- BTREE_ITER_INTENT| +- BTREE_ITER_NOT_EXTENTS); +- n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; +- ret = bch2_btree_iter_traverse(n.iter); +- bch2_trans_iter_put(trans, n.iter); +- +- if (ret) +- return ret; +- } +- +- BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS); ++ BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); ++ BUG_ON(bpos_cmp(n.k->k.p, n.iter->real_pos)); + + n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + +-- +cgit v1.2.3 + + +From 734b86d39762a89e8570fefda3316f4ff3a99cca Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 29 Aug 2021 19:34:37 -0400 +Subject: bcachefs: Kill BTREE_ITER_SET_POS_AFTER_COMMIT + +BTREE_ITER_SET_POS_AFTER_COMMIT is used internally to automagically +advance extent btree iterators on sucessful commit. + +But with the upcomnig btree_path patch it's getting more awkward to +support, and it adds overhead to core data structures that's only used +in a few places, and can be easily done by the caller instead. 
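+
+The caller-side replacement is simple: the io.c, ec.c and move.c hunks in
+this patch all end up doing roughly the following (condensed sketch of the
+bch2_extent_update() change; commit flags and some error paths trimmed):
+
+  struct bpos next_pos = k->k.p;
+
+  ret = bch2_trans_update(trans, iter, k, 0) ?:
+        bch2_trans_commit(trans, disk_res, journal_seq,
+                          BTREE_INSERT_NOFAIL);
+  if (ret)
+          return ret;
+
+  /* what BTREE_ITER_SET_POS_AFTER_COMMIT used to do automagically: */
+  bch2_btree_iter_set_pos(iter, next_pos);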
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 +--- + fs/bcachefs/btree_types.h | 10 ++++------ + fs/bcachefs/btree_update_leaf.c | 9 --------- + fs/bcachefs/ec.c | 5 +++++ + fs/bcachefs/io.c | 5 +++++ + fs/bcachefs/move.c | 9 +++++++-- + 6 files changed, 22 insertions(+), 20 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 89c061e74b45..7ef5699e47cc 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2345,7 +2345,6 @@ static void btree_iter_copy(struct btree_trans *trans, struct btree_iter *dst, + __btree_lock_want(dst, i)); + + dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; +- dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; + + btree_iter_check_sort(trans, dst); + } +@@ -2563,8 +2562,7 @@ void bch2_trans_begin(struct btree_trans *trans) + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) +- iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| +- BTREE_ITER_SET_POS_AFTER_COMMIT); ++ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + + /* + * XXX: we shouldn't be doing this if the transaction was restarted, but +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 0a59e4b6e7a7..423736ea56b4 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -211,11 +211,10 @@ enum btree_iter_type { + #define BTREE_ITER_IS_EXTENTS (1 << 6) + #define BTREE_ITER_NOT_EXTENTS (1 << 7) + #define BTREE_ITER_ERROR (1 << 8) +-#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 9) +-#define BTREE_ITER_CACHED_NOFILL (1 << 10) +-#define BTREE_ITER_CACHED_NOCREATE (1 << 11) +-#define BTREE_ITER_WITH_UPDATES (1 << 12) +-#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) ++#define BTREE_ITER_CACHED_NOFILL (1 << 9) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 10) ++#define BTREE_ITER_WITH_UPDATES (1 << 11) ++#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) + + enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, +@@ -256,7 +255,6 @@ struct btree_iter { + + struct bpos pos; + struct bpos real_pos; +- struct bpos pos_after_commit; + + enum btree_id btree_id:4; + enum btree_iter_uptodate uptodate:3; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index fbd40b1bab8f..f35918e5e8bd 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -742,7 +742,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + int __bch2_trans_commit(struct btree_trans *trans) + { + struct btree_insert_entry *i = NULL; +- struct btree_iter *iter; + bool trans_trigger_run; + unsigned u64s; + int ret = 0; +@@ -840,11 +839,6 @@ retry: + + if (ret) + goto err; +- +- trans_for_each_iter(trans, iter) +- if (btree_iter_live(trans, iter) && +- (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) +- bch2_btree_iter_set_pos(iter, iter->pos_after_commit); + out: + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + +@@ -886,9 +880,6 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0, compressed_sectors; + +- orig_iter->pos_after_commit = insert->k.p; +- orig_iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; +- + iter = bch2_trans_get_iter(trans, btree_id, start, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 12458a19949f..53d6be09decd 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -824,6 +824,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + struct bkey_s_c k; + struct bkey_s_extent e; + struct bkey_buf sk; ++ 
struct bpos next_pos; + int ret = 0, dev, block; + + bch2_bkey_buf_init(&sk); +@@ -863,10 +864,14 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + extent_stripe_ptr_add(e, s, ec_ptr, block); + + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); ++ next_pos = sk.k->k.p; ++ + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(&trans, iter, sk.k, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); ++ if (!ret) ++ bch2_btree_iter_set_pos(iter, next_pos); + if (ret == -EINTR) + ret = 0; + if (ret) +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 823830ccaf45..b2a1bf242ed9 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -266,6 +266,7 @@ int bch2_extent_update(struct btree_trans *trans, + { + /* this must live until after bch2_trans_commit(): */ + struct bkey_inode_buf inode_p; ++ struct bpos next_pos; + bool extending = false, usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; +@@ -344,6 +345,8 @@ int bch2_extent_update(struct btree_trans *trans, + return ret; + } + ++ next_pos = k->k.p; ++ + ret = bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| +@@ -352,6 +355,8 @@ int bch2_extent_update(struct btree_trans *trans, + if (ret) + return ret; + ++ bch2_btree_iter_set_pos(iter, next_pos); ++ + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + return 0; +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index befb198a77fd..2e5e09350392 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -80,6 +80,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + struct bkey_i_extent *new; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; ++ struct bpos next_pos; + bool did_work = false; + bool extending = false, should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; +@@ -163,14 +164,18 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + goto out; + } + ++ next_pos = insert->k.p; ++ + ret = bch2_trans_update(&trans, iter, insert, 0) ?: + bch2_trans_commit(&trans, &op->res, + op_journal_seq(op), + BTREE_INSERT_NOFAIL| + m->data_opts.btree_insert_flags); +-err: +- if (!ret) ++ if (!ret) { ++ bch2_btree_iter_set_pos(iter, next_pos); + atomic_long_inc(&c->extent_migrate_done); ++ } ++err: + if (ret == -EINTR) + ret = 0; + if (ret) +-- +cgit v1.2.3 + + +From a4e0f4ac7d7fa7a66900abc15bee6712d4f5d44e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 27 Aug 2021 20:55:44 -0400 +Subject: bcachefs: Better algorithm for btree node merging in write path + +The existing algorithm was O(n^2) in the number of updates in the +commit. 
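+
+The old path could call maybe_do_btree_merge() once per update, and that
+helper itself walked every update in the commit to compute the size delta
+for the leaf. The new code makes a single pass: trans->updates is sorted,
+so updates to the same leaf are adjacent, and we can accumulate the delta
+and attempt at most one merge per leaf. Roughly (sketch of the new loop in
+do_bch2_trans_commit(); error handling and the cached/NODES iterator
+checks elided):
+
+  trans_for_each_update(trans, i) {
+          old = bch2_btree_iter_peek_slot(i->iter);
+
+          u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+          u64s_delta -= !bkey_deleted(old.k)    ? old.k->u64s  : 0;
+
+          /* last update touching this leaf node? */
+          if (!same_leaf_as_next(trans, i)) {
+                  if (u64s_delta <= 0)
+                          ret = bch2_foreground_maybe_merge(trans, i->iter,
+                                          i->iter->level, trans->flags);
+                  u64s_delta = 0;
+          }
+  }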
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 79 ++++++++++++++++------------------------- + 1 file changed, 30 insertions(+), 49 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f35918e5e8bd..21fd6b2a7532 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -36,6 +36,13 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, + iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; + } + ++static inline bool same_leaf_as_next(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i + 1 < trans->updates + trans->nr_updates && ++ iter_l(i[0].iter)->b == iter_l(i[1].iter)->b; ++} ++ + inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b) +@@ -486,44 +493,6 @@ err: + return ret; + } + +-static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter) +-{ +- struct btree_insert_entry *i; +- struct btree *b = iter_l(iter)->b; +- struct bkey_s_c old; +- int u64s_delta = 0; +- int ret; +- +- /* +- * Inserting directly into interior nodes is an uncommon operation with +- * various weird edge cases: also, a lot of things about +- * BTREE_ITER_NODES iters need to be audited +- */ +- if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS)) +- return 0; +- +- BUG_ON(iter->level); +- +- trans_for_each_update(trans, i) { +- if (iter_l(i->iter)->b != b) +- continue; +- +- old = bch2_btree_iter_peek_slot(i->iter); +- ret = bkey_err(old); +- if (ret) +- return ret; +- +- u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; +- u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; +- } +- +- if (u64s_delta > 0) +- return 0; +- +- return bch2_foreground_maybe_merge(trans, iter, +- iter->level, trans->flags); +-} +- + /* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +@@ -534,22 +503,34 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct btree_iter *iter; +- int ret; ++ struct bkey_s_c old; ++ int ret, u64s_delta = 0; + + trans_for_each_update(trans, i) { +- struct btree *b; ++ /* ++ * peek_slot() doesn't work on a BTREE_ITER_NODES iter; those ++ * iterator types should probably go away ++ */ ++ if (btree_iter_type(i->iter) != BTREE_ITER_KEYS) ++ continue; + +- BUG_ON(!btree_node_intent_locked(i->iter, i->level)); ++ old = bch2_btree_iter_peek_slot(i->iter); ++ ret = bkey_err(old); ++ if (unlikely(ret)) ++ return ret; + +- if (btree_iter_type(i->iter) == BTREE_ITER_CACHED) +- continue; ++ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; ++ u64s_delta -= !bkey_deleted(old.k) ? 
old.k->u64s : 0; ++ ++ if (!same_leaf_as_next(trans, i)) { ++ if (u64s_delta <= 0) { ++ ret = bch2_foreground_maybe_merge(trans, i->iter, ++ i->iter->level, trans->flags); ++ if (unlikely(ret)) ++ return ret; ++ } + +- b = iter_l(i->iter)->b; +- if (b->sib_u64s[0] < c->btree_foreground_merge_threshold || +- b->sib_u64s[1] < c->btree_foreground_merge_threshold) { +- ret = maybe_do_btree_merge(trans, i->iter); +- if (unlikely(ret)) +- return ret; ++ u64s_delta = 0; + } + } + +-- +cgit v1.2.3 + + +From 954dfc6bc44e4577ac63b2aca837ce917fb2914b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Aug 2021 14:22:43 -0400 +Subject: bcachefs: Further reduce iter->trans usage + +This is prep work for splitting btree_path out from btree_iter - +btree_path will not have a pointer to btree_trans. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 40 +++++----- + fs/bcachefs/btree_cache.h | 5 +- + fs/bcachefs/btree_iter.c | 155 ++++++++++++++++++------------------ + fs/bcachefs/btree_iter.h | 10 ++- + fs/bcachefs/btree_key_cache.c | 13 ++- + fs/bcachefs/btree_key_cache.h | 2 +- + fs/bcachefs/btree_locking.h | 30 ++++--- + fs/bcachefs/btree_update_interior.c | 2 +- + fs/bcachefs/btree_update_interior.h | 2 +- + fs/bcachefs/btree_update_leaf.c | 5 +- + fs/bcachefs/recovery.c | 4 +- + 11 files changed, 136 insertions(+), 132 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 3dfb0dca445a..b3445b67e981 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -640,6 +640,7 @@ err: + + /* Slowpath, don't want it inlined into btree_iter_traverse() */ + static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, ++ struct btree_trans *trans, + struct btree_iter *iter, + const struct bkey_i *k, + enum btree_id btree_id, +@@ -656,8 +657,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + * Parent node must be locked, else we could read in a btree node that's + * been freed: + */ +- if (iter && !bch2_btree_node_relock(iter, level + 1)) { +- btree_trans_restart(iter->trans); ++ if (trans && !bch2_btree_node_relock(trans, iter, level + 1)) { ++ btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } + +@@ -688,23 +689,23 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + six_unlock_intent(&b->c.lock); + + /* Unlock before doing IO: */ +- if (iter && sync) +- bch2_trans_unlock(iter->trans); ++ if (trans && sync) ++ bch2_trans_unlock(trans); + + bch2_btree_node_read(c, b, sync); + + if (!sync) + return NULL; + +- if (iter && +- (!bch2_trans_relock(iter->trans) || +- !bch2_btree_iter_relock_intent(iter))) { +- BUG_ON(!iter->trans->restarted); ++ if (trans && ++ (!bch2_trans_relock(trans) || ++ !bch2_btree_iter_relock_intent(trans, iter))) { ++ BUG_ON(!trans->restarted); + return ERR_PTR(-EINTR); + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) { +- btree_trans_restart(iter->trans); ++ btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } + +@@ -787,7 +788,7 @@ retry: + * else we could read in a btree node from disk that's been + * freed: + */ +- b = bch2_btree_node_fill(c, iter, k, iter->btree_id, ++ b = bch2_btree_node_fill(c, trans, iter, k, iter->btree_id, + level, lock_type, true); + + /* We raced and found the btree node in the cache */ +@@ -829,7 +830,7 @@ lock_node: + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); + +- if (!btree_node_lock(b, k->k.p, level, iter, lock_type, ++ if (!btree_node_lock(trans, iter, b, k->k.p, 
level, lock_type, + lock_node_check_fn, (void *) k, trace_ip)) { + if (!trans->restarted) + goto retry; +@@ -840,7 +841,7 @@ lock_node: + b->c.level != level || + race_fault())) { + six_unlock_type(&b->c.lock, lock_type); +- if (bch2_btree_node_relock(iter, level + 1)) ++ if (bch2_btree_node_relock(trans, iter, level + 1)) + goto retry; + + trace_trans_restart_btree_node_reused(trans->ip, +@@ -864,9 +865,9 @@ lock_node: + * should_be_locked is not set on this iterator yet, so we need + * to relock it specifically: + */ +- if (iter && ++ if (trans && + (!bch2_trans_relock(trans) || +- !bch2_btree_iter_relock_intent(iter))) { ++ !bch2_btree_iter_relock_intent(trans, iter))) { + BUG_ON(!trans->restarted); + return ERR_PTR(-EINTR); + } +@@ -925,7 +926,7 @@ retry: + if (nofill) + goto out; + +- b = bch2_btree_node_fill(c, NULL, k, btree_id, ++ b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id, + level, SIX_LOCK_read, true); + + /* We raced and found the btree node in the cache */ +@@ -983,21 +984,24 @@ out: + return b; + } + +-int bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, ++int bch2_btree_node_prefetch(struct bch_fs *c, ++ struct btree_trans *trans, ++ struct btree_iter *iter, + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) + { + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + +- BUG_ON(iter && !btree_node_locked(iter, level + 1)); ++ BUG_ON(trans && !btree_node_locked(iter, level + 1)); + BUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_cache_find(bc, k); + if (b) + return 0; + +- b = bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); ++ b = bch2_btree_node_fill(c, trans, iter, k, btree_id, ++ level, SIX_LOCK_read, false); + return PTR_ERR_OR_ZERO(b); + } + +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 5032293e8628..6c1c69f3abcf 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -29,8 +29,9 @@ struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *, + struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, + enum btree_id, unsigned, bool); + +-int bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, +- const struct bkey_i *, enum btree_id, unsigned); ++int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, ++ struct btree_iter *, const struct bkey_i *, ++ enum btree_id, unsigned); + + void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 7ef5699e47cc..c7ba4be0e7cd 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -108,17 +108,14 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans, + bch2_btree_node_unlock_write_inlined(trans, iter, b); + } + +-void __bch2_btree_node_lock_write(struct btree_trans *trans, +- struct btree_iter *iter, struct btree *b) ++void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) + { +- struct btree_iter *linked; ++ struct btree_iter *iter; + unsigned readers = 0; + +- EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); +- +- trans_for_each_iter(trans, linked) +- if (linked->l[b->c.level].b == b && +- btree_node_read_locked(linked, b->c.level)) ++ trans_for_each_iter(trans, iter) ++ if (iter->l[b->c.level].b == b && ++ btree_node_read_locked(iter, b->c.level)) + readers++; + + /* +@@ -142,7 +139,8 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, + this_cpu_add(*b->c.lock.readers, readers); + } + +-bool 
__bch2_btree_node_relock(struct btree_iter *iter, unsigned level) ++bool __bch2_btree_node_relock(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned level) + { + struct btree *b = btree_iter_node(iter, level); + int want = __btree_lock_want(iter, level); +@@ -155,7 +153,7 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) + + if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || + (btree_node_lock_seq_matches(iter, b, level) && +- btree_node_lock_increment(iter->trans, b, level, want))) { ++ btree_node_lock_increment(trans, b, level, want))) { + mark_btree_node_locked(iter, level, want); + return true; + } else { +@@ -163,7 +161,8 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) + } + } + +-static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) ++static bool bch2_btree_node_upgrade(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned level) + { + struct btree *b = iter->l[level].b; + +@@ -184,7 +183,7 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) + goto success; + + if (btree_node_lock_seq_matches(iter, b, level) && +- btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { ++ btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(iter, level); + goto success; + } +@@ -207,8 +206,8 @@ static inline bool btree_iter_get_locks(struct btree_trans *trans, + break; + + if (!(upgrade +- ? bch2_btree_node_upgrade(iter, l) +- : bch2_btree_node_relock(iter, l))) { ++ ? bch2_btree_node_upgrade(trans, iter, l) ++ : bch2_btree_node_relock(trans, iter, l))) { + (upgrade + ? trace_node_upgrade_fail + : trace_node_relock_fail)(trans->ip, trace_ip, +@@ -256,13 +255,13 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, + } + + /* Slowpath: */ +-bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, +- unsigned level, struct btree_iter *iter, ++bool __bch2_btree_node_lock(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b, struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) + { +- struct btree_trans *trans = iter->trans; + struct btree_iter *linked, *deadlock_iter = NULL; + u64 start_time = local_clock(); + unsigned reason = 9; +@@ -369,16 +368,10 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + /* Btree iterator locking: */ + + #ifdef CONFIG_BCACHEFS_DEBUG +-static void bch2_btree_iter_verify_locks(struct btree_trans *trans, +- struct btree_iter *iter) ++static void bch2_btree_iter_verify_locks(struct btree_iter *iter) + { + unsigned l; + +- if (!(trans->iters_linked & (1ULL << iter->idx))) { +- BUG_ON(iter->nodes_locked); +- return; +- } +- + for (l = 0; btree_iter_node(iter, l); l++) { + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && + !btree_node_locked(iter, l)) +@@ -394,25 +387,24 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) +- bch2_btree_iter_verify_locks(trans, iter); ++ bch2_btree_iter_verify_locks(iter); + } + #else +-static inline void bch2_btree_iter_verify_locks(struct btree_trans *trans, +- struct btree_iter *iter) {} ++static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} + #endif + + /* + * Only for btree_cache.c - only relocks intent locks + */ +-bool bch2_btree_iter_relock_intent(struct btree_iter *iter) ++bool 
bch2_btree_iter_relock_intent(struct btree_trans *trans, ++ struct btree_iter *iter) + { +- struct btree_trans *trans = iter->trans; + unsigned l; + + for (l = iter->level; + l < iter->locks_want && btree_iter_node(iter, l); + l++) { +- if (!bch2_btree_node_relock(iter, l)) { ++ if (!bch2_btree_node_relock(trans, iter, l)) { + trace_node_relock_fail(trans->ip, _RET_IP_, + btree_iter_type(iter) == BTREE_ITER_CACHED, + iter->btree_id, &iter->real_pos, +@@ -443,10 +435,10 @@ static bool bch2_btree_iter_relock(struct btree_trans *trans, + return ret; + } + +-bool __bch2_btree_iter_upgrade(struct btree_iter *iter, ++bool __bch2_btree_iter_upgrade(struct btree_trans *trans, ++ struct btree_iter *iter, + unsigned new_locks_want) + { +- struct btree_trans *trans = iter->trans; + struct btree_iter *linked; + + EBUG_ON(iter->locks_want >= new_locks_want); +@@ -511,7 +503,7 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, + } + } + +- bch2_btree_trans_verify_locks(iter->trans); ++ bch2_btree_iter_verify_locks(iter); + } + + void bch2_trans_downgrade(struct btree_trans *trans) +@@ -562,12 +554,13 @@ void bch2_trans_unlock(struct btree_trans *trans) + + #ifdef CONFIG_BCACHEFS_DEBUG + +-static void bch2_btree_iter_verify_cached(struct btree_iter *iter) ++static void bch2_btree_iter_verify_cached(struct btree_trans *trans, ++ struct btree_iter *iter) + { + struct bkey_cached *ck; + bool locked = btree_node_locked(iter, 0); + +- if (!bch2_btree_node_relock(iter, 0)) ++ if (!bch2_btree_node_relock(trans, iter, 0)) + return; + + ck = (void *) iter->l[0].b; +@@ -578,8 +571,8 @@ static void bch2_btree_iter_verify_cached(struct btree_iter *iter) + btree_node_unlock(iter, 0); + } + +-static void bch2_btree_iter_verify_level(struct btree_iter *iter, +- unsigned level) ++static void bch2_btree_iter_verify_level(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned level) + { + struct btree_iter_level *l; + struct btree_node_iter tmp; +@@ -597,7 +590,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, + + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + if (!level) +- bch2_btree_iter_verify_cached(iter); ++ bch2_btree_iter_verify_cached(trans, iter); + return; + } + +@@ -606,7 +599,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, + if (!btree_iter_node(iter, level)) + return; + +- if (!bch2_btree_node_relock(iter, level)) ++ if (!bch2_btree_node_relock(trans, iter, level)) + return; + + BUG_ON(!btree_iter_pos_in_node(iter, l->b)); +@@ -696,10 +689,10 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + break; + } + +- bch2_btree_iter_verify_level(iter, i); ++ bch2_btree_iter_verify_level(trans, iter, i); + } + +- bch2_btree_iter_verify_locks(trans, iter); ++ bch2_btree_iter_verify_locks(iter); + } + + static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +@@ -723,12 +716,13 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) + return; + + trans_for_each_iter_with_node(trans, b, iter) +- bch2_btree_iter_verify_level(iter, b->c.level); ++ bch2_btree_iter_verify_level(trans, iter, b->c.level); + } + + #else + +-static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} ++static inline void bch2_btree_iter_verify_level(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned l) {} + static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} + static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} + +@@ -775,7 +769,7 @@ void 
bch2_btree_iter_fix_key_modified(struct btree_trans *trans, + + trans_for_each_iter_with_node(trans, b, linked) { + __bch2_btree_iter_fix_key_modified(linked, b, where); +- bch2_btree_iter_verify_level(linked, b->c.level); ++ bch2_btree_iter_verify_level(trans, linked, b->c.level); + } + } + +@@ -900,7 +894,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, + __bch2_btree_node_iter_fix(linked, b, + &linked->l[b->c.level].iter, t, + where, clobber_u64s, new_u64s); +- bch2_btree_iter_verify_level(linked, b->c.level); ++ bch2_btree_iter_verify_level(trans, linked, b->c.level); + } + } + +@@ -985,7 +979,8 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, + /* + * Verify that iterator for parent node points to child node: + */ +-static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) ++static void btree_iter_verify_new_node(struct btree_trans *trans, ++ struct btree_iter *iter, struct btree *b) + { + struct btree_iter_level *l; + unsigned plevel; +@@ -1001,7 +996,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) + + parent_locked = btree_node_locked(iter, plevel); + +- if (!bch2_btree_node_relock(iter, plevel)) ++ if (!bch2_btree_node_relock(trans, iter, plevel)) + return; + + l = &iter->l[plevel]; +@@ -1015,7 +1010,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) + char buf4[100]; + struct bkey uk = bkey_unpack_key(b, k); + +- bch2_dump_btree_node(iter->trans->c, l->b); ++ bch2_dump_btree_node(trans->c, l->b); + bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); + bch2_bkey_to_text(&PBUF(buf2), &uk); + bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); +@@ -1032,8 +1027,8 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) + btree_node_unlock(iter, b->c.level + 1); + } + +-static inline void __btree_iter_init(struct btree_iter *iter, +- unsigned level) ++static inline void __btree_iter_level_init(struct btree_iter *iter, ++ unsigned level) + { + struct btree_iter_level *l = &iter->l[level]; + +@@ -1049,19 +1044,20 @@ static inline void __btree_iter_init(struct btree_iter *iter, + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } + +-static inline void btree_iter_node_set(struct btree_iter *iter, +- struct btree *b) ++static inline void btree_iter_level_init(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b) + { + BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + +- btree_iter_verify_new_node(iter, b); ++ btree_iter_verify_new_node(trans, iter, b); + + EBUG_ON(!btree_iter_pos_in_node(iter, b)); + EBUG_ON(b->c.lock.state.seq & 1); + + iter->l[b->c.level].lock_seq = b->c.lock.state.seq; + iter->l[b->c.level].b = b; +- __btree_iter_init(iter, b->c.level); ++ __btree_iter_level_init(iter, b->c.level); + } + + /* +@@ -1090,7 +1086,7 @@ void bch2_btree_iter_node_replace(struct btree_trans *trans, + mark_btree_node_locked(linked, b->c.level, t); + } + +- btree_iter_node_set(linked, b); ++ btree_iter_level_init(trans, linked, b); + } + } + +@@ -1117,7 +1113,7 @@ void bch2_btree_iter_reinit_node(struct btree_trans *trans, + struct btree_iter *linked; + + trans_for_each_iter_with_node(trans, b, linked) +- __btree_iter_init(linked, b->c.level); ++ __btree_iter_level_init(linked, b->c.level); + } + + static int lock_root_check_fn(struct six_lock *lock, void *p) +@@ -1158,8 +1154,8 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, + } + + lock_type = __btree_lock_want(iter, iter->level); +- if 
(unlikely(!btree_node_lock(b, SPOS_MAX, iter->level, +- iter, lock_type, ++ if (unlikely(!btree_node_lock(trans, iter, b, SPOS_MAX, ++ iter->level, lock_type, + lock_root_check_fn, rootp, + trace_ip))) { + if (trans->restarted) +@@ -1177,7 +1173,7 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, + iter->l[i].b = NULL; + + mark_btree_node_locked(iter, iter->level, lock_type); +- btree_iter_node_set(iter, b); ++ btree_iter_level_init(trans, iter, b); + return 0; + } + +@@ -1202,7 +1198,7 @@ static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *ite + bch2_bkey_buf_init(&tmp); + + while (nr && !ret) { +- if (!bch2_btree_node_relock(iter, iter->level)) ++ if (!bch2_btree_node_relock(trans, iter, iter->level)) + break; + + bch2_btree_node_iter_advance(&node_iter, l->b); +@@ -1211,8 +1207,8 @@ static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *ite + break; + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); +- ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, +- iter->level - 1); ++ ret = bch2_btree_node_prefetch(c, trans, iter, tmp.k, ++ iter->btree_id, iter->level - 1); + } + + if (!was_locked) +@@ -1222,7 +1218,8 @@ static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *ite + return ret; + } + +-static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, ++static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, ++ struct btree_iter *iter, + unsigned plevel, struct btree *b) + { + struct btree_iter_level *l = &iter->l[plevel]; +@@ -1230,7 +1227,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, + struct bkey_packed *k; + struct bch_btree_ptr_v2 *bp; + +- if (!bch2_btree_node_relock(iter, plevel)) ++ if (!bch2_btree_node_relock(trans, iter, plevel)) + return; + + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); +@@ -1267,11 +1264,11 @@ static __always_inline int btree_iter_down(struct btree_trans *trans, + goto err; + + mark_btree_node_locked(iter, level, lock_type); +- btree_iter_node_set(iter, b); ++ btree_iter_level_init(trans, iter, b); + + if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && + unlikely(b != btree_node_mem_ptr(tmp.k))) +- btree_node_mem_ptr_set(iter, level + 1, b); ++ btree_node_mem_ptr_set(trans, iter, level + 1, b); + + if (iter->flags & BTREE_ITER_PREFETCH) + ret = btree_iter_prefetch(trans, iter); +@@ -1280,7 +1277,7 @@ static __always_inline int btree_iter_down(struct btree_trans *trans, + btree_node_unlock(iter, level + 1); + iter->level = level; + +- bch2_btree_iter_verify_locks(trans, iter); ++ bch2_btree_iter_verify_locks(iter); + err: + bch2_bkey_buf_exit(&tmp, c); + return ret; +@@ -1314,9 +1311,9 @@ retry_all: + + if (iter1->btree_id == iter2->btree_id && + iter1->locks_want < iter2->locks_want) +- __bch2_btree_iter_upgrade(iter1, iter2->locks_want); ++ __bch2_btree_iter_upgrade(trans, iter1, iter2->locks_want); + else if (!iter1->locks_want && iter2->locks_want) +- __bch2_btree_iter_upgrade(iter1, 1); ++ __bch2_btree_iter_upgrade(trans, iter1, 1); + } + + bch2_trans_unlock(trans); +@@ -1372,11 +1369,12 @@ static int bch2_btree_iter_traverse_all(struct btree_trans *trans) + return __btree_iter_traverse_all(trans, 0, _RET_IP_); + } + +-static inline bool btree_iter_good_node(struct btree_iter *iter, ++static inline bool btree_iter_good_node(struct btree_trans *trans, ++ struct btree_iter *iter, + unsigned l, int check_pos) + { + if (!is_btree_node(iter, l) || +- !bch2_btree_node_relock(iter, l)) ++ !bch2_btree_node_relock(trans, 
iter, l)) + return false; + + if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) +@@ -1386,13 +1384,14 @@ static inline bool btree_iter_good_node(struct btree_iter *iter, + return true; + } + +-static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, ++static inline unsigned btree_iter_up_until_good_node(struct btree_trans *trans, ++ struct btree_iter *iter, + int check_pos) + { + unsigned l = iter->level; + + while (btree_iter_node(iter, l) && +- !btree_iter_good_node(iter, l, check_pos)) { ++ !btree_iter_good_node(trans, iter, l, check_pos)) { + btree_node_unlock(iter, l); + iter->l[l].b = BTREE_ITER_NO_NODE_UP; + l++; +@@ -1427,20 +1426,20 @@ static int btree_iter_traverse_one(struct btree_trans *trans, + } + + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { +- ret = bch2_btree_iter_traverse_cached(iter); ++ ret = bch2_btree_iter_traverse_cached(trans, iter); + goto out; + } + + if (unlikely(iter->level >= BTREE_MAX_DEPTH)) + goto out; + +- iter->level = btree_iter_up_until_good_node(iter, 0); ++ iter->level = btree_iter_up_until_good_node(trans, iter, 0); + + /* If we need intent locks, take them too: */ + for (l = iter->level + 1; + l < iter->locks_want && btree_iter_node(iter, l); + l++) +- if (!bch2_btree_node_relock(iter, l)) ++ if (!bch2_btree_node_relock(trans, iter, l)) + while (iter->level <= l) { + btree_node_unlock(iter, iter->level); + iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; +@@ -1651,7 +1650,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p + return; + } + +- l = btree_iter_up_until_good_node(iter, cmp); ++ l = btree_iter_up_until_good_node(trans, iter, cmp); + + if (btree_iter_node(iter, l)) { + /* +@@ -1662,7 +1661,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p + */ + if (cmp < 0 || + !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) +- __btree_iter_init(iter, l); ++ __btree_iter_level_init(iter, l); + + /* Don't leave it locked if we're not supposed to: */ + if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 6dad6f1a2d9b..b6354782a2d0 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -135,7 +135,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_iter *, + struct btree *, struct btree_node_iter *, + struct bkey_packed *, unsigned, unsigned); + +-bool bch2_btree_iter_relock_intent(struct btree_iter *); ++bool bch2_btree_iter_relock_intent(struct btree_trans *, struct btree_iter *); + + bool bch2_trans_relock(struct btree_trans *); + void bch2_trans_unlock(struct btree_trans *); +@@ -148,15 +148,17 @@ static inline int btree_trans_restart(struct btree_trans *trans) + return -EINTR; + } + +-bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); ++bool __bch2_btree_iter_upgrade(struct btree_trans *, ++ struct btree_iter *, unsigned); + +-static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, ++static inline bool bch2_btree_iter_upgrade(struct btree_trans *trans, ++ struct btree_iter *iter, + unsigned new_locks_want) + { + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + + return iter->locks_want < new_locks_want +- ? __bch2_btree_iter_upgrade(iter, new_locks_want) ++ ? 
__bch2_btree_iter_upgrade(trans, iter, new_locks_want) + : iter->uptodate <= BTREE_ITER_NEED_PEEK; + } + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 6bc20813d00d..568c1f2704c2 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -212,7 +212,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, + if (ret) + goto err; + +- if (!bch2_btree_node_relock(ck_iter, 0)) { ++ if (!bch2_btree_node_relock(trans, ck_iter, 0)) { + trace_transaction_restart_ip(trans->ip, _THIS_IP_); + ret = btree_trans_restart(trans); + goto err; +@@ -265,9 +265,8 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) + } + + __flatten +-int bch2_btree_iter_traverse_cached(struct btree_iter *iter) ++int bch2_btree_iter_traverse_cached(struct btree_trans *trans, struct btree_iter *iter) + { +- struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; +@@ -276,7 +275,7 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter) + + iter->l[1].b = NULL; + +- if (bch2_btree_node_relock(iter, 0)) { ++ if (bch2_btree_node_relock(trans, iter, 0)) { + ck = (void *) iter->l[0].b; + goto fill; + } +@@ -301,7 +300,7 @@ retry: + } else { + enum six_lock_type lock_want = __btree_lock_want(iter, 0); + +- if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, ++ if (!btree_node_lock(trans, iter, (void *) ck, iter->pos, 0, lock_want, + bkey_cached_check_fn, iter, _THIS_IP_)) { + if (!trans->restarted) + goto retry; +@@ -325,7 +324,7 @@ retry: + fill: + if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { + if (!iter->locks_want && +- !!__bch2_btree_iter_upgrade(iter, 1)) { ++ !!__bch2_btree_iter_upgrade(trans, iter, 1)) { + trace_transaction_restart_ip(trans->ip, _THIS_IP_); + BUG_ON(!trans->restarted); + ret = -EINTR; +@@ -343,7 +342,7 @@ fill: + iter->uptodate = BTREE_ITER_NEED_PEEK; + + if ((iter->flags & BTREE_ITER_INTENT) && +- !bch2_btree_iter_upgrade(iter, 1)) { ++ !bch2_btree_iter_upgrade(trans, iter, 1)) { + BUG_ON(!trans->restarted); + ret = -EINTR; + } +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index 7e2b0a08f745..d890632e4425 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -26,7 +26,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *, + struct bkey_cached * + bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + +-int bch2_btree_iter_traverse_cached(struct btree_iter *); ++int bch2_btree_iter_traverse_cached(struct btree_trans *, struct btree_iter *); + + bool bch2_btree_insert_key_cached(struct btree_trans *, + struct btree_iter *, struct bkey_i *); +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index 869c498e3f78..f8b358f8f2c1 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -167,40 +167,38 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, + return false; + } + +-bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, +- struct btree_iter *, enum six_lock_type, +- six_lock_should_sleep_fn, void *, +- unsigned long); ++bool __bch2_btree_node_lock(struct btree_trans *, struct btree_iter *, ++ struct btree *, struct bpos, unsigned, ++ enum six_lock_type, six_lock_should_sleep_fn, ++ void *, unsigned long); + +-static inline bool btree_node_lock(struct btree *b, +- struct bpos pos, unsigned level, ++static inline bool btree_node_lock(struct btree_trans *trans, + 
struct btree_iter *iter, ++ struct btree *b, struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) + { +- struct btree_trans *trans = iter->trans; +- + EBUG_ON(level >= BTREE_MAX_DEPTH); + EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + + return likely(six_trylock_type(&b->c.lock, type)) || + btree_node_lock_increment(trans, b, level, type) || +- __bch2_btree_node_lock(b, pos, level, iter, type, ++ __bch2_btree_node_lock(trans, iter, b, pos, level, type, + should_sleep_fn, p, ip); + } + +-bool __bch2_btree_node_relock(struct btree_iter *, unsigned); ++bool __bch2_btree_node_relock(struct btree_trans *, struct btree_iter *, unsigned); + +-static inline bool bch2_btree_node_relock(struct btree_iter *iter, +- unsigned level) ++static inline bool bch2_btree_node_relock(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned level) + { + EBUG_ON(btree_node_locked(iter, level) && + btree_node_locked_type(iter, level) != + __btree_lock_want(iter, level)); + + return likely(btree_node_locked(iter, level)) || +- __bch2_btree_node_relock(iter, level); ++ __bch2_btree_node_relock(trans, iter, level); + } + + /* +@@ -225,8 +223,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_ite + void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_iter *, struct btree *); + +-void __bch2_btree_node_lock_write(struct btree_trans *, +- struct btree_iter *, struct btree *); ++void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); + + static inline void bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_iter *iter, +@@ -234,9 +231,10 @@ static inline void bch2_btree_node_lock_write(struct btree_trans *trans, + { + EBUG_ON(iter->l[b->c.level].b != b); + EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); ++ EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); + + if (unlikely(!six_trylock_write(&b->c.lock))) +- __bch2_btree_node_lock_write(trans, iter, b); ++ __bch2_btree_node_lock_write(trans, b); + } + + #endif /* _BCACHEFS_BTREE_LOCKING_H */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 3d7c47712b74..d18d539bcc8e 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -937,7 +937,7 @@ retry: + * XXX: figure out how far we might need to split, + * instead of locking/reserving all the way to the root: + */ +- if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ++ if (!bch2_btree_iter_upgrade(trans, iter, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, + iter->btree_id, + &iter->real_pos); +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 07046dab614b..13b3a1bf0f4f 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -132,7 +132,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, + if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) + return 0; + +- if (!bch2_btree_node_relock(iter, level)) ++ if (!bch2_btree_node_relock(trans, iter, level)) + return 0; + + b = iter->l[level].b; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 21fd6b2a7532..aabf408a5114 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -561,7 +561,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + */ + trans_for_each_iter(trans, iter) + if 
(iter->nodes_locked != iter->nodes_intent_locked && +- !bch2_btree_iter_upgrade(iter, 1)) { ++ !bch2_btree_iter_upgrade(trans, iter, 1)) { + trace_trans_restart_upgrade(trans->ip, trace_ip, + iter->btree_id, + &iter->real_pos); +@@ -783,7 +783,8 @@ int __bch2_trans_commit(struct btree_trans *trans) + trans_for_each_update(trans, i) { + BUG_ON(!i->iter->should_be_locked); + +- if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { ++ if (unlikely(!bch2_btree_iter_upgrade(trans, i->iter, ++ i->level + 1))) { + trace_trans_restart_upgrade(trans->ip, _RET_IP_, + i->iter->btree_id, + &i->iter->pos); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index b02af94f4037..71b0f14f41f3 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -326,8 +326,8 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, + (k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_buf_reassemble(&tmp, c, k); + +- bch2_btree_node_prefetch(c, NULL, tmp.k, +- b->c.btree_id, b->c.level - 1); ++ bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, ++ b->c.btree_id, b->c.level - 1); + + bch2_btree_and_journal_iter_advance(&iter); + i++; +-- +cgit v1.2.3 + + +From 9ba7f278738552a9772cb961e3a3a5e2f28075c5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Aug 2021 14:36:03 -0400 +Subject: bcachefs: Clean up/rename bch2_trans_node_* fns + +These utility functions are for managing btree node state within a +btree_trans - rename them for consistency, and drop some unneeded +arguments. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 9 +++---- + fs/bcachefs/btree_io.h | 3 +-- + fs/bcachefs/btree_iter.c | 52 +++++++++++++++++-------------------- + fs/bcachefs/btree_iter.h | 12 +++------ + fs/bcachefs/btree_update_interior.c | 35 ++++++++++++------------- + fs/bcachefs/btree_update_leaf.c | 11 ++++---- + 6 files changed, 54 insertions(+), 68 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 455c24734a2b..35d0f646a0b5 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -465,16 +465,13 @@ void bch2_btree_build_aux_trees(struct btree *b) + * + * Returns true if we sorted (i.e. 
invalidated iterators + */ +-void bch2_btree_init_next(struct btree_trans *trans, +- struct btree_iter *iter, +- struct btree *b) ++void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) + { + struct bch_fs *c = trans->c; + struct btree_node_entry *bne; + bool reinit_iter = false; + + EBUG_ON(!(b->c.lock.state.seq & 1)); +- EBUG_ON(iter && iter->l[b->c.level].b != b); + BUG_ON(bset_written(b, bset(b, &b->set[1]))); + + if (b->nsets == MAX_BSETS && +@@ -503,8 +500,8 @@ void bch2_btree_init_next(struct btree_trans *trans, + + bch2_btree_build_aux_trees(b); + +- if (iter && reinit_iter) +- bch2_btree_iter_reinit_node(trans, iter, b); ++ if (reinit_iter) ++ bch2_trans_node_reinit_iter(trans, b); + } + + static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 7fdcf879c7d4..0f20224e2a77 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -134,8 +134,7 @@ void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); + void bch2_btree_node_drop_keys_outside_node(struct btree *); + + void bch2_btree_build_aux_trees(struct btree *); +-void bch2_btree_init_next(struct btree_trans *, struct btree_iter *, +- struct btree *); ++void bch2_btree_init_next(struct btree_trans *, struct btree *); + + int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, + struct btree *, bool); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c7ba4be0e7cd..b5fed89254ac 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -761,15 +761,14 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + } + + void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, +- struct btree_iter *iter, + struct btree *b, + struct bkey_packed *where) + { +- struct btree_iter *linked; ++ struct btree_iter *iter; + +- trans_for_each_iter_with_node(trans, b, linked) { +- __bch2_btree_iter_fix_key_modified(linked, b, where); +- bch2_btree_iter_verify_level(trans, linked, b->c.level); ++ trans_for_each_iter_with_node(trans, b, iter) { ++ __bch2_btree_iter_fix_key_modified(iter, b, where); ++ bch2_btree_iter_verify_level(trans, iter, b->c.level); + } + } + +@@ -1064,42 +1063,40 @@ static inline void btree_iter_level_init(struct btree_trans *trans, + * A btree node is being replaced - update the iterator to point to the new + * node: + */ +-void bch2_btree_iter_node_replace(struct btree_trans *trans, +- struct btree_iter *iter, struct btree *b) ++void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) + { + enum btree_node_locked_type t; +- struct btree_iter *linked; ++ struct btree_iter *iter; + +- trans_for_each_iter(trans, linked) +- if (btree_iter_type(linked) != BTREE_ITER_CACHED && +- btree_iter_pos_in_node(linked, b)) { ++ trans_for_each_iter(trans, iter) ++ if (btree_iter_type(iter) != BTREE_ITER_CACHED && ++ btree_iter_pos_in_node(iter, b)) { + /* +- * bch2_btree_iter_node_drop() has already been called - ++ * bch2_trans_node_drop() has already been called - + * the old node we're replacing has already been + * unlocked and the pointer invalidated + */ +- BUG_ON(btree_node_locked(linked, b->c.level)); ++ BUG_ON(btree_node_locked(iter, b->c.level)); + +- t = btree_lock_want(linked, b->c.level); ++ t = btree_lock_want(iter, b->c.level); + if (t != BTREE_NODE_UNLOCKED) { + six_lock_increment(&b->c.lock, t); +- mark_btree_node_locked(linked, b->c.level, t); ++ mark_btree_node_locked(iter, b->c.level, t); + } + +- 
btree_iter_level_init(trans, linked, b); ++ btree_iter_level_init(trans, iter, b); + } + } + +-void bch2_btree_iter_node_drop(struct btree_trans *trans, +- struct btree_iter *iter, struct btree *b) ++void bch2_trans_node_drop(struct btree_trans *trans, struct btree *b) + { +- struct btree_iter *linked; ++ struct btree_iter *iter; + unsigned level = b->c.level; + +- trans_for_each_iter(trans, linked) +- if (linked->l[level].b == b) { +- btree_node_unlock(linked, level); +- linked->l[level].b = BTREE_ITER_NO_NODE_DROP; ++ trans_for_each_iter(trans, iter) ++ if (iter->l[level].b == b) { ++ btree_node_unlock(iter, level); ++ iter->l[level].b = BTREE_ITER_NO_NODE_DROP; + } + } + +@@ -1107,13 +1104,12 @@ void bch2_btree_iter_node_drop(struct btree_trans *trans, + * A btree node has been modified in such a way as to invalidate iterators - fix + * them: + */ +-void bch2_btree_iter_reinit_node(struct btree_trans *trans, +- struct btree_iter *iter, struct btree *b) ++void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) + { +- struct btree_iter *linked; ++ struct btree_iter *iter; + +- trans_for_each_iter_with_node(trans, b, linked) +- __btree_iter_level_init(linked, b->c.level); ++ trans_for_each_iter_with_node(trans, b, iter) ++ __btree_iter_level_init(iter, b->c.level); + } + + static int lock_root_check_fn(struct six_lock *lock, void *p) +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index b6354782a2d0..b56fff83f6e7 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -129,7 +129,7 @@ static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, + static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} + #endif + +-void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, struct btree_iter *, ++void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, + struct btree *, struct bkey_packed *); + void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_iter *, + struct btree *, struct btree_node_iter *, +@@ -174,13 +174,9 @@ static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) + + void bch2_trans_downgrade(struct btree_trans *); + +-void bch2_btree_iter_node_replace(struct btree_trans *trans, +- struct btree_iter *, struct btree *); +-void bch2_btree_iter_node_drop(struct btree_trans *, +- struct btree_iter *, struct btree *); +- +-void bch2_btree_iter_reinit_node(struct btree_trans *, +- struct btree_iter *, struct btree *); ++void bch2_trans_node_add(struct btree_trans *trans, struct btree *); ++void bch2_trans_node_drop(struct btree_trans *, struct btree *); ++void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); + + int __must_check bch2_btree_iter_traverse(struct btree_iter *); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index d18d539bcc8e..3de8cf909150 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -161,14 +161,13 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) + } + + static void bch2_btree_node_free_inmem(struct btree_trans *trans, +- struct btree_iter *iter, + struct btree *b) + { + struct bch_fs *c = trans->c; +- struct btree_iter *linked; ++ struct btree_iter *iter; + +- trans_for_each_iter(trans, linked) +- BUG_ON(linked->l[b->c.level].b == b); ++ trans_for_each_iter(trans, iter) ++ BUG_ON(iter->l[b->c.level].b == b); + + six_lock_write(&b->c.lock, NULL, NULL); + __btree_node_free(c, b); +@@ -1431,12 +1430,12 @@ 
static void btree_split(struct btree_update *as, + /* Successful split, update the iterator to point to the new nodes: */ + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); +- bch2_btree_iter_node_drop(trans, iter, b); ++ bch2_trans_node_drop(trans, b); + if (n3) +- bch2_btree_iter_node_replace(trans, iter, n3); ++ bch2_trans_node_add(trans, n3); + if (n2) +- bch2_btree_iter_node_replace(trans, iter, n2); +- bch2_btree_iter_node_replace(trans, iter, n1); ++ bch2_trans_node_add(trans, n2); ++ bch2_trans_node_add(trans, n1); + + /* + * The old node must be freed (in memory) _before_ unlocking the new +@@ -1444,7 +1443,7 @@ static void btree_split(struct btree_update *as, + * node after another thread has locked and updated the new node, thus + * seeing stale data: + */ +- bch2_btree_node_free_inmem(trans, iter, b); ++ bch2_btree_node_free_inmem(trans, b); + + if (n3) + six_unlock_intent(&n3->c.lock); +@@ -1527,7 +1526,7 @@ static void bch2_btree_insert_node(struct btree_update *as, + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) +- bch2_btree_iter_reinit_node(trans, iter, b); ++ bch2_trans_node_reinit_iter(trans, b); + + bch2_btree_node_unlock_write(trans, iter, b); + +@@ -1702,15 +1701,15 @@ retry: + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); + six_lock_increment(&m->c.lock, SIX_LOCK_intent); +- bch2_btree_iter_node_drop(trans, iter, b); +- bch2_btree_iter_node_drop(trans, iter, m); ++ bch2_trans_node_drop(trans, b); ++ bch2_trans_node_drop(trans, m); + +- bch2_btree_iter_node_replace(trans, iter, n); ++ bch2_trans_node_add(trans, n); + + bch2_btree_trans_verify_iters(trans, n); + +- bch2_btree_node_free_inmem(trans, iter, b); +- bch2_btree_node_free_inmem(trans, iter, m); ++ bch2_btree_node_free_inmem(trans, b); ++ bch2_btree_node_free_inmem(trans, m); + + six_unlock_intent(&n->c.lock); + +@@ -1798,9 +1797,9 @@ retry: + bch2_btree_update_get_open_buckets(as, n); + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); +- bch2_btree_iter_node_drop(trans, iter, b); +- bch2_btree_iter_node_replace(trans, iter, n); +- bch2_btree_node_free_inmem(trans, iter, b); ++ bch2_trans_node_drop(trans, b); ++ bch2_trans_node_add(trans, n); ++ bch2_btree_node_free_inmem(trans, b); + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index aabf408a5114..797f1090323a 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -56,14 +56,14 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + + if (unlikely(btree_node_just_written(b)) && + bch2_btree_post_write_cleanup(c, b)) +- bch2_btree_iter_reinit_node(trans, iter, b); ++ bch2_trans_node_reinit_iter(trans, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) +- bch2_btree_init_next(trans, iter, b); ++ bch2_btree_init_next(trans, b); + } + + /* Inserting into a given leaf node (last stage of insert): */ +@@ -85,7 +85,6 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, + EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(trans->c, b)); +- EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) +@@ -112,7 +111,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, + bch2_bset_delete(b, k, clobber_u64s); + goto 
fix_iter; + } else { +- bch2_btree_iter_fix_key_modified(trans, iter, b, k); ++ bch2_btree_iter_fix_key_modified(trans, b, k); + } + + return true; +@@ -130,7 +129,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, + clobber_u64s = k->u64s; + goto overwrite; + } else { +- bch2_btree_iter_fix_key_modified(trans, iter, b, k); ++ bch2_btree_iter_fix_key_modified(trans, b, k); + } + } + +@@ -220,7 +219,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) +- bch2_btree_iter_reinit_node(trans, iter, b); ++ bch2_trans_node_reinit_iter(trans, b); + + trace_btree_insert_key(c, b, insert); + return true; +-- +cgit v1.2.3 + + +From 1298a53dc1e41be57c4118d5e84226ac976ffde6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Aug 2021 14:45:11 -0400 +Subject: bcachefs: More renaming + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 6 +++--- + fs/bcachefs/btree_iter.h | 10 +++++----- + fs/bcachefs/btree_update_interior.c | 8 ++++---- + fs/bcachefs/btree_update_leaf.c | 4 ++-- + 4 files changed, 14 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index b5fed89254ac..39746dfbf227 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -241,7 +241,7 @@ static inline bool btree_iter_get_locks(struct btree_trans *trans, + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) + iter->uptodate = BTREE_ITER_NEED_PEEK; + +- bch2_btree_trans_verify_locks(trans); ++ bch2_trans_verify_locks(trans); + + return iter->uptodate < BTREE_ITER_NEED_RELOCK; + } +@@ -382,7 +382,7 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter) + } + } + +-void bch2_btree_trans_verify_locks(struct btree_trans *trans) ++void bch2_trans_verify_locks(struct btree_trans *trans) + { + struct btree_iter *iter; + +@@ -708,7 +708,7 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) + bkey_cmp(iter->pos, iter->k.p) > 0)); + } + +-void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) ++void bch2_trans_verify_iters(struct btree_trans *trans, struct btree *b) + { + struct btree_iter *iter; + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index b56fff83f6e7..55b20aed7b59 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -121,12 +121,12 @@ __trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, + (_iter)->idx + 1)) + + #ifdef CONFIG_BCACHEFS_DEBUG +-void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); +-void bch2_btree_trans_verify_locks(struct btree_trans *); ++void bch2_trans_verify_iters(struct btree_trans *, struct btree *); ++void bch2_trans_verify_locks(struct btree_trans *); + #else +-static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, +- struct btree *b) {} +-static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} ++static inline void bch2_trans_verify_iters(struct btree_trans *trans, ++ struct btree *b) {} ++static inline void bch2_trans_verify_locks(struct btree_trans *iter) {} + #endif + + void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 3de8cf909150..0afd26083d6d 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1451,7 +1451,7 @@ static void btree_split(struct btree_update *as, + 
six_unlock_intent(&n2->c.lock); + six_unlock_intent(&n1->c.lock); + +- bch2_btree_trans_verify_locks(trans); ++ bch2_trans_verify_locks(trans); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], + start_time); +@@ -1474,7 +1474,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, + trans_for_each_iter_with_node(trans, b, linked) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); + +- bch2_btree_trans_verify_iters(trans, b); ++ bch2_trans_verify_iters(trans, b); + } + + /** +@@ -1706,7 +1706,7 @@ retry: + + bch2_trans_node_add(trans, n); + +- bch2_btree_trans_verify_iters(trans, n); ++ bch2_trans_verify_iters(trans, n); + + bch2_btree_node_free_inmem(trans, b); + bch2_btree_node_free_inmem(trans, m); +@@ -1715,7 +1715,7 @@ retry: + + bch2_btree_update_done(as); + out: +- bch2_btree_trans_verify_locks(trans); ++ bch2_trans_verify_locks(trans); + bch2_trans_iter_free(trans, sib_iter); + + /* +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 797f1090323a..5ecbbc7bcb4a 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -580,7 +580,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + } + btree_insert_entry_checks(trans, i); + } +- bch2_btree_trans_verify_locks(trans); ++ bch2_trans_verify_locks(trans); + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) +@@ -816,7 +816,7 @@ retry: + ret = do_bch2_trans_commit(trans, &i, _RET_IP_); + + /* make sure we didn't drop or screw up locks: */ +- bch2_btree_trans_verify_locks(trans); ++ bch2_trans_verify_locks(trans); + + if (ret) + goto err; +-- +cgit v1.2.3 + + +From acfb10f4a0b378e4c453cc551cab0589a79bd1dc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Aug 2021 14:56:41 -0400 +Subject: bcachefs: Btree iterators are always sorted + +No need to actually resort, can just replace it with an debug assert. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 50 +++--------------------------------------------- + 1 file changed, 3 insertions(+), 47 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 39746dfbf227..3df15988a4d4 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -18,8 +18,8 @@ + #include + + static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); +-static void btree_trans_sort_iters(struct btree_trans *); + static void btree_iter_check_sort(struct btree_trans *, struct btree_iter *); ++static inline void btree_trans_verify_sorted(struct btree_trans *); + static struct btree_iter *btree_iter_child_alloc(struct btree_trans *, + struct btree_iter *, unsigned long); + static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *, +@@ -1299,7 +1299,7 @@ retry_all: + trans_for_each_iter(trans, iter) + iter->should_be_locked = false; + +- btree_trans_sort_iters(trans); ++ btree_trans_verify_sorted(trans); + + for (i = trans->nr_sorted - 2; i >= 0; --i) { + struct btree_iter *iter1 = trans->iters + trans->sorted[i]; +@@ -2073,50 +2073,6 @@ static inline void btree_iter_swap(struct btree_trans *trans, + btree_iter_verify_sorted_ref(trans, r); + } + +-static void btree_trans_sort_iters(struct btree_trans *trans) +-{ +- bool swapped = false; +- int i, l = 0, r = trans->nr_sorted; +- +- while (1) { +- for (i = l; i + 1 < r; i++) { +- if (btree_iter_cmp(trans->iters + trans->sorted[i], +- trans->iters + trans->sorted[i + 1]) > 0) { +- swap(trans->sorted[i], trans->sorted[i + 1]); +- trans->iters[trans->sorted[i]].sorted_idx = i; +- trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1; +- swapped = true; +- } +- } +- +- if (!swapped) +- break; +- +- r--; +- swapped = false; +- +- for (i = r - 2; i >= l; --i) { +- if (btree_iter_cmp(trans->iters + trans->sorted[i], +- trans->iters + trans->sorted[i + 1]) > 0) { +- swap(trans->sorted[i], +- trans->sorted[i + 1]); +- trans->iters[trans->sorted[i]].sorted_idx = i; +- trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1; +- swapped = true; +- } +- } +- +- if (!swapped) +- break; +- +- l++; +- swapped = false; +- } +- +- btree_trans_verify_sorted_refs(trans); +- btree_trans_verify_sorted(trans); +-} +- + static void btree_iter_check_sort(struct btree_trans *trans, struct btree_iter *iter) + { + struct btree_iter *n; +@@ -2276,7 +2232,7 @@ void bch2_dump_trans_iters_updates(struct btree_trans *trans) + struct btree_insert_entry *i; + char buf1[300], buf2[100]; + +- btree_trans_sort_iters(trans); ++ btree_trans_verify_sorted(trans); + + trans_for_each_iter_inorder(trans, iter) + printk(KERN_ERR "iter: btree %s pos %s real_pos %s%s%s%s %pS\n", +-- +cgit v1.2.3 + + +From c647bad97780b03a463ff0c5d606f8a42e737699 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Aug 2021 15:18:31 -0400 +Subject: bcachefs: Kill bpos_diff() + +This improves the btree iterator lookup code by using +trans_for_each_iter_inorder(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.h | 31 ------------------------------- + fs/bcachefs/btree_iter.c | 33 +++++++++++++++++---------------- + 2 files changed, 17 insertions(+), 47 deletions(-) + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 2e45d88fab03..c4a66f28ef4b 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -163,37 +163,6 @@ static inline struct bpos bpos_max(struct bpos l, struct bpos r) + return bpos_cmp(l, r) > 0 ? 
l : r; + } + +-#define sbb(a, b, borrow) \ +-do { \ +- typeof(a) d1, d2; \ +- \ +- d1 = a - borrow; \ +- borrow = d1 > a; \ +- \ +- d2 = d1 - b; \ +- borrow += d2 > d1; \ +- a = d2; \ +-} while (0) +- +-/* returns a - b: */ +-static inline struct bpos bpos_sub(struct bpos a, struct bpos b) +-{ +- int borrow = 0; +- +- sbb(a.snapshot, b.snapshot, borrow); +- sbb(a.offset, b.offset, borrow); +- sbb(a.inode, b.inode, borrow); +- return a; +-} +- +-static inline struct bpos bpos_diff(struct bpos l, struct bpos r) +-{ +- if (bpos_cmp(l, r) > 0) +- swap(l, r); +- +- return bpos_sub(r, l); +-} +- + void bch2_bpos_swab(struct bpos *); + void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 3df15988a4d4..b406afcb54a4 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2306,8 +2306,9 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + unsigned depth, + unsigned flags) + { +- struct btree_iter *iter, *best = NULL; ++ struct btree_iter *iter, *list_pos = NULL, *best = NULL; + struct bpos real_pos, pos_min = POS_MIN; ++ int cmp; + + EBUG_ON(trans->restarted); + +@@ -2331,27 +2332,27 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + bkey_cmp(pos, POS_MAX)) + real_pos = bpos_nosnap_successor(pos); + +- trans_for_each_iter(trans, iter) { +- if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) +- continue; ++ trans_for_each_iter_inorder(trans, iter) { ++ list_pos = iter; + +- if (iter->btree_id != btree_id) ++ if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE) || ++ iter->btree_id != btree_id) + continue; + +- if (best) { +- int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos), +- bpos_diff(iter->real_pos, real_pos)); +- +- if (cmp < 0 || +- ((cmp == 0 && btree_iter_keep(trans, iter)))) +- continue; +- } +- +- best = iter; ++ /* ++ * Since advancing iterators is cheaper than rewinding them, we ++ * prefer a path <= the search pos ++ */ ++ cmp = bpos_cmp(iter->real_pos, real_pos) ?: ++ cmp_int(iter->level, depth); ++ if (!best || cmp <= 0) ++ best = iter; ++ if (cmp >= 0) ++ break; + } + + if (!best) { +- iter = btree_trans_iter_alloc(trans, NULL); ++ iter = btree_trans_iter_alloc(trans, list_pos); + bch2_btree_iter_init(trans, iter, btree_id); + } else if (btree_iter_keep(trans, best)) { + iter = btree_trans_iter_alloc(trans, best); +-- +cgit v1.2.3 + + +From 55585f6667b529d2c24373787a07bf01c219932d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Aug 2021 16:08:34 -0400 +Subject: bcachefs: Prefer using btree_insert_entry to btree_iter + +This moves some data dependencies forward, to improve pipelining. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 3 +- + fs/bcachefs/btree_update_leaf.c | 66 ++++++++++++++++++++--------------------- + fs/bcachefs/buckets.c | 2 +- + 3 files changed, 36 insertions(+), 35 deletions(-) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 423736ea56b4..04ed6ad5be94 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -343,7 +343,8 @@ struct btree_insert_entry { + u8 bkey_type; + enum btree_id btree_id:8; + u8 level; +- unsigned trans_triggers_run:1; ++ bool cached:1; ++ bool trans_triggers_run:1; + struct bkey_i *k; + struct btree_iter *iter; + unsigned long ip_allocated; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5ecbbc7bcb4a..067c9038d2c9 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -29,18 +29,23 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + bpos_cmp(l->k->k.p, r->k->k.p); + } + ++static inline struct btree_iter_level *insert_l(struct btree_insert_entry *i) ++{ ++ return i->iter->l + i->level; ++} ++ + static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) + { + return i != trans->updates && +- iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; ++ insert_l(&i[0])->b == insert_l(&i[-1])->b; + } + + static inline bool same_leaf_as_next(struct btree_trans *trans, + struct btree_insert_entry *i) + { + return i + 1 < trans->updates + trans->nr_updates && +- iter_l(i[0].iter)->b == iter_l(i[1].iter)->b; ++ insert_l(&i[0])->b == insert_l(&i[1])->b; + } + + inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, +@@ -183,22 +188,21 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, + * btree_insert_key - insert a key one key into a leaf node + */ + static bool btree_insert_key_leaf(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_i *insert) ++ struct btree_insert_entry *insert) + { + struct bch_fs *c = trans->c; +- struct btree *b = iter_l(iter)->b; ++ struct btree *b = insert_l(insert)->b; + struct bset_tree *t = bset_tree_last(b); + struct bset *i = bset(b, t); + int old_u64s = bset_u64s(t); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + +- EBUG_ON(!iter->level && ++ EBUG_ON(!insert->level && + !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); + +- if (unlikely(!bch2_btree_bset_insert_key(trans, iter, b, +- &iter_l(iter)->iter, insert))) ++ if (unlikely(!bch2_btree_bset_insert_key(trans, insert->iter, b, ++ &insert_l(insert)->iter, insert->k))) + return false; + + i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, +@@ -221,7 +225,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); + +- trace_btree_insert_key(c, b, insert); ++ trace_btree_insert_key(c, b, insert->k); + return true; + } + +@@ -274,13 +278,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, + return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; + } + +-static enum btree_insert_ret ++static inline enum btree_insert_ret + btree_key_can_insert(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree *b, + unsigned u64s) + { + struct bch_fs *c = trans->c; +- struct btree *b = iter_l(iter)->b; + + if (!bch2_btree_node_insert_fits(c, b, u64s)) + return BTREE_INSERT_BTREE_NODE_FULL; +@@ -297,7 +300,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, + unsigned new_u64s; + struct bkey_i *new_k; + +- BUG_ON(iter->level); ++ EBUG_ON(iter->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(trans->c) && +@@ -335,8 +338,8 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + + i->k->k.needs_whiteout = false; + +- did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED) +- ? btree_insert_key_leaf(trans, i->iter, i->k) ++ did_work = !i->cached ++ ? btree_insert_key_leaf(trans, i) + : bch2_btree_insert_key_cached(trans, i->iter, i->k); + if (!did_work) + return; +@@ -364,9 +367,9 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + /* + * XXX: synchronization of cached update triggers with gc + */ +- BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); ++ BUG_ON(i->cached || i->level); + +- if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) ++ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) + bch2_mark_update(trans, i->iter, i->k, + i->flags|BTREE_TRIGGER_GC); + } +@@ -412,8 +415,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + u64s = 0; + + u64s += i->k->k.u64s; +- ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED +- ? btree_key_can_insert(trans, i->iter, u64s) ++ ret = !i->cached ++ ? btree_key_can_insert(trans, insert_l(i)->b, u64s) + : btree_key_can_insert_cached(trans, i->iter, u64s); + if (ret) { + *stopped_at = i; +@@ -473,8 +476,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) +- bch2_mark_update(trans, i->iter, i->k, +- i->flags); ++ bch2_mark_update(trans, i->iter, i->k, i->flags); + + if (marking && trans->fs_usage_deltas) + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); +@@ -524,7 +526,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { + ret = bch2_foreground_maybe_merge(trans, i->iter, +- i->iter->level, trans->flags); ++ i->level, trans->flags); + if (unlikely(ret)) + return ret; + } +@@ -585,14 +587,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_lock_for_insert(trans, i->iter, +- iter_l(i->iter)->b); ++ insert_l(i)->b); + + ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->iter, +- iter_l(i->iter)->b); ++ insert_l(i)->b); + + if (!ret && trans->journal_pin) + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, +@@ -637,8 +639,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + + if (ret == -EINTR) + trace_trans_restart_btree_node_split(trans->ip, trace_ip, +- i->iter->btree_id, +- &i->iter->real_pos); ++ i->btree_id, &i->iter->real_pos); + break; + case BTREE_INSERT_NEED_MARK_REPLICAS: + bch2_trans_unlock(trans); +@@ -747,7 +748,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + + #ifdef 
CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) +- if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && ++ if (!i->cached && + !(i->flags & BTREE_TRIGGER_NORUN)) + bch2_btree_key_cache_verify_clean(trans, + i->btree_id, i->k->k.p); +@@ -771,7 +772,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip, _RET_IP_, +- i->iter->btree_id, ++ i->btree_id, + &i->iter->pos); + goto out; + } +@@ -785,8 +786,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + if (unlikely(!bch2_btree_iter_upgrade(trans, i->iter, + i->level + 1))) { + trace_trans_restart_upgrade(trans->ip, _RET_IP_, +- i->iter->btree_id, +- &i->iter->pos); ++ i->btree_id, &i->iter->pos); + trans->restarted = true; + ret = -EINTR; + goto out; +@@ -795,7 +795,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + + u64s = jset_u64s(i->k->k.u64s); +- if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && ++ if (i->cached && + likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) + trans->journal_preres_u64s += u64s; + trans->journal_u64s += u64s; +@@ -980,6 +980,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + .bkey_type = __btree_node_type(iter->level, iter->btree_id), + .btree_id = iter->btree_id, + .level = iter->level, ++ .cached = btree_iter_is_cached(iter), + .iter = iter, + .k = k, + .ip_allocated = _RET_IP_, +@@ -1017,8 +1018,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + * not the key cache, which helps with cache coherency issues in + * other areas: + */ +- if (btree_iter_type(n.iter) == BTREE_ITER_CACHED && +- btree_iter_type(i->iter) != BTREE_ITER_CACHED) { ++ if (n.cached && !i->cached) { + i->k = n.k; + i->flags = n.flags; + } else { +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index c951c1dc10dd..b4942b1f3768 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1286,7 +1286,7 @@ void fs_usage_apply_warn(struct btree_trans *trans, + pr_err("%s", buf); + pr_err("overlapping with"); + +- if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { ++ if (!i->cached) { + struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter); + struct bkey_s_c k; + int ret; +-- +cgit v1.2.3 + + +From 11f039218d52147b5c8124a8c36ef9e3bcda31b4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Aug 2021 17:31:09 -0400 +Subject: bcachefs: Kill BTREE_ITER_NEED_PEEK + +This was used for an optimization that hasn't existing in quite awhile +- iter->uptodate will probably be going away as well. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 15 ++------------- + fs/bcachefs/btree_iter.h | 2 +- + fs/bcachefs/btree_key_cache.c | 2 +- + fs/bcachefs/btree_types.h | 5 ++--- + 4 files changed, 6 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index b406afcb54a4..9e198930dd47 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -239,7 +239,7 @@ static inline bool btree_iter_get_locks(struct btree_trans *trans, + } + + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) +- iter->uptodate = BTREE_ITER_NEED_PEEK; ++ iter->uptodate = BTREE_ITER_UPTODATE; + + bch2_trans_verify_locks(trans); + +@@ -756,8 +756,6 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + + if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0) + bch2_btree_node_iter_advance(&l->iter, l->b); +- +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } + + void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, +@@ -863,11 +861,6 @@ fixup_done: + b, t, k2); + } + } +- +- if (!b->c.level && +- node_iter == &iter->l[0].iter && +- iter_current_key_modified) +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } + + void bch2_btree_node_iter_fix(struct btree_trans *trans, +@@ -1039,8 +1032,6 @@ static inline void __btree_iter_level_init(struct btree_iter *iter, + */ + if (level) + bch2_btree_node_iter_peek(&l->iter, l->b); +- +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } + + static inline void btree_iter_level_init(struct btree_trans *trans, +@@ -1477,7 +1468,7 @@ static int btree_iter_traverse_one(struct btree_trans *trans, + } + } + +- iter->uptodate = BTREE_ITER_NEED_PEEK; ++ iter->uptodate = BTREE_ITER_UPTODATE; + out: + BUG_ON((ret == -EINTR) != !!trans->restarted); + trace_iter_traverse(trans->ip, trace_ip, +@@ -1666,8 +1657,6 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p + out: + if (l != iter->level) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- else +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + + bch2_btree_iter_verify(iter); + #ifdef CONFIG_BCACHEFS_DEBUG +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 55b20aed7b59..e4bfd9e75784 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -159,7 +159,7 @@ static inline bool bch2_btree_iter_upgrade(struct btree_trans *trans, + + return iter->locks_want < new_locks_want + ? 
__bch2_btree_iter_upgrade(trans, iter, new_locks_want) +- : iter->uptodate <= BTREE_ITER_NEED_PEEK; ++ : iter->uptodate == BTREE_ITER_UPTODATE; + } + + void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 568c1f2704c2..924b67e79805 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -339,7 +339,7 @@ fill: + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + +- iter->uptodate = BTREE_ITER_NEED_PEEK; ++ iter->uptodate = BTREE_ITER_UPTODATE; + + if ((iter->flags & BTREE_ITER_INTENT) && + !bch2_btree_iter_upgrade(trans, iter, 1)) { +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 04ed6ad5be94..e3936a976347 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -218,9 +218,8 @@ enum btree_iter_type { + + enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, +- BTREE_ITER_NEED_PEEK = 1, +- BTREE_ITER_NEED_RELOCK = 2, +- BTREE_ITER_NEED_TRAVERSE = 3, ++ BTREE_ITER_NEED_RELOCK = 1, ++ BTREE_ITER_NEED_TRAVERSE = 2, + }; + + #define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) +-- +cgit v1.2.3 + + +From ffa8c7ed111ecde67cddbab24ce123c28765c723 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 30 Aug 2021 15:54:41 -0400 +Subject: bcachefs: Kill BTREE_ITER_NODES + +We really only need to distinguish between btree iterators and btree key +cache iterators - this is more prep work for btree_path. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 173 +++++++++++++++++----------------------- + fs/bcachefs/btree_types.h | 44 ++++------ + fs/bcachefs/btree_update_leaf.c | 10 +-- + 3 files changed, 94 insertions(+), 133 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 9e198930dd47..b27bc9002f93 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -29,15 +29,14 @@ static void btree_iter_copy(struct btree_trans *, struct btree_iter *, struct bt + static inline int btree_iter_cmp(const struct btree_iter *l, + const struct btree_iter *r) + { +- return cmp_int(l->btree_id, r->btree_id) ?: +- -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: +- bkey_cmp(l->real_pos, r->real_pos); ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ -cmp_int(l->cached, r->cached) ?: ++ bkey_cmp(l->real_pos, r->real_pos) ?: ++ -cmp_int(l->level, r->level); + } + + static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) + { +- EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); +- + /* Are we iterating over keys in all snapshots? */ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_successor(p); +@@ -51,8 +50,6 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) + + static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) + { +- EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); +- + /* Are we iterating over keys in all snapshots? */ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_predecessor(p); +@@ -211,7 +208,7 @@ static inline bool btree_iter_get_locks(struct btree_trans *trans, + (upgrade + ? 
trace_node_upgrade_fail + : trace_node_relock_fail)(trans->ip, trace_ip, +- btree_iter_type(iter) == BTREE_ITER_CACHED, ++ iter->cached, + iter->btree_id, &iter->real_pos, + l, iter->l[l].lock_seq, + is_btree_node(iter, l) +@@ -247,9 +244,9 @@ static inline bool btree_iter_get_locks(struct btree_trans *trans, + } + + static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, +- enum btree_iter_type type) ++ bool cached) + { +- return type != BTREE_ITER_CACHED ++ return !cached + ? container_of(_b, struct btree, c)->key.k.p + : container_of(_b, struct bkey_cached, c)->key.pos; + } +@@ -302,8 +299,8 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + * Within the same btree, cached iterators come before non + * cached iterators: + */ +- if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) { +- if (btree_iter_is_cached(iter)) { ++ if (linked->cached != iter->cached) { ++ if (iter->cached) { + deadlock_iter = linked; + reason = 4; + } +@@ -323,7 +320,7 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + /* Must lock btree nodes in key order: */ + if (btree_node_locked(linked, level) && + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, +- btree_iter_type(linked))) <= 0) { ++ linked->cached)) <= 0) { + deadlock_iter = linked; + reason = 7; + BUG_ON(trans->in_traverse_all); +@@ -334,10 +331,10 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + trace_trans_restart_would_deadlock(trans->ip, ip, + trans->in_traverse_all, reason, + deadlock_iter->btree_id, +- btree_iter_type(deadlock_iter), ++ deadlock_iter->cached, + &deadlock_iter->real_pos, + iter->btree_id, +- btree_iter_type(iter), ++ iter->cached, + &pos); + btree_trans_restart(trans); + return false; +@@ -406,7 +403,7 @@ bool bch2_btree_iter_relock_intent(struct btree_trans *trans, + l++) { + if (!bch2_btree_node_relock(trans, iter, l)) { + trace_node_relock_fail(trans->ip, _RET_IP_, +- btree_iter_type(iter) == BTREE_ITER_CACHED, ++ iter->cached, + iter->btree_id, &iter->real_pos, + l, iter->l[l].lock_seq, + is_btree_node(iter, l) +@@ -469,7 +466,7 @@ bool __bch2_btree_iter_upgrade(struct btree_trans *trans, + */ + trans_for_each_iter(trans, linked) + if (linked != iter && +- btree_iter_type(linked) == btree_iter_type(iter) && ++ linked->cached == iter->cached && + linked->btree_id == iter->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; +@@ -588,7 +585,7 @@ static void bch2_btree_iter_verify_level(struct btree_trans *trans, + tmp = l->iter; + locked = btree_node_locked(iter, level); + +- if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ if (iter->cached) { + if (!level) + bch2_btree_iter_verify_cached(trans, iter); + return; +@@ -604,13 +601,6 @@ static void bch2_btree_iter_verify_level(struct btree_trans *trans, + + BUG_ON(!btree_iter_pos_in_node(iter, l->b)); + +- /* +- * node iterators don't use leaf node iterator: +- */ +- if (btree_iter_type(iter) == BTREE_ITER_NODES && +- level <= iter->min_depth) +- goto unlock; +- + bch2_btree_node_iter_verify(&l->iter, l->b); + + /* +@@ -634,7 +624,7 @@ static void bch2_btree_iter_verify_level(struct btree_trans *trans, + msg = "after"; + goto err; + } +-unlock: ++ + if (!locked) + btree_node_unlock(iter, level); + return; +@@ -665,7 +655,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + { + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; +- enum btree_iter_type type = btree_iter_type(iter); + unsigned i; + + EBUG_ON(iter->btree_id >= BTREE_ID_NR); +@@ 
-676,14 +665,11 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + +- BUG_ON(type == BTREE_ITER_NODES && +- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); +- +- BUG_ON(type != BTREE_ITER_NODES && ++ BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + +- for (i = 0; i < (type != BTREE_ITER_CACHED ? BTREE_MAX_DEPTH : 1); i++) { ++ for (i = 0; i < (!iter->cached ? BTREE_MAX_DEPTH : 1); i++) { + if (!iter->l[i].b) { + BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i); + break; +@@ -697,15 +683,11 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + + static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) + { +- enum btree_iter_type type = btree_iter_type(iter); +- + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + +- BUG_ON((type == BTREE_ITER_KEYS || +- type == BTREE_ITER_CACHED) && +- (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || +- bkey_cmp(iter->pos, iter->k.p) > 0)); ++ BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || ++ bkey_cmp(iter->pos, iter->k.p) > 0); + } + + void bch2_trans_verify_iters(struct btree_trans *trans, struct btree *b) +@@ -1038,7 +1020,7 @@ static inline void btree_iter_level_init(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b) + { +- BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); ++ BUG_ON(iter->cached); + + btree_iter_verify_new_node(trans, iter, b); + +@@ -1060,7 +1042,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) +- if (btree_iter_type(iter) != BTREE_ITER_CACHED && ++ if (!iter->cached && + btree_iter_pos_in_node(iter, b)) { + /* + * bch2_trans_node_drop() has already been called - +@@ -1412,7 +1394,7 @@ static int btree_iter_traverse_one(struct btree_trans *trans, + goto out; + } + +- if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ if (iter->cached) { + ret = bch2_btree_iter_traverse_cached(trans, iter); + goto out; + } +@@ -1446,8 +1428,8 @@ static int btree_iter_traverse_one(struct btree_trans *trans, + if (unlikely(ret)) { + if (ret == 1) { + /* +- * Got to the end of the btree (in +- * BTREE_ITER_NODES mode) ++ * No nodes at this level - got to the end of ++ * the btree: + */ + ret = 0; + goto out; +@@ -1472,7 +1454,7 @@ static int btree_iter_traverse_one(struct btree_trans *trans, + out: + BUG_ON((ret == -EINTR) != !!trans->restarted); + trace_iter_traverse(trans->ip, trace_ip, +- btree_iter_type(iter) == BTREE_ITER_CACHED, ++ iter->cached, + iter->btree_id, &iter->real_pos, ret); + bch2_btree_iter_verify(iter); + return ret; +@@ -1528,41 +1510,43 @@ bch2_btree_iter_traverse(struct btree_iter *iter) + + struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + { +- struct btree *b; ++ struct btree *b = NULL; + int ret; + +- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); ++ EBUG_ON(iter->cached); + bch2_btree_iter_verify(iter); + + ret = btree_iter_traverse(iter); + if (ret) +- return NULL; ++ goto out; + + b = btree_iter_node(iter, iter->level); + if (!b) +- return NULL; ++ goto out; + + BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); + +- iter->pos = iter->real_pos = b->key.k.p; +- +- bch2_btree_iter_verify(iter); ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = iter->real_pos = b->key.k.p; + iter->should_be_locked = true; 
++out: ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); + + return b; + } + + struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + { +- struct btree *b; ++ struct btree *b = NULL; + int ret; + +- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); ++ EBUG_ON(iter->cached); + bch2_btree_iter_verify(iter); + + /* already got to end? */ + if (!btree_iter_node(iter, iter->level)) +- return NULL; ++ goto out; + + bch2_trans_cond_resched(iter->trans); + +@@ -1573,12 +1557,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + ret = btree_iter_traverse(iter); + if (ret) +- return NULL; ++ goto out; + + /* got to end? */ + b = btree_iter_node(iter, iter->level); + if (!b) +- return NULL; ++ goto out; + + if (bpos_cmp(iter->pos, b->key.k.p) < 0) { + /* +@@ -1595,16 +1579,20 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + bch2_btree_iter_verify(iter); + + ret = btree_iter_traverse(iter); +- if (ret) +- return NULL; ++ if (ret) { ++ b = NULL; ++ goto out; ++ } + + b = iter->l[iter->level].b; + } + +- iter->pos = iter->real_pos = b->key.k.p; +- +- bch2_btree_iter_verify(iter); ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = iter->real_pos = b->key.k.p; + iter->should_be_locked = true; ++out: ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); + + return b; + } +@@ -1630,7 +1618,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p + + btree_iter_check_sort(trans, iter); + +- if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { ++ if (unlikely(iter->cached)) { + btree_node_unlock(iter, 0); + iter->l[0].b = BTREE_ITER_NO_NODE_CACHED; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +@@ -1722,7 +1710,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + struct bkey_s_c k; + int ret; + +- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ EBUG_ON(iter->cached || iter->level); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + +@@ -1812,7 +1800,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + struct bkey_s_c k; + int ret; + +- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ EBUG_ON(iter->cached || iter->level); + EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); +@@ -1879,8 +1867,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + struct bkey_s_c k; + int ret; + +- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS && +- btree_iter_type(iter) != BTREE_ITER_CACHED); ++ EBUG_ON(iter->level); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + +@@ -1900,28 +1887,21 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + if (unlikely(ret)) + return bkey_s_c_err(ret); + +- if (btree_iter_type(iter) == BTREE_ITER_CACHED || +- !(iter->flags & BTREE_ITER_IS_EXTENTS)) { ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { + struct bkey_i *next_update; +- struct bkey_cached *ck; + + next_update = btree_trans_peek_updates(trans, iter); + +- switch (btree_iter_type(iter)) { +- case BTREE_ITER_KEYS: ++ if (!iter->cached) { + k = btree_iter_level_peek_all(iter, &iter->l[0]); + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0); +- break; +- case BTREE_ITER_CACHED: +- ck = (void *) iter->l[0].b; ++ } else { ++ struct bkey_cached *ck = (void *) iter->l[0].b; + EBUG_ON(iter->btree_id != 
ck->key.btree_id || + bkey_cmp(iter->pos, ck->key.pos)); + BUG_ON(!ck->valid); + + k = bkey_i_to_s_c(ck->k); +- break; +- case BTREE_ITER_NODES: +- BUG(); + } + + if (next_update && +@@ -2301,14 +2281,12 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + + EBUG_ON(trans->restarted); + +- if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && +- btree_node_type_is_extents(btree_id) && +- !(flags & BTREE_ITER_NOT_EXTENTS) && +- !(flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && ++ btree_node_type_is_extents(btree_id)) + flags |= BTREE_ITER_IS_EXTENTS; + +- if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && +- !btree_type_has_snapshots(btree_id)) ++ if (!btree_type_has_snapshots(btree_id) && ++ !(flags & __BTREE_ITER_ALL_SNAPSHOTS)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) +@@ -2324,7 +2302,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + trans_for_each_iter_inorder(trans, iter) { + list_pos = iter; + +- if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE) || ++ if (iter->cached != (flags & BTREE_ITER_CACHED) || + iter->btree_id != btree_id) + continue; + +@@ -2353,9 +2331,9 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + trans->iters_live |= 1ULL << iter->idx; + trans->iters_touched |= 1ULL << iter->idx; + +- iter->flags = flags; +- +- iter->snapshot = pos.snapshot; ++ iter->cached = flags & BTREE_ITER_CACHED; ++ iter->flags = flags; ++ iter->snapshot = pos.snapshot; + + /* + * If the iterator has locks_want greater than requested, we explicitly +@@ -2406,8 +2384,8 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + struct btree_iter *iter = + __bch2_trans_get_iter(trans, btree_id, pos, + locks_want, depth, +- BTREE_ITER_NODES| + BTREE_ITER_NOT_EXTENTS| ++ __BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_ALL_SNAPSHOTS| + flags); + +@@ -2660,21 +2638,20 @@ int bch2_trans_exit(struct btree_trans *trans) + static void __maybe_unused + bch2_btree_iter_node_to_text(struct printbuf *out, + struct btree_bkey_cached_common *_b, +- enum btree_iter_type type) ++ bool cached) + { + pr_buf(out, " l=%u %s:", + _b->level, bch2_btree_ids[_b->btree_id]); +- bch2_bpos_to_text(out, btree_node_pos(_b, type)); ++ bch2_bpos_to_text(out, btree_node_pos(_b, cached)); + } + + #ifdef CONFIG_BCACHEFS_DEBUG +-static bool trans_has_btree_nodes_locked(struct btree_trans *trans) ++static bool trans_has_locks(struct btree_trans *trans) + { + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) +- if (btree_iter_type(iter) != BTREE_ITER_CACHED && +- iter->nodes_locked) ++ if (iter->nodes_locked) + return true; + return false; + } +@@ -2690,7 +2667,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { +- if (!trans_has_btree_nodes_locked(trans)) ++ if (!trans_has_locks(trans)) + continue; + + pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); +@@ -2701,7 +2678,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + pr_buf(out, " iter %u %c %s:", + iter->idx, +- btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', ++ iter->cached ? 'c' : 'b', + bch2_btree_ids[iter->btree_id]); + bch2_bpos_to_text(out, iter->pos); + pr_buf(out, "\n"); +@@ -2712,7 +2689,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + btree_node_intent_locked(iter, l) ? 
"i" : "r", l); + bch2_btree_iter_node_to_text(out, + (void *) iter->l[l].b, +- btree_iter_type(iter)); ++ iter->cached); + pr_buf(out, "\n"); + } + } +@@ -2723,7 +2700,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + iter = &trans->iters[trans->locking_iter_idx]; + pr_buf(out, " locking iter %u %c l=%u %s:", + trans->locking_iter_idx, +- btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', ++ iter->cached ? 'c' : 'b', + trans->locking_level, + bch2_btree_ids[trans->locking_btree_id]); + bch2_bpos_to_text(out, trans->locking_pos); +@@ -2731,7 +2708,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + pr_buf(out, " node "); + bch2_btree_iter_node_to_text(out, + (void *) b, +- btree_iter_type(iter)); ++ iter->cached); + pr_buf(out, "\n"); + } + } +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index e3936a976347..cd2b79a04880 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -176,44 +176,38 @@ struct btree_node_iter { + } data[MAX_BSETS]; + }; + +-enum btree_iter_type { +- BTREE_ITER_KEYS, +- BTREE_ITER_NODES, +- BTREE_ITER_CACHED, +-}; +- +-#define BTREE_ITER_TYPE ((1 << 2) - 1) +- + /* + * Iterate over all possible positions, synthesizing deleted keys for holes: + */ +-#define BTREE_ITER_SLOTS (1 << 2) ++#define BTREE_ITER_SLOTS (1 << 0) + /* + * Indicates that intent locks should be taken on leaf nodes, because we expect + * to be doing updates: + */ +-#define BTREE_ITER_INTENT (1 << 3) ++#define BTREE_ITER_INTENT (1 << 1) + /* + * Causes the btree iterator code to prefetch additional btree nodes from disk: + */ +-#define BTREE_ITER_PREFETCH (1 << 4) ++#define BTREE_ITER_PREFETCH (1 << 2) + /* + * Indicates that this iterator should not be reused until transaction commit, + * either because a pending update references it or because the update depends + * on that particular key being locked (e.g. 
by the str_hash code, for hash + * table consistency) + */ +-#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) ++#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3) + /* + * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for + * @pos or the first key strictly greater than @pos + */ +-#define BTREE_ITER_IS_EXTENTS (1 << 6) +-#define BTREE_ITER_NOT_EXTENTS (1 << 7) +-#define BTREE_ITER_ERROR (1 << 8) +-#define BTREE_ITER_CACHED_NOFILL (1 << 9) +-#define BTREE_ITER_CACHED_NOCREATE (1 << 10) +-#define BTREE_ITER_WITH_UPDATES (1 << 11) ++#define BTREE_ITER_IS_EXTENTS (1 << 4) ++#define BTREE_ITER_NOT_EXTENTS (1 << 5) ++#define BTREE_ITER_ERROR (1 << 6) ++#define BTREE_ITER_CACHED (1 << 7) ++#define BTREE_ITER_CACHED_NOFILL (1 << 8) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 9) ++#define BTREE_ITER_WITH_UPDATES (1 << 10) ++#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) + #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) + + enum btree_iter_uptodate { +@@ -256,7 +250,8 @@ struct btree_iter { + struct bpos real_pos; + + enum btree_id btree_id:4; +- enum btree_iter_uptodate uptodate:3; ++ bool cached:1; ++ enum btree_iter_uptodate uptodate:2; + /* + * True if we've returned a key (and thus are expected to keep it + * locked), false after set_pos - for avoiding spurious transaction +@@ -282,17 +277,6 @@ struct btree_iter { + struct bkey k; + }; + +-static inline enum btree_iter_type +-btree_iter_type(const struct btree_iter *iter) +-{ +- return iter->flags & BTREE_ITER_TYPE; +-} +- +-static inline bool btree_iter_is_cached(const struct btree_iter *iter) +-{ +- return btree_iter_type(iter) == BTREE_ITER_CACHED; +-} +- + static inline struct btree_iter_level *iter_l(struct btree_iter *iter) + { + return iter->l + iter->level; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 067c9038d2c9..128d754c1e29 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -56,7 +56,7 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + + bch2_btree_node_lock_write(trans, iter, b); + +- if (btree_iter_type(iter) == BTREE_ITER_CACHED) ++ if (iter->cached) + return; + + if (unlikely(btree_node_just_written(b)) && +@@ -509,10 +509,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + trans_for_each_update(trans, i) { + /* +- * peek_slot() doesn't work on a BTREE_ITER_NODES iter; those +- * iterator types should probably go away ++ * peek_slot() doesn't yet work on iterators that point to ++ * interior nodes: + */ +- if (btree_iter_type(i->iter) != BTREE_ITER_KEYS) ++ if (i->cached || i->level) + continue; + + old = bch2_btree_iter_peek_slot(i->iter); +@@ -980,7 +980,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + .bkey_type = __btree_node_type(iter->level, iter->btree_id), + .btree_id = iter->btree_id, + .level = iter->level, +- .cached = btree_iter_is_cached(iter), ++ .cached = iter->cached, + .iter = iter, + .k = k, + .ip_allocated = _RET_IP_, +-- +cgit v1.2.3 + + +From 7e599ac38701fdbbff141a1f50098f8d01c1eec0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 1 Sep 2021 00:50:18 -0400 +Subject: bcachefs: Add an assertion for removing btree nodes from cache + +Chasing a bug that has something to do with the btree node cache. 
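
For context on the assertion being added (a hedged note: the return-value semantics cited here come from the generic Linux rhashtable API, not from anything bcachefs-specific): rhashtable_remove_fast() returns 0 on success and -ENOENT when the object was not present in the table, so turning the previously ignored return value into a BUG_ON makes removing a node that was never hashed, or was already removed, fail loudly instead of silently. An annotated restatement of the hunk below, not additional patch content:

	/*
	 * Illustrative restatement of the change below, with the relied-upon
	 * semantics spelled out in comments:
	 */
	int ret = rhashtable_remove_fast(&bc->table, &b->hash,
					 bch_btree_cache_params);
	/* ret is 0 on success, -ENOENT if @b was not found in bc->table: */
	BUG_ON(ret);
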
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 3 ++- + fs/bcachefs/btree_update_interior.c | 5 +++-- + 2 files changed, 5 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index b3445b67e981..83a2d0aa8367 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -130,7 +130,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) + + void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) + { +- rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); ++ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); ++ BUG_ON(ret); + + /* Cause future lookups for this node to fail: */ + b->hash_val = 0; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 0afd26083d6d..6e833a3c9e4c 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -153,8 +153,6 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) + + clear_btree_node_noevict(b); + +- bch2_btree_node_hash_remove(&c->btree_cache, b); +- + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); +@@ -170,7 +168,10 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, + BUG_ON(iter->l[b->c.level].b == b); + + six_lock_write(&b->c.lock, NULL, NULL); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); + __btree_node_free(c, b); ++ + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } +-- +cgit v1.2.3 + + +From e0d4d82c487c05d5c5ddce558c30adc272bbef88 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 1 Sep 2021 18:06:01 -0400 +Subject: bcachefs: Improve an error message + +When we detect an invalid key being inserted, we should print what code +was doing the update. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 128d754c1e29..d68a9a51a106 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -577,7 +577,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); +- bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid); ++ bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", ++ buf, (void *) trans->ip, ++ (void *) i->ip_allocated, invalid); + bch2_fatal_error(c); + } + btree_insert_entry_checks(trans, i); +-- +cgit v1.2.3 + + +From 36a6f3ef3d0c9b68782d804f49b69b849771e964 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 3 Sep 2021 17:32:42 -0400 +Subject: bcachefs: Fix initialization of bch_write_op.nonce + +If an extent ends up with a replica that is encrypted an a replica that +isn't encrypted (due the user changing options), and then +copygc/rebalance moves one of the replicas by reading from the +unencrypted replica, we had a bug where we wouldn't correctly initialize +op->nonce - for each crc field in an extent, crc.offset + crc.nonce must +be equal. + +This patch fixes that by moving op.nonce initialization to +bch2_migrate_write_init. 
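
To make the stated invariant concrete, here is a minimal debug-style sketch; the function name is hypothetical and it is not part of this patch, and it reuses only the extent-iteration helpers visible in the hunk below. It asserts that every encrypted crc entry of an extent agrees on crc.offset + crc.nonce:

	/*
	 * Illustrative only: verify the invariant described above - all
	 * encrypted crc entries within one extent must have the same
	 * crc.offset + crc.nonce, so any of them can be used to recover the
	 * nonce for a migrating write.
	 */
	static void check_extent_nonce_consistency(struct bkey_s_c k)
	{
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const union bch_extent_entry *entry;
		struct bch_extent_crc_unpacked crc;
		unsigned nonce = 0;
		bool have_nonce = false;

		bkey_for_each_crc(k.k, ptrs, crc, entry) {
			if (!bch2_csum_type_is_encryption(crc.csum_type))
				continue;

			if (!have_nonce) {
				nonce = crc.nonce + crc.offset;
				have_nonce = true;
			}

			/* A mismatch here is the bug this patch prevents: */
			BUG_ON(crc.nonce + crc.offset != nonce);
		}
	}
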
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 2e5e09350392..45cea9231300 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -221,11 +221,6 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) + m->op.crc = rbio->pick.crc; + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; + +- if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { +- m->op.nonce = m->op.crc.nonce + m->op.crc.offset; +- m->op.csum_type = m->op.crc.csum_type; +- } +- + if (m->data_cmd == DATA_REWRITE) + bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); + } +@@ -240,6 +235,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; + struct extent_ptr_decoded p; + int ret; + +@@ -260,6 +256,18 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + m->op.target = data_opts.target, + m->op.write_point = wp; + ++ /* ++ * op->csum_type is normally initialized from the fs/file's current ++ * options - but if an extent is encrypted, we require that it stays ++ * encrypted: ++ */ ++ bkey_for_each_crc(k.k, ptrs, crc, entry) ++ if (bch2_csum_type_is_encryption(crc.csum_type)) { ++ m->op.nonce = crc.nonce + crc.offset; ++ m->op.csum_type = crc.csum_type; ++ break; ++ } ++ + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { + m->op.alloc_reserve = RESERVE_MOVINGGC; + m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; +-- +cgit v1.2.3 + + +From 20a4bbe16f928fcbfe3f86270d7f43bc5a94704b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 27 Aug 2021 16:30:47 -0400 +Subject: bcachefs: btree_path + +This splits btree_iter into two components: btree_iter is now the +externally visible componont, and it points to a btree_path which is now +reference counted. + +This means we no longer have to clone iterators up front if they might +be mutated - btree_path can be shared by multiple iterators, and cloned +if an iterator would mutate a shared btree_path. This will help us use +iterators more efficiently, as well as slimming down the main long lived +state in btree_trans, and significantly cleans up the logic for iterator +lifetimes. 
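
As a rough mental model of the split described above - schematic only, the field names here are illustrative and are not the real definitions, which live in the btree_types.h changes within this patch:

	/*
	 * Schematic sketch, not the actual bcachefs structures: iterators
	 * become thin and point at a reference-counted btree_path that holds
	 * the per-level node pointers and lock state. A path shared by
	 * several iterators is cloned only when one of them would mutate it.
	 */
	struct btree_path {
		unsigned		ref;	/* iterators using this path */
		/* btree_id, level, position, per-level nodes + locks ... */
	};

	struct btree_iter {
		struct btree_trans	*trans;
		struct btree_path	*path;	/* shared; cloned on mutation if ref > 1 */
		struct bpos		pos;
		struct bkey		k;	/* key returned to the caller */
	};
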
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 41 +- + fs/bcachefs/alloc_background.c | 55 +- + fs/bcachefs/bcachefs.h | 8 +- + fs/bcachefs/bset.c | 4 +- + fs/bcachefs/btree_cache.c | 36 +- + fs/bcachefs/btree_cache.h | 7 +- + fs/bcachefs/btree_gc.c | 36 +- + fs/bcachefs/btree_iter.c | 2099 ++++++++++++++++++----------------- + fs/bcachefs/btree_iter.h | 257 ++--- + fs/bcachefs/btree_key_cache.c | 135 ++- + fs/bcachefs/btree_key_cache.h | 5 +- + fs/bcachefs/btree_locking.h | 117 +- + fs/bcachefs/btree_types.h | 92 +- + fs/bcachefs/btree_update.h | 21 +- + fs/bcachefs/btree_update_interior.c | 217 ++-- + fs/bcachefs/btree_update_interior.h | 20 +- + fs/bcachefs/btree_update_leaf.c | 212 ++-- + fs/bcachefs/buckets.c | 176 +-- + fs/bcachefs/buckets.h | 6 +- + fs/bcachefs/debug.c | 32 +- + fs/bcachefs/dirent.c | 77 +- + fs/bcachefs/dirent.h | 3 +- + fs/bcachefs/ec.c | 76 +- + fs/bcachefs/extent_update.c | 10 +- + fs/bcachefs/extents.c | 4 +- + fs/bcachefs/fs-common.c | 113 +- + fs/bcachefs/fs-io.c | 82 +- + fs/bcachefs/fs.c | 44 +- + fs/bcachefs/fsck.c | 159 +-- + fs/bcachefs/inode.c | 61 +- + fs/bcachefs/inode.h | 8 +- + fs/bcachefs/io.c | 98 +- + fs/bcachefs/journal_seq_blacklist.c | 4 +- + fs/bcachefs/migrate.c | 26 +- + fs/bcachefs/move.c | 62 +- + fs/bcachefs/quota.c | 20 +- + fs/bcachefs/recovery.c | 34 +- + fs/bcachefs/reflink.c | 76 +- + fs/bcachefs/str_hash.h | 65 +- + fs/bcachefs/super.c | 8 +- + fs/bcachefs/sysfs.c | 3 +- + fs/bcachefs/tests.c | 189 ++-- + fs/bcachefs/xattr.c | 26 +- + include/trace/events/bcachefs.h | 4 +- + 44 files changed, 2389 insertions(+), 2439 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index 1d3887306eb0..828915145ade 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -218,7 +218,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter = { NULL }; + struct bkey_s_c_xattr xattr; + struct posix_acl *acl = NULL; + struct bkey_s_c k; +@@ -231,20 +231,19 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) + retry: + bch2_trans_begin(&trans); + +- iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, ++ ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, + &hash, inode->v.i_ino, + &X_SEARCH(acl_to_xattr_type(type), "", 0), + 0); +- if (IS_ERR(iter)) { +- if (PTR_ERR(iter) == -EINTR) ++ if (ret) { ++ if (ret == -EINTR) + goto retry; +- +- if (PTR_ERR(iter) != -ENOENT) +- acl = ERR_CAST(iter); ++ if (ret != -ENOENT) ++ acl = ERR_PTR(ret); + goto out; + } + +- k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) { + acl = ERR_PTR(ret); +@@ -257,8 +256,8 @@ retry: + + if (!IS_ERR(acl)) + set_cached_acl(&inode->v, type, acl); +- bch2_trans_iter_put(&trans, iter); + out: ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return acl; + } +@@ -299,7 +298,7 @@ int bch2_set_acl(struct user_namespace *mnt_userns, + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; +- struct btree_iter *inode_iter; ++ struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; + struct bch_hash_info hash_info; + struct posix_acl *acl; +@@ -312,9 +311,8 @@ retry: + bch2_trans_begin(&trans); + acl = _acl; + +- inode_iter = 
bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, +- BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(inode_iter); ++ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); + if (ret) + goto btree_err; + +@@ -335,11 +333,11 @@ retry: + inode_u.bi_ctime = bch2_current_time(c); + inode_u.bi_mode = mode; + +- ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, 0); + btree_err: +- bch2_trans_iter_put(&trans, inode_iter); ++ bch2_trans_iter_exit(&trans, &inode_iter); + + if (ret == -EINTR) + goto retry; +@@ -363,22 +361,21 @@ int bch2_acl_chmod(struct btree_trans *trans, + struct posix_acl **new_acl) + { + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c_xattr xattr; + struct bkey_i_xattr *new; + struct posix_acl *acl; + struct bkey_s_c k; + int ret; + +- iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, ++ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash_info, inode->bi_inum, + &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), + BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret == -ENOENT ? 0 : ret; + +- k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_slot(&iter); + xattr = bkey_s_c_to_xattr(k); + if (ret) + goto err; +@@ -399,12 +396,12 @@ int bch2_acl_chmod(struct btree_trans *trans, + goto err; + } + +- new->k.p = iter->pos; +- ret = bch2_trans_update(trans, iter, &new->k_i, 0); ++ new->k.p = iter.pos; ++ ret = bch2_trans_update(trans, &iter, &new->k_i, 0); + *new_acl = acl; + acl = NULL; + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + if (!IS_ERR_OR_NULL(acl)) + kfree(acl); + return ret; +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index b553b6c93568..897729918b99 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -353,32 +353,32 @@ err: + int bch2_alloc_write(struct bch_fs *c, unsigned flags) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bch_dev *ca; + unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN, +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + for_each_member_device(ca, c, i) { +- bch2_btree_iter_set_pos(iter, ++ bch2_btree_iter_set_pos(&iter, + POS(ca->dev_idx, ca->mi.first_bucket)); + +- while (iter->pos.offset < ca->mi.nbuckets) { ++ while (iter.pos.offset < ca->mi.nbuckets) { + bch2_trans_cond_resched(&trans); + +- ret = bch2_alloc_write_key(&trans, iter, flags); ++ ret = bch2_alloc_write_key(&trans, &iter, flags); + if (ret) { + percpu_ref_put(&ca->ref); + goto err; + } +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + } + } + err: +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -390,18 +390,18 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bucket *g; + struct bkey_alloc_buf *a; + struct bkey_alloc_unpacked u; + u64 *time, now; + int ret = 0; + +- 
iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr), +- BTREE_ITER_CACHED| +- BTREE_ITER_CACHED_NOFILL| +- BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + +@@ -412,7 +412,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + + percpu_down_read(&c->mark_lock); + g = bucket(ca, bucket_nr); +- u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); ++ u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark)); + percpu_up_read(&c->mark_lock); + + time = rw == READ ? &u.read_time : &u.write_time; +@@ -423,10 +423,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + *time = now; + + bch2_alloc_pack(c, a, u); +- ret = bch2_trans_update(trans, iter, &a->k, 0) ?: ++ ret = bch2_trans_update(trans, &iter, &a->k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + out: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -695,27 +695,28 @@ static int bucket_invalidate_btree(struct btree_trans *trans, + struct bkey_alloc_unpacked u; + struct bucket *g; + struct bucket_mark m; +- struct btree_iter *iter = +- bch2_trans_get_iter(trans, BTREE_ID_alloc, +- POS(ca->dev_idx, b), +- BTREE_ITER_CACHED| +- BTREE_ITER_CACHED_NOFILL| +- BTREE_ITER_INTENT); ++ struct btree_iter iter; + int ret; + ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, b), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ + a = bch2_trans_kmalloc(trans, sizeof(*a)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + +- ret = bch2_btree_iter_traverse(iter); ++ ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto err; + + percpu_down_read(&c->mark_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); +- u = alloc_mem_to_key(iter, g, m); ++ u = alloc_mem_to_key(&iter, g, m); + percpu_up_read(&c->mark_lock); + + u.gen++; +@@ -726,10 +727,10 @@ static int bucket_invalidate_btree(struct btree_trans *trans, + u.write_time = atomic64_read(&c->io_clock[WRITE].now); + + bch2_alloc_pack(c, a, u); +- ret = bch2_trans_update(trans, iter, &a->k, ++ ret = bch2_trans_update(trans, &iter, &a->k, + BTREE_TRIGGER_BUCKET_INVALIDATE); + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index f7d64eb8b0b8..94c73f28398f 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -558,8 +558,8 @@ struct journal_keys { + u64 journal_seq_base; + }; + +-struct btree_iter_buf { +- struct btree_iter *iter; ++struct btree_path_buf { ++ struct btree_path *path; + }; + + #define REPLICAS_DELTA_LIST_MAX (1U << 16) +@@ -667,9 +667,9 @@ struct bch_fs { + /* btree_iter.c: */ + struct mutex btree_trans_lock; + struct list_head btree_trans_list; +- mempool_t btree_iters_pool; ++ mempool_t btree_paths_pool; + mempool_t btree_trans_mem_pool; +- struct btree_iter_buf __percpu *btree_iters_bufs; ++ struct btree_path_buf __percpu *btree_paths_bufs; + + struct srcu_struct btree_trans_barrier; + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 8baada315cae..a4e0d149e1dc 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -197,9 +197,11 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, + return; + + /* Verify no duplicates: */ +- 
btree_node_iter_for_each(iter, set) ++ btree_node_iter_for_each(iter, set) { ++ BUG_ON(set->k > set->end); + btree_node_iter_for_each(iter, s2) + BUG_ON(set != s2 && set->end == s2->end); ++ } + + /* Verify that set->end is correct: */ + btree_node_iter_for_each(iter, set) { +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 83a2d0aa8367..3b08b32f00a4 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -642,7 +642,7 @@ err: + /* Slowpath, don't want it inlined into btree_iter_traverse() */ + static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + const struct bkey_i *k, + enum btree_id btree_id, + unsigned level, +@@ -658,7 +658,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + * Parent node must be locked, else we could read in a btree node that's + * been freed: + */ +- if (trans && !bch2_btree_node_relock(trans, iter, level + 1)) { ++ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } +@@ -700,7 +700,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + + if (trans && + (!bch2_trans_relock(trans) || +- !bch2_btree_iter_relock_intent(trans, iter))) { ++ !bch2_btree_path_relock_intent(trans, path))) { + BUG_ON(!trans->restarted); + return ERR_PTR(-EINTR); + } +@@ -764,7 +764,7 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b) + * The btree node will have either a read or a write lock held, depending on + * the @write parameter. + */ +-struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter *iter, ++struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) +@@ -789,7 +789,7 @@ retry: + * else we could read in a btree node from disk that's been + * freed: + */ +- b = bch2_btree_node_fill(c, trans, iter, k, iter->btree_id, ++ b = bch2_btree_node_fill(c, trans, path, k, path->btree_id, + level, lock_type, true); + + /* We raced and found the btree node in the cache */ +@@ -828,10 +828,10 @@ lock_node: + * the parent was modified, when the pointer to the node we want + * was removed - and we'll bail out: + */ +- if (btree_node_read_locked(iter, level + 1)) +- btree_node_unlock(iter, level + 1); ++ if (btree_node_read_locked(path, level + 1)) ++ btree_node_unlock(path, level + 1); + +- if (!btree_node_lock(trans, iter, b, k->k.p, level, lock_type, ++ if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type, + lock_node_check_fn, (void *) k, trace_ip)) { + if (!trans->restarted) + goto retry; +@@ -842,13 +842,13 @@ lock_node: + b->c.level != level || + race_fault())) { + six_unlock_type(&b->c.lock, lock_type); +- if (bch2_btree_node_relock(trans, iter, level + 1)) ++ if (bch2_btree_node_relock(trans, path, level + 1)) + goto retry; + + trace_trans_restart_btree_node_reused(trans->ip, + trace_ip, +- iter->btree_id, +- &iter->real_pos); ++ path->btree_id, ++ &path->pos); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } +@@ -863,12 +863,12 @@ lock_node: + bch2_btree_node_wait_on_read(b); + + /* +- * should_be_locked is not set on this iterator yet, so we need +- * to relock it specifically: ++ * should_be_locked is not set on this path yet, so we need to ++ * relock it specifically: + */ + if (trans && + (!bch2_trans_relock(trans) || +- 
!bch2_btree_iter_relock_intent(trans, iter))) { ++ !bch2_btree_path_relock_intent(trans, path))) { + BUG_ON(!trans->restarted); + return ERR_PTR(-EINTR); + } +@@ -896,7 +896,7 @@ lock_node: + return ERR_PTR(-EIO); + } + +- EBUG_ON(b->c.btree_id != iter->btree_id); ++ EBUG_ON(b->c.btree_id != path->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); + +@@ -987,21 +987,21 @@ out: + + int bch2_btree_node_prefetch(struct bch_fs *c, + struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) + { + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + +- BUG_ON(trans && !btree_node_locked(iter, level + 1)); ++ BUG_ON(trans && !btree_node_locked(path, level + 1)); + BUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_cache_find(bc, k); + if (b) + return 0; + +- b = bch2_btree_node_fill(c, trans, iter, k, btree_id, ++ b = bch2_btree_node_fill(c, trans, path, k, btree_id, + level, SIX_LOCK_read, false); + return PTR_ERR_OR_ZERO(b); + } +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 6c1c69f3abcf..402cec1802bc 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -22,16 +22,15 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); + struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); + struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); + +-struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *, ++struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, + const struct bkey_i *, unsigned, + enum six_lock_type, unsigned long); + + struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, + enum btree_id, unsigned, bool); + +-int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, +- struct btree_iter *, const struct bkey_i *, +- enum btree_id, unsigned); ++int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *, ++ const struct bkey_i *, enum btree_id, unsigned); + + void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 5757b4a2ace5..307f287d95e6 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -775,7 +775,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + bool initial, bool metadata_only) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct btree *b; + unsigned depth = metadata_only ? 1 + : bch2_expensive_debug_checks ? 
0 +@@ -800,13 +800,13 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + + if (!initial) { + if (max_stale > 64) +- bch2_btree_node_rewrite(&trans, iter, ++ bch2_btree_node_rewrite(&trans, &iter, + b->data->keys.seq, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + else if (!bch2_btree_gc_rewrite_disabled && + (bch2_btree_gc_always_rewrite || max_stale > 16)) +- bch2_btree_node_rewrite(&trans, iter, ++ bch2_btree_node_rewrite(&trans, &iter, + b->data->keys.seq, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); +@@ -814,7 +814,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + + bch2_trans_cond_resched(&trans); + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) +@@ -1414,7 +1414,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + bool metadata_only) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct reflink_gc *r; + size_t idx = 0; +@@ -1480,7 +1480,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + } + } + fsck_err: +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + out: + genradix_free(&c->reflink_gc_table); +@@ -1512,7 +1512,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + bool metadata_only) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct reflink_gc *r; + int ret; +@@ -1547,7 +1547,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + r->size = k.k->size; + r->refcount = 0; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return 0; +@@ -1722,7 +1722,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) + static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_buf sk; + int ret = 0, commit_err = 0; +@@ -1730,13 +1730,13 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, +- BTREE_ITER_PREFETCH| +- BTREE_ITER_NOT_EXTENTS| +- BTREE_ITER_ALL_SNAPSHOTS); ++ bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); + + while ((bch2_trans_begin(&trans), +- k = bch2_btree_iter_peek(iter)).k) { ++ k = bch2_btree_iter_peek(&iter)).k) { + ret = bkey_err(k); + + if (ret == -EINTR) +@@ -1744,7 +1744,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + if (ret) + break; + +- c->gc_gens_pos = iter->pos; ++ c->gc_gens_pos = iter.pos; + + if (gc_btree_gens_key(c, k) && !commit_err) { + bch2_bkey_buf_reassemble(&sk, c, k); +@@ -1752,7 +1752,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + + + commit_err = +- bch2_trans_update(&trans, iter, sk.k, 0) ?: ++ bch2_trans_update(&trans, &iter, sk.k, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_NOFAIL); +@@ -1762,9 +1762,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + } + } + +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + } +- bch2_trans_iter_put(&trans, iter); ++ 
bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index b27bc9002f93..9eec445b7460 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -17,22 +17,31 @@ + #include + #include + +-static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); +-static void btree_iter_check_sort(struct btree_trans *, struct btree_iter *); +-static inline void btree_trans_verify_sorted(struct btree_trans *); +-static struct btree_iter *btree_iter_child_alloc(struct btree_trans *, +- struct btree_iter *, unsigned long); +-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *, +- struct btree_iter *); +-static void btree_iter_copy(struct btree_trans *, struct btree_iter *, struct btree_iter *); ++static void btree_trans_verify_sorted(struct btree_trans *); ++static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int); + +-static inline int btree_iter_cmp(const struct btree_iter *l, +- const struct btree_iter *r) ++static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); ++static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, ++ struct btree_path *); ++ ++static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); ++ ++static inline int __btree_path_cmp(const struct btree_path *l, ++ enum btree_id r_btree_id, ++ bool r_cached, ++ struct bpos r_pos, ++ unsigned r_level) + { +- return cmp_int(l->btree_id, r->btree_id) ?: +- -cmp_int(l->cached, r->cached) ?: +- bkey_cmp(l->real_pos, r->real_pos) ?: +- -cmp_int(l->level, r->level); ++ return cmp_int(l->btree_id, r_btree_id) ?: ++ cmp_int(l->cached, r_cached) ?: ++ bpos_cmp(l->pos, r_pos) ?: ++ -cmp_int(l->level, r_level); ++} ++ ++static inline int btree_path_cmp(const struct btree_path *l, ++ const struct btree_path *r) ++{ ++ return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); + } + + static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +@@ -61,10 +70,10 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos + return p; + } + +-static inline bool is_btree_node(struct btree_iter *iter, unsigned l) ++static inline bool is_btree_node(struct btree_path *path, unsigned l) + { + return l < BTREE_MAX_DEPTH && +- (unsigned long) iter->l[l].b >= 128; ++ (unsigned long) path->l[l].b >= 128; + } + + static inline struct bpos btree_iter_search_key(struct btree_iter *iter) +@@ -77,42 +86,42 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) + return pos; + } + +-static inline bool btree_iter_pos_before_node(struct btree_iter *iter, ++static inline bool btree_path_pos_before_node(struct btree_path *path, + struct btree *b) + { +- return bpos_cmp(iter->real_pos, b->data->min_key) < 0; ++ return bpos_cmp(path->pos, b->data->min_key) < 0; + } + +-static inline bool btree_iter_pos_after_node(struct btree_iter *iter, ++static inline bool btree_path_pos_after_node(struct btree_path *path, + struct btree *b) + { +- return bpos_cmp(b->key.k.p, iter->real_pos) < 0; ++ return bpos_cmp(b->key.k.p, path->pos) < 0; + } + +-static inline bool btree_iter_pos_in_node(struct btree_iter *iter, ++static inline bool btree_path_pos_in_node(struct btree_path *path, + struct btree *b) + { +- return iter->btree_id == b->c.btree_id && +- !btree_iter_pos_before_node(iter, b) && +- !btree_iter_pos_after_node(iter, b); ++ return path->btree_id == 
b->c.btree_id && ++ !btree_path_pos_before_node(path, b) && ++ !btree_path_pos_after_node(path, b); + } + + /* Btree node locking: */ + + void bch2_btree_node_unlock_write(struct btree_trans *trans, +- struct btree_iter *iter, struct btree *b) ++ struct btree_path *path, struct btree *b) + { +- bch2_btree_node_unlock_write_inlined(trans, iter, b); ++ bch2_btree_node_unlock_write_inlined(trans, path, b); + } + + void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) + { +- struct btree_iter *iter; ++ struct btree_path *linked; + unsigned readers = 0; + +- trans_for_each_iter(trans, iter) +- if (iter->l[b->c.level].b == b && +- btree_node_read_locked(iter, b->c.level)) ++ trans_for_each_path(trans, linked) ++ if (linked->l[b->c.level].b == b && ++ btree_node_read_locked(linked, b->c.level)) + readers++; + + /* +@@ -137,21 +146,21 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) + } + + bool __bch2_btree_node_relock(struct btree_trans *trans, +- struct btree_iter *iter, unsigned level) ++ struct btree_path *path, unsigned level) + { +- struct btree *b = btree_iter_node(iter, level); +- int want = __btree_lock_want(iter, level); ++ struct btree *b = btree_path_node(path, level); ++ int want = __btree_lock_want(path, level); + +- if (!is_btree_node(iter, level)) ++ if (!is_btree_node(path, level)) + return false; + + if (race_fault()) + return false; + +- if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || +- (btree_node_lock_seq_matches(iter, b, level) && ++ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || ++ (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, want))) { +- mark_btree_node_locked(iter, level, want); ++ mark_btree_node_locked(path, level, want); + return true; + } else { + return false; +@@ -159,88 +168,88 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, + } + + static bool bch2_btree_node_upgrade(struct btree_trans *trans, +- struct btree_iter *iter, unsigned level) ++ struct btree_path *path, unsigned level) + { +- struct btree *b = iter->l[level].b; ++ struct btree *b = path->l[level].b; + +- EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); ++ EBUG_ON(btree_lock_want(path, level) != BTREE_NODE_INTENT_LOCKED); + +- if (!is_btree_node(iter, level)) ++ if (!is_btree_node(path, level)) + return false; + +- if (btree_node_intent_locked(iter, level)) ++ if (btree_node_intent_locked(path, level)) + return true; + + if (race_fault()) + return false; + +- if (btree_node_locked(iter, level) ++ if (btree_node_locked(path, level) + ? 
six_lock_tryupgrade(&b->c.lock) +- : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) ++ : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; + +- if (btree_node_lock_seq_matches(iter, b, level) && ++ if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { +- btree_node_unlock(iter, level); ++ btree_node_unlock(path, level); + goto success; + } + + return false; + success: +- mark_btree_node_intent_locked(iter, level); ++ mark_btree_node_intent_locked(path, level); + return true; + } + +-static inline bool btree_iter_get_locks(struct btree_trans *trans, +- struct btree_iter *iter, ++static inline bool btree_path_get_locks(struct btree_trans *trans, ++ struct btree_path *path, + bool upgrade, unsigned long trace_ip) + { +- unsigned l = iter->level; ++ unsigned l = path->level; + int fail_idx = -1; + + do { +- if (!btree_iter_node(iter, l)) ++ if (!btree_path_node(path, l)) + break; + + if (!(upgrade +- ? bch2_btree_node_upgrade(trans, iter, l) +- : bch2_btree_node_relock(trans, iter, l))) { ++ ? bch2_btree_node_upgrade(trans, path, l) ++ : bch2_btree_node_relock(trans, path, l))) { + (upgrade + ? trace_node_upgrade_fail + : trace_node_relock_fail)(trans->ip, trace_ip, +- iter->cached, +- iter->btree_id, &iter->real_pos, +- l, iter->l[l].lock_seq, +- is_btree_node(iter, l) ++ path->cached, ++ path->btree_id, &path->pos, ++ l, path->l[l].lock_seq, ++ is_btree_node(path, l) + ? 0 +- : (unsigned long) iter->l[l].b, +- is_btree_node(iter, l) +- ? iter->l[l].b->c.lock.state.seq ++ : (unsigned long) path->l[l].b, ++ is_btree_node(path, l) ++ ? path->l[l].b->c.lock.state.seq + : 0); + fail_idx = l; +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + } + + l++; +- } while (l < iter->locks_want); ++ } while (l < path->locks_want); + + /* + * When we fail to get a lock, we have to ensure that any child nodes +- * can't be relocked so bch2_btree_iter_traverse has to walk back up to ++ * can't be relocked so bch2_btree_path_traverse has to walk back up to + * the node that we failed to relock: + */ + while (fail_idx >= 0) { +- btree_node_unlock(iter, fail_idx); +- iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ btree_node_unlock(path, fail_idx); ++ path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; + --fail_idx; + } + +- if (iter->uptodate == BTREE_ITER_NEED_RELOCK) +- iter->uptodate = BTREE_ITER_UPTODATE; ++ if (path->uptodate == BTREE_ITER_NEED_RELOCK) ++ path->uptodate = BTREE_ITER_UPTODATE; + + bch2_trans_verify_locks(trans); + +- return iter->uptodate < BTREE_ITER_NEED_RELOCK; ++ return path->uptodate < BTREE_ITER_NEED_RELOCK; + } + + static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, +@@ -253,19 +262,20 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, + + /* Slowpath: */ + bool __bch2_btree_node_lock(struct btree_trans *trans, +- struct btree_iter *iter, +- struct btree *b, struct bpos pos, unsigned level, ++ struct btree_path *path, ++ struct btree *b, ++ struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) + { +- struct btree_iter *linked, *deadlock_iter = NULL; ++ struct btree_path *linked, *deadlock_path = NULL; + u64 start_time = local_clock(); + unsigned reason = 9; + bool ret; + + /* Check if it's safe to block: */ +- trans_for_each_iter(trans, linked) { ++ 
trans_for_each_path(trans, linked) { + if (!linked->nodes_locked) + continue; + +@@ -283,25 +293,25 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { +- deadlock_iter = linked; ++ deadlock_path = linked; + reason = 1; + } + +- if (linked->btree_id != iter->btree_id) { +- if (linked->btree_id > iter->btree_id) { +- deadlock_iter = linked; ++ if (linked->btree_id != path->btree_id) { ++ if (linked->btree_id > path->btree_id) { ++ deadlock_path = linked; + reason = 3; + } + continue; + } + + /* +- * Within the same btree, cached iterators come before non +- * cached iterators: ++ * Within the same btree, cached paths come before non ++ * cached paths: + */ +- if (linked->cached != iter->cached) { +- if (iter->cached) { +- deadlock_iter = linked; ++ if (linked->cached != path->cached) { ++ if (path->cached) { ++ deadlock_path = linked; + reason = 4; + } + continue; +@@ -309,11 +319,11 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + + /* + * Interior nodes must be locked before their descendants: if +- * another iterator has possible descendants locked of the node ++ * another path has possible descendants locked of the node + * we're about to lock, it must have the ancestors locked too: + */ + if (level > __fls(linked->nodes_locked)) { +- deadlock_iter = linked; ++ deadlock_path = linked; + reason = 5; + } + +@@ -321,20 +331,20 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + if (btree_node_locked(linked, level) && + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, + linked->cached)) <= 0) { +- deadlock_iter = linked; ++ deadlock_path = linked; + reason = 7; + BUG_ON(trans->in_traverse_all); + } + } + +- if (unlikely(deadlock_iter)) { ++ if (unlikely(deadlock_path)) { + trace_trans_restart_would_deadlock(trans->ip, ip, + trans->in_traverse_all, reason, +- deadlock_iter->btree_id, +- deadlock_iter->cached, +- &deadlock_iter->real_pos, +- iter->btree_id, +- iter->cached, ++ deadlock_path->btree_id, ++ deadlock_path->cached, ++ &deadlock_path->pos, ++ path->btree_id, ++ path->cached, + &pos); + btree_trans_restart(trans); + return false; +@@ -344,9 +354,9 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + return true; + + #ifdef CONFIG_BCACHEFS_DEBUG +- trans->locking_iter_idx = iter->idx; ++ trans->locking_path_idx = path->idx; + trans->locking_pos = pos; +- trans->locking_btree_id = iter->btree_id; ++ trans->locking_btree_id = path->btree_id; + trans->locking_level = level; + trans->locking = b; + #endif +@@ -365,54 +375,57 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + /* Btree iterator locking: */ + + #ifdef CONFIG_BCACHEFS_DEBUG +-static void bch2_btree_iter_verify_locks(struct btree_iter *iter) ++ ++static void bch2_btree_path_verify_locks(struct btree_path *path) + { + unsigned l; + +- for (l = 0; btree_iter_node(iter, l); l++) { +- if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && +- !btree_node_locked(iter, l)) ++ for (l = 0; btree_path_node(path, l); l++) { ++ if (path->uptodate >= BTREE_ITER_NEED_RELOCK && ++ !btree_node_locked(path, l)) + continue; + +- BUG_ON(btree_lock_want(iter, l) != +- btree_node_locked_type(iter, l)); ++ BUG_ON(btree_lock_want(path, l) != ++ btree_node_locked_type(path, l)); + } + } + + void bch2_trans_verify_locks(struct btree_trans *trans) + { +- struct btree_iter *iter; ++ struct btree_path *path; + +- trans_for_each_iter(trans, iter) +- bch2_btree_iter_verify_locks(iter); ++ trans_for_each_path(trans, 
path) ++ bch2_btree_path_verify_locks(path); + } + #else +-static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} ++static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} + #endif + ++/* Btree path locking: */ ++ + /* + * Only for btree_cache.c - only relocks intent locks + */ +-bool bch2_btree_iter_relock_intent(struct btree_trans *trans, +- struct btree_iter *iter) ++bool bch2_btree_path_relock_intent(struct btree_trans *trans, ++ struct btree_path *path) + { + unsigned l; + +- for (l = iter->level; +- l < iter->locks_want && btree_iter_node(iter, l); ++ for (l = path->level; ++ l < path->locks_want && btree_path_node(path, l); + l++) { +- if (!bch2_btree_node_relock(trans, iter, l)) { ++ if (!bch2_btree_node_relock(trans, path, l)) { + trace_node_relock_fail(trans->ip, _RET_IP_, +- iter->cached, +- iter->btree_id, &iter->real_pos, +- l, iter->l[l].lock_seq, +- is_btree_node(iter, l) ++ path->cached, ++ path->btree_id, &path->pos, ++ l, path->l[l].lock_seq, ++ is_btree_node(path, l) + ? 0 +- : (unsigned long) iter->l[l].b, +- is_btree_node(iter, l) +- ? iter->l[l].b->c.lock.state.seq ++ : (unsigned long) path->l[l].b, ++ is_btree_node(path, l) ++ ? path->l[l].b->c.lock.state.seq + : 0); +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_trans_restart(trans); + return false; + } +@@ -422,27 +435,27 @@ bool bch2_btree_iter_relock_intent(struct btree_trans *trans, + } + + __flatten +-static bool bch2_btree_iter_relock(struct btree_trans *trans, +- struct btree_iter *iter, unsigned long trace_ip) ++static bool bch2_btree_path_relock(struct btree_trans *trans, ++ struct btree_path *path, unsigned long trace_ip) + { +- bool ret = btree_iter_get_locks(trans, iter, false, trace_ip); ++ bool ret = btree_path_get_locks(trans, path, false, trace_ip); + + if (!ret) + btree_trans_restart(trans); + return ret; + } + +-bool __bch2_btree_iter_upgrade(struct btree_trans *trans, +- struct btree_iter *iter, ++bool __bch2_btree_path_upgrade(struct btree_trans *trans, ++ struct btree_path *path, + unsigned new_locks_want) + { +- struct btree_iter *linked; ++ struct btree_path *linked; + +- EBUG_ON(iter->locks_want >= new_locks_want); ++ EBUG_ON(path->locks_want >= new_locks_want); + +- iter->locks_want = new_locks_want; ++ path->locks_want = new_locks_want; + +- if (btree_iter_get_locks(trans, iter, true, _THIS_IP_)) ++ if (btree_path_get_locks(trans, path, true, _THIS_IP_)) + return true; + + /* +@@ -450,7 +463,7 @@ bool __bch2_btree_iter_upgrade(struct btree_trans *trans, + * iterators in the btree_trans here. + * + * On failure to upgrade the iterator, setting iter->locks_want and +- * calling get_locks() is sufficient to make bch2_btree_iter_traverse() ++ * calling get_locks() is sufficient to make bch2_btree_path_traverse() + * get the locks we want on transaction restart. + * + * But if this iterator was a clone, on transaction restart what we did +@@ -462,75 +475,67 @@ bool __bch2_btree_iter_upgrade(struct btree_trans *trans, + * + * The code below used to be needed to ensure ancestor nodes get locked + * before interior nodes - now that's handled by +- * bch2_btree_iter_traverse_all(). ++ * bch2_btree_path_traverse_all(). 
+ */ +- trans_for_each_iter(trans, linked) +- if (linked != iter && +- linked->cached == iter->cached && +- linked->btree_id == iter->btree_id && ++ trans_for_each_path(trans, linked) ++ if (linked != path && ++ linked->cached == path->cached && ++ linked->btree_id == path->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; +- btree_iter_get_locks(trans, linked, true, _THIS_IP_); ++ btree_path_get_locks(trans, linked, true, _THIS_IP_); + } + +- if (iter->should_be_locked) +- btree_trans_restart(trans); + return false; + } + +-void __bch2_btree_iter_downgrade(struct btree_iter *iter, ++void __bch2_btree_path_downgrade(struct btree_path *path, + unsigned new_locks_want) + { + unsigned l; + +- EBUG_ON(iter->locks_want < new_locks_want); ++ EBUG_ON(path->locks_want < new_locks_want); + +- iter->locks_want = new_locks_want; ++ path->locks_want = new_locks_want; + +- while (iter->nodes_locked && +- (l = __fls(iter->nodes_locked)) >= iter->locks_want) { +- if (l > iter->level) { +- btree_node_unlock(iter, l); ++ while (path->nodes_locked && ++ (l = __fls(path->nodes_locked)) >= path->locks_want) { ++ if (l > path->level) { ++ btree_node_unlock(path, l); + } else { +- if (btree_node_intent_locked(iter, l)) { +- six_lock_downgrade(&iter->l[l].b->c.lock); +- iter->nodes_intent_locked ^= 1 << l; ++ if (btree_node_intent_locked(path, l)) { ++ six_lock_downgrade(&path->l[l].b->c.lock); ++ path->nodes_intent_locked ^= 1 << l; + } + break; + } + } + +- bch2_btree_iter_verify_locks(iter); ++ bch2_btree_path_verify_locks(path); + } + + void bch2_trans_downgrade(struct btree_trans *trans) + { +- struct btree_iter *iter; ++ struct btree_path *path; + +- trans_for_each_iter(trans, iter) +- bch2_btree_iter_downgrade(iter); ++ trans_for_each_path(trans, path) ++ bch2_btree_path_downgrade(path); + } + + /* Btree transaction locking: */ + +-static inline bool btree_iter_should_be_locked(struct btree_iter *iter) +-{ +- return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || +- iter->should_be_locked; +-} +- + bool bch2_trans_relock(struct btree_trans *trans) + { +- struct btree_iter *iter; ++ struct btree_path *path; + + if (unlikely(trans->restarted)) + return false; + +- trans_for_each_iter(trans, iter) +- if (btree_iter_should_be_locked(iter) && +- !bch2_btree_iter_relock(trans, iter, _RET_IP_)) { ++ trans_for_each_path(trans, path) ++ if (path->should_be_locked && ++ !bch2_btree_path_relock(trans, path, _RET_IP_)) { + trace_trans_restart_relock(trans->ip, _RET_IP_, +- iter->btree_id, &iter->real_pos); ++ path->btree_id, &path->pos); + BUG_ON(!trans->restarted); + return false; + } +@@ -539,10 +544,10 @@ bool bch2_trans_relock(struct btree_trans *trans) + + void bch2_trans_unlock(struct btree_trans *trans) + { +- struct btree_iter *iter; ++ struct btree_path *path; + +- trans_for_each_iter(trans, iter) +- __bch2_btree_iter_unlock(iter); ++ trans_for_each_path(trans, path) ++ __bch2_btree_path_unlock(path); + + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + } +@@ -551,27 +556,27 @@ void bch2_trans_unlock(struct btree_trans *trans) + + #ifdef CONFIG_BCACHEFS_DEBUG + +-static void bch2_btree_iter_verify_cached(struct btree_trans *trans, +- struct btree_iter *iter) ++static void bch2_btree_path_verify_cached(struct btree_trans *trans, ++ struct btree_path *path) + { + struct bkey_cached *ck; +- bool locked = btree_node_locked(iter, 0); ++ bool locked = btree_node_locked(path, 0); + +- if (!bch2_btree_node_relock(trans, iter, 0)) ++ if (!bch2_btree_node_relock(trans, 
path, 0)) + return; + +- ck = (void *) iter->l[0].b; +- BUG_ON(ck->key.btree_id != iter->btree_id || +- bkey_cmp(ck->key.pos, iter->pos)); ++ ck = (void *) path->l[0].b; ++ BUG_ON(ck->key.btree_id != path->btree_id || ++ bkey_cmp(ck->key.pos, path->pos)); + + if (!locked) +- btree_node_unlock(iter, 0); ++ btree_node_unlock(path, 0); + } + +-static void bch2_btree_iter_verify_level(struct btree_trans *trans, +- struct btree_iter *iter, unsigned level) ++static void bch2_btree_path_verify_level(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) + { +- struct btree_iter_level *l; ++ struct btree_path_level *l; + struct btree_node_iter tmp; + bool locked; + struct bkey_packed *p, *k; +@@ -581,25 +586,23 @@ static void bch2_btree_iter_verify_level(struct btree_trans *trans, + if (!bch2_debug_check_iterators) + return; + +- l = &iter->l[level]; ++ l = &path->l[level]; + tmp = l->iter; +- locked = btree_node_locked(iter, level); ++ locked = btree_node_locked(path, level); + +- if (iter->cached) { ++ if (path->cached) { + if (!level) +- bch2_btree_iter_verify_cached(trans, iter); ++ bch2_btree_path_verify_cached(trans, path); + return; + } + +- BUG_ON(iter->level < iter->min_depth); +- +- if (!btree_iter_node(iter, level)) ++ if (!btree_path_node(path, level)) + return; + +- if (!bch2_btree_node_relock(trans, iter, level)) ++ if (!bch2_btree_node_relock(trans, path, level)) + return; + +- BUG_ON(!btree_iter_pos_in_node(iter, l->b)); ++ BUG_ON(!btree_path_pos_in_node(path, l->b)); + + bch2_btree_node_iter_verify(&l->iter, l->b); + +@@ -610,29 +613,29 @@ static void bch2_btree_iter_verify_level(struct btree_trans *trans, + * For extents, the iterator may have skipped past deleted keys (but not + * whiteouts) + */ +- p = level || btree_node_type_is_extents(iter->btree_id) ++ p = level || btree_node_type_is_extents(path->btree_id) + ? bch2_btree_node_iter_prev(&tmp, l->b) + : bch2_btree_node_iter_prev_all(&tmp, l->b); + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + +- if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) { ++ if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { + msg = "before"; + goto err; + } + +- if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { ++ if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { + msg = "after"; + goto err; + } + + if (!locked) +- btree_node_unlock(iter, level); ++ btree_node_unlock(path, level); + return; + err: + strcpy(buf2, "(none)"); + strcpy(buf3, "(none)"); + +- bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); ++ bch2_bpos_to_text(&PBUF(buf1), path->pos); + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); +@@ -644,20 +647,51 @@ err: + bch2_bkey_to_text(&PBUF(buf3), &uk); + } + +- panic("iterator should be %s key at level %u:\n" +- "iter pos %s\n" ++ panic("path should be %s key at level %u:\n" ++ "path pos %s\n" + "prev key %s\n" + "cur key %s\n", + msg, level, buf1, buf2, buf3); + } + +-static void bch2_btree_iter_verify(struct btree_iter *iter) ++static void bch2_btree_path_verify(struct btree_trans *trans, ++ struct btree_path *path) + { +- struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + unsigned i; + +- EBUG_ON(iter->btree_id >= BTREE_ID_NR); ++ EBUG_ON(path->btree_id >= BTREE_ID_NR); ++ ++ for (i = 0; i < (!path->cached ? 
BTREE_MAX_DEPTH : 1); i++) { ++ if (!path->l[i].b) { ++ BUG_ON(c->btree_roots[path->btree_id].b->c.level > i); ++ break; ++ } ++ ++ bch2_btree_path_verify_level(trans, path, i); ++ } ++ ++ bch2_btree_path_verify_locks(path); ++} ++ ++void bch2_trans_verify_paths(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ if (!bch2_debug_check_iterators) ++ return; ++ ++ trans_for_each_path(trans, path) ++ bch2_btree_path_verify(trans, path); ++} ++ ++static void bch2_btree_iter_verify(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ ++ BUG_ON(iter->btree_id >= BTREE_ID_NR); ++ ++ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); + + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); +@@ -669,16 +703,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + +- for (i = 0; i < (!iter->cached ? BTREE_MAX_DEPTH : 1); i++) { +- if (!iter->l[i].b) { +- BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i); +- break; +- } +- +- bch2_btree_iter_verify_level(trans, iter, i); +- } +- +- bch2_btree_iter_verify_locks(iter); ++ bch2_btree_path_verify(trans, iter->path); + } + + static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +@@ -690,26 +715,19 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) + bkey_cmp(iter->pos, iter->k.p) > 0); + } + +-void bch2_trans_verify_iters(struct btree_trans *trans, struct btree *b) +-{ +- struct btree_iter *iter; +- +- if (!bch2_debug_check_iterators) +- return; +- +- trans_for_each_iter_with_node(trans, b, iter) +- bch2_btree_iter_verify_level(trans, iter, b->c.level); +-} +- + #else + +-static inline void bch2_btree_iter_verify_level(struct btree_trans *trans, +- struct btree_iter *iter, unsigned l) {} ++static inline void bch2_btree_path_verify_level(struct btree_trans *trans, ++ struct btree_path *path, unsigned l) {} ++static inline void bch2_btree_path_verify(struct btree_trans *trans, ++ struct btree_path *path) {} + static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} + static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} + + #endif + ++/* Btree path: fixups after btree updates */ ++ + static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, + struct btree *b, + struct bset_tree *t, +@@ -727,38 +745,38 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, + bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); + } + +-static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, ++static void __bch2_btree_path_fix_key_modified(struct btree_path *path, + struct btree *b, + struct bkey_packed *where) + { +- struct btree_iter_level *l = &iter->l[b->c.level]; ++ struct btree_path_level *l = &path->l[b->c.level]; + + if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) + return; + +- if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0) ++ if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) + bch2_btree_node_iter_advance(&l->iter, l->b); + } + +-void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, ++void bch2_btree_path_fix_key_modified(struct btree_trans *trans, + struct btree *b, + struct bkey_packed *where) + { +- struct btree_iter *iter; ++ struct btree_path *path; + +- trans_for_each_iter_with_node(trans, b, iter) { +- __bch2_btree_iter_fix_key_modified(iter, b, where); +- bch2_btree_iter_verify_level(trans, iter, 
b->c.level); ++ trans_for_each_path_with_node(trans, b, path) { ++ __bch2_btree_path_fix_key_modified(path, b, where); ++ bch2_btree_path_verify_level(trans, path, b->c.level); + } + } + +-static void __bch2_btree_node_iter_fix(struct btree_iter *iter, +- struct btree *b, +- struct btree_node_iter *node_iter, +- struct bset_tree *t, +- struct bkey_packed *where, +- unsigned clobber_u64s, +- unsigned new_u64s) ++static void __bch2_btree_node_iter_fix(struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bset_tree *t, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) + { + const struct bkey_packed *end = btree_bkey_last(b, t); + struct btree_node_iter_set *set; +@@ -776,7 +794,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, + + /* didn't find the bset in the iterator - might have to readd it: */ + if (new_u64s && +- bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { ++ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { + bch2_btree_node_iter_push(node_iter, b, where, end); + goto fixup_done; + } else { +@@ -791,7 +809,7 @@ found: + return; + + if (new_u64s && +- bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { ++ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { + set->k = offset; + } else if (set->k < offset + clobber_u64s) { + set->k = offset + new_u64s; +@@ -818,7 +836,7 @@ fixup_done: + if (!bch2_btree_node_iter_end(node_iter) && + iter_current_key_modified && + (b->c.level || +- btree_node_type_is_extents(iter->btree_id))) { ++ btree_node_type_is_extents(path->btree_id))) { + struct bset_tree *t; + struct bkey_packed *k, *k2, *p; + +@@ -846,7 +864,7 @@ fixup_done: + } + + void bch2_btree_node_iter_fix(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_packed *where, +@@ -854,26 +872,28 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, + unsigned new_u64s) + { + struct bset_tree *t = bch2_bkey_to_bset(b, where); +- struct btree_iter *linked; ++ struct btree_path *linked; + +- if (node_iter != &iter->l[b->c.level].iter) { +- __bch2_btree_node_iter_fix(iter, b, node_iter, t, ++ if (node_iter != &path->l[b->c.level].iter) { ++ __bch2_btree_node_iter_fix(path, b, node_iter, t, + where, clobber_u64s, new_u64s); + + if (bch2_debug_check_iterators) + bch2_btree_node_iter_verify(node_iter, b); + } + +- trans_for_each_iter_with_node(trans, b, linked) { ++ trans_for_each_path_with_node(trans, b, linked) { + __bch2_btree_node_iter_fix(linked, b, + &linked->l[b->c.level].iter, t, + where, clobber_u64s, new_u64s); +- bch2_btree_iter_verify_level(trans, linked, b->c.level); ++ bch2_btree_path_verify_level(trans, linked, b->c.level); + } + } + +-static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, +- struct btree_iter_level *l, ++/* Btree path level: pointer to a particular btree node and node iter */ ++ ++static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, ++ struct btree_path_level *l, + struct bkey *u, + struct bkey_packed *k) + { +@@ -898,48 +918,52 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, + * assertion here: + */ + if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) +- bch2_bkey_debugcheck(iter->trans->c, l->b, ret); ++ bch2_bkey_debugcheck(c, l->b, ret); + + return ret; + } + +-/* peek_all() doesn't skip deleted keys */ +-static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, +- struct 
btree_iter_level *l) ++static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, ++ struct btree_path_level *l, ++ struct bkey *u) + { +- return __btree_iter_unpack(iter, l, &iter->k, ++ return __btree_iter_unpack(c, l, u, + bch2_btree_node_iter_peek_all(&l->iter, l->b)); + } + +-static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter, +- struct btree_iter_level *l) ++static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c, ++ struct btree_path *path, ++ struct btree_path_level *l, ++ struct bkey *u) + { +- struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, ++ struct bkey_s_c k = __btree_iter_unpack(c, l, u, + bch2_btree_node_iter_peek(&l->iter, l->b)); + +- iter->real_pos = k.k ? k.k->p : l->b->key.k.p; ++ path->pos = k.k ? k.k->p : l->b->key.k.p; + return k; + } + +-static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter, +- struct btree_iter_level *l) ++static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c, ++ struct btree_path *path, ++ struct btree_path_level *l, ++ struct bkey *u) + { +- struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, ++ struct bkey_s_c k = __btree_iter_unpack(c, l, u, + bch2_btree_node_iter_prev(&l->iter, l->b)); + +- iter->real_pos = k.k ? k.k->p : l->b->data->min_key; ++ path->pos = k.k ? k.k->p : l->b->data->min_key; + return k; + } + +-static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, +- struct btree_iter_level *l, ++static inline bool btree_path_advance_to_pos(struct btree_path *path, ++ struct btree_path_level *l, + int max_advance) + { + struct bkey_packed *k; + int nr_advanced = 0; + + while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && +- bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { ++ bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { + if (max_advance > 0 && nr_advanced >= max_advance) + return false; + +@@ -953,10 +977,10 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, + /* + * Verify that iterator for parent node points to child node: + */ +-static void btree_iter_verify_new_node(struct btree_trans *trans, +- struct btree_iter *iter, struct btree *b) ++static void btree_path_verify_new_node(struct btree_trans *trans, ++ struct btree_path *path, struct btree *b) + { +- struct btree_iter_level *l; ++ struct btree_path_level *l; + unsigned plevel; + bool parent_locked; + struct bkey_packed *k; +@@ -965,15 +989,15 @@ static void btree_iter_verify_new_node(struct btree_trans *trans, + return; + + plevel = b->c.level + 1; +- if (!btree_iter_node(iter, plevel)) ++ if (!btree_path_node(path, plevel)) + return; + +- parent_locked = btree_node_locked(iter, plevel); ++ parent_locked = btree_node_locked(path, plevel); + +- if (!bch2_btree_node_relock(trans, iter, plevel)) ++ if (!bch2_btree_node_relock(trans, path, plevel)) + return; + +- l = &iter->l[plevel]; ++ l = &path->l[plevel]; + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + if (!k || + bkey_deleted(k) || +@@ -985,7 +1009,7 @@ static void btree_iter_verify_new_node(struct btree_trans *trans, + struct bkey uk = bkey_unpack_key(b, k); + + bch2_dump_btree_node(trans->c, l->b); +- bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); ++ bch2_bpos_to_text(&PBUF(buf1), path->pos); + bch2_bkey_to_text(&PBUF(buf2), &uk); + bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); + bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); +@@ -993,20 +1017,20 @@ static void btree_iter_verify_new_node(struct btree_trans *trans, + "iter pos %s %s\n" + "iter key %s\n" + "new 
node %s-%s\n", +- bch2_btree_ids[iter->btree_id], buf1, ++ bch2_btree_ids[path->btree_id], buf1, + buf2, buf3, buf4); + } + + if (!parent_locked) +- btree_node_unlock(iter, b->c.level + 1); ++ btree_node_unlock(path, b->c.level + 1); + } + +-static inline void __btree_iter_level_init(struct btree_iter *iter, ++static inline void __btree_path_level_init(struct btree_path *path, + unsigned level) + { +- struct btree_iter_level *l = &iter->l[level]; ++ struct btree_path_level *l = &path->l[level]; + +- bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos); ++ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); + + /* + * Iterators to interior nodes should always be pointed at the first non +@@ -1016,22 +1040,24 @@ static inline void __btree_iter_level_init(struct btree_iter *iter, + bch2_btree_node_iter_peek(&l->iter, l->b); + } + +-static inline void btree_iter_level_init(struct btree_trans *trans, +- struct btree_iter *iter, ++static inline void btree_path_level_init(struct btree_trans *trans, ++ struct btree_path *path, + struct btree *b) + { +- BUG_ON(iter->cached); ++ BUG_ON(path->cached); + +- btree_iter_verify_new_node(trans, iter, b); ++ btree_path_verify_new_node(trans, path, b); + +- EBUG_ON(!btree_iter_pos_in_node(iter, b)); ++ EBUG_ON(!btree_path_pos_in_node(path, b)); + EBUG_ON(b->c.lock.state.seq & 1); + +- iter->l[b->c.level].lock_seq = b->c.lock.state.seq; +- iter->l[b->c.level].b = b; +- __btree_iter_level_init(iter, b->c.level); ++ path->l[b->c.level].lock_seq = b->c.lock.state.seq; ++ path->l[b->c.level].b = b; ++ __btree_path_level_init(path, b->c.level); + } + ++/* Btree path: fixups after btree node updates: */ ++ + /* + * A btree node is being replaced - update the iterator to point to the new + * node: +@@ -1039,37 +1065,37 @@ static inline void btree_iter_level_init(struct btree_trans *trans, + void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) + { + enum btree_node_locked_type t; +- struct btree_iter *iter; ++ struct btree_path *path; + +- trans_for_each_iter(trans, iter) +- if (!iter->cached && +- btree_iter_pos_in_node(iter, b)) { ++ trans_for_each_path(trans, path) ++ if (!path->cached && ++ btree_path_pos_in_node(path, b)) { + /* +- * bch2_trans_node_drop() has already been called - ++ * bch2_btree_path_node_drop() has already been called - + * the old node we're replacing has already been + * unlocked and the pointer invalidated + */ +- BUG_ON(btree_node_locked(iter, b->c.level)); ++ BUG_ON(btree_node_locked(path, b->c.level)); + +- t = btree_lock_want(iter, b->c.level); ++ t = btree_lock_want(path, b->c.level); + if (t != BTREE_NODE_UNLOCKED) { + six_lock_increment(&b->c.lock, t); +- mark_btree_node_locked(iter, b->c.level, t); ++ mark_btree_node_locked(path, b->c.level, t); + } + +- btree_iter_level_init(trans, iter, b); ++ btree_path_level_init(trans, path, b); + } + } + + void bch2_trans_node_drop(struct btree_trans *trans, struct btree *b) + { +- struct btree_iter *iter; ++ struct btree_path *path; + unsigned level = b->c.level; + +- trans_for_each_iter(trans, iter) +- if (iter->l[level].b == b) { +- btree_node_unlock(iter, level); +- iter->l[level].b = BTREE_ITER_NO_NODE_DROP; ++ trans_for_each_path(trans, path) ++ if (path->l[level].b == b) { ++ btree_node_unlock(path, level); ++ path->l[level].b = BTREE_ITER_NO_NODE_DROP; + } + } + +@@ -1079,12 +1105,14 @@ void bch2_trans_node_drop(struct btree_trans *trans, struct btree *b) + */ + void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) + { +- struct 
btree_iter *iter; ++ struct btree_path *path; + +- trans_for_each_iter_with_node(trans, b, iter) +- __btree_iter_level_init(iter, b->c.level); ++ trans_for_each_path_with_node(trans, b, path) ++ __btree_path_level_init(path, b->c.level); + } + ++/* Btree path: traverse, set_pos: */ ++ + static int lock_root_check_fn(struct six_lock *lock, void *p) + { + struct btree *b = container_of(lock, struct btree, c.lock); +@@ -1093,38 +1121,38 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) + return b == *rootp ? 0 : -1; + } + +-static inline int btree_iter_lock_root(struct btree_trans *trans, +- struct btree_iter *iter, ++static inline int btree_path_lock_root(struct btree_trans *trans, ++ struct btree_path *path, + unsigned depth_want, + unsigned long trace_ip) + { + struct bch_fs *c = trans->c; +- struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; ++ struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; + enum six_lock_type lock_type; + unsigned i; + +- EBUG_ON(iter->nodes_locked); ++ EBUG_ON(path->nodes_locked); + + while (1) { + b = READ_ONCE(*rootp); +- iter->level = READ_ONCE(b->c.level); ++ path->level = READ_ONCE(b->c.level); + +- if (unlikely(iter->level < depth_want)) { ++ if (unlikely(path->level < depth_want)) { + /* + * the root is at a lower depth than the depth we want: + * got to the end of the btree, or we're walking nodes + * greater than some depth and there are no nodes >= + * that depth + */ +- iter->level = depth_want; +- for (i = iter->level; i < BTREE_MAX_DEPTH; i++) +- iter->l[i].b = NULL; ++ path->level = depth_want; ++ for (i = path->level; i < BTREE_MAX_DEPTH; i++) ++ path->l[i].b = NULL; + return 1; + } + +- lock_type = __btree_lock_want(iter, iter->level); +- if (unlikely(!btree_node_lock(trans, iter, b, SPOS_MAX, +- iter->level, lock_type, ++ lock_type = __btree_lock_want(path, path->level); ++ if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX, ++ path->level, lock_type, + lock_root_check_fn, rootp, + trace_ip))) { + if (trans->restarted) +@@ -1133,16 +1161,16 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, + } + + if (likely(b == READ_ONCE(*rootp) && +- b->c.level == iter->level && ++ b->c.level == path->level && + !race_fault())) { +- for (i = 0; i < iter->level; i++) +- iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; +- iter->l[iter->level].b = b; +- for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) +- iter->l[i].b = NULL; +- +- mark_btree_node_locked(iter, iter->level, lock_type); +- btree_iter_level_init(trans, iter, b); ++ for (i = 0; i < path->level; i++) ++ path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; ++ path->l[path->level].b = b; ++ for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) ++ path->l[i].b = NULL; ++ ++ mark_btree_node_locked(path, path->level, lock_type); ++ btree_path_level_init(trans, path, b); + return 0; + } + +@@ -1151,23 +1179,23 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, + } + + noinline +-static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *iter) ++static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) + { + struct bch_fs *c = trans->c; +- struct btree_iter_level *l = &iter->l[iter->level]; ++ struct btree_path_level *l = path_l(path); + struct btree_node_iter node_iter = l->iter; + struct bkey_packed *k; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) +- ? (iter->level > 1 ? 0 : 2) +- : (iter->level > 1 ? 
1 : 16); +- bool was_locked = btree_node_locked(iter, iter->level); ++ ? (path->level > 1 ? 0 : 2) ++ : (path->level > 1 ? 1 : 16); ++ bool was_locked = btree_node_locked(path, path->level); + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + while (nr && !ret) { +- if (!bch2_btree_node_relock(trans, iter, iter->level)) ++ if (!bch2_btree_node_relock(trans, path, path->level)) + break; + + bch2_btree_node_iter_advance(&node_iter, l->b); +@@ -1176,27 +1204,27 @@ static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *ite + break; + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); +- ret = bch2_btree_node_prefetch(c, trans, iter, tmp.k, +- iter->btree_id, iter->level - 1); ++ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, ++ path->level - 1); + } + + if (!was_locked) +- btree_node_unlock(iter, iter->level); ++ btree_node_unlock(path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; + } + + static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + unsigned plevel, struct btree *b) + { +- struct btree_iter_level *l = &iter->l[plevel]; +- bool locked = btree_node_locked(iter, plevel); ++ struct btree_path_level *l = &path->l[plevel]; ++ bool locked = btree_node_locked(path, plevel); + struct bkey_packed *k; + struct bch_btree_ptr_v2 *bp; + +- if (!bch2_btree_node_relock(trans, iter, plevel)) ++ if (!bch2_btree_node_relock(trans, path, plevel)) + return; + + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); +@@ -1206,60 +1234,61 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + bp->mem_ptr = (unsigned long)b; + + if (!locked) +- btree_node_unlock(iter, plevel); ++ btree_node_unlock(path, plevel); + } + +-static __always_inline int btree_iter_down(struct btree_trans *trans, +- struct btree_iter *iter, ++static __always_inline int btree_path_down(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, + unsigned long trace_ip) + { + struct bch_fs *c = trans->c; +- struct btree_iter_level *l = &iter->l[iter->level]; ++ struct btree_path_level *l = path_l(path); + struct btree *b; +- unsigned level = iter->level - 1; +- enum six_lock_type lock_type = __btree_lock_want(iter, level); ++ unsigned level = path->level - 1; ++ enum six_lock_type lock_type = __btree_lock_want(path, level); + struct bkey_buf tmp; + int ret; + +- EBUG_ON(!btree_node_locked(iter, iter->level)); ++ EBUG_ON(!btree_node_locked(path, path->level)); + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_unpack(&tmp, c, l->b, + bch2_btree_node_iter_peek(&l->iter, l->b)); + +- b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip); ++ b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); + if (unlikely(ret)) + goto err; + +- mark_btree_node_locked(iter, level, lock_type); +- btree_iter_level_init(trans, iter, b); ++ mark_btree_node_locked(path, level, lock_type); ++ btree_path_level_init(trans, path, b); + + if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && + unlikely(b != btree_node_mem_ptr(tmp.k))) +- btree_node_mem_ptr_set(trans, iter, level + 1, b); ++ btree_node_mem_ptr_set(trans, path, level + 1, b); + +- if (iter->flags & BTREE_ITER_PREFETCH) +- ret = btree_iter_prefetch(trans, iter); ++ if (flags & BTREE_ITER_PREFETCH) ++ ret = btree_path_prefetch(trans, path); + +- if (btree_node_read_locked(iter, level + 1)) +- btree_node_unlock(iter, level + 1); +- iter->level = level; ++ if (btree_node_read_locked(path, level + 
1)) ++ btree_node_unlock(path, level + 1); ++ path->level = level; + +- bch2_btree_iter_verify_locks(iter); ++ bch2_btree_path_verify_locks(path); + err: + bch2_bkey_buf_exit(&tmp, c); + return ret; + } + +-static int btree_iter_traverse_one(struct btree_trans *, +- struct btree_iter *, unsigned long); ++static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, ++ unsigned, unsigned long); + +-static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, ++static int __btree_path_traverse_all(struct btree_trans *trans, int ret, + unsigned long trace_ip) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter; ++ struct btree_path *path; + int i; + + if (trans->in_traverse_all) +@@ -1269,20 +1298,20 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, + retry_all: + trans->restarted = false; + +- trans_for_each_iter(trans, iter) +- iter->should_be_locked = false; ++ trans_for_each_path(trans, path) ++ path->should_be_locked = false; + + btree_trans_verify_sorted(trans); + + for (i = trans->nr_sorted - 2; i >= 0; --i) { +- struct btree_iter *iter1 = trans->iters + trans->sorted[i]; +- struct btree_iter *iter2 = trans->iters + trans->sorted[i + 1]; +- +- if (iter1->btree_id == iter2->btree_id && +- iter1->locks_want < iter2->locks_want) +- __bch2_btree_iter_upgrade(trans, iter1, iter2->locks_want); +- else if (!iter1->locks_want && iter2->locks_want) +- __bch2_btree_iter_upgrade(trans, iter1, 1); ++ struct btree_path *path1 = trans->paths + trans->sorted[i]; ++ struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; ++ ++ if (path1->btree_id == path2->btree_id && ++ path1->locks_want < path2->locks_want) ++ __bch2_btree_path_upgrade(trans, path1, path2->locks_want); ++ else if (!path1->locks_want && path2->locks_want) ++ __bch2_btree_path_upgrade(trans, path1, 1); + } + + bch2_trans_unlock(trans); +@@ -1307,23 +1336,23 @@ retry_all: + BUG_ON(ret && ret != -EINTR); + + /* Now, redo traversals in correct order: */ +- trans_for_each_iter_inorder(trans, iter) { +- EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); ++ trans_for_each_path_inorder(trans, path) { ++ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + +- ret = btree_iter_traverse_one(trans, iter, _THIS_IP_); ++ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + if (ret) + goto retry_all; + +- EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); ++ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + } + + /* + * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() +- * and relock(), relock() won't relock since iter->should_be_locked ++ * and relock(), relock() won't relock since path->should_be_locked + * isn't set yet, which is all fine + */ +- trans_for_each_iter(trans, iter) +- BUG_ON(iter->uptodate >= BTREE_ITER_NEED_TRAVERSE); ++ trans_for_each_path(trans, path) ++ BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); + out: + bch2_btree_cache_cannibalize_unlock(c); + +@@ -1333,36 +1362,36 @@ out: + return ret; + } + +-static int bch2_btree_iter_traverse_all(struct btree_trans *trans) ++static int bch2_btree_path_traverse_all(struct btree_trans *trans) + { +- return __btree_iter_traverse_all(trans, 0, _RET_IP_); ++ return __btree_path_traverse_all(trans, 0, _RET_IP_); + } + +-static inline bool btree_iter_good_node(struct btree_trans *trans, +- struct btree_iter *iter, ++static inline bool btree_path_good_node(struct btree_trans *trans, ++ struct btree_path *path, + unsigned l, int check_pos) + { +- if 
(!is_btree_node(iter, l) || +- !bch2_btree_node_relock(trans, iter, l)) ++ if (!is_btree_node(path, l) || ++ !bch2_btree_node_relock(trans, path, l)) + return false; + +- if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) ++ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) + return false; +- if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) ++ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) + return false; + return true; + } + +-static inline unsigned btree_iter_up_until_good_node(struct btree_trans *trans, +- struct btree_iter *iter, ++static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, ++ struct btree_path *path, + int check_pos) + { +- unsigned l = iter->level; ++ unsigned l = path->level; + +- while (btree_iter_node(iter, l) && +- !btree_iter_good_node(trans, iter, l, check_pos)) { +- btree_node_unlock(iter, l); +- iter->l[l].b = BTREE_ITER_NO_NODE_UP; ++ while (btree_path_node(path, l) && ++ !btree_path_good_node(trans, path, l, check_pos)) { ++ btree_node_unlock(path, l); ++ path->l[l].b = BTREE_ITER_NO_NODE_UP; + l++; + } + +@@ -1378,53 +1407,54 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_trans *trans, + * On error, caller (peek_node()/peek_key()) must return NULL; the error is + * stashed in the iterator and returned from bch2_trans_exit(). + */ +-static int btree_iter_traverse_one(struct btree_trans *trans, +- struct btree_iter *iter, ++static int btree_path_traverse_one(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, + unsigned long trace_ip) + { +- unsigned l, depth_want = iter->level; ++ unsigned l, depth_want = path->level; + int ret = 0; + + /* +- * Ensure we obey iter->should_be_locked: if it's set, we can't unlock +- * and re-traverse the iterator without a transaction restart: ++ * Ensure we obey path->should_be_locked: if it's set, we can't unlock ++ * and re-traverse the path without a transaction restart: + */ +- if (iter->should_be_locked) { +- ret = bch2_btree_iter_relock(trans, iter, trace_ip) ? 0 : -EINTR; ++ if (path->should_be_locked) { ++ ret = bch2_btree_path_relock(trans, path, trace_ip) ? 
0 : -EINTR; + goto out; + } + +- if (iter->cached) { +- ret = bch2_btree_iter_traverse_cached(trans, iter); ++ if (path->cached) { ++ ret = bch2_btree_path_traverse_cached(trans, path, flags); + goto out; + } + +- if (unlikely(iter->level >= BTREE_MAX_DEPTH)) ++ if (unlikely(path->level >= BTREE_MAX_DEPTH)) + goto out; + +- iter->level = btree_iter_up_until_good_node(trans, iter, 0); ++ path->level = btree_path_up_until_good_node(trans, path, 0); + + /* If we need intent locks, take them too: */ +- for (l = iter->level + 1; +- l < iter->locks_want && btree_iter_node(iter, l); ++ for (l = path->level + 1; ++ l < path->locks_want && btree_path_node(path, l); + l++) +- if (!bch2_btree_node_relock(trans, iter, l)) +- while (iter->level <= l) { +- btree_node_unlock(iter, iter->level); +- iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; +- iter->level++; ++ if (!bch2_btree_node_relock(trans, path, l)) ++ while (path->level <= l) { ++ btree_node_unlock(path, path->level); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_UP; ++ path->level++; + } + + /* +- * Note: iter->nodes[iter->level] may be temporarily NULL here - that ++ * Note: path->nodes[path->level] may be temporarily NULL here - that + * would indicate to other code that we got to the end of the btree, + * here it indicates that relocking the root failed - it's critical that +- * btree_iter_lock_root() comes next and that it can't fail ++ * btree_path_lock_root() comes next and that it can't fail + */ +- while (iter->level > depth_want) { +- ret = btree_iter_node(iter, iter->level) +- ? btree_iter_down(trans, iter, trace_ip) +- : btree_iter_lock_root(trans, iter, depth_want, trace_ip); ++ while (path->level > depth_want) { ++ ret = btree_path_node(path, path->level) ++ ? btree_path_down(trans, path, flags, trace_ip) ++ : btree_path_lock_root(trans, path, depth_want, trace_ip); + if (unlikely(ret)) { + if (ret == 1) { + /* +@@ -1435,74 +1465,397 @@ static int btree_iter_traverse_one(struct btree_trans *trans, + goto out; + } + +- __bch2_btree_iter_unlock(iter); +- iter->level = depth_want; ++ __bch2_btree_path_unlock(path); ++ path->level = depth_want; + +- if (ret == -EIO) { +- iter->flags |= BTREE_ITER_ERROR; +- iter->l[iter->level].b = ++ if (ret == -EIO) ++ path->l[path->level].b = + BTREE_ITER_NO_NODE_ERROR; +- } else { +- iter->l[iter->level].b = ++ else ++ path->l[path->level].b = + BTREE_ITER_NO_NODE_DOWN; +- } + goto out; + } + } + +- iter->uptodate = BTREE_ITER_UPTODATE; ++ path->uptodate = BTREE_ITER_UPTODATE; + out: + BUG_ON((ret == -EINTR) != !!trans->restarted); + trace_iter_traverse(trans->ip, trace_ip, +- iter->cached, +- iter->btree_id, &iter->real_pos, ret); +- bch2_btree_iter_verify(iter); ++ path->cached, ++ path->btree_id, &path->pos, ret); ++ bch2_btree_path_verify(trans, path); + return ret; + } + +-static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ++static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long); ++ ++int __must_check bch2_btree_path_traverse(struct btree_trans *trans, ++ struct btree_path *path, unsigned flags) + { +- struct btree_trans *trans = iter->trans; + int ret; + ++ if (path->uptodate < BTREE_ITER_NEED_RELOCK) ++ return 0; ++ + ret = bch2_trans_cond_resched(trans) ?: +- btree_iter_traverse_one(trans, iter, _RET_IP_); +- if (unlikely(ret) && hweight64(trans->iters_linked) == 1) { +- ret = __btree_iter_traverse_all(trans, ret, _RET_IP_); ++ btree_path_traverse_one(trans, path, flags, _RET_IP_); ++ if (unlikely(ret) && hweight64(trans->paths_allocated) 
== 1) { ++ ret = __btree_path_traverse_all(trans, ret, _RET_IP_); + BUG_ON(ret == -EINTR); + } + + return ret; + } + +-/* +- * Note: +- * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is +- * for internal btree iterator users +- * +- * bch2_btree_iter_traverse sets iter->real_pos to iter->pos, +- * btree_iter_traverse() does not: +- */ +-static inline int __must_check +-btree_iter_traverse(struct btree_iter *iter) ++static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, ++ struct btree_path *src) ++{ ++ unsigned i; ++ ++ memcpy(&dst->pos, &src->pos, ++ sizeof(struct btree_path) - offsetof(struct btree_path, pos)); ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ if (btree_node_locked(dst, i)) ++ six_lock_increment(&dst->l[i].b->c.lock, ++ __btree_lock_want(dst, i)); ++ ++ btree_path_check_sort(trans, dst, 0); ++} ++ ++inline struct btree_path * __must_check ++bch2_btree_path_make_mut(struct btree_trans *trans, ++ struct btree_path *path, bool intent) ++{ ++ if (path->ref > 1 || path->preserve) { ++ struct btree_path *new = btree_path_alloc(trans, path); ++ ++ btree_path_copy(trans, new, path); ++ __btree_path_get(new, intent); ++ __btree_path_put(path, intent); ++ path = new; ++ path->preserve = false; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ path->ip_allocated = _RET_IP_; ++#endif ++ btree_trans_verify_sorted(trans); ++ } ++ ++ return path; ++} ++ ++static struct btree_path * __must_check ++btree_path_set_pos(struct btree_trans *trans, ++ struct btree_path *path, struct bpos new_pos, ++ bool intent) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos old_pos = path->pos; ++#endif ++ int cmp = bpos_cmp(new_pos, path->pos); ++ unsigned l = path->level; ++ ++ EBUG_ON(trans->restarted); ++ EBUG_ON(!path->ref); ++ ++ if (!cmp) ++ return path; ++ ++ path = bch2_btree_path_make_mut(trans, path, intent); ++ ++ path->pos = new_pos; ++ path->should_be_locked = false; ++ ++ btree_path_check_sort(trans, path, cmp); ++ ++ if (unlikely(path->cached)) { ++ btree_node_unlock(path, 0); ++ path->l[0].b = BTREE_ITER_NO_NODE_CACHED; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ goto out; ++ } ++ ++ l = btree_path_up_until_good_node(trans, path, cmp); ++ ++ if (btree_path_node(path, l)) { ++ /* ++ * We might have to skip over many keys, or just a few: try ++ * advancing the node iterator, and if we have to skip over too ++ * many keys just reinit it (or if we're rewinding, since that ++ * is expensive). 
++ */ ++ if (cmp < 0 || ++ !btree_path_advance_to_pos(path, &path->l[l], 8)) ++ __btree_path_level_init(path, l); ++ ++ /* Don't leave it locked if we're not supposed to: */ ++ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(path, l); ++ } ++ ++ if (l != path->level) ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++out: ++ bch2_btree_path_verify(trans, path); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trace_path_set_pos(trans->ip, _RET_IP_, path->btree_id, ++ &old_pos, &new_pos, l); ++#endif ++ return path; ++} ++ ++/* Btree path: main interface: */ ++ ++static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct btree_path *next; ++ ++ next = prev_btree_path(trans, path); ++ if (next && !btree_path_cmp(next, path)) ++ return next; ++ ++ next = next_btree_path(trans, path); ++ if (next && !btree_path_cmp(next, path)) ++ return next; ++ ++ return NULL; ++} ++ ++static bool have_node_at_pos(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct btree_path *next; ++ ++ next = prev_btree_path(trans, path); ++ if (next && path_l(next)->b == path_l(path)->b) ++ return true; ++ ++ next = next_btree_path(trans, path); ++ if (next && path_l(next)->b == path_l(path)->b) ++ return true; ++ ++ return false; ++} ++ ++static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) ++{ ++ __bch2_btree_path_unlock(path); ++ btree_path_list_remove(trans, path); ++ trans->paths_allocated &= ~(1ULL << path->idx); ++} ++ ++void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) ++{ ++ struct btree_path *dup; ++ ++ EBUG_ON(trans->paths + path->idx != path); ++ EBUG_ON(!path->ref); ++ ++ if (!__btree_path_put(path, intent)) ++ return; ++ ++ /* ++ * Perhaps instead we should check for duplicate paths in traverse_all: ++ */ ++ if (path->preserve && ++ (dup = have_path_at_pos(trans, path))) { ++ dup->preserve = true; ++ path->preserve = false; ++ } ++ ++ if (!path->preserve && ++ have_node_at_pos(trans, path)) ++ __bch2_path_free(trans, path); ++} ++ ++noinline __cold ++void bch2_dump_trans_paths_updates(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ struct btree_insert_entry *i; ++ char buf[300]; ++ ++ btree_trans_verify_sorted(trans); ++ ++ trans_for_each_path_inorder(trans, path) ++ printk(KERN_ERR "path: idx %u ref %u:%u%s btree %s pos %s %pS\n", ++ path->idx, path->ref, path->intent_ref, ++ path->preserve ? 
" preserve" : "", ++ bch2_btree_ids[path->btree_id], ++ (bch2_bpos_to_text(&PBUF(buf), path->pos), buf), ++#ifdef CONFIG_BCACHEFS_DEBUG ++ (void *) path->ip_allocated ++#else ++ NULL ++#endif ++ ); ++ ++ trans_for_each_update(trans, i) ++ printk(KERN_ERR "update: btree %s %s %pS\n", ++ bch2_btree_ids[i->btree_id], ++ (bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)), buf), ++ (void *) i->ip_allocated); ++} ++ ++static struct btree_path *btree_path_alloc(struct btree_trans *trans, ++ struct btree_path *pos) ++{ ++ struct btree_path *path; ++ unsigned idx; ++ ++ if (unlikely(trans->paths_allocated == ++ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { ++ bch2_dump_trans_paths_updates(trans); ++ panic("trans path oveflow\n"); ++ } ++ ++ idx = __ffs64(~trans->paths_allocated); ++ trans->paths_allocated |= 1ULL << idx; ++ ++ path = &trans->paths[idx]; ++ ++ path->idx = idx; ++ path->ref = 0; ++ path->intent_ref = 0; ++ path->nodes_locked = 0; ++ path->nodes_intent_locked = 0; ++ ++ btree_path_list_add(trans, pos, path); ++ return path; ++} ++ ++struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, ++ enum btree_id btree_id, struct bpos pos, ++ unsigned locks_want, unsigned level, ++ bool intent) ++{ ++ struct btree_path *path, *path_pos = NULL; ++ struct bpos pos_min = POS_MIN; ++ int i; ++ ++ BUG_ON(trans->restarted); ++ ++ trans_for_each_path_inorder(trans, path) { ++ if (__btree_path_cmp(path, ++ btree_id, ++ cached, ++ pos, ++ level) > 0) ++ break; ++ ++ path_pos = path; ++ } ++ ++ if (path_pos && ++ path_pos->cached == cached && ++ path_pos->btree_id == btree_id && ++ path_pos->level == level) { ++ __btree_path_get(path_pos, intent); ++ path = btree_path_set_pos(trans, path_pos, pos, intent); ++ path->preserve = true; ++ } else { ++ path = btree_path_alloc(trans, path_pos); ++ path_pos = NULL; ++ ++ __btree_path_get(path, intent); ++ path->pos = pos; ++ path->btree_id = btree_id; ++ path->cached = cached; ++ path->preserve = true; ++ path->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ path->should_be_locked = false; ++ path->level = level; ++ path->locks_want = locks_want; ++ path->nodes_locked = 0; ++ path->nodes_intent_locked = 0; ++ for (i = 0; i < ARRAY_SIZE(path->l); i++) ++ path->l[i].b = BTREE_ITER_NO_NODE_INIT; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ path->ip_allocated = _RET_IP_; ++#endif ++ btree_trans_verify_sorted(trans); ++ } ++ ++ if (path->intent_ref) ++ locks_want = max(locks_want, level + 1); ++ ++ /* ++ * If the path has locks_want greater than requested, we don't downgrade ++ * it here - on transaction restart because btree node split needs to ++ * upgrade locks, we might be putting/getting the iterator again. ++ * Downgrading iterators only happens via bch2_trans_downgrade(), after ++ * a successful transaction commit. ++ */ ++ ++ locks_want = min(locks_want, BTREE_MAX_DEPTH); ++ if (locks_want > path->locks_want) { ++ path->locks_want = locks_want; ++ btree_path_get_locks(trans, path, true, _THIS_IP_); ++ } ++ ++ trace_trans_get_path(_RET_IP_, trans->ip, btree_id, ++ &pos, locks_want, path->uptodate, ++ path_pos ? &path_pos->pos : &pos_min, ++ path_pos ? path_pos->locks_want : U8_MAX, ++ path_pos ? path_pos->uptodate : U8_MAX); ++ ++ return path; ++} ++ ++inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) + { +- return iter->uptodate >= BTREE_ITER_NEED_RELOCK +- ? 
__bch2_btree_iter_traverse(iter) +- : 0; ++ ++ struct bkey_s_c k; ++ ++ BUG_ON(path->uptodate != BTREE_ITER_UPTODATE); ++ ++ if (!path->cached) { ++ struct btree_path_level *l = path_l(path); ++ struct bkey_packed *_k = ++ bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ ++ k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; ++ ++ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); ++ ++ if (!k.k || bpos_cmp(path->pos, k.k->p)) ++ goto hole; ++ } else { ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ ++ EBUG_ON(path->btree_id != ck->key.btree_id || ++ bkey_cmp(path->pos, ck->key.pos)); ++ ++ /* BTREE_ITER_CACHED_NOFILL? */ ++ if (unlikely(!ck->valid)) ++ goto hole; ++ ++ k = bkey_i_to_s_c(ck->k); ++ } ++ ++ return k; ++hole: ++ bkey_init(u); ++ u->p = path->pos; ++ return (struct bkey_s_c) { u, NULL }; + } + ++/* Btree iterators: */ ++ + int __must_check + bch2_btree_iter_traverse(struct btree_iter *iter) + { + int ret; + +- btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); ++ iter->path = btree_path_set_pos(iter->trans, iter->path, ++ btree_iter_search_key(iter), ++ iter->flags & BTREE_ITER_INTENT); + +- ret = btree_iter_traverse(iter); ++ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + if (ret) + return ret; + +- iter->should_be_locked = true; ++ iter->path->should_be_locked = true; + return 0; + } + +@@ -1513,22 +1866,22 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + struct btree *b = NULL; + int ret; + +- EBUG_ON(iter->cached); ++ EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + +- ret = btree_iter_traverse(iter); ++ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + if (ret) + goto out; + +- b = btree_iter_node(iter, iter->level); ++ b = btree_path_node(iter->path, iter->path->level); + if (!b) + goto out; + + BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); + + bkey_init(&iter->k); +- iter->k.p = iter->pos = iter->real_pos = b->key.k.p; +- iter->should_be_locked = true; ++ iter->k.p = iter->pos = b->key.k.p; ++ iter->path->should_be_locked = true; + out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); +@@ -1538,29 +1891,31 @@ out: + + struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; ++ struct btree_path *path = iter->path; + struct btree *b = NULL; + int ret; + +- EBUG_ON(iter->cached); ++ EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + + /* already got to end? */ +- if (!btree_iter_node(iter, iter->level)) ++ if (!btree_path_node(path, path->level)) + goto out; + +- bch2_trans_cond_resched(iter->trans); ++ bch2_trans_cond_resched(trans); + +- btree_node_unlock(iter, iter->level); +- iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; +- iter->level++; ++ btree_node_unlock(path, path->level); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_UP; ++ path->level++; + +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- ret = btree_iter_traverse(iter); ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ ret = bch2_btree_path_traverse(trans, path, iter->flags); + if (ret) + goto out; + + /* got to end? 
*/ +- b = btree_iter_node(iter, iter->level); ++ b = btree_path_node(path, path->level); + if (!b) + goto out; + +@@ -1569,27 +1924,29 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + * Haven't gotten to the end of the parent node: go back down to + * the next child node + */ +- btree_iter_set_search_pos(iter, bpos_successor(iter->pos)); ++ path = iter->path = ++ btree_path_set_pos(trans, path, bpos_successor(iter->pos), ++ iter->flags & BTREE_ITER_INTENT); + + /* Unlock to avoid screwing up our lock invariants: */ +- btree_node_unlock(iter, iter->level); ++ btree_node_unlock(path, path->level); + +- iter->level = iter->min_depth; +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ path->level = iter->min_depth; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_iter_verify(iter); + +- ret = btree_iter_traverse(iter); ++ ret = bch2_btree_path_traverse(trans, path, iter->flags); + if (ret) { + b = NULL; + goto out; + } + +- b = iter->l[iter->level].b; ++ b = path->l[path->level].b; + } + + bkey_init(&iter->k); +- iter->k.p = iter->pos = iter->real_pos = b->key.k.p; +- iter->should_be_locked = true; ++ iter->k.p = iter->pos = b->key.k.p; ++ iter->path->should_be_locked = true; + out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); +@@ -1599,61 +1956,6 @@ out: + + /* Iterate across keys (in leaf nodes only) */ + +-static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) +-{ +- struct btree_trans *trans = iter->trans; +-#ifdef CONFIG_BCACHEFS_DEBUG +- struct bpos old_pos = iter->real_pos; +-#endif +- int cmp = bpos_cmp(new_pos, iter->real_pos); +- unsigned l = iter->level; +- +- EBUG_ON(trans->restarted); +- +- if (!cmp) +- goto out; +- +- iter->real_pos = new_pos; +- iter->should_be_locked = false; +- +- btree_iter_check_sort(trans, iter); +- +- if (unlikely(iter->cached)) { +- btree_node_unlock(iter, 0); +- iter->l[0].b = BTREE_ITER_NO_NODE_CACHED; +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- return; +- } +- +- l = btree_iter_up_until_good_node(trans, iter, cmp); +- +- if (btree_iter_node(iter, l)) { +- /* +- * We might have to skip over many keys, or just a few: try +- * advancing the node iterator, and if we have to skip over too +- * many keys just reinit it (or if we're rewinding, since that +- * is expensive). 
+- */ +- if (cmp < 0 || +- !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) +- __btree_iter_level_init(iter, l); +- +- /* Don't leave it locked if we're not supposed to: */ +- if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) +- btree_node_unlock(iter, l); +- } +-out: +- if (l != iter->level) +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); +- +- bch2_btree_iter_verify(iter); +-#ifdef CONFIG_BCACHEFS_DEBUG +- trace_iter_set_search_pos(trans->ip, _RET_IP_, +- iter->btree_id, +- &old_pos, &new_pos, l); +-#endif +-} +- + inline bool bch2_btree_iter_advance(struct btree_iter *iter) + { + struct bpos pos = iter->k.p; +@@ -1678,25 +1980,6 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + return ret; + } + +-static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, +- struct btree_iter *iter) +-{ +- struct btree_insert_entry *i; +- +- if (!(iter->flags & BTREE_ITER_WITH_UPDATES)) +- return NULL; +- +- trans_for_each_update(trans, i) +- if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: +- bpos_cmp(iter->real_pos, i->k->k.p)) <= 0) { +- if (iter->btree_id == i->iter->btree_id) +- return i->k; +- break; +- } +- +- return NULL; +-} +- + /** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position +@@ -1704,20 +1987,20 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + { + struct btree_trans *trans = iter->trans; +- struct btree_iter_level *l = &iter->l[0]; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_i *next_update; + struct bkey_s_c k; +- int ret; ++ int ret, cmp; + +- EBUG_ON(iter->cached || iter->level); ++ EBUG_ON(iter->path->cached || iter->path->level); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + while (1) { +- btree_iter_set_search_pos(iter, search_key); ++ iter->path = btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT); + +- ret = btree_iter_traverse(iter); ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); +@@ -1725,8 +2008,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + goto out; + } + +- next_update = btree_trans_peek_updates(trans, iter); +- k = btree_iter_level_peek_all(iter, l); ++ next_update = iter->flags & BTREE_ITER_WITH_UPDATES ++ ? btree_trans_peek_updates(trans, iter->btree_id, search_key) ++ : NULL; ++ k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + + /* * In the btree, deleted keys sort before non deleted: */ + if (k.k && bkey_deleted(k.k) && +@@ -1738,7 +2023,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + + if (next_update && + bpos_cmp(next_update->k.p, +- k.k ? k.k->p : l->b->key.k.p) <= 0) { ++ k.k ? 
k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } +@@ -1749,13 +2034,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + + /* Advance to next key: */ + search_key = bkey_successor(iter, k.k->p); +- } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { ++ } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ +- search_key = bpos_successor(l->b->key.k.p); ++ search_key = bpos_successor(iter->path->l[0].b->key.k.p); + } else { + /* End of btree: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); +- iter->real_pos = SPOS_MAX; + k = bkey_s_c_null; + goto out; + } +@@ -1769,9 +2053,15 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + iter->pos = k.k->p; + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); +- iter->real_pos = k.k->p; ++ ++ cmp = bpos_cmp(k.k->p, iter->path->pos); ++ if (cmp) { ++ iter->path->pos = k.k->p; ++ btree_path_check_sort(trans, iter->path, cmp); ++ } + out: +- iter->should_be_locked = true; ++ iter->path->should_be_locked = true; ++ + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + return k; +@@ -1795,20 +2085,21 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + */ + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; + struct bpos search_key = iter->pos; +- struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + int ret; + +- EBUG_ON(iter->cached || iter->level); ++ EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + while (1) { +- btree_iter_set_search_pos(iter, search_key); ++ iter->path = btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT); + +- ret = btree_iter_traverse(iter); ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); +@@ -1816,18 +2107,22 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + goto out; + } + +- k = btree_iter_level_peek(iter, l); ++ k = btree_path_level_peek(trans->c, iter->path, ++ &iter->path->l[0], &iter->k); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) + ? 
bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 + : bkey_cmp(k.k->p, iter->pos) > 0)) +- k = btree_iter_level_prev(iter, l); ++ k = btree_path_level_prev(trans->c, iter->path, ++ &iter->path->l[0], &iter->k); ++ ++ btree_path_check_sort(trans, iter->path, 0); + + if (likely(k.k)) { + break; +- } else if (likely(bpos_cmp(l->b->data->min_key, POS_MIN))) { ++ } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { + /* Advance to previous leaf node: */ +- search_key = bpos_predecessor(l->b->data->min_key); ++ search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); + } else { + /* Start of btree: */ + bch2_btree_iter_set_pos(iter, POS_MIN); +@@ -1842,9 +2137,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + if (bkey_cmp(k.k->p, iter->pos) < 0) + iter->pos = k.k->p; + out: +- iter->should_be_locked = true; ++ iter->path->should_be_locked = true; ++ + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); ++ + return k; + } + +@@ -1867,7 +2164,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + struct bkey_s_c k; + int ret; + +- EBUG_ON(iter->level); ++ EBUG_ON(iter->path->level); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + +@@ -1881,44 +2178,41 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + } + + search_key = btree_iter_search_key(iter); +- btree_iter_set_search_pos(iter, search_key); ++ iter->path = btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT); + +- ret = btree_iter_traverse(iter); ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { + struct bkey_i *next_update; + +- next_update = btree_trans_peek_updates(trans, iter); +- +- if (!iter->cached) { +- k = btree_iter_level_peek_all(iter, &iter->l[0]); +- EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0); +- } else { +- struct bkey_cached *ck = (void *) iter->l[0].b; +- EBUG_ON(iter->btree_id != ck->key.btree_id || +- bkey_cmp(iter->pos, ck->key.pos)); +- BUG_ON(!ck->valid); +- +- k = bkey_i_to_s_c(ck->k); +- } ++ next_update = iter->flags & BTREE_ITER_WITH_UPDATES ++ ? btree_trans_peek_updates(trans, iter->btree_id, search_key) ++ : NULL; + + if (next_update && +- (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { ++ !bpos_cmp(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); ++ } else { ++ k = bch2_btree_path_peek_slot(iter->path, &iter->k); + } + } else { +- if ((iter->flags & BTREE_ITER_INTENT)) { +- struct btree_iter *child = +- btree_iter_child_alloc(trans, iter, _THIS_IP_); ++ struct bpos next; ++ ++ if (iter->flags & BTREE_ITER_INTENT) { ++ struct btree_iter iter2; + +- btree_iter_copy(trans, child, iter); +- k = bch2_btree_iter_peek(child); ++ bch2_trans_copy_iter(&iter2, iter); ++ k = bch2_btree_iter_peek(&iter2); + +- if (k.k && !bkey_err(k)) +- iter->k = child->k; ++ if (k.k && !bkey_err(k)) { ++ iter->k = iter2.k; ++ k.k = &iter->k; ++ } ++ bch2_trans_iter_exit(trans, &iter2); + } else { + struct bpos pos = iter->pos; + +@@ -1928,19 +2222,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + + if (unlikely(bkey_err(k))) + return k; +- } + +- if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { +- if (!k.k || +- ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS) +- ? 
bpos_cmp(iter->pos, k.k->p) +- : bkey_cmp(iter->pos, k.k->p))) { +- bkey_init(&iter->k); +- iter->k.p = iter->pos; +- k = (struct bkey_s_c) { &iter->k, NULL }; +- } +- } else { +- struct bpos next = k.k ? bkey_start_pos(k.k) : POS_MAX; ++ next = k.k ? bkey_start_pos(k.k) : POS_MAX; + + if (bkey_cmp(iter->pos, next) < 0) { + bkey_init(&iter->k); +@@ -1957,9 +2240,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + } + } + ++ iter->path->should_be_locked = true; ++ + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); +- iter->should_be_locked = true; + + return k; + } +@@ -1980,35 +2264,14 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) + return bch2_btree_iter_peek_slot(iter); + } + +-static inline void bch2_btree_iter_init(struct btree_trans *trans, +- struct btree_iter *iter, enum btree_id btree_id) +-{ +- struct bch_fs *c = trans->c; +- unsigned i; +- +- iter->trans = trans; +- iter->uptodate = BTREE_ITER_NEED_TRAVERSE; +- iter->btree_id = btree_id; +- iter->real_pos = POS_MIN; +- iter->level = 0; +- iter->min_depth = 0; +- iter->locks_want = 0; +- iter->nodes_locked = 0; +- iter->nodes_intent_locked = 0; +- for (i = 0; i < ARRAY_SIZE(iter->l); i++) +- iter->l[i].b = BTREE_ITER_NO_NODE_INIT; +- +- prefetch(c->btree_roots[btree_id].b); +-} +- + /* new transactional stuff: */ + +-static inline void btree_iter_verify_sorted_ref(struct btree_trans *trans, +- struct btree_iter *iter) ++static inline void btree_path_verify_sorted_ref(struct btree_trans *trans, ++ struct btree_path *path) + { +- EBUG_ON(iter->sorted_idx >= trans->nr_sorted); +- EBUG_ON(trans->sorted[iter->sorted_idx] != iter->idx); +- EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); ++ EBUG_ON(path->sorted_idx >= trans->nr_sorted); ++ EBUG_ON(trans->sorted[path->sorted_idx] != path->idx); ++ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + } + + static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) +@@ -2017,268 +2280,113 @@ static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) + unsigned i; + + for (i = 0; i < trans->nr_sorted; i++) +- btree_iter_verify_sorted_ref(trans, trans->iters + trans->sorted[i]); ++ btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]); + #endif + } + +-static inline void btree_trans_verify_sorted(struct btree_trans *trans) ++static void btree_trans_verify_sorted(struct btree_trans *trans) + { + #ifdef CONFIG_BCACHEFS_DEBUG +- struct btree_iter *iter, *prev = NULL; ++ struct btree_path *path, *prev = NULL; + +- trans_for_each_iter_inorder(trans, iter) +- BUG_ON(prev && btree_iter_cmp(prev, iter) > 0); ++ trans_for_each_path_inorder(trans, path) { ++ BUG_ON(prev && btree_path_cmp(prev, path) > 0); ++ prev = path; ++ } + #endif + } + +-static inline void btree_iter_swap(struct btree_trans *trans, +- struct btree_iter *l, struct btree_iter *r) ++static inline void btree_path_swap(struct btree_trans *trans, ++ struct btree_path *l, struct btree_path *r) + { + swap(l->sorted_idx, r->sorted_idx); + swap(trans->sorted[l->sorted_idx], + trans->sorted[r->sorted_idx]); + +- btree_iter_verify_sorted_ref(trans, l); +- btree_iter_verify_sorted_ref(trans, r); ++ btree_path_verify_sorted_ref(trans, l); ++ btree_path_verify_sorted_ref(trans, r); + } + +-static void btree_iter_check_sort(struct btree_trans *trans, struct btree_iter *iter) ++static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, ++ int cmp) + { +- struct btree_iter *n; +- +- 
EBUG_ON(iter->sorted_idx == U8_MAX); ++ struct btree_path *n; + +- n = next_btree_iter(trans, iter); +- if (n && btree_iter_cmp(iter, n) > 0) { +- do { +- btree_iter_swap(trans, iter, n); +- n = next_btree_iter(trans, iter); +- } while (n && btree_iter_cmp(iter, n) > 0); ++ if (cmp <= 0) { ++ n = prev_btree_path(trans, path); ++ if (n && btree_path_cmp(n, path) > 0) { ++ do { ++ btree_path_swap(trans, n, path); ++ n = prev_btree_path(trans, path); ++ } while (n && btree_path_cmp(n, path) > 0); + +- return; ++ goto out; ++ } + } + +- n = prev_btree_iter(trans, iter); +- if (n && btree_iter_cmp(n, iter) > 0) { +- do { +- btree_iter_swap(trans, n, iter); +- n = prev_btree_iter(trans, iter); +- } while (n && btree_iter_cmp(n, iter) > 0); ++ if (cmp >= 0) { ++ n = next_btree_path(trans, path); ++ if (n && btree_path_cmp(path, n) > 0) { ++ do { ++ btree_path_swap(trans, path, n); ++ n = next_btree_path(trans, path); ++ } while (n && btree_path_cmp(path, n) > 0); ++ } + } +- ++out: + btree_trans_verify_sorted(trans); + } + +-static inline void btree_iter_list_remove(struct btree_trans *trans, +- struct btree_iter *iter) ++static inline void btree_path_list_remove(struct btree_trans *trans, ++ struct btree_path *path) + { + unsigned i; + +- EBUG_ON(iter->sorted_idx >= trans->nr_sorted); ++ EBUG_ON(path->sorted_idx >= trans->nr_sorted); + +- array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx); ++ array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); + +- for (i = iter->sorted_idx; i < trans->nr_sorted; i++) +- trans->iters[trans->sorted[i]].sorted_idx = i; ++ for (i = path->sorted_idx; i < trans->nr_sorted; i++) ++ trans->paths[trans->sorted[i]].sorted_idx = i; + +- iter->sorted_idx = U8_MAX; ++ path->sorted_idx = U8_MAX; + + btree_trans_verify_sorted_refs(trans); + } + +-static inline void btree_iter_list_add(struct btree_trans *trans, +- struct btree_iter *pos, +- struct btree_iter *iter) ++static inline void btree_path_list_add(struct btree_trans *trans, ++ struct btree_path *pos, ++ struct btree_path *path) + { + unsigned i; + + btree_trans_verify_sorted_refs(trans); + +- iter->sorted_idx = pos ? pos->sorted_idx : trans->nr_sorted; ++ path->sorted_idx = pos ? 
pos->sorted_idx : trans->nr_sorted; + +- array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx); ++ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); + +- for (i = iter->sorted_idx; i < trans->nr_sorted; i++) +- trans->iters[trans->sorted[i]].sorted_idx = i; ++ for (i = path->sorted_idx; i < trans->nr_sorted; i++) ++ trans->paths[trans->sorted[i]].sorted_idx = i; + + btree_trans_verify_sorted_refs(trans); + } + +-static void btree_iter_child_free(struct btree_trans *trans, struct btree_iter *iter) +-{ +- struct btree_iter *child = btree_iter_child(trans, iter); +- +- if (child) { +- bch2_trans_iter_free(trans, child); +- iter->child_idx = U8_MAX; +- } +-} +- +-static struct btree_iter *btree_iter_child_alloc(struct btree_trans *trans, +- struct btree_iter *iter, +- unsigned long ip) +-{ +- struct btree_iter *child = btree_iter_child(trans, iter); +- +- if (!child) { +- child = btree_trans_iter_alloc(trans, iter); +- child->ip_allocated = ip; +- iter->child_idx = child->idx; +- +- trans->iters_live |= 1ULL << child->idx; +- trans->iters_touched |= 1ULL << child->idx; +- } +- +- return child; +-} +- +-static inline void __bch2_trans_iter_free(struct btree_trans *trans, +- unsigned idx) +-{ +- btree_iter_child_free(trans, &trans->iters[idx]); +- +- btree_iter_list_remove(trans, &trans->iters[idx]); +- +- __bch2_btree_iter_unlock(&trans->iters[idx]); +- trans->iters_linked &= ~(1ULL << idx); +- trans->iters_live &= ~(1ULL << idx); +- trans->iters_touched &= ~(1ULL << idx); +-} +- +-static bool have_iter_at_pos(struct btree_trans *trans, +- struct btree_iter *iter) +-{ +- struct btree_iter *n; +- +- n = prev_btree_iter(trans, iter); +- if (n && !btree_iter_cmp(n, iter)) +- return true; +- +- n = next_btree_iter(trans, iter); +- if (n && !btree_iter_cmp(n, iter)) +- return true; +- +- return false; +-} +- +-int bch2_trans_iter_put(struct btree_trans *trans, +- struct btree_iter *iter) +-{ +- int ret; +- +- if (IS_ERR_OR_NULL(iter)) +- return 0; +- +- BUG_ON(trans->iters + iter->idx != iter); +- BUG_ON(!btree_iter_live(trans, iter)); +- +- ret = btree_iter_err(iter); +- +- if (!(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) && +- (!(trans->iters_touched & (1ULL << iter->idx)) || +- have_iter_at_pos(trans, iter))) +- __bch2_trans_iter_free(trans, iter->idx); +- +- trans->iters_live &= ~(1ULL << iter->idx); +- return ret; +-} +- +-int bch2_trans_iter_free(struct btree_trans *trans, +- struct btree_iter *iter) ++void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) + { +- if (IS_ERR_OR_NULL(iter)) +- return 0; +- +- set_btree_iter_dontneed(trans, iter); +- +- return bch2_trans_iter_put(trans, iter); ++ if (iter->path) ++ bch2_path_put(trans, iter->path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->path = NULL; + } + +-noinline __cold +-void bch2_dump_trans_iters_updates(struct btree_trans *trans) +-{ +- struct btree_iter *iter; +- struct btree_insert_entry *i; +- char buf1[300], buf2[100]; +- +- btree_trans_verify_sorted(trans); +- +- trans_for_each_iter_inorder(trans, iter) +- printk(KERN_ERR "iter: btree %s pos %s real_pos %s%s%s%s %pS\n", +- bch2_btree_ids[iter->btree_id], +- (bch2_bpos_to_text(&PBUF(buf1), iter->pos), buf1), +- (bch2_bpos_to_text(&PBUF(buf2), iter->real_pos), buf2), +- btree_iter_live(trans, iter) ? " live" : "", +- (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", +- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", +- (void *) iter->ip_allocated); +- +- trans_for_each_update(trans, i) +- printk(KERN_ERR "update: btree %s %s %pS\n", +- bch2_btree_ids[i->btree_id], +- (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, bkey_i_to_s_c(i->k)), buf1), +- (void *) i->ip_allocated); +-} +- +-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, +- struct btree_iter *pos) +-{ +- struct btree_iter *iter; +- unsigned idx; +- +- if (unlikely(trans->iters_linked == +- ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { +- bch2_dump_trans_iters_updates(trans); +- panic("trans iter oveflow\n"); +- } +- +- idx = __ffs64(~trans->iters_linked); +- iter = &trans->iters[idx]; +- +- iter->trans = trans; +- iter->idx = idx; +- iter->child_idx = U8_MAX; +- iter->sorted_idx = U8_MAX; +- iter->flags = 0; +- iter->nodes_locked = 0; +- iter->nodes_intent_locked = 0; +- trans->iters_linked |= 1ULL << idx; +- +- btree_iter_list_add(trans, pos, iter); +- return iter; +-} +- +-static void btree_iter_copy(struct btree_trans *trans, struct btree_iter *dst, +- struct btree_iter *src) +-{ +- unsigned i; +- +- __bch2_btree_iter_unlock(dst); +- btree_iter_child_free(trans, dst); +- +- memcpy(&dst->flags, &src->flags, +- sizeof(struct btree_iter) - offsetof(struct btree_iter, flags)); +- +- for (i = 0; i < BTREE_MAX_DEPTH; i++) +- if (btree_node_locked(dst, i)) +- six_lock_increment(&dst->l[i].b->c.lock, +- __btree_lock_want(dst, i)); +- +- dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; +- +- btree_iter_check_sort(trans, dst); +-} +- +-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, +- unsigned btree_id, struct bpos pos, +- unsigned locks_want, +- unsigned depth, +- unsigned flags) ++static void __bch2_trans_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned btree_id, struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, ++ unsigned flags) + { +- struct btree_iter *iter, *list_pos = NULL, *best = NULL; +- struct bpos real_pos, pos_min = POS_MIN; +- int cmp; +- + EBUG_ON(trans->restarted); + + if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && +@@ -2293,127 +2401,58 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + pos.snapshot = btree_type_has_snapshots(btree_id) + ? 
U32_MAX : 0; + +- real_pos = pos; +- +- if ((flags & BTREE_ITER_IS_EXTENTS) && +- bkey_cmp(pos, POS_MAX)) +- real_pos = bpos_nosnap_successor(pos); +- +- trans_for_each_iter_inorder(trans, iter) { +- list_pos = iter; +- +- if (iter->cached != (flags & BTREE_ITER_CACHED) || +- iter->btree_id != btree_id) +- continue; +- +- /* +- * Since advancing iterators is cheaper than rewinding them, we +- * prefer a path <= the search pos +- */ +- cmp = bpos_cmp(iter->real_pos, real_pos) ?: +- cmp_int(iter->level, depth); +- if (!best || cmp <= 0) +- best = iter; +- if (cmp >= 0) +- break; +- } +- +- if (!best) { +- iter = btree_trans_iter_alloc(trans, list_pos); +- bch2_btree_iter_init(trans, iter, btree_id); +- } else if (btree_iter_keep(trans, best)) { +- iter = btree_trans_iter_alloc(trans, best); +- btree_iter_copy(trans, iter, best); +- } else { +- iter = best; +- } +- +- trans->iters_live |= 1ULL << iter->idx; +- trans->iters_touched |= 1ULL << iter->idx; +- +- iter->cached = flags & BTREE_ITER_CACHED; ++ iter->trans = trans; ++ iter->path = NULL; ++ iter->btree_id = btree_id; ++ iter->min_depth = depth; + iter->flags = flags; + iter->snapshot = pos.snapshot; ++ iter->pos = pos; ++ iter->k.type = KEY_TYPE_deleted; ++ iter->k.p = pos; ++ iter->k.size = 0; + +- /* +- * If the iterator has locks_want greater than requested, we explicitly +- * do not downgrade it here - on transaction restart because btree node +- * split needs to upgrade locks, we might be putting/getting the +- * iterator again. Downgrading iterators only happens via an explicit +- * bch2_trans_downgrade(). +- */ +- +- locks_want = min(locks_want, BTREE_MAX_DEPTH); +- if (locks_want > iter->locks_want) { +- iter->locks_want = locks_want; +- btree_iter_get_locks(trans, iter, true, _THIS_IP_); +- } +- +- while (iter->level != depth) { +- btree_node_unlock(iter, iter->level); +- iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; +- iter->uptodate = BTREE_ITER_NEED_TRAVERSE; +- if (iter->level < depth) +- iter->level++; +- else +- iter->level--; +- } +- +- iter->min_depth = depth; +- +- bch2_btree_iter_set_pos(iter, pos); +- btree_iter_set_search_pos(iter, real_pos); +- +- trace_trans_get_iter(_RET_IP_, trans->ip, +- btree_id, +- &real_pos, locks_want, iter->uptodate, +- best ? &best->real_pos : &pos_min, +- best ? best->locks_want : U8_MAX, +- best ? 
best->uptodate : U8_MAX); +- +- return iter; ++ iter->path = bch2_path_get(trans, ++ flags & BTREE_ITER_CACHED, ++ btree_id, ++ btree_iter_search_key(iter), ++ locks_want, ++ depth, ++ flags & BTREE_ITER_INTENT); + } + +-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, +- enum btree_id btree_id, +- struct bpos pos, +- unsigned locks_want, +- unsigned depth, +- unsigned flags) ++void bch2_trans_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned btree_id, struct bpos pos, ++ unsigned flags) + { +- struct btree_iter *iter = +- __bch2_trans_get_iter(trans, btree_id, pos, +- locks_want, depth, +- BTREE_ITER_NOT_EXTENTS| +- __BTREE_ITER_ALL_SNAPSHOTS| +- BTREE_ITER_ALL_SNAPSHOTS| +- flags); +- +- BUG_ON(bkey_cmp(iter->pos, pos)); +- BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH)); +- BUG_ON(iter->level != depth); +- BUG_ON(iter->min_depth != depth); +- iter->ip_allocated = _RET_IP_; +- +- return iter; ++ __bch2_trans_iter_init(trans, iter, btree_id, pos, ++ 0, 0, flags); + } + +-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, +- struct btree_iter *src) ++void bch2_trans_node_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, ++ enum btree_id btree_id, ++ struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, ++ unsigned flags) + { +- struct btree_iter *iter; +- +- iter = btree_trans_iter_alloc(trans, src); +- btree_iter_copy(trans, iter, src); +- +- trans->iters_live |= 1ULL << iter->idx; +- /* +- * We don't need to preserve this iter since it's cheap to copy it +- * again - this will cause trans_iter_put() to free it right away: +- */ +- set_btree_iter_dontneed(trans, iter); ++ __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth, ++ BTREE_ITER_NOT_EXTENTS| ++ __BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ flags); ++ BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); ++ BUG_ON(iter->path->level != depth); ++ BUG_ON(iter->min_depth != depth); ++} + +- return iter; ++void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) ++{ ++ *dst = *src; ++ if (src->path) ++ __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); + } + + void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +@@ -2454,20 +2493,6 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) + return p; + } + +-inline void bch2_trans_unlink_iters(struct btree_trans *trans) +-{ +- u64 iters = trans->iters_linked & +- ~trans->iters_touched & +- ~trans->iters_live; +- +- while (iters) { +- unsigned idx = __ffs64(iters); +- +- iters &= ~(1ULL << idx); +- __bch2_trans_iter_free(trans, idx); +- } +-} +- + /** + * bch2_trans_begin() - reset a transaction after a interrupted attempt + * @trans: transaction to reset +@@ -2478,17 +2503,11 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans) + */ + void bch2_trans_begin(struct btree_trans *trans) + { +- struct btree_iter *iter; +- +- trans_for_each_iter(trans, iter) +- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ struct btree_insert_entry *i; ++ struct btree_path *path; + +- /* +- * XXX: we shouldn't be doing this if the transaction was restarted, but +- * currently we still overflow transaction iterators if we do that +- * */ +- bch2_trans_unlink_iters(trans); +- trans->iters_touched &= trans->iters_live; ++ trans_for_each_update(trans, i) ++ __btree_path_put(i->path, true); + + trans->extra_journal_res = 0; + trans->nr_updates = 0; +@@ -2505,17 +2524,29 @@ void bch2_trans_begin(struct 
btree_trans *trans) + (void *) &trans->fs_usage_deltas->memset_start); + } + ++ trans_for_each_path(trans, path) { ++ /* ++ * XXX: we probably shouldn't be doing this if the transaction ++ * was restarted, but currently we still overflow transaction ++ * iterators if we do that ++ */ ++ if (!path->ref && !path->preserve) ++ __bch2_path_free(trans, path); ++ else ++ path->preserve = path->should_be_locked = false; ++ } ++ + bch2_trans_cond_resched(trans); + + if (trans->restarted) +- bch2_btree_iter_traverse_all(trans); ++ bch2_btree_path_traverse_all(trans); + + trans->restarted = false; + } + +-static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) ++static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) + { +- size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX; ++ size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; + size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; + size_t sorted_bytes = sizeof(u8) * BTREE_ITER_MAX; + void *p = NULL; +@@ -2523,12 +2554,12 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) + BUG_ON(trans->used_mempool); + + #ifdef __KERNEL__ +- p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL); ++ p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); + #endif + if (!p) +- p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); ++ p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); + +- trans->iters = p; p += iters_bytes; ++ trans->paths = p; p += paths_bytes; + trans->updates = p; p += updates_bytes; + trans->sorted = p; p += sorted_bytes; + } +@@ -2542,11 +2573,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + trans->c = c; + trans->ip = _RET_IP_; + +- /* +- * reallocating iterators currently completely breaks +- * bch2_trans_iter_put(), we always allocate the max: +- */ +- bch2_trans_alloc_iters(trans, c); ++ bch2_trans_alloc_paths(trans, c); + + if (expected_mem_bytes) { + trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); +@@ -2568,54 +2595,63 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + #endif + } + ++static void check_btree_paths_leaked(struct btree_trans *trans) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bch_fs *c = trans->c; ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ if (path->ref) ++ goto leaked; ++ return; ++leaked: ++ bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip); ++ trans_for_each_path(trans, path) ++ if (path->ref) ++ printk(KERN_ERR " btree %s %pS\n", ++ bch2_btree_ids[path->btree_id], ++ (void *) path->ip_allocated); ++ /* Be noisy about this: */ ++ bch2_fatal_error(c); ++#endif ++} ++ + int bch2_trans_exit(struct btree_trans *trans) + __releases(&c->btree_trans_barrier) + { ++ struct btree_insert_entry *i; + struct bch_fs *c = trans->c; + + bch2_trans_unlock(trans); + +-#ifdef CONFIG_BCACHEFS_DEBUG +- if (trans->iters_live) { +- struct btree_iter *iter; +- +- trans_for_each_iter(trans, iter) +- btree_iter_child_free(trans, iter); +- } ++ trans_for_each_update(trans, i) ++ __btree_path_put(i->path, true); ++ trans->nr_updates = 0; + +- if (trans->iters_live) { +- struct btree_iter *iter; +- +- bch_err(c, "btree iterators leaked!"); +- trans_for_each_iter(trans, iter) +- if (btree_iter_live(trans, iter)) +- printk(KERN_ERR " btree %s allocated at %pS\n", +- bch2_btree_ids[iter->btree_id], +- (void *) iter->ip_allocated); +- /* Be noisy about this: */ +- bch2_fatal_error(c); +- } ++ check_btree_paths_leaked(trans); + 
+- mutex_lock(&trans->c->btree_trans_lock); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ mutex_lock(&c->btree_trans_lock); + list_del(&trans->list); +- mutex_unlock(&trans->c->btree_trans_lock); ++ mutex_unlock(&c->btree_trans_lock); + #endif + + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + +- bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ bch2_journal_preres_put(&c->journal, &trans->journal_preres); + + if (trans->fs_usage_deltas) { + if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == + REPLICAS_DELTA_LIST_MAX) + mempool_free(trans->fs_usage_deltas, +- &trans->c->replicas_delta_pool); ++ &c->replicas_delta_pool); + else + kfree(trans->fs_usage_deltas); + } + + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) +- mempool_free(trans->mem, &trans->c->btree_trans_mem_pool); ++ mempool_free(trans->mem, &c->btree_trans_mem_pool); + else + kfree(trans->mem); + +@@ -2623,20 +2659,20 @@ int bch2_trans_exit(struct btree_trans *trans) + /* + * Userspace doesn't have a real percpu implementation: + */ +- trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); ++ trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); + #endif + +- if (trans->iters) +- mempool_free(trans->iters, &trans->c->btree_iters_pool); ++ if (trans->paths) ++ mempool_free(trans->paths, &c->btree_paths_pool); + + trans->mem = (void *) 0x1; +- trans->iters = (void *) 0x1; ++ trans->paths = (void *) 0x1; + + return trans->error ? -EIO : 0; + } + + static void __maybe_unused +-bch2_btree_iter_node_to_text(struct printbuf *out, ++bch2_btree_path_node_to_text(struct printbuf *out, + struct btree_bkey_cached_common *_b, + bool cached) + { +@@ -2648,10 +2684,10 @@ bch2_btree_iter_node_to_text(struct printbuf *out, + #ifdef CONFIG_BCACHEFS_DEBUG + static bool trans_has_locks(struct btree_trans *trans) + { +- struct btree_iter *iter; ++ struct btree_path *path; + +- trans_for_each_iter(trans, iter) +- if (iter->nodes_locked) ++ trans_for_each_path(trans, path) ++ if (path->nodes_locked) + return true; + return false; + } +@@ -2661,7 +2697,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + { + #ifdef CONFIG_BCACHEFS_DEBUG + struct btree_trans *trans; +- struct btree_iter *iter; ++ struct btree_path *path; + struct btree *b; + unsigned l; + +@@ -2672,24 +2708,24 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); + +- trans_for_each_iter(trans, iter) { +- if (!iter->nodes_locked) ++ trans_for_each_path(trans, path) { ++ if (!path->nodes_locked) + continue; + +- pr_buf(out, " iter %u %c %s:", +- iter->idx, +- iter->cached ? 'c' : 'b', +- bch2_btree_ids[iter->btree_id]); +- bch2_bpos_to_text(out, iter->pos); ++ pr_buf(out, " path %u %c %s:", ++ path->idx, ++ path->cached ? 'c' : 'b', ++ bch2_btree_ids[path->btree_id]); ++ bch2_bpos_to_text(out, path->pos); + pr_buf(out, "\n"); + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { +- if (btree_node_locked(iter, l)) { ++ if (btree_node_locked(path, l)) { + pr_buf(out, " %s l=%u ", +- btree_node_intent_locked(iter, l) ? "i" : "r", l); +- bch2_btree_iter_node_to_text(out, +- (void *) iter->l[l].b, +- iter->cached); ++ btree_node_intent_locked(path, l) ? 
"i" : "r", l); ++ bch2_btree_path_node_to_text(out, ++ (void *) path->l[l].b, ++ path->cached); + pr_buf(out, "\n"); + } + } +@@ -2697,18 +2733,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + b = READ_ONCE(trans->locking); + if (b) { +- iter = &trans->iters[trans->locking_iter_idx]; +- pr_buf(out, " locking iter %u %c l=%u %s:", +- trans->locking_iter_idx, +- iter->cached ? 'c' : 'b', ++ path = &trans->paths[trans->locking_path_idx]; ++ pr_buf(out, " locking path %u %c l=%u %s:", ++ trans->locking_path_idx, ++ path->cached ? 'c' : 'b', + trans->locking_level, + bch2_btree_ids[trans->locking_btree_id]); + bch2_bpos_to_text(out, trans->locking_pos); + + pr_buf(out, " node "); +- bch2_btree_iter_node_to_text(out, +- (void *) b, +- iter->cached); ++ bch2_btree_path_node_to_text(out, ++ (void *) b, path->cached); + pr_buf(out, "\n"); + } + } +@@ -2719,7 +2754,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + void bch2_fs_btree_iter_exit(struct bch_fs *c) + { + mempool_exit(&c->btree_trans_mem_pool); +- mempool_exit(&c->btree_iters_pool); ++ mempool_exit(&c->btree_paths_pool); + cleanup_srcu_struct(&c->btree_trans_barrier); + } + +@@ -2731,9 +2766,9 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) + mutex_init(&c->btree_trans_lock); + + return init_srcu_struct(&c->btree_trans_barrier) ?: +- mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, ++ mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, + sizeof(u8) * nr + +- sizeof(struct btree_iter) * nr + ++ sizeof(struct btree_path) * nr + + sizeof(struct btree_insert_entry) * nr) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, + BTREE_TRANS_MEM_MAX); +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index e4bfd9e75784..273bc7f3a29b 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -5,40 +5,49 @@ + #include "bset.h" + #include "btree_types.h" + +-static inline void btree_iter_set_dirty(struct btree_iter *iter, +- enum btree_iter_uptodate u) ++static inline void __btree_path_get(struct btree_path *path, bool intent) + { +- iter->uptodate = max_t(unsigned, iter->uptodate, u); ++ path->ref++; ++ path->intent_ref += intent; + } + +-static inline struct btree *btree_iter_node(struct btree_iter *iter, ++static inline bool __btree_path_put(struct btree_path *path, bool intent) ++{ ++ EBUG_ON(!path->ref); ++ EBUG_ON(!path->intent_ref && intent); ++ path->intent_ref -= intent; ++ return --path->ref == 0; ++} ++ ++static inline void btree_path_set_dirty(struct btree_path *path, ++ enum btree_path_uptodate u) ++{ ++ path->uptodate = max_t(unsigned, path->uptodate, u); ++} ++ ++static inline struct btree *btree_path_node(struct btree_path *path, + unsigned level) + { +- return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL; ++ return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL; + } + +-static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, ++static inline bool btree_node_lock_seq_matches(const struct btree_path *path, + const struct btree *b, unsigned level) + { + /* + * We don't compare the low bits of the lock sequence numbers because +- * @iter might have taken a write lock on @b, and we don't want to skip +- * the linked iterator if the sequence numbers were equal before taking +- * that write lock. 
The lock sequence number is incremented by taking +- * and releasing write locks and is even when unlocked: ++ * @path might have taken a write lock on @b, and we don't want to skip ++ * the linked path if the sequence numbers were equal before taking that ++ * write lock. The lock sequence number is incremented by taking and ++ * releasing write locks and is even when unlocked: + */ +- return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; ++ return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; + } + +-static inline struct btree *btree_node_parent(struct btree_iter *iter, ++static inline struct btree *btree_node_parent(struct btree_path *path, + struct btree *b) + { +- return btree_iter_node(iter, b->c.level + 1); +-} +- +-static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) +-{ +- return hweight64(trans->iters_linked) > 1; ++ return btree_path_node(path, b->c.level + 1); + } + + static inline int btree_iter_err(const struct btree_iter *iter) +@@ -46,96 +55,105 @@ static inline int btree_iter_err(const struct btree_iter *iter) + return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; + } + +-/* Iterate over iters within a transaction: */ ++/* Iterate over paths within a transaction: */ + +-static inline struct btree_iter * +-__trans_next_iter(struct btree_trans *trans, unsigned idx) ++static inline struct btree_path * ++__trans_next_path(struct btree_trans *trans, unsigned idx) + { + u64 l; + + if (idx == BTREE_ITER_MAX) + return NULL; + +- l = trans->iters_linked >> idx; ++ l = trans->paths_allocated >> idx; + if (!l) + return NULL; + + idx += __ffs64(l); + EBUG_ON(idx >= BTREE_ITER_MAX); +- EBUG_ON(trans->iters[idx].idx != idx); +- return &trans->iters[idx]; ++ EBUG_ON(trans->paths[idx].idx != idx); ++ return &trans->paths[idx]; + } + +-#define trans_for_each_iter(_trans, _iter) \ +- for (_iter = __trans_next_iter((_trans), 0); \ +- (_iter); \ +- _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) ++#define trans_for_each_path(_trans, _path) \ ++ for (_path = __trans_next_path((_trans), 0); \ ++ (_path); \ ++ _path = __trans_next_path((_trans), (_path)->idx + 1)) + +-static inline struct btree_iter *next_btree_iter(struct btree_trans *trans, struct btree_iter *iter) ++static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) + { +- unsigned idx = iter ? iter->sorted_idx + 1 : 0; ++ unsigned idx = path ? path->sorted_idx + 1 : 0; + + EBUG_ON(idx > trans->nr_sorted); + + return idx < trans->nr_sorted +- ? trans->iters + trans->sorted[idx] ++ ? trans->paths + trans->sorted[idx] + : NULL; + } + +-static inline struct btree_iter *prev_btree_iter(struct btree_trans *trans, struct btree_iter *iter) ++static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) + { +- EBUG_ON(iter->sorted_idx >= trans->nr_sorted); +- return iter->sorted_idx +- ? trans->iters + trans->sorted[iter->sorted_idx - 1] ++ EBUG_ON(path->sorted_idx >= trans->nr_sorted); ++ return path->sorted_idx ++ ? 
trans->paths + trans->sorted[path->sorted_idx - 1] + : NULL; + } + +-#define trans_for_each_iter_inorder(_trans, _iter) \ +- for (_iter = next_btree_iter(trans, NULL); \ +- (_iter); \ +- _iter = next_btree_iter((_trans), (_iter))) ++#define trans_for_each_path_inorder(_trans, _path) \ ++ for (_path = next_btree_path(trans, NULL); \ ++ (_path); \ ++ _path = next_btree_path((_trans), (_path))) + +-static inline bool __iter_has_node(const struct btree_iter *iter, ++static inline bool __path_has_node(const struct btree_path *path, + const struct btree *b) + { +- return iter->l[b->c.level].b == b && +- btree_node_lock_seq_matches(iter, b, b->c.level); ++ return path->l[b->c.level].b == b && ++ btree_node_lock_seq_matches(path, b, b->c.level); + } + +-static inline struct btree_iter * +-__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, ++static inline struct btree_path * ++__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, + unsigned idx) + { +- struct btree_iter *iter = __trans_next_iter(trans, idx); ++ struct btree_path *path = __trans_next_path(trans, idx); + +- while (iter && !__iter_has_node(iter, b)) +- iter = __trans_next_iter(trans, iter->idx + 1); ++ while (path && !__path_has_node(path, b)) ++ path = __trans_next_path(trans, path->idx + 1); + +- return iter; ++ return path; + } + +-#define trans_for_each_iter_with_node(_trans, _b, _iter) \ +- for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ +- (_iter); \ +- _iter = __trans_next_iter_with_node((_trans), (_b), \ +- (_iter)->idx + 1)) ++#define trans_for_each_path_with_node(_trans, _b, _path) \ ++ for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ ++ (_path); \ ++ _path = __trans_next_path_with_node((_trans), (_b), \ ++ (_path)->idx + 1)) ++ ++struct btree_path * __must_check ++bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool); ++int __must_check bch2_btree_path_traverse(struct btree_trans *, ++ struct btree_path *, unsigned); ++struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id, ++ struct bpos, unsigned, unsigned, bool); ++inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); + + #ifdef CONFIG_BCACHEFS_DEBUG +-void bch2_trans_verify_iters(struct btree_trans *, struct btree *); ++void bch2_trans_verify_paths(struct btree_trans *); + void bch2_trans_verify_locks(struct btree_trans *); + #else +-static inline void bch2_trans_verify_iters(struct btree_trans *trans, +- struct btree *b) {} +-static inline void bch2_trans_verify_locks(struct btree_trans *iter) {} ++static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} ++static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} + #endif + +-void bch2_btree_iter_fix_key_modified(struct btree_trans *trans, ++void bch2_btree_path_fix_key_modified(struct btree_trans *trans, + struct btree *, struct bkey_packed *); +-void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_iter *, ++void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_packed *, unsigned, unsigned); + +-bool bch2_btree_iter_relock_intent(struct btree_trans *, struct btree_iter *); ++bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); ++ ++void bch2_path_put(struct btree_trans *, struct btree_path *, bool); + + bool bch2_trans_relock(struct btree_trans *); + void bch2_trans_unlock(struct btree_trans *); +@@ -148,28 +166,28 @@ static 
inline int btree_trans_restart(struct btree_trans *trans) + return -EINTR; + } + +-bool __bch2_btree_iter_upgrade(struct btree_trans *, +- struct btree_iter *, unsigned); ++bool __bch2_btree_path_upgrade(struct btree_trans *, ++ struct btree_path *, unsigned); + +-static inline bool bch2_btree_iter_upgrade(struct btree_trans *trans, +- struct btree_iter *iter, ++static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, ++ struct btree_path *path, + unsigned new_locks_want) + { + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + +- return iter->locks_want < new_locks_want +- ? __bch2_btree_iter_upgrade(trans, iter, new_locks_want) +- : iter->uptodate == BTREE_ITER_UPTODATE; ++ return path->locks_want < new_locks_want ++ ? __bch2_btree_path_upgrade(trans, path, new_locks_want) ++ : path->uptodate == BTREE_ITER_UPTODATE; + } + +-void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); ++void __bch2_btree_path_downgrade(struct btree_path *, unsigned); + +-static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) ++static inline void bch2_btree_path_downgrade(struct btree_path *path) + { +- unsigned new_locks_want = iter->level + !!(iter->flags & BTREE_ITER_INTENT); ++ unsigned new_locks_want = path->level + !!path->intent_ref; + +- if (iter->locks_want > new_locks_want) +- __bch2_btree_iter_downgrade(iter, new_locks_want); ++ if (path->locks_want > new_locks_want) ++ __bch2_btree_path_downgrade(path, new_locks_want); + } + + void bch2_trans_downgrade(struct btree_trans *); +@@ -206,7 +224,8 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos + iter->k.p.offset = iter->pos.offset = new_pos.offset; + iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; + iter->k.size = 0; +- iter->should_be_locked = false; ++ if (iter->path->ref == 1) ++ iter->path->should_be_locked = false; + } + + static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) +@@ -215,17 +234,6 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it + iter->pos = bkey_start_pos(&iter->k); + } + +-static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, unsigned idx) +-{ +- return idx != U8_MAX ? 
trans->iters + idx : NULL; +-} +- +-static inline struct btree_iter *btree_iter_child(struct btree_trans *trans, +- struct btree_iter *iter) +-{ +- return idx_to_btree_iter(trans, iter->child_idx); +-} +- + /* + * Unlocks before scheduling + * Note: does not revalidate iterator +@@ -243,11 +251,11 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) + + #define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b) \ +- for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ ++ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ + _start, _locks_want, _depth, _flags), \ +- _b = bch2_btree_iter_peek_node(_iter); \ ++ _b = bch2_btree_iter_peek_node(&(_iter)); \ + (_b); \ +- (_b) = bch2_btree_iter_next_node(_iter)) ++ (_b) = bch2_btree_iter_next_node(&(_iter))) + + #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _flags, _b) \ +@@ -277,77 +285,36 @@ static inline int bkey_err(struct bkey_s_c k) + + #define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ +- for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \ +- (_start), (_flags)), \ +- (_k) = __bch2_btree_iter_peek(_iter, _flags); \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)), \ ++ (_k) = __bch2_btree_iter_peek(&(_iter), _flags); \ + !((_ret) = bkey_err(_k)) && (_k).k; \ +- (_k) = __bch2_btree_iter_next(_iter, _flags)) ++ (_k) = __bch2_btree_iter_next(&(_iter), _flags)) + + #define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ +- for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ ++ for ((_k) = __bch2_btree_iter_peek(&(_iter), _flags); \ + !((_ret) = bkey_err(_k)) && (_k).k; \ +- (_k) = __bch2_btree_iter_next(_iter, _flags)) ++ (_k) = __bch2_btree_iter_next(&(_iter), _flags)) + + /* new multiple iterator interface: */ + +-void bch2_dump_trans_iters_updates(struct btree_trans *); +- +-int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); +-int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); ++void bch2_dump_trans_paths_updates(struct btree_trans *); + +-void bch2_trans_unlink_iters(struct btree_trans *); ++void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); ++void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, ++ unsigned, struct bpos, unsigned); ++void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, ++ enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned); ++void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); + +-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, +- struct bpos, unsigned, +- unsigned, unsigned); +- +-static inline struct btree_iter * +-bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, +- struct bpos pos, unsigned flags) +-{ +- struct btree_iter *iter = +- __bch2_trans_get_iter(trans, btree_id, pos, +- (flags & BTREE_ITER_INTENT) != 0, 0, +- flags); +- iter->ip_allocated = _THIS_IP_; +- return iter; +-} +- +-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, +- struct btree_iter *); +-static inline struct btree_iter * +-bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) +-{ +- struct btree_iter *iter = +- __bch2_trans_copy_iter(trans, src); +- +- iter->ip_allocated = _THIS_IP_; +- return iter; +-} +- +-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, +- enum btree_id, struct bpos, +- unsigned, unsigned, unsigned); +- +-static inline bool 
btree_iter_live(struct btree_trans *trans, struct btree_iter *iter) ++static inline void set_btree_iter_dontneed(struct btree_iter *iter) + { +- return (trans->iters_live & (1ULL << iter->idx)) != 0; ++ iter->path->preserve = false; + } + +-static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter) +-{ +- return btree_iter_live(trans, iter) || +- (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); +-} +- +-static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) +-{ +- trans->iters_touched &= ~(1ULL << iter->idx); +-} +- +-void bch2_trans_begin(struct btree_trans *); +- + void *bch2_trans_kmalloc(struct btree_trans *, size_t); ++void bch2_trans_begin(struct btree_trans *); + void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); + int bch2_trans_exit(struct btree_trans *); + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 924b67e79805..2dfa5040d045 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -196,23 +196,23 @@ btree_key_cache_create(struct btree_key_cache *c, + } + + static int btree_key_cache_fill(struct btree_trans *trans, +- struct btree_iter *ck_iter, ++ struct btree_path *ck_path, + struct bkey_cached *ck) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + unsigned new_u64s = 0; + struct bkey_i *new_k = NULL; + int ret; + +- iter = bch2_trans_get_iter(trans, ck->key.btree_id, +- ck->key.pos, BTREE_ITER_SLOTS); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ++ ck->key.pos, BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + +- if (!bch2_btree_node_relock(trans, ck_iter, 0)) { ++ if (!bch2_btree_node_relock(trans, ck_path, 0)) { + trace_transaction_restart_ip(trans->ip, _THIS_IP_); + ret = btree_trans_restart(trans); + goto err; +@@ -237,7 +237,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, + * XXX: not allowed to be holding read locks when we take a write lock, + * currently + */ +- bch2_btree_node_lock_write(trans, ck_iter, ck_iter->l[0].b); ++ bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); + if (new_k) { + kfree(ck->k); + ck->u64s = new_u64s; +@@ -246,62 +246,64 @@ static int btree_key_cache_fill(struct btree_trans *trans, + + bkey_reassemble(ck->k, k); + ck->valid = true; +- bch2_btree_node_unlock_write(trans, ck_iter, ck_iter->l[0].b); ++ bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); + + /* We're not likely to need this iterator again: */ +- set_btree_iter_dontneed(trans, iter); ++ set_btree_iter_dontneed(&iter); + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + + static int bkey_cached_check_fn(struct six_lock *lock, void *p) + { + struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); +- const struct btree_iter *iter = p; ++ const struct btree_path *path = p; + +- return ck->key.btree_id == iter->btree_id && +- !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1; ++ return ck->key.btree_id == path->btree_id && ++ !bpos_cmp(ck->key.pos, path->pos) ? 
0 : -1; + } + + __flatten +-int bch2_btree_iter_traverse_cached(struct btree_trans *trans, struct btree_iter *iter) ++int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, ++ unsigned flags) + { + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; + +- BUG_ON(iter->level); ++ BUG_ON(path->level); + +- iter->l[1].b = NULL; ++ path->l[1].b = NULL; + +- if (bch2_btree_node_relock(trans, iter, 0)) { +- ck = (void *) iter->l[0].b; ++ if (bch2_btree_node_relock(trans, path, 0)) { ++ ck = (void *) path->l[0].b; + goto fill; + } + retry: +- ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos); ++ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); + if (!ck) { +- if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { +- iter->l[0].b = NULL; ++ if (flags & BTREE_ITER_CACHED_NOCREATE) { ++ path->l[0].b = NULL; + return 0; + } + + ck = btree_key_cache_create(&c->btree_key_cache, +- iter->btree_id, iter->pos); ++ path->btree_id, path->pos); + ret = PTR_ERR_OR_ZERO(ck); + if (ret) + goto err; + if (!ck) + goto retry; + +- mark_btree_node_locked(iter, 0, SIX_LOCK_intent); +- iter->locks_want = 1; ++ mark_btree_node_locked(path, 0, SIX_LOCK_intent); ++ path->locks_want = 1; + } else { +- enum six_lock_type lock_want = __btree_lock_want(iter, 0); ++ enum six_lock_type lock_want = __btree_lock_want(path, 0); + +- if (!btree_node_lock(trans, iter, (void *) ck, iter->pos, 0, lock_want, +- bkey_cached_check_fn, iter, _THIS_IP_)) { ++ if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0, ++ lock_want, ++ bkey_cached_check_fn, path, _THIS_IP_)) { + if (!trans->restarted) + goto retry; + +@@ -310,28 +312,27 @@ retry: + goto err; + } + +- if (ck->key.btree_id != iter->btree_id || +- bpos_cmp(ck->key.pos, iter->pos)) { ++ if (ck->key.btree_id != path->btree_id || ++ bpos_cmp(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } + +- mark_btree_node_locked(iter, 0, lock_want); ++ mark_btree_node_locked(path, 0, lock_want); + } + +- iter->l[0].lock_seq = ck->c.lock.state.seq; +- iter->l[0].b = (void *) ck; ++ path->l[0].lock_seq = ck->c.lock.state.seq; ++ path->l[0].b = (void *) ck; + fill: +- if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { +- if (!iter->locks_want && +- !!__bch2_btree_iter_upgrade(trans, iter, 1)) { ++ if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { ++ if (!path->locks_want && ++ !__bch2_btree_path_upgrade(trans, path, 1)) { + trace_transaction_restart_ip(trans->ip, _THIS_IP_); +- BUG_ON(!trans->restarted); +- ret = -EINTR; ++ ret = btree_trans_restart(trans); + goto err; + } + +- ret = btree_key_cache_fill(trans, iter, ck); ++ ret = btree_key_cache_fill(trans, path, ck); + if (ret) + goto err; + } +@@ -339,22 +340,14 @@ fill: + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + +- iter->uptodate = BTREE_ITER_UPTODATE; +- +- if ((iter->flags & BTREE_ITER_INTENT) && +- !bch2_btree_iter_upgrade(trans, iter, 1)) { +- BUG_ON(!trans->restarted); +- ret = -EINTR; +- } +- +- BUG_ON(!ret && !btree_node_locked(iter, 0)); ++ path->uptodate = BTREE_ITER_UPTODATE; ++ BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + + return ret; + err: + if (ret != -EINTR) { +- btree_node_unlock(iter, 0); +- iter->flags |= BTREE_ITER_ERROR; +- iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; ++ btree_node_unlock(path, 0); ++ path->l[0].b = BTREE_ITER_NO_NODE_ERROR; + } + return ret; + } +@@ -367,23 +360,23 @@ static int 
btree_key_cache_flush_pos(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; +- struct btree_iter *c_iter = NULL, *b_iter = NULL; ++ struct btree_iter c_iter, b_iter; + struct bkey_cached *ck = NULL; + int ret; + +- b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, +- BTREE_ITER_SLOTS| +- BTREE_ITER_INTENT); +- c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, +- BTREE_ITER_CACHED| +- BTREE_ITER_CACHED_NOFILL| +- BTREE_ITER_CACHED_NOCREATE| +- BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(c_iter); ++ bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT); ++ bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_CACHED_NOCREATE| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&c_iter); + if (ret) + goto out; + +- ck = (void *) c_iter->l[0].b; ++ ck = (void *) c_iter.path->l[0].b; + if (!ck || + (journal_seq && ck->journal.seq != journal_seq)) + goto out; +@@ -399,8 +392,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + * allocator/copygc depend on journal reclaim making progress, we need + * to be using alloc reserves: + * */ +- ret = bch2_btree_iter_traverse(b_iter) ?: +- bch2_trans_update(trans, b_iter, ck->k, ++ ret = bch2_btree_iter_traverse(&b_iter) ?: ++ bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, +@@ -422,7 +415,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + bch2_journal_pin_drop(j, &ck->journal); + bch2_journal_preres_put(j, &ck->res); + +- BUG_ON(!btree_node_locked(c_iter, 0)); ++ BUG_ON(!btree_node_locked(c_iter.path, 0)); + + if (!evict) { + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { +@@ -431,10 +424,10 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + } + } else { + evict: +- BUG_ON(!btree_node_intent_locked(c_iter, 0)); ++ BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); + +- mark_btree_node_unlocked(c_iter, 0); +- c_iter->l[0].b = NULL; ++ mark_btree_node_unlocked(c_iter.path, 0); ++ c_iter.path->l[0].b = NULL; + + six_lock_write(&ck->c.lock, NULL, NULL); + +@@ -450,8 +443,8 @@ evict: + mutex_unlock(&c->btree_key_cache.lock); + } + out: +- bch2_trans_iter_put(trans, b_iter); +- bch2_trans_iter_put(trans, c_iter); ++ bch2_trans_iter_exit(trans, &b_iter); ++ bch2_trans_iter_exit(trans, &c_iter); + return ret; + } + +@@ -502,11 +495,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, + } + + bool bch2_btree_insert_key_cached(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct bkey_i *insert) + { + struct bch_fs *c = trans->c; +- struct bkey_cached *ck = (void *) iter->l[0].b; ++ struct bkey_cached *ck = (void *) path->l[0].b; + bool kick_reclaim = false; + + BUG_ON(insert->u64s > ck->u64s); +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index d890632e4425..0768ef3ca776 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -26,10 +26,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *, + struct bkey_cached * + bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + +-int bch2_btree_iter_traverse_cached(struct btree_trans *, struct btree_iter *); ++int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, ++ unsigned); + + bool 
bch2_btree_insert_key_cached(struct btree_trans *, +- struct btree_iter *, struct bkey_i *); ++ struct btree_path *, struct bkey_i *); + int bch2_btree_key_cache_flush(struct btree_trans *, + enum btree_id, struct bpos); + #ifdef CONFIG_BCACHEFS_DEBUG +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index f8b358f8f2c1..d599008c5fc1 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -21,7 +21,7 @@ enum btree_node_locked_type { + BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, + }; + +-static inline int btree_node_locked_type(struct btree_iter *iter, ++static inline int btree_node_locked_type(struct btree_path *path, + unsigned level) + { + /* +@@ -30,35 +30,35 @@ static inline int btree_node_locked_type(struct btree_iter *iter, + * branches: + */ + return BTREE_NODE_UNLOCKED + +- ((iter->nodes_locked >> level) & 1) + +- ((iter->nodes_intent_locked >> level) & 1); ++ ((path->nodes_locked >> level) & 1) + ++ ((path->nodes_intent_locked >> level) & 1); + } + +-static inline bool btree_node_intent_locked(struct btree_iter *iter, ++static inline bool btree_node_intent_locked(struct btree_path *path, + unsigned level) + { +- return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; ++ return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED; + } + +-static inline bool btree_node_read_locked(struct btree_iter *iter, ++static inline bool btree_node_read_locked(struct btree_path *path, + unsigned level) + { +- return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; ++ return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED; + } + +-static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) ++static inline bool btree_node_locked(struct btree_path *path, unsigned level) + { +- return iter->nodes_locked & (1 << level); ++ return path->nodes_locked & (1 << level); + } + +-static inline void mark_btree_node_unlocked(struct btree_iter *iter, ++static inline void mark_btree_node_unlocked(struct btree_path *path, + unsigned level) + { +- iter->nodes_locked &= ~(1 << level); +- iter->nodes_intent_locked &= ~(1 << level); ++ path->nodes_locked &= ~(1 << level); ++ path->nodes_intent_locked &= ~(1 << level); + } + +-static inline void mark_btree_node_locked(struct btree_iter *iter, ++static inline void mark_btree_node_locked(struct btree_path *path, + unsigned level, + enum six_lock_type type) + { +@@ -66,52 +66,52 @@ static inline void mark_btree_node_locked(struct btree_iter *iter, + BUILD_BUG_ON(SIX_LOCK_read != 0); + BUILD_BUG_ON(SIX_LOCK_intent != 1); + +- iter->nodes_locked |= 1 << level; +- iter->nodes_intent_locked |= type << level; ++ path->nodes_locked |= 1 << level; ++ path->nodes_intent_locked |= type << level; + } + +-static inline void mark_btree_node_intent_locked(struct btree_iter *iter, ++static inline void mark_btree_node_intent_locked(struct btree_path *path, + unsigned level) + { +- mark_btree_node_locked(iter, level, SIX_LOCK_intent); ++ mark_btree_node_locked(path, level, SIX_LOCK_intent); + } + +-static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) ++static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) + { +- return level < iter->locks_want ++ return level < path->locks_want + ? 
SIX_LOCK_intent + : SIX_LOCK_read; + } + + static inline enum btree_node_locked_type +-btree_lock_want(struct btree_iter *iter, int level) ++btree_lock_want(struct btree_path *path, int level) + { +- if (level < iter->level) ++ if (level < path->level) + return BTREE_NODE_UNLOCKED; +- if (level < iter->locks_want) ++ if (level < path->locks_want) + return BTREE_NODE_INTENT_LOCKED; +- if (level == iter->level) ++ if (level == path->level) + return BTREE_NODE_READ_LOCKED; + return BTREE_NODE_UNLOCKED; + } + +-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) ++static inline void btree_node_unlock(struct btree_path *path, unsigned level) + { +- int lock_type = btree_node_locked_type(iter, level); ++ int lock_type = btree_node_locked_type(path, level); + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + if (lock_type != BTREE_NODE_UNLOCKED) +- six_unlock_type(&iter->l[level].b->c.lock, lock_type); +- mark_btree_node_unlocked(iter, level); ++ six_unlock_type(&path->l[level].b->c.lock, lock_type); ++ mark_btree_node_unlocked(path, level); + } + +-static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) ++static inline void __bch2_btree_path_unlock(struct btree_path *path) + { +- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); + +- while (iter->nodes_locked) +- btree_node_unlock(iter, __ffs(iter->nodes_locked)); ++ while (path->nodes_locked) ++ btree_node_unlock(path, __ffs(path->nodes_locked)); + } + + static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) +@@ -155,11 +155,11 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, + struct btree *b, unsigned level, + enum btree_node_locked_type want) + { +- struct btree_iter *iter; ++ struct btree_path *path; + +- trans_for_each_iter(trans, iter) +- if (iter->l[level].b == b && +- btree_node_locked_type(iter, level) >= want) { ++ trans_for_each_path(trans, path) ++ if (path->l[level].b == b && ++ btree_node_locked_type(path, level) >= want) { + six_lock_increment(&b->c.lock, want); + return true; + } +@@ -167,38 +167,39 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, + return false; + } + +-bool __bch2_btree_node_lock(struct btree_trans *, struct btree_iter *, ++bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, + struct btree *, struct bpos, unsigned, +- enum six_lock_type, six_lock_should_sleep_fn, +- void *, unsigned long); ++ enum six_lock_type, ++ six_lock_should_sleep_fn, void *, ++ unsigned long); + + static inline bool btree_node_lock(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct btree *b, struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) + { + EBUG_ON(level >= BTREE_MAX_DEPTH); +- EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); ++ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + + return likely(six_trylock_type(&b->c.lock, type)) || + btree_node_lock_increment(trans, b, level, type) || +- __bch2_btree_node_lock(trans, iter, b, pos, level, type, ++ __bch2_btree_node_lock(trans, path, b, pos, level, type, + should_sleep_fn, p, ip); + } + +-bool __bch2_btree_node_relock(struct btree_trans *, struct btree_iter *, unsigned); ++bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); + + static inline bool bch2_btree_node_relock(struct btree_trans *trans, +- struct btree_iter *iter, unsigned 
level) ++ struct btree_path *path, unsigned level) + { +- EBUG_ON(btree_node_locked(iter, level) && +- btree_node_locked_type(iter, level) != +- __btree_lock_want(iter, level)); ++ EBUG_ON(btree_node_locked(path, level) && ++ btree_node_locked_type(path, level) != ++ __btree_lock_want(path, level)); + +- return likely(btree_node_locked(iter, level)) || +- __bch2_btree_node_relock(trans, iter, level); ++ return likely(btree_node_locked(path, level)) || ++ __bch2_btree_node_relock(trans, path, level); + } + + /* +@@ -206,32 +207,32 @@ static inline bool bch2_btree_node_relock(struct btree_trans *trans, + * succeed: + */ + static inline void +-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_iter *iter, ++bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, + struct btree *b) + { +- struct btree_iter *linked; ++ struct btree_path *linked; + +- EBUG_ON(iter->l[b->c.level].b != b); +- EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); ++ EBUG_ON(path->l[b->c.level].b != b); ++ EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + +- trans_for_each_iter_with_node(trans, b, linked) ++ trans_for_each_path_with_node(trans, b, linked) + linked->l[b->c.level].lock_seq += 2; + + six_unlock_write(&b->c.lock); + } + + void bch2_btree_node_unlock_write(struct btree_trans *, +- struct btree_iter *, struct btree *); ++ struct btree_path *, struct btree *); + + void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); + + static inline void bch2_btree_node_lock_write(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct btree *b) + { +- EBUG_ON(iter->l[b->c.level].b != b); +- EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); +- EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); ++ EBUG_ON(path->l[b->c.level].b != b); ++ EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq); ++ EBUG_ON(!btree_node_intent_locked(path, b->c.level)); + + if (unlikely(!six_trylock_write(&b->c.lock))) + __bch2_btree_node_lock_write(trans, b); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index cd2b79a04880..59a6b395d0e0 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -210,7 +210,7 @@ struct btree_node_iter { + #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) + #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) + +-enum btree_iter_uptodate { ++enum btree_path_uptodate { + BTREE_ITER_UPTODATE = 0, + BTREE_ITER_NEED_RELOCK = 1, + BTREE_ITER_NEED_TRAVERSE = 2, +@@ -225,51 +225,66 @@ enum btree_iter_uptodate { + #define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) + #define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) + +-/* +- * @pos - iterator's current position +- * @level - current btree depth +- * @locks_want - btree level below which we start taking intent locks +- * @nodes_locked - bitmask indicating which nodes in @nodes are locked +- * @nodes_intent_locked - bitmask indicating which locks are intent locks +- */ +-struct btree_iter { +- struct btree_trans *trans; +- unsigned long ip_allocated; +- ++struct btree_path { + u8 idx; +- u8 child_idx; + u8 sorted_idx; ++ u8 ref; ++ u8 intent_ref; + + /* btree_iter_copy starts here: */ +- u16 flags; +- +- /* When we're filtering by snapshot, the snapshot ID we're looking for: */ +- unsigned snapshot; +- + struct bpos pos; +- struct bpos real_pos; + + enum btree_id btree_id:4; + bool cached:1; +- enum btree_iter_uptodate uptodate:2; ++ bool preserve:1; ++ enum 
btree_path_uptodate uptodate:2; + /* +- * True if we've returned a key (and thus are expected to keep it +- * locked), false after set_pos - for avoiding spurious transaction +- * restarts in bch2_trans_relock(): ++ * When true, failing to relock this path will cause the transaction to ++ * restart: + */ + bool should_be_locked:1; +- unsigned level:4, +- min_depth:4, ++ unsigned level:3, + locks_want:4, + nodes_locked:4, + nodes_intent_locked:4; + +- struct btree_iter_level { ++ struct btree_path_level { + struct btree *b; + struct btree_node_iter iter; + u32 lock_seq; + } l[BTREE_MAX_DEPTH]; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned long ip_allocated; ++#endif ++}; + ++static inline struct btree_path_level *path_l(struct btree_path *path) ++{ ++ return path->l + path->level; ++} ++ ++/* ++ * @pos - iterator's current position ++ * @level - current btree depth ++ * @locks_want - btree level below which we start taking intent locks ++ * @nodes_locked - bitmask indicating which nodes in @nodes are locked ++ * @nodes_intent_locked - bitmask indicating which locks are intent locks ++ */ ++struct btree_iter { ++ struct btree_trans *trans; ++ struct btree_path *path; ++ ++ enum btree_id btree_id:4; ++ unsigned min_depth:4; ++ ++ /* btree_iter_copy starts here: */ ++ u16 flags; ++ ++ /* When we're filtering by snapshot, the snapshot ID we're looking for: */ ++ unsigned snapshot; ++ ++ struct bpos pos; ++ struct bpos pos_after_commit; + /* + * Current unpacked key - so that bch2_btree_iter_next()/ + * bch2_btree_iter_next_slot() can correctly advance pos. +@@ -277,11 +292,6 @@ struct btree_iter { + struct bkey k; + }; + +-static inline struct btree_iter_level *iter_l(struct btree_iter *iter) +-{ +- return iter->l + iter->level; +-} +- + struct btree_key_cache { + struct mutex lock; + struct rhashtable table; +@@ -329,7 +339,7 @@ struct btree_insert_entry { + bool cached:1; + bool trans_triggers_run:1; + struct bkey_i *k; +- struct btree_iter *iter; ++ struct btree_path *path; + unsigned long ip_allocated; + }; + +@@ -354,7 +364,7 @@ struct btree_trans { + #ifdef CONFIG_BCACHEFS_DEBUG + struct list_head list; + struct btree *locking; +- unsigned locking_iter_idx; ++ unsigned locking_path_idx; + struct bpos locking_pos; + u8 locking_btree_id; + u8 locking_level; +@@ -375,16 +385,14 @@ struct btree_trans { + */ + unsigned extra_journal_res; + +- u64 iters_linked; +- u64 iters_live; +- u64 iters_touched; ++ u64 paths_allocated; + + unsigned mem_top; + unsigned mem_bytes; + void *mem; + + u8 *sorted; +- struct btree_iter *iters; ++ struct btree_path *paths; + struct btree_insert_entry *updates; + + /* update path: */ +@@ -588,16 +596,6 @@ static inline bool btree_node_is_extents(struct btree *b) + return btree_node_type_is_extents(btree_node_type(b)); + } + +-static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) +-{ +- return __btree_node_type(iter->level, iter->btree_id); +-} +- +-static inline bool btree_iter_is_extents(struct btree_iter *iter) +-{ +- return btree_node_type_is_extents(btree_iter_key_type(iter)); +-} +- + #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_inodes)| \ +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 5707baf10262..23b73d3a172c 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -8,9 +8,9 @@ + struct bch_fs; + struct btree; + +-void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_iter *, ++void 
bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *, + struct btree *); +-bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_iter *, ++bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_i *); + void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); +@@ -135,4 +135,21 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + (_i) < (_trans)->updates + (_trans)->nr_updates; \ + (_i)++) + ++static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ if ((cmp_int(btree_id, i->btree_id) ?: ++ bpos_cmp(pos, i->k->k.p)) <= 0) { ++ if (btree_id == i->btree_id) ++ return i->k; ++ break; ++ } ++ ++ return NULL; ++} ++ + #endif /* _BCACHEFS_BTREE_UPDATE_H */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 6e833a3c9e4c..5534573af425 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -23,7 +23,7 @@ + #include + + static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, +- struct btree_iter *, struct btree *, ++ struct btree_path *, struct btree *, + struct keylist *, unsigned); + static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); + +@@ -162,10 +162,10 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree *b) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter; ++ struct btree_path *path; + +- trans_for_each_iter(trans, iter) +- BUG_ON(iter->l[b->c.level].b == b); ++ trans_for_each_path(trans, path) ++ BUG_ON(path->l[b->c.level].b == b); + + six_lock_write(&b->c.lock, NULL, NULL); + +@@ -914,7 +914,7 @@ static void bch2_btree_update_done(struct btree_update *as) + } + + static struct btree_update * +-bch2_btree_update_start(struct btree_trans *trans, struct btree_iter *iter, ++bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + unsigned level, unsigned nr_nodes, unsigned flags) + { + struct bch_fs *c = trans->c; +@@ -925,7 +925,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_iter *iter, + int journal_flags = 0; + int ret = 0; + +- BUG_ON(!iter->should_be_locked); ++ BUG_ON(!path->should_be_locked); + + if (flags & BTREE_INSERT_JOURNAL_RESERVED) + journal_flags |= JOURNAL_RES_GET_RESERVED; +@@ -937,11 +937,11 @@ retry: + * XXX: figure out how far we might need to split, + * instead of locking/reserving all the way to the root: + */ +- if (!bch2_btree_iter_upgrade(trans, iter, U8_MAX)) { ++ if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, +- iter->btree_id, +- &iter->real_pos); +- return ERR_PTR(-EINTR); ++ path->btree_id, &path->pos); ++ ret = btree_trans_restart(trans); ++ return ERR_PTR(ret); + } + + if (flags & BTREE_INSERT_GC_LOCK_HELD) +@@ -961,7 +961,7 @@ retry: + as->c = c; + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); +- as->btree_id = iter->btree_id; ++ as->btree_id = path->btree_id; + INIT_LIST_HEAD(&as->list); + INIT_LIST_HEAD(&as->unwritten_list); + INIT_LIST_HEAD(&as->write_blocked_list); +@@ -1081,7 +1081,7 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) + */ + static void bch2_btree_set_root(struct btree_update *as, + struct 
btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct btree *b) + { + struct bch_fs *c = as->c; +@@ -1097,7 +1097,7 @@ static void bch2_btree_set_root(struct btree_update *as, + * Ensure no one is using the old root while we switch to the + * new root: + */ +- bch2_btree_node_lock_write(trans, iter, old); ++ bch2_btree_node_lock_write(trans, path, old); + + bch2_btree_set_root_inmem(c, b); + +@@ -1110,14 +1110,14 @@ static void bch2_btree_set_root(struct btree_update *as, + * an intent lock on the new root, and any updates that would + * depend on the new root would have to update the new root. + */ +- bch2_btree_node_unlock_write(trans, iter, old); ++ bch2_btree_node_unlock_write(trans, path, old); + } + + /* Interior node updates: */ + + static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +@@ -1152,7 +1152,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) + bch2_btree_node_iter_advance(node_iter, b); + +- bch2_btree_bset_insert_key(trans, iter, b, node_iter, insert); ++ bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); + set_btree_node_dirty(c, b); + set_btree_node_need_write(b); + } +@@ -1160,7 +1160,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + static void + __bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) +@@ -1175,7 +1175,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, + ; + + while (!bch2_keylist_empty(keys)) { +- bch2_insert_fixup_btree_ptr(as, trans, iter, b, ++ bch2_insert_fixup_btree_ptr(as, trans, path, b, + &node_iter, bch2_keylist_front(keys)); + bch2_keylist_pop_front(keys); + } +@@ -1186,8 +1186,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, + * node) + */ + static struct btree *__btree_split_node(struct btree_update *as, +- struct btree *n1, +- struct btree_iter *iter) ++ struct btree *n1) + { + struct bkey_format_state s; + size_t nr_packed = 0, nr_unpacked = 0; +@@ -1304,7 +1303,7 @@ static struct btree *__btree_split_node(struct btree_update *as, + */ + static void btree_split_insert_keys(struct btree_update *as, + struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct btree *b, + struct keylist *keys) + { +@@ -1315,7 +1314,7 @@ static void btree_split_insert_keys(struct btree_update *as, + + bch2_btree_node_iter_init(&node_iter, b, &k->k.p); + +- __bch2_btree_insert_keys_interior(as, trans, iter, b, node_iter, keys); ++ __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + + /* + * We can't tolerate whiteouts here - with whiteouts there can be +@@ -1345,18 +1344,17 @@ static void btree_split_insert_keys(struct btree_update *as, + btree_node_interior_verify(as->c, b); + } + +-static void btree_split(struct btree_update *as, +- struct btree_trans *trans, struct btree_iter *iter, +- struct btree *b, struct keylist *keys, +- unsigned flags) ++static void btree_split(struct btree_update *as, struct btree_trans *trans, ++ struct btree_path *path, struct btree *b, ++ struct keylist *keys, unsigned flags) + { + struct bch_fs *c = as->c; +- struct btree *parent = btree_node_parent(iter, b); ++ struct btree 
*parent = btree_node_parent(path, b); + struct btree *n1, *n2 = NULL, *n3 = NULL; + u64 start_time = local_clock(); + + BUG_ON(!parent && (b != btree_node_root(c, b))); +- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); ++ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); + + bch2_btree_interior_update_will_free_node(as, b); + +@@ -1364,12 +1362,12 @@ static void btree_split(struct btree_update *as, + bch2_btree_update_add_new_node(as, n1); + + if (keys) +- btree_split_insert_keys(as, trans, iter, n1, keys); ++ btree_split_insert_keys(as, trans, path, n1, keys); + + if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { + trace_btree_split(c, b); + +- n2 = __btree_split_node(as, n1, iter); ++ n2 = __btree_split_node(as, n1); + + bch2_btree_build_aux_trees(n2); + bch2_btree_build_aux_trees(n1); +@@ -1394,7 +1392,7 @@ static void btree_split(struct btree_update *as, + n3->sib_u64s[0] = U16_MAX; + n3->sib_u64s[1] = U16_MAX; + +- btree_split_insert_keys(as, trans, iter, n3, &as->parent_keys); ++ btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); + + bch2_btree_node_write(c, n3, SIX_LOCK_intent); + } +@@ -1414,12 +1412,12 @@ static void btree_split(struct btree_update *as, + + if (parent) { + /* Split a non root node */ +- bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); ++ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + } else if (n3) { +- bch2_btree_set_root(as, trans, iter, n3); ++ bch2_btree_set_root(as, trans, path, n3); + } else { + /* Root filled up but didn't need to be split */ +- bch2_btree_set_root(as, trans, iter, n1); ++ bch2_btree_set_root(as, trans, path, n1); + } + + bch2_btree_update_get_open_buckets(as, n1); +@@ -1428,7 +1426,7 @@ static void btree_split(struct btree_update *as, + if (n3) + bch2_btree_update_get_open_buckets(as, n3); + +- /* Successful split, update the iterator to point to the new nodes: */ ++ /* Successful split, update the path to point to the new nodes: */ + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); + bch2_trans_node_drop(trans, b); +@@ -1461,21 +1459,21 @@ static void btree_split(struct btree_update *as, + static void + bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct btree *b, + struct keylist *keys) + { +- struct btree_iter *linked; ++ struct btree_path *linked; + +- __bch2_btree_insert_keys_interior(as, trans, iter, b, +- iter->l[b->c.level].iter, keys); ++ __bch2_btree_insert_keys_interior(as, trans, path, b, ++ path->l[b->c.level].iter, keys); + + btree_update_updated_node(as, b); + +- trans_for_each_iter_with_node(trans, b, linked) ++ trans_for_each_path_with_node(trans, b, linked) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); + +- bch2_trans_verify_iters(trans, b); ++ bch2_trans_verify_paths(trans); + } + + /** +@@ -1490,10 +1488,9 @@ bch2_btree_insert_keys_interior(struct btree_update *as, + * If a split occurred, this function will return early. This can only happen + * for leaf nodes -- inserts into interior nodes have to be atomic. 
+ */ +-static void bch2_btree_insert_node(struct btree_update *as, +- struct btree_trans *trans, struct btree_iter *iter, +- struct btree *b, struct keylist *keys, +- unsigned flags) ++static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, ++ struct btree_path *path, struct btree *b, ++ struct keylist *keys, unsigned flags) + { + struct bch_fs *c = as->c; + int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); +@@ -1501,21 +1498,21 @@ static void bch2_btree_insert_node(struct btree_update *as, + int live_u64s_added, u64s_added; + + lockdep_assert_held(&c->gc_lock); +- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); ++ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); + BUG_ON(!b->c.level); + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + +- bch2_btree_node_lock_for_insert(trans, iter, b); ++ bch2_btree_node_lock_for_insert(trans, path, b); + + if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { +- bch2_btree_node_unlock_write(trans, iter, b); ++ bch2_btree_node_unlock_write(trans, path, b); + goto split; + } + + btree_node_interior_verify(c, b); + +- bch2_btree_insert_keys_interior(as, trans, iter, b, keys); ++ bch2_btree_insert_keys_interior(as, trans, path, b, keys); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; +@@ -1529,46 +1526,46 @@ static void bch2_btree_insert_node(struct btree_update *as, + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); + +- bch2_btree_node_unlock_write(trans, iter, b); ++ bch2_btree_node_unlock_write(trans, path, b); + + btree_node_interior_verify(c, b); + return; + split: +- btree_split(as, trans, iter, b, keys, flags); ++ btree_split(as, trans, path, b, keys, flags); + } + + int bch2_btree_split_leaf(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + unsigned flags) + { + struct bch_fs *c = trans->c; +- struct btree *b = iter_l(iter)->b; ++ struct btree *b = path_l(path)->b; + struct btree_update *as; + unsigned l; + int ret = 0; + +- as = bch2_btree_update_start(trans, iter, iter->level, ++ as = bch2_btree_update_start(trans, path, path->level, + btree_update_reserve_required(c, b), flags); + if (IS_ERR(as)) + return PTR_ERR(as); + +- btree_split(as, trans, iter, b, NULL, flags); ++ btree_split(as, trans, path, b, NULL, flags); + bch2_btree_update_done(as); + +- for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) +- ret = bch2_foreground_maybe_merge(trans, iter, l, flags); ++ for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) ++ ret = bch2_foreground_maybe_merge(trans, path, l, flags); + + return ret; + } + + int __bch2_foreground_maybe_merge(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + unsigned level, + unsigned flags, + enum btree_node_sibling sib) + { + struct bch_fs *c = trans->c; +- struct btree_iter *sib_iter = NULL; ++ struct btree_path *sib_path = NULL; + struct btree_update *as; + struct bkey_format_state new_s; + struct bkey_format new_f; +@@ -1579,14 +1576,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + int ret = 0, ret2 = 0; + + retry: +- ret = bch2_btree_iter_traverse(iter); ++ ret = bch2_btree_path_traverse(trans, path, false); + if (ret) + return ret; + +- BUG_ON(!iter->should_be_locked); +- BUG_ON(!btree_node_locked(iter, level)); ++ BUG_ON(!path->should_be_locked); ++ BUG_ON(!btree_node_locked(path, 
level)); + +- b = iter->l[level].b; ++ b = path->l[level].b; + + if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || + (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { +@@ -1598,17 +1595,18 @@ retry: + ? bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); + +- sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id, +- sib_pos, U8_MAX, level, +- BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(sib_iter); ++ sib_path = bch2_path_get(trans, false, path->btree_id, ++ sib_pos, U8_MAX, level, true); ++ ret = bch2_btree_path_traverse(trans, sib_path, false); + if (ret) + goto err; + +- m = sib_iter->l[level].b; ++ sib_path->should_be_locked = true; ++ ++ m = sib_path->l[level].b; + +- if (btree_node_parent(iter, b) != +- btree_node_parent(sib_iter, m)) { ++ if (btree_node_parent(path, b) != ++ btree_node_parent(sib_path, m)) { + b->sib_u64s[sib] = U16_MAX; + goto out; + } +@@ -1659,8 +1657,8 @@ retry: + if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) + goto out; + +- parent = btree_node_parent(iter, b); +- as = bch2_btree_update_start(trans, iter, level, ++ parent = btree_node_parent(path, b); ++ as = bch2_btree_update_start(trans, path, level, + btree_update_reserve_required(c, parent) + 1, + flags| + BTREE_INSERT_NOFAIL| +@@ -1696,7 +1694,7 @@ retry: + bch2_keylist_add(&as->parent_keys, &delete); + bch2_keylist_add(&as->parent_keys, &n->key); + +- bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); ++ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + + bch2_btree_update_get_open_buckets(as, n); + +@@ -1707,7 +1705,7 @@ retry: + + bch2_trans_node_add(trans, n); + +- bch2_trans_verify_iters(trans, n); ++ bch2_trans_verify_paths(trans); + + bch2_btree_node_free_inmem(trans, b); + bch2_btree_node_free_inmem(trans, m); +@@ -1717,7 +1715,8 @@ retry: + bch2_btree_update_done(as); + out: + bch2_trans_verify_locks(trans); +- bch2_trans_iter_free(trans, sib_iter); ++ if (sib_path) ++ bch2_path_put(trans, sib_path, true); + + /* + * Don't downgrade locks here: we're called after successful insert, +@@ -1730,8 +1729,9 @@ out: + */ + return ret ?: ret2; + err: +- bch2_trans_iter_put(trans, sib_iter); +- sib_iter = NULL; ++ if (sib_path) ++ bch2_path_put(trans, sib_path, true); ++ sib_path = NULL; + + if (ret == -EINTR && bch2_trans_relock(trans)) + goto retry; +@@ -1761,8 +1761,8 @@ retry: + if (!b || b->data->keys.seq != seq) + goto out; + +- parent = btree_node_parent(iter, b); +- as = bch2_btree_update_start(trans, iter, b->c.level, ++ parent = btree_node_parent(iter->path, b); ++ as = bch2_btree_update_start(trans, iter->path, b->c.level, + (parent + ? 
btree_update_reserve_required(c, parent) + : 0) + 1, +@@ -1789,10 +1789,10 @@ retry: + + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); +- bch2_btree_insert_node(as, trans, iter, parent, ++ bch2_btree_insert_node(as, trans, iter->path, parent, + &as->parent_keys, flags); + } else { +- bch2_btree_set_root(as, trans, iter, n); ++ bch2_btree_set_root(as, trans, iter->path, n); + } + + bch2_btree_update_get_open_buckets(as, n); +@@ -1805,7 +1805,7 @@ retry: + + bch2_btree_update_done(as); + out: +- bch2_btree_iter_downgrade(iter); ++ bch2_btree_path_downgrade(iter->path); + return ret; + } + +@@ -1824,13 +1824,13 @@ void async_btree_node_rewrite_work(struct work_struct *work) + container_of(work, struct async_btree_rewrite, work); + struct bch_fs *c = a->c; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos, ++ bch2_trans_node_iter_init(&trans, &iter, a->btree_id, a->pos, + BTREE_MAX_DEPTH, a->level, 0); +- bch2_btree_node_rewrite(&trans, iter, a->seq, 0); +- bch2_trans_iter_put(&trans, iter); ++ bch2_btree_node_rewrite(&trans, &iter, a->seq, 0); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); + kfree(a); +@@ -1869,7 +1869,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + bool skip_triggers) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter2 = NULL; ++ struct btree_iter iter2 = { NULL }; + struct btree *parent; + u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; + int ret; +@@ -1897,19 +1897,22 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + BUG_ON(ret); + } + +- parent = btree_node_parent(iter, b); ++ parent = btree_node_parent(iter->path, b); + if (parent) { +- iter2 = bch2_trans_copy_iter(trans, iter); ++ bch2_trans_copy_iter(&iter2, iter); + +- BUG_ON(iter2->level != b->c.level); +- BUG_ON(bpos_cmp(iter2->pos, new_key->k.p)); ++ iter2.path = bch2_btree_path_make_mut(trans, iter2.path, ++ iter2.flags & BTREE_ITER_INTENT); + +- btree_node_unlock(iter2, iter2->level); +- iter2->l[iter2->level].b = BTREE_ITER_NO_NODE_UP; +- iter2->level++; ++ BUG_ON(iter2.path->level != b->c.level); ++ BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); + +- ret = bch2_btree_iter_traverse(iter2) ?: +- bch2_trans_update(trans, iter2, new_key, BTREE_TRIGGER_NORUN); ++ btree_node_unlock(iter2.path, iter2.path->level); ++ path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; ++ iter2.path->level++; ++ ++ ret = bch2_btree_iter_traverse(&iter2) ?: ++ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + } else { +@@ -1931,7 +1934,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + if (ret) + goto err; + +- bch2_btree_node_lock_write(trans, iter, b); ++ bch2_btree_node_lock_write(trans, iter->path, b); + + if (new_hash) { + mutex_lock(&c->btree_cache.lock); +@@ -1946,9 +1949,9 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + bkey_copy(&b->key, new_key); + } + +- bch2_btree_node_unlock_write(trans, iter, b); ++ bch2_btree_node_unlock_write(trans, iter->path, b); + out: +- bch2_trans_iter_put(trans, iter2); ++ bch2_trans_iter_exit(trans, &iter2); + return ret; + err: + if (new_hash) { +@@ -2006,18 +2009,18 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, + struct btree *b, struct bkey_i *new_key, + bool skip_triggers) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + int 
ret; + +- iter = bch2_trans_get_node_iter(trans, b->c.btree_id, b->key.k.p, +- BTREE_MAX_DEPTH, b->c.level, +- BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(iter); ++ bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, ++ BTREE_MAX_DEPTH, b->c.level, ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + /* has node been freed? */ +- if (iter->l[b->c.level].b != b) { ++ if (iter.path->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + goto out; +@@ -2025,9 +2028,9 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, + + BUG_ON(!btree_node_hashed(b)); + +- ret = bch2_btree_node_update_key(trans, iter, b, new_key, skip_triggers); ++ ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers); + out: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 13b3a1bf0f4f..c06cfcc66db7 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -117,39 +117,39 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + struct btree *, + struct bkey_format); + +-int bch2_btree_split_leaf(struct btree_trans *, struct btree_iter *, unsigned); ++int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); + +-int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_iter *, ++int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, + unsigned, unsigned, enum btree_node_sibling); + + static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + unsigned level, unsigned flags, + enum btree_node_sibling sib) + { + struct btree *b; + +- if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) ++ if (path->uptodate >= BTREE_ITER_NEED_TRAVERSE) + return 0; + +- if (!bch2_btree_node_relock(trans, iter, level)) ++ if (!bch2_btree_node_relock(trans, path, level)) + return 0; + +- b = iter->l[level].b; ++ b = path->l[level].b; + if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) + return 0; + +- return __bch2_foreground_maybe_merge(trans, iter, level, flags, sib); ++ return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); + } + + static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + unsigned level, + unsigned flags) + { +- return bch2_foreground_maybe_merge_sibling(trans, iter, level, flags, ++ return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, + btree_prev_sib) ?: +- bch2_foreground_maybe_merge_sibling(trans, iter, level, flags, ++ bch2_foreground_maybe_merge_sibling(trans, path, level, flags, + btree_next_sib); + } + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index d68a9a51a106..b70c65b0dc8c 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -29,9 +29,9 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + bpos_cmp(l->k->k.p, r->k->k.p); + } + +-static inline struct btree_iter_level *insert_l(struct btree_insert_entry *i) ++static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) + { +- return i->iter->l + i->level; ++ return i->path->l + i->level; + } + + static inline bool same_leaf_as_prev(struct btree_trans *trans, +@@ -49,14 
+49,14 @@ static inline bool same_leaf_as_next(struct btree_trans *trans, + } + + inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct btree *b) + { + struct bch_fs *c = trans->c; + +- bch2_btree_node_lock_write(trans, iter, b); ++ bch2_btree_node_lock_write(trans, path, b); + +- if (iter->cached) ++ if (path->cached) + return; + + if (unlikely(btree_node_just_written(b)) && +@@ -75,7 +75,7 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + + /* Handle overwrites and do insert, for non extents: */ + bool bch2_btree_bset_insert_key(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +@@ -116,7 +116,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, + bch2_bset_delete(b, k, clobber_u64s); + goto fix_iter; + } else { +- bch2_btree_iter_fix_key_modified(trans, b, k); ++ bch2_btree_path_fix_key_modified(trans, b, k); + } + + return true; +@@ -134,7 +134,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, + clobber_u64s = k->u64s; + goto overwrite; + } else { +- bch2_btree_iter_fix_key_modified(trans, b, k); ++ bch2_btree_path_fix_key_modified(trans, b, k); + } + } + +@@ -144,7 +144,7 @@ overwrite: + new_u64s = k->u64s; + fix_iter: + if (clobber_u64s != new_u64s) +- bch2_btree_node_iter_fix(trans, iter, b, node_iter, k, ++ bch2_btree_node_iter_fix(trans, path, b, node_iter, k, + clobber_u64s, new_u64s); + return true; + } +@@ -201,7 +201,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + EBUG_ON(!insert->level && + !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); + +- if (unlikely(!bch2_btree_bset_insert_key(trans, insert->iter, b, ++ if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, + &insert_l(insert)->iter, insert->k))) + return false; + +@@ -236,9 +236,10 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct btree_insert_entry *i) + { +- BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos)); +- BUG_ON(i->level != i->iter->level); +- BUG_ON(i->btree_id != i->iter->btree_id); ++ BUG_ON(bpos_cmp(i->k->k.p, i->path->pos)); ++ BUG_ON(i->cached != i->path->cached); ++ BUG_ON(i->level != i->path->level); ++ BUG_ON(i->btree_id != i->path->btree_id); + } + + static noinline int +@@ -293,14 +294,14 @@ btree_key_can_insert(struct btree_trans *trans, + + static enum btree_insert_ret + btree_key_can_insert_cached(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + unsigned u64s) + { +- struct bkey_cached *ck = (void *) iter->l[0].b; ++ struct bkey_cached *ck = (void *) path->l[0].b; + unsigned new_u64s; + struct bkey_i *new_k; + +- EBUG_ON(iter->level); ++ EBUG_ON(path->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(trans->c) && +@@ -340,7 +341,7 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + + did_work = !i->cached + ? 
btree_insert_key_leaf(trans, i) +- : bch2_btree_insert_key_cached(trans, i->iter, i->k); ++ : bch2_btree_insert_key_cached(trans, i->path, i->k); + if (!did_work) + return; + +@@ -366,11 +367,12 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + trans_for_each_update(trans, i) { + /* + * XXX: synchronization of cached update triggers with gc ++ * XXX: synchronization of interior node updates with gc + */ + BUG_ON(i->cached || i->level); + + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) +- bch2_mark_update(trans, i->iter, i->k, ++ bch2_mark_update(trans, i->path, i->k, + i->flags|BTREE_TRIGGER_GC); + } + } +@@ -417,7 +419,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + u64s += i->k->k.u64s; + ret = !i->cached + ? btree_key_can_insert(trans, insert_l(i)->b, u64s) +- : btree_key_can_insert_cached(trans, i->iter, u64s); ++ : btree_key_can_insert_cached(trans, i->path, u64s); + if (ret) { + *stopped_at = i; + return ret; +@@ -476,7 +478,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) +- bch2_mark_update(trans, i->iter, i->k, i->flags); ++ bch2_mark_update(trans, i->path, i->k, i->flags); + + if (marking && trans->fs_usage_deltas) + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); +@@ -503,11 +505,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; +- struct btree_iter *iter; ++ struct btree_path *path; + struct bkey_s_c old; + int ret, u64s_delta = 0; + + trans_for_each_update(trans, i) { ++ struct bkey u; ++ + /* + * peek_slot() doesn't yet work on iterators that point to + * interior nodes: +@@ -515,7 +519,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + if (i->cached || i->level) + continue; + +- old = bch2_btree_iter_peek_slot(i->iter); ++ old = bch2_btree_path_peek_slot(i->path, &u); + ret = bkey_err(old); + if (unlikely(ret)) + return ret; +@@ -525,7 +529,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { +- ret = bch2_foreground_maybe_merge(trans, i->iter, ++ ret = bch2_foreground_maybe_merge(trans, i->path, + i->level, trans->flags); + if (unlikely(ret)) + return ret; +@@ -536,7 +540,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + } + + trans_for_each_update(trans, i) +- BUG_ON(!btree_node_intent_locked(i->iter, i->level)); ++ BUG_ON(!btree_node_intent_locked(i->path, i->level)); + + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, trans->journal_preres_u64s, +@@ -560,14 +564,12 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + * or anything else that might call bch2_trans_relock(), since that + * would just retake the read locks: + */ +- trans_for_each_iter(trans, iter) +- if (iter->nodes_locked != iter->nodes_intent_locked && +- !bch2_btree_iter_upgrade(trans, iter, 1)) { ++ trans_for_each_path(trans, path) ++ if (path->nodes_locked != path->nodes_intent_locked && ++ !bch2_btree_path_upgrade(trans, path, path->level + 1)) { + trace_trans_restart_upgrade(trans->ip, trace_ip, +- iter->btree_id, +- &iter->real_pos); +- trans->restarted = true; +- return -EINTR; ++ path->btree_id, &path->pos); ++ return btree_trans_restart(trans); + } + + trans_for_each_update(trans, i) { +@@ -581,6 +583,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + buf, 
(void *) trans->ip, + (void *) i->ip_allocated, invalid); + bch2_fatal_error(c); ++ return -EINVAL; + } + btree_insert_entry_checks(trans, i); + } +@@ -588,14 +591,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) +- bch2_btree_node_lock_for_insert(trans, i->iter, ++ bch2_btree_node_lock_for_insert(trans, i->path, + insert_l(i)->b); + + ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) +- bch2_btree_node_unlock_write_inlined(trans, i->iter, ++ bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + + if (!ret && trans->journal_pin) +@@ -635,13 +638,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, + + switch (ret) { + case BTREE_INSERT_BTREE_NODE_FULL: +- ret = bch2_btree_split_leaf(trans, i->iter, trans->flags); ++ ret = bch2_btree_split_leaf(trans, i->path, trans->flags); + if (!ret) + return 0; + + if (ret == -EINTR) + trace_trans_restart_btree_node_split(trans->ip, trace_ip, +- i->btree_id, &i->iter->real_pos); ++ i->btree_id, &i->path->pos); + break; + case BTREE_INSERT_NEED_MARK_REPLICAS: + bch2_trans_unlock(trans); +@@ -749,6 +752,10 @@ int __bch2_trans_commit(struct btree_trans *trans) + } + + #ifdef CONFIG_BCACHEFS_DEBUG ++ /* ++ * if BTREE_TRIGGER_NORUN is set, it means we're probably being called ++ * from the key cache flush code: ++ */ + trans_for_each_update(trans, i) + if (!i->cached && + !(i->flags & BTREE_TRIGGER_NORUN)) +@@ -769,13 +776,12 @@ int __bch2_trans_commit(struct btree_trans *trans) + i->trans_triggers_run = true; + trans_trigger_run = true; + +- ret = bch2_trans_mark_update(trans, i->iter, ++ ret = bch2_trans_mark_update(trans, i->path, + i->k, i->flags); + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip, _RET_IP_, +- i->btree_id, +- &i->iter->pos); ++ i->btree_id, &i->path->pos); + goto out; + } + } +@@ -783,18 +789,16 @@ int __bch2_trans_commit(struct btree_trans *trans) + } while (trans_trigger_run); + + trans_for_each_update(trans, i) { +- BUG_ON(!i->iter->should_be_locked); ++ BUG_ON(!i->path->should_be_locked); + +- if (unlikely(!bch2_btree_iter_upgrade(trans, i->iter, +- i->level + 1))) { ++ if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { + trace_trans_restart_upgrade(trans->ip, _RET_IP_, +- i->btree_id, &i->iter->pos); +- trans->restarted = true; +- ret = -EINTR; ++ i->btree_id, &i->path->pos); ++ ret = btree_trans_restart(trans); + goto out; + } + +- BUG_ON(!btree_node_intent_locked(i->iter, i->level)); ++ BUG_ON(!btree_node_intent_locked(i->path, i->level)); + + u64s = jset_u64s(i->k->k.u64s); + if (i->cached && +@@ -828,6 +832,9 @@ out: + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); + out_reset: ++ trans_for_each_update(trans, i) ++ bch2_path_put(trans, i->path, true); ++ + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->hooks = NULL; +@@ -856,18 +863,18 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + enum btree_update_flags flags) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter, *update_iter; ++ struct btree_iter iter, update_iter; + struct bpos start = bkey_start_pos(&insert->k); + struct bkey_i *update; + struct bkey_s_c k; + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0, compressed_sectors; + +- iter = bch2_trans_get_iter(trans, btree_id, start, +- BTREE_ITER_INTENT| +- 
BTREE_ITER_WITH_UPDATES| +- BTREE_ITER_NOT_EXTENTS); +- k = bch2_btree_iter_peek(iter); ++ bch2_trans_iter_init(trans, &iter, btree_id, start, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_NOT_EXTENTS); ++ k = bch2_btree_iter_peek(&iter); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) +@@ -881,9 +888,9 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + bkey_reassemble(update, k); + + if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) { +- update_iter = bch2_trans_copy_iter(trans, iter); +- ret = bch2_btree_delete_at(trans, update_iter, flags); +- bch2_trans_iter_put(trans, update_iter); ++ bch2_trans_copy_iter(&update_iter, &iter); ++ ret = bch2_btree_delete_at(trans, &update_iter, flags); ++ bch2_trans_iter_exit(trans, &update_iter); + + if (ret) + goto err; +@@ -916,23 +923,22 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + + bch2_cut_back(start, update); + +- update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, +- BTREE_ITER_NOT_EXTENTS| +- BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(update_iter) ?: +- bch2_trans_update(trans, update_iter, update, ++ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); +- bch2_trans_iter_put(trans, update_iter); ++ bch2_trans_iter_exit(trans, &update_iter); + if (ret) + goto err; + } + + if (bkey_cmp(k.k->p, insert->k.p) <= 0) { +- update_iter = bch2_trans_copy_iter(trans, iter); +- ret = bch2_btree_delete_at(trans, update_iter, +- flags); +- bch2_trans_iter_put(trans, update_iter); ++ bch2_trans_copy_iter(&update_iter, &iter); ++ ret = bch2_btree_delete_at(trans, &update_iter, flags); ++ bch2_trans_iter_exit(trans, &update_iter); + + if (ret) + goto err; +@@ -946,13 +952,13 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); + +- update_iter = bch2_trans_copy_iter(trans, iter); +- bch2_trans_update(trans, update_iter, update, flags); +- bch2_trans_iter_put(trans, update_iter); ++ bch2_trans_copy_iter(&update_iter, &iter); ++ bch2_trans_update(trans, &update_iter, update, flags); ++ bch2_trans_iter_exit(trans, &update_iter); + goto out; + } + next: +- k = bch2_btree_iter_next(iter); ++ k = bch2_btree_iter_next(&iter); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) +@@ -962,14 +968,12 @@ next: + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + out: + if (!bkey_deleted(&insert->k)) { +- bch2_btree_iter_set_pos(iter, insert->k.p); +- ret = bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(trans, iter, insert, flags); +- } else { +- set_btree_iter_dontneed(trans, iter); ++ bch2_btree_iter_set_pos(&iter, insert->k.p); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, insert, flags); + } + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + + return ret; + } +@@ -977,31 +981,34 @@ err: + int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) + { +- struct btree_insert_entry *i, n = (struct btree_insert_entry) { ++ struct btree_insert_entry *i, n; ++ ++ BUG_ON(!iter->path->should_be_locked); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return bch2_trans_update_extent(trans, iter, k, flags); ++ ++ BUG_ON(trans->nr_updates >= 
BTREE_ITER_MAX); ++ BUG_ON(bpos_cmp(k->k.p, iter->path->pos)); ++ ++ n = (struct btree_insert_entry) { + .flags = flags, +- .bkey_type = __btree_node_type(iter->level, iter->btree_id), ++ .bkey_type = __btree_node_type(iter->path->level, iter->btree_id), + .btree_id = iter->btree_id, +- .level = iter->level, +- .cached = iter->cached, +- .iter = iter, ++ .level = iter->path->level, ++ .cached = iter->flags & BTREE_ITER_CACHED, ++ .path = iter->path, + .k = k, + .ip_allocated = _RET_IP_, + }; + +- BUG_ON(!iter->should_be_locked); +- +- if (iter->flags & BTREE_ITER_IS_EXTENTS) +- return bch2_trans_update_extent(trans, iter, k, flags); ++ __btree_path_get(n.path, true); + + #ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); + #endif +- BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); +- BUG_ON(bpos_cmp(n.k->k.p, n.iter->real_pos)); +- +- n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + /* + * Pending updates are kept sorted: first, find position of new update, +@@ -1023,7 +1030,10 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + if (n.cached && !i->cached) { + i->k = n.k; + i->flags = n.flags; ++ ++ __btree_path_get(n.path, false); + } else { ++ bch2_path_put(trans, i->path, true); + *i = n; + } + } else +@@ -1043,15 +1053,15 @@ void bch2_trans_commit_hook(struct btree_trans *trans, + int __bch2_btree_insert(struct btree_trans *trans, + enum btree_id id, struct bkey_i *k) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + int ret; + +- iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), ++ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + +- ret = bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(trans, iter, k, 0); +- bch2_trans_iter_put(trans, iter); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, k, 0); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -1089,16 +1099,16 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, + u64 *journal_seq) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +- iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); ++ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + retry: + while ((bch2_trans_begin(trans), +- (k = bch2_btree_iter_peek(iter)).k) && ++ (k = bch2_btree_iter_peek(&iter)).k) && + !(ret = bkey_err(k)) && +- bkey_cmp(iter->pos, end) < 0) { ++ bkey_cmp(iter.pos, end) < 0) { + struct bkey_i delete; + + bkey_init(&delete.k); +@@ -1117,9 +1127,9 @@ retry: + * (bch2_btree_iter_peek() does guarantee that iter.pos >= + * bkey_start_pos(k.k)). 
+ */ +- delete.k.p = iter->pos; ++ delete.k.p = iter.pos; + +- if (btree_node_type_is_extents(iter->btree_id)) { ++ if (btree_node_type_is_extents(id)) { + unsigned max_sectors = + KEY_SIZE_MAX & (~0 << trans->c->block_bits); + +@@ -1127,12 +1137,12 @@ retry: + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete); + +- ret = bch2_extent_trim_atomic(trans, iter, &delete); ++ ret = bch2_extent_trim_atomic(trans, &iter, &delete); + if (ret) + break; + } + +- ret = bch2_trans_update(trans, iter, &delete, 0) ?: ++ ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, NULL, journal_seq, + BTREE_INSERT_NOFAIL); + if (ret) +@@ -1146,7 +1156,7 @@ retry: + goto retry; + } + +- bch2_trans_iter_free(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index b4942b1f3768..df12416eff8e 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1218,38 +1218,23 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) + return ret; + } + +-int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter, ++int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *new, unsigned flags) + { + struct bch_fs *c = trans->c; + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + struct bkey_s_c old; +- int iter_flags, ret; ++ struct bkey unpacked; ++ int ret; + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + +- if (!btree_node_type_needs_gc(iter->btree_id)) ++ if (!btree_node_type_needs_gc(path->btree_id)) + return 0; + +- if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) { +- iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; +- iter->flags &= ~BTREE_ITER_WITH_UPDATES; +- +- old = bch2_btree_iter_peek_slot(iter); +- iter->flags |= iter_flags; +- +- ret = bkey_err(old); +- if (ret) +- return ret; +- } else { +- /* +- * If BTREE_ITER_CACHED_NOFILL was used, we better not be +- * running triggers that do anything on removal (alloc btree): +- */ +- old = deleted; +- } ++ old = bch2_btree_path_peek_slot(path, &unpacked); + + if (old.k->type == new->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { +@@ -1287,22 +1272,13 @@ void fs_usage_apply_warn(struct btree_trans *trans, + pr_err("overlapping with"); + + if (!i->cached) { +- struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter); +- struct bkey_s_c k; +- int ret; +- +- for_each_btree_key_continue(copy, 0, k, ret) { +- if (btree_node_type_is_extents(i->iter->btree_id) +- ? 
bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 +- : bkey_cmp(i->k->k.p, k.k->p)) +- break; ++ struct bkey u; ++ struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); + +- bch2_bkey_val_to_text(&PBUF(buf), c, k); +- pr_err("%s", buf); +- } +- bch2_trans_iter_put(trans, copy); ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ pr_err("%s", buf); + } else { +- struct bkey_cached *ck = (void *) i->iter->l[0].b; ++ struct bkey_cached *ck = (void *) i->path->l[0].b; + + if (ck->valid) { + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); +@@ -1381,31 +1357,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + + /* trans_mark: */ + +-static struct btree_iter *trans_get_update(struct btree_trans *trans, +- enum btree_id btree_id, struct bpos pos, +- struct bkey_s_c *k) +-{ +- struct btree_insert_entry *i; +- +- trans_for_each_update(trans, i) +- if (i->iter->btree_id == btree_id && +- (btree_node_type_is_extents(btree_id) +- ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && +- bkey_cmp(pos, i->k->k.p) < 0 +- : !bkey_cmp(pos, i->iter->pos))) { +- *k = bkey_i_to_s_c(i->k); +- +- /* ugly hack.. */ +- BUG_ON(btree_iter_live(trans, i->iter)); +- trans->iters_live |= 1ULL << i->iter->idx; +- return i->iter; +- } +- +- return NULL; +-} +- + static struct bkey_alloc_buf * +-bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, ++bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, + const struct bch_extent_ptr *ptr, + struct bkey_alloc_unpacked *u) + { +@@ -1413,36 +1366,33 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_it + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); + struct bucket *g; +- struct btree_iter *iter; +- struct bkey_s_c k; + struct bkey_alloc_buf *a; ++ struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos); + int ret; + + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (IS_ERR(a)) + return a; + +- iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k); +- if (iter) { +- *u = bch2_alloc_unpack(k); +- } else { +- iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos, +- BTREE_ITER_CACHED| +- BTREE_ITER_CACHED_NOFILL| +- BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(iter); +- if (ret) { +- bch2_trans_iter_put(trans, iter); +- return ERR_PTR(ret); +- } ++ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ERR_PTR(ret); ++ } + ++ if (update && !bpos_cmp(update->k.p, pos)) { ++ *u = bch2_alloc_unpack(bkey_i_to_s_c(update)); ++ } else { + percpu_down_read(&c->mark_lock); + g = bucket(ca, pos.offset); + *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); + percpu_up_read(&c->mark_lock); + } + +- *_iter = iter; + return a; + } + +@@ -1451,7 +1401,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + s64 sectors, enum bch_data_type data_type) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_alloc_unpacked u; + struct bkey_alloc_buf *a; + int ret; +@@ -1466,9 +1416,9 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + goto out; + + bch2_alloc_pack(c, a, u); +- bch2_trans_update(trans, iter, &a->k, 0); ++ bch2_trans_update(trans, &iter, &a->k, 0); + out: +- bch2_trans_iter_put(trans, iter); ++ 
bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -1477,16 +1427,16 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + s64 sectors, enum bch_data_type data_type) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_stripe *s; + struct bch_replicas_padded r; + int ret = 0; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, POS(0, p.ec.idx), +- BTREE_ITER_INTENT| +- BTREE_ITER_WITH_UPDATES); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -1517,13 +1467,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + + sectors); +- bch2_trans_update(trans, iter, &s->k_i, 0); ++ bch2_trans_update(trans, &iter, &s->k_i, 0); + + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; + update_replicas_list(trans, &r.e, sectors); + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -1595,7 +1545,7 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct bkey_alloc_buf *a; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_alloc_unpacked u; + bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; + int ret = 0; +@@ -1619,7 +1569,7 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, + if (!deleting) { + if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, + "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", +- iter->pos.inode, iter->pos.offset, u.gen, ++ iter.pos.inode, iter.pos.offset, u.gen, + u.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; +@@ -1633,9 +1583,9 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, + } + + bch2_alloc_pack(c, a, u); +- bch2_trans_update(trans, iter, &a->k, 0); ++ bch2_trans_update(trans, &iter, &a->k, 0); + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -1740,17 +1690,17 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + u64 idx, unsigned flags) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i *n; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + s64 ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx), +- BTREE_ITER_INTENT| +- BTREE_ITER_WITH_UPDATES); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, idx), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -1780,14 +1730,14 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + set_bkey_val_u64s(&n->k, 0); + } + +- bch2_btree_iter_set_pos_to_extent_start(iter); +- ret = bch2_trans_update(trans, iter, n, 0); ++ bch2_btree_iter_set_pos_to_extent_start(&iter); ++ ret = bch2_trans_update(trans, &iter, n, 0); + if (ret) + goto err; + + ret = k.k->p.offset - idx; + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -1839,39 +1789,23 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, + } + + int bch2_trans_mark_update(struct btree_trans *trans, +- struct btree_iter *iter, ++ struct btree_path *path, + struct bkey_i *new, + unsigned flags) + { + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + struct bkey_s_c old; +- int iter_flags, ret; ++ struct bkey unpacked; ++ int ret; + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + +- if (!btree_node_type_needs_gc(iter->btree_id)) ++ if (!btree_node_type_needs_gc(path->btree_id)) + return 0; + +- +- if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) { +- iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; +- iter->flags &= ~BTREE_ITER_WITH_UPDATES; +- +- old = bch2_btree_iter_peek_slot(iter); +- iter->flags |= iter_flags; +- +- ret = bkey_err(old); +- if (ret) +- return ret; +- } else { +- /* +- * If BTREE_ITER_CACHED_NOFILL was used, we better not be +- * running triggers that do anything on removal (alloc btree): +- */ +- old = deleted; +- } ++ old = bch2_btree_path_peek_slot(path, &unpacked); + + if (old.k->type == new->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { +@@ -1893,7 +1827,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + unsigned sectors) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_alloc_unpacked u; + struct bkey_alloc_buf *a; + struct bch_extent_ptr ptr = { +@@ -1916,7 +1850,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", +- iter->pos.inode, iter->pos.offset, u.gen, ++ iter.pos.inode, iter.pos.offset, u.gen, + bch2_data_types[u.data_type], + bch2_data_types[type], + bch2_data_types[type]); +@@ -1928,9 +1862,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + u.dirty_sectors = sectors; + + bch2_alloc_pack(c, a, u); +- bch2_trans_update(trans, iter, &a->k, 0); ++ bch2_trans_update(trans, &iter, &a->k, 0); + out: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 0f544b62fc90..61c2c0f9ff8f 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -228,13 +228,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + + int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned); + +-int bch2_mark_update(struct btree_trans *, struct btree_iter *, ++int 
bch2_mark_update(struct btree_trans *, struct btree_path *, + struct bkey_i *, unsigned); + + int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); +-int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, +- struct bkey_i *insert, unsigned); ++int bch2_trans_mark_update(struct btree_trans *, struct btree_path *, ++ struct bkey_i *, unsigned); + void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + + int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index b0a8eb58a7a7..9f14bf4cb49a 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -243,7 +243,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, + { + struct dump_iter *i = file->private_data; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int err; + +@@ -260,10 +260,10 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, + + bch2_trans_init(&trans, i->c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, i->id, i->from, +- BTREE_ITER_PREFETCH| +- BTREE_ITER_ALL_SNAPSHOTS); +- k = bch2_btree_iter_peek(iter); ++ bch2_trans_iter_init(&trans, &iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ k = bch2_btree_iter_peek(&iter); + + while (k.k && !(err = bkey_err(k))) { + bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); +@@ -272,8 +272,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, + i->buf[i->bytes] = '\n'; + i->bytes++; + +- k = bch2_btree_iter_next(iter); +- i->from = iter->pos; ++ k = bch2_btree_iter_next(&iter); ++ i->from = iter.pos; + + err = flush_buf(i); + if (err) +@@ -282,7 +282,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, + if (!i->size) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + +@@ -301,7 +301,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + { + struct dump_iter *i = file->private_data; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct btree *b; + int err; + +@@ -336,7 +336,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + if (!i->size) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + +@@ -355,7 +355,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + { + struct dump_iter *i = file->private_data; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct btree *prev_node = NULL; + int err; +@@ -373,11 +373,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + + bch2_trans_init(&trans, i->c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &iter, i->id, i->from, BTREE_ITER_PREFETCH); + +- while ((k = bch2_btree_iter_peek(iter)).k && ++ while ((k = bch2_btree_iter_peek(&iter)).k && + !(err = bkey_err(k))) { +- struct btree_iter_level *l = &iter->l[0]; ++ struct btree_path_level *l = &iter.path->l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); + +@@ -396,8 +396,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + if (err) + break; + +- bch2_btree_iter_advance(iter); +- i->from = iter->pos; 
++ bch2_btree_iter_advance(&iter); ++ i->from = iter.pos; + + err = flush_buf(i); + if (err) +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 02b29681f695..1d510f7728b6 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -183,7 +183,8 @@ int bch2_dirent_rename(struct btree_trans *trans, + const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, + enum bch_rename_mode mode) + { +- struct btree_iter *src_iter = NULL, *dst_iter = NULL; ++ struct btree_iter src_iter = { NULL }; ++ struct btree_iter dst_iter = { NULL }; + struct bkey_s_c old_src, old_dst; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = +@@ -199,17 +200,16 @@ int bch2_dirent_rename(struct btree_trans *trans, + * the target already exists - we're relying on the VFS + * to do that check for us for correctness: + */ +- dst_iter = mode == BCH_RENAME +- ? bch2_hash_hole(trans, bch2_dirent_hash_desc, ++ ret = mode == BCH_RENAME ++ ? bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name) +- : bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ : bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(dst_iter); + if (ret) + goto out; + +- old_dst = bch2_btree_iter_peek_slot(dst_iter); ++ old_dst = bch2_btree_iter_peek_slot(&dst_iter); + ret = bkey_err(old_dst); + if (ret) + goto out; +@@ -217,17 +217,16 @@ int bch2_dirent_rename(struct btree_trans *trans, + if (mode != BCH_RENAME) + *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); + if (mode != BCH_RENAME_EXCHANGE) +- *src_offset = dst_iter->pos.offset; ++ *src_offset = dst_iter.pos.offset; + + /* Lookup src: */ +- src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, +- src_hash, src_dir, src_name, +- BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(src_iter); ++ ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, ++ src_hash, src_dir, src_name, ++ BTREE_ITER_INTENT); + if (ret) + goto out; + +- old_src = bch2_btree_iter_peek_slot(src_iter); ++ old_src = bch2_btree_iter_peek_slot(&src_iter); + ret = bkey_err(old_src); + if (ret) + goto out; +@@ -241,7 +240,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + goto out; + + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); +- new_dst->k.p = dst_iter->pos; ++ new_dst->k.p = dst_iter.pos; + + /* Create new src key: */ + if (mode == BCH_RENAME_EXCHANGE) { +@@ -251,7 +250,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + goto out; + + dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); +- new_src->k.p = src_iter->pos; ++ new_src->k.p = src_iter.pos; + } else { + new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(new_src); +@@ -259,10 +258,10 @@ int bch2_dirent_rename(struct btree_trans *trans, + goto out; + + bkey_init(&new_src->k); +- new_src->k.p = src_iter->pos; ++ new_src->k.p = src_iter.pos; + +- if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && +- bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { ++ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && ++ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { + /* + * We have a hash collision for the new dst key, + * and new_src - the key we're deleting - is between +@@ -275,8 +274,8 @@ int bch2_dirent_rename(struct btree_trans *trans, + * If we're not overwriting, we can just insert + * new_dst at the src position: + */ +- new_dst->k.p = src_iter->pos; +- bch2_trans_update(trans, src_iter, ++ new_dst->k.p = src_iter.pos; ++ bch2_trans_update(trans, 
&src_iter, + &new_dst->k_i, 0); + goto out_set_offset; + } else { +@@ -290,7 +289,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + } else { + /* Check if we need a whiteout to delete src: */ + ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, +- src_hash, src_iter); ++ src_hash, &src_iter); + if (ret < 0) + goto out; + +@@ -299,15 +298,15 @@ int bch2_dirent_rename(struct btree_trans *trans, + } + } + +- bch2_trans_update(trans, src_iter, &new_src->k_i, 0); +- bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); ++ bch2_trans_update(trans, &src_iter, &new_src->k_i, 0); ++ bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); + out_set_offset: + if (mode == BCH_RENAME_EXCHANGE) + *src_offset = new_src->k.p.offset; + *dst_offset = new_dst->k.p.offset; + out: +- bch2_trans_iter_put(trans, src_iter); +- bch2_trans_iter_put(trans, dst_iter); ++ bch2_trans_iter_exit(trans, &src_iter); ++ bch2_trans_iter_exit(trans, &dst_iter); + return ret; + } + +@@ -319,12 +318,13 @@ int bch2_dirent_delete_at(struct btree_trans *trans, + hash_info, iter); + } + +-struct btree_iter * +-__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, +- const struct bch_hash_info *hash_info, +- const struct qstr *name, unsigned flags) ++int __bch2_dirent_lookup_trans(struct btree_trans *trans, ++ struct btree_iter *iter, ++ u64 dir_inum, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, unsigned flags) + { +- return bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ return bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir_inum, name, flags); + } + +@@ -333,26 +333,25 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, + const struct qstr *name) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + u64 inum = 0; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + +- iter = __bch2_dirent_lookup_trans(&trans, dir_inum, +- hash_info, name, 0); +- ret = PTR_ERR_OR_ZERO(iter); ++ ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum, ++ hash_info, name, 0); + if (ret) + goto out; + +- k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; + + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + out: + BUG_ON(ret == -EINTR); + bch2_trans_exit(&trans); +@@ -361,7 +360,7 @@ out: + + int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret; + +@@ -375,7 +374,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) + break; + } + } +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + + return ret; + } +@@ -383,7 +382,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) + int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; + int ret; +@@ -412,7 +411,7 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) + break; + ctx->pos = dirent.k->p.offset + 1; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + ret = bch2_trans_exit(&trans) ?: ret; + +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index e1d8ce377d43..c14f6029e1c9 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h 
+@@ -50,8 +50,7 @@ int bch2_dirent_rename(struct btree_trans *, + const struct qstr *, u64 *, u64 *, + enum bch_rename_mode); + +-struct btree_iter * +-__bch2_dirent_lookup_trans(struct btree_trans *, u64, ++int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64, + const struct bch_hash_info *, + const struct qstr *, unsigned); + u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 53d6be09decd..f0bdbdb2673d 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -429,13 +429,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, ++ POS(0, idx), BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -445,6 +446,7 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip + } + bkey_reassemble(&stripe->key.k_i, k); + err: ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -704,7 +706,7 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, + struct disk_reservation *res) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); +@@ -719,7 +721,7 @@ retry: + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (start_pos.offset) { + start_pos = min_pos; +- bch2_btree_iter_set_pos(iter, start_pos); ++ bch2_btree_iter_set_pos(&iter, start_pos); + continue; + } + +@@ -733,19 +735,19 @@ retry: + + goto err; + found_slot: +- start_pos = iter->pos; ++ start_pos = iter.pos; + +- ret = ec_stripe_mem_alloc(&trans, iter); ++ ret = ec_stripe_mem_alloc(&trans, &iter); + if (ret) + goto err; + +- stripe->k.p = iter->pos; ++ stripe->k.p = iter.pos; + +- ret = bch2_trans_update(&trans, iter, &stripe->k_i, 0) ?: ++ ret = bch2_trans_update(&trans, &iter, &stripe->k_i, 0) ?: + bch2_trans_commit(&trans, res, NULL, + BTREE_INSERT_NOFAIL); + err: +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + if (ret == -EINTR) + goto retry; +@@ -759,15 +761,15 @@ err: + static int ec_stripe_bkey_update(struct btree_trans *trans, + struct bkey_i_stripe *new) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + const struct bch_stripe *existing; + unsigned i; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, +- new->k.p, BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, ++ new->k.p, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -790,9 +792,9 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, + stripe_blockcount_set(&new->v, i, + stripe_blockcount_get(existing, i)); + +- ret = bch2_trans_update(trans, iter, &new->k_i, 0); ++ ret = bch2_trans_update(trans, &iter, &new->k_i, 0); + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -820,7 +822,7 @@ static 
int ec_stripe_update_ptrs(struct bch_fs *c, + struct bkey *pos) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_extent e; + struct bkey_buf sk; +@@ -832,23 +834,23 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + + /* XXX this doesn't support the reflink btree */ + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, +- bkey_start_pos(pos), +- BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ bkey_start_pos(pos), ++ BTREE_ITER_INTENT); + +- while ((k = bch2_btree_iter_peek(iter)).k && ++ while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + continue; + } + + block = bkey_matches_stripe(&s->key.v, k); + if (block < 0) { +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + continue; + } + +@@ -863,21 +865,21 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + + extent_stripe_ptr_add(e, s, ec_ptr, block); + +- bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); ++ bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); + next_pos = sk.k->k.p; + +- ret = bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(&trans, iter, sk.k, 0) ?: ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(&trans, &iter, sk.k, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + if (!ret) +- bch2_btree_iter_set_pos(iter, next_pos); ++ bch2_btree_iter_set_pos(&iter, next_pos); + if (ret == -EINTR) + ret = 0; + if (ret) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); +@@ -1598,7 +1600,7 @@ write: + int bch2_stripes_write(struct bch_fs *c, unsigned flags) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct genradix_iter giter; + struct bkey_i_stripe *new_key; + struct stripe *m; +@@ -1609,8 +1611,8 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN, +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + genradix_for_each(&c->stripes[0], giter, m) { + if (!m->alive) +@@ -1618,13 +1620,13 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) + + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags, +- __bch2_stripe_write_key(&trans, iter, m, ++ __bch2_stripe_write_key(&trans, &iter, m, + giter.pos, new_key)); + + if (ret) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + +@@ -1659,19 +1661,19 @@ int bch2_stripes_read(struct bch_fs *c) + int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + size_t i, idx = 0; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0); + +- k = bch2_btree_iter_prev(iter); ++ k = bch2_btree_iter_prev(&iter); + if (!IS_ERR_OR_NULL(k.k)) + idx = k.k->p.offset + 1; + +- bch2_trans_iter_put(&trans, 
iter); ++ bch2_trans_iter_exit(&trans, &iter); + ret = bch2_trans_exit(&trans); + if (ret) + return ret; +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 93d55f46233f..9d959b053def 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -58,7 +58,7 @@ static int count_iters_for_insert(struct btree_trans *trans, + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = bpos_min(*end, p.k->p).offset - + bkey_start_offset(p.k); +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, +@@ -83,8 +83,8 @@ static int count_iters_for_insert(struct btree_trans *trans, + break; + } + } ++ bch2_trans_iter_exit(trans, &iter); + +- bch2_trans_iter_put(trans, iter); + break; + } + } +@@ -99,7 +99,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, + struct bkey_i *insert, + struct bpos *end) + { +- struct btree_iter *copy; ++ struct btree_iter copy; + struct bkey_s_c k; + unsigned nr_iters = 0; + int ret; +@@ -118,7 +118,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, + if (ret < 0) + return ret; + +- copy = bch2_trans_copy_iter(trans, iter); ++ bch2_trans_copy_iter(©, iter); + + for_each_btree_key_continue(copy, 0, k, ret) { + unsigned offset = 0; +@@ -149,7 +149,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, + break; + } + +- bch2_trans_iter_put(trans, copy); ++ bch2_trans_iter_exit(trans, ©); + return ret < 0 ? ret : 0; + } + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 563e13057f5f..f66640c2a5ed 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -616,7 +616,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + unsigned nr_replicas, bool compressed) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bpos end = pos; + struct bkey_s_c k; + bool ret = true; +@@ -637,7 +637,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + break; + } + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 2189a11ccad8..a6617455ea12 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -19,16 +19,15 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + struct posix_acl *acl) + { + struct bch_fs *c = trans->c; +- struct btree_iter *dir_iter = NULL; +- struct btree_iter *inode_iter = NULL; ++ struct btree_iter dir_iter = { NULL }; ++ struct btree_iter inode_iter = { NULL }; + struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + u64 now = bch2_current_time(c); + u64 cpu = raw_smp_processor_id(); + u64 dir_offset = 0; + int ret; + +- dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(dir_iter); ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + if (ret) + goto err; + +@@ -37,8 +36,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + if (!name) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + +- inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu); +- ret = PTR_ERR_OR_ZERO(inode_iter); ++ ret = bch2_inode_create(trans, &inode_iter, new_inode, U32_MAX, cpu); + if (ret) + goto err; + +@@ -63,7 +61,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + if (S_ISDIR(new_inode->bi_mode)) + dir_u->bi_nlink++; + +- ret = bch2_inode_write(trans, dir_iter, dir_u); ++ 
ret = bch2_inode_write(trans, &dir_iter, dir_u); + if (ret) + goto err; + +@@ -82,14 +80,14 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + } + + /* XXX use bch2_btree_iter_set_snapshot() */ +- inode_iter->snapshot = U32_MAX; +- bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); ++ inode_iter.snapshot = U32_MAX; ++ bch2_btree_iter_set_pos(&inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + +- ret = bch2_btree_iter_traverse(inode_iter) ?: +- bch2_inode_write(trans, inode_iter, new_inode); ++ ret = bch2_btree_iter_traverse(&inode_iter) ?: ++ bch2_inode_write(trans, &inode_iter, new_inode); + err: +- bch2_trans_iter_put(trans, inode_iter); +- bch2_trans_iter_put(trans, dir_iter); ++ bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(trans, &dir_iter); + return ret; + } + +@@ -98,22 +96,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + struct bch_inode_unpacked *inode_u, const struct qstr *name) + { + struct bch_fs *c = trans->c; +- struct btree_iter *dir_iter = NULL, *inode_iter = NULL; ++ struct btree_iter dir_iter = { NULL }; ++ struct btree_iter inode_iter = { NULL }; + struct bch_hash_info dir_hash; + u64 now = bch2_current_time(c); + u64 dir_offset = 0; + int ret; + +- inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(inode_iter); ++ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + goto err; + + inode_u->bi_ctime = now; + bch2_inode_nlink_inc(inode_u); + +- dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); +- ret = PTR_ERR_OR_ZERO(dir_iter); ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, 0); + if (ret) + goto err; + +@@ -133,11 +130,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + inode_u->bi_dir_offset = dir_offset; + } + +- ret = bch2_inode_write(trans, dir_iter, dir_u) ?: +- bch2_inode_write(trans, inode_iter, inode_u); ++ ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: ++ bch2_inode_write(trans, &inode_iter, inode_u); + err: +- bch2_trans_iter_put(trans, dir_iter); +- bch2_trans_iter_put(trans, inode_iter); ++ bch2_trans_iter_exit(trans, &dir_iter); ++ bch2_trans_iter_exit(trans, &inode_iter); + return ret; + } + +@@ -147,35 +144,33 @@ int bch2_unlink_trans(struct btree_trans *trans, + const struct qstr *name) + { + struct bch_fs *c = trans->c; +- struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, +- *inode_iter = NULL; ++ struct btree_iter dir_iter = { NULL }; ++ struct btree_iter dirent_iter = { NULL }; ++ struct btree_iter inode_iter = { NULL }; + struct bch_hash_info dir_hash; + u64 inum, now = bch2_current_time(c); + struct bkey_s_c k; + int ret; + +- dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(dir_iter); ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + if (ret) + goto err; + + dir_hash = bch2_hash_info_init(c, dir_u); + +- dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, +- name, BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(dirent_iter); ++ ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash, ++ name, BTREE_ITER_INTENT); + if (ret) + goto err; + +- k = bch2_btree_iter_peek_slot(dirent_iter); ++ k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + +- inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); +- ret = 
PTR_ERR_OR_ZERO(inode_iter); ++ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + goto err; + +@@ -192,13 +187,13 @@ int bch2_unlink_trans(struct btree_trans *trans, + ret = (S_ISDIR(inode_u->bi_mode) + ? bch2_empty_dir_trans(trans, inum) + : 0) ?: +- bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: +- bch2_inode_write(trans, dir_iter, dir_u) ?: +- bch2_inode_write(trans, inode_iter, inode_u); ++ bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: ++ bch2_inode_write(trans, &dir_iter, dir_u) ?: ++ bch2_inode_write(trans, &inode_iter, inode_u); + err: +- bch2_trans_iter_put(trans, inode_iter); +- bch2_trans_iter_put(trans, dirent_iter); +- bch2_trans_iter_put(trans, dir_iter); ++ bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(trans, &dirent_iter); ++ bch2_trans_iter_exit(trans, &dir_iter); + return ret; + } + +@@ -236,25 +231,25 @@ int bch2_rename_trans(struct btree_trans *trans, + enum bch_rename_mode mode) + { + struct bch_fs *c = trans->c; +- struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; +- struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; ++ struct btree_iter src_dir_iter = { NULL }; ++ struct btree_iter dst_dir_iter = { NULL }; ++ struct btree_iter src_inode_iter = { NULL }; ++ struct btree_iter dst_inode_iter = { NULL }; + struct bch_hash_info src_hash, dst_hash; + u64 src_inode, src_offset, dst_inode, dst_offset; + u64 now = bch2_current_time(c); + int ret; + +- src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, +- BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(src_dir_iter); ++ ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, ++ BTREE_ITER_INTENT); + if (ret) + goto err; + + src_hash = bch2_hash_info_init(c, src_dir_u); + + if (dst_dir != src_dir) { +- dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, +- BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(dst_dir_iter); ++ ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, ++ BTREE_ITER_INTENT); + if (ret) + goto err; + +@@ -273,16 +268,14 @@ int bch2_rename_trans(struct btree_trans *trans, + if (ret) + goto err; + +- src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, +- BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(src_inode_iter); ++ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inode, ++ BTREE_ITER_INTENT); + if (ret) + goto err; + + if (dst_inode) { +- dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, +- BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(dst_inode_iter); ++ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inode, ++ BTREE_ITER_INTENT); + if (ret) + goto err; + } +@@ -357,18 +350,18 @@ int bch2_rename_trans(struct btree_trans *trans, + if (dst_inode) + dst_inode_u->bi_ctime = now; + +- ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: ++ ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: + (src_dir != dst_dir +- ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) ++ ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) + : 0 ) ?: +- bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: ++ bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: + (dst_inode +- ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) ++ ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) + : 0 ); + err: +- bch2_trans_iter_put(trans, dst_inode_iter); +- bch2_trans_iter_put(trans, src_inode_iter); +- bch2_trans_iter_put(trans, dst_dir_iter); +- bch2_trans_iter_put(trans, src_dir_iter); ++ bch2_trans_iter_exit(trans, &dst_inode_iter); ++ bch2_trans_iter_exit(trans, &src_inode_iter); ++ bch2_trans_iter_exit(trans, &dst_dir_iter); ++ bch2_trans_iter_exit(trans, &src_dir_iter); + return ret; + } +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 55bd5140a17d..e474d1fa5b8e 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -893,7 +893,7 @@ void bch2_readahead(struct readahead_control *ractl) + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct page *page; + struct readpages_iter readpages_iter; + int ret; +@@ -902,8 +902,8 @@ void bch2_readahead(struct readahead_control *ractl) + BUG_ON(ret); + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, +- BTREE_ITER_SLOTS); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, ++ BTREE_ITER_SLOTS); + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + +@@ -924,13 +924,13 @@ void bch2_readahead(struct readahead_control *ractl) + rbio->bio.bi_end_io = bch2_readpages_end_io; + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + +- bchfs_read(&trans, iter, rbio, inode->v.i_ino, ++ bchfs_read(&trans, &iter, rbio, inode->v.i_ino, + &readpages_iter); + } + + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + kfree(readpages_iter.pages); + } +@@ -939,7 +939,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, + u64 inum, struct page *page) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + + bch2_page_state_create(page, __GFP_NOFAIL); + +@@ -949,12 +949,12 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, +- BTREE_ITER_SLOTS); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, ++ BTREE_ITER_SLOTS); + +- bchfs_read(&trans, iter, rbio, inum, NULL); ++ bchfs_read(&trans, &iter, rbio, inum, NULL); + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + } + +@@ -2159,7 +2159,7 @@ static inline int range_has_data(struct bch_fs *c, + struct bpos end) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +@@ -2174,7 +2174,7 @@ static inline int range_has_data(struct bch_fs *c, + break; + } + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + return bch2_trans_exit(&trans) ?: ret; + } +@@ -2484,7 +2484,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + struct address_space *mapping = inode->v.i_mapping; + struct bkey_buf copy; + struct btree_trans trans; +- struct btree_iter *src, *dst, *del; ++ struct btree_iter src, dst, del; + loff_t shift, new_size; + u64 src_start; + int ret = 0; +@@ -2549,11 +2549,11 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + + bch2_bkey_buf_init(©); + 
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); +- src = bch2_trans_get_iter(&trans, BTREE_ID_extents, ++ bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, + POS(inode->v.i_ino, src_start >> 9), + BTREE_ITER_INTENT); +- dst = bch2_trans_copy_iter(&trans, src); +- del = bch2_trans_copy_iter(&trans, src); ++ bch2_trans_copy_iter(&dst, &src); ++ bch2_trans_copy_iter(&del, &src); + + while (ret == 0 || ret == -EINTR) { + struct disk_reservation disk_res = +@@ -2568,8 +2568,8 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + bch2_trans_begin(&trans); + + k = insert +- ? bch2_btree_iter_peek_prev(src) +- : bch2_btree_iter_peek(src); ++ ? bch2_btree_iter_peek_prev(&src) ++ : bch2_btree_iter_peek(&src); + if ((ret = bkey_err(k))) + continue; + +@@ -2587,9 +2587,9 @@ reassemble: + bch2_cut_front(move_pos, copy.k); + + copy.k->k.p.offset += shift >> 9; +- bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); ++ bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); + +- ret = bch2_extent_atomic_end(&trans, dst, copy.k, &atomic_end); ++ ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); + if (ret) + continue; + +@@ -2607,7 +2607,7 @@ reassemble: + delete.k.p = copy.k->k.p; + delete.k.size = copy.k->k.size; + delete.k.p.offset -= shift >> 9; +- bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k)); ++ bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); + + next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; + +@@ -2628,20 +2628,20 @@ reassemble: + BUG_ON(ret); + } + +- ret = bch2_btree_iter_traverse(del) ?: +- bch2_trans_update(&trans, del, &delete, trigger_flags) ?: +- bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: ++ ret = bch2_btree_iter_traverse(&del) ?: ++ bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: ++ bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: + bch2_trans_commit(&trans, &disk_res, + &inode->ei_journal_seq, + BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(c, &disk_res); + + if (!ret) +- bch2_btree_iter_set_pos(src, next_pos); ++ bch2_btree_iter_set_pos(&src, next_pos); + } +- bch2_trans_iter_put(&trans, del); +- bch2_trans_iter_put(&trans, dst); +- bch2_trans_iter_put(&trans, src); ++ bch2_trans_iter_exit(&trans, &del); ++ bch2_trans_iter_exit(&trans, &dst); ++ bch2_trans_iter_exit(&trans, &src); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(©, c); + +@@ -2666,18 +2666,18 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bpos end_pos = POS(inode->v.i_ino, end_sector); + unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inode->v.i_ino, start_sector), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + +- while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { ++ while (!ret && bkey_cmp(iter.pos, end_pos) < 0) { + s64 i_sectors_delta = 0; + struct disk_reservation disk_res = { 0 }; + struct quota_res quota_res = { 0 }; +@@ -2687,20 +2687,20 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + + bch2_trans_begin(&trans); + +- k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_slot(&iter); + if ((ret = bkey_err(k))) + goto bkey_err; + + /* already reserved */ + if (k.k->type == KEY_TYPE_reservation && + 
bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + continue; + } + + if (bkey_extent_is_data(k.k) && + !(mode & FALLOC_FL_ZERO_RANGE)) { +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + continue; + } + +@@ -2709,7 +2709,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + reservation.k.p = k.k->p; + reservation.k.size = k.k->size; + +- bch2_cut_front(iter->pos, &reservation.k_i); ++ bch2_cut_front(iter.pos, &reservation.k_i); + bch2_cut_back(end_pos, &reservation.k_i); + + sectors = reservation.k.size; +@@ -2733,7 +2733,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + reservation.v.nr_replicas = disk_res.nr_replicas; + } + +- ret = bch2_extent_update(&trans, iter, &reservation.k_i, ++ ret = bch2_extent_update(&trans, &iter, &reservation.k_i, + &disk_res, &inode->ei_journal_seq, + 0, &i_sectors_delta, true); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); +@@ -2743,7 +2743,7 @@ bkey_err: + if (ret == -EINTR) + ret = 0; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -3025,7 +3025,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + u64 isize, next_data = MAX_LFS_FILESIZE; + int ret; +@@ -3046,7 +3046,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) + } else if (k.k->p.offset >> 9 > isize) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) +@@ -3121,7 +3121,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + u64 isize, next_hole = MAX_LFS_FILESIZE; + int ret; +@@ -3150,7 +3150,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) + offset = max(offset, bkey_start_offset(k.k) << 9); + } + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index b9fc813087f0..bf1e519aa728 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -142,7 +142,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, + void *p, unsigned fields) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter = { NULL }; + struct bch_inode_unpacked inode_u; + int ret; + +@@ -150,11 +150,10 @@ int __must_check bch2_write_inode(struct bch_fs *c, + retry: + bch2_trans_begin(&trans); + +- iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, +- BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(iter) ?: ++ ret = bch2_inode_peek(&trans, &iter, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT) ?: + (set ? 
set(inode, &inode_u, p) : 0) ?: +- bch2_inode_write(&trans, iter, &inode_u) ?: ++ bch2_inode_write(&trans, &iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_NOFAIL); +@@ -166,7 +165,7 @@ retry: + if (!ret) + bch2_inode_update_after_write(c, inode, &inode_u, fields); + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + if (ret == -EINTR) + goto retry; +@@ -687,7 +686,7 @@ int bch2_setattr_nonsize(struct user_namespace *mnt_userns, + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_qid qid; + struct btree_trans trans; +- struct btree_iter *inode_iter; ++ struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; + struct posix_acl *acl = NULL; + int ret; +@@ -713,9 +712,8 @@ retry: + kfree(acl); + acl = NULL; + +- inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, +- BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(inode_iter); ++ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); + if (ret) + goto btree_err; + +@@ -727,12 +725,12 @@ retry: + goto btree_err; + } + +- ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_NOFAIL); + btree_err: +- bch2_trans_iter_put(&trans, inode_iter); ++ bch2_trans_iter_exit(&trans, &inode_iter); + + if (ret == -EINTR) + goto retry; +@@ -882,7 +880,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(vinode); + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); +@@ -901,23 +899,23 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + bch2_bkey_buf_init(&prev); + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, +- POS(ei->v.i_ino, start >> 9), 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ POS(ei->v.i_ino, start >> 9), 0); + retry: + bch2_trans_begin(&trans); + +- while ((k = bch2_btree_iter_peek(iter)).k && ++ while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k)) && +- bkey_cmp(iter->pos, end) < 0) { ++ bkey_cmp(iter.pos, end) < 0) { + enum btree_id data_btree = BTREE_ID_extents; + + if (!bkey_extent_is_data(k.k) && + k.k->type != KEY_TYPE_reservation) { +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + continue; + } + +- offset_into_extent = iter->pos.offset - ++ offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + +@@ -938,7 +936,7 @@ retry: + offset_into_extent), + cur.k); + bch2_key_resize(&cur.k->k, sectors); +- cur.k->k.p = iter->pos; ++ cur.k->k.p = iter.pos; + cur.k->k.p.offset += cur.k->k.size; + + if (have_extent) { +@@ -951,8 +949,8 @@ retry: + bkey_copy(prev.k, cur.k); + have_extent = true; + +- bch2_btree_iter_set_pos(iter, +- POS(iter->pos.inode, iter->pos.offset + sectors)); ++ bch2_btree_iter_set_pos(&iter, ++ POS(iter.pos.inode, iter.pos.offset + sectors)); + } + + if (ret == -EINTR) +@@ -962,7 +960,7 @@ retry: + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), + FIEMAP_EXTENT_LAST); + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + ret = bch2_trans_exit(&trans) ?: ret; + 
bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 36eba46d566e..eb979e79eaac 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -19,7 +19,7 @@ + + static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + u64 sectors = 0; + int ret; +@@ -33,7 +33,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) + sectors += k.k->size; + } + +- bch2_trans_iter_free(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + + return ret ?: sectors; + } +@@ -42,24 +42,24 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, +- POS(0, inode_nr), 0); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ POS(0, inode_nr), 0); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (snapshot) +- *snapshot = iter->pos.snapshot; ++ *snapshot = iter.pos.snapshot; + ret = k.k->type == KEY_TYPE_inode + ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + : -ENOENT; + err: +- bch2_trans_iter_free(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -74,13 +74,16 @@ static int __write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) + { +- struct btree_iter *inode_iter = +- bch2_trans_get_iter(trans, BTREE_ID_inodes, +- SPOS(0, inode->bi_inum, snapshot), +- BTREE_ITER_INTENT); +- int ret = bch2_btree_iter_traverse(inode_iter) ?: +- bch2_inode_write(trans, inode_iter, inode); +- bch2_trans_iter_put(trans, inode_iter); ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inode->bi_inum, snapshot), ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_inode_write(trans, &iter, inode); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -100,7 +103,7 @@ static int write_inode(struct btree_trans *trans, + static int __remove_dirent(struct btree_trans *trans, struct bpos pos) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + int ret; +@@ -111,11 +114,11 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + +- iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, +- &dir_hash_info, iter); +- bch2_trans_iter_put(trans, iter); ++ &dir_hash_info, &iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -230,13 +233,13 @@ static int reattach_inode(struct btree_trans *trans, + static int remove_backpointer(struct btree_trans *trans, + struct bch_inode_unpacked *inode) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, +- POS(inode->bi_dir, inode->bi_dir_offset), 0); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, ++ POS(inode->bi_dir, inode->bi_dir_offset), 0); ++ k = 
bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; +@@ -247,7 +250,7 @@ static int remove_backpointer(struct btree_trans *trans, + + ret = remove_dirent(trans, k.k->p); + out: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -343,7 +346,7 @@ static int hash_check_key(struct btree_trans *trans, + struct btree_iter *k_iter, struct bkey_s_c hash_k) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter = NULL; ++ struct btree_iter iter = { NULL }; + char buf[200]; + struct bkey_s_c k; + u64 hash; +@@ -378,12 +381,12 @@ static int hash_check_key(struct btree_trans *trans, + } + + if (bkey_deleted(k.k)) { +- bch2_trans_iter_free(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + goto bad_hash; + } + + } +- bch2_trans_iter_free(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + bad_hash: + if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " +@@ -513,7 +516,7 @@ noinline_for_stack + static int check_inodes(struct bch_fs *c, bool full) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_inode inode; + int ret; +@@ -532,12 +535,12 @@ static int check_inodes(struct bch_fs *c, bool full) + (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED))) { +- ret = check_inode(&trans, iter, inode); ++ ret = check_inode(&trans, &iter, inode); + if (ret) + break; + } + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + BUG_ON(ret == -EINTR); + +@@ -547,7 +550,7 @@ static int check_inodes(struct bch_fs *c, bool full) + static int fix_overlapping_extent(struct btree_trans *trans, + struct bkey_s_c k, struct bpos cut_at) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_i *u; + int ret; + +@@ -567,29 +570,29 @@ static int fix_overlapping_extent(struct btree_trans *trans, + * assume things about extent overwrites - we should be running the + * triggers manually here + */ +- iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p, +- BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, ++ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + +- BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); +- ret = bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?: ++ BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + + static int inode_backpointer_exists(struct btree_trans *trans, + struct bch_inode_unpacked *inode) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, +- POS(inode->bi_dir, inode->bi_dir_offset), 0); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, ++ POS(inode->bi_dir, inode->bi_dir_offset), 0); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; +@@ -598,7 +601,7 @@ static int inode_backpointer_exists(struct btree_trans *trans, + + ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; + out: +- bch2_trans_iter_free(trans, iter); ++ 
bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -618,7 +621,7 @@ static int check_extents(struct bch_fs *c) + { + struct inode_walker w = inode_walker_init(); + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_buf prev; + u64 i_sectors = 0; +@@ -630,12 +633,12 @@ static int check_extents(struct bch_fs *c) + + bch_verbose(c, "checking extents"); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, +- POS(BCACHEFS_ROOT_INO, 0), +- BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH); + retry: +- while ((k = bch2_btree_iter_peek(iter)).k && ++ while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k))) { + if (w.have_inode && + w.cur_inum != k.k->p.inode && +@@ -700,12 +703,12 @@ retry: + i_sectors += k.k->size; + bch2_bkey_buf_reassemble(&prev, c, k); + +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + } + fsck_err: + if (ret == -EINTR) + goto retry; +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_bkey_buf_exit(&prev, c); + return bch2_trans_exit(&trans) ?: ret; + } +@@ -890,7 +893,7 @@ static int check_dirents(struct bch_fs *c) + struct inode_walker w = inode_walker_init(); + struct bch_hash_info hash_info; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + unsigned nr_subdirs = 0; + int ret = 0; + +@@ -898,18 +901,18 @@ static int check_dirents(struct bch_fs *c) + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, +- POS(BCACHEFS_ROOT_INO, 0), +- BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH); + + do { + ret = lockrestart_do(&trans, +- check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs)); ++ check_dirent(&trans, &iter, &hash_info, &w, &nr_subdirs)); + if (ret) + break; +- } while (bch2_btree_iter_advance(iter)); +- bch2_trans_iter_put(&trans, iter); ++ } while (bch2_btree_iter_advance(&iter)); ++ bch2_trans_iter_exit(&trans, &iter); + + return bch2_trans_exit(&trans) ?: ret; + } +@@ -923,7 +926,7 @@ static int check_xattrs(struct bch_fs *c) + struct inode_walker w = inode_walker_init(); + struct bch_hash_info hash_info; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +@@ -931,12 +934,12 @@ static int check_xattrs(struct bch_fs *c) + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, +- POS(BCACHEFS_ROOT_INO, 0), +- BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH); + retry: +- while ((k = bch2_btree_iter_peek(iter)).k && ++ while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k))) { + ret = walk_inode(&trans, &w, k.k->p.inode); + if (ret) +@@ -945,7 +948,7 @@ retry: + if (fsck_err_on(!w.have_inode, c, + "xattr for missing inode %llu", + k.k->p.inode)) { +- ret = bch2_btree_delete_at(&trans, iter, 0); ++ ret = bch2_btree_delete_at(&trans, &iter, 0); + if (ret) + break; + continue; +@@ -955,17 +958,17 @@ retry: + hash_info = bch2_hash_info_init(c, &w.inode); + + ret = hash_check_key(&trans, bch2_xattr_hash_desc, +- &hash_info, iter, k); ++ 
&hash_info, &iter, k); + if (ret) + break; + +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + } + fsck_err: + if (ret == -EINTR) + goto retry; + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + return bch2_trans_exit(&trans) ?: ret; + } + +@@ -1114,7 +1117,7 @@ fsck_err: + static int check_directory_structure(struct bch_fs *c) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked u; + struct pathbuf path = { 0, 0, NULL }; +@@ -1139,7 +1142,7 @@ static int check_directory_structure(struct bch_fs *c) + if (ret) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + BUG_ON(ret == -EINTR); + +@@ -1215,7 +1218,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, + u64 start, u64 *end) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_inode inode; + struct bch_inode_unpacked u; +@@ -1253,7 +1256,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, + } + + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + if (ret) +@@ -1267,7 +1270,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links + u64 range_start, u64 range_end) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + int ret; +@@ -1289,7 +1292,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links + + bch2_trans_cond_resched(&trans); + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) +@@ -1304,7 +1307,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, + u64 range_start, u64 range_end) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_inode inode; + struct bch_inode_unpacked u; +@@ -1346,14 +1349,14 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- bch2_btree_iter_traverse(iter) ?: +- bch2_inode_write(&trans, iter, &u)); ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_inode_write(&trans, &iter, &u)); + if (ret) + bch_err(c, "error in fsck: error %i updating inode", ret); + } + } + fsck_err: +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + if (ret) +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 3b671082cd1e..14b0e8c03119 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -292,18 +292,18 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, + return 0; + } + +-struct btree_iter *bch2_inode_peek(struct btree_trans *trans, +- struct bch_inode_unpacked *inode, +- u64 inum, unsigned flags) ++int bch2_inode_peek(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode, ++ u64 inum, unsigned flags) + { +- struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + if (trans->c->opts.inodes_use_key_cache) + flags |= BTREE_ITER_CACHED; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), flags); ++ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) +@@ -317,10 +317,10 @@ struct 
btree_iter *bch2_inode_peek(struct btree_trans *trans, + if (ret) + goto err; + +- return iter; ++ return 0; + err: +- bch2_trans_iter_put(trans, iter); +- return ERR_PTR(ret); ++ bch2_trans_iter_exit(trans, iter); ++ return ret; + } + + int bch2_inode_write(struct btree_trans *trans, +@@ -482,12 +482,12 @@ static inline u32 bkey_generation(struct bkey_s_c k) + } + } + +-struct btree_iter *bch2_inode_create(struct btree_trans *trans, +- struct bch_inode_unpacked *inode_u, +- u32 snapshot, u64 cpu) ++int bch2_inode_create(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode_u, ++ u32 snapshot, u64 cpu) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter = NULL; + struct bkey_s_c k; + u64 min, max, start, pos, *hint; + int ret = 0; +@@ -513,9 +513,9 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, + start = min; + + pos = start; +- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos), +- BTREE_ITER_ALL_SNAPSHOTS| +- BTREE_ITER_INTENT); ++ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); + again: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && +@@ -553,8 +553,8 @@ again: + ret = -ENOSPC; + + if (ret) { +- bch2_trans_iter_put(trans, iter); +- return ERR_PTR(ret); ++ bch2_trans_iter_exit(trans, iter); ++ return ret; + } + + /* Retry from start */ +@@ -566,8 +566,8 @@ found_slot: + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { +- bch2_trans_iter_put(trans, iter); +- return ERR_PTR(ret); ++ bch2_trans_iter_exit(trans, iter); ++ return ret; + } + + /* We may have raced while the iterator wasn't pointing at pos: */ +@@ -578,13 +578,13 @@ found_slot: + *hint = k.k->p.offset; + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); +- return iter; ++ return 0; + } + + int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + { + struct btree_trans trans; +- struct btree_iter *iter = NULL; ++ struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bpos start = POS(inode_nr, 0); + struct bpos end = POS(inode_nr + 1, 0); +@@ -617,9 +617,9 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + retry: + bch2_trans_begin(&trans); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, +- POS(0, inode_nr), iter_flags); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, ++ POS(0, inode_nr), iter_flags); ++ k = bch2_btree_iter_peek_slot(&iter); + + ret = bkey_err(k); + if (ret) +@@ -636,14 +636,14 @@ retry: + bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + + bkey_inode_generation_init(&delete.k_i); +- delete.k.p = iter->pos; ++ delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + +- ret = bch2_trans_update(&trans, iter, &delete.k_i, 0) ?: ++ ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + err: +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + if (ret == -EINTR) + goto retry; + +@@ -654,12 +654,11 @@ err: + static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) + { +- struct btree_iter *iter; ++ struct btree_iter iter = { NULL }; + int ret; + +- iter = bch2_inode_peek(trans, inode, inode_nr, 0); +- ret = PTR_ERR_OR_ZERO(iter); +- bch2_trans_iter_put(trans, iter); ++ ret = 
bch2_inode_peek(trans, &iter, inode, inode_nr, 0); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index d67af4f56f05..25bef104ebcc 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -57,8 +57,8 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); + + void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); + +-struct btree_iter *bch2_inode_peek(struct btree_trans *, +- struct bch_inode_unpacked *, u64, unsigned); ++int bch2_inode_peek(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *, u64, unsigned); + int bch2_inode_write(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *); + +@@ -71,8 +71,8 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); + +-struct btree_iter *bch2_inode_create(struct btree_trans *, +- struct bch_inode_unpacked *, u32, u64); ++int bch2_inode_create(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *, u32, u64); + + int bch2_inode_rm(struct bch_fs *, u64, bool); + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index b2a1bf242ed9..51ce8a134243 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -192,7 +192,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + s64 *disk_sectors_delta) + { + struct bch_fs *c = trans->c; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c old; + unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); + bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); +@@ -203,7 +203,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + *i_sectors_delta = 0; + *disk_sectors_delta = 0; + +- iter = bch2_trans_copy_iter(trans, extent_iter); ++ bch2_trans_copy_iter(&iter, extent_iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + s64 sectors = min(new->k.p.offset, old.k->p.offset) - +@@ -236,7 +236,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + * less: + */ + if (!bkey_cmp(old.k->p, new->k.p)) { +- old = bch2_btree_iter_next(iter); ++ old = bch2_btree_iter_next(&iter); + ret = bkey_err(old); + if (ret) + break; +@@ -251,7 +251,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + } + } + +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -301,12 +301,11 @@ int bch2_extent_update(struct btree_trans *trans, + : 0; + + if (i_sectors_delta || new_i_size) { +- struct btree_iter *inode_iter; ++ struct btree_iter inode_iter; + struct bch_inode_unpacked inode_u; + +- inode_iter = bch2_inode_peek(trans, &inode_u, ++ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, + k->k.p.inode, BTREE_ITER_INTENT); +- ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + return ret; + +@@ -335,11 +334,11 @@ int bch2_extent_update(struct btree_trans *trans, + + inode_p.inode.k.p.snapshot = iter->snapshot; + +- ret = bch2_trans_update(trans, inode_iter, ++ ret = bch2_trans_update(trans, &inode_iter, + &inode_p.inode.k_i, 0); + } + +- bch2_trans_iter_put(trans, inode_iter); ++ bch2_trans_iter_exit(trans, &inode_iter); + + if (ret) + return ret; +@@ -414,18 +413,18 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, + u64 *journal_seq, s64 *i_sectors_delta) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); 
+- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inum, start), + BTREE_ITER_INTENT); + +- ret = bch2_fpunch_at(&trans, iter, POS(inum, end), ++ ret = bch2_fpunch_at(&trans, &iter, POS(inum, end), + journal_seq, i_sectors_delta); + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + if (ret == -EINTR) +@@ -441,28 +440,28 @@ int bch2_write_index_default(struct bch_write_op *op) + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + int ret; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, +- bkey_start_pos(&k->k), +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + do { + bch2_trans_begin(&trans); + + k = bch2_keylist_front(keys); + +- k->k.p.snapshot = iter->snapshot; ++ k->k.p.snapshot = iter.snapshot; + + bch2_bkey_buf_realloc(&sk, c, k->k.u64s); + bkey_copy(sk.k, k); +- bch2_cut_front(iter->pos, sk.k); ++ bch2_cut_front(iter.pos, sk.k); + +- ret = bch2_extent_update(&trans, iter, sk.k, ++ ret = bch2_extent_update(&trans, &iter, sk.k, + &op->res, op_journal_seq(op), + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); +@@ -471,11 +470,11 @@ int bch2_write_index_default(struct bch_write_op *op) + if (ret) + break; + +- if (bkey_cmp(iter->pos, k->k.p) >= 0) ++ if (bkey_cmp(iter.pos, k->k.p) >= 0) + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + +@@ -1640,7 +1639,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio + unsigned flags) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + int ret; +@@ -1651,12 +1650,12 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, rbio->data_btree, +- rbio->read_pos, BTREE_ITER_SLOTS); ++ bch2_trans_iter_init(&trans, &iter, rbio->data_btree, ++ rbio->read_pos, BTREE_ITER_SLOTS); + retry: + rbio->bio.bi_status = 0; + +- k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_slot(&iter); + if (bkey_err(k)) + goto err; + +@@ -1683,7 +1682,7 @@ retry: + goto err; + out: + bch2_rbio_done(rbio); +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + return; +@@ -1749,7 +1748,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_fs *c = rbio->c; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; + struct bch_extent_crc_unpacked new_crc; +- struct btree_iter *iter = NULL; ++ struct btree_iter iter; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; +@@ -1757,9 +1756,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + if (crc_is_compressed(rbio->pick.crc)) + return 0; + +- iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos, +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_slot(iter); ++ 
bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); + if ((ret = bkey_err(k))) + goto out; + +@@ -1794,9 +1793,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + +- ret = bch2_trans_update(trans, iter, new, 0); ++ ret = bch2_trans_update(trans, &iter, new, 0); + out: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -1967,7 +1966,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_buf *orig_k) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; +@@ -1975,10 +1974,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + + *offset_into_extent; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, +- POS(0, reflink_offset), +- BTREE_ITER_SLOTS); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, ++ POS(0, reflink_offset), ++ BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -1995,10 +1994,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, + goto err; + } + +- *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); ++ *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -2268,7 +2267,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bch_io_failures *failed, unsigned flags) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + int ret; +@@ -2277,10 +2276,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); +- +- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, +- POS(inode, bvec_iter.bi_sector), +- BTREE_ITER_SLOTS); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ POS(inode, bvec_iter.bi_sector), ++ BTREE_ITER_SLOTS); + retry: + bch2_trans_begin(&trans); + +@@ -2297,15 +2295,15 @@ retry: + break; + } + +- bch2_btree_iter_set_pos(iter, ++ bch2_btree_iter_set_pos(&iter, + POS(inode, bvec_iter.bi_sector)); + +- k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + +- offset_into_extent = iter->pos.offset - ++ offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + +@@ -2336,7 +2334,7 @@ retry: + if (bvec_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + +- ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos, ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, + data_btree, k, + offset_into_extent, failed, flags); + if (ret) +@@ -2352,7 +2350,7 @@ retry: + if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) + goto retry; + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index f2060f903cbc..68fb2ebd91ac 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ 
b/fs/bcachefs/journal_seq_blacklist.c +@@ -250,7 +250,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < BTREE_ID_NR; i++) { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct btree *b; + + for_each_btree_node(&trans, iter, i, POS_MIN, +@@ -259,7 +259,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) + bch2_trans_exit(&trans); + return; + } +- bch2_trans_iter_free(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + } + + ret = bch2_trans_exit(&trans); +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 1f65eca48c6e..1899326d9754 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -39,7 +39,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + enum btree_id btree_id) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_buf sk; + int ret = 0; +@@ -47,13 +47,13 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, +- BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, ++ BTREE_ITER_PREFETCH); + +- while ((k = bch2_btree_iter_peek(iter)).k && ++ while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k))) { + if (!bch2_bkey_has_device(k, dev_idx)) { +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + continue; + } + +@@ -71,10 +71,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + */ + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + +- bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); ++ bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); + +- ret = bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(&trans, iter, sk.k, 0) ?: ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(&trans, &iter, sk.k, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + +@@ -88,7 +88,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + if (ret) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + ret = bch2_trans_exit(&trans) ?: ret; + bch2_bkey_buf_exit(&sk, c); +@@ -107,7 +107,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct closure cl; + struct btree *b; + struct bkey_buf k; +@@ -139,9 +139,9 @@ retry: + break; + } + +- ret = bch2_btree_node_update_key(&trans, iter, b, k.k, false); ++ ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); + if (ret == -EINTR) { +- b = bch2_btree_iter_peek_node(iter); ++ b = bch2_btree_iter_peek_node(&iter); + ret = 0; + goto retry; + } +@@ -150,7 +150,7 @@ retry: + break; + } + } +- bch2_trans_iter_free(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + if (ret) + goto err; +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 45cea9231300..7001e3cda8c5 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -57,7 +57,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + { + struct bch_fs *c = op->c; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct migrate_write *m = + 
container_of(op, struct migrate_write, op); + struct keylist *keys = &op->insert_keys; +@@ -70,9 +70,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + +- iter = bch2_trans_get_iter(&trans, m->btree_id, +- bkey_start_pos(&bch2_keylist_front(keys)->k), +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &iter, m->btree_id, ++ bkey_start_pos(&bch2_keylist_front(keys)->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + while (1) { + struct bkey_s_c k; +@@ -87,7 +87,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + + bch2_trans_begin(&trans); + +- k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -103,9 +103,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + + bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); + new = bkey_i_to_extent(_new.k); +- bch2_cut_front(iter->pos, &new->k_i); ++ bch2_cut_front(iter.pos, &new->k_i); + +- bch2_cut_front(iter->pos, insert); ++ bch2_cut_front(iter.pos, insert); + bch2_cut_back(new->k.p, insert); + bch2_cut_back(insert->k.p, &new->k_i); + +@@ -147,7 +147,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + op->opts.background_target, + op->opts.data_replicas); + +- ret = bch2_sum_sector_overwrites(&trans, iter, insert, ++ ret = bch2_sum_sector_overwrites(&trans, &iter, insert, + &extending, + &should_check_enospc, + &i_sectors_delta, +@@ -166,13 +166,13 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + + next_pos = insert->k.p; + +- ret = bch2_trans_update(&trans, iter, insert, 0) ?: ++ ret = bch2_trans_update(&trans, &iter, insert, 0) ?: + bch2_trans_commit(&trans, &op->res, + op_journal_seq(op), + BTREE_INSERT_NOFAIL| + m->data_opts.btree_insert_flags); + if (!ret) { +- bch2_btree_iter_set_pos(iter, next_pos); ++ bch2_btree_iter_set_pos(&iter, next_pos); + atomic_long_inc(&c->extent_migrate_done); + } + err: +@@ -181,7 +181,7 @@ err: + if (ret) + break; + next: +- while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { ++ while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { + bch2_keylist_pop_front(keys); + if (bch2_keylist_empty(keys)) + goto out; +@@ -189,18 +189,18 @@ next: + continue; + nomatch: + if (m->ctxt) { +- BUG_ON(k.k->p.offset <= iter->pos.offset); ++ BUG_ON(k.k->p.offset <= iter.pos.offset); + atomic64_inc(&m->ctxt->stats->keys_raced); +- atomic64_add(k.k->p.offset - iter->pos.offset, ++ atomic64_add(k.k->p.offset - iter.pos.offset, + &m->ctxt->stats->sectors_raced); + } + atomic_long_inc(&c->extent_migrate_raced); + trace_move_race(&new->k); +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + goto next; + } + out: +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); +@@ -524,13 +524,13 @@ err: + static int lookup_inode(struct btree_trans *trans, struct bpos pos, + struct bch_inode_unpacked *inode) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos, +- BTREE_ITER_ALL_SNAPSHOTS); +- k = bch2_btree_iter_peek(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, ++ BTREE_ITER_ALL_SNAPSHOTS); ++ k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -548,7 +548,7 @@ static int lookup_inode(struct btree_trans *trans, 
struct bpos pos, + if (ret) + goto err; + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -566,7 +566,7 @@ static int __bch2_move_data(struct bch_fs *c, + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct bkey_buf sk; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct data_opts data_opts; + enum data_cmd data_cmd; +@@ -580,8 +580,8 @@ static int __bch2_move_data(struct bch_fs *c, + stats->btree_id = btree_id; + stats->pos = start; + +- iter = bch2_trans_get_iter(&trans, btree_id, start, +- BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &iter, btree_id, start, ++ BTREE_ITER_PREFETCH); + + if (rate) + bch2_ratelimit_reset(rate); +@@ -612,9 +612,9 @@ static int __bch2_move_data(struct bch_fs *c, + + bch2_trans_begin(&trans); + +- k = bch2_btree_iter_peek(iter); ++ k = bch2_btree_iter_peek(&iter); + +- stats->pos = iter->pos; ++ stats->pos = iter.pos; + + if (!k.k) + break; +@@ -687,12 +687,12 @@ next: + atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), + &stats->sectors_seen); + next_nondata: +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + bch2_trans_cond_resched(&trans); + } + out: + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + ret = bch2_trans_exit(&trans) ?: ret; + bch2_bkey_buf_exit(&sk, c); + +@@ -786,7 +786,7 @@ static int bch2_move_btree(struct bch_fs *c, + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct btree *b; + enum btree_id id; + struct data_opts data_opts; +@@ -813,7 +813,7 @@ static int bch2_move_btree(struct bch_fs *c, + bpos_cmp(b->key.k.p, end_pos)) > 0) + break; + +- stats->pos = iter->pos; ++ stats->pos = iter.pos; + + switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) { + case DATA_SKIP: +@@ -827,13 +827,13 @@ static int bch2_move_btree(struct bch_fs *c, + BUG(); + } + +- ret = bch2_btree_node_rewrite(&trans, iter, ++ ret = bch2_btree_node_rewrite(&trans, &iter, + b->data->keys.seq, 0) ?: ret; + next: + bch2_trans_cond_resched(&trans); + } ++ bch2_trans_iter_exit(&trans, &iter); + +- ret = bch2_trans_iter_free(&trans, iter) ?: ret; + if (kthread && kthread_should_stop()) + break; + } +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 7861781a4a7f..9b0f4d3f176d 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -357,7 +357,7 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) + static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +@@ -372,7 +372,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) + if (ret) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + return bch2_trans_exit(&trans) ?: ret; + } +@@ -419,7 +419,7 @@ int bch2_fs_quota_read(struct bch_fs *c) + unsigned i, qtypes = enabled_qtypes(c); + struct bch_memquota_type *q; + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bch_inode_unpacked u; + struct bkey_s_c k; + int ret; +@@ -450,7 +450,7 @@ int bch2_fs_quota_read(struct bch_fs *c) + KEY_TYPE_QUOTA_NOCHECK); + } + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + 
return bch2_trans_exit(&trans) ?: ret; + } +@@ -717,13 +717,13 @@ static int bch2_set_quota_trans(struct btree_trans *trans, + struct bkey_i_quota *new_quota, + struct qc_dqblk *qdq) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p, +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_slot(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); + + ret = bkey_err(k); + if (unlikely(ret)) +@@ -742,8 +742,8 @@ static int bch2_set_quota_trans(struct btree_trans *trans, + if (qdq->d_fieldmask & QC_INO_HARD) + new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); + +- ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0); +- bch2_trans_iter_put(trans, iter); ++ ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 71b0f14f41f3..11208e83fabe 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -327,7 +327,7 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, + bch2_bkey_buf_reassemble(&tmp, c, k); + + bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, +- b->c.btree_id, b->c.level - 1); ++ b->c.btree_id, b->c.level - 1); + + bch2_btree_and_journal_iter_advance(&iter); + i++; +@@ -518,16 +518,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + enum btree_id id, unsigned level, + struct bkey_i *k) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + int ret; + +- iter = bch2_trans_get_node_iter(trans, id, k->k.p, +- BTREE_MAX_DEPTH, level, +- BTREE_ITER_INTENT| +- BTREE_ITER_NOT_EXTENTS); +- ret = bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_node_iter_init(trans, &iter, id, k->k.p, ++ BTREE_MAX_DEPTH, level, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_NOT_EXTENTS); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -545,16 +545,16 @@ static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) + + static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + int ret; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p, +- BTREE_ITER_CACHED| +- BTREE_ITER_CACHED_NOFILL| +- BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, k->k.p, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 3d9c5c5b0eba..576cfbccf5b5 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -116,7 +116,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + struct bkey_i *orig) + { + struct bch_fs *c = trans->c; +- struct btree_iter *reflink_iter; ++ struct btree_iter reflink_iter = { NULL }; + struct bkey_s_c k; + struct bkey_i *r_v; + struct 
bkey_i_reflink_p *r_p; +@@ -129,8 +129,8 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink, + POS(0, c->reflink_hint), + BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { +- if (reflink_iter->pos.inode) { +- bch2_btree_iter_set_pos(reflink_iter, POS_MIN); ++ if (reflink_iter.pos.inode) { ++ bch2_btree_iter_set_pos(&reflink_iter, POS_MIN); + continue; + } + +@@ -142,7 +142,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + goto err; + + /* rewind iter to start of hole, if necessary: */ +- bch2_btree_iter_set_pos_to_extent_start(reflink_iter); ++ bch2_btree_iter_set_pos_to_extent_start(&reflink_iter); + + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); + ret = PTR_ERR_OR_ZERO(r_v); +@@ -151,7 +151,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + + bkey_init(&r_v->k); + r_v->k.type = bkey_type_to_indirect(&orig->k); +- r_v->k.p = reflink_iter->pos; ++ r_v->k.p = reflink_iter.pos; + bch2_key_resize(&r_v->k, orig->k.size); + r_v->k.version = orig->k.version; + +@@ -161,7 +161,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + *refcount = 0; + memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); + +- ret = bch2_trans_update(trans, reflink_iter, r_v, 0); ++ ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); + if (ret) + goto err; + +@@ -172,9 +172,8 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + + ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); + err: +- if (!IS_ERR(reflink_iter)) +- c->reflink_hint = reflink_iter->pos.offset; +- bch2_trans_iter_put(trans, reflink_iter); ++ c->reflink_hint = reflink_iter.pos.offset; ++ bch2_trans_iter_exit(trans, &reflink_iter); + + return ret; + } +@@ -184,7 +183,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) + struct bkey_s_c k; + int ret; + +- for_each_btree_key_continue(iter, 0, k, ret) { ++ for_each_btree_key_continue(*iter, 0, k, ret) { + if (bkey_cmp(iter->pos, end) >= 0) + break; + +@@ -203,7 +202,7 @@ s64 bch2_remap_range(struct bch_fs *c, + u64 new_i_size, s64 *i_sectors_delta) + { + struct btree_trans trans; +- struct btree_iter *dst_iter, *src_iter; ++ struct btree_iter dst_iter, src_iter; + struct bkey_s_c src_k; + struct bkey_buf new_dst, new_src; + struct bpos dst_end = dst_start, src_end = src_start; +@@ -223,13 +222,13 @@ s64 bch2_remap_range(struct bch_fs *c, + bch2_bkey_buf_init(&new_src); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + +- src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start, +- BTREE_ITER_INTENT); +- dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, +- BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, ++ BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, ++ BTREE_ITER_INTENT); + + while ((ret == 0 || ret == -EINTR) && +- bkey_cmp(dst_iter->pos, dst_end) < 0) { ++ bkey_cmp(dst_iter.pos, dst_end) < 0) { + struct disk_reservation disk_res = { 0 }; + + bch2_trans_begin(&trans); +@@ -239,31 +238,31 @@ s64 bch2_remap_range(struct bch_fs *c, + break; + } + +- dst_done = dst_iter->pos.offset - dst_start.offset; ++ dst_done = dst_iter.pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); +- bch2_btree_iter_set_pos(src_iter, src_want); ++ bch2_btree_iter_set_pos(&src_iter, src_want); + +- src_k = get_next_src(src_iter, src_end); ++ src_k = 
get_next_src(&src_iter, src_end); + ret = bkey_err(src_k); + if (ret) + continue; + +- if (bkey_cmp(src_want, src_iter->pos) < 0) { +- ret = bch2_fpunch_at(&trans, dst_iter, ++ if (bkey_cmp(src_want, src_iter.pos) < 0) { ++ ret = bch2_fpunch_at(&trans, &dst_iter, + bpos_min(dst_end, +- POS(dst_iter->pos.inode, dst_iter->pos.offset + +- src_iter->pos.offset - src_want.offset)), ++ POS(dst_iter.pos.inode, dst_iter.pos.offset + ++ src_iter.pos.offset - src_want.offset)), + journal_seq, i_sectors_delta); + continue; + } + + if (src_k.k->type != KEY_TYPE_reflink_p) { +- bch2_btree_iter_set_pos_to_extent_start(src_iter); ++ bch2_btree_iter_set_pos_to_extent_start(&src_iter); + + bch2_bkey_buf_reassemble(&new_src, c, src_k); + src_k = bkey_i_to_s_c(new_src.k); + +- ret = bch2_make_extent_indirect(&trans, src_iter, ++ ret = bch2_make_extent_indirect(&trans, &src_iter, + new_src.k); + if (ret) + continue; +@@ -286,43 +285,42 @@ s64 bch2_remap_range(struct bch_fs *c, + BUG(); + } + +- new_dst.k->k.p = dst_iter->pos; ++ new_dst.k->k.p = dst_iter.pos; + bch2_key_resize(&new_dst.k->k, + min(src_k.k->p.offset - src_want.offset, +- dst_end.offset - dst_iter->pos.offset)); +- ret = bch2_extent_update(&trans, dst_iter, new_dst.k, ++ dst_end.offset - dst_iter.pos.offset)); ++ ret = bch2_extent_update(&trans, &dst_iter, new_dst.k, + &disk_res, journal_seq, + new_i_size, i_sectors_delta, + true); + bch2_disk_reservation_put(c, &disk_res); + } +- bch2_trans_iter_put(&trans, dst_iter); +- bch2_trans_iter_put(&trans, src_iter); ++ bch2_trans_iter_exit(&trans, &dst_iter); ++ bch2_trans_iter_exit(&trans, &src_iter); + +- BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end)); +- BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); ++ BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end)); ++ BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0); + +- dst_done = dst_iter->pos.offset - dst_start.offset; +- new_i_size = min(dst_iter->pos.offset << 9, new_i_size); ++ dst_done = dst_iter.pos.offset - dst_start.offset; ++ new_i_size = min(dst_iter.pos.offset << 9, new_i_size); + + do { + struct bch_inode_unpacked inode_u; +- struct btree_iter *inode_iter; ++ struct btree_iter inode_iter = { NULL }; + + bch2_trans_begin(&trans); + +- inode_iter = bch2_inode_peek(&trans, &inode_u, ++ ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, + dst_start.inode, BTREE_ITER_INTENT); +- ret2 = PTR_ERR_OR_ZERO(inode_iter); + + if (!ret2 && + inode_u.bi_size < new_i_size) { + inode_u.bi_size = new_i_size; +- ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, journal_seq, 0); + } + +- bch2_trans_iter_put(&trans, inode_iter); ++ bch2_trans_iter_exit(&trans, &inode_iter); + } while (ret2 == -EINTR); + + ret = bch2_trans_exit(&trans) ?: ret; +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 236023494191..c6a132b3c5bb 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -139,18 +139,18 @@ struct bch_hash_desc { + bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); + }; + +-static __always_inline struct btree_iter * ++static __always_inline int + bch2_hash_lookup(struct btree_trans *trans, ++ struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key, + unsigned flags) + { +- struct btree_iter *iter; + struct bkey_s_c k; + int ret; + +- for_each_btree_key(trans, iter, desc.btree_id, ++ for_each_btree_key(trans, *iter, desc.btree_id, + POS(inode, desc.hash_key(info, 
key)), + BTREE_ITER_SLOTS|flags, k, ret) { + if (iter->pos.inode != inode) +@@ -158,7 +158,7 @@ bch2_hash_lookup(struct btree_trans *trans, + + if (k.k->type == desc.key_type) { + if (!desc.cmp_key(k, key)) +- return iter; ++ return 0; + } else if (k.k->type == KEY_TYPE_hash_whiteout) { + ; + } else { +@@ -166,35 +166,33 @@ bch2_hash_lookup(struct btree_trans *trans, + break; + } + } +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, iter); + +- return ERR_PTR(ret ?: -ENOENT); ++ return ret ?: -ENOENT; + } + +-static __always_inline struct btree_iter * ++static __always_inline int + bch2_hash_hole(struct btree_trans *trans, ++ struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key) + { +- struct btree_iter *iter; + struct bkey_s_c k; + int ret; + +- for_each_btree_key(trans, iter, desc.btree_id, ++ for_each_btree_key(trans, *iter, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (iter->pos.inode != inode) + break; + + if (k.k->type != desc.key_type) +- return iter; ++ return 0; + } ++ bch2_trans_iter_exit(trans, iter); + +- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; +- bch2_trans_iter_put(trans, iter); +- +- return ERR_PTR(ret ?: -ENOSPC); ++ return ret ?: -ENOSPC; + } + + static __always_inline +@@ -203,13 +201,13 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, + const struct bch_hash_info *info, + struct btree_iter *start) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret; + +- iter = bch2_trans_copy_iter(trans, start); ++ bch2_trans_copy_iter(&iter, start); + +- bch2_btree_iter_advance(iter); ++ bch2_btree_iter_advance(&iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { + if (k.k->type != desc.key_type && +@@ -218,13 +216,12 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, + + if (k.k->type == desc.key_type && + desc.hash_bkey(info, k) <= start->pos.offset) { +- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + ret = 1; + break; + } + } + +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -234,7 +231,7 @@ int bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_info *info, + u64 inode, struct bkey_i *insert, int flags) + { +- struct btree_iter *iter, *slot = NULL; ++ struct btree_iter iter, slot = { NULL }; + struct bkey_s_c k; + bool found = false; + int ret; +@@ -242,7 +239,7 @@ int bch2_hash_set(struct btree_trans *trans, + for_each_btree_key(trans, iter, desc.btree_id, + POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { +- if (iter->pos.inode != inode) ++ if (iter.pos.inode != inode) + break; + + if (k.k->type == desc.key_type) { +@@ -253,9 +250,9 @@ int bch2_hash_set(struct btree_trans *trans, + continue; + } + +- if (!slot && ++ if (!slot.path && + !(flags & BCH_HASH_SET_MUST_REPLACE)) +- slot = bch2_trans_copy_iter(trans, iter); ++ bch2_trans_copy_iter(&slot, &iter); + + if (k.k->type != KEY_TYPE_hash_whiteout) + goto not_found; +@@ -264,8 +261,8 @@ int bch2_hash_set(struct btree_trans *trans, + if (!ret) + ret = -ENOSPC; + out: +- bch2_trans_iter_put(trans, slot); +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &slot); ++ bch2_trans_iter_exit(trans, &iter); + + return ret; + found: +@@ -277,11 +274,11 @@ not_found: + } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { + ret = -EEXIST; + } else { +- if (!found && 
slot) ++ if (!found && slot.path) + swap(iter, slot); + +- insert->k.p = iter->pos; +- ret = bch2_trans_update(trans, iter, insert, 0); ++ insert->k.p = iter.pos; ++ ret = bch2_trans_update(trans, &iter, insert, 0); + } + + goto out; +@@ -318,16 +315,16 @@ int bch2_hash_delete(struct btree_trans *trans, + const struct bch_hash_info *info, + u64 inode, const void *key) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + int ret; + +- iter = bch2_hash_lookup(trans, desc, info, inode, key, ++ ret = bch2_hash_lookup(trans, &iter, desc, info, inode, key, + BTREE_ITER_INTENT); +- if (IS_ERR(iter)) +- return PTR_ERR(iter); ++ if (ret) ++ return ret; + +- ret = bch2_hash_delete_at(trans, desc, info, iter); +- bch2_trans_iter_put(trans, iter); ++ ret = bch2_hash_delete_at(trans, desc, info, &iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index c1c3cf8f5a56..8f8476613594 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -486,12 +486,12 @@ static void __bch2_fs_free(struct bch_fs *c) + bch2_journal_entries_free(&c->journal_entries); + percpu_free_rwsem(&c->mark_lock); + +- if (c->btree_iters_bufs) ++ if (c->btree_paths_bufs) + for_each_possible_cpu(cpu) +- kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); ++ kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); + + free_percpu(c->online_reserved); +- free_percpu(c->btree_iters_bufs); ++ free_percpu(c->btree_paths_bufs); + free_percpu(c->pcpu); + mempool_exit(&c->large_bkey_pool); + mempool_exit(&c->btree_bounce_pool); +@@ -774,7 +774,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + offsetof(struct btree_write_bio, wbio.bio)), + BIOSET_NEED_BVECS) || + !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || +- !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || ++ !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || + !(c->online_reserved = alloc_percpu(u64)) || + mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, + btree_bytes(c)) || +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index b5ce336f00ca..92e58f5c6bbf 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -290,7 +290,7 @@ static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) + static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, + nr_compressed_extents = 0, +@@ -325,6 +325,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c + break; + } + } ++ bch2_trans_iter_exit(&trans, &iter); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 44b812dc4053..d5a74f4db64d 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -29,7 +29,7 @@ static void delete_test_keys(struct bch_fs *c) + static int test_delete(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_i_cookie k; + int ret; + +@@ -37,13 +37,12 @@ static int test_delete(struct bch_fs *c, u64 nr) + k.k.p.snapshot = U32_MAX; + + bch2_trans_init(&trans, c, 0, 0); +- +- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, +- BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, ++ BTREE_ITER_INTENT); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, 
+- bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { + bch_err(c, "update error in test_delete: %i", ret); + goto err; +@@ -51,8 +50,8 @@ static int test_delete(struct bch_fs *c, u64 nr) + + pr_info("deleting once"); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(iter) ?: +- bch2_btree_delete_at(&trans, iter, 0)); ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { + bch_err(c, "delete error (first) in test_delete: %i", ret); + goto err; +@@ -60,14 +59,14 @@ static int test_delete(struct bch_fs *c, u64 nr) + + pr_info("deleting twice"); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(iter) ?: +- bch2_btree_delete_at(&trans, iter, 0)); ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { + bch_err(c, "delete error (second) in test_delete: %i", ret); + goto err; + } + err: +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -75,7 +74,7 @@ err: + static int test_delete_written(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_i_cookie k; + int ret; + +@@ -84,12 +83,12 @@ static int test_delete_written(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, +- BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, ++ BTREE_ITER_INTENT); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { + bch_err(c, "update error in test_delete_written: %i", ret); + goto err; +@@ -99,14 +98,14 @@ static int test_delete_written(struct bch_fs *c, u64 nr) + bch2_journal_flush_all_pins(&c->journal); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(iter) ?: +- bch2_btree_delete_at(&trans, iter, 0)); ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { + bch_err(c, "delete error in test_delete_written: %i", ret); + goto err; + } + err: +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -114,7 +113,7 @@ err: + static int test_iterate(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter = NULL; ++ struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; +@@ -156,12 +155,12 @@ static int test_iterate(struct bch_fs *c, u64 nr) + + pr_info("iterating backwards"); + +- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) ++ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) + BUG_ON(k.k->p.offset != --i); + + BUG_ON(i); + err: +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -169,7 +168,7 @@ err: + static int test_iterate_extents(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter = NULL; ++ struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; +@@ -210,14 +209,14 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) + + pr_info("iterating backwards"); + +- while 
(!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { ++ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) { + BUG_ON(k.k->p.offset != i); + i = bkey_start_offset(k.k); + } + + BUG_ON(i); + err: +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; + } +@@ -225,7 +224,7 @@ err: + static int test_iterate_slots(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; +@@ -263,7 +262,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + BUG_ON(k.k->p.offset != i); + i += 2; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + BUG_ON(i != nr * 2); + +@@ -280,7 +279,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + if (i == nr * 2) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + err: + bch2_trans_exit(&trans); + return ret; +@@ -289,7 +288,7 @@ err: + static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; +@@ -326,7 +325,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + BUG_ON(k.k->size != 8); + i += 16; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + BUG_ON(i != nr); + +@@ -345,7 +344,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + if (i == nr) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + err: + bch2_trans_exit(&trans); + return 0; +@@ -358,21 +357,19 @@ err: + static int test_peek_end(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); +- +- k = bch2_btree_iter_peek(iter); ++ k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); + +- k = bch2_btree_iter_peek(iter); ++ k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); + +- bch2_trans_iter_put(&trans, iter); +- ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return 0; + } +@@ -380,21 +377,19 @@ static int test_peek_end(struct bch_fs *c, u64 nr) + static int test_peek_end_extents(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0); + +- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0); +- +- k = bch2_btree_iter_peek(iter); ++ k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); + +- k = bch2_btree_iter_peek(iter); ++ k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); + +- bch2_trans_iter_put(&trans, iter); +- ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return 0; + } +@@ -540,18 +535,18 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) + static int rand_lookup(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); + + 
for (i = 0; i < nr; i++) { +- bch2_btree_iter_set_pos(iter, POS(0, test_rand())); ++ bch2_btree_iter_set_pos(&iter, POS(0, test_rand())); + +- k = bch2_btree_iter_peek(iter); ++ k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) { + bch_err(c, "error in rand_lookup: %i", ret); +@@ -559,63 +554,73 @@ static int rand_lookup(struct bch_fs *c, u64 nr) + } + } + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; + } + ++static int rand_mixed_trans(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i_cookie *cookie, ++ u64 i, u64 pos) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ ++ k = bch2_btree_iter_peek(iter); ++ ret = bkey_err(k); ++ if (ret && ret != -EINTR) ++ bch_err(trans->c, "lookup error in rand_mixed: %i", ret); ++ if (ret) ++ return ret; ++ ++ if (!(i & 3) && k.k) { ++ bkey_cookie_init(&cookie->k_i); ++ cookie->k.p = iter->pos; ++ bch2_trans_update(trans, iter, &cookie->k_i, 0); ++ } ++ ++ return 0; ++} ++ + static int rand_mixed(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; +- struct bkey_s_c k; ++ struct btree_iter iter; ++ struct bkey_i_cookie cookie; + int ret = 0; +- u64 i; ++ u64 i, rand; + + bch2_trans_init(&trans, c, 0, 0); +- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); + + for (i = 0; i < nr; i++) { +- bch2_btree_iter_set_pos(iter, POS(0, test_rand())); +- +- k = bch2_btree_iter_peek(iter); +- ret = bkey_err(k); ++ rand = test_rand(); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ rand_mixed_trans(&trans, &iter, &cookie, i, rand)); + if (ret) { +- bch_err(c, "lookup error in rand_mixed: %i", ret); ++ bch_err(c, "update error in rand_mixed: %i", ret); + break; + } +- +- if (!(i & 3) && k.k) { +- struct bkey_i_cookie k; +- +- bkey_cookie_init(&k.k_i); +- k.k.p = iter->pos; +- +- ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(&trans, iter, &k.k_i, 0)); +- if (ret) { +- bch_err(c, "update error in rand_mixed: %i", ret); +- break; +- } +- } + } + +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; + } + + static int __do_delete(struct btree_trans *trans, struct bpos pos) + { +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_i delete; + struct bkey_s_c k; + int ret = 0; + +- iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos, +- BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek(iter); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -626,9 +631,9 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) + bkey_init(&delete.k); + delete.k.p = k.k->p; + +- ret = bch2_trans_update(trans, iter, &delete, 0); ++ ret = bch2_trans_update(trans, &iter, &delete, 0); + err: +- bch2_trans_iter_put(trans, iter); ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -658,7 +663,7 @@ static int rand_delete(struct bch_fs *c, u64 nr) + static int seq_insert(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie insert; + int ret = 0; +@@ -670,11 +675,11 @@ static int seq_insert(struct bch_fs *c, u64 nr) + + for_each_btree_key(&trans, iter, 
BTREE_ID_xattrs, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { +- insert.k.p = iter->pos; ++ insert.k.p = iter.pos; + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(&trans, iter, &insert.k_i, 0)); ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(&trans, &iter, &insert.k_i, 0)); + if (ret) { + bch_err(c, "error in seq_insert: %i", ret); + break; +@@ -683,7 +688,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) + if (++i == nr) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +@@ -692,7 +697,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) + static int seq_lookup(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +@@ -700,7 +705,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) + + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) + ; +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +@@ -709,7 +714,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) + static int seq_overwrite(struct bch_fs *c, u64 nr) + { + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +@@ -722,14 +727,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) + bkey_reassemble(&u.k_i, k); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(&trans, iter, &u.k_i, 0)); ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(&trans, &iter, &u.k_i, 0)); + if (ret) { + bch_err(c, "error in seq_overwrite: %i", ret); + break; + } + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index e4d400b16dba..ef6ae97e0df5 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -122,23 +122,22 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info + const char *name, void *buffer, size_t size, int type) + { + struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c_xattr xattr; + struct bkey_s_c k; + int ret; + +- iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash, +- inode->v.i_ino, +- &X_SEARCH(type, name, strlen(name)), +- 0); +- ret = PTR_ERR_OR_ZERO(iter); ++ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, ++ inode->v.i_ino, ++ &X_SEARCH(type, name, strlen(name)), ++ 0); + if (ret) +- goto err; ++ goto err1; + +- k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) +- goto err; ++ goto err2; + + xattr = bkey_s_c_to_xattr(k); + ret = le16_to_cpu(xattr.v->x_val_len); +@@ -148,8 +147,9 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info + else + memcpy(buffer, xattr_val(xattr.v), ret); + } +- bch2_trans_iter_put(trans, iter); +-err: ++err2: ++ bch2_trans_iter_exit(trans, &iter); ++err1: + return ret == -ENOENT ? 
-ENODATA : ret; + } + +@@ -279,7 +279,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) + struct bch_fs *c = dentry->d_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct btree_trans trans; +- struct btree_iter *iter; ++ struct btree_iter iter; + struct bkey_s_c k; + struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; + u64 inum = dentry->d_inode->i_ino; +@@ -301,7 +301,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) + if (ret) + break; + } +- bch2_trans_iter_put(&trans, iter); ++ bch2_trans_iter_exit(&trans, &iter); + + ret = bch2_trans_exit(&trans) ?: ret; + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index a11bb5f7180e..fe6bccb51c68 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -540,7 +540,7 @@ TRACE_EVENT(copygc_wait, + __entry->wait_amount, __entry->until) + ); + +-TRACE_EVENT(trans_get_iter, ++TRACE_EVENT(trans_get_path, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, +@@ -814,7 +814,7 @@ TRACE_EVENT(iter_traverse, + __entry->ret) + ); + +-TRACE_EVENT(iter_set_search_pos, ++TRACE_EVENT(path_set_pos, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + enum btree_id btree_id, +-- +cgit v1.2.3 + + +From 87d22ddafeb970d50212c6882e6a6f6dea805cb8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Sep 2021 21:19:48 -0400 +Subject: bcachefs: Extent btree iterators are no longer special + +Since iter->real_pos was introduced, we no longer have to deal with +extent btree iterators that have skipped past deleted keys - this is a +real performance improvement on btree updates. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 11 +++-------- + 1 file changed, 3 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 9eec445b7460..cee692bef90c 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -607,13 +607,9 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, + bch2_btree_node_iter_verify(&l->iter, l->b); + + /* +- * For interior nodes, the iterator will have skipped past +- * deleted keys: +- * +- * For extents, the iterator may have skipped past deleted keys (but not +- * whiteouts) ++ * For interior nodes, the iterator will have skipped past deleted keys: + */ +- p = level || btree_node_type_is_extents(path->btree_id) ++ p = level + ? bch2_btree_node_iter_prev(&tmp, l->b) + : bch2_btree_node_iter_prev_all(&tmp, l->b); + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); +@@ -835,8 +831,7 @@ fixup_done: + */ + if (!bch2_btree_node_iter_end(node_iter) && + iter_current_key_modified && +- (b->c.level || +- btree_node_type_is_extents(path->btree_id))) { ++ b->c.level) { + struct bset_tree *t; + struct bkey_packed *k, *k2, *p; + +-- +cgit v1.2.3 + + +From fc9679950a53e7fb1eff218ab68a7f440b2984d5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Sep 2021 21:23:11 -0400 +Subject: bcachefs: Tighten up btree locking invariants + +New rule is: if a btree path holds any locks it should be holding +precisely the locks wanted (accoringing to path->level and +path->locks_want). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 69 +++++++++++++++---------------------- + fs/bcachefs/btree_iter.h | 1 - + fs/bcachefs/btree_update_interior.c | 8 ++--- + 3 files changed, 32 insertions(+), 46 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index cee692bef90c..ea7b3400c5d0 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -227,7 +227,6 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, + ? path->l[l].b->c.lock.state.seq + : 0); + fail_idx = l; +- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + } + + l++; +@@ -238,10 +237,14 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, + * can't be relocked so bch2_btree_path_traverse has to walk back up to + * the node that we failed to relock: + */ +- while (fail_idx >= 0) { +- btree_node_unlock(path, fail_idx); +- path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; +- --fail_idx; ++ if (fail_idx >= 0) { ++ __bch2_btree_path_unlock(path); ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ ++ do { ++ path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ --fail_idx; ++ } while (fail_idx >= 0); + } + + if (path->uptodate == BTREE_ITER_NEED_RELOCK) +@@ -380,14 +383,14 @@ static void bch2_btree_path_verify_locks(struct btree_path *path) + { + unsigned l; + +- for (l = 0; btree_path_node(path, l); l++) { +- if (path->uptodate >= BTREE_ITER_NEED_RELOCK && +- !btree_node_locked(path, l)) +- continue; ++ if (!path->nodes_locked) { ++ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); ++ return; ++ } + ++ for (l = 0; btree_path_node(path, l); l++) + BUG_ON(btree_lock_want(path, l) != + btree_node_locked_type(path, l)); +- } + } + + void bch2_trans_verify_locks(struct btree_trans *trans) +@@ -425,6 +428,7 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans, + is_btree_node(path, l) + ? 
path->l[l].b->c.lock.state.seq + : 0); ++ __bch2_btree_path_unlock(path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_trans_restart(trans); + return false; +@@ -674,9 +678,6 @@ void bch2_trans_verify_paths(struct btree_trans *trans) + { + struct btree_path *path; + +- if (!bch2_debug_check_iterators) +- return; +- + trans_for_each_path(trans, path) + bch2_btree_path_verify(trans, path); + } +@@ -1017,7 +1018,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans, + } + + if (!parent_locked) +- btree_node_unlock(path, b->c.level + 1); ++ btree_node_unlock(path, plevel); + } + + static inline void __btree_path_level_init(struct btree_path *path, +@@ -1059,21 +1060,17 @@ static inline void btree_path_level_init(struct btree_trans *trans, + */ + void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) + { +- enum btree_node_locked_type t; + struct btree_path *path; + + trans_for_each_path(trans, path) + if (!path->cached && + btree_path_pos_in_node(path, b)) { +- /* +- * bch2_btree_path_node_drop() has already been called - +- * the old node we're replacing has already been +- * unlocked and the pointer invalidated +- */ +- BUG_ON(btree_node_locked(path, b->c.level)); ++ enum btree_node_locked_type t = ++ btree_lock_want(path, b->c.level); + +- t = btree_lock_want(path, b->c.level); +- if (t != BTREE_NODE_UNLOCKED) { ++ if (path->nodes_locked && ++ t != BTREE_NODE_UNLOCKED) { ++ btree_node_unlock(path, b->c.level); + six_lock_increment(&b->c.lock, t); + mark_btree_node_locked(path, b->c.level, t); + } +@@ -1082,18 +1079,6 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) + } + } + +-void bch2_trans_node_drop(struct btree_trans *trans, struct btree *b) +-{ +- struct btree_path *path; +- unsigned level = b->c.level; +- +- trans_for_each_path(trans, path) +- if (path->l[level].b == b) { +- btree_node_unlock(path, level); +- path->l[level].b = BTREE_ITER_NO_NODE_DROP; +- } +-} +- + /* + * A btree node has been modified in such a way as to invalidate iterators - fix + * them: +@@ -1383,6 +1368,9 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + { + unsigned l = path->level; + ++ if (!path->nodes_locked) ++ btree_path_get_locks(trans, path, false, _THIS_IP_); ++ + while (btree_path_node(path, l) && + !btree_path_good_node(trans, path, l, check_pos)) { + btree_node_unlock(path, l); +@@ -1583,14 +1571,12 @@ btree_path_set_pos(struct btree_trans *trans, + if (cmp < 0 || + !btree_path_advance_to_pos(path, &path->l[l], 8)) + __btree_path_level_init(path, l); +- +- /* Don't leave it locked if we're not supposed to: */ +- if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) +- btree_node_unlock(path, l); + } + +- if (l != path->level) ++ if (l != path->level) { + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ __bch2_btree_path_unlock(path); ++ } + out: + bch2_btree_path_verify(trans, path); + #ifdef CONFIG_BCACHEFS_DEBUG +@@ -2707,9 +2693,10 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + if (!path->nodes_locked) + continue; + +- pr_buf(out, " path %u %c %s:", ++ pr_buf(out, " path %u %c l=%u %s:", + path->idx, + path->cached ? 
'c' : 'b', ++ path->level, + bch2_btree_ids[path->btree_id]); + bch2_bpos_to_text(out, path->pos); + pr_buf(out, "\n"); +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 273bc7f3a29b..ed41b52a5b5d 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -193,7 +193,6 @@ static inline void bch2_btree_path_downgrade(struct btree_path *path) + void bch2_trans_downgrade(struct btree_trans *); + + void bch2_trans_node_add(struct btree_trans *trans, struct btree *); +-void bch2_trans_node_drop(struct btree_trans *, struct btree *); + void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); + + int __must_check bch2_btree_iter_traverse(struct btree_iter *); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 5534573af425..4e428d2c0eeb 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1429,7 +1429,6 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, + /* Successful split, update the path to point to the new nodes: */ + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); +- bch2_trans_node_drop(trans, b); + if (n3) + bch2_trans_node_add(trans, n3); + if (n2) +@@ -1694,14 +1693,16 @@ retry: + bch2_keylist_add(&as->parent_keys, &delete); + bch2_keylist_add(&as->parent_keys, &n->key); + ++ bch2_trans_verify_paths(trans); ++ + bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ++ bch2_trans_verify_paths(trans); ++ + bch2_btree_update_get_open_buckets(as, n); + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); + six_lock_increment(&m->c.lock, SIX_LOCK_intent); +- bch2_trans_node_drop(trans, b); +- bch2_trans_node_drop(trans, m); + + bch2_trans_node_add(trans, n); + +@@ -1798,7 +1799,6 @@ retry: + bch2_btree_update_get_open_buckets(as, n); + + six_lock_increment(&b->c.lock, SIX_LOCK_intent); +- bch2_trans_node_drop(trans, b); + bch2_trans_node_add(trans, n); + bch2_btree_node_free_inmem(trans, b); + six_unlock_intent(&n->c.lock); +-- +cgit v1.2.3 + + +From 8cee6bcfb79d0a3f946c73e5b97f2c12451522ac Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 3 Sep 2021 17:18:57 -0400 +Subject: bcachefs: Add more assertions for locking btree iterators out of + order + +btree_path_traverse_all() traverses btree iterators in sorted order, and +thus shouldn't see transaction restarts due to potential deadlocks - but +sometimes we do. This patch adds some more assertions and tracks some +more state to help track this down. 
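The invariant behind these assertions — that a traverse-all pass only ever takes a lock on the path it is currently working on, or on paths that sort before it — can be illustrated with a small standalone userspace sketch. Everything below (toy_trans, toy_path, toy_path_lock, MAX_PATHS) is invented for illustration and is not bcachefs code:

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>

#define MAX_PATHS 8

struct toy_path {
	pthread_mutex_t	lock;
	unsigned	sorted_idx;	/* position in the traversal order */
};

struct toy_trans {
	struct toy_path	paths[MAX_PATHS];
	unsigned	nr_paths;
	bool		in_traverse_all;
	unsigned	traverse_all_idx;	/* path currently being traversed */
};

/* Rough analogue of the ordering assertion added when a node is locked: */
static void toy_path_lock(struct toy_trans *trans, struct toy_path *path)
{
	pthread_mutex_lock(&path->lock);

	/*
	 * While traversing paths in sorted order, we must never take a lock
	 * on a path that sorts after the one currently being traversed:
	 */
	assert(!trans->in_traverse_all ||
	       path->sorted_idx <= trans->traverse_all_idx);
}

int main(void)
{
	struct toy_trans trans = { .nr_paths = 3, .in_traverse_all = true };

	for (unsigned i = 0; i < trans.nr_paths; i++) {
		pthread_mutex_init(&trans.paths[i].lock, NULL);
		trans.paths[i].sorted_idx = i;
	}

	/* Locking path i while path i is the one being traversed is fine: */
	for (unsigned i = 0; i < trans.nr_paths; i++) {
		trans.traverse_all_idx = i;
		toy_path_lock(&trans, &trans.paths[i]);
	}

	for (unsigned i = 0; i < trans.nr_paths; i++)
		pthread_mutex_unlock(&trans.paths[i].lock);
	return 0;
}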
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 25 ++++++++++++++++--------- + fs/bcachefs/btree_key_cache.c | 4 ++-- + fs/bcachefs/btree_locking.h | 17 ++++++++++++++--- + fs/bcachefs/btree_types.h | 2 ++ + 4 files changed, 34 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ea7b3400c5d0..331e2a86dbfd 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -160,7 +160,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, want))) { +- mark_btree_node_locked(path, level, want); ++ mark_btree_node_locked(trans, path, level, want); + return true; + } else { + return false; +@@ -196,7 +196,7 @@ static bool bch2_btree_node_upgrade(struct btree_trans *trans, + + return false; + success: +- mark_btree_node_intent_locked(path, level); ++ mark_btree_node_intent_locked(trans, path, level); + return true; + } + +@@ -927,12 +927,12 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, + bch2_btree_node_iter_peek_all(&l->iter, l->b)); + } + +-static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c, ++static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) + { +- struct bkey_s_c k = __btree_iter_unpack(c, l, u, ++ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + path->pos = k.k ? k.k->p : l->b->key.k.p; +@@ -1072,7 +1072,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) + t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(path, b->c.level); + six_lock_increment(&b->c.lock, t); +- mark_btree_node_locked(path, b->c.level, t); ++ mark_btree_node_locked(trans, path, b->c.level, t); + } + + btree_path_level_init(trans, path, b); +@@ -1149,7 +1149,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, + for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + +- mark_btree_node_locked(path, path->level, lock_type); ++ mark_btree_node_locked(trans, path, path->level, lock_type); + btree_path_level_init(trans, path, b); + return 0; + } +@@ -1241,7 +1241,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, + if (unlikely(ret)) + goto err; + +- mark_btree_node_locked(path, level, lock_type); ++ mark_btree_node_locked(trans, path, level, lock_type); + btree_path_level_init(trans, path, b); + + if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && +@@ -1283,6 +1283,10 @@ retry_all: + + btree_trans_verify_sorted(trans); + ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->traverse_all_idx = U8_MAX; ++#endif ++ + for (i = trans->nr_sorted - 2; i >= 0; --i) { + struct btree_path *path1 = trans->paths + trans->sorted[i]; + struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; +@@ -1318,6 +1322,9 @@ retry_all: + /* Now, redo traversals in correct order: */ + trans_for_each_path_inorder(trans, path) { + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->traverse_all_idx = path->idx; ++#endif + + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + if (ret) +@@ -2088,7 +2095,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + goto out; + } + +- k = btree_path_level_peek(trans->c, iter->path, ++ k = btree_path_level_peek(trans, 
iter->path, + &iter->path->l[0], &iter->k); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) +@@ -2343,7 +2350,7 @@ static inline void btree_path_list_add(struct btree_trans *trans, + + btree_trans_verify_sorted_refs(trans); + +- path->sorted_idx = pos ? pos->sorted_idx : trans->nr_sorted; ++ path->sorted_idx = pos ? pos->sorted_idx + 1 : 0; + + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 2dfa5040d045..938ced36af73 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -296,7 +296,7 @@ retry: + if (!ck) + goto retry; + +- mark_btree_node_locked(path, 0, SIX_LOCK_intent); ++ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + path->locks_want = 1; + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); +@@ -318,7 +318,7 @@ retry: + goto retry; + } + +- mark_btree_node_locked(path, 0, lock_want); ++ mark_btree_node_locked(trans, path, 0, lock_want); + } + + path->l[0].lock_seq = ck->c.lock.state.seq; +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index d599008c5fc1..5c6b758070e1 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -58,7 +58,8 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, + path->nodes_intent_locked &= ~(1 << level); + } + +-static inline void mark_btree_node_locked(struct btree_path *path, ++static inline void mark_btree_node_locked(struct btree_trans *trans, ++ struct btree_path *path, + unsigned level, + enum six_lock_type type) + { +@@ -68,12 +69,19 @@ static inline void mark_btree_node_locked(struct btree_path *path, + + path->nodes_locked |= 1 << level; + path->nodes_intent_locked |= type << level; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ path->ip_locked = _RET_IP_; ++ BUG_ON(trans->in_traverse_all && ++ trans->traverse_all_idx != U8_MAX && ++ path->sorted_idx > trans->paths[trans->traverse_all_idx].sorted_idx); ++#endif + } + +-static inline void mark_btree_node_intent_locked(struct btree_path *path, ++static inline void mark_btree_node_intent_locked(struct btree_trans *trans, ++ struct btree_path *path, + unsigned level) + { +- mark_btree_node_locked(path, level, SIX_LOCK_intent); ++ mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); + } + + static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) +@@ -112,6 +120,9 @@ static inline void __bch2_btree_path_unlock(struct btree_path *path) + + while (path->nodes_locked) + btree_node_unlock(path, __ffs(path->nodes_locked)); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ path->ip_locked = 0; ++#endif + } + + static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 59a6b395d0e0..62aae4acc6b5 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -255,6 +255,7 @@ struct btree_path { + } l[BTREE_MAX_DEPTH]; + #ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; ++ unsigned long ip_locked; + #endif + }; + +@@ -368,6 +369,7 @@ struct btree_trans { + struct bpos locking_pos; + u8 locking_btree_id; + u8 locking_level; ++ u8 traverse_all_idx; + pid_t pid; + #endif + unsigned long ip; +-- +cgit v1.2.3 + + +From a929f98619fbce3513035396a5fef441163a76cd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 5 Sep 2021 00:05:08 -0400 +Subject: bcachefs: Drop some fast path tracepoints + +These haven't turned out to be useful + 
+Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 42 +------ + fs/bcachefs/btree_update_leaf.c | 1 - + include/trace/events/bcachefs.h | 247 ---------------------------------------- + 3 files changed, 1 insertion(+), 289 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 331e2a86dbfd..57cc0191aa70 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -213,21 +213,8 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, + + if (!(upgrade + ? bch2_btree_node_upgrade(trans, path, l) +- : bch2_btree_node_relock(trans, path, l))) { +- (upgrade +- ? trace_node_upgrade_fail +- : trace_node_relock_fail)(trans->ip, trace_ip, +- path->cached, +- path->btree_id, &path->pos, +- l, path->l[l].lock_seq, +- is_btree_node(path, l) +- ? 0 +- : (unsigned long) path->l[l].b, +- is_btree_node(path, l) +- ? path->l[l].b->c.lock.state.seq +- : 0); ++ : bch2_btree_node_relock(trans, path, l))) + fail_idx = l; +- } + + l++; + } while (l < path->locks_want); +@@ -418,16 +405,6 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans, + l < path->locks_want && btree_path_node(path, l); + l++) { + if (!bch2_btree_node_relock(trans, path, l)) { +- trace_node_relock_fail(trans->ip, _RET_IP_, +- path->cached, +- path->btree_id, &path->pos, +- l, path->l[l].lock_seq, +- is_btree_node(path, l) +- ? 0 +- : (unsigned long) path->l[l].b, +- is_btree_node(path, l) +- ? path->l[l].b->c.lock.state.seq +- : 0); + __bch2_btree_path_unlock(path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_trans_restart(trans); +@@ -1471,9 +1448,6 @@ static int btree_path_traverse_one(struct btree_trans *trans, + path->uptodate = BTREE_ITER_UPTODATE; + out: + BUG_ON((ret == -EINTR) != !!trans->restarted); +- trace_iter_traverse(trans->ip, trace_ip, +- path->cached, +- path->btree_id, &path->pos, ret); + bch2_btree_path_verify(trans, path); + return ret; + } +@@ -1540,9 +1514,6 @@ btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent) + { +-#ifdef CONFIG_BCACHEFS_DEBUG +- struct bpos old_pos = path->pos; +-#endif + int cmp = bpos_cmp(new_pos, path->pos); + unsigned l = path->level; + +@@ -1586,10 +1557,6 @@ btree_path_set_pos(struct btree_trans *trans, + } + out: + bch2_btree_path_verify(trans, path); +-#ifdef CONFIG_BCACHEFS_DEBUG +- trace_path_set_pos(trans->ip, _RET_IP_, path->btree_id, +- &old_pos, &new_pos, l); +-#endif + return path; + } + +@@ -1718,7 +1685,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + bool intent) + { + struct btree_path *path, *path_pos = NULL; +- struct bpos pos_min = POS_MIN; + int i; + + BUG_ON(trans->restarted); +@@ -1781,12 +1747,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + btree_path_get_locks(trans, path, true, _THIS_IP_); + } + +- trace_trans_get_path(_RET_IP_, trans->ip, btree_id, +- &pos, locks_want, path->uptodate, +- path_pos ? &path_pos->pos : &pos_min, +- path_pos ? path_pos->locks_want : U8_MAX, +- path_pos ? 
path_pos->uptodate : U8_MAX); +- + return path; + } + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index b70c65b0dc8c..904fc72f5f75 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -225,7 +225,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); + +- trace_btree_insert_key(c, b, insert->k); + return true; + } + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index fe6bccb51c68..06fda4ddb7ba 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -298,28 +298,6 @@ TRACE_EVENT(btree_reserve_get_fail, + __entry->required, __entry->cl) + ); + +-TRACE_EVENT(btree_insert_key, +- TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), +- TP_ARGS(c, b, k), +- +- TP_STRUCT__entry( +- __field(u8, id ) +- __field(u64, inode ) +- __field(u64, offset ) +- __field(u32, size ) +- ), +- +- TP_fast_assign( +- __entry->id = b->c.btree_id; +- __entry->inode = k->k.p.inode; +- __entry->offset = k->k.p.offset; +- __entry->size = k->k.size; +- ), +- +- TP_printk("btree %u: %llu:%llu len %u", __entry->id, +- __entry->inode, __entry->offset, __entry->size) +-); +- + DEFINE_EVENT(btree_node, btree_split, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +@@ -540,69 +518,6 @@ TRACE_EVENT(copygc_wait, + __entry->wait_amount, __entry->until) + ); + +-TRACE_EVENT(trans_get_path, +- TP_PROTO(unsigned long trans_ip, +- unsigned long caller_ip, +- enum btree_id btree_id, +- struct bpos *got_pos, +- unsigned got_locks, +- unsigned got_uptodate, +- struct bpos *src_pos, +- unsigned src_locks, +- unsigned src_uptodate), +- TP_ARGS(trans_ip, caller_ip, btree_id, +- got_pos, got_locks, got_uptodate, +- src_pos, src_locks, src_uptodate), +- +- TP_STRUCT__entry( +- __field(unsigned long, trans_ip ) +- __field(unsigned long, caller_ip ) +- __field(u8, btree_id ) +- __field(u64, got_pos_inode ) +- __field(u64, got_pos_offset ) +- __field(u32, got_pos_snapshot ) +- __field(u8, got_locks ) +- __field(u8, got_uptodate ) +- __field(u64, src_pos_inode ) +- __field(u64, src_pos_offset ) +- __field(u32, src_pos_snapshot ) +- __field(u8, src_locks ) +- __field(u8, src_uptodate ) +- ), +- +- TP_fast_assign( +- __entry->trans_ip = trans_ip; +- __entry->caller_ip = caller_ip; +- __entry->btree_id = btree_id; +- __entry->got_pos_inode = got_pos->inode; +- __entry->got_pos_offset = got_pos->offset; +- __entry->got_pos_snapshot = got_pos->snapshot; +- __entry->got_locks = got_locks; +- __entry->got_uptodate = got_uptodate; +- __entry->src_pos_inode = src_pos->inode; +- __entry->src_pos_offset = src_pos->offset; +- __entry->src_pos_snapshot = src_pos->snapshot; +- __entry->src_locks = src_locks; +- __entry->src_uptodate = src_uptodate; +- ), +- +- TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u " +- "src %llu:%llu:%u l %u u %u", +- (void *) __entry->trans_ip, +- (void *) __entry->caller_ip, +- __entry->btree_id, +- __entry->got_pos_inode, +- __entry->got_pos_offset, +- __entry->got_pos_snapshot, +- __entry->got_locks, +- __entry->got_uptodate, +- __entry->src_pos_inode, +- __entry->src_pos_offset, +- __entry->src_pos_snapshot, +- __entry->src_locks, +- __entry->src_uptodate) +-); +- + TRACE_EVENT(transaction_restart_ip, + TP_PROTO(unsigned long caller, unsigned long ip), + TP_ARGS(caller, ip), +@@ -772,96 +687,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, + 
TP_ARGS(trans_ip, caller_ip, btree_id, pos) + ); + +-TRACE_EVENT(iter_traverse, +- TP_PROTO(unsigned long trans_ip, +- unsigned long caller_ip, +- bool key_cache, +- enum btree_id btree_id, +- struct bpos *pos, +- int ret), +- TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, ret), +- +- TP_STRUCT__entry( +- __field(unsigned long, trans_ip ) +- __field(unsigned long, caller_ip ) +- __field(u8, key_cache ) +- __field(u8, btree_id ) +- __field(u64, pos_inode ) +- __field(u64, pos_offset ) +- __field(u32, pos_snapshot ) +- __field(s32, ret ) +- ), +- +- TP_fast_assign( +- __entry->trans_ip = trans_ip; +- __entry->caller_ip = caller_ip; +- __entry->key_cache = key_cache; +- __entry->btree_id = btree_id; +- __entry->pos_inode = pos->inode; +- __entry->pos_offset = pos->offset; +- __entry->pos_snapshot = pos->snapshot; +- __entry->ret = ret; +- ), +- +- TP_printk("%ps %pS key cache %u btree %u %llu:%llu:%u ret %i", +- (void *) __entry->trans_ip, +- (void *) __entry->caller_ip, +- __entry->key_cache, +- __entry->btree_id, +- __entry->pos_inode, +- __entry->pos_offset, +- __entry->pos_snapshot, +- __entry->ret) +-); +- +-TRACE_EVENT(path_set_pos, +- TP_PROTO(unsigned long trans_ip, +- unsigned long caller_ip, +- enum btree_id btree_id, +- struct bpos *old_pos, +- struct bpos *new_pos, +- unsigned good_level), +- TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level), +- +- TP_STRUCT__entry( +- __field(unsigned long, trans_ip ) +- __field(unsigned long, caller_ip ) +- __field(u8, btree_id ) +- __field(u64, old_pos_inode ) +- __field(u64, old_pos_offset ) +- __field(u32, old_pos_snapshot ) +- __field(u64, new_pos_inode ) +- __field(u64, new_pos_offset ) +- __field(u32, new_pos_snapshot ) +- __field(u8, good_level ) +- ), +- +- TP_fast_assign( +- __entry->trans_ip = trans_ip; +- __entry->caller_ip = caller_ip; +- __entry->btree_id = btree_id; +- __entry->old_pos_inode = old_pos->inode; +- __entry->old_pos_offset = old_pos->offset; +- __entry->old_pos_snapshot = old_pos->snapshot; +- __entry->new_pos_inode = new_pos->inode; +- __entry->new_pos_offset = new_pos->offset; +- __entry->new_pos_snapshot = new_pos->snapshot; +- __entry->good_level = good_level; +- ), +- +- TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u", +- (void *) __entry->trans_ip, +- (void *) __entry->caller_ip, +- __entry->btree_id, +- __entry->old_pos_inode, +- __entry->old_pos_offset, +- __entry->old_pos_snapshot, +- __entry->new_pos_inode, +- __entry->new_pos_offset, +- __entry->new_pos_snapshot, +- __entry->good_level) +-); +- + TRACE_EVENT(trans_restart_would_deadlock, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, +@@ -954,78 +779,6 @@ TRACE_EVENT(trans_restart_mem_realloced, + __entry->bytes) + ); + +-DECLARE_EVENT_CLASS(node_lock_fail, +- TP_PROTO(unsigned long trans_ip, +- unsigned long caller_ip, +- bool key_cache, +- enum btree_id btree_id, +- struct bpos *pos, +- unsigned level, u32 iter_seq, unsigned node, u32 node_seq), +- TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, +- level, iter_seq, node, node_seq), +- +- TP_STRUCT__entry( +- __field(unsigned long, trans_ip ) +- __field(unsigned long, caller_ip ) +- __field(u8, key_cache ) +- __field(u8, btree_id ) +- __field(u64, pos_inode ) +- __field(u64, pos_offset ) +- __field(u32, pos_snapshot ) +- __field(u32, level ) +- __field(u32, iter_seq ) +- __field(u32, node ) +- __field(u32, node_seq ) +- ), +- +- TP_fast_assign( +- __entry->trans_ip = trans_ip; +- __entry->caller_ip = caller_ip; +- 
__entry->key_cache = key_cache; +- __entry->btree_id = btree_id; +- __entry->pos_inode = pos->inode; +- __entry->pos_offset = pos->offset; +- __entry->pos_snapshot = pos->snapshot; +- __entry->level = level; +- __entry->iter_seq = iter_seq; +- __entry->node = node; +- __entry->node_seq = node_seq; +- ), +- +- TP_printk("%ps %pS key cache %u btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u", +- (void *) __entry->trans_ip, +- (void *) __entry->caller_ip, +- __entry->key_cache, +- __entry->btree_id, +- __entry->pos_inode, +- __entry->pos_offset, +- __entry->pos_snapshot, +- __entry->level, __entry->iter_seq, +- __entry->node, __entry->node_seq) +-); +- +-DEFINE_EVENT(node_lock_fail, node_upgrade_fail, +- TP_PROTO(unsigned long trans_ip, +- unsigned long caller_ip, +- bool key_cache, +- enum btree_id btree_id, +- struct bpos *pos, +- unsigned level, u32 iter_seq, unsigned node, u32 node_seq), +- TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, +- level, iter_seq, node, node_seq) +-); +- +-DEFINE_EVENT(node_lock_fail, node_relock_fail, +- TP_PROTO(unsigned long trans_ip, +- unsigned long caller_ip, +- bool key_cache, +- enum btree_id btree_id, +- struct bpos *pos, +- unsigned level, u32 iter_seq, unsigned node, u32 node_seq), +- TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, +- level, iter_seq, node, node_seq) +-); +- + #endif /* _TRACE_BCACHE_H */ + + /* This part must be outside protection */ +-- +cgit v1.2.3 + + +From 41d7f06fa254200f548c48552c1a4d75ea09b720 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 5 Sep 2021 00:22:32 -0400 +Subject: bcachefs: Kill retry loop in btree merge path + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 34 +++++----------------------------- + fs/bcachefs/btree_update_interior.h | 6 +----- + 2 files changed, 6 insertions(+), 34 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 4e428d2c0eeb..5a1420b392ba 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1572,12 +1572,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + struct btree *b, *m, *n, *prev, *next, *parent; + struct bpos sib_pos; + size_t sib_u64s; +- int ret = 0, ret2 = 0; +- +-retry: +- ret = bch2_btree_path_traverse(trans, path, false); +- if (ret) +- return ret; ++ int ret = 0; + + BUG_ON(!path->should_be_locked); + BUG_ON(!btree_node_locked(path, level)); +@@ -1587,7 +1582,7 @@ retry: + if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || + (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { + b->sib_u64s[sib] = U16_MAX; +- goto out; ++ return 0; + } + + sib_pos = sib == btree_prev_sib +@@ -1715,29 +1710,10 @@ retry: + + bch2_btree_update_done(as); + out: +- bch2_trans_verify_locks(trans); +- if (sib_path) +- bch2_path_put(trans, sib_path, true); +- +- /* +- * Don't downgrade locks here: we're called after successful insert, +- * and the caller will downgrade locks after a successful insert +- * anyways (in case e.g. 
a split was required first) +- * +- * And we're also called when inserting into interior nodes in the +- * split path, and downgrading to read locks in there is potentially +- * confusing: +- */ +- return ret ?: ret2; + err: +- if (sib_path) +- bch2_path_put(trans, sib_path, true); +- sib_path = NULL; +- +- if (ret == -EINTR && bch2_trans_relock(trans)) +- goto retry; +- +- goto out; ++ bch2_path_put(trans, sib_path, true); ++ bch2_trans_verify_locks(trans); ++ return ret; + } + + /** +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index c06cfcc66db7..8e03bd987d6d 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -129,11 +129,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, + { + struct btree *b; + +- if (path->uptodate >= BTREE_ITER_NEED_TRAVERSE) +- return 0; +- +- if (!bch2_btree_node_relock(trans, path, level)) +- return 0; ++ EBUG_ON(!btree_node_locked(path, level)); + + b = path->l[level].b; + if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) +-- +cgit v1.2.3 + + +From 2e509d3c9bce6f8112ca3aadf9002a5fc2884568 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 6 Sep 2021 15:38:12 -0400 +Subject: bcachefs: No need to clone iterators for update + +Since btree_path is now internally refcounted, we don't need to clone an +iterator before calling bch2_trans_update() if we'll be mutating it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 20 +++++++------------- + 1 file changed, 7 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 904fc72f5f75..c53e47399d27 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -887,10 +887,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + bkey_reassemble(update, k); + + if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) { +- bch2_trans_copy_iter(&update_iter, &iter); +- ret = bch2_btree_delete_at(trans, &update_iter, flags); +- bch2_trans_iter_exit(trans, &update_iter); +- ++ ret = bch2_btree_delete_at(trans, &iter, flags); + if (ret) + goto err; + +@@ -935,10 +932,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + } + + if (bkey_cmp(k.k->p, insert->k.p) <= 0) { +- bch2_trans_copy_iter(&update_iter, &iter); +- ret = bch2_btree_delete_at(trans, &update_iter, flags); +- bch2_trans_iter_exit(trans, &update_iter); +- ++ ret = bch2_btree_delete_at(trans, &iter, flags); + if (ret) + goto err; + } +@@ -951,9 +945,10 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); + +- bch2_trans_copy_iter(&update_iter, &iter); +- bch2_trans_update(trans, &update_iter, update, flags); +- bch2_trans_iter_exit(trans, &update_iter); ++ ret = bch2_trans_update(trans, &iter, update, flags); ++ if (ret) ++ goto err; ++ + goto out; + } + next: +@@ -1056,8 +1051,7 @@ int __bch2_btree_insert(struct btree_trans *trans, + int ret; + + bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), +- BTREE_ITER_INTENT); +- ++ BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); +-- +cgit v1.2.3 + + +From b9aa2d525afb25324810106d59013c9b324866fb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Sep 2021 00:58:13 -0400 +Subject: bcachefs: Enabled shard_inode_numbers by default + 
+We'd like performance increasing options to be on by default. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/opts.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 003c00f25037..147b4021fdae 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -171,7 +171,7 @@ enum opt_type { + x(shard_inode_numbers, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ +- BCH_SB_SHARD_INUMS, false, \ ++ BCH_SB_SHARD_INUMS, true, \ + NULL, "Shard new inode numbers by CPU id") \ + x(inodes_use_key_cache, u8, \ + OPT_FORMAT|OPT_MOUNT, \ +-- +cgit v1.2.3 + + +From 8fa8df12cf441d859033110ffff86724a4e67d11 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Sep 2021 13:55:33 -0400 +Subject: bcachefs: Add a missing btree_path_make_mut() call + +Also add another small helper, btree_path_clone(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 57cc0191aa70..95fda480644b 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1488,17 +1488,23 @@ static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, + btree_path_check_sort(trans, dst, 0); + } + ++static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, ++ bool intent) ++{ ++ struct btree_path *new = btree_path_alloc(trans, src); ++ ++ btree_path_copy(trans, new, src); ++ __btree_path_get(new, intent); ++ return new; ++} ++ + inline struct btree_path * __must_check + bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent) + { + if (path->ref > 1 || path->preserve) { +- struct btree_path *new = btree_path_alloc(trans, path); +- +- btree_path_copy(trans, new, path); +- __btree_path_get(new, intent); + __btree_path_put(path, intent); +- path = new; ++ path = btree_path_clone(trans, path, intent); + path->preserve = false; + #ifdef CONFIG_BCACHEFS_DEBUG + path->ip_allocated = _RET_IP_; +@@ -2004,6 +2010,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + + cmp = bpos_cmp(k.k->p, iter->path->pos); + if (cmp) { ++ iter->path = bch2_btree_path_make_mut(trans, iter->path, ++ iter->flags & BTREE_ITER_INTENT); + iter->path->pos = k.k->p; + btree_path_check_sort(trans, iter->path, cmp); + } +-- +cgit v1.2.3 + + +From 03db8a877d661a4f1fe8ff06f136a38d79dafc97 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Sep 2021 15:34:16 -0400 +Subject: bcachefs: Optimize btree lookups in write path + +This patch significantly reduces the number of btree lookups required in +the extent update path. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 8 +++++++- + fs/bcachefs/btree_iter.h | 1 + + fs/bcachefs/btree_update_leaf.c | 9 ++++++++- + fs/bcachefs/io.c | 10 ++++++++++ + 4 files changed, 26 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 95fda480644b..75319021ade3 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1796,6 +1796,12 @@ hole: + + /* Btree iterators: */ + ++int __must_check ++__bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); ++} ++ + int __must_check + bch2_btree_iter_traverse(struct btree_iter *iter) + { +@@ -2371,7 +2377,7 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + iter->path = bch2_path_get(trans, + flags & BTREE_ITER_CACHED, + btree_id, +- btree_iter_search_key(iter), ++ iter->pos, + locks_want, + depth, + flags & BTREE_ITER_INTENT); +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index ed41b52a5b5d..b2b152bc04db 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -195,6 +195,7 @@ void bch2_trans_downgrade(struct btree_trans *); + void bch2_trans_node_add(struct btree_trans *trans, struct btree *); + void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); + ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); + int __must_check bch2_btree_iter_traverse(struct btree_iter *); + + struct btree *bch2_btree_iter_peek_node(struct btree_iter *); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index c53e47399d27..ad5a845efebb 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -962,7 +962,14 @@ next: + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + out: + if (!bkey_deleted(&insert->k)) { +- bch2_btree_iter_set_pos(&iter, insert->k.p); ++ /* ++ * Rewinding iterators is expensive: get a new one and the one ++ * that points to the start of insert will be cloned from: ++ */ ++ bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, insert, flags); + } +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 51ce8a134243..f459dcb69ecd 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -271,6 +271,16 @@ int bch2_extent_update(struct btree_trans *trans, + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + ++ /* ++ * This traverses us the iterator without changing iter->path->pos to ++ * search_key() (which is pos + 1 for extents): we want there to be a ++ * path already traversed at iter->pos because ++ * bch2_trans_extent_update() will use it to attempt extent merging ++ */ ++ ret = __bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ + ret = bch2_extent_trim_atomic(trans, iter, k); + if (ret) + return ret; +-- +cgit v1.2.3 + + +From 68502c5edede4160cdcacad2e5bee1b78c146278 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Sep 2021 20:25:19 -0400 +Subject: bcachefs: Inline array of sorted paths in btree_trans + +It's small, and frequently used - this gets rid of a data dependency +load. 
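The data-dependency point is the usual one: an array reached through a pointer member costs an extra dependent load (fetch the pointer, then the element), while an inline array is addressed directly off the struct base. A minimal sketch of the two layouts — toy_trans_* and TOY_ITER_MAX are made-up names, not the real structures:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_ITER_MAX 64

struct toy_trans_indirect {
	uint8_t *sorted;		/* extra pointer chase on every access */
};

struct toy_trans_inline {
	uint8_t sorted[TOY_ITER_MAX];	/* one load: base + offset + index */
};

int main(void)
{
	uint8_t heap_sorted[TOY_ITER_MAX];
	struct toy_trans_indirect a = { .sorted = heap_sorted };
	struct toy_trans_inline b;

	memset(heap_sorted, 7, sizeof(heap_sorted));
	memset(b.sorted, 7, sizeof(b.sorted));

	/* a.sorted[3]: load a.sorted, then load the element (two dependent loads) */
	/* b.sorted[3]: load directly at &b + offsetof(sorted) + 3 (one load)      */
	printf("%d %d\n", a.sorted[3], b.sorted[3]);
	return 0;
}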
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 3 --- + fs/bcachefs/btree_types.h | 2 +- + 2 files changed, 1 insertion(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 75319021ade3..f683a51a4d45 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2510,7 +2510,6 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) + { + size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; + size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; +- size_t sorted_bytes = sizeof(u8) * BTREE_ITER_MAX; + void *p = NULL; + + BUG_ON(trans->used_mempool); +@@ -2523,7 +2522,6 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) + + trans->paths = p; p += paths_bytes; + trans->updates = p; p += updates_bytes; +- trans->sorted = p; p += sorted_bytes; + } + + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, +@@ -2730,7 +2728,6 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) + + return init_srcu_struct(&c->btree_trans_barrier) ?: + mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, +- sizeof(u8) * nr + + sizeof(struct btree_path) * nr + + sizeof(struct btree_insert_entry) * nr) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 62aae4acc6b5..ccf91ebd94aa 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -393,7 +393,7 @@ struct btree_trans { + unsigned mem_bytes; + void *mem; + +- u8 *sorted; ++ u8 sorted[BTREE_ITER_MAX]; + struct btree_path *paths; + struct btree_insert_entry *updates; + +-- +cgit v1.2.3 + + +From 72c2b4f048844271fbc005f41e4bbec35a996a85 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Sep 2021 20:27:55 -0400 +Subject: bcachefs: Make trans_for_each_path_inorder() faster + +Again, getting rid of data dependencies. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 16 ++++++++++++---- + fs/bcachefs/btree_iter.h | 8 ++++---- + 2 files changed, 16 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f683a51a4d45..1d6f6330bca8 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1297,7 +1297,10 @@ retry_all: + BUG_ON(ret && ret != -EINTR); + + /* Now, redo traversals in correct order: */ +- trans_for_each_path_inorder(trans, path) { ++ i = 0; ++ while (i < trans->nr_sorted) { ++ path = trans->paths + trans->sorted[i]; ++ + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + #ifdef CONFIG_BCACHEFS_DEBUG + trans->traverse_all_idx = path->idx; +@@ -1308,6 +1311,9 @@ retry_all: + goto retry_all; + + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); ++ ++ if (path->nodes_locked) ++ i++; + } + + /* +@@ -1634,11 +1640,12 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + { + struct btree_path *path; + struct btree_insert_entry *i; ++ unsigned idx; + char buf[300]; + + btree_trans_verify_sorted(trans); + +- trans_for_each_path_inorder(trans, path) ++ trans_for_each_path_inorder(trans, path, idx) + printk(KERN_ERR "path: idx %u ref %u:%u%s btree %s pos %s %pS\n", + path->idx, path->ref, path->intent_ref, + path->preserve ? 
" preserve" : "", +@@ -1695,7 +1702,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + + BUG_ON(trans->restarted); + +- trans_for_each_path_inorder(trans, path) { ++ trans_for_each_path_inorder(trans, path, i) { + if (__btree_path_cmp(path, + btree_id, + cached, +@@ -2250,8 +2257,9 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) + { + #ifdef CONFIG_BCACHEFS_DEBUG + struct btree_path *path, *prev = NULL; ++ unsigned i; + +- trans_for_each_path_inorder(trans, path) { ++ trans_for_each_path_inorder(trans, path, i) { + BUG_ON(prev && btree_path_cmp(prev, path) > 0); + prev = path; + } +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index b2b152bc04db..be1bb489f3d6 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -99,10 +99,10 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru + : NULL; + } + +-#define trans_for_each_path_inorder(_trans, _path) \ +- for (_path = next_btree_path(trans, NULL); \ +- (_path); \ +- _path = next_btree_path((_trans), (_path))) ++#define trans_for_each_path_inorder(_trans, _path, _i) \ ++ for (_i = 0; \ ++ ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ ++ _i++) + + static inline bool __path_has_node(const struct btree_path *path, + const struct btree *b) +-- +cgit v1.2.3 + + +From a8c87f9cd4931208586a257ee9bf433d6078e64f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Sep 2021 20:23:30 -0400 +Subject: bcachefs: Consolidate intent lock code in + btree_path_up_until_good_node + +We need to take all needed intent locks when relocking an iterator: +bch2_btree_path_traverse() had a special cased, faster version of this, +but it really should be in up_until_good_node() so that set_pos() can +use it too. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 29 +++++++++++++---------------- + 1 file changed, 13 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 1d6f6330bca8..a3927397b850 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1356,10 +1356,7 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, + int check_pos) + { +- unsigned l = path->level; +- +- if (!path->nodes_locked) +- btree_path_get_locks(trans, path, false, _THIS_IP_); ++ unsigned i, l = path->level; + + while (btree_path_node(path, l) && + !btree_path_good_node(trans, path, l, check_pos)) { +@@ -1368,6 +1365,17 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + l++; + } + ++ /* If we need intent locks, take them too: */ ++ for (i = l + 1; ++ i < path->locks_want && btree_path_node(path, i); ++ i++) ++ if (!bch2_btree_node_relock(trans, path, i)) ++ while (l <= i) { ++ btree_node_unlock(path, l); ++ path->l[l].b = BTREE_ITER_NO_NODE_UP; ++ l++; ++ } ++ + return l; + } + +@@ -1385,7 +1393,7 @@ static int btree_path_traverse_one(struct btree_trans *trans, + unsigned flags, + unsigned long trace_ip) + { +- unsigned l, depth_want = path->level; ++ unsigned depth_want = path->level; + int ret = 0; + + /* +@@ -1407,17 +1415,6 @@ static int btree_path_traverse_one(struct btree_trans *trans, + + path->level = btree_path_up_until_good_node(trans, path, 0); + +- /* If we need intent locks, take them too: */ +- for (l = path->level + 1; +- l < path->locks_want && btree_path_node(path, l); +- l++) +- if (!bch2_btree_node_relock(trans, path, l)) +- while (path->level <= l) { +- btree_node_unlock(path, path->level); +- path->l[path->level].b = BTREE_ITER_NO_NODE_UP; +- path->level++; +- } +- + /* + * Note: path->nodes[path->level] may be temporarily NULL here - that + * would indicate to other code that we got to the end of the btree, +-- +cgit v1.2.3 + + +From c2878c80488fcff2ef66ca0fd9e552fae56543b8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Sep 2021 21:24:05 -0400 +Subject: bcachefs: normalize_read_intent_locks + +This is a new approach to avoiding the self deadlock we'd get if we +tried to take a write lock on a node while holding a read lock - we +simply upgrade the readers to intent. 
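Roughly: if the same transaction holds both a read lock and an intent lock on one node, promote its readers to intent before write-locking, so the writer can never block on its own readers. A toy version of that scan, with invented types and no real locking, might look like:

#include <stdio.h>

enum toy_lock_type { TOY_UNLOCKED, TOY_READ, TOY_INTENT };

struct toy_path {
	int			node;	/* which node this path points at */
	enum toy_lock_type	lock;
};

static void normalize_read_intent(struct toy_path *paths, int nr)
{
	for (int i = 0; i < nr; i++) {
		if (paths[i].lock != TOY_INTENT)
			continue;

		/* Another of our paths holds the same node for read: upgrade it. */
		for (int j = 0; j < nr; j++)
			if (paths[j].node == paths[i].node &&
			    paths[j].lock == TOY_READ)
				paths[j].lock = TOY_INTENT;
	}
}

int main(void)
{
	struct toy_path paths[] = {
		{ .node = 1, .lock = TOY_READ },
		{ .node = 1, .lock = TOY_INTENT },
		{ .node = 2, .lock = TOY_READ },	/* no intent holder: stays read */
	};

	normalize_read_intent(paths, 3);

	for (int i = 0; i < 3; i++)
		printf("node %d: %s\n", paths[i].node,
		       paths[i].lock == TOY_INTENT ? "intent" : "read");
	return 0;
}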
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 73 ++++++++++++++++++++++++++++++++++------- + 1 file changed, 61 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index ad5a845efebb..f4ba2c4a2480 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -48,14 +48,12 @@ static inline bool same_leaf_as_next(struct btree_trans *trans, + insert_l(&i[0])->b == insert_l(&i[1])->b; + } + +-inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, +- struct btree_path *path, +- struct btree *b) ++static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) + { + struct bch_fs *c = trans->c; + +- bch2_btree_node_lock_write(trans, path, b); +- + if (path->cached) + return; + +@@ -71,6 +69,14 @@ inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + bch2_btree_init_next(trans, b); + } + ++void bch2_btree_node_lock_for_insert(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ bch2_btree_node_lock_write(trans, path, b); ++ bch2_btree_node_prep_for_write(trans, path, b); ++} ++ + /* Inserting into a given leaf node (last stage of insert): */ + + /* Handle overwrites and do insert, for non extents: */ +@@ -495,6 +501,50 @@ err: + return ret; + } + ++static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct btree *b = path_l(path)->b; ++ ++ do { ++ if (path->nodes_locked && ++ path->nodes_locked != path->nodes_intent_locked) ++ BUG_ON(!bch2_btree_path_upgrade(trans, path, path->level + 1)); ++ } while ((path = prev_btree_path(trans, path)) && ++ path_l(path)->b == b); ++} ++ ++/* ++ * Check for nodes that we have both read and intent locks on, and upgrade the ++ * readers to intent: ++ */ ++static inline void normalize_read_intent_locks(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ unsigned i, nr_read = 0, nr_intent = 0; ++ ++ trans_for_each_path_inorder(trans, path, i) { ++ struct btree_path *next = i + 1 < trans->nr_sorted ++ ? 
trans->paths + trans->sorted[i + 1] ++ : NULL; ++ ++ if (path->nodes_locked) { ++ if (path->nodes_intent_locked) ++ nr_intent++; ++ else ++ nr_read++; ++ } ++ ++ if (!next || path_l(path)->b != path_l(next)->b) { ++ if (nr_read && nr_intent) ++ upgrade_readers(trans, path); ++ ++ nr_read = nr_intent = 0; ++ } ++ } ++ ++ bch2_trans_verify_locks(trans); ++} ++ + /* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +@@ -538,9 +588,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + } + } + +- trans_for_each_update(trans, i) +- BUG_ON(!btree_node_intent_locked(i->path, i->level)); +- + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, trans->journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK| +@@ -586,12 +633,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + } + btree_insert_entry_checks(trans, i); + } +- bch2_trans_verify_locks(trans); ++ ++ normalize_read_intent_locks(trans); + + trans_for_each_update(trans, i) +- if (!same_leaf_as_prev(trans, i)) +- bch2_btree_node_lock_for_insert(trans, i->path, +- insert_l(i)->b); ++ if (!same_leaf_as_prev(trans, i)) { ++ btree_node_lock_type(c, insert_l(i)->b, SIX_LOCK_write); ++ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); ++ } + + ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + +-- +cgit v1.2.3 + + +From 19eb6ea89202aca16f76409249ab3b77fc0bf364 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Sep 2021 21:25:32 -0400 +Subject: bcachefs: Better approach to write vs. read lock deadlocks + +Instead of unconditionally upgrading read locks to intent locks in +do_bch2_trans_commit(), this patch changes the path that takes write +locks to first trylock, and then if trylock fails check if we have a +conflicting read lock, and restart the transaction if necessary. 
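The pattern is: try the write lock without blocking, and only block once a conflict with one of our own read locks has been ruled out; otherwise back off and restart. A rough userspace analogue using POSIX rwlocks — the toy_* names and the -1 "restart" convention are assumptions for illustration, not the kernel code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_node {
	pthread_rwlock_t lock;
};

/* Stand-in for checking whether this transaction still holds plain read locks: */
static bool self_holds_read_lock(const bool *read_locks_held, int nr)
{
	for (int i = 0; i < nr; i++)
		if (read_locks_held[i])
			return true;
	return false;
}

/* Returns 0 on success, -1 meaning "restart the transaction". */
static int toy_lock_for_write(struct toy_node *node,
			      const bool *read_locks_held, int nr)
{
	if (pthread_rwlock_trywrlock(&node->lock) == 0)
		return 0;

	if (self_holds_read_lock(read_locks_held, nr))
		return -1;	/* could be blocked by our own reader: restart */

	/* No self-conflict: safe to block until other lock holders go away. */
	pthread_rwlock_wrlock(&node->lock);
	return 0;
}

int main(void)
{
	struct toy_node node;
	bool read_locks_held[1] = { true };

	pthread_rwlock_init(&node.lock, NULL);
	pthread_rwlock_rdlock(&node.lock);	/* we are our own reader */

	if (toy_lock_for_write(&node, read_locks_held, 1) == -1)
		printf("restart: write lock conflicts with our own read lock\n");

	pthread_rwlock_unlock(&node.lock);
	return 0;
}

The restart return stands in for a transaction restart; the property that matters is that the thread never sleeps on a write lock that its own read lock is blocking.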
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 108 +++++++++++++++++++++++++--------------- + include/trace/events/bcachefs.h | 15 ++++++ + 2 files changed, 82 insertions(+), 41 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f4ba2c4a2480..9c8c5cacc4fc 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -545,6 +545,54 @@ static inline void normalize_read_intent_locks(struct btree_trans *trans) + bch2_trans_verify_locks(trans); + } + ++static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos) ++{ ++ struct btree_path *path; ++ unsigned i; ++ ++ trans_for_each_path_inorder(trans, path, i) { ++ //if (path == pos) ++ // break; ++ ++ if (path->nodes_locked != path->nodes_intent_locked) ++ return true; ++ } ++ ++ return false; ++} ++ ++static inline int trans_lock_write(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ if (!six_trylock_write(&insert_l(i)->b->c.lock)) { ++ if (have_conflicting_read_lock(trans, i->path)) ++ goto fail; ++ ++ __btree_node_lock_type(trans->c, insert_l(i)->b, ++ SIX_LOCK_write); ++ } ++ ++ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); ++ } ++ ++ return 0; ++fail: ++ while (--i >= trans->updates) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); ++ } ++ ++ trace_trans_restart_would_deadlock_write(trans->ip); ++ return btree_trans_restart(trans); ++} ++ + /* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +@@ -554,10 +602,25 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; +- struct btree_path *path; + struct bkey_s_c old; + int ret, u64s_delta = 0; + ++ trans_for_each_update(trans, i) { ++ const char *invalid = bch2_bkey_invalid(c, ++ bkey_i_to_s_c(i->k), i->bkey_type); ++ if (invalid) { ++ char buf[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); ++ bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", ++ buf, (void *) trans->ip, ++ (void *) i->ip_allocated, invalid); ++ bch2_fatal_error(c); ++ return -EINVAL; ++ } ++ btree_insert_entry_checks(trans, i); ++ } ++ + trans_for_each_update(trans, i) { + struct bkey u; + +@@ -599,48 +662,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + if (unlikely(ret)) + return ret; + +- /* +- * Can't be holding any read locks when we go to take write locks: +- * another thread could be holding an intent lock on the same node we +- * have a read lock on, and it'll block trying to take a write lock +- * (because we hold a read lock) and it could be blocking us by holding +- * its own read lock (while we're trying to to take write locks). 
+- * +- * note - this must be done after bch2_trans_journal_preres_get_cold() +- * or anything else that might call bch2_trans_relock(), since that +- * would just retake the read locks: +- */ +- trans_for_each_path(trans, path) +- if (path->nodes_locked != path->nodes_intent_locked && +- !bch2_btree_path_upgrade(trans, path, path->level + 1)) { +- trace_trans_restart_upgrade(trans->ip, trace_ip, +- path->btree_id, &path->pos); +- return btree_trans_restart(trans); +- } +- +- trans_for_each_update(trans, i) { +- const char *invalid = bch2_bkey_invalid(c, +- bkey_i_to_s_c(i->k), i->bkey_type); +- if (invalid) { +- char buf[200]; +- +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); +- bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", +- buf, (void *) trans->ip, +- (void *) i->ip_allocated, invalid); +- bch2_fatal_error(c); +- return -EINVAL; +- } +- btree_insert_entry_checks(trans, i); +- } +- + normalize_read_intent_locks(trans); + +- trans_for_each_update(trans, i) +- if (!same_leaf_as_prev(trans, i)) { +- btree_node_lock_type(c, insert_l(i)->b, SIX_LOCK_write); +- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); +- } ++ ret = trans_lock_write(trans); ++ if (unlikely(ret)) ++ return ret; + + ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 06fda4ddb7ba..fce3146378f9 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -756,6 +756,21 @@ TRACE_EVENT(trans_restart_would_deadlock, + __entry->want_pos_snapshot) + ); + ++TRACE_EVENT(trans_restart_would_deadlock_write, ++ TP_PROTO(unsigned long trans_ip), ++ TP_ARGS(trans_ip), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, trans_ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->trans_ip = trans_ip; ++ ), ++ ++ TP_printk("%ps", (void *) __entry->trans_ip) ++); ++ + TRACE_EVENT(trans_restart_mem_realloced, + TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + unsigned long bytes), +-- +cgit v1.2.3 + + +From 7a22bf8ee1a7ae021d3555d5287c215f1e36f37d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 7 Sep 2021 23:04:04 -0400 +Subject: bcachefs: Add missing BTREE_ITER_INTENT + +No reason not to be using it here... 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-common.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index a6617455ea12..6bc82559c9b1 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -110,7 +110,7 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + inode_u->bi_ctime = now; + bch2_inode_nlink_inc(inode_u); + +- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, 0); ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + if (ret) + goto err; + +-- +cgit v1.2.3 + + +From d55c8a1f5d29da7417e12a1d193f5b90fd8b5ffb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 9 Sep 2021 19:05:34 -0400 +Subject: bcachefs: Fix some compiler warnings + +gcc couldn't always deduce that written wasn't used uninitialized + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 35d0f646a0b5..f11fcab61902 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1257,7 +1257,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) + bool dump_bset_maps = false; + bool have_retry = false; + int ret = 0, best = -1, write = READ; +- unsigned i, written, written2; ++ unsigned i, written = 0, written2 = 0; + __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + +-- +cgit v1.2.3 + + +From 34d5a812249d411069d47c64e8634af45bd9af6e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 13 Sep 2021 12:38:40 -0400 +Subject: bcachefs: Add a missing bch2_trans_relock() call + +This was causing an assertion to pop in fsck, in one of the repair +paths. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 9c8c5cacc4fc..a0da96737700 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -795,6 +795,9 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + if (ret) + return ret; + ++ if (!bch2_trans_relock(trans)) ++ return -EINTR; ++ + percpu_ref_get(&c->writes); + return 0; + } +-- +cgit v1.2.3 + + +From c5add2e5f6bccb30a1af7582df994e57de491026 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 13 Sep 2021 16:04:49 -0400 +Subject: bcachefs: Improve btree_node_mem_ptr optimization + +This patch checks b->hash_val before attempting to lock the node in the +btree, which makes it more equivalent to the "lookup in hash table" +path - and potentially avoids an unnecessary transaction restart if +btree_node_mem_ptr(k) no longer points to the node we want. 
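The general pattern — compare a cheap identity field before locking a cached pointer, then re-check under the lock — can be sketched in plain userspace C. The toy_* names below are invented; only the shape of the check mirrors the patch:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct toy_node {
	pthread_mutex_t	lock;
	uint64_t	hash_val;	/* identity of the node cached here */
};

struct toy_key {
	uint64_t	hash_val;	/* identity of the node we want */
	struct toy_node	*mem_ptr;	/* possibly stale cached pointer */
};

static struct toy_node *toy_get_node(struct toy_key *k)
{
	struct toy_node *b = k->mem_ptr;

	/* Check identity _before_ locking: skip the lock if it can't be ours. */
	if (b && b->hash_val == k->hash_val) {
		pthread_mutex_lock(&b->lock);
		/* Re-check under the lock in case the node was reused meanwhile. */
		if (b->hash_val == k->hash_val)
			return b;
		pthread_mutex_unlock(&b->lock);
	}

	return NULL;	/* caller would fall back to the full hash-table lookup */
}

int main(void)
{
	struct toy_node node = {
		.lock		= PTHREAD_MUTEX_INITIALIZER,
		.hash_val	= 42,
	};
	struct toy_key fresh = { .hash_val = 42, .mem_ptr = &node };
	struct toy_key stale = { .hash_val = 99, .mem_ptr = &node };
	struct toy_node *b;

	b = toy_get_node(&fresh);
	printf("fresh key: %s\n", b ? "fast path, node locked" : "slow path");
	if (b)
		pthread_mutex_unlock(&b->lock);

	b = toy_get_node(&stale);
	printf("stale key: %s\n", b ? "fast path, node locked" : "slow path");
	return 0;
}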
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 3b08b32f00a4..e894b8cab7af 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -778,7 +778,12 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * + + if (c->opts.btree_node_mem_ptr_optimization) { + b = btree_node_mem_ptr(k); +- if (b) ++ /* ++ * Check b->hash_val _before_ calling btree_node_lock() - this ++ * might not be the node we want anymore, and trying to lock the ++ * wrong node could cause an unneccessary transaction restart: ++ */ ++ if (b && b->hash_val == btree_ptr_hash_val(k)) + goto lock_node; + } + retry: +-- +cgit v1.2.3 + + +From aee30465dc23ec4d8b9ace702085b6a3af0aba9b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 15 Sep 2021 11:15:18 -0400 +Subject: Revert "bcachefs: Add more assertions for locking btree iterators out + of order" + +Figured out the bug we were chasing, and it had nothing to do with +locking btree iterators/paths out of order. + +This reverts commit ff08733dd298c969aec7c7828095458f73fd5374. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 23 ++++++++--------------- + fs/bcachefs/btree_key_cache.c | 4 ++-- + fs/bcachefs/btree_locking.h | 17 +++-------------- + fs/bcachefs/btree_types.h | 2 -- + 4 files changed, 13 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index a3927397b850..13e85096dd41 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -160,7 +160,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, want))) { +- mark_btree_node_locked(trans, path, level, want); ++ mark_btree_node_locked(path, level, want); + return true; + } else { + return false; +@@ -196,7 +196,7 @@ static bool bch2_btree_node_upgrade(struct btree_trans *trans, + + return false; + success: +- mark_btree_node_intent_locked(trans, path, level); ++ mark_btree_node_intent_locked(path, level); + return true; + } + +@@ -904,12 +904,12 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, + bch2_btree_node_iter_peek_all(&l->iter, l->b)); + } + +-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, ++static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) + { +- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, ++ struct bkey_s_c k = __btree_iter_unpack(c, l, u, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + path->pos = k.k ? 
k.k->p : l->b->key.k.p; +@@ -1049,7 +1049,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) + t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(path, b->c.level); + six_lock_increment(&b->c.lock, t); +- mark_btree_node_locked(trans, path, b->c.level, t); ++ mark_btree_node_locked(path, b->c.level, t); + } + + btree_path_level_init(trans, path, b); +@@ -1126,7 +1126,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, + for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + +- mark_btree_node_locked(trans, path, path->level, lock_type); ++ mark_btree_node_locked(path, path->level, lock_type); + btree_path_level_init(trans, path, b); + return 0; + } +@@ -1218,7 +1218,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, + if (unlikely(ret)) + goto err; + +- mark_btree_node_locked(trans, path, level, lock_type); ++ mark_btree_node_locked(path, level, lock_type); + btree_path_level_init(trans, path, b); + + if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && +@@ -1260,10 +1260,6 @@ retry_all: + + btree_trans_verify_sorted(trans); + +-#ifdef CONFIG_BCACHEFS_DEBUG +- trans->traverse_all_idx = U8_MAX; +-#endif +- + for (i = trans->nr_sorted - 2; i >= 0; --i) { + struct btree_path *path1 = trans->paths + trans->sorted[i]; + struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; +@@ -1302,9 +1298,6 @@ retry_all: + path = trans->paths + trans->sorted[i]; + + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); +-#ifdef CONFIG_BCACHEFS_DEBUG +- trans->traverse_all_idx = path->idx; +-#endif + + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + if (ret) +@@ -2073,7 +2066,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + goto out; + } + +- k = btree_path_level_peek(trans, iter->path, ++ k = btree_path_level_peek(trans->c, iter->path, + &iter->path->l[0], &iter->k); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 938ced36af73..2dfa5040d045 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -296,7 +296,7 @@ retry: + if (!ck) + goto retry; + +- mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); ++ mark_btree_node_locked(path, 0, SIX_LOCK_intent); + path->locks_want = 1; + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); +@@ -318,7 +318,7 @@ retry: + goto retry; + } + +- mark_btree_node_locked(trans, path, 0, lock_want); ++ mark_btree_node_locked(path, 0, lock_want); + } + + path->l[0].lock_seq = ck->c.lock.state.seq; +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index 5c6b758070e1..d599008c5fc1 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -58,8 +58,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, + path->nodes_intent_locked &= ~(1 << level); + } + +-static inline void mark_btree_node_locked(struct btree_trans *trans, +- struct btree_path *path, ++static inline void mark_btree_node_locked(struct btree_path *path, + unsigned level, + enum six_lock_type type) + { +@@ -69,19 +68,12 @@ static inline void mark_btree_node_locked(struct btree_trans *trans, + + path->nodes_locked |= 1 << level; + path->nodes_intent_locked |= type << level; +-#ifdef CONFIG_BCACHEFS_DEBUG +- path->ip_locked = _RET_IP_; +- BUG_ON(trans->in_traverse_all && +- trans->traverse_all_idx != U8_MAX && +- path->sorted_idx > trans->paths[trans->traverse_all_idx].sorted_idx); +-#endif + 
} + +-static inline void mark_btree_node_intent_locked(struct btree_trans *trans, +- struct btree_path *path, ++static inline void mark_btree_node_intent_locked(struct btree_path *path, + unsigned level) + { +- mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); ++ mark_btree_node_locked(path, level, SIX_LOCK_intent); + } + + static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) +@@ -120,9 +112,6 @@ static inline void __bch2_btree_path_unlock(struct btree_path *path) + + while (path->nodes_locked) + btree_node_unlock(path, __ffs(path->nodes_locked)); +-#ifdef CONFIG_BCACHEFS_DEBUG +- path->ip_locked = 0; +-#endif + } + + static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index ccf91ebd94aa..300cdbb6821d 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -255,7 +255,6 @@ struct btree_path { + } l[BTREE_MAX_DEPTH]; + #ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; +- unsigned long ip_locked; + #endif + }; + +@@ -369,7 +368,6 @@ struct btree_trans { + struct bpos locking_pos; + u8 locking_btree_id; + u8 locking_level; +- u8 traverse_all_idx; + pid_t pid; + #endif + unsigned long ip; +-- +cgit v1.2.3 + + +From 3b82e438a588917fbddf4df0635d902edfc04164 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 26 Sep 2021 13:54:14 -0400 +Subject: bcachefs: Disable quota support + +Existing quota support breaks badly with snapshots. We're not deleting +the code because some of it will be needed when we reimplement quotas +along the lines of btrfs subvolume quotas. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/opts.h | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 147b4021fdae..d39d6a546ac4 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -215,19 +215,19 @@ enum opt_type { + BCH_SB_POSIX_ACL, true, \ + NULL, "Enable POSIX acls") \ + x(usrquota, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ 0, \ + OPT_BOOL(), \ +- BCH_SB_USRQUOTA, false, \ ++ NO_SB_OPT, false, \ + NULL, "Enable user quotas") \ + x(grpquota, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ 0, \ + OPT_BOOL(), \ +- BCH_SB_GRPQUOTA, false, \ ++ NO_SB_OPT, false, \ + NULL, "Enable group quotas") \ + x(prjquota, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ 0, \ + OPT_BOOL(), \ +- BCH_SB_PRJQUOTA, false, \ ++ NO_SB_OPT, false, \ + NULL, "Enable project quotas") \ + x(degraded, u8, \ + OPT_MOUNT, \ +-- +cgit v1.2.3 + + +From baf2e7d17c49e94edd1cdcbc448e1e7f2db2d94b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Jun 2021 17:07:18 -0400 +Subject: bcachefs: Disable btree key cache for inodes + +Snapshots breaks using the btree key cache for inodes, for now: filling +in a key cache entry does a btree lookup in FILTER_SNAPSHOTS mode, which +means cached inodes at different positions (ancestor snapshots) need to +be flushed for the FILTER_SNAPHSOTS lookup to work correctly - and we +currently don't have a good way of doing that flush. + +Thus, this workaround for now. 
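The hunks below implement this workaround by short-circuiting the option check with a "0 &&" guard rather than deleting or #ifdef-ing the code, so the disabled path keeps compiling and type-checking and re-enabling it later is a one-character change. A minimal, self-contained illustration of that pattern (the struct and flag names are stand-ins, not the real bcachefs definitions):

#include <stdbool.h>

struct opts_sketch {				/* stand-in for the real bch_opts */
	bool inodes_use_key_cache;
};

static unsigned inode_iter_flags(const struct opts_sketch *opts, unsigned flags)
{
	/*
	 * "0 &&" keeps the branch compiled but guaranteed dead: the compiler
	 * still checks the option field and then eliminates the test.
	 */
	if (0 && opts->inodes_use_key_cache)
		flags |= 1U << 0;		/* stand-in for BTREE_ITER_CACHED */

	return flags;
}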
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/inode.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 14b0e8c03119..84643eaaa8d5 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -300,7 +300,7 @@ int bch2_inode_peek(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- if (trans->c->opts.inodes_use_key_cache) ++ if (0 && trans->c->opts.inodes_use_key_cache) + flags |= BTREE_ITER_CACHED; + + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags); +@@ -593,7 +593,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + unsigned iter_flags = BTREE_ITER_INTENT; + int ret; + +- if (cached && c->opts.inodes_use_key_cache) ++ if (0 && cached && c->opts.inodes_use_key_cache) + iter_flags |= BTREE_ITER_CACHED; + + bch2_trans_init(&trans, c, 0, 1024); +-- +cgit v1.2.3 + + +From 5c837ee06c50f43b27be40cb9a0f92dfe4eac4f4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 16 Mar 2021 00:42:25 -0400 +Subject: bcachefs: Subvolumes, snapshots + +This patch adds subvolume.c - support for the subvolumes and snapshots +btrees and related data types and on disk data structures. The next +patches will start hooking up this new code to existing code. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/Makefile | 1 + + fs/bcachefs/bcachefs.h | 23 + + fs/bcachefs/bcachefs_format.h | 55 ++- + fs/bcachefs/bcachefs_ioctl.h | 15 + + fs/bcachefs/bkey_methods.c | 5 + + fs/bcachefs/btree_key_cache.c | 5 + + fs/bcachefs/btree_types.h | 15 +- + fs/bcachefs/btree_update_leaf.c | 6 + + fs/bcachefs/buckets.c | 3 + + fs/bcachefs/dirent.c | 5 +- + fs/bcachefs/fsck.c | 4 +- + fs/bcachefs/inode.c | 15 +- + fs/bcachefs/opts.c | 3 +- + fs/bcachefs/recovery.c | 121 ++++- + fs/bcachefs/subvolume.c | 981 ++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/subvolume.h | 77 ++++ + fs/bcachefs/super.c | 4 + + 17 files changed, 1314 insertions(+), 24 deletions(-) + create mode 100644 fs/bcachefs/subvolume.c + create mode 100644 fs/bcachefs/subvolume.h + +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index ee5e6dbd5ede..71cda24e6d08 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -49,6 +49,7 @@ bcachefs-y := \ + reflink.o \ + replicas.o \ + siphash.o \ ++ subvolume.o \ + super.o \ + super-io.o \ + sysfs.o \ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 94c73f28398f..59cbede4c72d 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -381,6 +381,8 @@ enum gc_phase { + GC_PHASE_BTREE_alloc, + GC_PHASE_BTREE_quotas, + GC_PHASE_BTREE_reflink, ++ GC_PHASE_BTREE_subvolumes, ++ GC_PHASE_BTREE_snapshots, + + GC_PHASE_PENDING_DELETE, + }; +@@ -564,6 +566,21 @@ struct btree_path_buf { + + #define REPLICAS_DELTA_LIST_MAX (1U << 16) + ++struct snapshot_t { ++ u32 parent; ++ u32 children[2]; ++ u32 subvol; /* Nonzero only if a subvolume points to this node: */ ++ u32 equiv; ++}; ++ ++typedef struct { ++ u32 subvol; ++ u64 inum; ++} subvol_inum; ++ ++#define BCACHEFS_ROOT_SUBVOL_INUM \ ++ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) ++ + struct bch_fs { + struct closure cl; + +@@ -635,6 +652,12 @@ struct bch_fs { + struct closure sb_write; + struct mutex sb_lock; + ++ /* snapshot.c: */ ++ GENRADIX(struct snapshot_t) snapshots; ++ struct bch_snapshot_table __rcu *snapshot_table; ++ struct mutex snapshot_table_lock; ++ struct work_struct snapshot_delete_work; ++ + /* BTREE CACHE */ + struct bio_set btree_bio; + struct 
workqueue_struct *io_complete_wq; +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 98779e46bbd0..807cea622920 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -342,7 +342,9 @@ static inline void bkey_init(struct bkey *k) + x(inline_data, 17) \ + x(btree_ptr_v2, 18) \ + x(indirect_inline_data, 19) \ +- x(alloc_v2, 20) ++ x(alloc_v2, 20) \ ++ x(subvolume, 21) \ ++ x(snapshot, 22) + + enum bch_bkey_type { + #define x(name, nr) KEY_TYPE_##name = nr, +@@ -686,6 +688,10 @@ struct bch_inode_generation { + __le32 pad; + } __attribute__((packed, aligned(8))); + ++/* ++ * bi_subvol and bi_parent_subvol are only set for subvolume roots: ++ */ ++ + #define BCH_INODE_FIELDS() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ +@@ -709,7 +715,9 @@ struct bch_inode_generation { + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ +- x(bi_dir_offset, 64) ++ x(bi_dir_offset, 64) \ ++ x(bi_subvol, 32) \ ++ x(bi_parent_subvol, 32) + + /* subset of BCH_INODE_FIELDS */ + #define BCH_INODE_OPTS() \ +@@ -792,6 +800,9 @@ struct bch_dirent { + __u8 d_name[]; + } __attribute__((packed, aligned(8))); + ++#define DT_SUBVOL 16 ++#define BCH_DT_MAX 17 ++ + #define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ + sizeof(struct bkey) - \ + offsetof(struct bch_dirent, d_name)) +@@ -928,6 +939,42 @@ struct bch_inline_data { + u8 data[0]; + }; + ++/* Subvolumes: */ ++ ++#define SUBVOL_POS_MIN POS(0, 1) ++#define SUBVOL_POS_MAX POS(0, S32_MAX) ++#define BCACHEFS_ROOT_SUBVOL 1 ++ ++struct bch_subvolume { ++ struct bch_val v; ++ __le32 flags; ++ __le32 snapshot; ++ __le64 inode; ++}; ++ ++LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) ++/* ++ * We need to know whether a subvolume is a snapshot so we can know whether we ++ * can delete it (or whether it should just be rm -rf'd) ++ */ ++LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) ++ ++/* Snapshots */ ++ ++struct bch_snapshot { ++ struct bch_val v; ++ __le32 flags; ++ __le32 parent; ++ __le32 children[2]; ++ __le32 subvol; ++ __le32 pad; ++}; ++ ++LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) ++ ++/* True if a subvolume points to this snapshot node: */ ++LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) ++ + /* Optional/variable size superblock sections: */ + + struct bch_sb_field { +@@ -1695,7 +1742,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); + x(alloc, 4) \ + x(quotas, 5) \ + x(stripes, 6) \ +- x(reflink, 7) ++ x(reflink, 7) \ ++ x(subvolumes, 8) \ ++ x(snapshots, 9) + + enum btree_id { + #define x(kwd, val) BTREE_ID_##kwd = val, +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +index f679fc2151bc..930981ad5535 100644 +--- a/fs/bcachefs/bcachefs_ioctl.h ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -78,6 +78,9 @@ struct bch_ioctl_incremental { + #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) + #define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) + ++#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) ++#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) ++ + /* ioctl below act on a particular file, not the filesystem as a whole: */ + + #define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) +@@ -349,4 +352,16 @@ struct bch_ioctl_disk_resize_journal { + __u64 nbuckets; + }; + ++struct bch_ioctl_subvolume { ++ __u32 flags; ++ __u32 dirfd; ++ __u16 mode; ++ __u16 
pad[3]; ++ __u64 dst_ptr; ++ __u64 src_ptr; ++}; ++ ++#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) ++#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) ++ + #endif /* _BCACHEFS_IOCTL_H */ +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index a03b5514a802..53604af29bcc 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -11,6 +11,7 @@ + #include "inode.h" + #include "quota.h" + #include "reflink.h" ++#include "subvolume.h" + #include "xattr.h" + + const char * const bch2_bkey_types[] = { +@@ -126,6 +127,10 @@ static unsigned bch2_key_types_allowed[] = { + [BKEY_TYPE_reflink] = + (1U << KEY_TYPE_reflink_v)| + (1U << KEY_TYPE_indirect_inline_data), ++ [BKEY_TYPE_subvolumes] = ++ (1U << KEY_TYPE_subvolume), ++ [BKEY_TYPE_snapshots] = ++ (1U << KEY_TYPE_snapshot), + [BKEY_TYPE_btree] = + (1U << KEY_TYPE_btree_ptr)| + (1U << KEY_TYPE_btree_ptr_v2), +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 2dfa5040d045..c019200a6125 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -163,6 +163,11 @@ btree_key_cache_create(struct btree_key_cache *c, + was_new = false; + } + ++ if (btree_id == BTREE_ID_subvolumes) ++ six_lock_pcpu_alloc(&ck->c.lock); ++ else ++ six_lock_pcpu_free(&ck->c.lock); ++ + ck->c.level = 0; + ck->c.btree_id = btree_id; + ck->key.btree_id = btree_id; +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 300cdbb6821d..262ee2d53322 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -605,7 +605,8 @@ static inline bool btree_node_is_extents(struct btree *b) + + #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ + ((1U << BKEY_TYPE_alloc)| \ +- (1U << BKEY_TYPE_stripes)) ++ (1U << BKEY_TYPE_stripes)| \ ++ (1U << BKEY_TYPE_snapshots)) + + #define BTREE_NODE_TYPE_HAS_TRIGGERS \ + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ +@@ -652,7 +653,8 @@ enum btree_update_flags { + + #define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ + ((1U << KEY_TYPE_stripe)| \ +- (1U << KEY_TYPE_inode)) ++ (1U << KEY_TYPE_inode)| \ ++ (1U << KEY_TYPE_snapshot)) + + static inline bool btree_node_type_needs_gc(enum btree_node_type type) + { +@@ -669,11 +671,6 @@ struct btree_root { + s8 error; + }; + +-/* +- * Optional hook that will be called just prior to a btree node update, when +- * we're holding the write lock and we know what key is about to be overwritten: +- */ +- + enum btree_insert_ret { + BTREE_INSERT_OK, + /* leaf node needs to be split */ +@@ -694,8 +691,4 @@ enum btree_node_sibling { + btree_next_sib, + }; + +-typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, +- struct btree *, +- struct btree_node_iter *); +- + #endif /* _BCACHEFS_BTREE_TYPES_H */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index a0da96737700..a8575e847f0a 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -15,6 +15,7 @@ + #include "journal.h" + #include "journal_reclaim.h" + #include "keylist.h" ++#include "subvolume.h" + #include "replicas.h" + + #include +@@ -245,6 +246,11 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, + BUG_ON(i->cached != i->path->cached); + BUG_ON(i->level != i->path->level); + BUG_ON(i->btree_id != i->path->btree_id); ++ EBUG_ON(!i->level && ++ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && ++ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && ++ i->k->k.p.snapshot && ++ bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); + } + + static 
noinline int +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index df12416eff8e..5fd3aabb7669 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -16,6 +16,7 @@ + #include "movinggc.h" + #include "reflink.h" + #include "replicas.h" ++#include "subvolume.h" + + #include + #include +@@ -1200,6 +1201,8 @@ static int bch2_mark_key_locked(struct bch_fs *c, + return bch2_mark_reservation(c, old, new, journal_seq, flags); + case KEY_TYPE_reflink_p: + return bch2_mark_reflink_p(c, old, new, journal_seq, flags); ++ case KEY_TYPE_snapshot: ++ return bch2_mark_snapshot(c, old, new, journal_seq, flags); + default: + return 0; + } +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 1d510f7728b6..53c7687a9ca8 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -99,7 +99,8 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) + if (memchr(d.v->d_name, '/', len)) + return "invalid name"; + +- if (le64_to_cpu(d.v->d_inum) == d.k->p.inode) ++ if (d.v->d_type != DT_SUBVOL && ++ le64_to_cpu(d.v->d_inum) == d.k->p.inode) + return "dirent points to own directory"; + + return NULL; +@@ -113,7 +114,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + bch_scnmemcpy(out, d.v->d_name, + bch2_dirent_name_bytes(d)); + pr_buf(out, " -> %llu type %s", d.v->d_inum, +- d.v->d_type < DT_MAX ++ d.v->d_type < BCH_DT_MAX + ? bch2_d_types[d.v->d_type] + : "(bad d_type)"); + } +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index eb979e79eaac..62158c0803db 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -9,6 +9,7 @@ + #include "fsck.h" + #include "inode.h" + #include "keylist.h" ++#include "subvolume.h" + #include "super.h" + #include "xattr.h" + +@@ -1410,7 +1411,8 @@ int bch2_fsck_full(struct bch_fs *c) + { + struct bch_inode_unpacked root_inode; + +- return check_inodes(c, true) ?: ++ return bch2_fs_snapshots_check(c) ?: ++ check_inodes(c, true) ?: + check_extents(c) ?: + check_dirents(c) ?: + check_xattrs(c) ?: +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 84643eaaa8d5..ca04a9715ec1 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -8,6 +8,7 @@ + #include "extents.h" + #include "inode.h" + #include "str_hash.h" ++#include "subvolume.h" + #include "varint.h" + + #include +@@ -340,8 +341,8 @@ int bch2_inode_write(struct btree_trans *trans, + + const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) + { +- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); +- struct bch_inode_unpacked unpacked; ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ struct bch_inode_unpacked unpacked; + + if (k.k->p.inode) + return "nonzero k.p.inode"; +@@ -368,6 +369,9 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) + unpacked.bi_nlink != 0) + return "flagged as unlinked but bi_nlink != 0"; + ++ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) ++ return "subvolume root but not a directory"; ++ + return NULL; + } + +@@ -635,6 +639,13 @@ retry: + + bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + ++ /* Subvolume root? 
*/ ++ if (inode_u.bi_subvol) { ++ ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1); ++ if (ret) ++ goto err; ++ } ++ + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index 5de296078219..ff99c6d24abd 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -63,7 +63,7 @@ const char * const bch2_member_states[] = { + + #undef x + +-const char * const bch2_d_types[DT_MAX] = { ++const char * const bch2_d_types[BCH_DT_MAX] = { + [DT_UNKNOWN] = "unknown", + [DT_FIFO] = "fifo", + [DT_CHR] = "chr", +@@ -73,6 +73,7 @@ const char * const bch2_d_types[DT_MAX] = { + [DT_LNK] = "lnk", + [DT_SOCK] = "sock", + [DT_WHT] = "whiteout", ++ [DT_SUBVOL] = "subvol", + }; + + void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 11208e83fabe..2aab57cf09e1 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -20,6 +20,7 @@ + #include "quota.h" + #include "recovery.h" + #include "replicas.h" ++#include "subvolume.h" + #include "super-io.h" + + #include +@@ -961,6 +962,81 @@ fsck_err: + return ret; + } + ++static int bch2_fs_initialize_subvolumes(struct bch_fs *c) ++{ ++ struct bkey_i_snapshot root_snapshot; ++ struct bkey_i_subvolume root_volume; ++ int ret; ++ ++ bkey_snapshot_init(&root_snapshot.k_i); ++ root_snapshot.k.p.offset = U32_MAX; ++ root_snapshot.v.flags = 0; ++ root_snapshot.v.parent = 0; ++ root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; ++ root_snapshot.v.pad = 0; ++ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_snapshots, ++ &root_snapshot.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ ++ bkey_subvolume_init(&root_volume.k_i); ++ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; ++ root_volume.v.flags = 0; ++ root_volume.v.snapshot = cpu_to_le32(U32_MAX); ++ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_subvolumes, ++ &root_volume.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_inode_unpacked inode; ++ struct bkey_inode_buf *packed; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ POS(0, BCACHEFS_ROOT_INO), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_inode) { ++ bch_err(c, "root inode not found"); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode); ++ BUG_ON(ret); ++ ++ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; ++ ++ packed = bch2_trans_kmalloc(trans, sizeof(*packed)); ++ ret = PTR_ERR_OR_ZERO(packed); ++ if (ret) ++ goto err; ++ ++ bch2_inode_pack(c, packed, &inode); ++ ret = bch2_trans_update(trans, &iter, &packed->inode.k_i, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ + int bch2_fs_recovery(struct bch_fs *c) + { + const char *err = "cannot allocate memory"; +@@ -1017,11 +1093,12 @@ int bch2_fs_recovery(struct bch_fs *c) + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; +- } +- +- if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { ++ } else if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { + 
bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required"); + c->opts.version_upgrade = true; ++ } else if (c->sb.version < bcachefs_metadata_version_snapshot) { ++ bch_info(c, "filesystem version is prior to snapshot field - upgrading"); ++ c->opts.version_upgrade = true; + } + + ret = bch2_blacklist_table_initialize(c); +@@ -1190,6 +1267,29 @@ use_clean: + bch_verbose(c, "alloc write done"); + } + ++ if (c->sb.version < bcachefs_metadata_version_snapshot) { ++ err = "error creating root snapshot node"; ++ ret = bch2_fs_initialize_subvolumes(c); ++ if (ret) ++ goto err; ++ } ++ ++ bch_verbose(c, "reading snapshots table"); ++ err = "error reading snapshots table"; ++ ret = bch2_fs_snapshots_start(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "reading snapshots done"); ++ ++ if (c->sb.version < bcachefs_metadata_version_snapshot) { ++ /* set bi_subvol on root inode */ ++ err = "error upgrade root inode for subvolumes"; ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_fs_upgrade_for_subvolumes(&trans)); ++ if (ret) ++ goto err; ++ } ++ + if (c->opts.fsck) { + bch_info(c, "starting fsck"); + err = "error in fsck"; +@@ -1350,9 +1450,22 @@ int bch2_fs_initialize(struct bch_fs *c) + } + } + ++ err = "error creating root snapshot node"; ++ ret = bch2_fs_initialize_subvolumes(c); ++ if (ret) ++ goto err; ++ ++ bch_verbose(c, "reading snapshots table"); ++ err = "error reading snapshots table"; ++ ret = bch2_fs_snapshots_start(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "reading snapshots done"); ++ + bch2_inode_init(c, &root_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); +- root_inode.bi_inum = BCACHEFS_ROOT_INO; ++ root_inode.bi_inum = BCACHEFS_ROOT_INO; ++ root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + bch2_inode_pack(c, &packed_inode, &root_inode); + packed_inode.inode.k.p.snapshot = U32_MAX; + +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +new file mode 100644 +index 000000000000..ff3b4d2d86b9 +--- /dev/null ++++ b/fs/bcachefs/subvolume.c +@@ -0,0 +1,981 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "error.h" ++#include "subvolume.h" ++ ++/* Snapshot tree: */ ++ ++static void bch2_delete_dead_snapshots_work(struct work_struct *); ++static void bch2_delete_dead_snapshots(struct bch_fs *); ++ ++void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); ++ ++ pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u", ++ BCH_SNAPSHOT_SUBVOL(s.v), ++ BCH_SNAPSHOT_DELETED(s.v), ++ le32_to_cpu(s.v->parent), ++ le32_to_cpu(s.v->children[0]), ++ le32_to_cpu(s.v->children[1]), ++ le32_to_cpu(s.v->subvol)); ++} ++ ++const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot s; ++ u32 i, id; ++ ++ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || ++ bkey_cmp(k.k->p, POS(0, 1)) < 0) ++ return "bad pos"; ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) ++ return "bad val size"; ++ ++ s = bkey_s_c_to_snapshot(k); ++ ++ id = le32_to_cpu(s.v->parent); ++ if (id && id <= k.k->p.offset) ++ return "bad parent node"; ++ ++ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) ++ return "children not normalized"; ++ ++ if (s.v->children[0] && ++ s.v->children[0] == s.v->children[1]) ++ return "duplicate child nodes"; ++ ++ for (i = 0; i < 2; i++) { ++ id = 
le32_to_cpu(s.v->children[i]); ++ ++ if (id >= k.k->p.offset) ++ return "bad child node"; ++ } ++ ++ return NULL; ++} ++ ++int bch2_mark_snapshot(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, ++ u64 journal_seq, unsigned flags) ++{ ++ struct snapshot_t *t; ++ ++ t = genradix_ptr_alloc(&c->snapshots, ++ U32_MAX - new.k->p.offset, ++ GFP_KERNEL); ++ if (!t) ++ return -ENOMEM; ++ ++ if (new.k->type == KEY_TYPE_snapshot) { ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); ++ ++ t->parent = le32_to_cpu(s.v->parent); ++ t->children[0] = le32_to_cpu(s.v->children[0]); ++ t->children[1] = le32_to_cpu(s.v->children[1]); ++ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; ++ } else { ++ t->parent = 0; ++ t->children[0] = 0; ++ t->children[1] = 0; ++ t->subvol = 0; ++ } ++ ++ return 0; ++} ++ ++static int subvol_lookup(struct btree_trans *trans, unsigned id, struct bch_subvolume *s) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, id), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT; ++ ++ if (!ret) ++ *s = *bkey_s_c_to_subvolume(k).v; ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int snapshot_lookup(struct btree_trans *trans, u32 id, ++ struct bch_snapshot *s) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT; ++ ++ if (!ret) ++ *s = *bkey_s_c_to_snapshot(k).v; ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int snapshot_live(struct btree_trans *trans, u32 id) ++{ ++ struct bch_snapshot v; ++ int ret; ++ ++ if (!id) ++ return 0; ++ ++ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); ++ if (ret == -ENOENT) ++ bch_err(trans->c, "snapshot node %u not found", id); ++ if (ret) ++ return ret; ++ ++ return !BCH_SNAPSHOT_DELETED(&v); ++} ++ ++static int bch2_snapshots_set_equiv(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_snapshot snap; ++ unsigned i; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ u32 id = k.k->p.offset, child[2]; ++ unsigned nr_live = 0, live_idx; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ continue; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ child[0] = le32_to_cpu(snap.v->children[0]); ++ child[1] = le32_to_cpu(snap.v->children[1]); ++ ++ for (i = 0; i < 2; i++) { ++ ret = snapshot_live(trans, child[i]); ++ if (ret < 0) ++ break; ++ ++ if (ret) ++ live_idx = i; ++ nr_live += ret; ++ } ++ ++ snapshot_t(c, id)->equiv = nr_live == 1 ++ ? 
snapshot_t(c, child[live_idx])->equiv ++ : id; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (ret) ++ bch_err(c, "error walking snapshots: %i", ret); ++ ++ return ret; ++} ++ ++/* fsck: */ ++static int bch2_snapshot_check(struct btree_trans *trans, ++ struct bkey_s_c_snapshot s) ++{ ++ struct bch_subvolume subvol; ++ struct bch_snapshot v; ++ u32 i, id; ++ int ret; ++ ++ id = le32_to_cpu(s.v->subvol); ++ ret = lockrestart_do(trans, subvol_lookup(trans, id, &subvol)); ++ if (ret == -ENOENT) ++ bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", ++ s.k->p.offset, id); ++ if (ret) ++ return ret; ++ ++ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { ++ bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", ++ s.k->p.offset); ++ return -EINVAL; ++ } ++ ++ id = le32_to_cpu(s.v->parent); ++ if (id) { ++ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); ++ if (ret == -ENOENT) ++ bch_err(trans->c, "snapshot node %llu has nonexistent parent %u", ++ s.k->p.offset, id); ++ if (ret) ++ return ret; ++ ++ if (le32_to_cpu(v.children[0]) != s.k->p.offset && ++ le32_to_cpu(v.children[1]) != s.k->p.offset) { ++ bch_err(trans->c, "snapshot parent %u missing pointer to child %llu", ++ id, s.k->p.offset); ++ return -EINVAL; ++ } ++ } ++ ++ for (i = 0; i < 2 && s.v->children[i]; i++) { ++ id = le32_to_cpu(s.v->children[i]); ++ ++ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); ++ if (ret == -ENOENT) ++ bch_err(trans->c, "snapshot node %llu has nonexistent child %u", ++ s.k->p.offset, id); ++ if (ret) ++ return ret; ++ ++ if (le32_to_cpu(v.parent) != s.k->p.offset) { ++ bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)", ++ id, le32_to_cpu(v.parent), s.k->p.offset); ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++int bch2_fs_snapshots_check(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_snapshot s; ++ unsigned id; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_snapshot) ++ continue; ++ ++ ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) { ++ bch_err(c, "error %i checking snapshots", ret); ++ goto err; ++ } ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, ++ POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_subvolume) ++ continue; ++again_2: ++ id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); ++ ret = snapshot_lookup(&trans, id, &s); ++ ++ if (ret == -EINTR) { ++ k = bch2_btree_iter_peek(&iter); ++ goto again_2; ++ } else if (ret == -ENOENT) ++ bch_err(c, "subvolume %llu points to nonexistent snapshot %u", ++ k.k->p.offset, id); ++ else if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++void bch2_fs_snapshots_exit(struct bch_fs *c) ++{ ++ genradix_free(&c->snapshots); ++} ++ ++int bch2_fs_snapshots_start(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ bool have_deleted = false; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) ++ break; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch_err(c, "found wrong key type %u in snapshot node 
table", ++ k.k->type); ++ continue; ++ } ++ ++ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) ++ have_deleted = true; ++ ++ ret = bch2_mark_snapshot(c, bkey_s_c_null, k, 0, 0); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) ++ goto err; ++ ++ ret = bch2_snapshots_set_equiv(&trans); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_exit(&trans); ++ ++ if (!ret && have_deleted) { ++ bch_info(c, "restarting deletion of dead snapshots"); ++ if (c->opts.fsck) { ++ bch2_delete_dead_snapshots_work(&c->snapshot_delete_work); ++ } else { ++ bch2_delete_dead_snapshots(c); ++ } ++ } ++ ++ return ret; ++} ++ ++/* ++ * Mark a snapshot as deleted, for future cleanup: ++ */ ++static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_snapshot *s; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ /* already deleted? */ ++ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) ++ goto err; ++ ++ s = bch2_trans_kmalloc(trans, sizeof(*s)); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&s->k_i, k); ++ ++ SET_BCH_SNAPSHOT_DELETED(&s->v, true); ++ ret = bch2_trans_update(trans, &iter, &s->k_i, 0); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) ++{ ++ struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; ++ struct bkey_s_c k; ++ struct bkey_s_c_snapshot s; ++ struct bkey_i_snapshot *parent; ++ u32 parent_id; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ s = bkey_s_c_to_snapshot(k); ++ ++ BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); ++ parent_id = le32_to_cpu(s.v->parent); ++ ++ if (parent_id) { ++ bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots, ++ POS(0, parent_id), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&p_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ parent = bch2_trans_kmalloc(trans, sizeof(*parent)); ++ ret = PTR_ERR_OR_ZERO(parent); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&parent->k_i, k); ++ ++ for (i = 0; i < 2; i++) ++ if (le32_to_cpu(parent->v.children[i]) == id) ++ break; ++ ++ if (i == 2) ++ bch_err(trans->c, "snapshot %u missing child pointer to %u", ++ parent_id, id); ++ else ++ parent->v.children[i] = 0; ++ ++ if (le32_to_cpu(parent->v.children[0]) < ++ le32_to_cpu(parent->v.children[1])) ++ swap(parent->v.children[0], ++ parent->v.children[1]); ++ ++ ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &p_iter); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int 
bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ struct btree_iter iter; ++ struct bkey_i_snapshot *n; ++ struct bkey_s_c k; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, ++ POS_MIN, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ for (i = 0; i < nr_snapids; i++) { ++ k = bch2_btree_iter_prev_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || !k.k->p.offset) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ n = bch2_trans_kmalloc(trans, sizeof(*n)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_snapshot_init(&n->k_i); ++ n->k.p = iter.pos; ++ n->v.flags = 0; ++ n->v.parent = cpu_to_le32(parent); ++ n->v.subvol = cpu_to_le32(snapshot_subvols[i]); ++ n->v.pad = 0; ++ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); ++ ++ bch2_trans_update(trans, &iter, &n->k_i, 0); ++ ++ ret = bch2_mark_snapshot(trans->c, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0, 0); ++ if (ret) ++ break; ++ ++ new_snapids[i] = iter.pos.offset; ++ } ++ ++ if (parent) { ++ bch2_btree_iter_set_pos(&iter, POS(0, parent)); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch_err(trans->c, "snapshot %u not found", parent); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ n = bch2_trans_kmalloc(trans, sizeof(*n)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(&n->k_i, k); ++ ++ if (n->v.children[0] || n->v.children[1]) { ++ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ n->v.children[0] = cpu_to_le32(new_snapids[0]); ++ n->v.children[1] = cpu_to_le32(new_snapids[1]); ++ SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); ++ bch2_trans_update(trans, &iter, &n->k_i, 0); ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* List of snapshot IDs that are being deleted: */ ++struct snapshot_id_list { ++ u32 nr; ++ u32 size; ++ u32 *d; ++}; ++ ++static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) ++{ ++ unsigned i; ++ ++ for (i = 0; i < s->nr; i++) ++ if (id == s->d[i]) ++ return true; ++ return false; ++} ++ ++static int snapshot_id_add(struct snapshot_id_list *s, u32 id) ++{ ++ BUG_ON(snapshot_list_has_id(s, id)); ++ ++ if (s->nr == s->size) { ++ size_t new_size = max(8U, s->size * 2); ++ void *n = krealloc(s->d, ++ new_size * sizeof(s->d[0]), ++ GFP_KERNEL); ++ if (!n) { ++ pr_err("error allocating snapshot ID list"); ++ return -ENOMEM; ++ } ++ ++ s->d = n; ++ s->size = new_size; ++ }; ++ ++ s->d[s->nr++] = id; ++ return 0; ++} ++ ++static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, ++ struct snapshot_id_list *deleted, ++ enum btree_id btree_id) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct snapshot_id_list equiv_seen = { 0 }; ++ struct bpos last_pos = POS_MIN; ++ int ret = 0; ++ ++ /* ++ * XXX: We should also delete whiteouts that no longer overwrite ++ * anything ++ */ ++ ++ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ ++ while ((bch2_trans_begin(trans), ++ (k = bch2_btree_iter_peek(&iter)).k) && ++ !(ret = bkey_err(k))) { ++ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; ++ ++ if (bkey_cmp(k.k->p, 
last_pos)) ++ equiv_seen.nr = 0; ++ last_pos = k.k->p; ++ ++ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || ++ snapshot_list_has_id(&equiv_seen, equiv)) { ++ if (btree_id == BTREE_ID_inodes && ++ bch2_btree_key_cache_flush(trans, btree_id, iter.pos)) ++ continue; ++ ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(trans, &iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); ++ if (ret) ++ break; ++ } else { ++ ret = snapshot_id_add(&equiv_seen, equiv); ++ if (ret) ++ break; ++ } ++ ++ bch2_btree_iter_advance(&iter); ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ kfree(equiv_seen.d); ++ ++ return ret; ++} ++ ++static void bch2_delete_dead_snapshots_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_snapshot snap; ++ struct snapshot_id_list deleted = { 0 }; ++ u32 i, id, children[2]; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * For every snapshot node: If we have no live children and it's not ++ * pointed to by a subvolume, delete it: ++ */ ++ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_snapshot) ++ continue; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v) || ++ BCH_SNAPSHOT_SUBVOL(snap.v)) ++ continue; ++ ++ children[0] = le32_to_cpu(snap.v->children[0]); ++ children[1] = le32_to_cpu(snap.v->children[1]); ++ ++ ret = snapshot_live(&trans, children[0]) ?: ++ snapshot_live(&trans, children[1]); ++ if (ret < 0) ++ break; ++ if (ret) ++ continue; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_snapshot_node_set_deleted(&trans, iter.pos.offset)); ++ if (ret) { ++ bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret); ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) { ++ bch_err(c, "error walking snapshots: %i", ret); ++ goto err; ++ } ++ ++ ret = bch2_snapshots_set_equiv(&trans); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_snapshot) ++ continue; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v)) { ++ ret = snapshot_id_add(&deleted, k.k->p.offset); ++ if (ret) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) { ++ bch_err(c, "error walking snapshots: %i", ret); ++ goto err; ++ } ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ if (!btree_type_has_snapshots(id)) ++ continue; ++ ++ ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id); ++ if (ret) { ++ bch_err(c, "error deleting snapshot keys: %i", ret); ++ goto err; ++ } ++ } ++ ++ for (i = 0; i < deleted.nr; i++) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_snapshot_node_delete(&trans, deleted.d[i])); ++ if (ret) { ++ bch_err(c, "error deleting snapshot %u: %i", ++ deleted.d[i], ret); ++ goto err; ++ } ++ } ++err: ++ kfree(deleted.d); ++ bch2_trans_exit(&trans); ++ percpu_ref_put(&c->writes); ++} ++ ++static void bch2_delete_dead_snapshots(struct bch_fs *c) ++{ ++ if (unlikely(!percpu_ref_tryget(&c->writes))) ++ return; ++ ++ if (!queue_work(system_long_wq, &c->snapshot_delete_work)) ++ percpu_ref_put(&c->writes); ++} ++ ++static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *h) ++{ ++ bch2_delete_dead_snapshots(trans->c); ++ return 
0; ++} ++ ++/* Subvolumes: */ ++ ++const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0) ++ return "invalid pos"; ++ ++ if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) ++ return "invalid pos"; ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) ++ return "bad val size"; ++ ++ return NULL; ++} ++ ++void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); ++ ++ pr_buf(out, "root %llu snapshot id %u", ++ le64_to_cpu(s.v->inode), ++ le32_to_cpu(s.v->snapshot)); ++} ++ ++int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, ++ u32 *snapid) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, ++ POS(0, subvol), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); ++ ret = -EIO; ++ goto err; ++ } ++ ++ *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* XXX: mark snapshot id for deletion, walk btree and delete: */ ++int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, ++ int deleting_snapshot) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_subvolume subvol; ++ struct btree_trans_commit_hook *h; ++ struct bkey_i *delete; ++ u32 snapid; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, ++ POS(0, subvolid), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); ++ ret = -EIO; ++ goto err; ++ } ++ ++ subvol = bkey_s_c_to_subvolume(k); ++ snapid = le32_to_cpu(subvol.v->snapshot); ++ ++ if (deleting_snapshot >= 0 && ++ deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ ret = PTR_ERR_OR_ZERO(delete); ++ if (ret) ++ goto err; ++ ++ bkey_init(&delete->k); ++ delete->k.p = iter.pos; ++ ret = bch2_trans_update(trans, &iter, delete, 0); ++ if (ret) ++ goto err; ++ ++ ret = bch2_snapshot_node_set_deleted(trans, snapid); ++ ++ h = bch2_trans_kmalloc(trans, sizeof(*h)); ++ ret = PTR_ERR_OR_ZERO(h); ++ if (ret) ++ goto err; ++ ++ h->fn = bch2_delete_dead_snapshots_hook; ++ bch2_trans_commit_hook(trans, h); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_subvolume_create(struct btree_trans *trans, u64 inode, ++ u32 src_subvolid, ++ u32 *new_subvolid, ++ u32 *new_snapshotid, ++ bool ro) ++{ ++ struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; ++ struct bkey_i_subvolume *new_subvol = NULL; ++ struct bkey_i_subvolume *src_subvol = NULL; ++ struct bkey_s_c k; ++ u32 parent = 0, new_nodes[2], snapshot_subvols[2]; ++ int ret = 0; ++ ++ for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) ++ break; ++ if (bkey_deleted(k.k)) ++ goto found_slot; ++ } ++ ++ if (!ret) ++ ret = -ENOSPC; ++ goto err; ++found_slot: ++ snapshot_subvols[0] = dst_iter.pos.offset; ++ 
snapshot_subvols[1] = src_subvolid; ++ ++ if (src_subvolid) { ++ /* Creating a snapshot: */ ++ src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol)); ++ ret = PTR_ERR_OR_ZERO(src_subvol); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes, ++ POS(0, src_subvolid), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&src_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch_err(trans->c, "subvolume %u not found", src_subvolid); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ bkey_reassemble(&src_subvol->k_i, k); ++ parent = le32_to_cpu(src_subvol->v.snapshot); ++ } ++ ++ ret = bch2_snapshot_node_create(trans, parent, new_nodes, ++ snapshot_subvols, ++ src_subvolid ? 2 : 1); ++ if (ret) ++ goto err; ++ ++ if (src_subvolid) { ++ src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); ++ bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); ++ } ++ ++ new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); ++ ret = PTR_ERR_OR_ZERO(new_subvol); ++ if (ret) ++ goto err; ++ ++ bkey_subvolume_init(&new_subvol->k_i); ++ new_subvol->v.flags = 0; ++ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); ++ new_subvol->v.inode = cpu_to_le64(inode); ++ SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); ++ SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); ++ new_subvol->k.p = dst_iter.pos; ++ bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); ++ ++ *new_subvolid = new_subvol->k.p.offset; ++ *new_snapshotid = new_nodes[0]; ++err: ++ bch2_trans_iter_exit(trans, &src_iter); ++ bch2_trans_iter_exit(trans, &dst_iter); ++ return ret; ++} ++ ++int bch2_fs_subvolumes_init(struct bch_fs *c) ++{ ++ INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); ++ return 0; ++} +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +new file mode 100644 +index 000000000000..cea4c665af32 +--- /dev/null ++++ b/fs/bcachefs/subvolume.h +@@ -0,0 +1,77 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUBVOLUME_H ++#define _BCACHEFS_SUBVOLUME_H ++ ++void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_snapshot (struct bkey_ops) { \ ++ .key_invalid = bch2_snapshot_invalid, \ ++ .val_to_text = bch2_snapshot_to_text, \ ++} ++ ++int bch2_mark_snapshot(struct bch_fs *, struct bkey_s_c, ++ struct bkey_s_c, u64, unsigned); ++ ++static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) ++{ ++ return genradix_ptr(&c->snapshots, U32_MAX - id); ++} ++ ++static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->parent; ++} ++ ++static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) ++{ ++ struct snapshot_t *s = snapshot_t(c, id); ++ ++ return s->children[0] || s->children[1]; ++} ++ ++static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) ++{ ++ struct snapshot_t *s; ++ u32 parent = bch2_snapshot_parent(c, id); ++ ++ if (!parent) ++ return 0; ++ ++ s = snapshot_t(c, bch2_snapshot_parent(c, id)); ++ if (id == s->children[0]) ++ return s->children[1]; ++ if (id == s->children[1]) ++ return s->children[0]; ++ return 0; ++} ++ ++static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ while (id && id < ancestor) ++ id = bch2_snapshot_parent(c, id); ++ ++ return id == ancestor; ++} ++ ++int bch2_fs_snapshots_check(struct bch_fs *); ++void 
bch2_fs_snapshots_exit(struct bch_fs *); ++int bch2_fs_snapshots_start(struct bch_fs *); ++ ++const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_subvolume (struct bkey_ops) { \ ++ .key_invalid = bch2_subvolume_invalid, \ ++ .val_to_text = bch2_subvolume_to_text, \ ++} ++ ++int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); ++ ++int bch2_subvolume_delete(struct btree_trans *, u32, int); ++int bch2_subvolume_create(struct btree_trans *, u64, u32, ++ u32 *, u32 *, bool); ++ ++int bch2_fs_subvolumes_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_SUBVOLUME_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 8f8476613594..1feb7dee2e0c 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -39,6 +39,7 @@ + #include "rebalance.h" + #include "recovery.h" + #include "replicas.h" ++#include "subvolume.h" + #include "super.h" + #include "super-io.h" + #include "sysfs.h" +@@ -468,6 +469,7 @@ static void __bch2_fs_free(struct bch_fs *c) + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_exit(&c->times[i]); + ++ bch2_fs_snapshots_exit(c); + bch2_fs_quota_exit(c); + bch2_fs_fsio_exit(c); + bch2_fs_ec_exit(c); +@@ -686,6 +688,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + mutex_init(&c->usage_scratch_lock); + + mutex_init(&c->bio_bounce_pages_lock); ++ mutex_init(&c->snapshot_table_lock); + + spin_lock_init(&c->btree_write_error_lock); + +@@ -789,6 +792,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_fs_btree_key_cache_init(&c->btree_key_cache) || + bch2_fs_btree_iter_init(c) || + bch2_fs_btree_interior_update_init(c) || ++ bch2_fs_subvolumes_init(c) || + bch2_fs_io_init(c) || + bch2_fs_encryption_init(c) || + bch2_fs_compress_init(c) || +-- +cgit v1.2.3 + + +From 81c2f91103e2b99625379eb028e3b01e7c81e511 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 16 Mar 2021 00:46:26 -0400 +Subject: bcachefs: Add support for dirents that point to subvolumes + +Dirents currently always point to inodes. Subvolumes add a new type of +dirent, with d_type DT_SUBVOL, that instead points to an entry in the +subvolumes btree, and the subvolume has a pointer to the root inode. + +This patch adds bch2_dirent_read_target() to get the inode (and +potentially subvolume) a dirent points to, and changes existing code to +use that instead of reading from d_inum directly. 
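Roughly, the target resolution added below works in two steps: for an ordinary dirent, d_inum is the target inode number and is used directly; for a DT_SUBVOL dirent, d_inum is reused as a subvolume ID, which is looked up in the subvolumes btree to obtain that subvolume's snapshot ID and root inode. A simplified, self-contained sketch of that control flow, using stand-in struct layouts and a stub in place of the real btree lookup:

#include <errno.h>
#include <stdint.h>

#define DT_SUBVOL 16	/* the new d_type value introduced earlier in this series */

struct dirent_sketch { uint8_t d_type; uint64_t d_inum; };
struct subvol_sketch { uint32_t snapshot; uint64_t root_inode; };

/*
 * Stand-in for the real lookup in BTREE_ID_subvolumes; the kernel code walks
 * the btree (or key cache) here, this stub just reports "not found".
 */
static int subvolume_lookup(uint32_t subvol_id, struct subvol_sketch *out)
{
	(void) subvol_id;
	(void) out;
	return -ENOENT;
}

/* Mirrors the shape of __bch2_dirent_read_target() in the hunk below: */
static int dirent_read_target(const struct dirent_sketch *d, uint32_t d_snapshot,
			      uint32_t *subvol, uint32_t *snapshot, uint64_t *inum)
{
	struct subvol_sketch s;
	int ret;

	*subvol   = 0;
	*snapshot = d_snapshot;

	if (d->d_type != DT_SUBVOL) {
		/* Ordinary dirent: d_inum is the target inode number. */
		*inum = d->d_inum;
		return 0;
	}

	/* DT_SUBVOL dirent: d_inum is a subvolume ID, not an inode number. */
	ret = subvolume_lookup((uint32_t) d->d_inum, &s);
	if (ret)
		return ret;

	*subvol   = (uint32_t) d->d_inum;
	*snapshot = s.snapshot;		/* the subvolume's current snapshot */
	*inum     = s.root_inode;	/* the subvolume's root inode */
	return 0;
}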
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 105 +++++++++++++++++++++++++++++++++++++++--------- + fs/bcachefs/dirent.h | 14 ++++++- + fs/bcachefs/fs-common.c | 9 +---- + fs/bcachefs/fsck.c | 23 ++++++++++- + 4 files changed, 123 insertions(+), 28 deletions(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 53c7687a9ca8..f3aef0686928 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -177,6 +177,61 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, + dst->v.d_type = src.v->d_type; + } + ++int __bch2_dirent_read_target(struct btree_trans *trans, ++ struct bkey_s_c_dirent d, ++ u32 *subvol, u32 *snapshot, u64 *inum, ++ bool is_fsck) ++{ ++ int ret = 0; ++ ++ *subvol = 0; ++ *snapshot = d.k->p.snapshot; ++ ++ if (likely(d.v->d_type != DT_SUBVOL)) { ++ *inum = le64_to_cpu(d.v->d_inum); ++ } else { ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_subvolume s; ++ int ret; ++ ++ *subvol = le64_to_cpu(d.v->d_inum); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, ++ POS(0, *subvol), ++ BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ s = bkey_s_c_to_subvolume(k); ++ *snapshot = le32_to_cpu(s.v->snapshot); ++ *inum = le64_to_cpu(s.v->inode); ++err: ++ if (ret == -ENOENT && !is_fsck) ++ bch2_fs_inconsistent(trans->c, "pointer to missing subvolume %u", ++ *subvol); ++ ++ bch2_trans_iter_exit(trans, &iter); ++ } ++ ++ return ret; ++} ++ ++int bch2_dirent_read_target(struct btree_trans *trans, ++ struct bkey_s_c_dirent d, u64 *target) ++{ ++ u32 subvol, snapshot; ++ ++ return __bch2_dirent_read_target(trans, d, &subvol, ++ &snapshot, target, false); ++} ++ + int bch2_dirent_rename(struct btree_trans *trans, + u64 src_dir, struct bch_hash_info *src_hash, + u64 dst_dir, struct bch_hash_info *dst_hash, +@@ -323,10 +378,32 @@ int __bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + u64 dir_inum, + const struct bch_hash_info *hash_info, +- const struct qstr *name, unsigned flags) ++ const struct qstr *name, u64 *inum, ++ unsigned flags) + { +- return bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, +- hash_info, dir_inum, name, flags); ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ int ret; ++ ++ ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, ++ hash_info, dir_inum, name, flags); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ret; ++ } ++ ++ d = bkey_s_c_to_dirent(k); ++ ++ ret = bch2_dirent_read_target(trans, d, inum); ++ if (ret) ++ bch2_trans_iter_exit(trans, iter); ++ ++ return ret; + } + + u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, +@@ -335,26 +412,18 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, + { + struct btree_trans trans; + struct btree_iter iter; +- struct bkey_s_c k; + u64 inum = 0; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum, hash_info, ++ name, &inum, 0); + +- ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum, +- hash_info, name, 0); +- if (ret) +- goto out; +- +- k = bch2_btree_iter_peek_slot(&iter); +- ret = bkey_err(k); +- if (ret) +- goto out; +- +- inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + bch2_trans_iter_exit(&trans, &iter); +-out: +- BUG_ON(ret 
== -EINTR); ++ if (ret == -EINTR) ++ goto retry; + bch2_trans_exit(&trans); + return inum; + } +@@ -408,7 +477,7 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) + if (!dir_emit(ctx, dirent.v->d_name, + bch2_dirent_name_bytes(dirent), + le64_to_cpu(dirent.v->d_inum), +- dirent.v->d_type)) ++ vfs_d_type(dirent.v->d_type))) + break; + ctx->pos = dirent.k->p.offset + 1; + } +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index c14f6029e1c9..3cd05a2454e1 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -37,6 +37,17 @@ int bch2_dirent_delete_at(struct btree_trans *, + const struct bch_hash_info *, + struct btree_iter *); + ++int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, ++ u32 *, u32 *, u64 *, bool); ++ ++int bch2_dirent_read_target(struct btree_trans *, ++ struct bkey_s_c_dirent, u64 *); ++ ++static inline unsigned vfs_d_type(unsigned type) ++{ ++ return type == DT_SUBVOL ? DT_DIR : type; ++} ++ + enum bch_rename_mode { + BCH_RENAME, + BCH_RENAME_OVERWRITE, +@@ -52,7 +63,8 @@ int bch2_dirent_rename(struct btree_trans *, + + int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64, + const struct bch_hash_info *, +- const struct qstr *, unsigned); ++ const struct qstr *, u64 *, ++ unsigned); + u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, + const struct qstr *); + +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 6bc82559c9b1..96b09b005d0b 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -159,17 +159,10 @@ int bch2_unlink_trans(struct btree_trans *trans, + dir_hash = bch2_hash_info_init(c, dir_u); + + ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash, +- name, BTREE_ITER_INTENT); ++ name, &inum, BTREE_ITER_INTENT); + if (ret) + goto err; + +- k = bch2_btree_iter_peek_slot(&dirent_iter); +- ret = bkey_err(k); +- if (ret) +- goto err; +- +- inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); +- + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + goto err; +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 62158c0803db..dca4abda2c41 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -723,6 +723,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c_dirent d; + struct bch_inode_unpacked target; + u32 target_snapshot; ++ u32 target_subvol; + bool have_target; + bool backpointer_exists = true; + u64 d_inum; +@@ -783,6 +784,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); + ++ ret = bch2_dirent_read_target(trans, d, &d_inum); ++ if (ret && ret != -ENOENT) ++ return ret; ++ + ret = __lookup_inode(trans, d_inum, &target, &target_snapshot); + if (ret && ret != -ENOENT) + return ret; +@@ -855,7 +860,23 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + } + } + +- if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, ++ target_subvol = d.v->d_type == DT_SUBVOL ++ ? 
le64_to_cpu(d.v->d_inum) : 0; ++ ++ if (fsck_err_on(target.bi_subvol != target_subvol, c, ++ "subvol root %llu has wrong subvol field:\n" ++ "got %u\n" ++ "should be %u", ++ target.bi_inum, ++ target.bi_subvol, ++ target_subvol)) { ++ target.bi_subvol = target_subvol; ++ ++ ret = write_inode(trans, &target, target_snapshot); ++ return ret ?: -EINTR; ++ } ++ ++ if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target.bi_mode), c, + "incorrect d_type: should be %u:\n%s", + mode_to_type(target.bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, +-- +cgit v1.2.3 + + +From 3aa5f0753690cc1a039d0dfd5f4830f4befeaa18 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 19 Apr 2021 23:31:40 -0400 +Subject: bcachefs: Per subvolume lost+found + +On existing filesystems, we have a single global lost+found. Introducing +subvolumes means we need to introduce per subvolume lost+found +directories, because inodes are added to lost+found by their inode +number, and inode numbers are now only unique within a subvolume. + +This patch adds support to fsck for per subvolume lost+found. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 84 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index dca4abda2c41..e4ca05aae76c 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -39,6 +39,71 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) + return ret ?: sectors; + } + ++static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, ++ u32 *subvol) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, ++ POS(0, snapshot), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch_err(trans->c, "snapshot %u not fonud", snapshot); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++ ++} ++ ++static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, ++ u32 *subvol) ++{ ++ return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol)); ++} ++ ++static int __subvol_lookup_root(struct btree_trans *trans, u32 subvol, ++ u64 *inum) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, ++ POS(0, subvol), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch_err(trans->c, "subvolume %u not fonud", subvol); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ *inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++ ++} ++ ++static int subvol_lookup_root(struct btree_trans *trans, u32 subvol, u64 *inum) ++{ ++ return lockrestart_do(trans, __subvol_lookup_root(trans, subvol, inum)); ++} ++ + static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) +@@ -136,6 +201,7 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos) + + /* Get lost+found, create if it doesn't exist: */ + static int lookup_lostfound(struct btree_trans *trans, ++ u32 subvol, + struct bch_inode_unpacked *lostfound) + { + struct bch_fs *c = trans->c; +@@ -146,12 +212,14 @@ static int 
lookup_lostfound(struct btree_trans *trans, + u32 snapshot; + int ret; + +- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot); ++ ret = subvol_lookup_root(trans, subvol, &inum); ++ ++ ret = lookup_inode(trans, inum, &root, &snapshot); + if (ret && ret != -ENOENT) + return ret; + + root_hash_info = bch2_hash_info_init(c, &root); +- inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, ++ inum = bch2_dirent_lookup(c, root.bi_inum, &root_hash_info, + &lostfound_str); + if (!inum) { + bch_notice(c, "creating lost+found"); +@@ -188,16 +256,22 @@ create_lostfound: + } + + static int reattach_inode(struct btree_trans *trans, +- struct bch_inode_unpacked *inode) ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) + { + struct bch_hash_info dir_hash; + struct bch_inode_unpacked lostfound; + char name_buf[20]; + struct qstr name; + u64 dir_offset = 0; ++ u32 subvol; + int ret; + +- ret = lookup_lostfound(trans, &lostfound); ++ ret = snapshot_lookup_subvol(trans, snapshot, &subvol); ++ if (ret) ++ return ret; ++ ++ ret = lookup_lostfound(trans, subvol, &lostfound); + if (ret) + return ret; + +@@ -1063,10 +1137,10 @@ static int path_down(struct pathbuf *p, u64 inum) + + static int check_path(struct btree_trans *trans, + struct pathbuf *p, +- struct bch_inode_unpacked *inode) ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) + { + struct bch_fs *c = trans->c; +- u32 snapshot; + size_t i; + int ret = 0; + +@@ -1085,7 +1159,7 @@ static int check_path(struct btree_trans *trans, + inode->bi_nlink, + inode->bi_dir, + inode->bi_dir_offset)) +- ret = reattach_inode(trans, inode); ++ ret = reattach_inode(trans, inode, snapshot); + break; + } + ret = 0; +@@ -1108,13 +1182,13 @@ static int check_path(struct btree_trans *trans, + return 0; + + ret = lockrestart_do(trans, +- remove_backpointer(trans, inode)); ++ remove_backpointer(trans, inode)); + if (ret) { + bch_err(c, "error removing dirent: %i", ret); + break; + } + +- ret = reattach_inode(trans, inode); ++ ret = reattach_inode(trans, inode, snapshot); + break; + } + +@@ -1160,7 +1234,7 @@ static int check_directory_structure(struct bch_fs *c) + break; + } + +- ret = check_path(&trans, &path, &u); ++ ret = check_path(&trans, &path, &u, iter.pos.snapshot); + if (ret) + break; + } +-- +cgit v1.2.3 + + +From c376c45c93666b54cc89cf91252146c508ad8121 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 16 Mar 2021 01:33:39 -0400 +Subject: bcachefs: Add subvolume to ei_inode_info + +Filesystem operations generally operate within a subvolume: at the start +of every btree transaction we'll be looking up (and locking) the +subvolume to get the current snapshot ID, which we then use for our +other btree lookups in BTREE_ITER_FILTER_SNAPSHOTS mode. + +But inodes don't record what subvolume they're in - they can't, because +if they did we'd have to update every single inode within a subvolume +when taking a snapshot in order to keep that field up to date. So it +needs to be tracked in memory, based on how we got to that inode. + +Hence this patch adds a subvolume field to ei_inode_info, and switches +to iget5() so we can index by it in the inode hash table. 
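+
+Indexing the inode hash by the (subvolume, inode number) pair follows the
+usual iget5_locked() pattern. Roughly, with illustrative names only -- the
+real helpers added below are bch2_iget5_test(), bch2_iget5_set() and
+bch2_inode_hash():
+
+  /* illustrative sketch; not the exact helpers added by this patch */
+  struct key { u32 subvol; u64 inum; };
+
+  static int test(struct inode *vinode, void *p)
+  {
+          struct key *k = p;
+          struct bch_inode_info *ei = to_bch_ei(vinode);
+
+          /* i_ino alone is no longer unique; both fields must match */
+          return ei->ei_subvol == k->subvol &&
+                 ei->ei_inode.bi_inum == k->inum;
+  }
+
+  static int set(struct inode *vinode, void *p)
+  {
+          struct key *k = p;
+
+          vinode->i_ino = k->inum;
+          to_bch_ei(vinode)->ei_subvol = k->subvol;
+          return 0;
+  }
+
+  /* lookup or allocate, hashing both halves of the key: */
+  vinode = iget5_locked(sb, jhash_3words(k.subvol, k.inum >> 32, k.inum, 0),
+                        test, set, &k);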
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-ioctl.c | 6 ++-- + fs/bcachefs/fs.c | 85 ++++++++++++++++++++++++++++++++++++-------------- + fs/bcachefs/fs.h | 12 ++++++- + 3 files changed, 76 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +index 91a0e761c8e7..494879b5bed3 100644 +--- a/fs/bcachefs/fs-ioctl.c ++++ b/fs/bcachefs/fs-ioctl.c +@@ -192,7 +192,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + char *kname = NULL; + struct qstr qstr; + int ret = 0; +- u64 inum; ++ subvol_inum inum = { .subvol = 1 }; + + kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + if (!kname) +@@ -206,9 +206,9 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + qstr.name = kname; + + ret = -ENOENT; +- inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, ++ inum.inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, + &qstr); +- if (!inum) ++ if (!inum.inum) + goto err1; + + vinode = bch2_vfs_inode_get(c, inum); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index bf1e519aa728..df57680cfb45 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -37,7 +37,7 @@ + + static struct kmem_cache *bch2_inode_cache; + +-static void bch2_vfs_inode_init(struct bch_fs *, ++static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum, + struct bch_inode_info *, + struct bch_inode_unpacked *); + +@@ -209,40 +209,68 @@ int bch2_fs_quota_transfer(struct bch_fs *c, + return ret; + } + +-struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) ++static int bch2_iget5_test(struct inode *vinode, void *p) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ subvol_inum *inum = p; ++ ++ return inode->ei_subvol == inum->subvol && ++ inode->ei_inode.bi_inum == inum->inum; ++} ++ ++static int bch2_iget5_set(struct inode *vinode, void *p) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ subvol_inum *inum = p; ++ ++ inode->v.i_ino = inum->inum; ++ inode->ei_subvol = inum->subvol; ++ inode->ei_inode.bi_inum = inum->inum; ++ return 0; ++} ++ ++static unsigned bch2_inode_hash(subvol_inum inum) ++{ ++ return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + { + struct bch_inode_unpacked inode_u; + struct bch_inode_info *inode; + int ret; + +- inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); ++ /* ++ * debug assert, to be removed when we start creating ++ * subvolumes/snapshots: ++ */ ++ BUG_ON(inum.subvol != BCACHEFS_ROOT_SUBVOL); ++ ++ inode = to_bch_ei(iget5_locked(c->vfs_sb, ++ bch2_inode_hash(inum), ++ bch2_iget5_test, ++ bch2_iget5_set, ++ &inum)); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->v.i_state & I_NEW)) + return &inode->v; + +- ret = bch2_inode_find_by_inum(c, inum, &inode_u); ++ ret = bch2_inode_find_by_inum(c, inum.inum, &inode_u); + if (ret) { + iget_failed(&inode->v); + return ERR_PTR(ret); + } + +- bch2_vfs_inode_init(c, inode, &inode_u); ++ bch2_vfs_inode_init(c, inum, inode, &inode_u); + +- inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); ++ inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum); + + unlock_new_inode(&inode->v); + + return &inode->v; + } + +-static int inum_test(struct inode *inode, void *p) +-{ +- unsigned long *ino = p; +- +- return *ino == inode->i_ino; +-} +- + static struct bch_inode_info * + __bch2_create(struct user_namespace *mnt_userns, + struct bch_inode_info *dir, struct dentry *dentry, +@@ -254,6 +282,7 @@ __bch2_create(struct user_namespace 
*mnt_userns, + struct bch_inode_info *inode, *old; + struct bch_inode_unpacked inode_u; + struct posix_acl *default_acl = NULL, *acl = NULL; ++ subvol_inum inum; + u64 journal_seq = 0; + int ret; + +@@ -310,7 +339,10 @@ err_before_quota: + mutex_unlock(&dir->ei_update_lock); + } + +- bch2_vfs_inode_init(c, inode, &inode_u); ++ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; ++ inum.inum = inode_u.bi_inum; ++ ++ bch2_vfs_inode_init(c, inum, inode, &inode_u); + journal_seq_copy(c, inode, journal_seq); + + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); +@@ -323,8 +355,12 @@ err_before_quota: + */ + + inode->v.i_state |= I_CREATING; +- old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, +- inum_test, NULL, &inode->v.i_ino)); ++ ++ old = to_bch_ei(inode_insert5(&inode->v, ++ bch2_inode_hash(inum), ++ bch2_iget5_test, ++ bch2_iget5_set, ++ &inum)); + BUG_ON(!old); + + if (unlikely(old != inode)) { +@@ -370,12 +406,12 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, + struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); + struct inode *vinode = NULL; +- u64 inum; ++ subvol_inum inum = { .subvol = 1 }; + +- inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, ++ inum.inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, + &dentry->d_name); + +- if (inum) ++ if (inum.inum) + vinode = bch2_vfs_inode_get(c, inum); + + return d_splice_alias(vinode, dentry); +@@ -1097,6 +1133,7 @@ static const struct address_space_operations bch_address_space_operations = { + .error_remove_page = generic_error_remove_page, + }; + ++#if 0 + static struct inode *bch2_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) + { +@@ -1130,14 +1167,15 @@ static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + bch2_nfs_get_inode); + } ++#endif + + static const struct export_operations bch_export_ops = { +- .fh_to_dentry = bch2_fh_to_dentry, +- .fh_to_parent = bch2_fh_to_parent, ++ //.fh_to_dentry = bch2_fh_to_dentry, ++ //.fh_to_parent = bch2_fh_to_parent, + //.get_parent = bch2_get_parent, + }; + +-static void bch2_vfs_inode_init(struct bch_fs *c, ++static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi) + { +@@ -1153,6 +1191,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, + inode->ei_journal_seq = 0; + inode->ei_quota_reserved = 0; + inode->ei_qid = bch_qid(bi); ++ inode->ei_subvol = inum.subvol; + + inode->v.i_mapping->a_ops = &bch_address_space_operations; + +@@ -1594,7 +1633,7 @@ got_sb: + sb->s_flags |= SB_POSIXACL; + #endif + +- vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); ++ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); + if (IS_ERR(vinode)) { + bch_err(c, "error mounting: error getting root inode %i", + (int) PTR_ERR(vinode)); +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index 36cc6ba2d644..ac6617594916 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -45,10 +45,20 @@ struct bch_inode_info { + struct mutex ei_quota_lock; + struct bch_qid ei_qid; + ++ u32 ei_subvol; ++ + /* copy of inode in btree: */ + struct bch_inode_unpacked ei_inode; + }; + ++static inline subvol_inum inode_inum(struct bch_inode_info *inode) ++{ ++ return (subvol_inum) { ++ .subvol = inode->ei_subvol, ++ .inum = inode->ei_inode.bi_inum, ++ }; ++} ++ + /* + * Set if we've gotten a btree error for this inode, and thus the vfs inode and + * btree 
inode may be inconsistent: +@@ -154,7 +164,7 @@ static inline int bch2_set_projid(struct bch_fs *c, + KEY_TYPE_QUOTA_PREALLOC); + } + +-struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); ++struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); + + /* returns 0 if we want to do the update, or error is passed up */ + typedef int (*inode_set_fn)(struct bch_inode_info *, +-- +cgit v1.2.3 + + +From 8e8a3bd1bede3924b030ff4b1826c16de8b138c7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 4 Mar 2021 22:29:25 -0500 +Subject: bcachefs: BTREE_ITER_FILTER_SNAPSHOTS + +For snapshots, we need to implement btree lookups that return the first +key that's an ancestor of the snapshot ID the lookup is being done in - +and filter out keys in unrelated snapshots. This patch adds the btree +iterator flag BTREE_ITER_FILTER_SNAPSHOTS which does that filtering. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 168 ++++++++++++++++++++++++++++++++++++++---- + fs/bcachefs/btree_iter.h | 9 +++ + fs/bcachefs/btree_key_cache.c | 3 +- + fs/bcachefs/btree_types.h | 1 + + 4 files changed, 166 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 13e85096dd41..ac071b60fda0 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -13,6 +13,7 @@ + #include "extents.h" + #include "journal.h" + #include "replicas.h" ++#include "subvolume.h" + + #include + #include +@@ -689,6 +690,55 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) + bkey_cmp(iter->pos, iter->k.p) > 0); + } + ++static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_iter copy; ++ struct bkey_s_c prev; ++ int ret = 0; ++ ++ if (!bch2_debug_check_iterators) ++ return 0; ++ ++ if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) ++ return 0; ++ ++ if (bkey_err(k) || !k.k) ++ return 0; ++ ++ BUG_ON(!bch2_snapshot_is_ancestor(trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)); ++ ++ bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, ++ BTREE_ITER_ALL_SNAPSHOTS); ++ prev = bch2_btree_iter_prev(©); ++ if (!prev.k) ++ goto out; ++ ++ ret = bkey_err(prev); ++ if (ret) ++ goto out; ++ ++ if (!bkey_cmp(prev.k->p, k.k->p) && ++ bch2_snapshot_is_ancestor(trans->c, iter->snapshot, ++ prev.k->p.snapshot) > 0) { ++ char buf1[100], buf2[200]; ++ ++ bch2_bkey_to_text(&PBUF(buf1), k.k); ++ bch2_bkey_to_text(&PBUF(buf2), prev.k); ++ ++ panic("iter snap %u\n" ++ "k %s\n" ++ "prev %s\n", ++ iter->snapshot, ++ buf1, buf2); ++ } ++out: ++ bch2_trans_iter_exit(trans, ©); ++ return ret; ++} ++ + #else + + static inline void bch2_btree_path_verify_level(struct btree_trans *trans, +@@ -697,6 +747,7 @@ static inline void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) {} + static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} + static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} ++static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } + + #endif + +@@ -1986,11 +2037,25 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + } + + if (likely(k.k)) { +- if (likely(!bkey_deleted(k.k))) +- break; ++ /* ++ * We can never have a key in a leaf node at POS_MAX, so ++ * we don't have to check these successor() calls: ++ */ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ !bch2_snapshot_is_ancestor(trans->c, ++ iter->snapshot, ++ 
k.k->p.snapshot)) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } + +- /* Advance to next key: */ +- search_key = bkey_successor(iter, k.k->p); ++ if (bkey_whiteout(k.k) && ++ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { ++ search_key = bkey_successor(iter, k.k->p); ++ continue; ++ } ++ ++ break; + } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ + search_key = bpos_successor(iter->path->l[0].b->key.k.p); +@@ -2011,6 +2076,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++ iter->pos.snapshot = iter->snapshot; ++ + cmp = bpos_cmp(k.k->p, iter->path->pos); + if (cmp) { + iter->path = bch2_btree_path_make_mut(trans, iter->path, +@@ -2023,6 +2091,10 @@ out: + + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); ++ ret = bch2_btree_iter_verify_ret(iter, k); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ + return k; + } + +@@ -2046,7 +2118,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + { + struct btree_trans *trans = iter->trans; + struct bpos search_key = iter->pos; ++ struct btree_path *saved_path = NULL; + struct bkey_s_c k; ++ struct bkey saved_k; ++ const struct bch_val *saved_v; + int ret; + + EBUG_ON(iter->path->cached || iter->path->level); +@@ -2054,6 +2129,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++ search_key.snapshot = U32_MAX; ++ + while (1) { + iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT); +@@ -2070,14 +2148,57 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + &iter->path->l[0], &iter->k); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) +- ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 +- : bkey_cmp(k.k->p, iter->pos) > 0)) ++ ? 
bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 ++ : bpos_cmp(k.k->p, search_key) > 0)) + k = btree_path_level_prev(trans->c, iter->path, + &iter->path->l[0], &iter->k); + + btree_path_check_sort(trans, iter->path, 0); + + if (likely(k.k)) { ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { ++ if (k.k->p.snapshot == iter->snapshot) ++ goto got_key; ++ ++ /* ++ * If we have a saved candidate, and we're no ++ * longer at the same _key_ (not pos), return ++ * that candidate ++ */ ++ if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { ++ bch2_path_put(trans, iter->path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->path = saved_path; ++ saved_path = NULL; ++ iter->k = saved_k; ++ k.v = saved_v; ++ goto got_key; ++ } ++ ++ if (bch2_snapshot_is_ancestor(iter->trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)) { ++ if (saved_path) ++ bch2_path_put(trans, saved_path, ++ iter->flags & BTREE_ITER_INTENT); ++ saved_path = btree_path_clone(trans, iter->path, ++ iter->flags & BTREE_ITER_INTENT); ++ saved_k = *k.k; ++ saved_v = k.v; ++ } ++ ++ search_key = bpos_predecessor(k.k->p); ++ continue; ++ } ++got_key: ++ if (bkey_whiteout(k.k) && ++ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { ++ search_key = bkey_predecessor(iter, k.k->p); ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++ search_key.snapshot = U32_MAX; ++ continue; ++ } ++ + break; + } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { + /* Advance to previous leaf node: */ +@@ -2095,7 +2216,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + /* Extents can straddle iter->pos: */ + if (bkey_cmp(k.k->p, iter->pos) < 0) + iter->pos = k.k->p; ++ ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++ iter->pos.snapshot = iter->snapshot; + out: ++ if (saved_path) ++ bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + iter->path->should_be_locked = true; + + bch2_btree_iter_verify_entry_exit(iter); +@@ -2144,7 +2270,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + if (unlikely(ret)) + return bkey_s_c_err(ret); + +- if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { ++ if ((iter->flags & BTREE_ITER_CACHED) || ++ !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + struct bkey_i *next_update; + + next_update = iter->flags & BTREE_ITER_WITH_UPDATES +@@ -2187,15 +2314,18 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + if (bkey_cmp(iter->pos, next) < 0) { + bkey_init(&iter->k); + iter->k.p = iter->pos; +- bch2_key_resize(&iter->k, +- min_t(u64, KEY_SIZE_MAX, +- (next.inode == iter->pos.inode +- ? next.offset +- : KEY_OFFSET_MAX) - +- iter->pos.offset)); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) { ++ bch2_key_resize(&iter->k, ++ min_t(u64, KEY_SIZE_MAX, ++ (next.inode == iter->pos.inode ++ ? 
next.offset ++ : KEY_OFFSET_MAX) - ++ iter->pos.offset)); ++ EBUG_ON(!iter->k.size); ++ } + + k = (struct bkey_s_c) { &iter->k, NULL }; +- EBUG_ON(!k.k->size); + } + } + +@@ -2203,6 +2333,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); ++ ret = bch2_btree_iter_verify_ret(iter, k); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); + + return k; + } +@@ -2356,6 +2489,13 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + if (!btree_type_has_snapshots(btree_id) && + !(flags & __BTREE_ITER_ALL_SNAPSHOTS)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; ++#if 0 ++ /* let's have this be explicitly set: */ ++ if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && ++ btree_type_has_snapshots(btree_id) && ++ !(flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ flags |= BTREE_ITER_FILTER_SNAPSHOTS; ++#endif + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) + pos.snapshot = btree_type_has_snapshots(btree_id) +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index be1bb489f3d6..19ca73f5ea22 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -234,6 +234,15 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it + iter->pos = bkey_start_pos(&iter->k); + } + ++static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) ++{ ++ struct bpos pos = iter->pos; ++ ++ iter->snapshot = snapshot; ++ pos.snapshot = snapshot; ++ bch2_btree_iter_set_pos(iter, pos); ++} ++ + /* + * Unlocks before scheduling + * Note: does not revalidate iterator +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index c019200a6125..4f1bc1d165aa 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -371,7 +371,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + + bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, + BTREE_ITER_SLOTS| +- BTREE_ITER_INTENT); ++ BTREE_ITER_INTENT| ++ BTREE_ITER_ALL_SNAPSHOTS); + bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 262ee2d53322..7fcd2ceb51e9 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -209,6 +209,7 @@ struct btree_node_iter { + #define BTREE_ITER_WITH_UPDATES (1 << 10) + #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) + #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) ++#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) + + enum btree_path_uptodate { + BTREE_ITER_UPTODATE = 0, +-- +cgit v1.2.3 + + +From 915b34d825c7766657db0a1dad22ac957d60da41 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 16 Mar 2021 00:28:17 -0400 +Subject: bcachefs: Plumb through subvolume id + +To implement snapshots, we need every filesystem btree operation (every +btree operation without a subvolume) to start by looking up the +subvolume and getting the current snapshot ID, with +bch2_subvolume_get_snapshot() - then, that snapshot ID is used for doing +btree lookups in BTREE_ITER_FILTER_SNAPSHOTS mode. + +This patch adds those bch2_subvolume_get_snapshot() calls, and also +switches to passing around a subvol_inum instead of just an inode +number. 
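+
+The resulting shape at a typical call site, condensed from the hunks below
+(e.g. bch2_readdir() and bch2_seek_data()), is roughly:
+
+  retry:
+          bch2_trans_begin(&trans);
+
+          /* subvolume -> current snapshot ID, rechecked on every restart */
+          ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+          if (ret)
+                  goto err;
+
+          /* all further lookups in this transaction use that snapshot */
+          for_each_btree_key(&trans, iter, BTREE_ID_extents,
+                             SPOS(inum.inum, offset, snapshot), 0, k, ret) {
+                  if (k.k->p.inode != inum.inum)
+                          break;
+                  /* ... */
+          }
+          bch2_trans_iter_exit(&trans, &iter);
+  err:
+          if (ret == -EINTR)
+                  goto retry;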
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 25 +++++----- + fs/bcachefs/acl.h | 11 ++--- + fs/bcachefs/dirent.c | 107 +++++++++++++++++++++++++++------------- + fs/bcachefs/dirent.h | 29 +++++------ + fs/bcachefs/extents.c | 32 ------------ + fs/bcachefs/extents.h | 1 - + fs/bcachefs/fs-common.c | 127 ++++++++++++++++++++++++++++++------------------ + fs/bcachefs/fs-common.h | 21 ++++---- + fs/bcachefs/fs-io.c | 117 +++++++++++++++++++++++++++++++++++++++----- + fs/bcachefs/fs-ioctl.c | 8 ++- + fs/bcachefs/fs.c | 77 +++++++++++++++++------------ + fs/bcachefs/fs.h | 4 ++ + fs/bcachefs/fsck.c | 5 +- + fs/bcachefs/inode.c | 109 +++++++++++++++++++++++++++++++++-------- + fs/bcachefs/inode.h | 7 +-- + fs/bcachefs/io.c | 5 +- + fs/bcachefs/move.c | 3 +- + fs/bcachefs/recovery.c | 5 +- + fs/bcachefs/reflink.c | 18 ++++++- + fs/bcachefs/reflink.h | 4 +- + fs/bcachefs/str_hash.h | 41 +++++++++++----- + fs/bcachefs/xattr.c | 23 +++++++-- + fs/bcachefs/xattr.h | 3 +- + 23 files changed, 526 insertions(+), 256 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index 828915145ade..d0c6878b003f 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -232,7 +232,7 @@ retry: + bch2_trans_begin(&trans); + + ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, +- &hash, inode->v.i_ino, ++ &hash, inode_inum(inode), + &X_SEARCH(acl_to_xattr_type(type), "", 0), + 0); + if (ret) { +@@ -262,11 +262,11 @@ out: + return acl; + } + +-int bch2_set_acl_trans(struct btree_trans *trans, ++int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, +- const struct bch_hash_info *hash_info, + struct posix_acl *acl, int type) + { ++ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); + int ret; + + if (type == ACL_TYPE_DEFAULT && +@@ -279,14 +279,14 @@ int bch2_set_acl_trans(struct btree_trans *trans, + if (IS_ERR(xattr)) + return PTR_ERR(xattr); + +- ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, +- inode_u->bi_inum, &xattr->k_i, 0); ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, ++ inum, &xattr->k_i, 0); + } else { + struct xattr_search_key search = + X_SEARCH(acl_to_xattr_type(type), "", 0); + +- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, +- inode_u->bi_inum, &search); ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, ++ inum, &search); + } + + return ret == -ENOENT ? 
0 : ret; +@@ -300,7 +300,6 @@ int bch2_set_acl(struct user_namespace *mnt_userns, + struct btree_trans trans; + struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; +- struct bch_hash_info hash_info; + struct posix_acl *acl; + umode_t mode; + int ret; +@@ -311,7 +310,7 @@ retry: + bch2_trans_begin(&trans); + acl = _acl; + +- ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, ++ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT); + if (ret) + goto btree_err; +@@ -324,9 +323,7 @@ retry: + goto btree_err; + } + +- hash_info = bch2_hash_info_init(c, &inode_u); +- +- ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type); ++ ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); + if (ret) + goto btree_err; + +@@ -355,7 +352,7 @@ err: + return ret; + } + +-int bch2_acl_chmod(struct btree_trans *trans, ++int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode, + umode_t mode, + struct posix_acl **new_acl) +@@ -369,7 +366,7 @@ int bch2_acl_chmod(struct btree_trans *trans, + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, +- &hash_info, inode->bi_inum, ++ &hash_info, inum, + &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), + BTREE_ITER_INTENT); + if (ret) +diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h +index f7c758369faf..2d76a4897ba8 100644 +--- a/fs/bcachefs/acl.h ++++ b/fs/bcachefs/acl.h +@@ -28,25 +28,24 @@ typedef struct { + + struct posix_acl *bch2_get_acl(struct inode *, int, bool); + +-int bch2_set_acl_trans(struct btree_trans *, ++int bch2_set_acl_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, +- const struct bch_hash_info *, + struct posix_acl *, int); + int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); +-int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, ++int bch2_acl_chmod(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *, + umode_t, struct posix_acl **); + + #else + +-static inline int bch2_set_acl_trans(struct btree_trans *trans, ++static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, +- const struct bch_hash_info *hash_info, + struct posix_acl *acl, int type) + { + return 0; + } + +-static inline int bch2_acl_chmod(struct btree_trans *trans, ++static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode, + umode_t mode, + struct posix_acl **new_acl) +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index f3aef0686928..f290580594ce 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -8,6 +8,7 @@ + #include "fs.h" + #include "keylist.h" + #include "str_hash.h" ++#include "subvolume.h" + + #include + +@@ -150,8 +151,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + return dirent; + } + +-int bch2_dirent_create(struct btree_trans *trans, +- u64 dir_inum, const struct bch_hash_info *hash_info, ++int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, ++ const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + u64 *dir_offset, int flags) + { +@@ -164,7 +165,7 @@ int bch2_dirent_create(struct btree_trans *trans, + return ret; + + ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, +- dir_inum, &dirent->k_i, flags); ++ dir, &dirent->k_i, flags); + *dir_offset = dirent->k.p.offset; + 
+ return ret; +@@ -223,31 +224,40 @@ err: + return ret; + } + +-int bch2_dirent_read_target(struct btree_trans *trans, +- struct bkey_s_c_dirent d, u64 *target) ++static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, ++ struct bkey_s_c_dirent d, subvol_inum *target) + { +- u32 subvol, snapshot; ++ u32 snapshot; ++ int ret = 0; + +- return __bch2_dirent_read_target(trans, d, &subvol, +- &snapshot, target, false); ++ ret = __bch2_dirent_read_target(trans, d, &target->subvol, &snapshot, ++ &target->inum, false); ++ if (!target->subvol) ++ target->subvol = dir.subvol; ++ ++ return ret; + } + + int bch2_dirent_rename(struct btree_trans *trans, +- u64 src_dir, struct bch_hash_info *src_hash, +- u64 dst_dir, struct bch_hash_info *dst_hash, +- const struct qstr *src_name, u64 *src_inum, u64 *src_offset, +- const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, +- enum bch_rename_mode mode) ++ subvol_inum src_dir, struct bch_hash_info *src_hash, ++ subvol_inum dst_dir, struct bch_hash_info *dst_hash, ++ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, ++ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, ++ enum bch_rename_mode mode) + { + struct btree_iter src_iter = { NULL }; + struct btree_iter dst_iter = { NULL }; + struct bkey_s_c old_src, old_dst; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = +- POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); ++ POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); + int ret = 0; + +- *src_inum = *dst_inum = 0; ++ if (src_dir.subvol != dst_dir.subvol) ++ return -EXDEV; ++ ++ memset(src_inum, 0, sizeof(*src_inum)); ++ memset(dst_inum, 0, sizeof(*dst_inum)); + + /* + * Lookup dst: +@@ -270,8 +280,12 @@ int bch2_dirent_rename(struct btree_trans *trans, + if (ret) + goto out; + +- if (mode != BCH_RENAME) +- *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); ++ if (mode != BCH_RENAME) { ++ ret = bch2_dirent_read_target(trans, dst_dir, ++ bkey_s_c_to_dirent(old_dst), dst_inum); ++ if (ret) ++ goto out; ++ } + if (mode != BCH_RENAME_EXCHANGE) + *src_offset = dst_iter.pos.offset; + +@@ -287,7 +301,10 @@ int bch2_dirent_rename(struct btree_trans *trans, + if (ret) + goto out; + +- *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); ++ ret = bch2_dirent_read_target(trans, src_dir, ++ bkey_s_c_to_dirent(old_src), src_inum); ++ if (ret) ++ goto out; + + /* Create new dst key: */ + new_dst = dirent_create_key(trans, 0, dst_name, 0); +@@ -376,17 +393,22 @@ int bch2_dirent_delete_at(struct btree_trans *trans, + + int __bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, +- u64 dir_inum, ++ subvol_inum dir, + const struct bch_hash_info *hash_info, +- const struct qstr *name, u64 *inum, ++ const struct qstr *name, subvol_inum *inum, + unsigned flags) + { + struct bkey_s_c k; + struct bkey_s_c_dirent d; ++ u32 snapshot; + int ret; + ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); ++ if (ret) ++ return ret; ++ + ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, +- hash_info, dir_inum, name, flags); ++ hash_info, dir, name, flags); + if (ret) + return ret; + +@@ -399,44 +421,49 @@ int __bch2_dirent_lookup_trans(struct btree_trans *trans, + + d = bkey_s_c_to_dirent(k); + +- ret = bch2_dirent_read_target(trans, d, inum); ++ ret = bch2_dirent_read_target(trans, dir, d, inum); + if (ret) + bch2_trans_iter_exit(trans, iter); + + return ret; + } + +-u64 bch2_dirent_lookup(struct bch_fs *c, u64 
dir_inum, ++u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, + const struct bch_hash_info *hash_info, +- const struct qstr *name) ++ const struct qstr *name, subvol_inum *inum) + { + struct btree_trans trans; + struct btree_iter iter; +- u64 inum = 0; +- int ret = 0; ++ int ret; + + bch2_trans_init(&trans, c, 0, 0); + retry: + bch2_trans_begin(&trans); +- ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum, hash_info, +- name, &inum, 0); ++ ++ ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, ++ name, inum, 0); + + bch2_trans_iter_exit(&trans, &iter); + if (ret == -EINTR) + goto retry; + bch2_trans_exit(&trans); +- return inum; ++ return ret; + } + +-int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) ++int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) + { + struct btree_iter iter; + struct bkey_s_c k; ++ u32 snapshot; + int ret; + ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); ++ if (ret) ++ return ret; ++ + for_each_btree_key(trans, iter, BTREE_ID_dirents, +- POS(dir_inum, 0), 0, k, ret) { +- if (k.k->p.inode > dir_inum) ++ SPOS(dir.inum, 0, snapshot), 0, k, ret) { ++ if (k.k->p.inode > dir.inum) + break; + + if (k.k->type == KEY_TYPE_dirent) { +@@ -449,19 +476,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) + return ret; + } + +-int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) ++int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) + { + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; ++ u32 snapshot; + int ret; + + bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_dirents, +- POS(inum, ctx->pos), 0, k, ret) { +- if (k.k->p.inode > inum) ++ SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { ++ if (k.k->p.inode > inum.inum) + break; + + if (k.k->type != KEY_TYPE_dirent) +@@ -482,6 +516,9 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) + ctx->pos = dirent.k->p.offset + 1; + } + bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (ret == -EINTR) ++ goto retry; + + ret = bch2_trans_exit(&trans) ?: ret; + +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index 3cd05a2454e1..88b784a99cb5 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -29,7 +29,7 @@ static inline unsigned dirent_val_u64s(unsigned len) + sizeof(u64)); + } + +-int bch2_dirent_create(struct btree_trans *, u64, ++int bch2_dirent_create(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, u8, + const struct qstr *, u64, u64 *, int); + +@@ -40,9 +40,6 @@ int bch2_dirent_delete_at(struct btree_trans *, + int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, + u32 *, u32 *, u64 *, bool); + +-int bch2_dirent_read_target(struct btree_trans *, +- struct bkey_s_c_dirent, u64 *); +- + static inline unsigned vfs_d_type(unsigned type) + { + return type == DT_SUBVOL ? 
DT_DIR : type; +@@ -55,20 +52,20 @@ enum bch_rename_mode { + }; + + int bch2_dirent_rename(struct btree_trans *, +- u64, struct bch_hash_info *, +- u64, struct bch_hash_info *, +- const struct qstr *, u64 *, u64 *, +- const struct qstr *, u64 *, u64 *, ++ subvol_inum, struct bch_hash_info *, ++ subvol_inum, struct bch_hash_info *, ++ const struct qstr *, subvol_inum *, u64 *, ++ const struct qstr *, subvol_inum *, u64 *, + enum bch_rename_mode); + +-int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64, +- const struct bch_hash_info *, +- const struct qstr *, u64 *, +- unsigned); +-u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, +- const struct qstr *); ++int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, ++ subvol_inum, const struct bch_hash_info *, ++ const struct qstr *, subvol_inum *, unsigned); ++u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, ++ const struct bch_hash_info *, ++ const struct qstr *, subvol_inum *); + +-int bch2_empty_dir_trans(struct btree_trans *, u64); +-int bch2_readdir(struct bch_fs *, u64, struct dir_context *); ++int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); ++int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); + + #endif /* _BCACHEFS_DIRENT_H */ +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index f66640c2a5ed..6c2eed77a326 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -612,38 +612,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) + return false; + } + +-bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, +- unsigned nr_replicas, bool compressed) +-{ +- struct btree_trans trans; +- struct btree_iter iter; +- struct bpos end = pos; +- struct bkey_s_c k; +- bool ret = true; +- int err; +- +- end.offset += size; +- +- bch2_trans_init(&trans, c, 0, 0); +- +- for_each_btree_key(&trans, iter, BTREE_ID_extents, pos, +- BTREE_ITER_SLOTS, k, err) { +- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) +- break; +- +- if (nr_replicas > bch2_bkey_replicas(c, k) || +- (!compressed && bch2_bkey_sectors_compressed(k))) { +- ret = false; +- break; +- } +- } +- bch2_trans_iter_exit(&trans, &iter); +- +- bch2_trans_exit(&trans); +- +- return ret; +-} +- + unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 43cef0a3bdf3..afd3067bb64e 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -567,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); + unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); + bool bch2_bkey_is_incompressible(struct bkey_s_c); + unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); +-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool); + + unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); + unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 96b09b005d0b..02bf32cc7659 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -6,28 +6,38 @@ + #include "dirent.h" + #include "fs-common.h" + #include "inode.h" ++#include "subvolume.h" + #include "xattr.h" + + #include + +-int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, ++int bch2_create_trans(struct btree_trans *trans, ++ subvol_inum dir, + struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *new_inode, + 
const struct qstr *name, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct posix_acl *default_acl, +- struct posix_acl *acl) ++ struct posix_acl *acl, ++ unsigned flags) + { + struct bch_fs *c = trans->c; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; +- struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); ++ subvol_inum new_inum = dir; + u64 now = bch2_current_time(c); + u64 cpu = raw_smp_processor_id(); + u64 dir_offset = 0; ++ u64 dir_target; ++ u32 snapshot; ++ unsigned dir_type; + int ret; + +- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + if (ret) + goto err; + +@@ -36,19 +46,23 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + if (!name) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + +- ret = bch2_inode_create(trans, &inode_iter, new_inode, U32_MAX, cpu); ++ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); + if (ret) + goto err; + ++ new_inum.inum = new_inode->bi_inum; ++ dir_target = new_inode->bi_inum; ++ dir_type = mode_to_type(new_inode->bi_mode); ++ + if (default_acl) { +- ret = bch2_set_acl_trans(trans, new_inode, &hash, ++ ret = bch2_set_acl_trans(trans, new_inum, new_inode, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + goto err; + } + + if (acl) { +- ret = bch2_set_acl_trans(trans, new_inode, &hash, ++ ret = bch2_set_acl_trans(trans, new_inum, new_inode, + acl, ACL_TYPE_ACCESS); + if (ret) + goto err; +@@ -56,18 +70,19 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + + if (name) { + struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); +- dir_u->bi_mtime = dir_u->bi_ctime = now; + + if (S_ISDIR(new_inode->bi_mode)) + dir_u->bi_nlink++; ++ dir_u->bi_mtime = dir_u->bi_ctime = now; + + ret = bch2_inode_write(trans, &dir_iter, dir_u); + if (ret) + goto err; + +- ret = bch2_dirent_create(trans, dir_inum, &dir_hash, +- mode_to_type(new_inode->bi_mode), +- name, new_inode->bi_inum, ++ ret = bch2_dirent_create(trans, dir, &dir_hash, ++ dir_type, ++ name, ++ dir_target, + &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) +@@ -79,9 +94,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + new_inode->bi_dir_offset = dir_offset; + } + +- /* XXX use bch2_btree_iter_set_snapshot() */ +- inode_iter.snapshot = U32_MAX; +- bch2_btree_iter_set_pos(&inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); ++ inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; ++ bch2_btree_iter_set_snapshot(&inode_iter, snapshot); + + ret = bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, new_inode); +@@ -91,9 +105,10 @@ err: + return ret; + } + +-int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, +- u64 inum, struct bch_inode_unpacked *dir_u, +- struct bch_inode_unpacked *inode_u, const struct qstr *name) ++int bch2_link_trans(struct btree_trans *trans, ++ subvol_inum dir, struct bch_inode_unpacked *dir_u, ++ subvol_inum inum, struct bch_inode_unpacked *inode_u, ++ const struct qstr *name) + { + struct bch_fs *c = trans->c; + struct btree_iter dir_iter = { NULL }; +@@ -103,6 +118,9 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + u64 dir_offset = 0; + int ret; + ++ if (dir.subvol != inum.subvol) ++ return -EXDEV; ++ + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + goto err; +@@ 
-110,7 +128,7 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + inode_u->bi_ctime = now; + bch2_inode_nlink_inc(inode_u); + +- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + if (ret) + goto err; + +@@ -118,15 +136,15 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + + dir_hash = bch2_hash_info_init(c, dir_u); + +- ret = bch2_dirent_create(trans, dir_inum, &dir_hash, ++ ret = bch2_dirent_create(trans, dir, &dir_hash, + mode_to_type(inode_u->bi_mode), +- name, inum, &dir_offset, ++ name, inum.inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { +- inode_u->bi_dir = dir_inum; ++ inode_u->bi_dir = dir.inum; + inode_u->bi_dir_offset = dir_offset; + } + +@@ -139,7 +157,8 @@ err: + } + + int bch2_unlink_trans(struct btree_trans *trans, +- u64 dir_inum, struct bch_inode_unpacked *dir_u, ++ subvol_inum dir, ++ struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, + const struct qstr *name) + { +@@ -148,39 +167,49 @@ int bch2_unlink_trans(struct btree_trans *trans, + struct btree_iter dirent_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; + struct bch_hash_info dir_hash; +- u64 inum, now = bch2_current_time(c); +- struct bkey_s_c k; ++ subvol_inum inum; ++ u64 now = bch2_current_time(c); + int ret; + +- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + if (ret) + goto err; + + dir_hash = bch2_hash_info_init(c, dir_u); + +- ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash, ++ ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); + if (ret) + goto err; + +- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); ++ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, ++ BTREE_ITER_INTENT); + if (ret) + goto err; + +- if (inode_u->bi_dir == k.k->p.inode && +- inode_u->bi_dir_offset == k.k->p.offset) { ++ if (inode_u->bi_dir == dirent_iter.pos.inode && ++ inode_u->bi_dir_offset == dirent_iter.pos.offset) { + inode_u->bi_dir = 0; + inode_u->bi_dir_offset = 0; + } + ++ if (S_ISDIR(inode_u->bi_mode)) { ++ ret = bch2_empty_dir_trans(trans, inum); ++ if (ret) ++ goto err; ++ } ++ ++ if (dir.subvol != inum.subvol) { ++ ret = bch2_subvolume_delete(trans, inum.subvol, false); ++ if (ret) ++ goto err; ++ } ++ + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); + bch2_inode_nlink_dec(inode_u); + +- ret = (S_ISDIR(inode_u->bi_mode) +- ? 
bch2_empty_dir_trans(trans, inum) +- : 0) ?: +- bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: ++ ret = bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: + bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); + err: +@@ -215,8 +244,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, + } + + int bch2_rename_trans(struct btree_trans *trans, +- u64 src_dir, struct bch_inode_unpacked *src_dir_u, +- u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, ++ subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, ++ subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, + struct bch_inode_unpacked *src_inode_u, + struct bch_inode_unpacked *dst_inode_u, + const struct qstr *src_name, +@@ -229,7 +258,8 @@ int bch2_rename_trans(struct btree_trans *trans, + struct btree_iter src_inode_iter = { NULL }; + struct btree_iter dst_inode_iter = { NULL }; + struct bch_hash_info src_hash, dst_hash; +- u64 src_inode, src_offset, dst_inode, dst_offset; ++ subvol_inum src_inum, dst_inum; ++ u64 src_offset, dst_offset; + u64 now = bch2_current_time(c); + int ret; + +@@ -240,7 +270,8 @@ int bch2_rename_trans(struct btree_trans *trans, + + src_hash = bch2_hash_info_init(c, src_dir_u); + +- if (dst_dir != src_dir) { ++ if (dst_dir.inum != src_dir.inum || ++ dst_dir.subvol != src_dir.subvol) { + ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, + BTREE_ITER_INTENT); + if (ret) +@@ -255,19 +286,19 @@ int bch2_rename_trans(struct btree_trans *trans, + ret = bch2_dirent_rename(trans, + src_dir, &src_hash, + dst_dir, &dst_hash, +- src_name, &src_inode, &src_offset, +- dst_name, &dst_inode, &dst_offset, ++ src_name, &src_inum, &src_offset, ++ dst_name, &dst_inum, &dst_offset, + mode); + if (ret) + goto err; + +- ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inode, ++ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, + BTREE_ITER_INTENT); + if (ret) + goto err; + +- if (dst_inode) { +- ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inode, ++ if (dst_inum.inum) { ++ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, + BTREE_ITER_INTENT); + if (ret) + goto err; +@@ -298,7 +329,7 @@ int bch2_rename_trans(struct btree_trans *trans, + } + + if (S_ISDIR(dst_inode_u->bi_mode) && +- bch2_empty_dir_trans(trans, dst_inode)) { ++ bch2_empty_dir_trans(trans, dst_inum)) { + ret = -ENOTEMPTY; + goto err; + } +@@ -322,7 +353,7 @@ int bch2_rename_trans(struct btree_trans *trans, + dst_dir_u->bi_nlink++; + } + +- if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { ++ if (dst_inum.inum && S_ISDIR(dst_inode_u->bi_mode)) { + dst_dir_u->bi_nlink--; + src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; + } +@@ -333,22 +364,22 @@ int bch2_rename_trans(struct btree_trans *trans, + src_dir_u->bi_mtime = now; + src_dir_u->bi_ctime = now; + +- if (src_dir != dst_dir) { ++ if (src_dir.inum != dst_dir.inum) { + dst_dir_u->bi_mtime = now; + dst_dir_u->bi_ctime = now; + } + + src_inode_u->bi_ctime = now; + +- if (dst_inode) ++ if (dst_inum.inum) + dst_inode_u->bi_ctime = now; + + ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: +- (src_dir != dst_dir ++ (src_dir.inum != dst_dir.inum + ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) + : 0 ) ?: + bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: +- (dst_inode ++ (dst_inum.inum + ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) + : 0 ); + err: +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +index 2273b7961c9b..1bb2ac4dc13a 100644 +--- a/fs/bcachefs/fs-common.h ++++ b/fs/bcachefs/fs-common.h +@@ -4,27 +4,30 @@ + + struct posix_acl; + +-int bch2_create_trans(struct btree_trans *, u64, ++#define BCH_CREATE_TMPFILE (1U << 0) ++ ++int bch2_create_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + uid_t, gid_t, umode_t, dev_t, + struct posix_acl *, +- struct posix_acl *); ++ struct posix_acl *, ++ unsigned); + +-int bch2_link_trans(struct btree_trans *, u64, +- u64, struct bch_inode_unpacked *, +- struct bch_inode_unpacked *, ++int bch2_link_trans(struct btree_trans *, ++ subvol_inum, struct bch_inode_unpacked *, ++ subvol_inum, struct bch_inode_unpacked *, + const struct qstr *); + +-int bch2_unlink_trans(struct btree_trans *, +- u64, struct bch_inode_unpacked *, ++int bch2_unlink_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *); + + int bch2_rename_trans(struct btree_trans *, +- u64, struct bch_inode_unpacked *, +- u64, struct bch_inode_unpacked *, ++ subvol_inum, struct bch_inode_unpacked *, ++ subvol_inum, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index e474d1fa5b8e..27516a162aec 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1821,6 +1821,49 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) + + /* O_DIRECT writes */ + ++static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, ++ u64 offset, u64 size, ++ unsigned nr_replicas, bool compressed) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 end = offset + size; ++ u32 snapshot; ++ bool ret = true; ++ int err; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (err) ++ goto err; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, ++ SPOS(inum.inum, offset, snapshot), ++ BTREE_ITER_SLOTS, k, err) { ++ if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) ++ break; ++ ++ if (nr_replicas > bch2_bkey_replicas(c, k) || ++ (!compressed && bch2_bkey_sectors_compressed(k))) { ++ ret = false; ++ break; ++ } ++ } ++ ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (err == -EINTR) ++ goto retry; ++ bch2_trans_exit(&trans); ++ ++ return err ? 
false : ret; ++} ++ + static void bch2_dio_write_loop_async(struct bch_write_op *); + + static long bch2_dio_write_loop(struct dio_write *dio) +@@ -1909,8 +1952,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && +- !bch2_check_range_allocated(c, dio->op.pos, +- bio_sectors(bio), ++ !bch2_check_range_allocated(c, inode_inum(inode), ++ dio->op.pos.offset, bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0)) + goto err; +@@ -2154,9 +2197,9 @@ out: + + /* truncate: */ + +-static inline int range_has_data(struct bch_fs *c, +- struct bpos start, +- struct bpos end) ++static inline int range_has_data(struct bch_fs *c, u32 subvol, ++ struct bpos start, ++ struct bpos end) + { + struct btree_trans trans; + struct btree_iter iter; +@@ -2164,6 +2207,12 @@ static inline int range_has_data(struct bch_fs *c, + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); ++ if (ret) ++ goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) +@@ -2174,7 +2223,11 @@ static inline int range_has_data(struct bch_fs *c, + break; + } + } ++ start = iter.pos; + bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (ret == -EINTR) ++ goto retry; + + return bch2_trans_exit(&trans) ?: ret; + } +@@ -2206,7 +2259,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, + * XXX: we're doing two index lookups when we end up reading the + * page + */ +- ret = range_has_data(c, ++ ret = range_has_data(c, inode->ei_subvol, + POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), + POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); + if (ret <= 0) +@@ -2340,7 +2393,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + +- ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u); ++ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); + if (ret) + goto err; + +@@ -2564,6 +2617,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); + struct bpos atomic_end; + unsigned trigger_flags = 0; ++ u32 snapshot; ++ ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, ++ inode->ei_subvol, &snapshot); ++ if (ret) ++ continue; ++ ++ bch2_btree_iter_set_snapshot(&src, snapshot); ++ bch2_btree_iter_set_snapshot(&dst, snapshot); ++ bch2_btree_iter_set_snapshot(&del, snapshot); + + bch2_trans_begin(&trans); + +@@ -2684,9 +2749,17 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + struct bkey_i_reservation reservation; + struct bkey_s_c k; + unsigned sectors; ++ u32 snapshot; + + bch2_trans_begin(&trans); + ++ ret = bch2_subvolume_get_snapshot(&trans, ++ inode->ei_subvol, &snapshot); ++ if (ret) ++ goto bkey_err; ++ ++ bch2_btree_iter_set_snapshot(&iter, snapshot); ++ + k = bch2_btree_iter_peek_slot(&iter); + if ((ret = bkey_err(k))) + goto bkey_err; +@@ -2935,8 +3008,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + mark_range_unallocated(src, pos_src, pos_src + aligned_len); + + ret = bch2_remap_range(c, +- POS(dst->v.i_ino, pos_dst >> 9), +- POS(src->v.i_ino, pos_src >> 9), ++ inode_inum(dst), pos_dst >> 9, ++ inode_inum(src), pos_src >> 9, + aligned_len 
>> 9, + &dst->ei_journal_seq, + pos_dst + len, &i_sectors_delta); +@@ -3027,7 +3100,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; ++ subvol_inum inum = inode_inum(inode); + u64 isize, next_data = MAX_LFS_FILESIZE; ++ u32 snapshot; + int ret; + + isize = i_size_read(&inode->v); +@@ -3035,9 +3110,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) + return -ENXIO; + + bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_extents, +- POS(inode->v.i_ino, offset >> 9), 0, k, ret) { ++ SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { + break; + } else if (bkey_extent_is_data(k.k)) { +@@ -3047,6 +3128,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) + break; + } + bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (ret == -EINTR) ++ goto retry; + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) +@@ -3123,7 +3207,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; ++ subvol_inum inum = inode_inum(inode); + u64 isize, next_hole = MAX_LFS_FILESIZE; ++ u32 snapshot; + int ret; + + isize = i_size_read(&inode->v); +@@ -3131,9 +3217,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) + return -ENXIO; + + bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_extents, +- POS(inode->v.i_ino, offset >> 9), ++ SPOS(inode->v.i_ino, offset >> 9, snapshot), + BTREE_ITER_SLOTS, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_seek_pagecache_hole(&inode->v, +@@ -3151,6 +3243,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) + } + } + bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (ret == -EINTR) ++ goto retry; + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +index 494879b5bed3..d7bcb2219b8d 100644 +--- a/fs/bcachefs/fs-ioctl.c ++++ b/fs/bcachefs/fs-ioctl.c +@@ -192,7 +192,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + char *kname = NULL; + struct qstr qstr; + int ret = 0; +- subvol_inum inum = { .subvol = 1 }; ++ subvol_inum inum; + + kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + if (!kname) +@@ -205,10 +205,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + qstr.len = ret; + qstr.name = kname; + +- ret = -ENOENT; +- inum.inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, +- &qstr); +- if (!inum.inum) ++ ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); ++ if (ret) + goto err1; + + vinode = bch2_vfs_inode_get(c, inum); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index df57680cfb45..69e888a88fb3 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -150,7 +150,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, + retry: + bch2_trans_begin(&trans); + +- ret = bch2_inode_peek(&trans, &iter, &inode_u, inode->v.i_ino, ++ ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT) ?: + (set ? 
set(inode, &inode_u, p) : 0) ?: + bch2_inode_write(&trans, &iter, &inode_u) ?: +@@ -256,7 +256,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + if (!(inode->v.i_state & I_NEW)) + return &inode->v; + +- ret = bch2_inode_find_by_inum(c, inum.inum, &inode_u); ++ ret = bch2_inode_find_by_inum(c, inum, &inode_u); + if (ret) { + iget_failed(&inode->v); + return ERR_PTR(ret); +@@ -271,10 +271,10 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + return &inode->v; + } + +-static struct bch_inode_info * ++struct bch_inode_info * + __bch2_create(struct user_namespace *mnt_userns, + struct bch_inode_info *dir, struct dentry *dentry, +- umode_t mode, dev_t rdev, bool tmpfile) ++ umode_t mode, dev_t rdev, unsigned flags) + { + struct bch_fs *c = dir->v.i_sb->s_fs_info; + struct btree_trans trans; +@@ -303,20 +303,23 @@ __bch2_create(struct user_namespace *mnt_userns, + + bch2_inode_init_early(c, &inode_u); + +- if (!tmpfile) ++ if (!(flags & BCH_CREATE_TMPFILE)) + mutex_lock(&dir->ei_update_lock); + + bch2_trans_init(&trans, c, 8, +- 2048 + (!tmpfile ? dentry->d_name.len : 0)); ++ 2048 + (!(flags & BCH_CREATE_TMPFILE) ++ ? dentry->d_name.len : 0)); + retry: + bch2_trans_begin(&trans); + +- ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, +- !tmpfile ? &dentry->d_name : NULL, ++ ret = bch2_create_trans(&trans, ++ inode_inum(dir), &dir_u, &inode_u, ++ !(flags & BCH_CREATE_TMPFILE) ++ ? &dentry->d_name : NULL, + from_kuid(mnt_userns, current_fsuid()), + from_kgid(mnt_userns, current_fsgid()), + mode, rdev, +- default_acl, acl) ?: ++ default_acl, acl, flags) ?: + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) +@@ -332,7 +335,7 @@ err_before_quota: + goto err_trans; + } + +- if (!tmpfile) { ++ if (!(flags & BCH_CREATE_TMPFILE)) { + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + journal_seq_copy(c, dir, journal_seq); +@@ -387,7 +390,7 @@ err: + posix_acl_release(acl); + return inode; + err_trans: +- if (!tmpfile) ++ if (!(flags & BCH_CREATE_TMPFILE)) + mutex_unlock(&dir->ei_update_lock); + + bch2_trans_exit(&trans); +@@ -407,11 +410,12 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, + struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); + struct inode *vinode = NULL; + subvol_inum inum = { .subvol = 1 }; ++ int ret; + +- inum.inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, +- &dentry->d_name); ++ ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, ++ &dentry->d_name, &inum); + +- if (inum.inum) ++ if (!ret) + vinode = bch2_vfs_inode_get(c, inum); + + return d_splice_alias(vinode, dentry); +@@ -422,7 +426,7 @@ static int bch2_mknod(struct user_namespace *mnt_userns, + umode_t mode, dev_t rdev) + { + struct bch_inode_info *inode = +- __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, false); ++ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, 0); + + if (IS_ERR(inode)) + return PTR_ERR(inode); +@@ -452,8 +456,8 @@ static int __bch2_link(struct bch_fs *c, + + ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0, + bch2_link_trans(&trans, +- dir->v.i_ino, +- inode->v.i_ino, &dir_u, &inode_u, ++ inode_inum(dir), &dir_u, ++ inode_inum(inode), &inode_u, + &dentry->d_name)); + + if (likely(!ret)) { +@@ -504,7 +508,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) + ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, 
+- dir->v.i_ino, &dir_u, ++ inode_inum(dir), &dir_u, + &inode_u, &dentry->d_name)); + + if (likely(!ret)) { +@@ -531,7 +535,8 @@ static int bch2_symlink(struct user_namespace *mnt_userns, + struct bch_inode_info *dir = to_bch_ei(vdir), *inode; + int ret; + +- inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); ++ inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, ++ BCH_CREATE_TMPFILE); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + +@@ -624,8 +629,8 @@ static int bch2_rename2(struct user_namespace *mnt_userns, + + ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0, + bch2_rename_trans(&trans, +- src_dir->v.i_ino, &src_dir_u, +- dst_dir->v.i_ino, &dst_dir_u, ++ inode_inum(src_dir), &src_dir_u, ++ inode_inum(dst_dir), &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, +@@ -748,7 +753,7 @@ retry: + kfree(acl); + acl = NULL; + +- ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, ++ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT); + if (ret) + goto btree_err; +@@ -756,7 +761,8 @@ retry: + bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); + + if (attr->ia_valid & ATTR_MODE) { +- ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); ++ ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, ++ inode_u.bi_mode, &acl); + if (ret) + goto btree_err; + } +@@ -847,7 +853,8 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode) + { + struct bch_inode_info *inode = +- __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, true); ++ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, ++ BCH_CREATE_TMPFILE); + + if (IS_ERR(inode)) + return PTR_ERR(inode); +@@ -922,6 +929,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); + unsigned offset_into_extent, sectors; + bool have_extent = false; ++ u32 snapshot; + int ret = 0; + + ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); +@@ -931,15 +939,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + if (start + len < start) + return -EINVAL; + ++ start >>= 9; ++ + bch2_bkey_buf_init(&cur); + bch2_bkey_buf_init(&prev); + bch2_trans_init(&trans, c, 0, 0); +- +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, +- POS(ei->v.i_ino, start >> 9), 0); + retry: + bch2_trans_begin(&trans); + ++ ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(ei->v.i_ino, start, snapshot), 0); ++ + while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(iter.pos, end) < 0) { +@@ -988,7 +1002,9 @@ retry: + bch2_btree_iter_set_pos(&iter, + POS(iter.pos.inode, iter.pos.offset + sectors)); + } +- ++ start = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: + if (ret == -EINTR) + goto retry; + +@@ -996,7 +1012,6 @@ retry: + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), + FIEMAP_EXTENT_LAST); + +- bch2_trans_iter_exit(&trans, &iter); + ret = bch2_trans_exit(&trans) ?: ret; + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); +@@ -1033,7 +1048,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) + if (!dir_emit_dots(file, ctx)) + return 0; + +- return bch2_readdir(c, inode->v.i_ino, ctx); ++ return bch2_readdir(c, inode_inum(inode), ctx); + 
} + + static const struct file_operations bch_file_operations = { +@@ -1289,7 +1304,7 @@ static void bch2_evict_inode(struct inode *vinode) + KEY_TYPE_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +- bch2_inode_rm(c, inode->v.i_ino, true); ++ bch2_inode_rm(c, inode_inum(inode), true); + } + } + +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index ac6617594916..b7655fbf7c31 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -145,6 +145,10 @@ struct bch_inode_unpacked; + + #ifndef NO_BCACHEFS_FS + ++struct bch_inode_info * ++__bch2_create(struct user_namespace *, struct bch_inode_info *, ++ struct dentry *, umode_t, dev_t, unsigned); ++ + int bch2_fs_quota_transfer(struct bch_fs *, + struct bch_inode_info *, + struct bch_qid, +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index e4ca05aae76c..40b107715cdd 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -858,7 +858,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); + +- ret = bch2_dirent_read_target(trans, d, &d_inum); ++ ret = __bch2_dirent_read_target(&trans, d, ++ &target_subvol, ++ &target_snapshot, ++ &target_inum); + if (ret && ret != -ENOENT) + return ret; + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index ca04a9715ec1..9130d571e84d 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -6,6 +6,7 @@ + #include "btree_update.h" + #include "error.h" + #include "extents.h" ++#include "extent_update.h" + #include "inode.h" + #include "str_hash.h" + #include "subvolume.h" +@@ -296,15 +297,21 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, + int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, +- u64 inum, unsigned flags) ++ subvol_inum inum, unsigned flags) + { + struct bkey_s_c k; ++ u32 snapshot; + int ret; + + if (0 && trans->c->opts.inodes_use_key_cache) + flags |= BTREE_ITER_CACHED; + +- bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags); ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, ++ SPOS(0, inum.inum, snapshot), flags); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) +@@ -486,6 +493,9 @@ static inline u32 bkey_generation(struct bkey_s_c k) + } + } + ++/* ++ * This just finds an empty slot: ++ */ + int bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode_u, +@@ -585,16 +595,74 @@ found_slot: + return 0; + } + +-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) ++static int bch2_inode_delete_keys(struct btree_trans *trans, ++ subvol_inum inum, enum btree_id id) ++{ ++ u64 offset = 0; ++ int ret = 0; ++ ++ while (!ret || ret == -EINTR) { ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i delete; ++ u32 snapshot; ++ ++ bch2_trans_begin(trans); ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ continue; ++ ++ bch2_trans_iter_init(trans, &iter, id, ++ SPOS(inum.inum, offset, snapshot), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek(&iter); ++ ++ if (!k.k || iter.pos.inode != inum.inum) { ++ bch2_trans_iter_exit(trans, &iter); ++ break; ++ } ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bkey_init(&delete.k); ++ delete.k.p = iter.pos; ++ ++ if (btree_node_type_is_extents(iter.btree_id)) { ++ unsigned 
max_sectors = ++ min_t(u64, U64_MAX - iter.pos.offset, ++ KEY_SIZE_MAX & (~0 << trans->c->block_bits)); ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ ++ ret = bch2_extent_trim_atomic(trans, &iter, &delete); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_trans_update(trans, &iter, &delete, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(trans, &iter); ++ } ++ ++ return ret; ++} ++ ++int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached) + { + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; +- struct bpos start = POS(inode_nr, 0); +- struct bpos end = POS(inode_nr + 1, 0); + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + unsigned iter_flags = BTREE_ITER_INTENT; ++ u32 snapshot; + int ret; + + if (0 && cached && c->opts.inodes_use_key_cache) +@@ -610,19 +678,20 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) + * XXX: the dirent could ideally would delete whiteouts when they're no + * longer needed + */ +- ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, +- start, end, NULL) ?: +- bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs, +- start, end, NULL) ?: +- bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents, +- start, end, NULL); ++ ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: ++ bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: ++ bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); + if (ret) + goto err; + retry: + bch2_trans_begin(&trans); + ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, +- POS(0, inode_nr), iter_flags); ++ SPOS(0, inum.inum, snapshot), iter_flags); + k = bch2_btree_iter_peek_slot(&iter); + + ret = bkey_err(k); +@@ -632,7 +701,7 @@ retry: + if (k.k->type != KEY_TYPE_inode) { + bch2_fs_inconsistent(trans.c, + "inode %llu not found when deleting", +- inode_nr); ++ inum.inum); + ret = -EIO; + goto err; + } +@@ -662,20 +731,22 @@ err: + return ret; + } + +-static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, ++static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, ++ subvol_inum inum, + struct bch_inode_unpacked *inode) + { +- struct btree_iter iter = { NULL }; ++ struct btree_iter iter; + int ret; + +- ret = bch2_inode_peek(trans, &iter, inode, inode_nr, 0); +- bch2_trans_iter_exit(trans, &iter); ++ ret = bch2_inode_peek(trans, &iter, inode, inum, 0); ++ if (!ret) ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +-int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, ++int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, + struct bch_inode_unpacked *inode) + { + return bch2_trans_do(c, NULL, NULL, 0, +- bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); ++ bch2_inode_find_by_inum_trans(&trans, inum, inode)); + } +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 25bef104ebcc..9e84cddcc6cb 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -58,7 +58,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); + void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); + + int bch2_inode_peek(struct btree_trans *, struct btree_iter *, +- struct bch_inode_unpacked *, u64, unsigned); ++ struct bch_inode_unpacked *, subvol_inum, unsigned); + int bch2_inode_write(struct 
btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *); + +@@ -74,9 +74,10 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + int bch2_inode_create(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, u32, u64); + +-int bch2_inode_rm(struct bch_fs *, u64, bool); ++int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); + +-int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); ++int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, ++ struct bch_inode_unpacked *); + + static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) + { +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index f459dcb69ecd..177b01b941aa 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -315,7 +315,10 @@ int bch2_extent_update(struct btree_trans *trans, + struct bch_inode_unpacked inode_u; + + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, +- k->k.p.inode, BTREE_ITER_INTENT); ++ (subvol_inum) { ++ .subvol = BCACHEFS_ROOT_SUBVOL, ++ .inum = k->k.p.inode, ++ }, BTREE_ITER_INTENT); + if (ret) + return ret; + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 7001e3cda8c5..32d94c6c8b15 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -581,7 +581,8 @@ static int __bch2_move_data(struct bch_fs *c, + stats->pos = start; + + bch2_trans_iter_init(&trans, &iter, btree_id, start, +- BTREE_ITER_PREFETCH); ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); + + if (rate) + bch2_ratelimit_reset(rate); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 2aab57cf09e1..47c8fecc6839 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1480,11 +1480,12 @@ int bch2_fs_initialize(struct bch_fs *c) + + err = "error creating lost+found"; + ret = bch2_trans_do(c, NULL, NULL, 0, +- bch2_create_trans(&trans, BCACHEFS_ROOT_INO, ++ bch2_create_trans(&trans, ++ BCACHEFS_ROOT_SUBVOL_INUM, + &root_inode, &lostfound_inode, + &lostfound, + 0, 0, S_IFDIR|0700, 0, +- NULL, NULL)); ++ NULL, NULL, 0)); + if (ret) { + bch_err(c, "error creating lost+found"); + goto err; +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 576cfbccf5b5..be4b47bc7438 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -7,6 +7,7 @@ + #include "inode.h" + #include "io.h" + #include "reflink.h" ++#include "subvolume.h" + + #include + +@@ -197,7 +198,8 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) + } + + s64 bch2_remap_range(struct bch_fs *c, +- struct bpos dst_start, struct bpos src_start, ++ subvol_inum dst_inum, u64 dst_offset, ++ subvol_inum src_inum, u64 src_offset, + u64 remap_sectors, u64 *journal_seq, + u64 new_i_size, s64 *i_sectors_delta) + { +@@ -205,6 +207,8 @@ s64 bch2_remap_range(struct bch_fs *c, + struct btree_iter dst_iter, src_iter; + struct bkey_s_c src_k; + struct bkey_buf new_dst, new_src; ++ struct bpos dst_start = POS(dst_inum.inum, dst_offset); ++ struct bpos src_start = POS(src_inum.inum, src_offset); + struct bpos dst_end = dst_start, src_end = src_start; + struct bpos src_want; + u64 dst_done; +@@ -238,6 +242,16 @@ s64 bch2_remap_range(struct bch_fs *c, + break; + } + ++ ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, ++ &src_iter.snapshot); ++ if (ret) ++ continue; ++ ++ ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, ++ &dst_iter.snapshot); ++ if (ret) ++ continue; ++ + dst_done = dst_iter.pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + 
dst_done); + bch2_btree_iter_set_pos(&src_iter, src_want); +@@ -311,7 +325,7 @@ s64 bch2_remap_range(struct bch_fs *c, + bch2_trans_begin(&trans); + + ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, +- dst_start.inode, BTREE_ITER_INTENT); ++ dst_inum, BTREE_ITER_INTENT); + + if (!ret2 && + inode_u.bi_size < new_i_size) { +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index 68c5cb5a2780..4c1b82860b0b 100644 +--- a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -57,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k) + } + } + +-s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, +- u64, u64 *, u64, s64 *); ++s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, ++ subvol_inum, u64, u64, u64 *, u64, s64 *); + + #endif /* _BCACHEFS_REFLINK_H */ +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index c6a132b3c5bb..6418089531ad 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -8,6 +8,7 @@ + #include "error.h" + #include "inode.h" + #include "siphash.h" ++#include "subvolume.h" + #include "super.h" + + #include +@@ -144,16 +145,21 @@ bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, +- u64 inode, const void *key, ++ subvol_inum inum, const void *key, + unsigned flags) + { + struct bkey_s_c k; ++ u32 snapshot; + int ret; + ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ + for_each_btree_key(trans, *iter, desc.btree_id, +- POS(inode, desc.hash_key(info, key)), ++ SPOS(inum.inum, desc.hash_key(info, key), snapshot), + BTREE_ITER_SLOTS|flags, k, ret) { +- if (iter->pos.inode != inode) ++ if (iter->pos.inode != inum.inum) + break; + + if (k.k->type == desc.key_type) { +@@ -176,15 +182,20 @@ bch2_hash_hole(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, +- u64 inode, const void *key) ++ subvol_inum inum, const void *key) + { + struct bkey_s_c k; ++ u32 snapshot; + int ret; + ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ + for_each_btree_key(trans, *iter, desc.btree_id, +- POS(inode, desc.hash_key(info, key)), ++ SPOS(inum.inum, desc.hash_key(info, key), snapshot), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { +- if (iter->pos.inode != inode) ++ if (iter->pos.inode != inum.inum) + break; + + if (k.k->type != desc.key_type) +@@ -229,17 +240,25 @@ static __always_inline + int bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, +- u64 inode, struct bkey_i *insert, int flags) ++ subvol_inum inum, ++ struct bkey_i *insert, int flags) + { + struct btree_iter iter, slot = { NULL }; + struct bkey_s_c k; + bool found = false; ++ u32 snapshot; + int ret; + ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ + for_each_btree_key(trans, iter, desc.btree_id, +- POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), ++ SPOS(inum.inum, ++ desc.hash_bkey(info, bkey_i_to_s_c(insert)), ++ snapshot), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { +- if (iter.pos.inode != inode) ++ if (iter.pos.inode != inum.inum) + break; + + if (k.k->type == desc.key_type) { +@@ -313,12 +332,12 @@ static __always_inline + int bch2_hash_delete(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, +- u64 inode, const void *key) ++ 
subvol_inum inum, const void *key) + { + struct btree_iter iter; + int ret; + +- ret = bch2_hash_lookup(trans, &iter, desc, info, inode, key, ++ ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, + BTREE_ITER_INTENT); + if (ret) + return ret; +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index ef6ae97e0df5..a182e242a0e8 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -128,7 +128,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, +- inode->v.i_ino, ++ inode_inum(inode), + &X_SEARCH(type, name, strlen(name)), + 0); + if (ret) +@@ -160,7 +160,7 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, + bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); + } + +-int bch2_xattr_set(struct btree_trans *trans, u64 inum, ++int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, + const struct bch_hash_info *hash_info, + const char *name, const void *value, size_t size, + int type, int flags) +@@ -282,13 +282,21 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) + struct btree_iter iter; + struct bkey_s_c k; + struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; +- u64 inum = dentry->d_inode->i_ino; ++ u64 offset = 0, inum = inode->ei_inode.bi_inum; ++ u32 snapshot; + int ret; + + bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ iter = (struct btree_iter) { NULL }; ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); ++ if (ret) ++ goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, +- POS(inum, 0), 0, k, ret) { ++ SPOS(inum, offset, snapshot), 0, k, ret) { + BUG_ON(k.k->p.inode < inum); + + if (k.k->p.inode > inum) +@@ -301,7 +309,12 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) + if (ret) + break; + } ++ ++ offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (ret == -EINTR) ++ goto retry; + + ret = bch2_trans_exit(&trans) ?: ret; + +@@ -340,7 +353,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + + return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, +- bch2_xattr_set(&trans, inode->v.i_ino, &hash, ++ bch2_xattr_set(&trans, inode_inum(inode), &hash, + name, value, size, + handler->flags, flags)); + } +diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h +index 4151065ab853..f4f896545e1c 100644 +--- a/fs/bcachefs/xattr.h ++++ b/fs/bcachefs/xattr.h +@@ -39,7 +39,8 @@ struct bch_inode_info; + int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, + const char *, void *, size_t, int); + +-int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, ++int bch2_xattr_set(struct btree_trans *, subvol_inum, ++ const struct bch_hash_info *, + const char *, const void *, size_t, int, int); + + ssize_t bch2_xattr_list(struct dentry *, char *, size_t); +-- +cgit v1.2.3 + + +From 336248da8f0a85207d9300716055f2e78a9844d1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 20 Apr 2021 00:15:44 -0400 +Subject: bcachefs: Update fsck for snapshots + +This updates the fsck algorithms to handle snapshots - meaning there +will be multiple versions of the same key (extents, inodes, dirents, +xattrs) in different snapshots, and we have to carefully consider which +keys are visible in which snapshot. 
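
(Editorial aside, not part of the patch: the visibility rule described above can be sketched in a few lines of standalone C. Snapshots form a tree; a key written in snapshot A is a visibility candidate for a lookup done in snapshot B when A is an ancestor of B or A == B, and, roughly, the nearest such ancestor version is the one that counts. Everything below — toy_parent, toy_is_ancestor, toy_key_visible — is invented for the illustration; the patch itself relies on bch2_snapshot_is_ancestor() and the snapshots_seen tracking added to fsck.c, which also handles the case where an intermediate snapshot has overwritten the key, something this sketch ignores.)

/*
 * Toy model of snapshot key visibility (illustration only, not bcachefs code).
 *
 * Each snapshot has a parent; 0 means "no parent".  A key stored at snapshot
 * key_snap is visible to a lookup performed in snapshot view_snap iff
 * key_snap lies on the path from view_snap up to the root.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_NR_SNAPSHOTS 8

static const unsigned toy_parent[TOY_NR_SNAPSHOTS] = {
	/* 1 is the root; 2 and 3 branch off 1; 4 and 5 branch off 2 */
	[2] = 1, [3] = 1, [4] = 2, [5] = 2,
};

/* Walk id -> parent -> ... and report whether we pass through @ancestor. */
static bool toy_is_ancestor(unsigned id, unsigned ancestor)
{
	while (id && id != ancestor)
		id = toy_parent[id];

	return id != 0 && id == ancestor;
}

/* Is a key written in @key_snap visible to a lookup done in @view_snap? */
static bool toy_key_visible(unsigned key_snap, unsigned view_snap)
{
	return toy_is_ancestor(view_snap, key_snap);
}

int main(void)
{
	printf("key@1 visible in 4? %d\n", toy_key_visible(1, 4)); /* 1: 4 -> 2 -> 1 */
	printf("key@3 visible in 4? %d\n", toy_key_visible(3, 4)); /* 0: sibling branch */
	printf("key@4 visible in 2? %d\n", toy_key_visible(4, 2)); /* 0: wrong direction */
	return 0;
}

The ancestor walk above is the core of what key_visible_in_snapshot() and ref_visible() in this patch build on; the extra machinery (the snapshots_seen list kept in key order) exists so fsck can also tell when a newer version in an intermediate snapshot shadows the ancestor's key.
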
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update.h | 2 +- + fs/bcachefs/btree_update_leaf.c | 5 +- + fs/bcachefs/fsck.c | 1400 +++++++++++++++++++++++++++++---------- + 3 files changed, 1044 insertions(+), 363 deletions(-) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 23b73d3a172c..4d0ece342cf6 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -61,7 +61,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, + struct disk_reservation *, u64 *, int flags); + + int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, +- struct bpos, struct bpos, u64 *); ++ struct bpos, struct bpos, unsigned, u64 *); + int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + struct bpos, struct bpos, u64 *); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index a8575e847f0a..37626fedfb3b 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1181,13 +1181,14 @@ int bch2_btree_delete_at(struct btree_trans *trans, + + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, ++ unsigned iter_flags, + u64 *journal_seq) + { + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); ++ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); + retry: + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(&iter)).k) && +@@ -1254,5 +1255,5 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + u64 *journal_seq) + { + return bch2_trans_do(c, NULL, journal_seq, 0, +- bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); ++ bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq)); + } +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 40b107715cdd..b4a6b3d2ed07 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -18,7 +18,8 @@ + + #define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) ++static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, ++ u32 snapshot) + { + struct btree_iter iter; + struct bkey_s_c k; +@@ -26,7 +27,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_extents, +- POS(inum, 0), 0, k, ret) { ++ SPOS(inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode != inum) + break; + +@@ -39,6 +40,33 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) + return ret ?: sectors; + } + ++static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, ++ u32 snapshot) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ u64 subdirs = 0; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_dirents, ++ SPOS(inum, 0, snapshot), 0, k, ret) { ++ if (k.k->p.inode != inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ d = bkey_s_c_to_dirent(k); ++ if (d.v->d_type == DT_DIR) ++ subdirs++; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret ?: subdirs; ++} ++ + static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + u32 *subvol) + { +@@ -72,8 +100,8 @@ static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol)); + } + +-static int 
__subvol_lookup_root(struct btree_trans *trans, u32 subvol, +- u64 *inum) ++static int __subvol_lookup(struct btree_trans *trans, u32 subvol, ++ u32 *snapshot, u64 *inum) + { + struct btree_iter iter; + struct bkey_s_c k; +@@ -92,6 +120,7 @@ static int __subvol_lookup_root(struct btree_trans *trans, u32 subvol, + goto err; + } + ++ *snapshot = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); + *inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); + err: + bch2_trans_iter_exit(trans, &iter); +@@ -99,9 +128,10 @@ err: + + } + +-static int subvol_lookup_root(struct btree_trans *trans, u32 subvol, u64 *inum) ++static int subvol_lookup(struct btree_trans *trans, u32 subvol, ++ u32 *snapshot, u64 *inum) + { +- return lockrestart_do(trans, __subvol_lookup_root(trans, subvol, inum)); ++ return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); + } + + static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, +@@ -113,14 +143,13 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, +- POS(0, inode_nr), 0); ++ SPOS(0, inode_nr, *snapshot), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + +- if (snapshot) +- *snapshot = iter.pos.snapshot; ++ *snapshot = iter.pos.snapshot; + ret = k.k->type == KEY_TYPE_inode + ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + : -ENOENT; +@@ -136,6 +165,36 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr, + return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); + } + ++static int __lookup_dirent(struct btree_trans *trans, ++ struct bch_hash_info hash_info, ++ subvol_inum dir, struct qstr *name, ++ u64 *target, unsigned *type) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c_dirent d; ++ int ret; ++ ++ ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, ++ &hash_info, dir, name, 0); ++ if (ret) ++ return ret; ++ ++ d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); ++ *target = le64_to_cpu(d.v->d_inum); ++ *type = d.v->d_type; ++ bch2_trans_iter_exit(trans, &iter); ++ return 0; ++} ++ ++static int lookup_dirent(struct btree_trans *trans, ++ struct bch_hash_info hash_info, ++ subvol_inum dir, struct qstr *name, ++ u64 *target, unsigned *type) ++{ ++ return lockrestart_do(trans, ++ __lookup_dirent(trans, hash_info, dir, name, target, type)); ++} ++ + static int __write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +@@ -166,6 +225,71 @@ static int write_inode(struct btree_trans *trans, + return ret; + } + ++static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) ++{ ++ struct btree_iter iter = { NULL }; ++ struct bkey_i_inode_generation delete; ++ struct bch_inode_unpacked inode_u; ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL) ?: ++ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL) ?: ++ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL); ++ if (ret) ++ goto err; ++retry: ++ bch2_trans_begin(trans); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inum, snapshot), BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != 
KEY_TYPE_inode) { ++ bch2_fs_inconsistent(trans->c, ++ "inode %llu:%u not found when deleting", ++ inum, snapshot); ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); ++ ++ /* Subvolume root? */ ++ if (inode_u.bi_subvol) { ++ ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1); ++ if (ret) ++ goto err; ++ } ++ ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p = iter.pos; ++ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); ++ ++ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ if (ret == -EINTR) ++ goto retry; ++ ++ return ret; ++} ++ + static int __remove_dirent(struct btree_trans *trans, struct bpos pos) + { + struct bch_fs *c = trans->c; +@@ -200,32 +324,49 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos) + } + + /* Get lost+found, create if it doesn't exist: */ +-static int lookup_lostfound(struct btree_trans *trans, +- u32 subvol, ++static int lookup_lostfound(struct btree_trans *trans, u32 subvol, + struct bch_inode_unpacked *lostfound) + { + struct bch_fs *c = trans->c; + struct bch_inode_unpacked root; + struct bch_hash_info root_hash_info; + struct qstr lostfound_str = QSTR("lost+found"); +- u64 inum; ++ subvol_inum root_inum = { .subvol = subvol }; ++ u64 inum = 0; ++ unsigned d_type = 0; + u32 snapshot; + int ret; + +- ret = subvol_lookup_root(trans, subvol, &inum); ++ ret = subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); ++ if (ret) ++ return ret; + +- ret = lookup_inode(trans, inum, &root, &snapshot); +- if (ret && ret != -ENOENT) ++ ret = lookup_inode(trans, root_inum.inum, &root, &snapshot); ++ if (ret) { ++ bch_err(c, "error fetching subvol root: %i", ret); + return ret; ++ } + + root_hash_info = bch2_hash_info_init(c, &root); +- inum = bch2_dirent_lookup(c, root.bi_inum, &root_hash_info, +- &lostfound_str); +- if (!inum) { ++ ++ ret = lookup_dirent(trans, root_hash_info, root_inum, ++ &lostfound_str, &inum, &d_type); ++ if (ret == -ENOENT) { + bch_notice(c, "creating lost+found"); + goto create_lostfound; + } + ++ if (ret) { ++ bch_err(c, "error looking up lost+found: %i", ret); ++ return ret; ++ } ++ ++ if (d_type != DT_DIR) { ++ bch_err(c, "error looking up lost+found: not a directory"); ++ return ret; ++ ++ } ++ + ret = lookup_inode(trans, inum, lostfound, &snapshot); + if (ret && ret != -ENOENT) { + /* +@@ -243,11 +384,9 @@ create_lostfound: + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- bch2_create_trans(trans, +- BCACHEFS_ROOT_INO, &root, +- lostfound, +- &lostfound_str, +- 0, 0, S_IFDIR|0700, 0, NULL, NULL)); ++ bch2_create_trans(trans, root_inum, &root, ++ lostfound, &lostfound_str, ++ 0, 0, S_IFDIR|0700, 0, NULL, NULL, 0)); + if (ret) + bch_err(c, "error creating lost+found: %i", ret); + } +@@ -257,7 +396,7 @@ create_lostfound: + + static int reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, +- u32 snapshot) ++ u32 inode_snapshot) + { + struct bch_hash_info dir_hash; + struct bch_inode_unpacked lostfound; +@@ -267,7 +406,7 @@ static int reattach_inode(struct btree_trans *trans, + u32 subvol; + int ret; + +- ret = snapshot_lookup_subvol(trans, snapshot, &subvol); ++ ret = snapshot_lookup_subvol(trans, inode_snapshot, &subvol); + if (ret) + return ret; + +@@ -289,10 +428,15 @@ static int reattach_inode(struct btree_trans *trans, + name = (struct qstr) QSTR(name_buf); + + ret 
= __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, +- bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash, +- mode_to_type(inode->bi_mode), +- &name, inode->bi_inum, &dir_offset, +- BCH_HASH_SET_MUST_CREATE)); ++ bch2_dirent_create(trans, ++ (subvol_inum) { ++ .subvol = subvol, ++ .inum = lostfound.bi_inum, ++ }, ++ &dir_hash, ++ mode_to_type(inode->bi_mode), ++ &name, inode->bi_inum, &dir_offset, ++ BCH_HASH_SET_MUST_CREATE)); + if (ret) { + bch_err(trans->c, "error %i reattaching inode %llu", + ret, inode->bi_inum); +@@ -302,7 +446,7 @@ static int reattach_inode(struct btree_trans *trans, + inode->bi_dir = lostfound.bi_inum; + inode->bi_dir_offset = dir_offset; + +- return write_inode(trans, inode, U32_MAX); ++ return write_inode(trans, inode, inode_snapshot); + } + + static int remove_backpointer(struct btree_trans *trans, +@@ -329,45 +473,287 @@ out: + return ret; + } + ++struct snapshots_seen { ++ struct bpos pos; ++ size_t nr; ++ size_t size; ++ u32 *d; ++}; ++ ++static void snapshots_seen_exit(struct snapshots_seen *s) ++{ ++ kfree(s->d); ++ s->d = NULL; ++} ++ ++static void snapshots_seen_init(struct snapshots_seen *s) ++{ ++ memset(s, 0, sizeof(*s)); ++} ++ ++static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) ++{ ++ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; ++ ++ if (bkey_cmp(s->pos, pos)) ++ s->nr = 0; ++ s->pos = pos; ++ ++ if (s->nr == s->size) { ++ size_t new_size = max(s->size, 128UL) * 2; ++ u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); ++ ++ if (!d) { ++ bch_err(c, "error reallocating snapshots_seen table (new size %zu)", ++ new_size); ++ return -ENOMEM; ++ } ++ ++ s->size = new_size; ++ s->d = d; ++ } ++ ++ /* Might get called multiple times due to lock restarts */ ++ if (s->nr && s->d[s->nr - 1] == pos.snapshot) ++ return 0; ++ ++ s->d[s->nr++] = pos.snapshot; ++ return 0; ++} ++ ++/** ++ * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, ++ * and @ancestor hasn't been overwritten in @seen ++ * ++ * That is, returns whether key in @ancestor snapshot is visible in @id snapshot ++ */ ++static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, ++ u32 id, u32 ancestor) ++{ ++ ssize_t i; ++ ++ BUG_ON(id > ancestor); ++ ++ id = snapshot_t(c, id)->equiv; ++ ancestor = snapshot_t(c, ancestor)->equiv; ++ ++ /* @ancestor should be the snapshot most recently added to @seen */ ++ BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); ++ BUG_ON(seen->pos.snapshot != ancestor); ++ ++ if (id == ancestor) ++ return true; ++ ++ if (!bch2_snapshot_is_ancestor(c, id, ancestor)) ++ return false; ++ ++ for (i = seen->nr - 2; ++ i >= 0 && seen->d[i] >= id; ++ --i) ++ if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && ++ bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) ++ return false; ++ ++ return true; ++} ++ ++/** ++ * ref_visible - given a key with snapshot id @src that points to a key with ++ * snapshot id @dst, test whether there is some snapshot in which @dst is ++ * visible. ++ * ++ * This assumes we're visiting @src keys in natural key order. ++ * ++ * @s - list of snapshot IDs already seen at @src ++ * @src - snapshot ID of src key ++ * @dst - snapshot ID of dst key ++ */ ++static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, ++ u32 src, u32 dst) ++{ ++ return dst <= src ++ ? 
key_visible_in_snapshot(c, s, dst, src) ++ : bch2_snapshot_is_ancestor(c, src, dst); ++} ++ ++#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ ++ for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ ++ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) ++ + struct inode_walker { +- bool first_this_inode; +- bool have_inode; +- u64 cur_inum; +- u32 snapshot; +- struct bch_inode_unpacked inode; ++ bool first_this_inode; ++ u64 cur_inum; ++ ++ size_t nr; ++ size_t size; ++ struct inode_walker_entry { ++ struct bch_inode_unpacked inode; ++ u32 snapshot; ++ u64 count; ++ } *d; + }; + ++static void inode_walker_exit(struct inode_walker *w) ++{ ++ kfree(w->d); ++ w->d = NULL; ++} ++ + static struct inode_walker inode_walker_init(void) + { +- return (struct inode_walker) { +- .cur_inum = -1, +- .have_inode = false, ++ return (struct inode_walker) { 0, }; ++} ++ ++static int inode_walker_realloc(struct inode_walker *w) ++{ ++ if (w->nr == w->size) { ++ size_t new_size = max_t(size_t, 8UL, w->size * 2); ++ void *d = krealloc(w->d, new_size * sizeof(w->d[0]), ++ GFP_KERNEL); ++ if (!d) ++ return -ENOMEM; ++ ++ w->d = d; ++ w->size = new_size; ++ } ++ ++ return 0; ++} ++ ++static int add_inode(struct bch_fs *c, struct inode_walker *w, ++ struct bkey_s_c_inode inode) ++{ ++ struct bch_inode_unpacked u; ++ int ret; ++ ++ ret = inode_walker_realloc(w); ++ if (ret) ++ return ret; ++ ++ BUG_ON(bch2_inode_unpack(inode, &u)); ++ ++ w->d[w->nr++] = (struct inode_walker_entry) { ++ .inode = u, ++ .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, + }; ++ ++ return 0; + } + + static int __walk_inode(struct btree_trans *trans, +- struct inode_walker *w, u64 inum) ++ struct inode_walker *w, struct bpos pos) + { +- if (inum != w->cur_inum) { +- int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot); ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ unsigned i, ancestor_pos; ++ int ret; + +- if (ret && ret != -ENOENT) +- return ret; ++ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + +- w->have_inode = !ret; +- w->cur_inum = inum; +- w->first_this_inode = true; +- } else { ++ if (pos.inode == w->cur_inum) { + w->first_this_inode = false; ++ goto lookup_snapshot; + } + +- return 0; ++ w->nr = 0; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (k.k->p.offset != pos.inode) ++ break; ++ ++ if (k.k->type == KEY_TYPE_inode) ++ add_inode(c, w, bkey_s_c_to_inode(k)); ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (ret) ++ return ret; ++ ++ w->cur_inum = pos.inode; ++ w->first_this_inode = true; ++lookup_snapshot: ++ for (i = 0; i < w->nr; i++) ++ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) ++ goto found; ++ return INT_MAX; ++found: ++ BUG_ON(pos.snapshot > w->d[i].snapshot); ++ ++ if (pos.snapshot != w->d[i].snapshot) { ++ ancestor_pos = i; ++ ++ while (i && w->d[i - 1].snapshot > pos.snapshot) ++ --i; ++ ++ ret = inode_walker_realloc(w); ++ if (ret) ++ return ret; ++ ++ array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); ++ w->d[i].snapshot = pos.snapshot; ++ w->d[i].count = 0; ++ } ++ ++ return i; + } + + static int walk_inode(struct btree_trans *trans, +- struct inode_walker *w, u64 inum) ++ struct inode_walker *w, struct bpos pos) + { +- return lockrestart_do(trans, __walk_inode(trans, w, inum)); ++ return lockrestart_do(trans, __walk_inode(trans, w, pos)); ++} ++ ++static int __get_visible_inodes(struct btree_trans 
*trans, ++ struct inode_walker *w, ++ struct snapshots_seen *s, ++ u64 inum) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ w->nr = 0; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (k.k->p.offset != inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) { ++ add_inode(c, w, bkey_s_c_to_inode(k)); ++ if (k.k->p.snapshot >= s->pos.snapshot) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static int check_key_has_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ char buf[200]; ++ int ret = 0; ++ ++ if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, ++ "key in missing snapshot: %s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { ++ ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); ++ return ret ?: -EINTR; ++ } ++fsck_err: ++ return ret; + } + + static int hash_redo_key(struct btree_trans *trans, +@@ -375,6 +761,9 @@ static int hash_redo_key(struct btree_trans *trans, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c k) + { ++ bch_err(trans->c, "hash_redo_key() not implemented yet"); ++ return -EINVAL; ++#if 0 + struct bkey_i *delete; + struct bkey_i *tmp; + +@@ -393,6 +782,7 @@ static int hash_redo_key(struct btree_trans *trans, + return bch2_btree_iter_traverse(k_iter) ?: + bch2_trans_update(trans, k_iter, delete, 0) ?: + bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); ++#endif + } + + static int fsck_hash_delete_at(struct btree_trans *trans, +@@ -484,30 +874,29 @@ fsck_err: + + static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, +- struct bkey_s_c_inode inode) ++ struct bch_inode_unpacked *prev, ++ struct bch_inode_unpacked u) + { + struct bch_fs *c = trans->c; +- struct bch_inode_unpacked u; + bool do_update = false; + int ret = 0; + +- ret = bch2_inode_unpack(inode, &u); +- +- if (bch2_fs_inconsistent_on(ret, c, +- "error unpacking inode %llu in fsck", +- inode.k->p.inode)) +- return ret; ++ if (fsck_err_on(prev && ++ (prev->bi_hash_seed != u.bi_hash_seed || ++ mode_to_type(prev->bi_mode) != mode_to_type(u.bi_mode)), c, ++ "inodes in different snapshots don't match")) { ++ bch_err(c, "repair not implemented yet"); ++ return -EINVAL; ++ } + + if (u.bi_flags & BCH_INODE_UNLINKED && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu unlinked", + u.bi_inum))) { +- bch_verbose(c, "deleting inode %llu", u.bi_inum); +- + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); + +- ret = bch2_inode_rm(c, u.bi_inum, false); ++ ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); + if (ret) + bch_err(c, "error in fsck: error %i while deleting inode", ret); + return ret; +@@ -527,9 +916,10 @@ static int check_inode(struct btree_trans *trans, + * just switch units to bytes and that issue goes away + */ + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, +- POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), ++ SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, ++ iter->pos.snapshot), + POS(u.bi_inum, U64_MAX), +- NULL); ++ 0, NULL); + if (ret) { + bch_err(c, "error in fsck: error %i truncating inode", ret); + return ret; +@@ -554,7 +944,7 @@ static int check_inode(struct 
btree_trans *trans, + bch_verbose(c, "recounting sectors for inode %llu", + u.bi_inum); + +- sectors = bch2_count_inode_sectors(trans, u.bi_inum); ++ sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); + if (sectors < 0) { + bch_err(c, "error in fsck: error %i recounting inode sectors", + (int) sectors); +@@ -574,11 +964,7 @@ static int check_inode(struct btree_trans *trans, + } + + if (do_update) { +- ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_btree_iter_traverse(iter) ?: +- bch2_inode_write(trans, iter, &u)); ++ ret = write_inode(trans, &u, iter->pos.snapshot); + if (ret) + bch_err(c, "error in fsck: error %i " + "updating inode", ret); +@@ -594,26 +980,49 @@ static int check_inodes(struct bch_fs *c, bool full) + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_inode inode; ++ struct bch_inode_unpacked prev, u; + int ret; + ++ memset(&prev, 0, sizeof(prev)); ++ + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH, k, ret) { ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ ret = check_key_has_snapshot(&trans, &iter, k); ++ if (ret) ++ break; ++ ++ /* ++ * if snapshot id isn't a leaf node, skip it - deletion in ++ * particular is not atomic, so on the internal snapshot nodes ++ * we can see inodes marked for deletion after a clean shutdown ++ */ ++ if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) ++ continue; ++ + if (k.k->type != KEY_TYPE_inode) + continue; + + inode = bkey_s_c_to_inode(k); + +- if (full || +- (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| +- BCH_INODE_I_SECTORS_DIRTY| +- BCH_INODE_UNLINKED))) { +- ret = check_inode(&trans, &iter, inode); +- if (ret) +- break; +- } ++ if (!full && ++ !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| ++ BCH_INODE_I_SECTORS_DIRTY| ++ BCH_INODE_UNLINKED))) ++ continue; ++ ++ BUG_ON(bch2_inode_unpack(inode, &u)); ++ ++ ret = check_inode(&trans, &iter, ++ full && prev.bi_inum == u.bi_inum ++ ? 
&prev : NULL, u); ++ if (ret) ++ break; ++ ++ prev = u; + } + bch2_trans_iter_exit(&trans, &iter); + +@@ -622,6 +1031,29 @@ static int check_inodes(struct bch_fs *c, bool full) + return bch2_trans_exit(&trans) ?: ret; + } + ++noinline_for_stack ++static int check_subvols(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN, ++ 0, k, ret) { ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++/* ++ * Checking for overlapping extents needs to be reimplemented ++ */ ++#if 0 + static int fix_overlapping_extent(struct btree_trans *trans, + struct bkey_s_c k, struct bpos cut_at) + { +@@ -638,55 +1070,195 @@ static int fix_overlapping_extent(struct btree_trans *trans, + bch2_cut_front(cut_at, u); + + +- /* +- * We don't want to go through the extent_handle_overwrites path: +- * +- * XXX: this is going to screw up disk accounting, extent triggers +- * assume things about extent overwrites - we should be running the +- * triggers manually here +- */ +- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, +- BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); ++ /* ++ * We don't want to go through the extent_handle_overwrites path: ++ * ++ * XXX: this is going to screw up disk accounting, extent triggers ++ * assume things about extent overwrites - we should be running the ++ * triggers manually here ++ */ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, ++ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); ++ ++ BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++#endif ++ ++static int inode_backpointer_exists(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, ++ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ if (k.k->type != KEY_TYPE_dirent) ++ goto out; ++ ++ ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static bool inode_backpointer_matches(struct bkey_s_c_dirent d, ++ struct bch_inode_unpacked *inode) ++{ ++ return d.k->p.inode == inode->bi_dir && ++ d.k->p.offset == inode->bi_dir_offset; ++} ++ ++static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) ++{ ++ struct bch_fs *c = trans->c; ++ struct inode_walker_entry *i; ++ int ret = 0, ret2 = 0; ++ s64 count2; ++ ++ for (i = w->d; i < w->d + w->nr; i++) { ++ if (i->inode.bi_sectors == i->count) ++ continue; ++ ++ count2 = lockrestart_do(trans, ++ bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot)); ++ ++ if (i->count != count2) { ++ bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", ++ i->count, count2); ++ i->count = count2; ++ if (i->inode.bi_sectors == i->count) ++ continue; ++ } ++ ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, ++ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", ++ w->cur_inum, i->snapshot, ++ i->inode.bi_sectors, 
i->count) == FSCK_ERR_IGNORE) ++ continue; ++ ++ i->inode.bi_sectors = i->count; ++ ret = write_inode(trans, &i->inode, i->snapshot); ++ if (ret) ++ break; ++ ret2 = -EINTR; ++ } ++fsck_err: ++ return ret ?: ret2; ++} ++ ++static int check_extent(struct btree_trans *trans, struct btree_iter *iter, ++ struct inode_walker *inode, ++ struct snapshots_seen *s) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct inode_walker_entry *i; ++ char buf[200]; ++ int ret = 0; ++ ++ k = bch2_btree_iter_peek(iter); ++ if (!k.k) ++ return 0; ++ ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret) ++ return ret; ++ ++ ret = snapshots_seen_update(c, s, k.k->p); ++ if (ret) ++ return ret; ++ ++ if (k.k->type == KEY_TYPE_whiteout) ++ return 0; ++ ++ if (inode->cur_inum != k.k->p.inode) { ++ ret = check_i_sectors(trans, inode); ++ if (ret) ++ return ret; ++ } ++#if 0 ++ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ char buf1[200]; ++ char buf2[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, k); ++ ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) ++ return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; ++ } ++#endif ++ ret = __walk_inode(trans, inode, k.k->p); ++ if (ret < 0) ++ return ret; ++ ++ if (fsck_err_on(ret == INT_MAX, c, ++ "extent in missing inode:\n %s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) ++ return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); ++ ++ if (ret == INT_MAX) ++ return 0; + +- BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); +- ret = bch2_btree_iter_traverse(&iter) ?: +- bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: +- bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); +- bch2_trans_iter_exit(trans, &iter); +- return ret; +-} ++ i = inode->d + ret; ++ ret = 0; + +-static int inode_backpointer_exists(struct btree_trans *trans, +- struct bch_inode_unpacked *inode) +-{ +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret; ++ if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && ++ !S_ISLNK(i->inode.bi_mode), c, ++ "extent in non regular inode mode %o:\n %s", ++ i->inode.bi_mode, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) ++ return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); ++ ++ if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { ++ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ k.k->type != KEY_TYPE_reservation && ++ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, ++ "extent type %u offset %llu past end of inode %llu, i_size %llu", ++ k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { ++ bch2_fs_lazy_rw(c); ++ return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, ++ k.k->p.snapshot), ++ POS(k.k->p.inode, U64_MAX), ++ 0, NULL) ?: -EINTR; ++ } ++ } ++ } + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, +- POS(inode->bi_dir, inode->bi_dir_offset), 0); +- k = bch2_btree_iter_peek_slot(&iter); +- ret = bkey_err(k); +- if (ret) +- goto out; +- if (k.k->type != KEY_TYPE_dirent) +- goto out; ++ if (bkey_extent_is_allocation(k.k)) ++ for_each_visible_inode(c, s, inode, 
k.k->p.snapshot, i) ++ i->count += k.k->size; ++#if 0 ++ bch2_bkey_buf_reassemble(&prev, c, k); ++#endif + +- ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; +-out: +- bch2_trans_iter_exit(trans, &iter); ++fsck_err: + return ret; + } + +-static bool inode_backpointer_matches(struct bkey_s_c_dirent d, +- struct bch_inode_unpacked *inode) +-{ +- return d.k->p.inode == inode->bi_dir && +- d.k->p.offset == inode->bi_dir_offset; +-} +- + /* + * Walk extents: verify that extents have a corresponding S_ISREG inode, and + * that i_size an i_sectors are consistent +@@ -695,15 +1267,17 @@ noinline_for_stack + static int check_extents(struct bch_fs *c) + { + struct inode_walker w = inode_walker_init(); ++ struct snapshots_seen s; + struct btree_trans trans; + struct btree_iter iter; +- struct bkey_s_c k; +- struct bkey_buf prev; +- u64 i_sectors = 0; + int ret = 0; + ++#if 0 ++ struct bkey_buf prev; + bch2_bkey_buf_init(&prev); + prev.k->k = KEY(0, 0, 0); ++#endif ++ snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch_verbose(c, "checking extents"); +@@ -711,96 +1285,172 @@ static int check_extents(struct bch_fs *c) + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH); +-retry: +- while ((k = bch2_btree_iter_peek(&iter)).k && +- !(ret = bkey_err(k))) { +- if (w.have_inode && +- w.cur_inum != k.k->p.inode && +- !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && +- fsck_err_on(w.inode.bi_sectors != i_sectors, c, +- "inode %llu has incorrect i_sectors: got %llu, should be %llu", +- w.inode.bi_inum, +- w.inode.bi_sectors, i_sectors)) { +- w.inode.bi_sectors = i_sectors; +- +- ret = write_inode(&trans, &w.inode, w.snapshot); ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ ++ do { ++ ret = lockrestart_do(&trans, ++ check_extent(&trans, &iter, &w, &s)); ++ if (ret) ++ break; ++ } while (bch2_btree_iter_advance(&iter)); ++ bch2_trans_iter_exit(&trans, &iter); ++#if 0 ++ bch2_bkey_buf_exit(&prev, c); ++#endif ++ inode_walker_exit(&w); ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); ++ ++ return ret; ++} ++ ++static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) ++{ ++ struct bch_fs *c = trans->c; ++ struct inode_walker_entry *i; ++ int ret = 0, ret2 = 0; ++ s64 count2; ++ ++ for (i = w->d; i < w->d + w->nr; i++) { ++ if (i->inode.bi_nlink == i->count) ++ continue; ++ ++ count2 = lockrestart_do(trans, ++ bch2_count_subdirs(trans, w->cur_inum, i->snapshot)); ++ ++ if (i->count != count2) { ++ bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", ++ i->count, count2); ++ i->count = count2; ++ if (i->inode.bi_nlink == i->count) ++ continue; ++ } ++ ++ if (fsck_err_on(i->inode.bi_nlink != i->count, c, ++ "directory %llu:%u with wrong i_nlink: got %u, should be %llu", ++ w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) { ++ i->inode.bi_nlink = i->count; ++ ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; ++ ret2 = -EINTR; + } ++ } ++fsck_err: ++ return ret ?: ret2; ++} + +- if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { +- char buf1[200]; +- char buf2[200]; ++static int check_dirent_target(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c_dirent d, ++ struct bch_inode_unpacked *target, ++ u32 target_snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ bool backpointer_exists = true; ++ char buf[200]; ++ int ret = 0; ++ ++ if (!target->bi_dir && ++ !target->bi_dir_offset) { 
++ target->bi_dir = d.k->p.inode; ++ target->bi_dir_offset = d.k->p.offset; ++ ++ ret = write_inode(trans, target, target_snapshot); ++ if (ret) ++ goto err; ++ } ++ ++ if (!inode_backpointer_matches(d, target)) { ++ ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); ++ if (ret < 0) ++ goto err; + +- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); +- bch2_bkey_val_to_text(&PBUF(buf2), c, k); ++ backpointer_exists = ret; ++ ret = 0; + +- if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) +- return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR; ++ if (fsck_err_on(S_ISDIR(target->bi_mode) && ++ backpointer_exists, c, ++ "directory %llu with multiple links", ++ target->bi_inum)) { ++ ret = remove_dirent(trans, d.k->p); ++ if (ret) ++ goto err; ++ return 0; + } + +- ret = walk_inode(&trans, &w, k.k->p.inode); +- if (ret) +- break; ++ if (fsck_err_on(backpointer_exists && ++ !target->bi_nlink, c, ++ "inode %llu has multiple links but i_nlink 0", ++ target->bi_inum)) { ++ target->bi_nlink++; ++ target->bi_flags &= ~BCH_INODE_UNLINKED; + +- if (w.first_this_inode) +- i_sectors = 0; +- +- if (fsck_err_on(!w.have_inode, c, +- "extent type %u for missing inode %llu", +- k.k->type, k.k->p.inode) || +- fsck_err_on(w.have_inode && +- !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, +- "extent type %u for non regular file, inode %llu mode %o", +- k.k->type, k.k->p.inode, w.inode.bi_mode)) { +- bch2_fs_lazy_rw(c); +- return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, +- POS(k.k->p.inode, 0), +- POS(k.k->p.inode, U64_MAX), +- NULL) ?: -EINTR; ++ ret = write_inode(trans, target, target_snapshot); ++ if (ret) ++ goto err; + } + +- if (fsck_err_on(w.have_inode && +- !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && +- k.k->type != KEY_TYPE_reservation && +- k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, +- "extent type %u offset %llu past end of inode %llu, i_size %llu", +- k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { +- bch2_fs_lazy_rw(c); +- return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, +- POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9), +- POS(k.k->p.inode, U64_MAX), +- NULL) ?: -EINTR; ++ if (fsck_err_on(!backpointer_exists, c, ++ "inode %llu has wrong backpointer:\n" ++ "got %llu:%llu\n" ++ "should be %llu:%llu", ++ target->bi_inum, ++ target->bi_dir, ++ target->bi_dir_offset, ++ d.k->p.inode, ++ d.k->p.offset)) { ++ target->bi_dir = d.k->p.inode; ++ target->bi_dir_offset = d.k->p.offset; ++ ++ ret = write_inode(trans, target, target_snapshot); ++ if (ret) ++ goto err; + } ++ } + +- if (bkey_extent_is_allocation(k.k)) +- i_sectors += k.k->size; +- bch2_bkey_buf_reassemble(&prev, c, k); ++ if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target->bi_mode), c, ++ "incorrect d_type: should be %u:\n%s", ++ mode_to_type(target->bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { ++ struct bkey_i_dirent *n; + +- bch2_btree_iter_advance(&iter); ++ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_type = mode_to_type(target->bi_mode); ++ ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_trans_update(trans, iter, &n->k_i, 0)); ++ kfree(n); ++ if (ret) ++ goto err; + } ++err: + fsck_err: +- if (ret == -EINTR) +- goto retry; +- bch2_trans_iter_exit(&trans, &iter); +- bch2_bkey_buf_exit(&prev, c); +- return 
bch2_trans_exit(&trans) ?: ret; ++ return ret; + } + + static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bch_hash_info *hash_info, +- struct inode_walker *w, unsigned *nr_subdirs) ++ struct inode_walker *dir, ++ struct inode_walker *target, ++ struct snapshots_seen *s) + { + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bkey_s_c_dirent d; +- struct bch_inode_unpacked target; ++ struct inode_walker_entry *i; + u32 target_snapshot; + u32 target_subvol; +- bool have_target; +- bool backpointer_exists = true; +- u64 d_inum; ++ u64 target_inum; + char buf[200]; + int ret; + +@@ -812,38 +1462,49 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + if (ret) + return ret; + +- if (w->have_inode && +- w->cur_inum != k.k->p.inode && +- fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c, +- "directory %llu with wrong i_nlink: got %u, should be %u", +- w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) { +- w->inode.bi_nlink = *nr_subdirs; +- ret = write_inode(trans, &w->inode, w->snapshot); +- return ret ?: -EINTR; +- } ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret) ++ return ret; + +- ret = __walk_inode(trans, w, k.k->p.inode); ++ ret = snapshots_seen_update(c, s, k.k->p); + if (ret) + return ret; + +- if (w->first_this_inode) +- *nr_subdirs = 0; ++ if (k.k->type == KEY_TYPE_whiteout) ++ return 0; ++ ++ if (dir->cur_inum != k.k->p.inode) { ++ ret = check_subdir_count(trans, dir); ++ if (ret) ++ return ret; ++ } ++ ++ ret = __walk_inode(trans, dir, k.k->p); ++ if (ret < 0) ++ return ret; + +- if (fsck_err_on(!w->have_inode, c, ++ if (fsck_err_on(ret == INT_MAX, c, + "dirent in nonexisting directory:\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) || +- fsck_err_on(!S_ISDIR(w->inode.bi_mode), c, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) ++ return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); ++ ++ if (ret == INT_MAX) ++ return 0; ++ ++ i = dir->d + ret; ++ ret = 0; ++ ++ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, + "dirent in non directory inode type %u:\n%s", +- mode_to_type(w->inode.bi_mode), ++ mode_to_type(i->inode.bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return __bch2_trans_do(trans, NULL, NULL, 0, + bch2_btree_delete_at(trans, iter, 0)); + +- if (!w->have_inode) +- return 0; +- +- if (w->first_this_inode) +- *hash_info = bch2_hash_info_init(c, &w->inode); ++ if (dir->first_this_inode) ++ *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); + + ret = hash_check_key(trans, bch2_dirent_hash_desc, + hash_info, iter, k); +@@ -856,128 +1517,76 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + return 0; + + d = bkey_s_c_to_dirent(k); +- d_inum = le64_to_cpu(d.v->d_inum); + +- ret = __bch2_dirent_read_target(&trans, d, ++ ret = __bch2_dirent_read_target(trans, d, + &target_subvol, + &target_snapshot, +- &target_inum); ++ &target_inum, ++ true); + if (ret && ret != -ENOENT) + return ret; + +- ret = __lookup_inode(trans, d_inum, &target, &target_snapshot); +- if (ret && ret != -ENOENT) +- return ret; ++ if (fsck_err_on(ret, c, ++ "dirent points to missing subvolume %llu", ++ le64_to_cpu(d.v->d_inum))) ++ return remove_dirent(trans, d.k->p); + +- have_target = !ret; +- ret = 0; ++ if (target_subvol) { ++ struct bch_inode_unpacked subvol_root; + +- if (fsck_err_on(!have_target, c, +- "dirent points to missing inode:\n%s", +- 
(bch2_bkey_val_to_text(&PBUF(buf), c, +- k), buf))) +- return remove_dirent(trans, d.k->p); ++ ret = __lookup_inode(trans, target_inum, ++ &subvol_root, &target_snapshot); ++ if (ret && ret != -ENOENT) ++ return ret; + +- if (!have_target) +- return 0; ++ if (fsck_err_on(ret, c, ++ "subvolume %u points to missing subvolume root %llu", ++ target_subvol, ++ target_inum)) { ++ bch_err(c, "repair not implemented yet"); ++ return -EINVAL; ++ } + +- if (!target.bi_dir && +- !target.bi_dir_offset) { +- target.bi_dir = k.k->p.inode; +- target.bi_dir_offset = k.k->p.offset; ++ if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, ++ "subvol root %llu has wrong bi_subvol field: got %u, should be %u", ++ target_inum, ++ subvol_root.bi_subvol, target_subvol)) { ++ subvol_root.bi_subvol = target_subvol; ++ ret = write_inode(trans, &subvol_root, target_snapshot); ++ if (ret) ++ return ret; ++ } + +- ret = __write_inode(trans, &target, target_snapshot) ?: +- bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); ++ ret = check_dirent_target(trans, iter, d, &subvol_root, ++ target_snapshot); + if (ret) + return ret; +- return -EINTR; +- } +- +- if (!inode_backpointer_matches(d, &target)) { +- ret = inode_backpointer_exists(trans, &target); +- if (ret < 0) ++ } else { ++ ret = __get_visible_inodes(trans, target, s, target_inum); ++ if (ret) + return ret; + +- backpointer_exists = ret; +- ret = 0; +- +- if (fsck_err_on(S_ISDIR(target.bi_mode) && +- backpointer_exists, c, +- "directory %llu with multiple links", +- target.bi_inum)) +- return remove_dirent(trans, d.k->p); +- +- if (fsck_err_on(backpointer_exists && +- !target.bi_nlink, c, +- "inode %llu has multiple links but i_nlink 0", +- d_inum)) { +- target.bi_nlink++; +- target.bi_flags &= ~BCH_INODE_UNLINKED; +- +- ret = write_inode(trans, &target, target_snapshot); +- return ret ?: -EINTR; ++ if (fsck_err_on(!target->nr, c, ++ "dirent points to missing inode:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = remove_dirent(trans, d.k->p); ++ if (ret) ++ return ret; + } + +- if (fsck_err_on(!backpointer_exists, c, +- "inode %llu has wrong backpointer:\n" +- "got %llu:%llu\n" +- "should be %llu:%llu", +- d_inum, +- target.bi_dir, +- target.bi_dir_offset, +- k.k->p.inode, +- k.k->p.offset)) { +- target.bi_dir = k.k->p.inode; +- target.bi_dir_offset = k.k->p.offset; +- +- ret = write_inode(trans, &target, target_snapshot); +- return ret ?: -EINTR; ++ for (i = target->d; i < target->d + target->nr; i++) { ++ ret = check_dirent_target(trans, iter, d, ++ &i->inode, i->snapshot); ++ if (ret) ++ return ret; + } + } + +- target_subvol = d.v->d_type == DT_SUBVOL +- ? 
le64_to_cpu(d.v->d_inum) : 0; +- +- if (fsck_err_on(target.bi_subvol != target_subvol, c, +- "subvol root %llu has wrong subvol field:\n" +- "got %u\n" +- "should be %u", +- target.bi_inum, +- target.bi_subvol, +- target_subvol)) { +- target.bi_subvol = target_subvol; +- +- ret = write_inode(trans, &target, target_snapshot); +- return ret ?: -EINTR; +- } +- +- if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target.bi_mode), c, +- "incorrect d_type: should be %u:\n%s", +- mode_to_type(target.bi_mode), +- (bch2_bkey_val_to_text(&PBUF(buf), c, +- k), buf))) { +- struct bkey_i_dirent *n; +- +- n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); +- if (!n) +- return -ENOMEM; +- +- bkey_reassemble(&n->k_i, d.s_c); +- n->v.d_type = mode_to_type(target.bi_mode); +- +- ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_btree_iter_traverse(iter) ?: +- bch2_trans_update(trans, iter, &n->k_i, 0)); +- kfree(n); +- return ret ?: -EINTR; +- } ++ if (d.v->d_type == DT_DIR) ++ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) ++ i->count++; + +- *nr_subdirs += d.v->d_type == DT_DIR; +- return 0; + fsck_err: + return ret; + } +@@ -989,31 +1598,39 @@ fsck_err: + noinline_for_stack + static int check_dirents(struct bch_fs *c) + { +- struct inode_walker w = inode_walker_init(); ++ struct inode_walker dir = inode_walker_init(); ++ struct inode_walker target = inode_walker_init(); ++ struct snapshots_seen s; + struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter iter; +- unsigned nr_subdirs = 0; + int ret = 0; + + bch_verbose(c, "checking dirents"); + ++ snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH); ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); + + do { + ret = lockrestart_do(&trans, +- check_dirent(&trans, &iter, &hash_info, &w, &nr_subdirs)); ++ check_dirent(&trans, &iter, &hash_info, ++ &dir, &target, &s)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); + +- return bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); ++ inode_walker_exit(&dir); ++ inode_walker_exit(&target); ++ return ret; + } + + /* +@@ -1036,15 +1653,22 @@ static int check_xattrs(struct bch_fs *c) + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH); ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); + retry: ++ bch2_trans_begin(&trans); ++ + while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k))) { +- ret = walk_inode(&trans, &w, k.k->p.inode); ++ ret = check_key_has_snapshot(&trans, &iter, k); + if (ret) + break; + +- if (fsck_err_on(!w.have_inode, c, ++ ret = walk_inode(&trans, &w, k.k->p); ++ if (ret < 0) ++ break; ++ ++ if (fsck_err_on(ret == INT_MAX, c, + "xattr for missing inode %llu", + k.k->p.inode)) { + ret = bch2_btree_delete_at(&trans, &iter, 0); +@@ -1053,14 +1677,18 @@ retry: + continue; + } + +- if (w.first_this_inode && w.have_inode) +- hash_info = bch2_hash_info_init(c, &w.inode); ++ if (ret == INT_MAX) ++ goto next; ++ ret = 0; ++ ++ if (w.first_this_inode) ++ hash_info = bch2_hash_info_init(c, &w.d[0].inode); + + ret = hash_check_key(&trans, bch2_xattr_hash_desc, + &hash_info, &iter, k); + if (ret) + break; +- ++next: + bch2_btree_iter_advance(&iter); + } + fsck_err: +@@ -1072,40 +1700,63 @@ fsck_err: 
+ } + + /* Get root directory, create if it doesn't exist: */ +-static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) ++static int check_root(struct bch_fs *c) + { +- struct bkey_inode_buf packed; ++ struct btree_trans trans; ++ struct bch_inode_unpacked root_inode; + u32 snapshot; ++ u64 inum; + int ret; + ++ bch2_trans_init(&trans, c, 0, 0); ++ + bch_verbose(c, "checking root directory"); + +- ret = bch2_trans_do(c, NULL, NULL, 0, +- lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot)); ++ ret = subvol_lookup(&trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); + if (ret && ret != -ENOENT) + return ret; + +- if (fsck_err_on(ret, c, "root directory missing")) +- goto create_root; ++ if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { ++ struct bkey_i_subvolume root_subvol; + +- if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, +- "root inode not a directory")) +- goto create_root; ++ snapshot = U32_MAX; ++ inum = BCACHEFS_ROOT_INO; + +- return 0; +-fsck_err: +- return ret; +-create_root: +- bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, +- 0, NULL); +- root_inode->bi_inum = BCACHEFS_ROOT_INO; ++ bkey_subvolume_init(&root_subvol.k_i); ++ root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; ++ root_subvol.v.flags = 0; ++ root_subvol.v.snapshot = cpu_to_le32(snapshot); ++ root_subvol.v.inode = cpu_to_le64(inum); ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ __bch2_btree_insert(&trans, BTREE_ID_subvolumes, &root_subvol.k_i)); ++ if (ret) { ++ bch_err(c, "error writing root subvol: %i", ret); ++ goto err; ++ } ++ ++ } ++ ++ ret = lookup_inode(&trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); ++ if (ret && ret != -ENOENT) ++ return ret; + +- bch2_inode_pack(c, &packed, root_inode); ++ if (mustfix_fsck_err_on(ret, c, "root directory missing") || ++ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, ++ "root inode not a directory")) { ++ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, ++ 0, NULL); ++ root_inode.bi_inum = inum; + +- return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, +- NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); ++ ret = write_inode(&trans, &root_inode, snapshot); ++ if (ret) ++ bch_err(c, "error writing root inode: %i", ret); ++ } ++err: ++fsck_err: ++ bch2_trans_exit(&trans); ++ return ret; + } + + struct pathbuf { +@@ -1147,17 +1798,18 @@ static int check_path(struct btree_trans *trans, + size_t i; + int ret = 0; + ++ snapshot = snapshot_t(c, snapshot)->equiv; + p->nr = 0; + + while (inode->bi_inum != BCACHEFS_ROOT_INO) { + ret = lockrestart_do(trans, +- inode_backpointer_exists(trans, inode)); ++ inode_backpointer_exists(trans, inode, snapshot)); + if (ret < 0) + break; + + if (!ret) { +- if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu", +- inode->bi_inum, ++ if (fsck_err(c, "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu", ++ inode->bi_inum, snapshot, + mode_to_type(inode->bi_mode), + inode->bi_nlink, + inode->bi_dir, +@@ -1226,7 +1878,8 @@ static int check_directory_structure(struct bch_fs *c) + + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH, k, ret) { ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->type != KEY_TYPE_inode) + continue; + +@@ -1237,6 +1890,9 @@ static int check_directory_structure(struct bch_fs *c) + break; + } + ++ if (u.bi_flags & BCH_INODE_UNLINKED) ++ continue; ++ + ret = check_path(&trans, &path, &u, 
iter.pos.snapshot); + if (ret) + break; +@@ -1295,8 +1951,9 @@ static int nlink_cmp(const void *_l, const void *_r) + return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); + } + +-static void inc_link(struct bch_fs *c, struct nlink_table *links, +- u64 range_start, u64 range_end, u64 inum) ++static void inc_link(struct bch_fs *c, struct snapshots_seen *s, ++ struct nlink_table *links, ++ u64 range_start, u64 range_end, u64 inum, u32 snapshot) + { + struct nlink *link, key = { + .inum = inum, .snapshot = U32_MAX, +@@ -1307,8 +1964,18 @@ static void inc_link(struct bch_fs *c, struct nlink_table *links, + + link = __inline_bsearch(&key, links->d, links->nr, + sizeof(links->d[0]), nlink_cmp); +- if (link) +- link->count++; ++ if (!link) ++ return; ++ ++ while (link > links->d && link[0].inum == link[-1].inum) ++ --link; ++ ++ for (; link < links->d + links->nr && link->inum == inum; link++) ++ if (ref_visible(c, s, snapshot, link->snapshot)) { ++ link->count++; ++ if (link->snapshot >= snapshot) ++ break; ++ } + } + + noinline_for_stack +@@ -1328,7 +1995,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, + for_each_btree_key(&trans, iter, BTREE_ID_inodes, + POS(0, start), + BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH, k, ret) { ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->type != KEY_TYPE_inode) + continue; + +@@ -1369,23 +2037,33 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links + u64 range_start, u64 range_end) + { + struct btree_trans trans; ++ struct snapshots_seen s; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + int ret; + ++ snapshots_seen_init(&s); ++ + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, + BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH, k, ret) { ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ ret = snapshots_seen_update(c, &s, k.k->p); ++ if (ret) ++ break; ++ + switch (k.k->type) { + case KEY_TYPE_dirent: + d = bkey_s_c_to_dirent(k); + +- if (d.v->d_type != DT_DIR) +- inc_link(c, links, range_start, range_end, +- le64_to_cpu(d.v->d_inum)); ++ if (d.v->d_type != DT_DIR && ++ d.v->d_type != DT_SUBVOL) ++ inc_link(c, &s, links, range_start, range_end, ++ le64_to_cpu(d.v->d_inum), ++ d.k->p.snapshot); + break; + } + +@@ -1393,10 +2071,11 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links + } + bch2_trans_iter_exit(&trans, &iter); + +- ret = bch2_trans_exit(&trans) ?: ret; + if (ret) + bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); + return ret; + } + +@@ -1418,7 +2097,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, + for_each_btree_key(&trans, iter, BTREE_ID_inodes, + POS(0, range_start), + BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH, k, ret) { ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset >= range_end) + break; + +@@ -1434,7 +2114,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, + if (!u.bi_nlink) + continue; + +- while (link->inum < k.k->p.offset) { ++ while ((cmp_int(link->inum, k.k->p.offset) ?: ++ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { + link++; + BUG_ON(link >= links->d + links->nr); + } +@@ -1507,14 +2188,13 @@ static int check_nlinks(struct bch_fs *c) + */ + int bch2_fsck_full(struct bch_fs *c) + { +- struct bch_inode_unpacked root_inode; +- + return bch2_fs_snapshots_check(c) ?: + 
check_inodes(c, true) ?: ++ check_subvols(c) ?: + check_extents(c) ?: + check_dirents(c) ?: + check_xattrs(c) ?: +- check_root(c, &root_inode) ?: ++ check_root(c) ?: + check_directory_structure(c) ?: + check_nlinks(c); + } +-- +cgit v1.2.3 + + +From 3a0b9047e870edfdf47be8426fd653933429d22d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 12 Mar 2021 20:30:39 -0500 +Subject: bcachefs: Convert io paths for snapshots + +This plumbs around the subvolume ID as was done previously for other +filesystem code, but now for the IO paths - the control flow in the IO +paths is trickier so the changes in this patch are more involved. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 66 ++++++++++++++----------- + fs/bcachefs/io.c | 128 ++++++++++++++++++++++++++++++------------------- + fs/bcachefs/io.h | 19 ++++---- + fs/bcachefs/io_types.h | 2 + + fs/bcachefs/reflink.c | 24 ++++++---- + 5 files changed, 145 insertions(+), 94 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 27516a162aec..66be27ad5649 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -795,23 +795,35 @@ static void readpage_bio_extend(struct readpages_iter *iter, + } + } + +-static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, +- struct bch_read_bio *rbio, u64 inum, ++static void bchfs_read(struct btree_trans *trans, ++ struct bch_read_bio *rbio, ++ subvol_inum inum, + struct readpages_iter *readpages_iter) + { + struct bch_fs *c = trans->c; ++ struct btree_iter iter; + struct bkey_buf sk; + int flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE; ++ u32 snapshot; + int ret = 0; + + rbio->c = c; + rbio->start_time = local_clock(); ++ rbio->subvol = inum.subvol; + + bch2_bkey_buf_init(&sk); + retry: + bch2_trans_begin(trans); ++ iter = (struct btree_iter) { NULL }; + ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, ++ SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), ++ BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); + while (1) { + struct bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; +@@ -826,15 +838,15 @@ retry: + break; + } + +- bch2_btree_iter_set_pos(iter, +- POS(inum, rbio->bio.bi_iter.bi_sector)); ++ bch2_btree_iter_set_pos(&iter, ++ POS(inum.inum, rbio->bio.bi_iter.bi_sector)); + +- k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + +- offset_into_extent = iter->pos.offset - ++ offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + +@@ -864,7 +876,7 @@ retry: + if (bkey_extent_is_allocation(k.k)) + bch2_add_page_sectors(&rbio->bio, k); + +- bch2_read_extent(trans, rbio, iter->pos, ++ bch2_read_extent(trans, rbio, iter.pos, + data_btree, k, offset_into_extent, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) +@@ -873,12 +885,14 @@ retry: + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); + } ++err: ++ bch2_trans_iter_exit(trans, &iter); + + if (ret == -EINTR) + goto retry; + + if (ret) { +- bch_err_inum_ratelimited(c, inum, ++ bch_err_inum_ratelimited(c, inum.inum, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bio_endio(&rbio->bio); +@@ -893,7 +907,6 @@ void bch2_readahead(struct readahead_control *ractl) + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); + struct btree_trans 
trans; +- struct btree_iter iter; + struct page *page; + struct readpages_iter readpages_iter; + int ret; +@@ -902,8 +915,6 @@ void bch2_readahead(struct readahead_control *ractl) + BUG_ON(ret); + + bch2_trans_init(&trans, c, 0, 0); +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, +- BTREE_ITER_SLOTS); + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + +@@ -924,22 +935,20 @@ void bch2_readahead(struct readahead_control *ractl) + rbio->bio.bi_end_io = bch2_readpages_end_io; + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + +- bchfs_read(&trans, &iter, rbio, inode->v.i_ino, ++ bchfs_read(&trans, rbio, inode_inum(inode), + &readpages_iter); + } + + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + +- bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + kfree(readpages_iter.pages); + } + + static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, +- u64 inum, struct page *page) ++ subvol_inum inum, struct page *page) + { + struct btree_trans trans; +- struct btree_iter iter; + + bch2_page_state_create(page, __GFP_NOFAIL); + +@@ -949,12 +958,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + + bch2_trans_init(&trans, c, 0, 0); +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, +- BTREE_ITER_SLOTS); +- +- bchfs_read(&trans, &iter, rbio, inum, NULL); +- +- bch2_trans_iter_exit(&trans, &iter); ++ bchfs_read(&trans, rbio, inum, NULL); + bch2_trans_exit(&trans); + } + +@@ -968,7 +972,7 @@ int bch2_readpage(struct file *file, struct page *page) + rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); + rbio->bio.bi_end_io = bch2_readpages_end_io; + +- __bchfs_readpage(c, rbio, inode->v.i_ino, page); ++ __bchfs_readpage(c, rbio, inode_inum(inode), page); + return 0; + } + +@@ -991,7 +995,7 @@ static int bch2_read_single_page(struct page *page, + rbio->bio.bi_private = &done; + rbio->bio.bi_end_io = bch2_read_single_page_end_io; + +- __bchfs_readpage(c, rbio, inode->v.i_ino, page); ++ __bchfs_readpage(c, rbio, inode_inum(inode), page); + wait_for_completion(&done); + + ret = blk_status_to_errno(rbio->bio.bi_status); +@@ -1135,6 +1139,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, + op->nr_replicas = nr_replicas; + op->res.nr_replicas = nr_replicas; + op->write_point = writepoint_hashed(inode->ei_last_dirtied); ++ op->subvol = inode->ei_subvol; + op->pos = POS(inode->v.i_ino, sector); + op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); +@@ -1766,7 +1771,7 @@ start: + if (iter->count) + closure_get(&dio->cl); + +- bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); ++ bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + } + + iter->count += shorten; +@@ -1847,7 +1852,8 @@ retry: + if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) + break; + +- if (nr_replicas > bch2_bkey_replicas(c, k) || ++ if (k.k->p.snapshot != snapshot || ++ nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; +@@ -1942,6 +1948,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) + op_journal_seq_set(&dio->op, &inode->ei_journal_seq); + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.nr_replicas = dio->op.opts.data_replicas; ++ dio->op.subvol = inode->ei_subvol; + dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + + if ((req->ki_flags & IOCB_DSYNC) && +@@ -2451,7 +2458,7 @@ int 
bch2_truncate(struct user_namespace *mnt_userns, + + truncate_setsize(&inode->v, iattr->ia_size); + +- ret = bch2_fpunch(c, inode->v.i_ino, ++ ret = bch2_fpunch(c, inode_inum(inode), + round_up(iattr->ia_size, block_bytes(c)) >> 9, + U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); +@@ -2511,7 +2518,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len + if (discard_start < discard_end) { + s64 i_sectors_delta = 0; + +- ret = bch2_fpunch(c, inode->v.i_ino, ++ ret = bch2_fpunch(c, inode_inum(inode), + discard_start, discard_end, + &inode->ei_journal_seq, + &i_sectors_delta); +@@ -2590,7 +2597,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + } else { + s64 i_sectors_delta = 0; + +- ret = bch2_fpunch(c, inode->v.i_ino, ++ ret = bch2_fpunch(c, inode_inum(inode), + offset >> 9, (offset + len) >> 9, + &inode->ei_journal_seq, + &i_sectors_delta); +@@ -2806,7 +2813,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + reservation.v.nr_replicas = disk_res.nr_replicas; + } + +- ret = bch2_extent_update(&trans, &iter, &reservation.k_i, ++ ret = bch2_extent_update(&trans, inode_inum(inode), &iter, ++ &reservation.k_i, + &disk_res, &inode->ei_journal_seq, + 0, &i_sectors_delta, true); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 177b01b941aa..4d44c86cf4f7 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -27,6 +27,7 @@ + #include "keylist.h" + #include "move.h" + #include "rebalance.h" ++#include "subvolume.h" + #include "super.h" + #include "super-io.h" + +@@ -220,7 +221,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + : 0; + + if (!*usage_increasing && +- (new_replicas > bch2_bkey_replicas(c, old) || ++ (new->k.p.snapshot != old.k->p.snapshot || ++ new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *usage_increasing = true; + +@@ -256,6 +258,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + } + + int bch2_extent_update(struct btree_trans *trans, ++ subvol_inum inum, + struct btree_iter *iter, + struct bkey_i *k, + struct disk_reservation *disk_res, +@@ -314,11 +317,8 @@ int bch2_extent_update(struct btree_trans *trans, + struct btree_iter inode_iter; + struct bch_inode_unpacked inode_u; + +- ret = bch2_inode_peek(trans, &inode_iter, &inode_u, +- (subvol_inum) { +- .subvol = BCACHEFS_ROOT_SUBVOL, +- .inum = k->k.p.inode, +- }, BTREE_ITER_INTENT); ++ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, ++ BTREE_ITER_INTENT); + if (ret) + return ret; + +@@ -374,22 +374,37 @@ int bch2_extent_update(struct btree_trans *trans, + return 0; + } + ++/* ++ * Returns -EINTR if we had to drop locks: ++ */ + int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, +- struct bpos end, u64 *journal_seq, +- s64 *i_sectors_delta) ++ subvol_inum inum, u64 end, ++ u64 *journal_seq, s64 *i_sectors_delta) + { + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); ++ struct bpos end_pos = POS(inum.inum, end); + struct bkey_s_c k; + int ret = 0, ret2 = 0; ++ u32 snapshot; + +- while ((bch2_trans_begin(trans), +- (k = bch2_btree_iter_peek(iter)).k) && +- bkey_cmp(iter->pos, end) < 0) { ++ while (1) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + ++ bch2_trans_begin(trans); ++ ++ ret = bch2_subvolume_get_snapshot(trans, 
inum.subvol, &snapshot); ++ if (ret) ++ goto btree_err; ++ ++ bch2_btree_iter_set_snapshot(iter, snapshot); ++ ++ k = bch2_btree_iter_peek(iter); ++ if (bkey_cmp(iter->pos, end_pos) >= 0) ++ break; ++ + ret = bkey_err(k); + if (ret) + goto btree_err; +@@ -399,9 +414,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); +- bch2_cut_back(end, &delete); ++ bch2_cut_back(end_pos, &delete); + +- ret = bch2_extent_update(trans, iter, &delete, ++ ret = bch2_extent_update(trans, inum, iter, &delete, + &disk_res, journal_seq, + 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); +@@ -414,36 +429,31 @@ btree_err: + break; + } + +- if (bkey_cmp(iter->pos, end) > 0) { +- bch2_btree_iter_set_pos(iter, end); +- ret = bch2_btree_iter_traverse(iter); +- } ++ if (bkey_cmp(iter->pos, end_pos) > 0) ++ bch2_btree_iter_set_pos(iter, end_pos); + + return ret ?: ret2; + } + +-int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, ++int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + u64 *journal_seq, s64 *i_sectors_delta) + { + struct btree_trans trans; + struct btree_iter iter; +- int ret = 0; ++ int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, +- POS(inum, start), +- BTREE_ITER_INTENT); ++ POS(inum.inum, start), ++ BTREE_ITER_INTENT); + +- ret = bch2_fpunch_at(&trans, &iter, POS(inum, end), ++ ret = bch2_fpunch_at(&trans, &iter, inum, end, + journal_seq, i_sectors_delta); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + +- if (ret == -EINTR) +- ret = 0; +- +- return ret; ++ return ret == -EINTR ? 0 : ret; + } + + int bch2_write_index_default(struct bch_write_op *op) +@@ -454,40 +464,51 @@ int bch2_write_index_default(struct bch_write_op *op) + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; + struct btree_iter iter; ++ subvol_inum inum = { ++ .subvol = op->subvol, ++ .inum = k->k.p.inode, ++ }; + int ret; + ++ BUG_ON(!inum.subvol); ++ + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, +- bkey_start_pos(&k->k), +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- + do { + bch2_trans_begin(&trans); + + k = bch2_keylist_front(keys); ++ bch2_bkey_buf_copy(&sk, c, k); + +- k->k.p.snapshot = iter.snapshot; ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, ++ &sk.k->k.p.snapshot); ++ if (ret == -EINTR) ++ continue; ++ if (ret) ++ break; + +- bch2_bkey_buf_realloc(&sk, c, k->k.u64s); +- bkey_copy(sk.k, k); +- bch2_cut_front(iter.pos, sk.k); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ bkey_start_pos(&sk.k->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + +- ret = bch2_extent_update(&trans, &iter, sk.k, ++ ret = bch2_extent_update(&trans, inum, &iter, sk.k, + &op->res, op_journal_seq(op), + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); ++ bch2_trans_iter_exit(&trans, &iter); ++ + if (ret == -EINTR) + continue; + if (ret) + break; + + if (bkey_cmp(iter.pos, k->k.p) >= 0) +- bch2_keylist_pop_front(keys); ++ bch2_keylist_pop_front(&op->insert_keys); ++ else ++ bch2_cut_front(iter.pos, k); + } while (!bch2_keylist_empty(keys)); + +- bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + +@@ -1647,7 +1668,7 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) + } + + static void 
bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, +- struct bvec_iter bvec_iter, u64 inode, ++ struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) + { +@@ -1711,7 +1732,10 @@ static void bch2_rbio_retry(struct work_struct *work) + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; +- u64 inode = rbio->read_pos.inode; ++ subvol_inum inum = { ++ .subvol = rbio->subvol, ++ .inum = rbio->read_pos.inode, ++ }; + struct bch_io_failures failed = { .nr = 0 }; + + trace_read_retry(&rbio->bio); +@@ -1727,12 +1751,12 @@ static void bch2_rbio_retry(struct work_struct *work) + flags &= ~BCH_READ_MAY_PROMOTE; + + if (flags & BCH_READ_NODECODE) { +- bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); ++ bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + +- __bch2_read(c, rbio, iter, inode, &failed, flags); ++ __bch2_read(c, rbio, iter, inum, &failed, flags); + } + } + +@@ -2174,6 +2198,7 @@ get_bio: + /* XXX: only initialize this if needed */ + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; ++ rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; +@@ -2276,25 +2301,31 @@ out_read_done: + } + + void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, +- struct bvec_iter bvec_iter, u64 inode, ++ struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures *failed, unsigned flags) + { + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; ++ u32 snapshot; + int ret; + + BUG_ON(flags & BCH_READ_NODECODE); + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, +- POS(inode, bvec_iter.bi_sector), +- BTREE_ITER_SLOTS); + retry: + bch2_trans_begin(&trans); ++ iter = (struct btree_iter) { NULL }; ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; + ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(inum.inum, bvec_iter.bi_sector, snapshot), ++ BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; +@@ -2309,7 +2340,7 @@ retry: + } + + bch2_btree_iter_set_pos(&iter, +- POS(inode, bvec_iter.bi_sector)); ++ POS(inum.inum, bvec_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); +@@ -2359,16 +2390,17 @@ retry: + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + } ++err: ++ bch2_trans_iter_exit(&trans, &iter); + + if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) + goto retry; + +- bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + if (ret) { +- bch_err_inum_ratelimited(c, inode, ++ bch_err_inum_ratelimited(c, inum.inum, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index bc0a0bd6f849..38efd39c664e 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -63,12 +63,13 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) + + int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, bool *, s64 *, s64 *); +-int bch2_extent_update(struct btree_trans *, struct 
btree_iter *, +- struct bkey_i *, struct disk_reservation *, +- u64 *, u64, s64 *, bool); ++int bch2_extent_update(struct btree_trans *, subvol_inum, ++ struct btree_iter *, struct bkey_i *, ++ struct disk_reservation *, u64 *, u64, s64 *, bool); ++ + int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, +- struct bpos, u64 *, s64 *); +-int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); ++ subvol_inum, u64, u64 *, s64 *); ++int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, u64 *, s64 *); + + int bch2_write_index_default(struct bch_write_op *); + +@@ -90,6 +91,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + op->devs_have.nr = 0; + op->target = 0; + op->opts = opts; ++ op->subvol = 0; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->write_point = (struct write_point_specifier) { 0 }; +@@ -157,10 +159,10 @@ static inline void bch2_read_extent(struct btree_trans *trans, + } + + void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, +- u64, struct bch_io_failures *, unsigned flags); ++ subvol_inum, struct bch_io_failures *, unsigned flags); + + static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, +- u64 inode) ++ subvol_inum inum) + { + struct bch_io_failures failed = { .nr = 0 }; + +@@ -168,8 +170,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + + rbio->c = c; + rbio->start_time = local_clock(); ++ rbio->subvol = inum.subvol; + +- __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed, ++ __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, + BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED); +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +index 0aab77951c4c..78bff13d36f2 100644 +--- a/fs/bcachefs/io_types.h ++++ b/fs/bcachefs/io_types.h +@@ -62,6 +62,7 @@ struct bch_read_bio { + /* + * pos we read from - different from data_pos for indirect extents: + */ ++ u32 subvol; + struct bpos read_pos; + + /* +@@ -122,6 +123,7 @@ struct bch_write_op { + u16 nonce; + struct bch_io_opts opts; + ++ u32 subvol; + struct bpos pos; + struct bversion version; + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index be4b47bc7438..92ff609453b8 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -212,6 +212,7 @@ s64 bch2_remap_range(struct bch_fs *c, + struct bpos dst_end = dst_start, src_end = src_start; + struct bpos src_want; + u64 dst_done; ++ u32 dst_snapshot, src_snapshot; + int ret = 0, ret2 = 0; + + if (!percpu_ref_tryget(&c->writes)) +@@ -243,15 +244,19 @@ s64 bch2_remap_range(struct bch_fs *c, + } + + ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, +- &src_iter.snapshot); ++ &src_snapshot); + if (ret) + continue; + ++ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); ++ + ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, +- &dst_iter.snapshot); ++ &dst_snapshot); + if (ret) + continue; + ++ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); ++ + dst_done = dst_iter.pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); + bch2_btree_iter_set_pos(&src_iter, src_want); +@@ -262,11 +267,11 @@ s64 bch2_remap_range(struct bch_fs *c, + continue; + + if (bkey_cmp(src_want, src_iter.pos) < 0) { +- ret = bch2_fpunch_at(&trans, &dst_iter, +- bpos_min(dst_end, +- POS(dst_iter.pos.inode, dst_iter.pos.offset + +- src_iter.pos.offset - src_want.offset)), +- journal_seq, i_sectors_delta); ++ ret = bch2_fpunch_at(&trans, &dst_iter, 
dst_inum, ++ min(dst_end.offset, ++ dst_iter.pos.offset + ++ src_iter.pos.offset - src_want.offset), ++ journal_seq, i_sectors_delta); + continue; + } + +@@ -303,8 +308,9 @@ s64 bch2_remap_range(struct bch_fs *c, + bch2_key_resize(&new_dst.k->k, + min(src_k.k->p.offset - src_want.offset, + dst_end.offset - dst_iter.pos.offset)); +- ret = bch2_extent_update(&trans, &dst_iter, new_dst.k, +- &disk_res, journal_seq, ++ ++ ret = bch2_extent_update(&trans, dst_inum, &dst_iter, ++ new_dst.k, &disk_res, journal_seq, + new_i_size, i_sectors_delta, + true); + bch2_disk_reservation_put(c, &disk_res); +-- +cgit v1.2.3 + + +From fdef9d0eb1eb3e84ba6c5f1869b5e15bcb6bbf94 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 2 Feb 2021 17:09:10 -0500 +Subject: bcachefs: Whiteouts for snapshots + +This patch adds KEY_TYPE_whiteout, a new type of whiteout for snapshots, +when we're deleting and the key being deleted is in an ancestor +snapshot - and updates the transaction update/commit path to use it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 4 +- + fs/bcachefs/bkey.h | 2 +- + fs/bcachefs/bkey_methods.c | 26 ++++++--- + fs/bcachefs/btree_update_leaf.c | 113 +++++++++++++++++++++++++++++++++++++--- + 4 files changed, 127 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 807cea622920..c082d5fce79a 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -323,7 +323,7 @@ static inline void bkey_init(struct bkey *k) + */ + #define BCH_BKEY_TYPES() \ + x(deleted, 0) \ +- x(discard, 1) \ ++ x(whiteout, 1) \ + x(error, 2) \ + x(cookie, 3) \ + x(hash_whiteout, 4) \ +@@ -357,7 +357,7 @@ struct bch_deleted { + struct bch_val v; + }; + +-struct bch_discard { ++struct bch_whiteout { + struct bch_val v; + }; + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index c4a66f28ef4b..7dee3d8e0a3d 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -55,7 +55,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) + #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) + + #define bkey_whiteout(_k) \ +- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) ++ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) + + enum bkey_lr_packed { + BKEY_PACKED_BOTH, +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 53604af29bcc..9355e9a33861 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -31,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c, + .key_invalid = deleted_key_invalid, \ + } + +-#define bch2_bkey_ops_discard (struct bkey_ops) { \ ++#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ + } + +@@ -101,6 +101,8 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) + + static unsigned bch2_key_types_allowed[] = { + [BKEY_TYPE_extents] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_error)| + (1U << KEY_TYPE_cookie)| + (1U << KEY_TYPE_extent)| +@@ -108,30 +110,43 @@ static unsigned bch2_key_types_allowed[] = { + (1U << KEY_TYPE_reflink_p)| + (1U << KEY_TYPE_inline_data), + [BKEY_TYPE_inodes] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_inode)| + (1U << KEY_TYPE_inode_generation), + [BKEY_TYPE_dirents] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_hash_whiteout)| + (1U << KEY_TYPE_dirent), + 
[BKEY_TYPE_xattrs] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_cookie)| + (1U << KEY_TYPE_hash_whiteout)| + (1U << KEY_TYPE_xattr), + [BKEY_TYPE_alloc] = ++ (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_alloc)| + (1U << KEY_TYPE_alloc_v2), + [BKEY_TYPE_quotas] = ++ (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_quota), + [BKEY_TYPE_stripes] = ++ (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_stripe), + [BKEY_TYPE_reflink] = ++ (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_reflink_v)| + (1U << KEY_TYPE_indirect_inline_data), + [BKEY_TYPE_subvolumes] = ++ (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_subvolume), + [BKEY_TYPE_snapshots] = ++ (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_snapshot), + [BKEY_TYPE_btree] = ++ (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_btree_ptr)| + (1U << KEY_TYPE_btree_ptr_v2), + }; +@@ -139,21 +154,18 @@ static unsigned bch2_key_types_allowed[] = { + const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type) + { +- unsigned key_types_allowed = (1U << KEY_TYPE_deleted)| +- bch2_key_types_allowed[type] ; +- + if (k.k->u64s < BKEY_U64s) + return "u64s too small"; + +- if (!(key_types_allowed & (1U << k.k->type))) ++ if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) + return "invalid key type for this btree"; + + if (type == BKEY_TYPE_btree && + bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; + +- if (btree_node_type_is_extents(type)) { +- if ((k.k->size == 0) != bkey_deleted(k.k)) ++ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { ++ if (k.k->size == 0) + return "bad size field"; + + if (k.k->size > k.k->p.offset) +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 37626fedfb3b..5689acbfa9f8 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -980,21 +980,24 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + } + } + +- if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k))) ++ if (!bkey_cmp(k.k->p, start)) + goto next; + + while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { ++ bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0; ++ bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0; ++ + /* + * If we're going to be splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ +- if (bkey_cmp(bkey_start_pos(k.k), start) < 0 && +- bkey_cmp(k.k->p, insert->k.p) > 0 && ++ if (((front_split && back_split) || ++ ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && + (compressed_sectors = bch2_bkey_sectors_compressed(k))) + trans->extra_journal_res += compressed_sectors; + +- if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { ++ if (front_split) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; +@@ -1005,6 +1008,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ bch2_trans_iter_exit(trans, &update_iter); ++ ++ if (ret) ++ goto err; ++ } ++ ++ if (k.k->p.snapshot != insert->k.p.snapshot && ++ (front_split || back_split)) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ 
goto err; ++ ++ bkey_reassemble(update, k); ++ ++ bch2_cut_front(start, update); ++ bch2_cut_back(insert->k.p, update); ++ ++ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, +@@ -1016,12 +1045,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + } + + if (bkey_cmp(k.k->p, insert->k.p) <= 0) { +- ret = bch2_btree_delete_at(trans, &iter, flags); ++ update = bch2_trans_kmalloc(trans, sizeof(*update)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = k.k->p; ++ ++ if (insert->k.p.snapshot != k.k->p.snapshot) { ++ update->k.p.snapshot = insert->k.p.snapshot; ++ update->k.type = KEY_TYPE_whiteout; ++ } ++ ++ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ bch2_trans_iter_exit(trans, &update_iter); ++ + if (ret) + goto err; + } + +- if (bkey_cmp(k.k->p, insert->k.p) > 0) { ++ if (back_split) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; +@@ -1029,10 +1078,15 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); + +- ret = bch2_trans_update(trans, &iter, update, flags); ++ bch2_trans_copy_iter(&update_iter, &iter); ++ update_iter.pos = update->k.p; ++ ret = bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ bch2_trans_iter_exit(trans, &update_iter); ++ + if (ret) + goto err; +- + goto out; + } + next: +@@ -1063,6 +1117,39 @@ err: + return ret; + } + ++/* ++ * When deleting, check if we need to emit a whiteout (because we're overwriting ++ * something in an ancestor snapshot) ++ */ ++static int need_whiteout_for_snapshot(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 snapshot = pos.snapshot; ++ int ret; ++ ++ if (!bch2_snapshot_parent(trans->c, pos.snapshot)) ++ return 0; ++ ++ pos.snapshot++; ++ ++ for_each_btree_key(trans, iter, btree_id, pos, ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (bkey_cmp(k.k->p, pos)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(trans->c, snapshot, ++ k.k->p.snapshot)) { ++ ret = !bkey_whiteout(k.k); ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ + int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) + { +@@ -1095,6 +1182,16 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + btree_insert_entry_cmp(i - 1, i) >= 0); + #endif + ++ if (bkey_deleted(&n.k->k) && ++ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { ++ int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ if (ret) ++ n.k->k.type = KEY_TYPE_whiteout; ++ } ++ + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: +-- +cgit v1.2.3 + + +From 41a98abf9d1e3c37c8b8d82927a759a3d0eb1ad5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 5 Aug 2021 00:41:41 -0400 +Subject: bcachefs: Update 
data move path for snapshots + +The data move path operates on existing extents, and not within a +subvolume as the regular IO paths do. It needs to change because it may +cause existing extents to be split, and when splitting an existing +extent in an ancestor snapshot we need to make sure the new split has +the same visibility in child snapshots as the existing extent. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 80 +++++++++++++++++++++++++++++++++++++++- + fs/bcachefs/fsck.c | 35 +----------------- + fs/bcachefs/io.c | 3 +- + fs/bcachefs/migrate.c | 6 ++- + fs/bcachefs/move.c | 81 ++++++++++++++++++++++++++++++++++++++++- + fs/bcachefs/subvolume.h | 38 +++++++++++++++++++ + 6 files changed, 203 insertions(+), 40 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5689acbfa9f8..f69f919d83ac 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -940,6 +940,43 @@ err: + goto retry; + } + ++static int check_pos_snapshot_overwritten(struct btree_trans *trans, ++ enum btree_id id, ++ struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (!snapshot_t(c, pos.snapshot)->children[0]) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, id, pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ while (1) { ++ k = bch2_btree_iter_prev(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ if (!k.k) ++ break; ++ ++ if (bkey_cmp(pos, k.k->p)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { ++ ret = 1; ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ + static int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, +@@ -964,6 +1001,28 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + goto out; + + if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { ++ /* ++ * We can't merge extents if they belong to interior snapshot ++ * tree nodes, and there's a snapshot in which one extent is ++ * visible and the other is not - i.e. if visibility is ++ * different. 
++ * ++ * Instead of checking if visibilitiy of the two extents is ++ * different, for now we just check if either has been ++ * overwritten: ++ */ ++ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ goto nomerge1; ++ ++ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ goto nomerge1; ++ + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; +@@ -979,7 +1038,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + goto next; + } + } +- ++nomerge1: ++ ret = 0; + if (!bkey_cmp(k.k->p, start)) + goto next; + +@@ -1097,7 +1157,23 @@ next: + goto out; + } + +- bch2_bkey_merge(c, bkey_i_to_s(insert), k); ++ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { ++ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); ++ if (ret < 0) ++ goto out; ++ if (ret) ++ goto nomerge2; ++ ++ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); ++ if (ret < 0) ++ goto out; ++ if (ret) ++ goto nomerge2; ++ ++ bch2_bkey_merge(c, bkey_i_to_s(insert), k); ++ } ++nomerge2: ++ ret = 0; + out: + if (!bkey_deleted(&insert->k)) { + /* +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index b4a6b3d2ed07..f9a6a0b3ce7a 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -473,24 +473,6 @@ out: + return ret; + } + +-struct snapshots_seen { +- struct bpos pos; +- size_t nr; +- size_t size; +- u32 *d; +-}; +- +-static void snapshots_seen_exit(struct snapshots_seen *s) +-{ +- kfree(s->d); +- s->d = NULL; +-} +- +-static void snapshots_seen_init(struct snapshots_seen *s) +-{ +- memset(s, 0, sizeof(*s)); +-} +- + static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) + { + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; +@@ -499,26 +481,11 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, str + s->nr = 0; + s->pos = pos; + +- if (s->nr == s->size) { +- size_t new_size = max(s->size, 128UL) * 2; +- u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); +- +- if (!d) { +- bch_err(c, "error reallocating snapshots_seen table (new size %zu)", +- new_size); +- return -ENOMEM; +- } +- +- s->size = new_size; +- s->d = d; +- } +- + /* Might get called multiple times due to lock restarts */ + if (s->nr && s->d[s->nr - 1] == pos.snapshot) + return 0; + +- s->d[s->nr++] = pos.snapshot; +- return 0; ++ return snapshots_seen_add(c, s, pos.snapshot); + } + + /** +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 4d44c86cf4f7..8c0697bf7828 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1830,7 +1830,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + +- ret = bch2_trans_update(trans, &iter, new, 0); ++ ret = bch2_trans_update(trans, &iter, new, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + out: + bch2_trans_iter_exit(trans, &iter); + return ret; +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 1899326d9754..7c764ee4ea09 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -48,7 +48,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, +- BTREE_ITER_PREFETCH); ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k))) { +@@ 
-74,7 +75,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); + + ret = bch2_btree_iter_traverse(&iter) ?: +- bch2_trans_update(&trans, &iter, sk.k, 0) ?: ++ bch2_trans_update(&trans, &iter, sk.k, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 32d94c6c8b15..44a61818d9a4 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -13,6 +13,7 @@ + #include "journal_reclaim.h" + #include "move.h" + #include "replicas.h" ++#include "subvolume.h" + #include "super-io.h" + #include "keylist.h" + +@@ -53,6 +54,81 @@ struct moving_context { + wait_queue_head_t wait; + }; + ++static int insert_snapshot_whiteouts(struct btree_trans *trans, ++ enum btree_id id, ++ struct bpos old_pos, ++ struct bpos new_pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter, update_iter; ++ struct bkey_s_c k; ++ struct snapshots_seen s; ++ int ret; ++ ++ if (!btree_type_has_snapshots(id)) ++ return 0; ++ ++ snapshots_seen_init(&s); ++ ++ if (!bkey_cmp(old_pos, new_pos)) ++ return 0; ++ ++ if (!snapshot_t(c, old_pos.snapshot)->children[0]) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, id, old_pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ while (1) { ++next: ++ k = bch2_btree_iter_prev(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ if (bkey_cmp(old_pos, k.k->p)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { ++ struct bkey_i *update; ++ size_t i; ++ ++ for (i = 0; i < s.nr; i++) ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) ++ goto next; ++ ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ break; ++ ++ bkey_init(&update->k); ++ update->k.p = new_pos; ++ update->k.p.snapshot = k.k->p.snapshot; ++ ++ bch2_trans_iter_init(trans, &update_iter, id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &update_iter); ++ if (ret) ++ break; ++ ++ ret = snapshots_seen_add(c, &s, k.k->p.snapshot); ++ if (ret) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ kfree(s.d); ++ ++ return ret; ++} ++ + static int bch2_migrate_index_update(struct bch_write_op *op) + { + struct bch_fs *c = op->c; +@@ -166,7 +242,10 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + + next_pos = insert->k.p; + +- ret = bch2_trans_update(&trans, &iter, insert, 0) ?: ++ ret = insert_snapshot_whiteouts(&trans, m->btree_id, ++ k.k->p, insert->k.p) ?: ++ bch2_trans_update(&trans, &iter, insert, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(&trans, &op->res, + op_journal_seq(op), + BTREE_INSERT_NOFAIL| +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index cea4c665af32..0740c7b7f772 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -54,6 +54,44 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances + return id == ancestor; + } + ++struct snapshots_seen { ++ struct bpos pos; ++ size_t nr; ++ size_t size; ++ u32 *d; ++}; ++ ++static inline void snapshots_seen_exit(struct snapshots_seen *s) ++{ ++ kfree(s->d); ++ s->d = NULL; ++} ++ ++static inline void 
snapshots_seen_init(struct snapshots_seen *s) ++{ ++ memset(s, 0, sizeof(*s)); ++} ++ ++static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) ++{ ++ if (s->nr == s->size) { ++ size_t new_size = max(s->size, 128UL) * 2; ++ u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); ++ ++ if (!d) { ++ bch_err(c, "error reallocating snapshots_seen table (new size %zu)", ++ new_size); ++ return -ENOMEM; ++ } ++ ++ s->size = new_size; ++ s->d = d; ++ } ++ ++ s->d[s->nr++] = id; ++ return 0; ++} ++ + int bch2_fs_snapshots_check(struct bch_fs *); + void bch2_fs_snapshots_exit(struct bch_fs *); + int bch2_fs_snapshots_start(struct bch_fs *); +-- +cgit v1.2.3 + + +From d9ad16dab5a61d5aba88b6d7e3aec8b152680b0e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 15 Dec 2021 20:38:56 -0500 +Subject: bcachefs: Fix unit & perf tests for snapshots + +This finishes updating the unit & perf tests for snapshots - btrees that +use snapshots now always require the snapshot field of the start +position to be a valid snapshot ID. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/tests.c | 61 ++++++++++++++++++++++++++++------------------------- + 1 file changed, 32 insertions(+), 29 deletions(-) + +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index d5a74f4db64d..d6facb76a0a2 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -14,12 +14,14 @@ static void delete_test_keys(struct bch_fs *c) + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_extents, +- POS(0, 0), POS(0, U64_MAX), ++ SPOS(0, 0, U32_MAX), ++ SPOS(0, U64_MAX, U32_MAX), + NULL); + BUG_ON(ret); + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, +- POS(0, 0), POS(0, U64_MAX), ++ SPOS(0, 0, U32_MAX), ++ SPOS(0, U64_MAX, U32_MAX), + NULL); + BUG_ON(ret); + } +@@ -144,7 +146,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, +- POS_MIN, 0, k, ret) { ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + if (k.k->p.inode) + break; + +@@ -200,7 +202,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_extents, +- POS_MIN, 0, k, ret) { ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + } +@@ -254,8 +256,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, +- 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + if (k.k->p.inode) + break; + +@@ -270,7 +272,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS, k, ret) { + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); +@@ -319,8 +322,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, +- 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; +@@ -333,7 +336,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + + i = 0; + +- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS, k, ret) { + 
BUG_ON(bkey_deleted(k.k) != !(i % 16)); + +@@ -361,7 +365,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr) + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); + + k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); +@@ -381,7 +386,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0); + + k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); +@@ -404,8 +410,6 @@ static int insert_test_extent(struct bch_fs *c, + struct bkey_i_cookie k; + int ret; + +- //pr_info("inserting %llu-%llu v %llu", start, end, test_version); +- + bkey_cookie_init(&k.k_i); + k.k_i.k.p.offset = end; + k.k_i.k.p.snapshot = U32_MAX; +@@ -541,10 +545,11 @@ static int rand_lookup(struct bch_fs *c, u64 nr) + u64 i; + + bch2_trans_init(&trans, c, 0, 0); +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); + + for (i = 0; i < nr; i++) { +- bch2_btree_iter_set_pos(&iter, POS(0, test_rand())); ++ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); + + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); +@@ -567,7 +572,7 @@ static int rand_mixed_trans(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); +@@ -594,7 +599,8 @@ static int rand_mixed(struct bch_fs *c, u64 nr) + u64 i, rand; + + bch2_trans_init(&trans, c, 0, 0); +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); + + for (i = 0; i < nr; i++) { + rand = test_rand(); +@@ -614,7 +620,6 @@ static int rand_mixed(struct bch_fs *c, u64 nr) + static int __do_delete(struct btree_trans *trans, struct bpos pos) + { + struct btree_iter iter; +- struct bkey_i delete; + struct bkey_s_c k; + int ret = 0; + +@@ -628,10 +633,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) + if (!k.k) + goto err; + +- bkey_init(&delete.k); +- delete.k.p = k.k->p; +- +- ret = bch2_trans_update(trans, &iter, &delete, 0); ++ ret = bch2_btree_delete_at(trans, &iter, 0); + err: + bch2_trans_iter_exit(trans, &iter); + return ret; +@@ -646,7 +648,7 @@ static int rand_delete(struct bch_fs *c, u64 nr) + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i++) { +- struct bpos pos = POS(0, test_rand()); ++ struct bpos pos = SPOS(0, test_rand(), U32_MAX); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __do_delete(&trans, pos)); +@@ -673,7 +675,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + insert.k.p = iter.pos; + +@@ -703,7 +705,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ret) + ; + 
bch2_trans_iter_exit(&trans, &iter); + +@@ -720,7 +723,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), + BTREE_ITER_INTENT, k, ret) { + struct bkey_i_cookie u; + +@@ -745,8 +749,7 @@ static int seq_delete(struct bch_fs *c, u64 nr) + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, +- POS(0, 0), POS(0, U64_MAX), +- NULL); ++ SPOS(0, 0, U32_MAX), POS_MAX, NULL); + if (ret) + bch_err(c, "error in seq_delete: %i", ret); + return ret; +-- +cgit v1.2.3 + + +From d78dcc475bee18a41dc065e9dd3fbc2c84437151 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Mar 2021 22:34:00 -0400 +Subject: bcachefs: Require snapshot id to be set + +Now that all the existing code has been converted for snapshots, this +patch changes the code for initializing a btree iterator to require a +snapshot to be specified, and also change bkey_invalid() to allow for +non U32_MAX snapshot IDs. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_methods.c | 2 +- + fs/bcachefs/btree_iter.c | 20 ++++++++------------ + 2 files changed, 9 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 9355e9a33861..874defd8aff8 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -182,7 +182,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + + if (type != BKEY_TYPE_btree && + btree_type_has_snapshots(type) && +- k.k->p.snapshot != U32_MAX) ++ !k.k->p.snapshot) + return "invalid snapshot field"; + + if (type != BKEY_TYPE_btree && +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ac071b60fda0..d805a090eacf 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -683,6 +683,9 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + + static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) + { ++ BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ !iter->pos.snapshot); ++ + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + +@@ -2486,20 +2489,13 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + btree_node_type_is_extents(btree_id)) + flags |= BTREE_ITER_IS_EXTENTS; + +- if (!btree_type_has_snapshots(btree_id) && +- !(flags & __BTREE_ITER_ALL_SNAPSHOTS)) ++ if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && ++ !btree_type_has_snapshots(btree_id)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; +-#if 0 +- /* let's have this be explicitly set: */ +- if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && +- btree_type_has_snapshots(btree_id) && +- !(flags & BTREE_ITER_ALL_SNAPSHOTS)) +- flags |= BTREE_ITER_FILTER_SNAPSHOTS; +-#endif + +- if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) +- pos.snapshot = btree_type_has_snapshots(btree_id) +- ? U32_MAX : 0; ++ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && ++ btree_type_has_snapshots(btree_id)) ++ flags |= BTREE_ITER_FILTER_SNAPSHOTS; + + iter->trans = trans; + iter->path = NULL; +-- +cgit v1.2.3 + + +From 7237659d87010aaa2fd8853530883f4f8541b920 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 16 Mar 2021 23:28:43 -0400 +Subject: bcachefs: Snapshot creation, deletion + +This is the final patch in the patch series implementing snapshots. +This patch implements two new ioctls that work like creation and +deletion of directories, but fancier. 
+ + - BCH_IOCTL_SUBVOLUME_CREATE, for creating new subvolumes and snaphots + - BCH_IOCTL_SUBVOLUME_DESTROY, for deleting subvolumes and snapshots + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 8 --- + fs/bcachefs/dirent.h | 4 -- + fs/bcachefs/fs-common.c | 182 +++++++++++++++++++++++++++++++++++++++--------- + fs/bcachefs/fs-common.h | 7 +- + fs/bcachefs/fs-ioctl.c | 168 ++++++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/fs.c | 29 ++++---- + fs/bcachefs/fs.h | 3 +- + fs/bcachefs/fsck.c | 7 +- + fs/bcachefs/recovery.c | 2 +- + fs/bcachefs/str_hash.h | 7 +- + 10 files changed, 348 insertions(+), 69 deletions(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index f290580594ce..8653a106809d 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -383,14 +383,6 @@ out: + return ret; + } + +-int bch2_dirent_delete_at(struct btree_trans *trans, +- const struct bch_hash_info *hash_info, +- struct btree_iter *iter) +-{ +- return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, +- hash_info, iter); +-} +- + int __bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index 88b784a99cb5..e7f65fbd8e65 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -33,10 +33,6 @@ int bch2_dirent_create(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, u8, + const struct qstr *, u64, u64 *, int); + +-int bch2_dirent_delete_at(struct btree_trans *, +- const struct bch_hash_info *, +- struct btree_iter *); +- + int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, + u32 *, u32 *, u64 *, bool); + +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 02bf32cc7659..3e8e3c5bf870 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -11,6 +11,11 @@ + + #include + ++static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) ++{ ++ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; ++} ++ + int bch2_create_trans(struct btree_trans *trans, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, +@@ -19,6 +24,7 @@ int bch2_create_trans(struct btree_trans *trans, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct posix_acl *default_acl, + struct posix_acl *acl, ++ subvol_inum snapshot_src, + unsigned flags) + { + struct bch_fs *c = trans->c; +@@ -27,10 +33,9 @@ int bch2_create_trans(struct btree_trans *trans, + subvol_inum new_inum = dir; + u64 now = bch2_current_time(c); + u64 cpu = raw_smp_processor_id(); +- u64 dir_offset = 0; + u64 dir_target; + u32 snapshot; +- unsigned dir_type; ++ unsigned dir_type = mode_to_type(mode); + int ret; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); +@@ -41,37 +46,122 @@ int bch2_create_trans(struct btree_trans *trans, + if (ret) + goto err; + +- bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); ++ if (!(flags & BCH_CREATE_SNAPSHOT)) { ++ /* Normal create path - allocate a new inode: */ ++ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + +- if (!name) +- new_inode->bi_flags |= BCH_INODE_UNLINKED; ++ if (flags & BCH_CREATE_TMPFILE) ++ new_inode->bi_flags |= BCH_INODE_UNLINKED; + +- ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); +- if (ret) +- goto err; ++ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); ++ if (ret) ++ goto err; ++ ++ snapshot_src = (subvol_inum) { 0 }; ++ } else { ++ /* ++ * Creating a snapshot - we're not 
allocating a new inode, but ++ * we do have to lookup the root inode of the subvolume we're ++ * snapshotting and update it (in the new snapshot): ++ */ ++ ++ if (!snapshot_src.inum) { ++ /* Inode wasn't specified, just snapshot: */ ++ struct btree_iter subvol_iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes, ++ POS(0, snapshot_src.subvol), 0); ++ k = bch2_btree_iter_peek_slot(&subvol_iter); ++ ++ ret = bkey_err(k); ++ if (!ret && k.k->type != KEY_TYPE_subvolume) { ++ bch_err(c, "subvolume %u not found", ++ snapshot_src.subvol); ++ ret = -ENOENT; ++ } ++ ++ if (!ret) ++ snapshot_src.inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); ++ bch2_trans_iter_exit(trans, &subvol_iter); ++ ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ if (new_inode->bi_subvol != snapshot_src.subvol) { ++ /* Not a subvolume root: */ ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ /* ++ * If we're not root, we have to own the subvolume being ++ * snapshotted: ++ */ ++ if (uid && new_inode->bi_uid != uid) { ++ ret = -EPERM; ++ goto err; ++ } ++ ++ flags |= BCH_CREATE_SUBVOL; ++ } + + new_inum.inum = new_inode->bi_inum; + dir_target = new_inode->bi_inum; +- dir_type = mode_to_type(new_inode->bi_mode); + +- if (default_acl) { +- ret = bch2_set_acl_trans(trans, new_inum, new_inode, +- default_acl, ACL_TYPE_DEFAULT); ++ if (flags & BCH_CREATE_SUBVOL) { ++ u32 new_subvol, dir_snapshot; ++ ++ ret = bch2_subvolume_create(trans, new_inode->bi_inum, ++ snapshot_src.subvol, ++ &new_subvol, &snapshot, ++ (flags & BCH_CREATE_SNAPSHOT_RO) != 0); + if (ret) + goto err; +- } + +- if (acl) { +- ret = bch2_set_acl_trans(trans, new_inum, new_inode, +- acl, ACL_TYPE_ACCESS); ++ new_inode->bi_parent_subvol = dir.subvol; ++ new_inode->bi_subvol = new_subvol; ++ new_inum.subvol = new_subvol; ++ dir_target = new_subvol; ++ dir_type = DT_SUBVOL; ++ ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); ++ ret = bch2_btree_iter_traverse(&dir_iter); + if (ret) + goto err; + } + +- if (name) { ++ if (!(flags & BCH_CREATE_SNAPSHOT)) { ++ if (default_acl) { ++ ret = bch2_set_acl_trans(trans, new_inum, new_inode, ++ default_acl, ACL_TYPE_DEFAULT); ++ if (ret) ++ goto err; ++ } ++ ++ if (acl) { ++ ret = bch2_set_acl_trans(trans, new_inum, new_inode, ++ acl, ACL_TYPE_ACCESS); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ if (!(flags & BCH_CREATE_TMPFILE)) { + struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); ++ u64 dir_offset; + +- if (S_ISDIR(new_inode->bi_mode)) ++ if (is_subdir_for_nlink(new_inode)) + dir_u->bi_nlink++; + dir_u->bi_mtime = dir_u->bi_ctime = now; + +@@ -87,11 +177,11 @@ int bch2_create_trans(struct btree_trans *trans, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; +- } + +- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { +- new_inode->bi_dir = dir_u->bi_inum; +- new_inode->bi_dir_offset = dir_offset; ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { ++ new_inode->bi_dir = dir_u->bi_inum; ++ new_inode->bi_dir_offset = dir_offset; ++ } + } + + inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; +@@ -160,7 +250,8 @@ int bch2_unlink_trans(struct btree_trans *trans, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, +- const struct qstr *name) ++ const struct qstr *name, ++ int 
deleting_snapshot) + { + struct bch_fs *c = trans->c; + struct btree_iter dir_iter = { NULL }; +@@ -169,6 +260,7 @@ int bch2_unlink_trans(struct btree_trans *trans, + struct bch_hash_info dir_hash; + subvol_inum inum; + u64 now = bch2_current_time(c); ++ struct bkey_s_c k; + int ret; + + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); +@@ -187,29 +279,51 @@ int bch2_unlink_trans(struct btree_trans *trans, + if (ret) + goto err; + +- if (inode_u->bi_dir == dirent_iter.pos.inode && +- inode_u->bi_dir_offset == dirent_iter.pos.offset) { +- inode_u->bi_dir = 0; +- inode_u->bi_dir_offset = 0; ++ if (deleting_snapshot == 1 && !inode_u->bi_subvol) { ++ ret = -ENOENT; ++ goto err; + } + +- if (S_ISDIR(inode_u->bi_mode)) { ++ if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, inum); + if (ret) + goto err; + } + +- if (dir.subvol != inum.subvol) { +- ret = bch2_subvolume_delete(trans, inum.subvol, false); ++ if (inode_u->bi_subvol) { ++ ret = bch2_subvolume_delete(trans, inode_u->bi_subvol, ++ deleting_snapshot); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek_slot(&dirent_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ /* ++ * If we're deleting a subvolume, we need to really delete the ++ * dirent, not just emit a whiteout in the current snapshot: ++ */ ++ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); ++ ret = bch2_btree_iter_traverse(&dirent_iter); + if (ret) + goto err; + } + ++ if (inode_u->bi_dir == dirent_iter.pos.inode && ++ inode_u->bi_dir_offset == dirent_iter.pos.offset) { ++ inode_u->bi_dir = 0; ++ inode_u->bi_dir_offset = 0; ++ } ++ + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; +- dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); ++ dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); + bch2_inode_nlink_dec(inode_u); + +- ret = bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: ++ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ &dir_hash, &dirent_iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); + err: +@@ -348,12 +462,12 @@ int bch2_rename_trans(struct btree_trans *trans, + goto err; + } + +- if (S_ISDIR(src_inode_u->bi_mode)) { ++ if (is_subdir_for_nlink(src_inode_u)) { + src_dir_u->bi_nlink--; + dst_dir_u->bi_nlink++; + } + +- if (dst_inum.inum && S_ISDIR(dst_inode_u->bi_mode)) { ++ if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { + dst_dir_u->bi_nlink--; + src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; + } +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +index 1bb2ac4dc13a..9bb0a9676147 100644 +--- a/fs/bcachefs/fs-common.h ++++ b/fs/bcachefs/fs-common.h +@@ -5,6 +5,9 @@ + struct posix_acl; + + #define BCH_CREATE_TMPFILE (1U << 0) ++#define BCH_CREATE_SUBVOL (1U << 1) ++#define BCH_CREATE_SNAPSHOT (1U << 2) ++#define BCH_CREATE_SNAPSHOT_RO (1U << 3) + + int bch2_create_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, +@@ -13,7 +16,7 @@ int bch2_create_trans(struct btree_trans *, subvol_inum, + uid_t, gid_t, umode_t, dev_t, + struct posix_acl *, + struct posix_acl *, +- unsigned); ++ subvol_inum, unsigned); + + int bch2_link_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, +@@ -23,7 +26,7 @@ int bch2_link_trans(struct btree_trans *, + int bch2_unlink_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, +- const struct qstr *); ++ const struct 
qstr *, int); + + int bch2_rename_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +index d7bcb2219b8d..3ed53f420e7e 100644 +--- a/fs/bcachefs/fs-ioctl.c ++++ b/fs/bcachefs/fs-ioctl.c +@@ -10,7 +10,11 @@ + #include "quota.h" + + #include ++#include + #include ++#include ++#include ++#include + + #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) + #define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ +@@ -292,6 +296,154 @@ err: + return ret; + } + ++static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, ++ struct bch_ioctl_subvolume arg) ++{ ++ struct inode *dir; ++ struct bch_inode_info *inode; ++ struct user_namespace *s_user_ns; ++ struct dentry *dst_dentry; ++ struct path src_path, dst_path; ++ int how = LOOKUP_FOLLOW; ++ int error; ++ subvol_inum snapshot_src = { 0 }; ++ unsigned lookup_flags = 0; ++ unsigned create_flags = BCH_CREATE_SUBVOL; ++ ++ if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| ++ BCH_SUBVOL_SNAPSHOT_RO)) ++ return -EINVAL; ++ ++ if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && ++ (arg.src_ptr || ++ (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) ++ return -EINVAL; ++ ++ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) ++ create_flags |= BCH_CREATE_SNAPSHOT; ++ ++ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) ++ create_flags |= BCH_CREATE_SNAPSHOT_RO; ++ ++ /* why do we need this lock? */ ++ down_read(&c->vfs_sb->s_umount); ++ ++ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) ++ sync_inodes_sb(c->vfs_sb); ++retry: ++ if (arg.src_ptr) { ++ error = user_path_at(arg.dirfd, ++ (const char __user *)(unsigned long)arg.src_ptr, ++ how, &src_path); ++ if (error) ++ goto err1; ++ ++ if (src_path.dentry->d_sb->s_fs_info != c) { ++ path_put(&src_path); ++ error = -EXDEV; ++ goto err1; ++ } ++ ++ snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); ++ } ++ ++ dst_dentry = user_path_create(arg.dirfd, ++ (const char __user *)(unsigned long)arg.dst_ptr, ++ &dst_path, lookup_flags); ++ error = PTR_ERR_OR_ZERO(dst_dentry); ++ if (error) ++ goto err2; ++ ++ if (dst_dentry->d_sb->s_fs_info != c) { ++ error = -EXDEV; ++ goto err3; ++ } ++ ++ if (dst_dentry->d_inode) { ++ error = -EEXIST; ++ goto err3; ++ } ++ ++ dir = dst_path.dentry->d_inode; ++ if (IS_DEADDIR(dir)) { ++ error = -ENOENT; ++ goto err3; ++ } ++ ++ s_user_ns = dir->i_sb->s_user_ns; ++ if (!kuid_has_mapping(s_user_ns, current_fsuid()) || ++ !kgid_has_mapping(s_user_ns, current_fsgid())) { ++ error = -EOVERFLOW; ++ goto err3; ++ } ++ ++ error = inode_permission(file_mnt_user_ns(filp), ++ dir, MAY_WRITE | MAY_EXEC); ++ if (error) ++ goto err3; ++ ++ if (!IS_POSIXACL(dir)) ++ arg.mode &= ~current_umask(); ++ ++ error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); ++ if (error) ++ goto err3; ++ ++ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && ++ !arg.src_ptr) ++ snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; ++ ++ inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir), ++ dst_dentry, arg.mode|S_IFDIR, ++ 0, snapshot_src, create_flags); ++ error = PTR_ERR_OR_ZERO(inode); ++ if (error) ++ goto err3; ++ ++ d_instantiate(dst_dentry, &inode->v); ++ fsnotify_mkdir(dir, dst_dentry); ++err3: ++ done_path_create(&dst_path, dst_dentry); ++err2: ++ if (arg.src_ptr) ++ path_put(&src_path); ++ ++ if (retry_estale(error, lookup_flags)) { ++ lookup_flags |= LOOKUP_REVAL; ++ goto retry; ++ } ++err1: ++ up_read(&c->vfs_sb->s_umount); ++ ++ return error; ++} ++ ++static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct 
file *filp, ++ struct bch_ioctl_subvolume arg) ++{ ++ struct path path; ++ int ret = 0; ++ ++ if (arg.flags) ++ return -EINVAL; ++ ++ ret = user_path_at(arg.dirfd, ++ (const char __user *)(unsigned long)arg.dst_ptr, ++ LOOKUP_FOLLOW, &path); ++ if (ret) ++ return ret; ++ ++ if (path.dentry->d_sb->s_fs_info != c) { ++ path_put(&path); ++ return -EXDEV; ++ } ++ ++ ret = __bch2_unlink(path.dentry->d_parent->d_inode, path.dentry, 1); ++ path_put(&path); ++ ++ return ret; ++} ++ + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) + { + struct bch_inode_info *inode = file_bch_inode(file); +@@ -322,6 +474,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) + case FS_IOC_GOINGDOWN: + return bch2_ioc_goingdown(c, (u32 __user *) arg); + ++ case BCH_IOCTL_SUBVOLUME_CREATE: { ++ struct bch_ioctl_subvolume i; ++ ++ if (copy_from_user(&i, (void __user *) arg, sizeof(i))) ++ return -EFAULT; ++ return bch2_ioctl_subvolume_create(c, file, i); ++ } ++ ++ case BCH_IOCTL_SUBVOLUME_DESTROY: { ++ struct bch_ioctl_subvolume i; ++ ++ if (copy_from_user(&i, (void __user *) arg, sizeof(i))) ++ return -EFAULT; ++ return bch2_ioctl_subvolume_destroy(c, file, i); ++ } ++ + default: + return bch2_fs_ioctl(c, cmd, (void __user *) arg); + } +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 69e888a88fb3..48f75f0aa7e4 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -240,12 +240,6 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + struct bch_inode_info *inode; + int ret; + +- /* +- * debug assert, to be removed when we start creating +- * subvolumes/snapshots: +- */ +- BUG_ON(inum.subvol != BCACHEFS_ROOT_SUBVOL); +- + inode = to_bch_ei(iget5_locked(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, +@@ -274,7 +268,8 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + struct bch_inode_info * + __bch2_create(struct user_namespace *mnt_userns, + struct bch_inode_info *dir, struct dentry *dentry, +- umode_t mode, dev_t rdev, unsigned flags) ++ umode_t mode, dev_t rdev, subvol_inum snapshot_src, ++ unsigned flags) + { + struct bch_fs *c = dir->v.i_sb->s_fs_info; + struct btree_trans trans; +@@ -319,7 +314,7 @@ retry: + from_kuid(mnt_userns, current_fsuid()), + from_kgid(mnt_userns, current_fsgid()), + mode, rdev, +- default_acl, acl, flags) ?: ++ default_acl, acl, snapshot_src, flags) ?: + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) +@@ -426,7 +421,8 @@ static int bch2_mknod(struct user_namespace *mnt_userns, + umode_t mode, dev_t rdev) + { + struct bch_inode_info *inode = +- __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, 0); ++ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, ++ (subvol_inum) { 0 }, 0); + + if (IS_ERR(inode)) + return PTR_ERR(inode); +@@ -493,7 +489,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, + return 0; + } + +-static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ++int __bch2_unlink(struct inode *vdir, struct dentry *dentry, ++ int deleting_snapshot) + { + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); +@@ -509,7 +506,8 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, + inode_inum(dir), &dir_u, +- &inode_u, &dentry->d_name)); ++ &inode_u, &dentry->d_name, ++ deleting_snapshot)); + + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); +@@ 
-527,6 +525,11 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) + return ret; + } + ++static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ++{ ++ return __bch2_unlink(vdir, dentry, -1); ++} ++ + static int bch2_symlink(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, + const char *symname) +@@ -536,7 +539,7 @@ static int bch2_symlink(struct user_namespace *mnt_userns, + int ret; + + inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, +- BCH_CREATE_TMPFILE); ++ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + +@@ -854,7 +857,7 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns, + { + struct bch_inode_info *inode = + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, +- BCH_CREATE_TMPFILE); ++ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); + + if (IS_ERR(inode)) + return PTR_ERR(inode); +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index b7655fbf7c31..48fc504e2da2 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -147,7 +147,7 @@ struct bch_inode_unpacked; + + struct bch_inode_info * + __bch2_create(struct user_namespace *, struct bch_inode_info *, +- struct dentry *, umode_t, dev_t, unsigned); ++ struct dentry *, umode_t, dev_t, subvol_inum, unsigned); + + int bch2_fs_quota_transfer(struct bch_fs *, + struct bch_inode_info *, +@@ -184,6 +184,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + int bch2_setattr_nonsize(struct user_namespace *, + struct bch_inode_info *, + struct iattr *); ++int __bch2_unlink(struct inode *, struct dentry *, int); + + void bch2_vfs_exit(void); + int bch2_vfs_init(void); +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index f9a6a0b3ce7a..16a1eae9b374 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -307,7 +307,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, +- &dir_hash_info, &iter); ++ &dir_hash_info, &iter, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +@@ -386,7 +386,8 @@ create_lostfound: + BTREE_INSERT_LAZY_RW, + bch2_create_trans(trans, root_inum, &root, + lostfound, &lostfound_str, +- 0, 0, S_IFDIR|0700, 0, NULL, NULL, 0)); ++ 0, 0, S_IFDIR|0700, 0, NULL, NULL, ++ (subvol_inum) { }, 0)); + if (ret) + bch_err(c, "error creating lost+found: %i", ret); + } +@@ -759,7 +760,7 @@ static int fsck_hash_delete_at(struct btree_trans *trans, + { + int ret; + retry: +- ret = bch2_hash_delete_at(trans, desc, info, iter) ?: ++ ret = bch2_hash_delete_at(trans, desc, info, iter, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 47c8fecc6839..64e0b542e779 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1485,7 +1485,7 @@ int bch2_fs_initialize(struct bch_fs *c) + &root_inode, &lostfound_inode, + &lostfound, + 0, 0, S_IFDIR|0700, 0, +- NULL, NULL, 0)); ++ NULL, NULL, (subvol_inum) { 0 }, 0)); + if (ret) { + bch_err(c, "error creating lost+found"); + goto err; +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 6418089531ad..6486e709b700 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -307,7 +307,8 @@ static __always_inline + int bch2_hash_delete_at(struct btree_trans *trans, + const struct bch_hash_desc desc, + 
const struct bch_hash_info *info, +- struct btree_iter *iter) ++ struct btree_iter *iter, ++ unsigned update_flags) + { + struct bkey_i *delete; + int ret; +@@ -325,7 +326,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, + delete->k.p = iter->pos; + delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; + +- return bch2_trans_update(trans, iter, delete, 0); ++ return bch2_trans_update(trans, iter, delete, update_flags); + } + + static __always_inline +@@ -342,7 +343,7 @@ int bch2_hash_delete(struct btree_trans *trans, + if (ret) + return ret; + +- ret = bch2_hash_delete_at(trans, desc, info, &iter); ++ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +-- +cgit v1.2.3 + + +From 81ee33b6b53f2971fc8b0051d04e8e405341d7f8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Sep 2021 01:56:31 -0400 +Subject: bcachefs: Fix an assertion + +We can end up in a strange situation where a btree_path points to a node +being freed even after pointers to it should have been replaced by +pointers to the new node - if the btree node has been reused since the +pointer to it was created. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 5a1420b392ba..c54e6b46a026 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -165,7 +165,8 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree_path *path; + + trans_for_each_path(trans, path) +- BUG_ON(path->l[b->c.level].b == b); ++ BUG_ON(path->l[b->c.level].b == b && ++ path->l[b->c.level].lock_seq == b->c.lock.state.seq); + + six_lock_write(&b->c.lock, NULL, NULL); + +-- +cgit v1.2.3 + + +From 53a957bdfc1a0195fe4ab7ff5d947fc5d17ccd7d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Sep 2021 13:25:18 -0400 +Subject: bcachefs: Rev the on disk format version for snapshots + +This will cause the compat code to be run that creates entries in the +subvolumes and snapshots btrees. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 3 ++- + fs/bcachefs/recovery.c | 21 ++++++++------------- + 2 files changed, 10 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index c082d5fce79a..0b8eabe5eaa4 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1258,7 +1258,8 @@ enum bcachefs_metadata_version { + bcachefs_metadata_version_snapshot = 12, + bcachefs_metadata_version_inode_backpointers = 13, + bcachefs_metadata_version_btree_ptr_sectors_written = 14, +- bcachefs_metadata_version_max = 15, ++ bcachefs_metadata_version_snapshot_2 = 15, ++ bcachefs_metadata_version_max = 16, + }; + + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 64e0b542e779..6afb37a2e1b0 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1004,11 +1004,10 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; +- struct bkey_inode_buf *packed; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, +- POS(0, BCACHEFS_ROOT_INO), 0); ++ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) +@@ -1025,13 +1024,7 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + +- packed = bch2_trans_kmalloc(trans, sizeof(*packed)); +- ret = PTR_ERR_OR_ZERO(packed); +- if (ret) +- goto err; +- +- bch2_inode_pack(c, packed, &inode); +- ret = bch2_trans_update(trans, &iter, &packed->inode.k_i, 0); ++ ret = bch2_inode_write(trans, &iter, &inode); + err: + bch2_trans_iter_exit(trans, &iter); + return ret; +@@ -1096,8 +1089,8 @@ int bch2_fs_recovery(struct bch_fs *c) + } else if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { + bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required"); + c->opts.version_upgrade = true; +- } else if (c->sb.version < bcachefs_metadata_version_snapshot) { +- bch_info(c, "filesystem version is prior to snapshot field - upgrading"); ++ } else if (c->sb.version < bcachefs_metadata_version_snapshot_2) { ++ bch_info(c, "filesystem version is prior to snapshots - upgrading"); + c->opts.version_upgrade = true; + } + +@@ -1267,7 +1260,9 @@ use_clean: + bch_verbose(c, "alloc write done"); + } + +- if (c->sb.version < bcachefs_metadata_version_snapshot) { ++ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { ++ bch2_fs_lazy_rw(c); ++ + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) +@@ -1281,7 +1276,7 @@ use_clean: + goto err; + bch_verbose(c, "reading snapshots done"); + +- if (c->sb.version < bcachefs_metadata_version_snapshot) { ++ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + /* set bi_subvol on root inode */ + err = "error upgrade root inode for subvolumes"; + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, +-- +cgit v1.2.3 + + +From 867bdd79f48f921605272ca9bf49079a0557019b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 30 Sep 2021 11:09:26 -0400 +Subject: bcachefs: Fix check_inode_update_hardlinks() + +We were incorrectly using bch2_inode_write(), which gets the snapshot ID +from the iterator, with a BTREE_ITER_ALL_SNAPSHOTS iterator - +fortunately caught by an assertion in the update path. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 16a1eae9b374..3622fb4d18e2 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -2094,11 +2094,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, + bch2_inode_nlink_get(&u), link->count)) { + bch2_inode_nlink_set(&u, link->count); + +- ret = __bch2_trans_do(&trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_btree_iter_traverse(&iter) ?: +- bch2_inode_write(&trans, &iter, &u)); ++ ret = write_inode(&trans, &u, k.k->p.snapshot); + if (ret) + bch_err(c, "error in fsck: error %i updating inode", ret); + } +-- +cgit v1.2.3 + + +From 67d1bdc4beb36b47e7f42fdf79f69588b36c73b1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 30 Sep 2021 17:51:18 -0400 +Subject: bcachefs: Fix a spurious fsck error + +We were getting spurious "multiple types of data in same bucket" errors +in fsck, because the check was running for (cached) stale pointers - +oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 49 +++++++++++++++++++++++++++++++++---------------- + 1 file changed, 33 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 307f287d95e6..079424227a1c 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -504,22 +504,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + +- if (fsck_err_on(g->mark.data_type && +- g->mark.data_type != data_type, c, +- "bucket %u:%zu different types of data in same bucket: %s, %s\n" +- "while marking %s", +- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), +- bch2_data_types[g->mark.data_type], +- bch2_data_types[data_type], +- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { +- if (data_type == BCH_DATA_btree) { +- g2->_mark.data_type = g->_mark.data_type = data_type; +- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); +- } else { +- do_update = true; +- } +- } +- + if (fsck_err_on(!g->gen_valid, c, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", +@@ -536,6 +520,19 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + } + } + ++ if (fsck_err_on(data_type == BCH_DATA_btree && ++ g->mark.gen != p.ptr.gen, c, ++ "bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, g->mark.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ g2->_mark.data_type = g->_mark.data_type = data_type; ++ g2->gen_valid = g->gen_valid = true; ++ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ } ++ + if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", +@@ -566,6 +563,26 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + do_update = true; + ++ if (p.ptr.gen != g->mark.gen) ++ continue; ++ ++ if (fsck_err_on(g->mark.data_type && ++ g->mark.data_type != data_type, c, ++ "bucket %u:%zu different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[g->mark.data_type], ++ bch2_data_types[data_type], ++ 
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ if (data_type == BCH_DATA_btree) { ++ g2->_mark.data_type = g->_mark.data_type = data_type; ++ g2->gen_valid = g->gen_valid = true; ++ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ } else { ++ do_update = true; ++ } ++ } ++ + if (p.has_ec) { + struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); + +-- +cgit v1.2.3 + + +From da3d2a3726732d2e2c19ffa9ad6deee8e318ee84 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 1 Oct 2021 10:08:13 -0400 +Subject: bcachefs: Fix allocator shutdown error message + +We return 1 to indicate kthread_should_stop() returned true - we +shouldn't be printing an error. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 897729918b99..eb74b96124c5 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -857,10 +857,10 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) + /* If we used NOWAIT, don't return the error: */ + if (!fifo_empty(&ca->free_inc)) + ret = 0; +- if (ret) { ++ if (ret < 0) + bch_err(ca, "error invalidating buckets: %i", ret); ++ if (ret) + return ret; +- } + + if (journal_seq) + ret = bch2_journal_flush_seq(&c->journal, journal_seq); +-- +cgit v1.2.3 + + +From 69bda615c30166c886e28d3652a8335c8fbb8220 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 30 Sep 2021 19:46:23 -0400 +Subject: bcachefs: bch2_subvolume_get() + +Factor out a little helper. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 27 ++++------------------ + fs/bcachefs/fs-common.c | 22 +++++------------- + fs/bcachefs/fsck.c | 23 ++++--------------- + fs/bcachefs/subvolume.c | 59 ++++++++++++++++++++++--------------------------- + fs/bcachefs/subvolume.h | 2 ++ + 5 files changed, 41 insertions(+), 92 deletions(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 8653a106809d..c7344ac87fcd 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -191,34 +191,15 @@ int __bch2_dirent_read_target(struct btree_trans *trans, + if (likely(d.v->d_type != DT_SUBVOL)) { + *inum = le64_to_cpu(d.v->d_inum); + } else { +- struct btree_iter iter; +- struct bkey_s_c k; +- struct bkey_s_c_subvolume s; ++ struct bch_subvolume s; + int ret; + + *subvol = le64_to_cpu(d.v->d_inum); +- bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, +- POS(0, *subvol), +- BTREE_ITER_CACHED); +- k = bch2_btree_iter_peek_slot(&iter); +- ret = bkey_err(k); +- if (ret) +- goto err; +- +- if (k.k->type != KEY_TYPE_subvolume) { +- ret = -ENOENT; +- goto err; +- } + +- s = bkey_s_c_to_subvolume(k); +- *snapshot = le32_to_cpu(s.v->snapshot); +- *inum = le64_to_cpu(s.v->inode); +-err: +- if (ret == -ENOENT && !is_fsck) +- bch2_fs_inconsistent(trans->c, "pointer to missing subvolume %u", +- *subvol); ++ ret = bch2_subvolume_get(trans, *subvol, !is_fsck, BTREE_ITER_CACHED, &s); + +- bch2_trans_iter_exit(trans, &iter); ++ *snapshot = le32_to_cpu(s.snapshot); ++ *inum = le64_to_cpu(s.inode); + } + + return ret; +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 3e8e3c5bf870..00c7ba17f6c8 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -67,26 +67,14 @@ int bch2_create_trans(struct btree_trans *trans, + + if (!snapshot_src.inum) { + /* Inode wasn't specified, just snapshot: */ +- struct btree_iter subvol_iter; +- struct bkey_s_c k; +- +- 
bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes, +- POS(0, snapshot_src.subvol), 0); +- k = bch2_btree_iter_peek_slot(&subvol_iter); +- +- ret = bkey_err(k); +- if (!ret && k.k->type != KEY_TYPE_subvolume) { +- bch_err(c, "subvolume %u not found", +- snapshot_src.subvol); +- ret = -ENOENT; +- } +- +- if (!ret) +- snapshot_src.inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); +- bch2_trans_iter_exit(trans, &subvol_iter); ++ struct bch_subvolume s; + ++ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, ++ BTREE_ITER_CACHED, &s); + if (ret) + goto err; ++ ++ snapshot_src.inum = le64_to_cpu(s.inode); + } + + ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 3622fb4d18e2..208bf6df82b5 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -103,29 +103,14 @@ static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + static int __subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) + { +- struct btree_iter iter; +- struct bkey_s_c k; ++ struct bch_subvolume s; + int ret; + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, +- POS(0, subvol), 0); +- k = bch2_btree_iter_peek_slot(&iter); +- ret = bkey_err(k); +- if (ret) +- goto err; ++ ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + +- if (k.k->type != KEY_TYPE_subvolume) { +- bch_err(trans->c, "subvolume %u not fonud", subvol); +- ret = -ENOENT; +- goto err; +- } +- +- *snapshot = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); +- *inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); +-err: +- bch2_trans_iter_exit(trans, &iter); ++ *snapshot = le32_to_cpu(s.snapshot); ++ *inum = le64_to_cpu(s.inode); + return ret; +- + } + + static int subvol_lookup(struct btree_trans *trans, u32 subvol, +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index ff3b4d2d86b9..d1c111050c35 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -89,23 +89,6 @@ int bch2_mark_snapshot(struct bch_fs *c, + return 0; + } + +-static int subvol_lookup(struct btree_trans *trans, unsigned id, struct bch_subvolume *s) +-{ +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, id), 0); +- k = bch2_btree_iter_peek_slot(&iter); +- ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 
0 : -ENOENT; +- +- if (!ret) +- *s = *bkey_s_c_to_subvolume(k).v; +- +- bch2_trans_iter_exit(trans, &iter); +- return ret; +-} +- + static int snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s) + { +@@ -195,7 +178,7 @@ static int bch2_snapshot_check(struct btree_trans *trans, + int ret; + + id = le32_to_cpu(s.v->subvol); +- ret = lockrestart_do(trans, subvol_lookup(trans, id, &subvol)); ++ ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", + s.k->p.offset, id); +@@ -798,34 +781,44 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, + le32_to_cpu(s.v->snapshot)); + } + +-int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, +- u32 *snapid) ++int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, ++ bool inconsistent_if_not_found, ++ int iter_flags, ++ struct bch_subvolume *s) + { + struct btree_iter iter; + struct bkey_s_c k; + int ret; + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, +- POS(0, subvol), +- BTREE_ITER_CACHED| +- BTREE_ITER_WITH_UPDATES); ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), ++ iter_flags); + k = bch2_btree_iter_peek_slot(&iter); +- ret = bkey_err(k); +- if (ret) +- goto err; ++ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT; + +- if (k.k->type != KEY_TYPE_subvolume) { ++ if (ret == -ENOENT && inconsistent_if_not_found) + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); +- ret = -EIO; +- goto err; +- } ++ if (!ret) ++ *s = *bkey_s_c_to_subvolume(k).v; + +- *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); +-err: + bch2_trans_iter_exit(trans, &iter); + return ret; + } + ++int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, ++ u32 *snapid) ++{ ++ struct bch_subvolume s; ++ int ret; ++ ++ ret = bch2_subvolume_get(trans, subvol, true, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_WITH_UPDATES, ++ &s); ++ ++ *snapid = le32_to_cpu(s.snapshot); ++ return ret; ++} ++ + /* XXX: mark snapshot id for deletion, walk btree and delete: */ + int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, + int deleting_snapshot) +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index 0740c7b7f772..ed02b982ff96 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -104,6 +104,8 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c) + .val_to_text = bch2_subvolume_to_text, \ + } + ++int bch2_subvolume_get(struct btree_trans *, unsigned, ++ bool, int, struct bch_subvolume *); + int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + + int bch2_subvolume_delete(struct btree_trans *, u32, int); +-- +cgit v1.2.3 + + +From 05cd6c0fe235e17273a2e1273512890778319325 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Oct 2021 14:53:21 -0400 +Subject: bcachefs: Fix bch2_dev_remove_alloc() + +It was missing a lockrestart_do(), to call bch2_trans_begin() and also +handle transaction restarts. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 1feb7dee2e0c..bb633e3df618 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1452,15 +1452,18 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < ca->mi.nbuckets; i++) { +- ret = bch2_btree_key_cache_flush(&trans, +- BTREE_ID_alloc, POS(ca->dev_idx, i)); ++ ret = lockrestart_do(&trans, ++ bch2_btree_key_cache_flush(&trans, ++ BTREE_ID_alloc, POS(ca->dev_idx, i))); + if (ret) + break; + } + bch2_trans_exit(&trans); + +- if (ret) ++ if (ret) { ++ bch_err(c, "error %i removing dev alloc info", ret); + return ret; ++ } + + return bch2_btree_delete_range(c, BTREE_ID_alloc, + POS(ca->dev_idx, 0), +-- +cgit v1.2.3 + + +From 9092667bf764e6e97994b1c22973e97e841a41bd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Oct 2021 14:54:50 -0400 +Subject: bcachefs: Ensure btree_path consistent with node iterators + +Btree node iterators want the interior btree_path to point to the same +pos as the returned btree node - this fixes a regression from the +introduction of btree_path, where rewriting/updating keys of btree nodes +(e.g. in bch2_dev_metadata_drop()) via btree node iterators. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index d805a090eacf..a8ca6b8503e8 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1874,13 +1874,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter) + + struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; + struct btree *b = NULL; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + +- ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) + goto out; + +@@ -1892,7 +1893,11 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; ++ ++ iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->flags & BTREE_ITER_INTENT); + iter->path->should_be_locked = true; ++ BUG_ON(iter->path->uptodate); + out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); +@@ -1957,7 +1962,11 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; ++ ++ iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->flags & BTREE_ITER_INTENT); + iter->path->should_be_locked = true; ++ BUG_ON(iter->path->uptodate); + out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); +-- +cgit v1.2.3 + + +From f6e270aa7e51b7d242bb04ec0908c6f338c3c181 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Oct 2021 14:56:56 -0400 +Subject: bcachefs: More btree iterator fixes + + - check for getting to the end of the btree in bch2_path_verify_locks + and __btree_path_traverse_all(), this fixes an infinite loop in + __btree_path_traverse_all(). + - relax requirement in bch2_btree_node_upgrade() that we must want an + intent lock, this fixes bugs with paths that point to interior nodes + (nonzero level). 
+ - bch2_btree_node_update_key(): fix it to upgrade the path to an intent + lock, if necessary + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 19 +++++++++++++++---- + fs/bcachefs/btree_update_interior.c | 9 +++++++++ + 2 files changed, 24 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index a8ca6b8503e8..f6b663dd46d9 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -173,11 +173,20 @@ static bool bch2_btree_node_upgrade(struct btree_trans *trans, + { + struct btree *b = path->l[level].b; + +- EBUG_ON(btree_lock_want(path, level) != BTREE_NODE_INTENT_LOCKED); +- + if (!is_btree_node(path, level)) + return false; + ++ switch (btree_lock_want(path, level)) { ++ case BTREE_NODE_UNLOCKED: ++ BUG_ON(btree_node_locked(path, level)); ++ return true; ++ case BTREE_NODE_READ_LOCKED: ++ BUG_ON(btree_node_intent_locked(path, level)); ++ return bch2_btree_node_relock(trans, path, level); ++ case BTREE_NODE_INTENT_LOCKED: ++ break; ++ } ++ + if (btree_node_intent_locked(path, level)) + return true; + +@@ -372,7 +381,8 @@ static void bch2_btree_path_verify_locks(struct btree_path *path) + unsigned l; + + if (!path->nodes_locked) { +- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); ++ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && ++ btree_path_node(path, path->level)); + return; + } + +@@ -1359,7 +1369,8 @@ retry_all: + + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + +- if (path->nodes_locked) ++ if (path->nodes_locked || ++ !btree_path_node(path, path->level)) + i++; + } + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index c54e6b46a026..98c05bb032a9 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1945,9 +1945,16 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite + { + struct bch_fs *c = trans->c; + struct btree *new_hash = NULL; ++ struct btree_path *path = iter->path; + struct closure cl; + int ret = 0; + ++ if (!btree_node_intent_locked(path, b->c.level) && ++ !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) { ++ btree_trans_restart(trans); ++ return -EINTR; ++ } ++ + closure_init_stack(&cl); + + /* +@@ -1966,8 +1973,10 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite + new_hash = bch2_btree_node_mem_alloc(c); + } + ++ path->intent_ref++; + ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, + new_key, skip_triggers); ++ --path->intent_ref; + + if (new_hash) { + mutex_lock(&c->btree_cache.lock); +-- +cgit v1.2.3 + + +From 92f9c93aa83866c834faa7ceb41079fa1cc7d22a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Oct 2021 14:59:00 -0400 +Subject: bcachefs: Fixes for usrdata/metadata drop paths + +These paths weren't updated for btree_path and snapshots - a couple of +minor fixes. 
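
The metadata-drop hunk below converts a for_each_btree_node() walk into an open-coded loop so that a transaction restart can retry the same node. Condensed to its skeleton (the per-node device filter and the bkey_buf k holding the rewritten key are elided; all identifiers are taken from the hunk):

  while (bch2_trans_begin(&trans),
         (b = bch2_btree_iter_peek_node(&iter))) {
          /* ... skip nodes without the device, copy/rewrite b->key into k ... */
          ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false);
          if (ret == -EINTR) {
                  ret = 0;
                  continue;       /* restarted: re-peek the same node */
          }
          if (ret)
                  break;
          bch2_btree_iter_next_node(&iter);
  }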
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/migrate.c | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 7c764ee4ea09..9f9eb799337e 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -51,7 +51,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + +- while ((k = bch2_btree_iter_peek(&iter)).k && ++ while ((bch2_trans_begin(&trans), ++ (k = bch2_btree_iter_peek(&iter)).k) && + !(ret = bkey_err(k))) { + if (!bch2_bkey_has_device(k, dev_idx)) { + bch2_btree_iter_advance(&iter); +@@ -72,8 +73,6 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + */ + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + +- bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); +- + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, sk.k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: +@@ -125,12 +124,14 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + closure_init_stack(&cl); + + for (id = 0; id < BTREE_ID_NR; id++) { +- for_each_btree_node(&trans, iter, id, POS_MIN, +- BTREE_ITER_PREFETCH, b) { +-retry: ++ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, ++ BTREE_ITER_PREFETCH); ++ ++ while (bch2_trans_begin(&trans), ++ (b = bch2_btree_iter_peek_node(&iter))) { + if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), + dev_idx)) +- continue; ++ goto next; + + bch2_bkey_buf_copy(&k, c, &b->key); + +@@ -143,14 +144,16 @@ retry: + + ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); + if (ret == -EINTR) { +- b = bch2_btree_iter_peek_node(&iter); + ret = 0; +- goto retry; ++ continue; + } ++ + if (ret) { + bch_err(c, "Error updating btree node key: %i", ret); + break; + } ++next: ++ bch2_btree_iter_next_node(&iter); + } + bch2_trans_iter_exit(&trans, &iter); + +-- +cgit v1.2.3 + + +From 86eb21192cb4f1e4780ce2c82641e98f819eb56a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Oct 2021 18:08:01 -0400 +Subject: bcachefs: Fix bch2_move_btree() + +bch2_trans_begin() is now required for transaction restarts. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 44a61818d9a4..fddf7e822614 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -883,9 +883,11 @@ static int bch2_move_btree(struct bch_fs *c, + id++) { + stats->btree_id = id; + +- for_each_btree_node(&trans, iter, id, +- id == start_btree_id ? start_pos : POS_MIN, +- BTREE_ITER_PREFETCH, b) { ++ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, ++ BTREE_ITER_PREFETCH); ++ ++ while (bch2_trans_begin(&trans), ++ (b = bch2_btree_iter_peek_node(&iter))) { + if (kthread && kthread_should_stop()) + break; + +@@ -911,6 +913,7 @@ static int bch2_move_btree(struct bch_fs *c, + b->data->keys.seq, 0) ?: ret; + next: + bch2_trans_cond_resched(&trans); ++ bch2_btree_iter_next_node(&iter); + } + bch2_trans_iter_exit(&trans, &iter); + +-- +cgit v1.2.3 + + +From 0281a775430efaa5c695f8e8cc3915b05286b713 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 7 Oct 2021 18:18:01 -0400 +Subject: bcachefs: Fix a pcpu var splat + +this_cpu_ptr() emits a warning when used without preemption disabled - +harmless in this case, as we have other locking where +bch2_acc_percpu_u64s() is used. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/util.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index 463260c04585..9f21f68e84d3 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -887,9 +887,14 @@ void eytzinger0_find_test(void) + */ + u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) + { +- u64 *ret = this_cpu_ptr(p); ++ u64 *ret; + int cpu; + ++ /* access to pcpu vars has to be blocked by other locking */ ++ preempt_disable(); ++ ret = this_cpu_ptr(p); ++ preempt_enable(); ++ + for_each_possible_cpu(cpu) { + u64 *i = per_cpu_ptr(p, cpu); + +-- +cgit v1.2.3 + + +From 548d4c654c62bc1b2b1086471ab8ed2b979b88ce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 30 Sep 2021 20:09:08 -0400 +Subject: bcachefs: Snapshot deletion fix + +When we delete a snapshot, we unlink the inode but we don't want to run +the inode_rm path - the unlink path deletes the subvolume directly, +which does everything we need. Also allowing the inode_rm path to run +was getting us "missing subvolume" errors. + +There's still another bug with snapshot deletion: we need to make +snapshot deletion a multi stage process, where we unlink the root +dentry, then tear down the page cache, then delete the snapshot. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-common.c | 30 +++++++++++++++++++++++------- + 1 file changed, 23 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 00c7ba17f6c8..c49de741e1e3 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -267,18 +267,33 @@ int bch2_unlink_trans(struct btree_trans *trans, + if (ret) + goto err; + +- if (deleting_snapshot == 1 && !inode_u->bi_subvol) { +- ret = -ENOENT; +- goto err; +- } +- + if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, inum); + if (ret) + goto err; + } + +- if (inode_u->bi_subvol) { ++ if (deleting_snapshot < 0 && ++ inode_u->bi_subvol) { ++ struct bch_subvolume s; ++ ++ ret = bch2_subvolume_get(trans, inode_u->bi_subvol, true, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_WITH_UPDATES, ++ &s); ++ if (ret) ++ goto err; ++ ++ if (BCH_SUBVOLUME_SNAP(&s)) ++ deleting_snapshot = 1; ++ } ++ ++ if (deleting_snapshot == 1) { ++ if (!inode_u->bi_subvol) { ++ ret = -ENOENT; ++ goto err; ++ } ++ + ret = bch2_subvolume_delete(trans, inode_u->bi_subvol, + deleting_snapshot); + if (ret) +@@ -297,6 +312,8 @@ int bch2_unlink_trans(struct btree_trans *trans, + ret = bch2_btree_iter_traverse(&dirent_iter); + if (ret) + goto err; ++ } else { ++ bch2_inode_nlink_dec(inode_u); + } + + if (inode_u->bi_dir == dirent_iter.pos.inode && +@@ -307,7 +324,6 @@ int bch2_unlink_trans(struct btree_trans *trans, + + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); +- bch2_inode_nlink_dec(inode_u); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash, &dirent_iter, +-- +cgit v1.2.3 + + +From 0c3c7f54d922ff60142af039dd3d5fe48a05af01 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 12 Oct 2021 14:15:45 -0400 +Subject: bcachefs: Fix rereplicate_pred() + +It was switching off of the key type incorrectly - this code must've +been quite old, and not rereplicating anything that wasn't a +btree_ptr_v1 or a plain old extent. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 13 +++---------- + 1 file changed, 3 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index fddf7e822614..0a8fe7085cc0 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -946,16 +946,9 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, + struct data_opts *data_opts) + { + unsigned nr_good = bch2_bkey_durability(c, k); +- unsigned replicas = 0; +- +- switch (k.k->type) { +- case KEY_TYPE_btree_ptr: +- replicas = c->opts.metadata_replicas; +- break; +- case KEY_TYPE_extent: +- replicas = io_opts->data_replicas; +- break; +- } ++ unsigned replicas = bkey_is_btree_ptr(k.k) ++ ? c->opts.metadata_replicas ++ : io_opts->data_replicas; + + if (!nr_good || nr_good >= replicas) + return DATA_SKIP; +-- +cgit v1.2.3 + + +From c48f6cbe33ec5e831724f79d32a9ba30960187de Mon Sep 17 00:00:00 2001 +From: Brett Holman +Date: Tue, 12 Oct 2021 21:11:25 -0600 +Subject: bcachefs: Add a valgrind memcheck hint + +Prevent false positives in bch2_varint_decode_fast() + +Signed-off-by: Brett Holman +--- + fs/bcachefs/varint.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c +index 752179b26a1e..a2d6bb7136c7 100644 +--- a/fs/bcachefs/varint.c ++++ b/fs/bcachefs/varint.c +@@ -4,6 +4,10 @@ + #include + #include + ++#ifdef CONFIG_VALGRIND ++#include ++#endif ++ + #include "varint.h" + + /** +@@ -95,6 +99,9 @@ int bch2_varint_encode_fast(u8 *out, u64 v) + */ + int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) + { ++#ifdef CONFIG_VALGRIND ++ VALGRIND_MAKE_MEM_DEFINED(in, 8); ++#endif + u64 v = get_unaligned_le64(in); + unsigned bytes = ffz(*in) + 1; + +-- +cgit v1.2.3 + + +From c50437e29d315b8afb3ee17afe264a627c86f993 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 12 Oct 2021 14:25:13 -0400 +Subject: bcachefs: Fix deletion in __bch2_dev_usrdata_drop() + +With snapshots, __bch2_dev_usr_data_drop() now uses an ALL_SNAPSHOTS +iterator, which isn't an extent iterator - meaning we shouldn't be +inserting whiteouts with nonzero size to delete. This fixes a bug where +we go RO because we tried to insert an invalid key in the device remove +path. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/migrate.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 9f9eb799337e..94d5d99ffd2a 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -73,6 +73,15 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + */ + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + ++ /* ++ * Since we're not inserting through an extent iterator ++ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), ++ * we aren't using the extent overwrite path to delete, we're ++ * just using the normal key deletion path: ++ */ ++ if (bkey_deleted(&sk.k->k)) ++ sk.k->k.size = 0; ++ + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, sk.k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: +-- +cgit v1.2.3 + + +From 2d2cedb0ac6ae5ba993ba706f0d7aef60373c33f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 13 Oct 2021 13:12:26 -0400 +Subject: bcachefs: Fix implementation of KEY_TYPE_error + +When force-removing a device, we were silently dropping extents that we +no longer had pointers for - we should have been switching them to +KEY_TYPE_error, so that reads for data that was lost return errors. 
+ +This patch adds the logic for switching a key to KEY_TYPE_error to +bch2_bkey_drop_ptr(), and improves the logic somewhat. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 1 - + fs/bcachefs/extents.c | 90 ++++++++++++++++++++++++++++++++++++-------------- + fs/bcachefs/extents.h | 14 ++++---- + fs/bcachefs/move.c | 2 +- + 4 files changed, 74 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 079424227a1c..236ecbd82a63 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1767,7 +1767,6 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + bch2_bkey_buf_reassemble(&sk, c, k); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + +- + commit_err = + bch2_trans_update(&trans, &iter, sk.k, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 6c2eed77a326..194fbe21c97f 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -480,7 +480,7 @@ restart_narrow_pointers: + + bkey_for_each_ptr_decode(&k->k, ptrs, p, i) + if (can_narrow_crc(p.crc, n)) { +- bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); ++ __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); + p.ptr.offset += p.crc.offset; + p.crc = n; + bch2_extent_ptr_decoded_append(k, &p); +@@ -785,41 +785,85 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + return i; + } + +-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, +- struct bch_extent_ptr *ptr) ++static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *next = extent_entry_next(entry); ++ ++ /* stripes have ptrs, but their layout doesn't work with this code */ ++ BUG_ON(k.k->type == KEY_TYPE_stripe); ++ ++ memmove_u64s_down(entry, next, ++ (u64 *) bkey_val_end(k) - (u64 *) next); ++ k.k->u64s -= (u64 *) next - (u64 *) entry; ++} ++ ++/* ++ * Returns pointer to the next entry after the one being dropped: ++ */ ++union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) + { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); +- union bch_extent_entry *dst, *src, *prev; ++ union bch_extent_entry *entry = to_entry(ptr), *next; ++ union bch_extent_entry *ret = entry; + bool drop_crc = true; + + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + +- src = extent_entry_next(to_entry(ptr)); +- if (src != ptrs.end && +- !extent_entry_is_crc(src)) +- drop_crc = false; +- +- dst = to_entry(ptr); +- while ((prev = extent_entry_prev(ptrs, dst))) { +- if (extent_entry_is_ptr(prev)) ++ for (next = extent_entry_next(entry); ++ next != ptrs.end; ++ next = extent_entry_next(next)) { ++ if (extent_entry_is_crc(next)) { + break; +- +- if (extent_entry_is_crc(prev)) { +- if (drop_crc) +- dst = prev; ++ } else if (extent_entry_is_ptr(next)) { ++ drop_crc = false; + break; + } ++ } ++ ++ extent_entry_drop(k, entry); + +- dst = prev; ++ while ((entry = extent_entry_prev(ptrs, entry))) { ++ if (extent_entry_is_ptr(entry)) ++ break; ++ ++ if ((extent_entry_is_crc(entry) && drop_crc) || ++ extent_entry_is_stripe_ptr(entry)) { ++ ret = (void *) ret - extent_entry_bytes(entry); ++ extent_entry_drop(k, entry); ++ } + } + +- memmove_u64s_down(dst, src, +- (u64 *) ptrs.end - (u64 *) src); +- k.k->u64s -= (u64 *) src - (u64 *) dst; ++ return ret; ++} ++ ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) ++{ ++ bool have_dirty = 
bch2_bkey_dirty_devs(k.s_c).nr; ++ union bch_extent_entry *ret = ++ __bch2_bkey_drop_ptr(k, ptr); ++ ++ /* ++ * If we deleted all the dirty pointers and there's still cached ++ * pointers, we could set the cached pointers to dirty if they're not ++ * stale - but to do that correctly we'd need to grab an open_bucket ++ * reference so that we don't race with bucket reuse: ++ */ ++ if (have_dirty && ++ !bch2_bkey_dirty_devs(k.s_c).nr) { ++ k.k->type = KEY_TYPE_error; ++ set_bkey_val_u64s(k.k, 0); ++ ret = NULL; ++ } else if (!bch2_bkey_nr_ptrs(k.s_c)) { ++ k.k->type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(k.k, 0); ++ ret = NULL; ++ } + +- return dst; ++ return ret; + } + + void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +@@ -889,10 +933,6 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + +- /* will only happen if all pointers were cached: */ +- if (!bch2_bkey_nr_ptrs(k.s_c)) +- k.k->type = KEY_TYPE_deleted; +- + return bkey_deleted(k.k); + } + +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index afd3067bb64e..9c2567274a2b 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -78,12 +78,12 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) + + static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) + { +- switch (extent_entry_type(e)) { +- case BCH_EXTENT_ENTRY_ptr: +- return true; +- default: +- return false; +- } ++ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; ++} ++ ++static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) ++{ ++ return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; + } + + static inline bool extent_entry_is_crc(const union bch_extent_entry *e) +@@ -578,6 +578,8 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); + void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); + void bch2_extent_ptr_decoded_append(struct bkey_i *, + struct extent_ptr_decoded *); ++union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, ++ struct bch_extent_ptr *); + union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, + struct bch_extent_ptr *); + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 0a8fe7085cc0..790389d485a4 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -196,7 +196,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + extent_for_each_ptr(extent_i_to_s(new), new_ptr) + new_ptr->cached = true; + +- bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); ++ __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); + } + + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { +-- +cgit v1.2.3 + + +From 36f92370b657a36a6c5028997013471c1a4db8ee Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 13 Oct 2021 13:45:46 -0400 +Subject: bcachefs: Don't allocate too-big bios + +This fixes a null ptr deref in bio_alloc_bioset() -> biovec_slab() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 2 ++ + fs/bcachefs/util.c | 6 +++++- + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 8c0697bf7828..708ba5590182 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -782,6 +782,8 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, + ? 
((unsigned long) buf & (PAGE_SIZE - 1)) + : 0), PAGE_SIZE); + ++ pages = min(pages, BIO_MAX_VECS); ++ + bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); + wbio = wbio_init(bio); + wbio->put_bio = true; +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index 9f21f68e84d3..52de7c49cacb 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -525,7 +525,11 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) + if (!page) + return -ENOMEM; + +- BUG_ON(!bio_add_page(bio, page, len, 0)); ++ if (unlikely(!bio_add_page(bio, page, len, 0))) { ++ __free_page(page); ++ break; ++ } ++ + size -= len; + } + +-- +cgit v1.2.3 + + +From 303a55b73d563b60987170b3de186ce62ed48cac Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 14 Oct 2021 11:47:52 -0400 +Subject: bcachefs: Improve bch2_dump_trans_paths_updates() + +Also print the key beyng overwritten for each update. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f6b663dd46d9..563b187ebca1 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1696,7 +1696,7 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + struct btree_path *path; + struct btree_insert_entry *i; + unsigned idx; +- char buf[300]; ++ char buf1[300], buf2[300]; + + btree_trans_verify_sorted(trans); + +@@ -1705,7 +1705,7 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + path->idx, path->ref, path->intent_ref, + path->preserve ? " preserve" : "", + bch2_btree_ids[path->btree_id], +- (bch2_bpos_to_text(&PBUF(buf), path->pos), buf), ++ (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), + #ifdef CONFIG_BCACHEFS_DEBUG + (void *) path->ip_allocated + #else +@@ -1713,11 +1713,16 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + #endif + ); + +- trans_for_each_update(trans, i) +- printk(KERN_ERR "update: btree %s %s %pS\n", ++ trans_for_each_update(trans, i) { ++ struct bkey u; ++ struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); ++ ++ printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", + bch2_btree_ids[i->btree_id], +- (bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)), buf), +- (void *) i->ip_allocated); ++ (void *) i->ip_allocated, ++ (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), ++ (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); ++ } + } + + static struct btree_path *btree_path_alloc(struct btree_trans *trans, +-- +cgit v1.2.3 + + +From c77af21b11ef8b157277fef9fc4d254d28525852 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 14 Oct 2021 13:14:40 -0400 +Subject: bcachefs: Fix __bch2_dirent_read_target() + +We were shadowing our exist status, oops + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index c7344ac87fcd..cd5468b15ba2 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -183,6 +183,7 @@ int __bch2_dirent_read_target(struct btree_trans *trans, + u32 *subvol, u32 *snapshot, u64 *inum, + bool is_fsck) + { ++ struct bch_subvolume s; + int ret = 0; + + *subvol = 0; +@@ -191,9 +192,6 @@ int __bch2_dirent_read_target(struct btree_trans *trans, + if (likely(d.v->d_type != DT_SUBVOL)) { + *inum = le64_to_cpu(d.v->d_inum); + } else { +- struct bch_subvolume s; +- int ret; +- + *subvol = 
le64_to_cpu(d.v->d_inum); + + ret = bch2_subvolume_get(trans, *subvol, !is_fsck, BTREE_ITER_CACHED, &s); +-- +cgit v1.2.3 + + +From a5aec2b59d98616c2209406117da98dcc5829108 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 18 Oct 2021 11:32:06 -0400 +Subject: bcachefs: Zero out reflink_p val in bch2_make_extent_indirect() + +This bug was only discovered when we started using the 2nd word in the +val, which should have been zeroed out as those fields had never been +used before - ouch. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/reflink.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 92ff609453b8..c63c95fc49b1 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -166,9 +166,15 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + if (ret) + goto err; + ++ /* ++ * orig is in a bkey_buf which statically allocates 5 64s for the val, ++ * so we know it will be big enough: ++ */ + orig->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(orig); + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); ++ memset(&r_p->v, 0, sizeof(r_p->v)); ++ + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + + ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); +-- +cgit v1.2.3 + + +From 9af94668b072d62534d8bd14386538fb4470f391 Mon Sep 17 00:00:00 2001 +From: Brett Holman +Date: Sat, 16 Oct 2021 19:13:53 -0600 +Subject: bcachefs: Fix compiler warnings + +Type size_t is architecture-specific. Fix warnings for some non-amd64 +arches. + +Signed-off-by: Brett Holman +--- + fs/bcachefs/journal_reclaim.c | 2 +- + fs/bcachefs/subvolume.h | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 7a0ae5d3431c..c468d597d427 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -653,7 +653,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + atomic_long_read(&c->btree_key_cache.nr_dirty), + atomic_long_read(&c->btree_key_cache.nr_keys)); + +- min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL); ++ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); + + nr_flushed = journal_flush_pins(j, seq_to_flush, + min_nr, min_key_cache); +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index ed02b982ff96..f98c8c0dbea2 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -75,7 +75,7 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) + static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) + { + if (s->nr == s->size) { +- size_t new_size = max(s->size, 128UL) * 2; ++ size_t new_size = max(s->size, (size_t) 128) * 2; + u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); + + if (!d) { +-- +cgit v1.2.3 + + +From 9abc2d389d151802b5e77b088bdca68ac3cf1140 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 18 Oct 2021 14:46:57 -0400 +Subject: bcachefs: Fix a cache coherency bug in bch2_subvolume_create() + +Subvolume deletion doesn't flush & evict the btree key cache - ideally +it would, but that's tricky, so instead bch2_subvolume_create() needs to +make sure the slot doesn't exist in the key cache to avoid triggering +assertions. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/subvolume.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index d1c111050c35..9bd8d61c96fe 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -886,6 +886,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 *new_snapshotid, + bool ro) + { ++ struct bch_fs *c = trans->c; + struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; + struct bkey_i_subvolume *new_subvol = NULL; + struct bkey_i_subvolume *src_subvol = NULL; +@@ -897,7 +898,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) + break; +- if (bkey_deleted(k.k)) ++ ++ /* ++ * bch2_subvolume_delete() doesn't flush the btree key cache - ++ * ideally it would but that's tricky ++ */ ++ if (bkey_deleted(k.k) && ++ !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos)) + goto found_slot; + } + +@@ -925,7 +932,7 @@ found_slot: + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { +- bch_err(trans->c, "subvolume %u not found", src_subvolid); ++ bch_err(c, "subvolume %u not found", src_subvolid); + ret = -ENOENT; + goto err; + } +-- +cgit v1.2.3 + + +From 62bd8641caf214f2887c8030dd5e3ef64410e8f0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 19 Oct 2021 01:08:05 -0400 +Subject: bcachefs: Fix check_path() across subvolumes + +Checking of directory structure across subvolumes was broken - we need +to look up the snapshot ID of the parent subvolume when crossing subvol +boundaries. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 24 ++++++++++++++++++++++-- + 1 file changed, 22 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 208bf6df82b5..826a3577ee93 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1050,6 +1050,8 @@ static int inode_backpointer_exists(struct btree_trans *trans, + { + struct btree_iter iter; + struct bkey_s_c k; ++ u32 target_subvol, target_snapshot; ++ u64 target_inum; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, +@@ -1061,7 +1063,15 @@ static int inode_backpointer_exists(struct btree_trans *trans, + if (k.k->type != KEY_TYPE_dirent) + goto out; + +- ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; ++ ret = __bch2_dirent_read_target(trans, bkey_s_c_to_dirent(k), ++ &target_subvol, ++ &target_snapshot, ++ &target_inum, ++ true); ++ if (ret) ++ goto out; ++ ++ ret = target_inum == inode->bi_inum; + out: + bch2_trans_iter_exit(trans, &iter); + return ret; +@@ -1754,7 +1764,17 @@ static int check_path(struct btree_trans *trans, + snapshot = snapshot_t(c, snapshot)->equiv; + p->nr = 0; + +- while (inode->bi_inum != BCACHEFS_ROOT_INO) { ++ while (!(inode->bi_inum == BCACHEFS_ROOT_INO && ++ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { ++ if (inode->bi_parent_subvol) { ++ u64 inum; ++ ++ ret = subvol_lookup(trans, inode->bi_parent_subvol, ++ &snapshot, &inum); ++ if (ret) ++ break; ++ } ++ + ret = lockrestart_do(trans, + inode_backpointer_exists(trans, inode, snapshot)); + if (ret < 0) +-- +cgit v1.2.3 + + +From d728293ef1d7a5a8508a307e967e015068fe64b9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 19 Oct 2021 12:27:47 -0400 +Subject: bcachefs: Improve reflink repair code + +When a reflink pointer points to an indirect extent that doesn't exist, +we need to replace it with a 
KEY_TYPE_error key. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/buckets.c | 51 +++++++++++++++++++++++++++++++++++++++++--------- + 2 files changed, 43 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 236ecbd82a63..8f6e73b1e260 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -738,7 +738,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + *max_stale = max(*max_stale, ptr_stale(ca, ptr)); + } + +- bch2_mark_key(c, *k, flags); ++ ret = bch2_mark_key(c, *k, flags); + fsck_err: + err: + if (ret) +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 5fd3aabb7669..d5ec4d727d0e 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -14,6 +14,7 @@ + #include "ec.h" + #include "error.h" + #include "movinggc.h" ++#include "recovery.h" + #include "reflink.h" + #include "replicas.h" + #include "subvolume.h" +@@ -1111,10 +1112,9 @@ static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, + { + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; ++ s64 ret = 0; + +- while (1) { +- if (*r_idx >= c->reflink_gc_nr) +- goto not_found; ++ while (*r_idx < c->reflink_gc_nr) { + r = genradix_ptr(&c->reflink_gc_table, *r_idx); + BUG_ON(!r); + +@@ -1123,16 +1123,49 @@ static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, + (*r_idx)++; + } + ++ if (*r_idx >= c->reflink_gc_nr || ++ idx < r->offset - r->size) { ++ ret = p.k->size; ++ goto not_found; ++ } ++ + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; + return r->offset - idx; + not_found: +- bch2_fs_inconsistent(c, +- "%llu:%llu len %u points to nonexistent indirect extent %llu", +- p.k->p.inode, p.k->p.offset, p.k->size, idx); +- bch2_inconsistent_error(c); +- return -EIO; ++ if ((flags & BTREE_TRIGGER_GC) && ++ (flags & BTREE_TRIGGER_NOATOMIC)) { ++ /* ++ * XXX: we're replacing the entire reflink pointer with an error ++ * key, we should just be replacing the part that was missing: ++ */ ++ if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", ++ p.k->p.inode, p.k->p.offset, p.k->size, idx)) { ++ struct bkey_i_error *new; ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) { ++ bch_err(c, "%s: error allocating new key", __func__); ++ return -ENOMEM; ++ } ++ ++ bkey_init(&new->k); ++ new->k.type = KEY_TYPE_error; ++ new->k.p = p.k->p; ++ new->k.size = p.k->size; ++ ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); ++ ++ } ++ } else { ++ bch2_fs_inconsistent(c, ++ "%llu:%llu len %u points to nonexistent indirect extent %llu", ++ p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ bch2_inconsistent_error(c); ++ ret = -EIO; ++ } ++fsck_err: ++ return ret; + } + + static int bch2_mark_reflink_p(struct bch_fs *c, +@@ -1164,7 +1197,7 @@ static int bch2_mark_reflink_p(struct bch_fs *c, + + while (sectors) { + ret = __bch2_mark_reflink_p(c, p, idx, flags, &l); +- if (ret < 0) ++ if (ret <= 0) + return ret; + + ret = min_t(s64, ret, sectors); +-- +cgit v1.2.3 + + +From 08f5634eb0816ef08e035b2c6587270fdacd1c64 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 19 Oct 2021 14:20:50 -0400 +Subject: bcachefs: for_each_btree_node() now returns errors directly + +This changes for_each_btree_node() to work like for_each_btree_key(), +and to that end bch2_btree_iter_peek_node() and next_node() also return +error ptrs. 
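
In the new form, callers supply an extra error variable and the loop stops as soon as peek_node()/next_node() hands back an error pointer. A sketch of a complete caller under the reworked convention, assuming a struct bch_fs *c is in scope; the btree ID and loop body are illustrative, while the macro arguments and init/exit calls are the ones used in the hunks:

  struct btree_trans trans;
  struct btree_iter iter;
  struct btree *b;
  int ret;

  bch2_trans_init(&trans, c, 0, 0);

  for_each_btree_node(&trans, iter, BTREE_ID_extents, POS_MIN,
                      BTREE_ITER_PREFETCH, b, ret)
          bch2_verify_btree_nr_keys(b);   /* per-node work goes here */

  bch2_trans_iter_exit(&trans, &iter);
  bch2_trans_exit(&trans);
  /* ret is 0, or the error returned by peek_node()/next_node() */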
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 4 ++-- + fs/bcachefs/btree_iter.c | 20 +++++++++++++------- + fs/bcachefs/btree_iter.h | 10 +++++----- + fs/bcachefs/btree_update_interior.c | 4 ++++ + fs/bcachefs/debug.c | 2 +- + fs/bcachefs/journal_seq_blacklist.c | 4 ++-- + fs/bcachefs/migrate.c | 8 ++++++-- + fs/bcachefs/move.c | 8 ++++++-- + 8 files changed, 39 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 8f6e73b1e260..dcbde49f07c4 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -806,7 +806,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); + + __for_each_btree_node(&trans, iter, btree_id, POS_MIN, +- 0, depth, BTREE_ITER_PREFETCH, b) { ++ 0, depth, BTREE_ITER_PREFETCH, b, ret) { + bch2_verify_btree_nr_keys(b); + + gc_pos_set(c, gc_pos_btree_node(b)); +@@ -833,7 +833,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + } + bch2_trans_iter_exit(&trans, &iter); + +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); + if (ret) + return ret; + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 563b187ebca1..af7eedcdf6ad 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1899,7 +1899,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) +- goto out; ++ goto err; + + b = btree_path_node(iter->path, iter->path->level); + if (!b) +@@ -1919,6 +1919,9 @@ out: + bch2_btree_iter_verify(iter); + + return b; ++err: ++ b = ERR_PTR(ret); ++ goto out; + } + + struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) +@@ -1935,7 +1938,9 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + if (!btree_path_node(path, path->level)) + goto out; + +- bch2_trans_cond_resched(trans); ++ ret = bch2_trans_cond_resched(trans); ++ if (ret) ++ goto err; + + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; +@@ -1944,7 +1949,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + ret = bch2_btree_path_traverse(trans, path, iter->flags); + if (ret) +- goto out; ++ goto err; + + /* got to end? 
*/ + b = btree_path_node(path, path->level); +@@ -1968,10 +1973,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + bch2_btree_iter_verify(iter); + + ret = bch2_btree_path_traverse(trans, path, iter->flags); +- if (ret) { +- b = NULL; +- goto out; +- } ++ if (ret) ++ goto err; + + b = path->l[path->level].b; + } +@@ -1988,6 +1991,9 @@ out: + bch2_btree_iter_verify(iter); + + return b; ++err: ++ b = ERR_PTR(ret); ++ goto out; + } + + /* Iterate across keys (in leaf nodes only) */ +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 19ca73f5ea22..620f80999795 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -258,18 +258,18 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) + } + } + +-#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ +- _locks_want, _depth, _flags, _b) \ ++#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _locks_want, _depth, _flags, _b, _ret) \ + for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ + _start, _locks_want, _depth, _flags), \ + _b = bch2_btree_iter_peek_node(&(_iter)); \ +- (_b); \ ++ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ + (_b) = bch2_btree_iter_next_node(&(_iter))) + + #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ +- _flags, _b) \ ++ _flags, _b, _ret) \ + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ +- 0, 0, _flags, _b) ++ 0, 0, _flags, _b, _ret) + + static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, + unsigned flags) +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 98c05bb032a9..591a2fedb89d 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1736,6 +1736,10 @@ retry: + goto out; + + b = bch2_btree_iter_peek_node(iter); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ goto out; ++ + if (!b || b->data->keys.seq != seq) + goto out; + +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 9f14bf4cb49a..294e4baf4deb 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -318,7 +318,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + + bch2_trans_init(&trans, i->c, 0, 0); + +- for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { ++ for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { + bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); + i->bytes = strlen(i->buf); + err = flush_buf(i); +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index 68fb2ebd91ac..f84a63ac15af 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -254,7 +254,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) + struct btree *b; + + for_each_btree_node(&trans, iter, i, POS_MIN, +- BTREE_ITER_PREFETCH, b) ++ BTREE_ITER_PREFETCH, b, ret) + if (test_bit(BCH_FS_STOPPING, &c->flags)) { + bch2_trans_exit(&trans); + return; +@@ -262,7 +262,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) + bch2_trans_iter_exit(&trans, &iter); + } + +- ret = bch2_trans_exit(&trans); ++ bch2_trans_exit(&trans); + if (ret) + return; + +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 94d5d99ffd2a..111a41159eb2 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -135,9 +135,10 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + for (id = 0; id < BTREE_ID_NR; id++) { + bch2_trans_node_iter_init(&trans, 
&iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); +- ++retry: + while (bch2_trans_begin(&trans), +- (b = bch2_btree_iter_peek_node(&iter))) { ++ (b = bch2_btree_iter_peek_node(&iter)) && ++ !(ret = PTR_ERR_OR_ZERO(b))) { + if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), + dev_idx)) + goto next; +@@ -164,6 +165,9 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + next: + bch2_btree_iter_next_node(&iter); + } ++ if (ret == -EINTR) ++ goto retry; ++ + bch2_trans_iter_exit(&trans, &iter); + + if (ret) +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 790389d485a4..92872939b868 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -885,9 +885,10 @@ static int bch2_move_btree(struct bch_fs *c, + + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); +- ++retry: + while (bch2_trans_begin(&trans), +- (b = bch2_btree_iter_peek_node(&iter))) { ++ (b = bch2_btree_iter_peek_node(&iter)) && ++ !(ret = PTR_ERR_OR_ZERO(b))) { + if (kthread && kthread_should_stop()) + break; + +@@ -915,6 +916,9 @@ next: + bch2_trans_cond_resched(&trans); + bch2_btree_iter_next_node(&iter); + } ++ if (ret == -EINTR) ++ goto retry; ++ + bch2_trans_iter_exit(&trans, &iter); + + if (kthread && kthread_should_stop()) +-- +cgit v1.2.3 + + +From c153630d78951715be867df0e2fafabdb2abdf38 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 19 Oct 2021 15:08:00 -0400 +Subject: bcachefs: bch2_trans_exit() no longer returns errors + +Now that peek_node()/next_node() are converted to return errors +directly, we don't need bch2_trans_exit() to return errors - it's +cleaner this way and wasn't used much anymore. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 8 ++------ + fs/bcachefs/btree_iter.h | 2 +- + fs/bcachefs/btree_types.h | 1 - + fs/bcachefs/btree_update.h | 6 +++--- + fs/bcachefs/dirent.c | 2 +- + fs/bcachefs/ec.c | 5 +++-- + fs/bcachefs/fs-io.c | 7 ++++--- + fs/bcachefs/fs.c | 2 +- + fs/bcachefs/fsck.c | 9 ++++++--- + fs/bcachefs/migrate.c | 4 ++-- + fs/bcachefs/move.c | 2 +- + fs/bcachefs/quota.c | 6 ++++-- + fs/bcachefs/reflink.c | 2 +- + fs/bcachefs/sysfs.c | 2 +- + fs/bcachefs/xattr.c | 2 +- + 15 files changed, 31 insertions(+), 29 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index af7eedcdf6ad..a54484e05e3a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1349,10 +1349,8 @@ retry_all: + } while (ret); + } + +- if (unlikely(ret == -EIO)) { +- trans->error = true; ++ if (unlikely(ret == -EIO)) + goto out; +- } + + BUG_ON(ret && ret != -EINTR); + +@@ -2742,7 +2740,7 @@ leaked: + #endif + } + +-int bch2_trans_exit(struct btree_trans *trans) ++void bch2_trans_exit(struct btree_trans *trans) + __releases(&c->btree_trans_barrier) + { + struct btree_insert_entry *i; +@@ -2792,8 +2790,6 @@ int bch2_trans_exit(struct btree_trans *trans) + + trans->mem = (void *) 0x1; + trans->paths = (void *) 0x1; +- +- return trans->error ? 
-EIO : 0; + } + + static void __maybe_unused +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 620f80999795..72aff955493b 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -325,7 +325,7 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter) + void *bch2_trans_kmalloc(struct btree_trans *, size_t); + void bch2_trans_begin(struct btree_trans *); + void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); +-int bch2_trans_exit(struct btree_trans *); ++void bch2_trans_exit(struct btree_trans *); + + void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 7fcd2ceb51e9..26aa3cd182d5 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -377,7 +377,6 @@ struct btree_trans { + u8 nr_sorted; + u8 nr_updates; + bool used_mempool:1; +- bool error:1; + bool in_traverse_all:1; + bool restarted:1; + /* +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 4d0ece342cf6..155643da35be 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -120,14 +120,14 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + #define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ + ({ \ + struct btree_trans trans; \ +- int _ret, _ret2; \ ++ int _ret; \ + \ + bch2_trans_init(&trans, (_c), 0, 0); \ + _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ + _do); \ +- _ret2 = bch2_trans_exit(&trans); \ ++ bch2_trans_exit(&trans); \ + \ +- _ret ?: _ret2; \ ++ _ret; \ + }) + + #define trans_for_each_update(_trans, _i) \ +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index cd5468b15ba2..26df20ad090c 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -491,7 +491,7 @@ err: + if (ret == -EINTR) + goto retry; + +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); + + return ret; + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index f0bdbdb2673d..9f87e2bc4468 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1670,11 +1670,12 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) + bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0); + + k = bch2_btree_iter_prev(&iter); +- if (!IS_ERR_OR_NULL(k.k)) ++ ret = bkey_err(k); ++ if (!ret && k.k) + idx = k.k->p.offset + 1; + + bch2_trans_iter_exit(&trans, &iter); +- ret = bch2_trans_exit(&trans); ++ bch2_trans_exit(&trans); + if (ret) + return ret; + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 66be27ad5649..0deb38949844 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2236,7 +2236,8 @@ err: + if (ret == -EINTR) + goto retry; + +- return bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); ++ return ret; + } + + static int __bch2_truncate_page(struct bch_inode_info *inode, +@@ -3140,7 +3141,7 @@ err: + if (ret == -EINTR) + goto retry; + +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); + if (ret) + return ret; + +@@ -3255,7 +3256,7 @@ err: + if (ret == -EINTR) + goto retry; + +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); + if (ret) + return ret; + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 48f75f0aa7e4..50135ec6af92 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1015,7 +1015,7 @@ err: + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), + FIEMAP_EXTENT_LAST); + +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); 
+ bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + return ret < 0 ? ret : 0; +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 826a3577ee93..a36bc840a62c 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -981,7 +981,8 @@ static int check_inodes(struct bch_fs *c, bool full) + + BUG_ON(ret == -EINTR); + +- return bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); ++ return ret; + } + + noinline_for_stack +@@ -1659,7 +1660,8 @@ fsck_err: + goto retry; + + bch2_trans_iter_exit(&trans, &iter); +- return bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); ++ return ret; + } + + /* Get root directory, create if it doesn't exist: */ +@@ -1876,7 +1878,8 @@ static int check_directory_structure(struct bch_fs *c) + + kfree(path.entries); + +- return bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); ++ return ret; + } + + struct nlink_table { +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 111a41159eb2..00ba6e1c92ee 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -100,7 +100,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags + } + bch2_trans_iter_exit(&trans, &iter); + +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + BUG_ON(ret == -EINTR); +@@ -180,7 +180,7 @@ next: + + ret = 0; + err: +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&k, c); + + BUG_ON(ret == -EINTR); +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 92872939b868..8f1536882091 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -773,7 +773,7 @@ next_nondata: + out: + + bch2_trans_iter_exit(&trans, &iter); +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 9b0f4d3f176d..17fd5bf107bb 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -374,7 +374,8 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) + } + bch2_trans_iter_exit(&trans, &iter); + +- return bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); ++ return ret; + } + + void bch2_fs_quota_exit(struct bch_fs *c) +@@ -452,7 +453,8 @@ int bch2_fs_quota_read(struct bch_fs *c) + } + bch2_trans_iter_exit(&trans, &iter); + +- return bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); ++ return ret; + } + + /* Enable/disable/delete quotas for an entire filesystem: */ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index c63c95fc49b1..9bcf4216a286 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -349,7 +349,7 @@ s64 bch2_remap_range(struct bch_fs *c, + bch2_trans_iter_exit(&trans, &inode_iter); + } while (ret2 == -EINTR); + +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&new_src, c); + bch2_bkey_buf_exit(&new_dst, c); + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 92e58f5c6bbf..51eb19b84a28 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -327,7 +327,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c + } + bch2_trans_iter_exit(&trans, &iter); + +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); + if (ret) + return ret; + +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index a182e242a0e8..fe572b2375eb 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -316,7 +316,7 @@ err: + if 
(ret == -EINTR) + goto retry; + +- ret = bch2_trans_exit(&trans) ?: ret; ++ bch2_trans_exit(&trans); + + if (ret) + return ret; +-- +cgit v1.2.3 + + +From a961d4a837a3580cbb9df4a0af691a844a74c9c3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 19 Oct 2021 15:11:45 -0400 +Subject: bcachefs: Handle transaction restarts in bch2_blacklist_entries_gc() + +It shouldn't be necessary when we're only using a single iterator and +not doing updates, but that's harder to debug at the moment. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_seq_blacklist.c | 21 +++++++++++++++------ + fs/bcachefs/migrate.c | 1 + + fs/bcachefs/move.c | 1 + + 3 files changed, 17 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index f84a63ac15af..79bc0e49389b 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -253,12 +253,21 @@ void bch2_blacklist_entries_gc(struct work_struct *work) + struct btree_iter iter; + struct btree *b; + +- for_each_btree_node(&trans, iter, i, POS_MIN, +- BTREE_ITER_PREFETCH, b, ret) +- if (test_bit(BCH_FS_STOPPING, &c->flags)) { +- bch2_trans_exit(&trans); +- return; +- } ++ bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, ++ 0, 0, BTREE_ITER_PREFETCH); ++retry: ++ bch2_trans_begin(&trans); ++ ++ b = bch2_btree_iter_peek_node(&iter); ++ ++ while (!(ret = PTR_ERR_OR_ZERO(b)) && ++ b && ++ !test_bit(BCH_FS_STOPPING, &c->flags)) ++ b = bch2_btree_iter_next_node(&iter); ++ ++ if (ret == -EINTR) ++ goto retry; ++ + bch2_trans_iter_exit(&trans, &iter); + } + +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 00ba6e1c92ee..6defc33322b3 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -136,6 +136,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); + retry: ++ ret = 0; + while (bch2_trans_begin(&trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 8f1536882091..20396820bbb3 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -886,6 +886,7 @@ static int bch2_move_btree(struct bch_fs *c, + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); + retry: ++ ret = 0; + while (bch2_trans_begin(&trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { +-- +cgit v1.2.3 + + +From d53fc016be7b14fde52a791f6a57fb09d90406f0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 19 Oct 2021 17:30:16 -0400 +Subject: bcachefs: New on disk format to fix reflink_p pointers + +We had a bug where reflink_p pointers weren't being initialized to 0, +and when we started using the second word, things broke badly. + +This patch revs the on disk format version and adds cleanup code to zero +out the second word of reflink_p pointers before we start using it. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 8 ++--- + fs/bcachefs/fsck.c | 68 ++++++++++++++++++++++++++++++++++++++++++- + fs/bcachefs/recovery.c | 8 ++--- + 3 files changed, 73 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 0b8eabe5eaa4..ed7d12aa317c 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -913,10 +913,7 @@ struct bch_stripe { + struct bch_reflink_p { + struct bch_val v; + __le64 idx; +- +- __le32 reservation_generation; +- __u8 nr_replicas; +- __u8 pad[3]; ++ __le64 v2; + }; + + struct bch_reflink_v { +@@ -1259,7 +1256,8 @@ enum bcachefs_metadata_version { + bcachefs_metadata_version_inode_backpointers = 13, + bcachefs_metadata_version_btree_ptr_sectors_written = 14, + bcachefs_metadata_version_snapshot_2 = 15, +- bcachefs_metadata_version_max = 16, ++ bcachefs_metadata_version_reflink_p_fix = 16, ++ bcachefs_metadata_version_max = 17, + }; + + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index a36bc840a62c..b43c31b95dff 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -2154,6 +2154,71 @@ static int check_nlinks(struct bch_fs *c) + return ret; + } + ++static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ struct bkey_s_c k; ++ struct bkey_s_c_reflink_p p; ++ struct bkey_i_reflink_p *u; ++ int ret; ++ ++ k = bch2_btree_iter_peek(iter); ++ if (!k.k) ++ return 0; ++ ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_reflink_p) ++ return 0; ++ ++ p = bkey_s_c_to_reflink_p(k); ++ ++ if (!p.v->v2) ++ return 0; ++ ++ u = bch2_trans_kmalloc(trans, sizeof(*u)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(&u->k_i, k); ++ u->v.v2 = 0; ++ ++ return bch2_trans_update(trans, iter, &u->k_i, 0); ++} ++ ++static int fix_reflink_p(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) ++ return 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (k.k->type == KEY_TYPE_reflink_p) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ fix_reflink_p_key(&trans, &iter)); ++ if (ret) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ + /* + * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
+ * Doesn't fix them yet, mainly because they haven't yet been observed: +@@ -2168,7 +2233,8 @@ int bch2_fsck_full(struct bch_fs *c) + check_xattrs(c) ?: + check_root(c) ?: + check_directory_structure(c) ?: +- check_nlinks(c); ++ check_nlinks(c) ?: ++ fix_reflink_p(c); + } + + int bch2_fsck_walk_inodes_only(struct bch_fs *c) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 6afb37a2e1b0..8c53b1e977d1 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1086,12 +1086,10 @@ int bch2_fs_recovery(struct bch_fs *c) + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; +- } else if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { +- bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required"); +- c->opts.version_upgrade = true; +- } else if (c->sb.version < bcachefs_metadata_version_snapshot_2) { +- bch_info(c, "filesystem version is prior to snapshots - upgrading"); ++ } else if (c->sb.version < bcachefs_metadata_version_reflink_p_fix) { ++ bch_info(c, "filesystem version is prior to reflink_p fix - upgrading"); + c->opts.version_upgrade = true; ++ c->opts.fsck = true; + } + + ret = bch2_blacklist_table_initialize(c); +-- +cgit v1.2.3 + + +From a62512d3c2988a5692dfc878fdb935ef4a9995e4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 14 Oct 2021 09:54:47 -0400 +Subject: bcachefs: Fix for leaking of reflinked extents + +When a reflink pointer points to only part of an indirect extent, and +then that indirect extent is fragmented (e.g. by copygc), if the reflink +pointer only points to one of the fragments we leak a reference. + +Fix this by storing front/back pad values in reflink pointers - when +inserting reflink pointesr, we initialize them to cover the full range +of the indirect extents we reference. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 15 ++++++++++++--- + fs/bcachefs/buckets.c | 45 +++++++++++++++++++++++++++++++++++++------ + fs/bcachefs/fsck.c | 5 +++-- + fs/bcachefs/reflink.c | 4 ++++ + 4 files changed, 58 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index ed7d12aa317c..e268125b057e 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -913,15 +913,24 @@ struct bch_stripe { + struct bch_reflink_p { + struct bch_val v; + __le64 idx; +- __le64 v2; +-}; ++ /* ++ * A reflink pointer might point to an indirect extent which is then ++ * later split (by copygc or rebalance). If we only pointed to part of ++ * the original indirect extent, and then one of the fragments is ++ * outside the range we point to, we'd leak a refcount: so when creating ++ * reflink pointers, we need to store pad values to remember the full ++ * range we were taking a reference on. 
++ */ ++ __le32 front_pad; ++ __le32 back_pad; ++} __attribute__((packed, aligned(8))); + + struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[0]; +-}; ++} __attribute__((packed, aligned(8))); + + struct bch_indirect_inline_data { + struct bch_val v; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index d5ec4d727d0e..97151ec80c52 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1176,8 +1176,10 @@ static int bch2_mark_reflink_p(struct bch_fs *c, + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; +- u64 idx = le64_to_cpu(p.v->idx); +- unsigned sectors = p.k->size; ++ u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); ++ u64 sectors = (u64) le32_to_cpu(p.v->front_pad) + ++ le32_to_cpu(p.v->back_pad) + ++ p.k->size; + s64 ret = 0; + + BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == +@@ -1753,12 +1755,33 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); +- bch2_inconsistent_error(c); + ret = -EIO; + goto err; + } + +- BUG_ON(!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)); ++ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { ++ bch2_fs_inconsistent(c, ++ "%llu:%llu len %u idx %llu indirect extent refcount underflow", ++ p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (flags & BTREE_TRIGGER_INSERT) { ++ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; ++ u64 pad; ++ ++ pad = max_t(s64, le32_to_cpu(v->front_pad), ++ le64_to_cpu(v->idx) - bkey_start_offset(k.k)); ++ BUG_ON(pad > U32_MAX); ++ v->front_pad = cpu_to_le32(pad); ++ ++ pad = max_t(s64, le32_to_cpu(v->back_pad), ++ k.k->p.offset - p.k->size - le64_to_cpu(v->idx)); ++ BUG_ON(pad > U32_MAX); ++ v->back_pad = cpu_to_le32(pad); ++ } ++ + le64_add_cpu(refcount, add); + + if (!*refcount) { +@@ -1781,10 +1804,20 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c k, unsigned flags) + { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); +- u64 idx = le64_to_cpu(p.v->idx); +- unsigned sectors = p.k->size; ++ u64 idx, sectors; + s64 ret = 0; + ++ if (flags & BTREE_TRIGGER_INSERT) { ++ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; ++ ++ v->front_pad = v->back_pad = 0; ++ } ++ ++ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); ++ sectors = (u64) le32_to_cpu(p.v->front_pad) + ++ le32_to_cpu(p.v->back_pad) + ++ p.k->size; ++ + while (sectors) { + ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags); + if (ret < 0) +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index b43c31b95dff..c99e1514fd4f 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -2174,7 +2174,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) + + p = bkey_s_c_to_reflink_p(k); + +- if (!p.v->v2) ++ if (!p.v->front_pad && !p.v->back_pad) + return 0; + + u = bch2_trans_kmalloc(trans, sizeof(*u)); +@@ -2183,7 +2183,8 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) + return ret; + + bkey_reassemble(&u->k_i, k); +- u->v.v2 = 0; ++ u->v.front_pad = 0; ++ u->v.back_pad = 0; + + return bch2_trans_update(trans, iter, &u->k_i, 0); + } +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 9bcf4216a286..2827d0ef1019 100644 +--- a/fs/bcachefs/reflink.c ++++ 
b/fs/bcachefs/reflink.c +@@ -32,6 +32,10 @@ const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) + if (bkey_val_bytes(p.k) != sizeof(*p.v)) + return "incorrect value size"; + ++ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && ++ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) ++ return "idx < front_pad"; ++ + return NULL; + } + +-- +cgit v1.2.3 + + +From acab0ba4544dc58e38b9d5362c549ef99a1580cb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 20 Oct 2021 17:59:38 -0400 +Subject: bcachefs: Fix check_path() for snapshots + +check_path() wasn't checking the snapshot ID when checking for directory +structure loops - so, snapshots would cause us to detect a loop that +wasn't actually a loop. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 64 ++++++++++++++++++++++++++++++++++++++---------------- + 1 file changed, 45 insertions(+), 19 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index c99e1514fd4f..d6f37b9e00fb 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1357,10 +1357,10 @@ static int check_dirent_target(struct btree_trans *trans, + } + + if (fsck_err_on(!backpointer_exists, c, +- "inode %llu has wrong backpointer:\n" ++ "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", +- target->bi_inum, ++ target->bi_inum, target_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, +@@ -1730,10 +1730,23 @@ struct pathbuf { + + struct pathbuf_entry { + u64 inum; ++ u32 snapshot; + } *entries; + }; + +-static int path_down(struct pathbuf *p, u64 inum) ++static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) ++{ ++ struct pathbuf_entry *i; ++ ++ for (i = p->entries; i < p->entries + p->nr; i++) ++ if (i->inum == inum && ++ i->snapshot == snapshot) ++ return true; ++ ++ return false; ++} ++ ++static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) + { + if (p->nr == p->size) { + size_t new_size = max_t(size_t, 256UL, p->size * 2); +@@ -1749,18 +1762,23 @@ static int path_down(struct pathbuf *p, u64 inum) + }; + + p->entries[p->nr++] = (struct pathbuf_entry) { +- .inum = inum, ++ .inum = inum, ++ .snapshot = snapshot, + }; + return 0; + } + ++/* ++ * Check that a given inode is reachable from the root: ++ * ++ * XXX: we should also be verifying that inodes are in the right subvolumes ++ */ + static int check_path(struct btree_trans *trans, + struct pathbuf *p, + struct bch_inode_unpacked *inode, + u32 snapshot) + { + struct bch_fs *c = trans->c; +- size_t i; + int ret = 0; + + snapshot = snapshot_t(c, snapshot)->equiv; +@@ -1768,17 +1786,19 @@ static int check_path(struct btree_trans *trans, + + while (!(inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { ++ u32 parent_snapshot = snapshot; ++ + if (inode->bi_parent_subvol) { + u64 inum; + + ret = subvol_lookup(trans, inode->bi_parent_subvol, +- &snapshot, &inum); ++ &parent_snapshot, &inum); + if (ret) + break; + } + + ret = lockrestart_do(trans, +- inode_backpointer_exists(trans, inode, snapshot)); ++ inode_backpointer_exists(trans, inode, parent_snapshot)); + if (ret < 0) + break; + +@@ -1797,17 +1817,31 @@ static int check_path(struct btree_trans *trans, + if (!S_ISDIR(inode->bi_mode)) + break; + +- ret = path_down(p, inode->bi_inum); ++ ret = path_down(p, inode->bi_inum, snapshot); + if (ret) { + bch_err(c, "memory allocation failure"); + return ret; + } + +- for (i = 0; i < p->nr; i++) { +- if (inode->bi_dir != p->entries[i].inum) +- 
continue; ++ snapshot = parent_snapshot; ++ ++ ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); ++ if (ret) { ++ /* Should have been caught in dirents pass */ ++ bch_err(c, "error looking up parent directory: %i", ret); ++ break; ++ } ++ ++ if (path_is_dup(p, inode->bi_inum, snapshot)) { ++ struct pathbuf_entry *i; + + /* XXX print path */ ++ bch_err(c, "directory structure loop"); ++ ++ for (i = p->entries; i < p->entries + p->nr; i++) ++ pr_err("%llu:%u", i->inum, i->snapshot); ++ pr_err("%llu:%u", inode->bi_inum, snapshot); ++ + if (!fsck_err(c, "directory structure loop")) + return 0; + +@@ -1819,14 +1853,6 @@ static int check_path(struct btree_trans *trans, + } + + ret = reattach_inode(trans, inode, snapshot); +- break; +- } +- +- ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); +- if (ret) { +- /* Should have been caught in dirents pass */ +- bch_err(c, "error looking up parent directory: %i", ret); +- break; + } + } + fsck_err: +-- +cgit v1.2.3 + + +From bcd1f107fd0ebc787c91a7472c450e2887e8dbbb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 20 Oct 2021 20:50:07 -0400 +Subject: bcachefs: Delete dentry when deleting snapshots + +This fixes a bug where subsequently doing creates with the same name +fails. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-ioctl.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +index 3ed53f420e7e..513f7a7a3fd4 100644 +--- a/fs/bcachefs/fs-ioctl.c ++++ b/fs/bcachefs/fs-ioctl.c +@@ -422,6 +422,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) + { + struct path path; ++ struct inode *dir; + int ret = 0; + + if (arg.flags) +@@ -438,7 +439,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, + return -EXDEV; + } + +- ret = __bch2_unlink(path.dentry->d_parent->d_inode, path.dentry, 1); ++ dir = path.dentry->d_parent->d_inode; ++ ++ ret = __bch2_unlink(dir, path.dentry, 1); ++ if (!ret) { ++ fsnotify_rmdir(dir, path.dentry); ++ d_delete(path.dentry); ++ } + path_put(&path); + + return ret; +-- +cgit v1.2.3 + + +From 1c661f66588d6d7c8bacc313265407a7db9041cc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Oct 2021 00:38:13 -0400 +Subject: bcachefs: cached data shouldn't prevent fs from mounting + +It's not an error if we don't have cached data - skip BCH_DATA_cached in +bch2_have_enough_devs(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/replicas.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index dbbbcc6dcec6..002006593044 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -1010,6 +1010,9 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; + bool metadata = e->data_type < BCH_DATA_user; + ++ if (e->data_type == BCH_DATA_cached) ++ continue; ++ + for (i = 0; i < e->nr_devs; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); + +-- +cgit v1.2.3 + + +From 2a4f63afee77ce3d31e7668d84effcec99c8be03 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Oct 2021 12:05:21 -0400 +Subject: bcachefs: Fix restart handling in for_each_btree_key() + +Code that uses for_each_btree_key often wants transaction restarts to be +handled locally and not returned. 
Originally, we wouldn't return +transaction restarts if there was a single iterator in the transaction - +the reasoning being if there weren't other iterators being invalidated, +and the current iterator was being advanced/retraversed, there weren't +any locks or iterators we were required to preserve. + +But with the btree_path conversion that approach doesn't work anymore - +even when we're using for_each_btree_key() with a single iterator there +will still be two paths in the transaction, since we now always preserve +the path at the pos the iterator was initialized at - the reason being +that on restart we often restart from the same place. + +And it turns out there's now a lot of for_each_btree_key() uses that _do +not_ want transaction restarts handled locally, and should be returning +them. + +This patch splits out for_each_btree_key_norestart() and +for_each_btree_key_continue_norestart(), and converts existing users as +appropriate. for_each_btree_key(), for_each_btree_key_continue(), and +for_each_btree_node() now handle transaction restarts themselves by +calling bch2_trans_begin() when necessary - and the old hack to not +return transaction restarts when there's a single path in the +transaction has been deleted. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 2 - + fs/bcachefs/btree_iter.c | 14 +----- + fs/bcachefs/btree_iter.h | 98 +++++++++++++++++++++++++++-------------- + fs/bcachefs/btree_update_leaf.c | 2 +- + fs/bcachefs/dirent.c | 4 +- + fs/bcachefs/extent_update.c | 4 +- + fs/bcachefs/fs-io.c | 8 ++-- + fs/bcachefs/io.c | 2 +- + fs/bcachefs/reflink.c | 4 +- + fs/bcachefs/str_hash.h | 8 ++-- + fs/bcachefs/xattr.c | 2 +- + 11 files changed, 83 insertions(+), 65 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index e268125b057e..296166fa41ff 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1037,8 +1037,6 @@ LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) + LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) + LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) + +-#define BCH_TIER_MAX 4U +- + #if 0 + LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); + LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index a54484e05e3a..746237eb3b5a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1516,19 +1516,11 @@ static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long); + int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) + { +- int ret; +- + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + +- ret = bch2_trans_cond_resched(trans) ?: ++ return bch2_trans_cond_resched(trans) ?: + btree_path_traverse_one(trans, path, flags, _RET_IP_); +- if (unlikely(ret) && hweight64(trans->paths_allocated) == 1) { +- ret = __btree_path_traverse_all(trans, ret, _RET_IP_); +- BUG_ON(ret == -EINTR); +- } +- +- return ret; + } + + static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, +@@ -1936,10 +1928,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + if (!btree_path_node(path, path->level)) + goto out; + +- ret = bch2_trans_cond_resched(trans); +- if (ret) +- goto err; +- + btree_node_unlock(path, path->level); + path->l[path->level].b = 
BTREE_ITER_NO_NODE_UP; + path->level++; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 72aff955493b..eaf432aa47d7 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -258,11 +258,39 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) + } + } + ++void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); ++void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, ++ unsigned, struct bpos, unsigned); ++void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, ++ enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned); ++void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); ++ ++static inline void set_btree_iter_dontneed(struct btree_iter *iter) ++{ ++ iter->path->preserve = false; ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *, size_t); ++void bch2_trans_begin(struct btree_trans *); ++ ++static inline struct btree * ++__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ struct btree *b; ++ ++ while (b = bch2_btree_iter_peek_node(iter), ++ PTR_ERR_OR_ZERO(b) == -EINTR) ++ bch2_trans_begin(trans); ++ ++ return b; ++} ++ + #define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _ret) \ + for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ +- _start, _locks_want, _depth, _flags), \ +- _b = bch2_btree_iter_peek_node(&(_iter)); \ ++ _start, _locks_want, _depth, _flags); \ ++ (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\ + !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ + (_b) = bch2_btree_iter_next_node(&(_iter))) + +@@ -271,6 +299,11 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b, _ret) + ++static inline int bkey_err(struct bkey_s_c k) ++{ ++ return PTR_ERR_OR_ZERO(k.k); ++} ++ + static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, + unsigned flags) + { +@@ -279,51 +312,50 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, + : bch2_btree_iter_peek(iter); + } + +-static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, +- unsigned flags) ++static inline struct bkey_s_c ++__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned flags) + { +- return flags & BTREE_ITER_SLOTS +- ? 
bch2_btree_iter_next_slot(iter) +- : bch2_btree_iter_next(iter); +-} ++ struct bkey_s_c k; + +-static inline int bkey_err(struct bkey_s_c k) +-{ +- return PTR_ERR_OR_ZERO(k.k); ++ while (k = __bch2_btree_iter_peek(iter, flags), ++ bkey_err(k) == -EINTR) ++ bch2_trans_begin(trans); ++ ++ return k; + } + + #define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ +- (_start), (_flags)), \ +- (_k) = __bch2_btree_iter_peek(&(_iter), _flags); \ ++ (_start), (_flags)); \ ++ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ +- (_k) = __bch2_btree_iter_next(&(_iter), _flags)) ++ bch2_btree_iter_advance(&(_iter))) + +-#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ +- for ((_k) = __bch2_btree_iter_peek(&(_iter), _flags); \ ++#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _ret) \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ +- (_k) = __bch2_btree_iter_next(&(_iter), _flags)) ++ bch2_btree_iter_advance(&(_iter))) + +-/* new multiple iterator interface: */ +- +-void bch2_dump_trans_paths_updates(struct btree_trans *); ++#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ ++ for (; \ ++ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) + +-void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); +-void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, +- unsigned, struct bpos, unsigned); +-void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, +- enum btree_id, struct bpos, +- unsigned, unsigned, unsigned); +-void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); ++#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ ++ for (; \ ++ (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) + +-static inline void set_btree_iter_dontneed(struct btree_iter *iter) +-{ +- iter->path->preserve = false; +-} ++/* new multiple iterator interface: */ + +-void *bch2_trans_kmalloc(struct btree_trans *, size_t); +-void bch2_trans_begin(struct btree_trans *); ++void bch2_dump_trans_paths_updates(struct btree_trans *); + void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); + void bch2_trans_exit(struct btree_trans *); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f69f919d83ac..762a97739d80 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1210,7 +1210,7 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, + + pos.snapshot++; + +- for_each_btree_key(trans, iter, btree_id, pos, ++ for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (bkey_cmp(k.k->p, pos)) + break; +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 26df20ad090c..00dac68701f5 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -432,7 +432,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) + if (ret) + return ret; + +- for_each_btree_key(trans, iter, BTREE_ID_dirents, ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents, 
+ SPOS(dir.inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode > dir.inum) + break; +@@ -464,7 +464,7 @@ retry: + if (ret) + goto err; + +- for_each_btree_key(&trans, iter, BTREE_ID_dirents, ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents, + SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { + if (k.k->p.inode > inum.inum) + break; +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 9d959b053def..58b2c96f450c 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -61,7 +61,7 @@ static int count_iters_for_insert(struct btree_trans *trans, + struct btree_iter iter; + struct bkey_s_c r_k; + +- for_each_btree_key(trans, iter, ++ for_each_btree_key_norestart(trans, iter, + BTREE_ID_reflink, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret2) { + if (bkey_cmp(bkey_start_pos(r_k.k), +@@ -120,7 +120,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, + + bch2_trans_copy_iter(©, iter); + +- for_each_btree_key_continue(copy, 0, k, ret) { ++ for_each_btree_key_continue_norestart(copy, 0, k, ret) { + unsigned offset = 0; + + if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 0deb38949844..83b18b881e6e 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1846,7 +1846,7 @@ retry: + if (err) + goto err; + +- for_each_btree_key(&trans, iter, BTREE_ID_extents, ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) +@@ -2221,7 +2221,7 @@ retry: + if (ret) + goto err; + +- for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + +@@ -3126,7 +3126,7 @@ retry: + if (ret) + goto err; + +- for_each_btree_key(&trans, iter, BTREE_ID_extents, ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { + break; +@@ -3233,7 +3233,7 @@ retry: + if (ret) + goto err; + +- for_each_btree_key(&trans, iter, BTREE_ID_extents, ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), + BTREE_ITER_SLOTS, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 708ba5590182..c4c28559a49c 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -206,7 +206,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + + bch2_trans_copy_iter(&iter, extent_iter); + +- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { ++ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) { + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k)); +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 2827d0ef1019..8e66e6390e62 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -131,7 +131,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + if (orig->k.type == KEY_TYPE_inline_data) + bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); + +- for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink, ++ for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink, + POS(0, c->reflink_hint), + BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { + if 
(reflink_iter.pos.inode) { +@@ -194,7 +194,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) + struct bkey_s_c k; + int ret; + +- for_each_btree_key_continue(*iter, 0, k, ret) { ++ for_each_btree_key_continue_norestart(*iter, 0, k, ret) { + if (bkey_cmp(iter->pos, end) >= 0) + break; + +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 6486e709b700..3e54d0b0fb5c 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -156,7 +156,7 @@ bch2_hash_lookup(struct btree_trans *trans, + if (ret) + return ret; + +- for_each_btree_key(trans, *iter, desc.btree_id, ++ for_each_btree_key_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), + BTREE_ITER_SLOTS|flags, k, ret) { + if (iter->pos.inode != inum.inum) +@@ -192,7 +192,7 @@ bch2_hash_hole(struct btree_trans *trans, + if (ret) + return ret; + +- for_each_btree_key(trans, *iter, desc.btree_id, ++ for_each_btree_key_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (iter->pos.inode != inum.inum) +@@ -220,7 +220,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, + + bch2_btree_iter_advance(&iter); + +- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { ++ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { + if (k.k->type != desc.key_type && + k.k->type != KEY_TYPE_hash_whiteout) + break; +@@ -253,7 +253,7 @@ int bch2_hash_set(struct btree_trans *trans, + if (ret) + return ret; + +- for_each_btree_key(trans, iter, desc.btree_id, ++ for_each_btree_key_norestart(trans, iter, desc.btree_id, + SPOS(inum.inum, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index fe572b2375eb..bb5da310e4d6 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -295,7 +295,7 @@ retry: + if (ret) + goto err; + +- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs, + SPOS(inum, offset, snapshot), 0, k, ret) { + BUG_ON(k.k->p.inode < inum); + +-- +cgit v1.2.3 + + +From 4f6b2cf03fcc7f554152126c06ba259497ef4eac Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 12 Oct 2021 12:06:02 -0400 +Subject: bcachefs: Subvol dirents are now only visible in parent subvol + +This changes the on disk format for dirents that point to subvols so +that they also record the subvolid of the parent subvol, so that we can +filter them out in other subvolumes. + +This also updates the dirent code to do that filtering, and in +particular tweaks the rename code - we need to ensure that there's only +ever one dirent (counting multiplicities in different snapshots) that +point to a subvolume. 
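
The filtering described above reduces to a per-key visibility test: a DT_SUBVOL dirent is only visible when the lookup comes from the parent subvolume recorded in it. Roughly (a sketch mirroring the dirent_is_visible() hook added below; the standalone helper name here is hypothetical):

	static bool subvol_dirent_visible(subvol_inum dir, struct bkey_s_c_dirent d)
	{
		/* Ordinary dirents are visible from any subvolume: */
		if (d.v->d_type != DT_SUBVOL)
			return true;

		/* Subvolume dirents only from the parent subvol they record: */
		return le32_to_cpu(d.v->d_parent_subvol) == dir.subvol;
	}

The str_hash lookup path then skips keys that fail this test (see the new is_visible_key() in the str_hash.h hunk), which is what keeps a subvolume's dirent from appearing inside other subvolumes.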
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 9 ++- + fs/bcachefs/dirent.c | 180 ++++++++++++++++++++++++++---------------- + fs/bcachefs/dirent.h | 3 - + fs/bcachefs/fsck.c | 152 +++++++++++++++++++++++------------ + fs/bcachefs/recovery.c | 4 +- + fs/bcachefs/str_hash.h | 13 ++- + 6 files changed, 232 insertions(+), 129 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 296166fa41ff..0a78d0f1d0c3 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -789,7 +789,13 @@ struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ ++ union { + __le64 d_inum; ++ struct { /* DT_SUBVOL */ ++ __le32 d_child_subvol; ++ __le32 d_parent_subvol; ++ }; ++ }; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get +@@ -1264,7 +1270,8 @@ enum bcachefs_metadata_version { + bcachefs_metadata_version_btree_ptr_sectors_written = 14, + bcachefs_metadata_version_snapshot_2 = 15, + bcachefs_metadata_version_reflink_p_fix = 16, +- bcachefs_metadata_version_max = 17, ++ bcachefs_metadata_version_subvol_dirent = 17, ++ bcachefs_metadata_version_max = 18, + }; + + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 00dac68701f5..2ab9cbaf71f2 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -64,6 +64,15 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) + return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); + } + ++static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ ++ if (d.v->d_type == DT_SUBVOL) ++ return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; ++ return true; ++} ++ + const struct bch_hash_desc bch2_dirent_hash_desc = { + .btree_id = BTREE_ID_dirents, + .key_type = KEY_TYPE_dirent, +@@ -71,6 +80,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { + .hash_bkey = dirent_hash_bkey, + .cmp_key = dirent_cmp_key, + .cmp_bkey = dirent_cmp_bkey, ++ .is_visible = dirent_is_visible, + }; + + const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) +@@ -114,14 +124,18 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + + bch_scnmemcpy(out, d.v->d_name, + bch2_dirent_name_bytes(d)); +- pr_buf(out, " -> %llu type %s", d.v->d_inum, ++ pr_buf(out, " -> %llu type %s", ++ d.v->d_type != DT_SUBVOL ++ ? le64_to_cpu(d.v->d_inum) ++ : le32_to_cpu(d.v->d_child_subvol), + d.v->d_type < BCH_DT_MAX + ? 
bch2_d_types[d.v->d_type] + : "(bad d_type)"); + } + + static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, +- u8 type, const struct qstr *name, u64 dst) ++ subvol_inum dir, u8 type, ++ const struct qstr *name, u64 dst) + { + struct bkey_i_dirent *dirent; + unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); +@@ -137,7 +151,14 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + + bkey_dirent_init(&dirent->k_i); + dirent->k.u64s = u64s; +- dirent->v.d_inum = cpu_to_le64(dst); ++ ++ if (type != DT_SUBVOL) { ++ dirent->v.d_inum = cpu_to_le64(dst); ++ } else { ++ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); ++ dirent->v.d_child_subvol = cpu_to_le32(dst); ++ } ++ + dirent->v.d_type = type; + + memcpy(dirent->v.d_name, name->name, name->len); +@@ -159,7 +180,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, + struct bkey_i_dirent *dirent; + int ret; + +- dirent = dirent_create_key(trans, type, name, dst_inum); ++ dirent = dirent_create_key(trans, dir, type, name, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; +@@ -178,45 +199,30 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, + dst->v.d_type = src.v->d_type; + } + +-int __bch2_dirent_read_target(struct btree_trans *trans, +- struct bkey_s_c_dirent d, +- u32 *subvol, u32 *snapshot, u64 *inum, +- bool is_fsck) ++static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, ++ struct bkey_s_c_dirent d, subvol_inum *target) + { + struct bch_subvolume s; + int ret = 0; + +- *subvol = 0; +- *snapshot = d.k->p.snapshot; ++ if (d.v->d_type == DT_SUBVOL && ++ d.v->d_parent_subvol != dir.subvol) ++ return 1; + + if (likely(d.v->d_type != DT_SUBVOL)) { +- *inum = le64_to_cpu(d.v->d_inum); ++ target->subvol = dir.subvol; ++ target->inum = le64_to_cpu(d.v->d_inum); + } else { +- *subvol = le64_to_cpu(d.v->d_inum); ++ target->subvol = le32_to_cpu(d.v->d_child_subvol); + +- ret = bch2_subvolume_get(trans, *subvol, !is_fsck, BTREE_ITER_CACHED, &s); ++ ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); + +- *snapshot = le32_to_cpu(s.snapshot); +- *inum = le64_to_cpu(s.inode); ++ target->inum = le64_to_cpu(s.inode); + } + + return ret; + } + +-static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, +- struct bkey_s_c_dirent d, subvol_inum *target) +-{ +- u32 snapshot; +- int ret = 0; +- +- ret = __bch2_dirent_read_target(trans, d, &target->subvol, &snapshot, +- &target->inum, false); +- if (!target->subvol) +- target->subvol = dir.subvol; +- +- return ret; +-} +- + int bch2_dirent_rename(struct btree_trans *trans, + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, +@@ -230,6 +236,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); ++ unsigned src_type = 0, dst_type = 0, src_update_flags = 0; + int ret = 0; + + if (src_dir.subvol != dst_dir.subvol) +@@ -238,36 +245,6 @@ int bch2_dirent_rename(struct btree_trans *trans, + memset(src_inum, 0, sizeof(*src_inum)); + memset(dst_inum, 0, sizeof(*dst_inum)); + +- /* +- * Lookup dst: +- * +- * Note that in BCH_RENAME mode, we're _not_ checking if +- * the target already exists - we're relying on the VFS +- * to do that check for us for correctness: +- */ +- ret = mode == BCH_RENAME +- ? 
bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, +- dst_hash, dst_dir, dst_name) +- : bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, +- dst_hash, dst_dir, dst_name, +- BTREE_ITER_INTENT); +- if (ret) +- goto out; +- +- old_dst = bch2_btree_iter_peek_slot(&dst_iter); +- ret = bkey_err(old_dst); +- if (ret) +- goto out; +- +- if (mode != BCH_RENAME) { +- ret = bch2_dirent_read_target(trans, dst_dir, +- bkey_s_c_to_dirent(old_dst), dst_inum); +- if (ret) +- goto out; +- } +- if (mode != BCH_RENAME_EXCHANGE) +- *src_offset = dst_iter.pos.offset; +- + /* Lookup src: */ + ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, +@@ -285,8 +262,51 @@ int bch2_dirent_rename(struct btree_trans *trans, + if (ret) + goto out; + ++ src_type = bkey_s_c_to_dirent(old_src).v->d_type; ++ ++ if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) ++ return -EOPNOTSUPP; ++ ++ ++ /* Lookup dst: */ ++ if (mode == BCH_RENAME) { ++ /* ++ * Note that we're _not_ checking if the target already exists - ++ * we're relying on the VFS to do that check for us for ++ * correctness: ++ */ ++ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name); ++ if (ret) ++ goto out; ++ } else { ++ ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto out; ++ ++ old_dst = bch2_btree_iter_peek_slot(&dst_iter); ++ ret = bkey_err(old_dst); ++ if (ret) ++ goto out; ++ ++ ret = bch2_dirent_read_target(trans, dst_dir, ++ bkey_s_c_to_dirent(old_dst), dst_inum); ++ if (ret) ++ goto out; ++ ++ dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; ++ ++ if (dst_type == DT_SUBVOL) ++ return -EOPNOTSUPP; ++ } ++ ++ if (mode != BCH_RENAME_EXCHANGE) ++ *src_offset = dst_iter.pos.offset; ++ + /* Create new dst key: */ +- new_dst = dirent_create_key(trans, 0, dst_name, 0); ++ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); + ret = PTR_ERR_OR_ZERO(new_dst); + if (ret) + goto out; +@@ -296,7 +316,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + + /* Create new src key: */ + if (mode == BCH_RENAME_EXCHANGE) { +- new_src = dirent_create_key(trans, 0, src_name, 0); ++ new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); + ret = PTR_ERR_OR_ZERO(new_src); + if (ret) + goto out; +@@ -326,10 +346,9 @@ int bch2_dirent_rename(struct btree_trans *trans, + * If we're not overwriting, we can just insert + * new_dst at the src position: + */ +- new_dst->k.p = src_iter.pos; +- bch2_trans_update(trans, &src_iter, +- &new_dst->k_i, 0); +- goto out_set_offset; ++ new_src = new_dst; ++ new_src->k.p = src_iter.pos; ++ goto out_set_src; + } else { + /* If we're overwriting, we can't insert new_dst + * at a different slot because it has to +@@ -350,9 +369,25 @@ int bch2_dirent_rename(struct btree_trans *trans, + } + } + +- bch2_trans_update(trans, &src_iter, &new_src->k_i, 0); + bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); +-out_set_offset: ++out_set_src: ++ ++ /* ++ * If we're deleting a subvolume, we need to really delete the dirent, ++ * not just emit a whiteout in the current snapshot: ++ */ ++ if (src_type == DT_SUBVOL) { ++ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); ++ ret = bch2_btree_iter_traverse(&src_iter); ++ if (ret) ++ goto out; ++ ++ new_src->k.p = src_iter.pos; ++ src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; ++ } ++ ++ bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); ++ + if (mode == 
BCH_RENAME_EXCHANGE) + *src_offset = new_src->k.p.offset; + *dst_offset = new_dst->k.p.offset; +@@ -393,6 +428,8 @@ int __bch2_dirent_lookup_trans(struct btree_trans *trans, + d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, inum); ++ if (ret > 0) ++ ret = -ENOENT; + if (ret) + bch2_trans_iter_exit(trans, iter); + +@@ -453,6 +490,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; ++ subvol_inum target; + u32 snapshot; + int ret; + +@@ -474,6 +512,12 @@ retry: + + dirent = bkey_s_c_to_dirent(k); + ++ ret = bch2_dirent_read_target(&trans, inum, dirent, &target); ++ if (ret < 0) ++ break; ++ if (ret) ++ continue; ++ + /* + * XXX: dir_emit() can fault and block, while we're holding + * locks +@@ -481,7 +525,7 @@ retry: + ctx->pos = dirent.k->p.offset; + if (!dir_emit(ctx, dirent.v->d_name, + bch2_dirent_name_bytes(dirent), +- le64_to_cpu(dirent.v->d_inum), ++ target.inum, + vfs_d_type(dirent.v->d_type))) + break; + ctx->pos = dirent.k->p.offset + 1; +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index e7f65fbd8e65..8ae407765fe4 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -33,9 +33,6 @@ int bch2_dirent_create(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, u8, + const struct qstr *, u64, u64 *, int); + +-int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, +- u32 *, u32 *, u64 *, bool); +- + static inline unsigned vfs_d_type(unsigned type) + { + return type == DT_SUBVOL ? DT_DIR : type; +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index d6f37b9e00fb..58d42734c252 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -134,10 +134,11 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + if (ret) + goto err; + +- *snapshot = iter.pos.snapshot; + ret = k.k->type == KEY_TYPE_inode + ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + : -ENOENT; ++ if (!ret) ++ *snapshot = iter.pos.snapshot; + err: + bch2_trans_iter_exit(trans, &iter); + return ret; +@@ -1045,46 +1046,60 @@ static int fix_overlapping_extent(struct btree_trans *trans, + } + #endif + ++static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos pos) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (!ret && k.k->type != KEY_TYPE_dirent) ++ ret = -ENOENT; ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }; ++ } ++ ++ return bkey_s_c_to_dirent(k); ++} ++ ++static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, ++ struct bkey_s_c_dirent d) ++{ ++ return inode->bi_dir == d.k->p.inode && ++ inode->bi_dir_offset == d.k->p.offset; ++} ++ ++static bool dirent_points_to_inode(struct bkey_s_c_dirent d, ++ struct bch_inode_unpacked *inode) ++{ ++ return d.v->d_type == DT_SUBVOL ++ ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol ++ : le64_to_cpu(d.v->d_inum) == inode->bi_inum; ++} ++ + static int inode_backpointer_exists(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) + { + struct btree_iter iter; +- struct bkey_s_c k; +- u32 target_subvol, target_snapshot; +- u64 target_inum; ++ struct bkey_s_c_dirent d; + int ret; + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, +- SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot), 0); +- k = bch2_btree_iter_peek_slot(&iter); +- ret = bkey_err(k); ++ d = dirent_get_by_pos(trans, &iter, ++ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); ++ ret = bkey_err(d.s_c); + if (ret) +- goto out; +- if (k.k->type != KEY_TYPE_dirent) +- goto out; +- +- ret = __bch2_dirent_read_target(trans, bkey_s_c_to_dirent(k), +- &target_subvol, +- &target_snapshot, +- &target_inum, +- true); +- if (ret) +- goto out; ++ return ret; + +- ret = target_inum == inode->bi_inum; +-out: ++ ret = dirent_points_to_inode(d, inode); + bch2_trans_iter_exit(trans, &iter); + return ret; + } + +-static bool inode_backpointer_matches(struct bkey_s_c_dirent d, +- struct bch_inode_unpacked *inode) +-{ +- return d.k->p.inode == inode->bi_dir && +- d.k->p.offset == inode->bi_dir_offset; +-} +- + static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) + { + struct bch_fs *c = trans->c; +@@ -1326,7 +1341,7 @@ static int check_dirent_target(struct btree_trans *trans, + goto err; + } + +- if (!inode_backpointer_matches(d, target)) { ++ if (!inode_points_to_dirent(target, d)) { + ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); + if (ret < 0) + goto err; +@@ -1394,8 +1409,34 @@ static int check_dirent_target(struct btree_trans *trans, + BTREE_INSERT_LAZY_RW, + bch2_trans_update(trans, iter, &n->k_i, 0)); + kfree(n); +- if (ret) ++ ++ return ret ?: -EINTR; ++ } ++ ++ if (d.v->d_type == DT_SUBVOL && ++ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && ++ (c->sb.version < bcachefs_metadata_version_subvol_dirent || ++ fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", ++ le32_to_cpu(d.v->d_parent_subvol), ++ target->bi_parent_subvol))) { ++ struct bkey_i_dirent *n; ++ ++ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; + goto err; ++ } ++ ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); ++ ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_trans_update(trans, iter, &n->k_i, 0)); ++ kfree(n); ++ ++ return ret ?: -EINTR; + } + err: + fsck_err: +@@ -1412,9 +1453,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct inode_walker_entry *i; +- u32 target_snapshot; +- u32 target_subvol; +- u64 target_inum; + char buf[200]; + int ret; + +@@ -1482,21 +1520,21 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + + d = bkey_s_c_to_dirent(k); + +- ret = __bch2_dirent_read_target(trans, d, +- &target_subvol, +- &target_snapshot, +- &target_inum, +- true); +- if (ret && ret != -ENOENT) +- return ret; ++ if (d.v->d_type == DT_SUBVOL) { ++ struct bch_inode_unpacked subvol_root; ++ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); ++ u32 target_snapshot; ++ u64 target_inum; + +- if (fsck_err_on(ret, c, +- "dirent points to missing subvolume %llu", +- le64_to_cpu(d.v->d_inum))) +- return remove_dirent(trans, d.k->p); ++ ret = 
__subvol_lookup(trans, target_subvol, ++ &target_snapshot, &target_inum); ++ if (ret && ret != -ENOENT) ++ return ret; + +- if (target_subvol) { +- struct bch_inode_unpacked subvol_root; ++ if (fsck_err_on(ret, c, ++ "dirent points to missing subvolume %llu", ++ le64_to_cpu(d.v->d_child_subvol))) ++ return remove_dirent(trans, d.k->p); + + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); +@@ -1526,7 +1564,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + if (ret) + return ret; + } else { +- ret = __get_visible_inodes(trans, target, s, target_inum); ++ ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + if (ret) + return ret; + +@@ -1786,9 +1824,11 @@ static int check_path(struct btree_trans *trans, + + while (!(inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { ++ struct btree_iter dirent_iter; ++ struct bkey_s_c_dirent d; + u32 parent_snapshot = snapshot; + +- if (inode->bi_parent_subvol) { ++ if (inode->bi_subvol) { + u64 inum; + + ret = subvol_lookup(trans, inode->bi_parent_subvol, +@@ -1798,11 +1838,18 @@ static int check_path(struct btree_trans *trans, + } + + ret = lockrestart_do(trans, +- inode_backpointer_exists(trans, inode, parent_snapshot)); +- if (ret < 0) ++ PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, ++ SPOS(inode->bi_dir, inode->bi_dir_offset, ++ parent_snapshot))).k)); ++ if (ret && ret != -ENOENT) + break; + +- if (!ret) { ++ if (!ret && !dirent_points_to_inode(d, inode)) { ++ bch2_trans_iter_exit(trans, &dirent_iter); ++ ret = -ENOENT; ++ } ++ ++ if (ret == -ENOENT) { + if (fsck_err(c, "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu", + inode->bi_inum, snapshot, + mode_to_type(inode->bi_mode), +@@ -1812,7 +1859,8 @@ static int check_path(struct btree_trans *trans, + ret = reattach_inode(trans, inode, snapshot); + break; + } +- ret = 0; ++ ++ bch2_trans_iter_exit(trans, &dirent_iter); + + if (!S_ISDIR(inode->bi_mode)) + break; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 8c53b1e977d1..6bf9c48a7871 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1086,8 +1086,8 @@ int bch2_fs_recovery(struct bch_fs *c) + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; +- } else if (c->sb.version < bcachefs_metadata_version_reflink_p_fix) { +- bch_info(c, "filesystem version is prior to reflink_p fix - upgrading"); ++ } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) { ++ bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); + c->opts.version_upgrade = true; + c->opts.fsck = true; + } +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 3e54d0b0fb5c..789dde7c6ac6 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -138,8 +138,15 @@ struct bch_hash_desc { + u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); + bool (*cmp_key)(struct bkey_s_c, const void *); + bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); ++ bool (*is_visible)(subvol_inum inum, struct bkey_s_c); + }; + ++static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) ++{ ++ return k.k->type == desc.key_type && ++ (!desc.is_visible || desc.is_visible(inum, k)); ++} ++ + static __always_inline int + bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, +@@ -162,7 +169,7 @@ bch2_hash_lookup(struct btree_trans *trans, + if (iter->pos.inode != inum.inum) + 
break; + +- if (k.k->type == desc.key_type) { ++ if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_key(k, key)) + return 0; + } else if (k.k->type == KEY_TYPE_hash_whiteout) { +@@ -198,7 +205,7 @@ bch2_hash_hole(struct btree_trans *trans, + if (iter->pos.inode != inum.inum) + break; + +- if (k.k->type != desc.key_type) ++ if (!is_visible_key(desc, inum, k)) + return 0; + } + bch2_trans_iter_exit(trans, iter); +@@ -261,7 +268,7 @@ int bch2_hash_set(struct btree_trans *trans, + if (iter.pos.inode != inum.inum) + break; + +- if (k.k->type == desc.key_type) { ++ if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) + goto found; + +-- +cgit v1.2.3 + + +From b15dc42e6d173a244aae5e7f1a6c35e1a8b2f10b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Oct 2021 14:33:31 -0400 +Subject: bcachefs: Fix error handling in bch2_trans_extent_merging + +The back merging case wasn't returning errors correctly. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 762a97739d80..004f0ac2126f 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1160,13 +1160,13 @@ next: + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { + ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); + if (ret < 0) +- goto out; ++ goto err; + if (ret) + goto nomerge2; + + ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); + if (ret < 0) +- goto out; ++ goto err; + if (ret) + goto nomerge2; + +-- +cgit v1.2.3 + + +From e22ac03b29507f6638129cc67a2371c0d685e43d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 22 Oct 2021 17:33:38 -0400 +Subject: bcachefs: Fix a transaction path overflow + +readdir() in a directory with many subvolumes could overflow transaction +paths - this is a simple hack around the issue. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 2ab9cbaf71f2..6be3ec4ec4a6 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -529,6 +529,15 @@ retry: + vfs_d_type(dirent.v->d_type))) + break; + ctx->pos = dirent.k->p.offset + 1; ++ ++ /* ++ * read_target looks up subvolumes, we can overflow paths if the ++ * directory has many subvolumes in it ++ */ ++ if (hweight64(trans.paths_allocated) > BTREE_ITER_MAX / 2) { ++ ret = -EINTR; ++ break; ++ } + } + bch2_trans_iter_exit(&trans, &iter); + err: +-- +cgit v1.2.3 + + +From 354f3dc148bc8fff51d66493a762c6def9b62812 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 24 Oct 2021 11:57:47 -0400 +Subject: bcachefs: Fix dev accounting after device add + +This is a hacky but effective fix to device usage stats for superblock +and journal being wrong on a newly added device (following the comment +that already told us how it needed to be done!) 
+ +Reported-by: Chris Webb +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index bb633e3df618..486a019900e3 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1585,6 +1585,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + struct bch_dev *ca = NULL; + struct bch_sb_field_members *mi; + struct bch_member dev_mi; ++ struct bucket_array *buckets; ++ struct bucket *g; + unsigned dev_idx, nr_devices, u64s; + int ret; + +@@ -1688,6 +1690,16 @@ have_slot: + + bch2_dev_usage_journal_reserve(c); + ++ /* ++ * Clear marks before marking transactionally in the btree, so that ++ * per-device accounting gets done correctly: ++ */ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ for_each_bucket(g, buckets) ++ atomic64_set(&g->_mark.v, 0); ++ up_read(&ca->bucket_lock); ++ + err = "error marking superblock"; + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) +-- +cgit v1.2.3 + + +From 2adaf67ebcb8272fa398f7a79513bf01ceed801b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 24 Oct 2021 16:40:05 -0400 +Subject: bcachefs: Must check for errors from bch2_trans_cond_resched() + +But we don't need to call it from outside the btree iterator code +anymore, since it's called by bch2_trans_begin() and +bch2_btree_path_traverse(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 -- + fs/bcachefs/btree_gc.c | 2 -- + fs/bcachefs/btree_iter.c | 15 +++++++++++++++ + fs/bcachefs/btree_iter.h | 15 --------------- + fs/bcachefs/btree_update_leaf.c | 2 -- + fs/bcachefs/fsck.c | 2 -- + fs/bcachefs/move.c | 2 -- + 7 files changed, 15 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index eb74b96124c5..fe899c5c64d9 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -367,8 +367,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) + POS(ca->dev_idx, ca->mi.first_bucket)); + + while (iter.pos.offset < ca->mi.nbuckets) { +- bch2_trans_cond_resched(&trans); +- + ret = bch2_alloc_write_key(&trans, &iter, flags); + if (ret) { + percpu_ref_put(&ca->ref); +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index dcbde49f07c4..b7a33c084aea 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -828,8 +828,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + } +- +- bch2_trans_cond_resched(&trans); + } + bch2_trans_iter_exit(&trans, &iter); + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 746237eb3b5a..44dd22d0d9d9 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -27,6 +27,21 @@ static inline void btree_path_list_add(struct btree_trans *, struct btree_path * + + static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); + ++/* ++ * Unlocks before scheduling ++ * Note: does not revalidate iterator ++ */ ++static inline int bch2_trans_cond_resched(struct btree_trans *trans) ++{ ++ if (need_resched() || race_fault()) { ++ bch2_trans_unlock(trans); ++ schedule(); ++ return bch2_trans_relock(trans) ? 
0 : -EINTR; ++ } else { ++ return 0; ++ } ++} ++ + static inline int __btree_path_cmp(const struct btree_path *l, + enum btree_id r_btree_id, + bool r_cached, +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index eaf432aa47d7..876bf42c4248 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -243,21 +243,6 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna + bch2_btree_iter_set_pos(iter, pos); + } + +-/* +- * Unlocks before scheduling +- * Note: does not revalidate iterator +- */ +-static inline int bch2_trans_cond_resched(struct btree_trans *trans) +-{ +- if (need_resched() || race_fault()) { +- bch2_trans_unlock(trans); +- schedule(); +- return bch2_trans_relock(trans) ? 0 : -EINTR; +- } else { +- return 0; +- } +-} +- + void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); + void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, + unsigned, struct bpos, unsigned); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 004f0ac2126f..c405466733e2 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1405,8 +1405,6 @@ retry: + BTREE_INSERT_NOFAIL); + if (ret) + break; +- +- bch2_trans_cond_resched(trans); + } + + if (ret == -EINTR) { +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 58d42734c252..197b9079e2b8 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -2116,8 +2116,6 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links + d.k->p.snapshot); + break; + } +- +- bch2_trans_cond_resched(&trans); + } + bch2_trans_iter_exit(&trans, &iter); + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 20396820bbb3..c6d6dd39900a 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -768,7 +768,6 @@ next: + &stats->sectors_seen); + next_nondata: + bch2_btree_iter_advance(&iter); +- bch2_trans_cond_resched(&trans); + } + out: + +@@ -914,7 +913,6 @@ retry: + ret = bch2_btree_node_rewrite(&trans, &iter, + b->data->keys.seq, 0) ?: ret; + next: +- bch2_trans_cond_resched(&trans); + bch2_btree_iter_next_node(&iter); + } + if (ret == -EINTR) +-- +cgit v1.2.3 + + +From 7fc041535a339c26d65b7e50290731456b400e73 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 24 Oct 2021 16:55:17 -0400 +Subject: bcachefs: Fix bch2_btree_iter_next_node() + +We were modifying state, then return -EINTR, causing us to skip nodes - +ouch. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 50 ++++++++++++++++++++++++++++++++---------------- + 1 file changed, 34 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 44dd22d0d9d9..b8d92988d57d 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1467,6 +1467,11 @@ static int btree_path_traverse_one(struct btree_trans *trans, + unsigned depth_want = path->level; + int ret = 0; + ++ if (unlikely(trans->restarted)) { ++ ret = -EINTR; ++ goto out; ++ } ++ + /* + * Ensure we obey path->should_be_locked: if it's set, we can't unlock + * and re-traverse the path without a transaction restart: +@@ -1934,30 +1939,41 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + struct btree_trans *trans = iter->trans; + struct btree_path *path = iter->path; + struct btree *b = NULL; ++ unsigned l; + int ret; + ++ BUG_ON(trans->restarted); + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + +- /* already got to end? 
*/ ++ /* already at end? */ + if (!btree_path_node(path, path->level)) +- goto out; ++ return NULL; + +- btree_node_unlock(path, path->level); +- path->l[path->level].b = BTREE_ITER_NO_NODE_UP; +- path->level++; ++ /* got to end? */ ++ if (!btree_path_node(path, path->level + 1)) { ++ btree_node_unlock(path, path->level); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_UP; ++ path->level++; ++ return NULL; ++ } + +- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); +- ret = bch2_btree_path_traverse(trans, path, iter->flags); +- if (ret) ++ if (!bch2_btree_node_relock(trans, path, path->level + 1)) { ++ __bch2_btree_path_unlock(path); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ btree_trans_restart(trans); ++ ret = -EINTR; + goto err; ++ } + +- /* got to end? */ +- b = btree_path_node(path, path->level); +- if (!b) +- goto out; ++ b = btree_path_node(path, path->level + 1); + +- if (bpos_cmp(iter->pos, b->key.k.p) < 0) { ++ if (!bpos_cmp(iter->pos, b->key.k.p)) { ++ btree_node_unlock(path, path->level); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_UP; ++ path->level++; ++ } else { + /* + * Haven't gotten to the end of the parent node: go back down to + * the next child node +@@ -1966,10 +1982,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + btree_path_set_pos(trans, path, bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT); + +- /* Unlock to avoid screwing up our lock invariants: */ +- btree_node_unlock(path, path->level); +- + path->level = iter->min_depth; ++ ++ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) ++ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(path, l); ++ + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_iter_verify(iter); + +-- +cgit v1.2.3 + + +From 4ec99f580f9e1a90dd8459c384f0fd40000adbbd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 24 Oct 2021 16:59:33 -0400 +Subject: bcachefs: bch2_btree_node_rewrite() now returns transaction restarts + +We have been getting away from handling transaction restarts locally - +convert bch2_btree_node_rewrite() to the newer style. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 6 ++--- + fs/bcachefs/btree_update.h | 2 +- + fs/bcachefs/btree_update_interior.c | 53 ++++++++++++++++++++----------------- + fs/bcachefs/move.c | 7 +++-- + 4 files changed, 37 insertions(+), 31 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index b7a33c084aea..54b3d0d97a3b 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -817,15 +817,13 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + + if (!initial) { + if (max_stale > 64) +- bch2_btree_node_rewrite(&trans, &iter, +- b->data->keys.seq, ++ bch2_btree_node_rewrite(&trans, &iter, b, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + else if (!bch2_btree_gc_rewrite_disabled && + (bch2_btree_gc_always_rewrite || max_stale > 16)) + bch2_btree_node_rewrite(&trans, &iter, +- b->data->keys.seq, +- BTREE_INSERT_NOWAIT| ++ b, BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + } + } +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 155643da35be..0268dd74f0ab 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -66,7 +66,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + struct bpos, struct bpos, u64 *); + + int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, +- __le64, unsigned); ++ struct btree *, unsigned); + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); + int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + struct btree *, struct bkey_i *, bool); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 591a2fedb89d..61c7757bd3ca 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1722,26 +1722,15 @@ err: + */ + int bch2_btree_node_rewrite(struct btree_trans *trans, + struct btree_iter *iter, +- __le64 seq, unsigned flags) ++ struct btree *b, ++ unsigned flags) + { + struct bch_fs *c = trans->c; +- struct btree *b, *n, *parent; ++ struct btree *n, *parent; + struct btree_update *as; + int ret; + + flags |= BTREE_INSERT_NOFAIL; +-retry: +- ret = bch2_btree_iter_traverse(iter); +- if (ret) +- goto out; +- +- b = bch2_btree_iter_peek_node(iter); +- ret = PTR_ERR_OR_ZERO(b); +- if (ret) +- goto out; +- +- if (!b || b->data->keys.seq != seq) +- goto out; + + parent = btree_node_parent(iter->path, b); + as = bch2_btree_update_start(trans, iter->path, b->c.level, +@@ -1750,8 +1739,6 @@ retry: + : 0) + 1, + flags); + ret = PTR_ERR_OR_ZERO(as); +- if (ret == -EINTR) +- goto retry; + if (ret) { + trace_btree_gc_rewrite_node_fail(c, b); + goto out; +@@ -1799,20 +1786,38 @@ struct async_btree_rewrite { + __le64 seq; + }; + ++static int async_btree_node_rewrite_trans(struct btree_trans *trans, ++ struct async_btree_rewrite *a) ++{ ++ struct btree_iter iter; ++ struct btree *b; ++ int ret; ++ ++ bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, ++ BTREE_MAX_DEPTH, a->level, 0); ++ b = bch2_btree_iter_peek_node(&iter); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ goto out; ++ ++ if (!b || b->data->keys.seq != a->seq) ++ goto out; ++ ++ ret = bch2_btree_node_rewrite(trans, &iter, b, 0); ++out : ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ + void async_btree_node_rewrite_work(struct work_struct *work) + { + struct async_btree_rewrite *a = + container_of(work, struct async_btree_rewrite, work); + struct bch_fs *c = a->c; +- struct btree_trans trans; +- struct btree_iter iter; 
+ +- bch2_trans_init(&trans, c, 0, 0); +- bch2_trans_node_iter_init(&trans, &iter, a->btree_id, a->pos, +- BTREE_MAX_DEPTH, a->level, 0); +- bch2_btree_node_rewrite(&trans, &iter, a->seq, 0); +- bch2_trans_iter_exit(&trans, &iter); +- bch2_trans_exit(&trans); ++ bch2_trans_do(c, NULL, NULL, 0, ++ async_btree_node_rewrite_trans(&trans, a)); + percpu_ref_put(&c->writes); + kfree(a); + } +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index c6d6dd39900a..83ee011c5157 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -910,8 +910,11 @@ retry: + BUG(); + } + +- ret = bch2_btree_node_rewrite(&trans, &iter, +- b->data->keys.seq, 0) ?: ret; ++ ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; ++ if (ret == -EINTR) ++ continue; ++ if (ret) ++ break; + next: + bch2_btree_iter_next_node(&iter); + } +-- +cgit v1.2.3 + + +From 4b59d01ad780e2d93ecb1d1587dfbd8f86b52b71 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 24 Oct 2021 17:00:33 -0400 +Subject: bcachefs: Ensure we flush btree updates in evacuate path + +This fixes a possible race where we fail to remove a device because of +btree nodes still on it, that are being deleted by in flight btree +updates. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 83ee011c5157..fae260097950 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -932,6 +932,10 @@ next: + if (ret) + bch_err(c, "error %i in bch2_move_btree", ret); + ++ /* flush relevant btree updates */ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ + progress_list_del(c, stats); + return ret; + } +@@ -1075,10 +1079,6 @@ int bch2_data_job(struct bch_fs *c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + rereplicate_btree_pred, c, stats) ?: ret; +- +- closure_wait_event(&c->btree_interior_update_wait, +- !bch2_btree_interior_updates_nr_pending(c)); +- + ret = bch2_replicas_gc2(c) ?: ret; + + ret = bch2_move_data(c, +-- +cgit v1.2.3 + + +From 68eb99e5adef9886f8357429bd09aa042331d996 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 21 Oct 2021 15:48:05 -0400 +Subject: bcachefs: Fix fsck path for refink pointers + +The way __bch2_mark_reflink_p returns errors was clashing with returning +the number of sectors processed - we weren't returning FSCK_ERR_EXIT +correctly. + +Fix this by only using the return code for errors, which actually ends +up simplifying the overall logic. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 122 +++++++++++++++++++------------------------------- + 1 file changed, 46 insertions(+), 76 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 97151ec80c52..45215d0a15cd 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1108,61 +1108,47 @@ static int bch2_mark_reservation(struct bch_fs *c, + } + + static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, +- u64 idx, unsigned flags, size_t *r_idx) ++ u64 *idx, unsigned flags, size_t r_idx) + { + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + s64 ret = 0; + +- while (*r_idx < c->reflink_gc_nr) { +- r = genradix_ptr(&c->reflink_gc_table, *r_idx); +- BUG_ON(!r); +- +- if (idx < r->offset) +- break; +- (*r_idx)++; +- } ++ if (r_idx >= c->reflink_gc_nr) ++ goto not_found; + +- if (*r_idx >= c->reflink_gc_nr || +- idx < r->offset - r->size) { +- ret = p.k->size; ++ r = genradix_ptr(&c->reflink_gc_table, r_idx); ++ if (*idx < r->offset - r->size) + goto not_found; +- } + + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; +- return r->offset - idx; ++ *idx = r->offset; ++ return 0; + not_found: +- if ((flags & BTREE_TRIGGER_GC) && +- (flags & BTREE_TRIGGER_NOATOMIC)) { +- /* +- * XXX: we're replacing the entire reflink pointer with an error +- * key, we should just be replacing the part that was missing: +- */ +- if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", +- p.k->p.inode, p.k->p.offset, p.k->size, idx)) { +- struct bkey_i_error *new; +- +- new = kmalloc(sizeof(*new), GFP_KERNEL); +- if (!new) { +- bch_err(c, "%s: error allocating new key", __func__); +- return -ENOMEM; +- } ++ *idx = U64_MAX; ++ ret = -EIO; + +- bkey_init(&new->k); +- new->k.type = KEY_TYPE_error; +- new->k.p = p.k->p; +- new->k.size = p.k->size; +- ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); ++ /* ++ * XXX: we're replacing the entire reflink pointer with an error ++ * key, we should just be replacing the part that was missing: ++ */ ++ if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", ++ p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { ++ struct bkey_i_error *new; + ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) { ++ bch_err(c, "%s: error allocating new key", __func__); ++ return -ENOMEM; + } +- } else { +- bch2_fs_inconsistent(c, +- "%llu:%llu len %u points to nonexistent indirect extent %llu", +- p.k->p.inode, p.k->p.offset, p.k->size, idx); +- bch2_inconsistent_error(c); +- ret = -EIO; ++ ++ bkey_init(&new->k); ++ new->k.type = KEY_TYPE_error; ++ new->k.p = p.k->p; ++ new->k.size = p.k->size; ++ ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); + } + fsck_err: + return ret; +@@ -1177,10 +1163,9 @@ static int bch2_mark_reflink_p(struct bch_fs *c, + struct reflink_gc *ref; + size_t l, r, m; + u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); +- u64 sectors = (u64) le32_to_cpu(p.v->front_pad) + +- le32_to_cpu(p.v->back_pad) + +- p.k->size; +- s64 ret = 0; ++ u64 end_idx = le64_to_cpu(p.v->idx) + p.k->size + ++ le32_to_cpu(p.v->back_pad); ++ int ret = 0; + + BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == + (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); +@@ -1197,17 +1182,10 @@ static int bch2_mark_reflink_p(struct bch_fs *c, + r = m; + } + +- while (sectors) { +- ret = __bch2_mark_reflink_p(c, p, idx, flags, &l); +- if (ret <= 0) +- return ret; ++ while (idx < end_idx && !ret) ++ ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); + +- ret = min_t(s64, ret, sectors); +- idx += ret; +- sectors -= ret; +- } +- +- return 0; ++ return ret; + } + + static int bch2_mark_key_locked(struct bch_fs *c, +@@ -1725,7 +1703,7 @@ static int bch2_trans_mark_reservation(struct btree_trans *trans, + + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, +- u64 idx, unsigned flags) ++ u64 *idx, unsigned flags) + { + struct bch_fs *c = trans->c; + struct btree_iter iter; +@@ -1733,9 +1711,9 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_i 
*n; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; +- s64 ret; ++ int ret; + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, idx), ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); +@@ -1754,7 +1732,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + if (!refcount) { + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", +- p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ p.k->p.inode, p.k->p.offset, p.k->size, *idx); + ret = -EIO; + goto err; + } +@@ -1762,7 +1740,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_fs_inconsistent(c, + "%llu:%llu len %u idx %llu indirect extent refcount underflow", +- p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ p.k->p.inode, p.k->p.offset, p.k->size, *idx); + ret = -EIO; + goto err; + } +@@ -1794,7 +1772,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + if (ret) + goto err; + +- ret = k.k->p.offset - idx; ++ *idx = k.k->p.offset; + err: + bch2_trans_iter_exit(trans, &iter); + return ret; +@@ -1804,8 +1782,8 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c k, unsigned flags) + { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); +- u64 idx, sectors; +- s64 ret = 0; ++ u64 idx, end_idx; ++ int ret = 0; + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; +@@ -1813,22 +1791,14 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + v->front_pad = v->back_pad = 0; + } + +- idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); +- sectors = (u64) le32_to_cpu(p.v->front_pad) + +- le32_to_cpu(p.v->back_pad) + +- p.k->size; +- +- while (sectors) { +- ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags); +- if (ret < 0) +- return ret; ++ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); ++ end_idx = le64_to_cpu(p.v->idx) + p.k->size + ++ le32_to_cpu(p.v->back_pad); + +- ret = min_t(s64, ret, sectors); +- idx += ret; +- sectors -= ret; +- } ++ while (idx < end_idx && !ret) ++ ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); + +- return 0; ++ return ret; + } + + int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, +-- +cgit v1.2.3 + + +From 4a4bab5c152ee163773876223fbbdf76b0a13bbf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 25 Oct 2021 18:30:28 -0400 +Subject: bcachefs: More general fix for transaction paths overflow + +for_each_btree_key() now calls bch2_trans_begin() as needed; that means, +we can also call it when we're in danger of overflowing transaction +paths. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.h | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 876bf42c4248..61bbb7bc54b3 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -303,8 +303,9 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + { + struct bkey_s_c k; + +- while (k = __bch2_btree_iter_peek(iter, flags), +- bkey_err(k) == -EINTR) ++ while ((hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) || ++ (k = __bch2_btree_iter_peek(iter, flags), ++ bkey_err(k) == -EINTR)) + bch2_trans_begin(trans); + + return k; +-- +cgit v1.2.3 + + +From 3c92b264ee1a80b5687c69c5d4016cebf0933a77 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 25 Oct 2021 19:30:24 -0400 +Subject: bcachefs: Don't run triggers in fix_reflink_p_key() + +It seems some users have reflink pointers which span many indirect +extents, from a short window in time when merging of reflink pointers +was allowed. + +Now, we're seeing transaction path overflows in fix_reflink_p(), the +code path to clear out the reflink_p fields now used for front/back pad +- but, we don't actually need to be running triggers in that path, which +is an easy partial fix. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 197b9079e2b8..a61d380a47b6 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -2258,7 +2258,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) + u->v.front_pad = 0; + u->v.back_pad = 0; + +- return bch2_trans_update(trans, iter, &u->k_i, 0); ++ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); + } + + static int fix_reflink_p(struct bch_fs *c) +-- +cgit v1.2.3 + + +From dab15a363b353fea9e862ea71301ef8fe2212d6d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Oct 2021 14:07:43 -0400 +Subject: bcachefs: Improve error messages in trans_mark_reflink_p() + +We should always print out the key we were marking. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 45215d0a15cd..40084edd1376 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1711,6 +1711,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_i *n; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; ++ char buf[200]; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), +@@ -1730,17 +1731,19 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + + refcount = bkey_refcount(n); + if (!refcount) { ++ bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); + bch2_fs_inconsistent(c, +- "%llu:%llu len %u points to nonexistent indirect extent %llu", +- p.k->p.inode, p.k->p.offset, p.k->size, *idx); ++ "nonexistent indirect extent at %llu while marking\n %s", ++ *idx, buf); + ret = -EIO; + goto err; + } + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { ++ bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); + bch2_fs_inconsistent(c, +- "%llu:%llu len %u idx %llu indirect extent refcount underflow", +- p.k->p.inode, p.k->p.offset, p.k->size, *idx); ++ "indirect extent refcount underflow at %llu while marking\n %s", ++ *idx, buf); + ret = -EIO; + goto err; + } +-- +cgit v1.2.3 + + +From dd4ddb46744afe3c9d8ffe2640ca198aebfbf89c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 11 Oct 2021 12:03:19 -0400 +Subject: bcachefs: Add BCH_SUBVOLUME_UNLINKED + +Snapshot deletion needs to become a multi step process, where we unlink, +then tear down the page cache, then delete the subvolume - the deleting +flag is equivalent to an inode with i_nlink = 0. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 4 + + fs/bcachefs/bcachefs_format.h | 1 + + fs/bcachefs/fs-common.c | 30 ++----- + fs/bcachefs/fs-common.h | 2 +- + fs/bcachefs/fs-ioctl.c | 2 +- + fs/bcachefs/fs.c | 11 ++- + fs/bcachefs/fs.h | 2 +- + fs/bcachefs/fsck.c | 18 ++++- + fs/bcachefs/inode.c | 6 +- + fs/bcachefs/subvolume.c | 182 ++++++++++++++++++++++++++++++++++++++---- + fs/bcachefs/subvolume.h | 5 +- + fs/bcachefs/subvolume_types.h | 11 +++ + 12 files changed, 223 insertions(+), 51 deletions(-) + create mode 100644 fs/bcachefs/subvolume_types.h + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 59cbede4c72d..131d0f7ba47d 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -353,6 +353,7 @@ enum bch_time_stats { + #include "quota_types.h" + #include "rebalance_types.h" + #include "replicas_types.h" ++#include "subvolume_types.h" + #include "super_types.h" + + /* Number of nodes btree coalesce will try to coalesce at once */ +@@ -657,6 +658,9 @@ struct bch_fs { + struct bch_snapshot_table __rcu *snapshot_table; + struct mutex snapshot_table_lock; + struct work_struct snapshot_delete_work; ++ struct work_struct snapshot_wait_for_pagecache_and_delete_work; ++ struct snapshot_id_list snapshots_unlinked; ++ struct mutex snapshots_unlinked_lock; + + /* BTREE CACHE */ + struct bio_set btree_bio; +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 0a78d0f1d0c3..9b1be7146c1c 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -970,6 +970,7 @@ LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) + * can delete it (or whether it should just be rm -rf'd) + */ + LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) ++LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) + + /* Snapshots */ + +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index c49de741e1e3..5f3429e99115 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -239,7 +239,7 @@ int bch2_unlink_trans(struct btree_trans *trans, + struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, + const struct qstr *name, +- int 
deleting_snapshot) ++ bool deleting_snapshot) + { + struct bch_fs *c = trans->c; + struct btree_iter dir_iter = { NULL }; +@@ -267,35 +267,19 @@ int bch2_unlink_trans(struct btree_trans *trans, + if (ret) + goto err; + +- if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) { ++ if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, inum); + if (ret) + goto err; + } + +- if (deleting_snapshot < 0 && +- inode_u->bi_subvol) { +- struct bch_subvolume s; +- +- ret = bch2_subvolume_get(trans, inode_u->bi_subvol, true, +- BTREE_ITER_CACHED| +- BTREE_ITER_WITH_UPDATES, +- &s); +- if (ret) +- goto err; +- +- if (BCH_SUBVOLUME_SNAP(&s)) +- deleting_snapshot = 1; ++ if (deleting_snapshot && !inode_u->bi_subvol) { ++ ret = -ENOENT; ++ goto err; + } + +- if (deleting_snapshot == 1) { +- if (!inode_u->bi_subvol) { +- ret = -ENOENT; +- goto err; +- } +- +- ret = bch2_subvolume_delete(trans, inode_u->bi_subvol, +- deleting_snapshot); ++ if (deleting_snapshot || inode_u->bi_subvol) { ++ ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); + if (ret) + goto err; + +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +index 9bb0a9676147..dde237859514 100644 +--- a/fs/bcachefs/fs-common.h ++++ b/fs/bcachefs/fs-common.h +@@ -26,7 +26,7 @@ int bch2_link_trans(struct btree_trans *, + int bch2_unlink_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, +- const struct qstr *, int); ++ const struct qstr *, bool); + + int bch2_rename_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +index 513f7a7a3fd4..9f329a624c12 100644 +--- a/fs/bcachefs/fs-ioctl.c ++++ b/fs/bcachefs/fs-ioctl.c +@@ -441,7 +441,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, + + dir = path.dentry->d_parent->d_inode; + +- ret = __bch2_unlink(dir, path.dentry, 1); ++ ret = __bch2_unlink(dir, path.dentry, true); + if (!ret) { + fsnotify_rmdir(dir, path.dentry); + d_delete(path.dentry); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 50135ec6af92..1c119c14dcb3 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -490,7 +490,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, + } + + int __bch2_unlink(struct inode *vdir, struct dentry *dentry, +- int deleting_snapshot) ++ bool deleting_snapshot) + { + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); +@@ -527,7 +527,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + + static int bch2_unlink(struct inode *vdir, struct dentry *dentry) + { +- return __bch2_unlink(vdir, dentry, -1); ++ return __bch2_unlink(vdir, dentry, false); + } + + static int bch2_symlink(struct user_namespace *mnt_userns, +@@ -1291,6 +1291,12 @@ static int bch2_vfs_write_inode(struct inode *vinode, + return ret; + } + ++static int bch2_drop_inode(struct inode *vinode) ++{ ++ ++ return generic_drop_inode(vinode); ++} ++ + static void bch2_evict_inode(struct inode *vinode) + { + struct bch_fs *c = vinode->i_sb->s_fs_info; +@@ -1495,6 +1501,7 @@ static const struct super_operations bch_super_operations = { + .alloc_inode = bch2_alloc_inode, + .destroy_inode = bch2_destroy_inode, + .write_inode = bch2_vfs_write_inode, ++ .drop_inode = bch2_drop_inode, + .evict_inode = bch2_evict_inode, + .sync_fs = bch2_sync_fs, + .statfs = bch2_statfs, +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index 48fc504e2da2..22b90bd53e4a 100644 
+--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -184,7 +184,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + int bch2_setattr_nonsize(struct user_namespace *, + struct bch_inode_info *, + struct iattr *); +-int __bch2_unlink(struct inode *, struct dentry *, int); ++int __bch2_unlink(struct inode *, struct dentry *, bool); + + void bch2_vfs_exit(void); + int bch2_vfs_init(void); +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index a61d380a47b6..6b3eecdef81a 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -256,7 +256,7 @@ retry: + + /* Subvolume root? */ + if (inode_u.bi_subvol) { +- ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1); ++ ret = bch2_subvolume_delete(trans, inode_u.bi_subvol); + if (ret) + goto err; + } +@@ -992,12 +992,28 @@ static int check_subvols(struct bch_fs *c) + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; ++ struct bkey_s_c_subvolume subvol; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { ++ if (k.k->type != KEY_TYPE_subvolume) ++ continue; ++ ++ subvol = bkey_s_c_to_subvolume(k); ++ ++ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW, ++ bch2_subvolume_delete(&trans, iter.pos.offset)); ++ if (ret) { ++ bch_err(c, "error deleting subvolume %llu: %i", ++ iter.pos.offset, ret); ++ break; ++ } ++ } + } + bch2_trans_iter_exit(&trans, &iter); + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 9130d571e84d..462c1f43ae96 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -709,11 +709,7 @@ retry: + bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + + /* Subvolume root? 
*/ +- if (inode_u.bi_subvol) { +- ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1); +- if (ret) +- goto err; +- } ++ BUG_ON(inode_u.bi_subvol); + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index 9bd8d61c96fe..58cda98989b1 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -4,6 +4,7 @@ + #include "btree_key_cache.h" + #include "btree_update.h" + #include "error.h" ++#include "fs.h" + #include "subvolume.h" + + /* Snapshot tree: */ +@@ -541,13 +542,6 @@ err: + return ret; + } + +-/* List of snapshot IDs that are being deleted: */ +-struct snapshot_id_list { +- u32 nr; +- u32 size; +- u32 *d; +-}; +- + static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) + { + unsigned i; +@@ -819,9 +813,11 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, + return ret; + } + +-/* XXX: mark snapshot id for deletion, walk btree and delete: */ +-int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, +- int deleting_snapshot) ++/* ++ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot ++ * deletion/cleanup: ++ */ ++int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) + { + struct btree_iter iter; + struct bkey_s_c k; +@@ -849,12 +845,6 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + +- if (deleting_snapshot >= 0 && +- deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) { +- ret = -ENOENT; +- goto err; +- } +- + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + ret = PTR_ERR_OR_ZERO(delete); + if (ret) +@@ -880,6 +870,163 @@ err: + return ret; + } + ++static void bch2_evict_subvolume_inodes(struct bch_fs *c, ++ struct snapshot_id_list *s) ++{ ++ struct super_block *sb = c->vfs_sb; ++ struct inode *inode; ++ ++ spin_lock(&sb->s_inode_list_lock); ++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { ++ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || ++ (inode->i_state & I_FREEING)) ++ continue; ++ ++ d_mark_dontcache(inode); ++ d_prune_aliases(inode); ++ } ++ spin_unlock(&sb->s_inode_list_lock); ++again: ++ cond_resched(); ++ spin_lock(&sb->s_inode_list_lock); ++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { ++ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || ++ (inode->i_state & I_FREEING)) ++ continue; ++ ++ if (!(inode->i_state & I_DONTCACHE)) { ++ d_mark_dontcache(inode); ++ d_prune_aliases(inode); ++ } ++ ++ spin_lock(&inode->i_lock); ++ if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && ++ !(inode->i_state & I_FREEING)) { ++ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); ++ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); ++ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); ++ spin_unlock(&inode->i_lock); ++ spin_unlock(&sb->s_inode_list_lock); ++ schedule(); ++ finish_wait(wq, &wait.wq_entry); ++ goto again; ++ } ++ ++ spin_unlock(&inode->i_lock); ++ } ++ spin_unlock(&sb->s_inode_list_lock); ++} ++ ++void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ snapshot_wait_for_pagecache_and_delete_work); ++ struct snapshot_id_list s; ++ u32 *id; ++ int ret = 0; ++ ++ while (!ret) { ++ mutex_lock(&c->snapshots_unlinked_lock); ++ s = c->snapshots_unlinked; ++ memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); ++ 
mutex_unlock(&c->snapshots_unlinked_lock); ++ ++ if (!s.nr) ++ break; ++ ++ bch2_evict_subvolume_inodes(c, &s); ++ ++ for (id = s.d; id < s.d + s.nr; id++) { ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_subvolume_delete(&trans, *id)); ++ if (ret) { ++ bch_err(c, "error %i deleting subvolume %u", ret, *id); ++ break; ++ } ++ } ++ ++ kfree(s.d); ++ } ++ ++ percpu_ref_put(&c->writes); ++} ++ ++struct subvolume_unlink_hook { ++ struct btree_trans_commit_hook h; ++ u32 subvol; ++}; ++ ++int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *_h) ++{ ++ struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); ++ struct bch_fs *c = trans->c; ++ int ret = 0; ++ ++ mutex_lock(&c->snapshots_unlinked_lock); ++ if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) ++ ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol); ++ mutex_unlock(&c->snapshots_unlinked_lock); ++ ++ if (ret) ++ return ret; ++ ++ if (unlikely(!percpu_ref_tryget(&c->writes))) ++ return -EROFS; ++ ++ if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) ++ percpu_ref_put(&c->writes); ++ return 0; ++} ++ ++int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_subvolume *n; ++ struct subvolume_unlink_hook *h; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, ++ POS(0, subvolid), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); ++ ret = -EIO; ++ goto err; ++ } ++ ++ n = bch2_trans_kmalloc(trans, sizeof(*n)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&n->k_i, k); ++ SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); ++ ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0); ++ if (ret) ++ goto err; ++ ++ h = bch2_trans_kmalloc(trans, sizeof(*h)); ++ ret = PTR_ERR_OR_ZERO(h); ++ if (ret) ++ goto err; ++ ++ h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; ++ h->subvol = subvolid; ++ bch2_trans_commit_hook(trans, &h->h); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ + int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 src_subvolid, + u32 *new_subvolid, +@@ -977,5 +1124,8 @@ err: + int bch2_fs_subvolumes_init(struct bch_fs *c) + { + INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); ++ INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, ++ bch2_subvolume_wait_for_pagecache_and_delete); ++ mutex_init(&c->snapshots_unlinked_lock); + return 0; + } +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index f98c8c0dbea2..45234c9de0f6 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -2,6 +2,8 @@ + #ifndef _BCACHEFS_SUBVOLUME_H + #define _BCACHEFS_SUBVOLUME_H + ++#include "subvolume_types.h" ++ + void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); + +@@ -108,7 +110,8 @@ int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); + int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + +-int bch2_subvolume_delete(struct btree_trans *, u32, int); ++int bch2_subvolume_delete(struct btree_trans *, u32); ++int 
bch2_subvolume_unlink(struct btree_trans *, u32); + int bch2_subvolume_create(struct btree_trans *, u64, u32, + u32 *, u32 *, bool); + +diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h +new file mode 100644 +index 000000000000..9410b9587591 +--- /dev/null ++++ b/fs/bcachefs/subvolume_types.h +@@ -0,0 +1,11 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUBVOLUME_TYPES_H ++#define _BCACHEFS_SUBVOLUME_TYPES_H ++ ++struct snapshot_id_list { ++ u32 nr; ++ u32 size; ++ u32 *d; ++}; ++ ++#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ +-- +cgit v1.2.3 + + +From 0c47894c5e3c384469aeacde62d8d8d815e09b5c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Oct 2021 16:03:28 -0400 +Subject: bcachefs: Drop bch2_journal_meta() call when going RW + +Back when we relied on the journal sequence number blacklist machinery +for consistency between btree and the journal, we needed to ensure a new +journal entry was written before any btree writes were done. But, this +had the side effect of consuming some space in the journal prior to +doing journal replay - which could lead to a very wedged filesystem, +since we don't yet have a way to grow the journal prior to going RW. + +Fortunately, the journal sequence number blacklist machinery isn't +needed anymore, as btree node pointers now record the numer of sectors +currently written to that node - that code should all be ripped out. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super.c | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 486a019900e3..e0c93cb520c3 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -405,13 +405,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + if (ret) + goto err; + +- /* +- * We need to write out a journal entry before we start doing btree +- * updates, to ensure that on unclean shutdown new journal blacklist +- * entries are created: +- */ +- bch2_journal_meta(&c->journal); +- + clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); + + for_each_rw_member(ca, c, i) +-- +cgit v1.2.3 + + +From f96d32acf5bd4b51400e082675f33ebf28aca57a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 27 Oct 2021 17:53:20 -0400 +Subject: bcachefs: Don't do upgrades in nochanges mode + +nochanges mode is often used for getting data off of otherwise +nonrecoverable filesystems, which is often because of errors hit during +fsck. + +Don't force version upgrade & fsck in nochanges mode, so that it's more +likely to mount. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 20 +++++++++++--------- + 1 file changed, 11 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 6bf9c48a7871..da9c3ea528e7 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1081,15 +1081,17 @@ int bch2_fs_recovery(struct bch_fs *c) + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + +- if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { +- bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); +- c->opts.version_upgrade = true; +- c->opts.fsck = true; +- c->opts.fix_errors = FSCK_OPT_YES; +- } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) { +- bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); +- c->opts.version_upgrade = true; +- c->opts.fsck = true; ++ if (!c->opts.nochanges) { ++ if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { ++ bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); ++ c->opts.version_upgrade = true; ++ c->opts.fsck = true; ++ c->opts.fix_errors = FSCK_OPT_YES; ++ } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) { ++ bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); ++ c->opts.version_upgrade = true; ++ c->opts.fsck = true; ++ } + } + + ret = bch2_blacklist_table_initialize(c); +-- +cgit v1.2.3 + + +From 78cb473277c318e530253629c14c5e0ffe989665 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 28 Oct 2021 16:24:39 -0400 +Subject: bcachefs: Move bch2_evict_subvolume_inodes() to fs.c + +This fixes building in userspace - code that's coupled to the kernel VFS +interface should live in fs.c + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 54 ++++++++++++++++++++++++++++++++++++++++------ + fs/bcachefs/fs.h | 4 ++++ + fs/bcachefs/subvolume.c | 57 ------------------------------------------------- + fs/bcachefs/subvolume.h | 10 +++++++++ + 4 files changed, 61 insertions(+), 64 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 1c119c14dcb3..316cb76e24b6 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1291,12 +1291,6 @@ static int bch2_vfs_write_inode(struct inode *vinode, + return ret; + } + +-static int bch2_drop_inode(struct inode *vinode) +-{ +- +- return generic_drop_inode(vinode); +-} +- + static void bch2_evict_inode(struct inode *vinode) + { + struct bch_fs *c = vinode->i_sb->s_fs_info; +@@ -1317,6 +1311,53 @@ static void bch2_evict_inode(struct inode *vinode) + } + } + ++void bch2_evict_subvolume_inodes(struct bch_fs *c, ++ struct snapshot_id_list *s) ++{ ++ struct super_block *sb = c->vfs_sb; ++ struct inode *inode; ++ ++ spin_lock(&sb->s_inode_list_lock); ++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { ++ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || ++ (inode->i_state & I_FREEING)) ++ continue; ++ ++ d_mark_dontcache(inode); ++ d_prune_aliases(inode); ++ } ++ spin_unlock(&sb->s_inode_list_lock); ++again: ++ cond_resched(); ++ spin_lock(&sb->s_inode_list_lock); ++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { ++ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || ++ (inode->i_state & I_FREEING)) ++ continue; ++ ++ if (!(inode->i_state & I_DONTCACHE)) { ++ d_mark_dontcache(inode); ++ d_prune_aliases(inode); ++ } ++ ++ spin_lock(&inode->i_lock); ++ if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && ++ !(inode->i_state & I_FREEING)) { ++ wait_queue_head_t *wq = 
bit_waitqueue(&inode->i_state, __I_NEW); ++ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); ++ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); ++ spin_unlock(&inode->i_lock); ++ spin_unlock(&sb->s_inode_list_lock); ++ schedule(); ++ finish_wait(wq, &wait.wq_entry); ++ goto again; ++ } ++ ++ spin_unlock(&inode->i_lock); ++ } ++ spin_unlock(&sb->s_inode_list_lock); ++} ++ + static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) + { + struct super_block *sb = dentry->d_sb; +@@ -1501,7 +1542,6 @@ static const struct super_operations bch_super_operations = { + .alloc_inode = bch2_alloc_inode, + .destroy_inode = bch2_destroy_inode, + .write_inode = bch2_vfs_write_inode, +- .drop_inode = bch2_drop_inode, + .evict_inode = bch2_evict_inode, + .sync_fs = bch2_sync_fs, + .statfs = bch2_statfs, +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index 22b90bd53e4a..bf62e80fde59 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -186,11 +186,15 @@ int bch2_setattr_nonsize(struct user_namespace *, + struct iattr *); + int __bch2_unlink(struct inode *, struct dentry *, bool); + ++void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *); ++ + void bch2_vfs_exit(void); + int bch2_vfs_init(void); + + #else + ++static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, ++ struct snapshot_id_list *s) {} + static inline void bch2_vfs_exit(void) {} + static inline int bch2_vfs_init(void) { return 0; } + +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index 58cda98989b1..4d385c9e9268 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -542,16 +542,6 @@ err: + return ret; + } + +-static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +-{ +- unsigned i; +- +- for (i = 0; i < s->nr; i++) +- if (id == s->d[i]) +- return true; +- return false; +-} +- + static int snapshot_id_add(struct snapshot_id_list *s, u32 id) + { + BUG_ON(snapshot_list_has_id(s, id)); +@@ -870,53 +860,6 @@ err: + return ret; + } + +-static void bch2_evict_subvolume_inodes(struct bch_fs *c, +- struct snapshot_id_list *s) +-{ +- struct super_block *sb = c->vfs_sb; +- struct inode *inode; +- +- spin_lock(&sb->s_inode_list_lock); +- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { +- if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || +- (inode->i_state & I_FREEING)) +- continue; +- +- d_mark_dontcache(inode); +- d_prune_aliases(inode); +- } +- spin_unlock(&sb->s_inode_list_lock); +-again: +- cond_resched(); +- spin_lock(&sb->s_inode_list_lock); +- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { +- if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || +- (inode->i_state & I_FREEING)) +- continue; +- +- if (!(inode->i_state & I_DONTCACHE)) { +- d_mark_dontcache(inode); +- d_prune_aliases(inode); +- } +- +- spin_lock(&inode->i_lock); +- if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && +- !(inode->i_state & I_FREEING)) { +- wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); +- DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); +- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); +- spin_unlock(&inode->i_lock); +- spin_unlock(&sb->s_inode_list_lock); +- schedule(); +- finish_wait(wq, &wait.wq_entry); +- goto again; +- } +- +- spin_unlock(&inode->i_lock); +- } +- spin_unlock(&sb->s_inode_list_lock); +-} +- + void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) + { + struct bch_fs *c = container_of(work, struct bch_fs, +diff --git a/fs/bcachefs/subvolume.h 
b/fs/bcachefs/subvolume.h +index 45234c9de0f6..b5067dc68fc7 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -94,6 +94,16 @@ static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, + return 0; + } + ++static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) ++{ ++ unsigned i; ++ ++ for (i = 0; i < s->nr; i++) ++ if (id == s->d[i]) ++ return true; ++ return false; ++} ++ + int bch2_fs_snapshots_check(struct bch_fs *); + void bch2_fs_snapshots_exit(struct bch_fs *); + int bch2_fs_snapshots_start(struct bch_fs *); +-- +cgit v1.2.3 + + +From b92610c42f1eb340cf8b170e4e3e7db7c7d5d1ac Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 28 Oct 2021 16:34:17 -0400 +Subject: bcachefs: Fix bch2_btree_iter_advance() + +Was popping an assertion on !BTREE_ITER_ALL_SNAPSHOTS iters when getting +to the end. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index b8d92988d57d..f572a57a8038 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2020,7 +2020,9 @@ err: + inline bool bch2_btree_iter_advance(struct btree_iter *iter) + { + struct bpos pos = iter->k.p; +- bool ret = bpos_cmp(pos, SPOS_MAX) != 0; ++ bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS ++ ? bpos_cmp(pos, SPOS_MAX) ++ : bkey_cmp(pos, SPOS_MAX)) != 0; + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); +-- +cgit v1.2.3 + + +From 6f933b6b46f2a5276c15b1ced610ee8921565202 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 28 Oct 2021 16:16:55 -0400 +Subject: bcachefs: Improve transaction restart handling in fsck code + +The fsck code has been handling transaction restarts locally, to avoid +calling fsck_err() multiple times (and asking the user/logging the error +multiple times) on transaction restart. + +However, with our improving assertions about iterator validity, this +isn't working anymore - the code wasn't entirely correct, in ways that +are fine for now but are going to matter once we start wanting online +fsck. + +This code converts much of the fsck code to handle transaction restarts +in a more rigorously correct way - moving restart handling up to the top +level of check_dirent, check_xattr and others - at the cost of logging +errors multiple times on transaction restart. + +Fixing the issues with logging errors multiple times is probably going +to require memoizing calls to fsck_err() - we'll leave that for future +improvements. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 4 +- + fs/bcachefs/fsck.c | 567 +++++++++++++++++++++++++-------------------------- + fs/bcachefs/inode.h | 5 + + fs/bcachefs/opts.h | 5 + + 4 files changed, 291 insertions(+), 290 deletions(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 6be3ec4ec4a6..9267eea810f8 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -128,9 +128,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + d.v->d_type != DT_SUBVOL + ? le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), +- d.v->d_type < BCH_DT_MAX +- ? 
bch2_d_types[d.v->d_type] +- : "(bad d_type)"); ++ bch2_d_type_str(d.v->d_type)); + } + + static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 6b3eecdef81a..5bc04c7bbb83 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -94,12 +94,6 @@ err: + + } + +-static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, +- u32 *subvol) +-{ +- return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol)); +-} +- + static int __subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) + { +@@ -140,6 +134,9 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + if (!ret) + *snapshot = iter.pos.snapshot; + err: ++ if (ret && ret != -EINTR) ++ bch_err(trans->c, "error %i fetching inode %llu:%u", ++ ret, inode_nr, *snapshot); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +@@ -172,15 +169,6 @@ static int __lookup_dirent(struct btree_trans *trans, + return 0; + } + +-static int lookup_dirent(struct btree_trans *trans, +- struct bch_hash_info hash_info, +- subvol_inum dir, struct qstr *name, +- u64 *target, unsigned *type) +-{ +- return lockrestart_do(trans, +- __lookup_dirent(trans, hash_info, dir, name, target, type)); +-} +- + static int __write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +@@ -284,7 +272,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) + struct bch_hash_info dir_hash_info; + int ret; + +- ret = lookup_inode(trans, pos.inode, &dir_inode, NULL); ++ ret = __lookup_inode(trans, pos.inode, &dir_inode, NULL); + if (ret) + return ret; + +@@ -298,17 +286,6 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) + return ret; + } + +-static int remove_dirent(struct btree_trans *trans, struct bpos pos) +-{ +- int ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- __remove_dirent(trans, pos)); +- if (ret) +- bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret); +- return ret; +-} +- + /* Get lost+found, create if it doesn't exist: */ + static int lookup_lostfound(struct btree_trans *trans, u32 subvol, + struct bch_inode_unpacked *lostfound) +@@ -323,65 +300,52 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, + u32 snapshot; + int ret; + +- ret = subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); ++ ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + if (ret) + return ret; + +- ret = lookup_inode(trans, root_inum.inum, &root, &snapshot); +- if (ret) { +- bch_err(c, "error fetching subvol root: %i", ret); ++ ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); ++ if (ret) + return ret; +- } + + root_hash_info = bch2_hash_info_init(c, &root); + +- ret = lookup_dirent(trans, root_hash_info, root_inum, ++ ret = __lookup_dirent(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type); + if (ret == -ENOENT) { + bch_notice(c, "creating lost+found"); + goto create_lostfound; + } + +- if (ret) { ++ if (ret && ret != -EINTR) + bch_err(c, "error looking up lost+found: %i", ret); ++ if (ret) + return ret; +- } + + if (d_type != DT_DIR) { + bch_err(c, "error looking up lost+found: not a directory"); + return ret; +- + } + +- ret = lookup_inode(trans, inum, lostfound, &snapshot); +- if (ret && ret != -ENOENT) { +- /* +- * The check_dirents pass has already run, dangling dirents +- * shouldn't exist here: +- */ +- bch_err(c, "error looking 
up lost+found: %i", ret); +- return ret; +- } ++ /* ++ * The check_dirents pass has already run, dangling dirents ++ * shouldn't exist here: ++ */ ++ return __lookup_inode(trans, inum, lostfound, &snapshot); + +- if (ret == -ENOENT) { + create_lostfound: +- bch2_inode_init_early(c, lostfound); +- +- ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_create_trans(trans, root_inum, &root, +- lostfound, &lostfound_str, +- 0, 0, S_IFDIR|0700, 0, NULL, NULL, +- (subvol_inum) { }, 0)); +- if (ret) +- bch_err(c, "error creating lost+found: %i", ret); +- } +- +- return 0; ++ bch2_inode_init_early(c, lostfound); ++ ++ ret = bch2_create_trans(trans, root_inum, &root, ++ lostfound, &lostfound_str, ++ 0, 0, S_IFDIR|0700, 0, NULL, NULL, ++ (subvol_inum) { }, 0); ++ if (ret && ret != -EINTR) ++ bch_err(c, "error creating lost+found: %i", ret); ++ return ret; + } + +-static int reattach_inode(struct btree_trans *trans, ++static int __reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 inode_snapshot) + { +@@ -393,7 +357,7 @@ static int reattach_inode(struct btree_trans *trans, + u32 subvol; + int ret; + +- ret = snapshot_lookup_subvol(trans, inode_snapshot, &subvol); ++ ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); + if (ret) + return ret; + +@@ -404,7 +368,7 @@ static int reattach_inode(struct btree_trans *trans, + if (S_ISDIR(inode->bi_mode)) { + lostfound.bi_nlink++; + +- ret = write_inode(trans, &lostfound, U32_MAX); ++ ret = __write_inode(trans, &lostfound, U32_MAX); + if (ret) + return ret; + } +@@ -414,26 +378,39 @@ static int reattach_inode(struct btree_trans *trans, + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + name = (struct qstr) QSTR(name_buf); + +- ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, +- bch2_dirent_create(trans, +- (subvol_inum) { +- .subvol = subvol, +- .inum = lostfound.bi_inum, +- }, +- &dir_hash, +- mode_to_type(inode->bi_mode), +- &name, inode->bi_inum, &dir_offset, +- BCH_HASH_SET_MUST_CREATE)); ++ ret = bch2_dirent_create(trans, ++ (subvol_inum) { ++ .subvol = subvol, ++ .inum = lostfound.bi_inum, ++ }, ++ &dir_hash, ++ inode_d_type(inode), ++ &name, inode->bi_inum, &dir_offset, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ return ret; ++ ++ inode->bi_dir = lostfound.bi_inum; ++ inode->bi_dir_offset = dir_offset; ++ ++ return __write_inode(trans, inode, inode_snapshot); ++} ++ ++static int reattach_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 inode_snapshot) ++{ ++ int ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ __reattach_inode(trans, inode, inode_snapshot)); + if (ret) { + bch_err(trans->c, "error %i reattaching inode %llu", + ret, inode->bi_inum); + return ret; + } + +- inode->bi_dir = lostfound.bi_inum; +- inode->bi_dir_offset = dir_offset; +- +- return write_inode(trans, inode, inode_snapshot); ++ return ret; + } + + static int remove_backpointer(struct btree_trans *trans, +@@ -454,7 +431,7 @@ static int remove_backpointer(struct btree_trans *trans, + goto out; + } + +- ret = remove_dirent(trans, k.k->p); ++ ret = __remove_dirent(trans, k.k->p); + out: + bch2_trans_iter_exit(trans, &iter); + return ret; +@@ -653,12 +630,6 @@ found: + return i; + } + +-static int walk_inode(struct btree_trans *trans, +- struct inode_walker *w, struct bpos pos) +-{ +- return lockrestart_do(trans, __walk_inode(trans, w, pos)); +-} +- + static int __get_visible_inodes(struct 
btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, +@@ -700,12 +671,9 @@ static int check_key_has_snapshot(struct btree_trans *trans, + + if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + "key in missing snapshot: %s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { +- ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, +- bch2_btree_delete_at(trans, iter, +- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); +- return ret ?: -EINTR; +- } ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; + fsck_err: + return ret; + } +@@ -739,26 +707,6 @@ static int hash_redo_key(struct btree_trans *trans, + #endif + } + +-static int fsck_hash_delete_at(struct btree_trans *trans, +- const struct bch_hash_desc desc, +- struct bch_hash_info *info, +- struct btree_iter *iter) +-{ +- int ret; +-retry: +- ret = bch2_hash_delete_at(trans, desc, info, iter, 0) ?: +- bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW); +- if (ret == -EINTR) { +- ret = bch2_btree_iter_traverse(iter); +- if (!ret) +- goto retry; +- } +- +- return ret; +-} +- + static int hash_check_key(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, +@@ -792,10 +740,7 @@ static int hash_check_key(struct btree_trans *trans, + "duplicate hash table keys:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, + hash_k), buf))) { +- ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter); +- if (ret) +- return ret; +- ret = 1; ++ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; + break; + } + +@@ -814,9 +759,7 @@ bad_hash: + (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) + return 0; + +- ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, +- hash_redo_key(trans, desc, hash_info, k_iter, hash_k)); ++ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); + if (ret) { + bch_err(c, "hash_redo_key err %i", ret); + return ret; +@@ -829,15 +772,53 @@ fsck_err: + static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *prev, +- struct bch_inode_unpacked u) ++ bool full) + { + struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct bkey_s_c_inode inode; ++ struct bch_inode_unpacked u; + bool do_update = false; +- int ret = 0; ++ int ret; ++ ++ k = bch2_btree_iter_peek(iter); ++ if (!k.k) ++ return 0; ++ ++ ret = bkey_err(k); ++ if (ret) ++ return ret; + +- if (fsck_err_on(prev && +- (prev->bi_hash_seed != u.bi_hash_seed || +- mode_to_type(prev->bi_mode) != mode_to_type(u.bi_mode)), c, ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret) ++ return ret < 0 ? 
ret : 0; ++ ++ /* ++ * if snapshot id isn't a leaf node, skip it - deletion in ++ * particular is not atomic, so on the internal snapshot nodes ++ * we can see inodes marked for deletion after a clean shutdown ++ */ ++ if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) ++ return 0; ++ ++ if (k.k->type != KEY_TYPE_inode) ++ return 0; ++ ++ inode = bkey_s_c_to_inode(k); ++ ++ if (!full && ++ !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| ++ BCH_INODE_I_SECTORS_DIRTY| ++ BCH_INODE_UNLINKED))) ++ return 0; ++ ++ BUG_ON(bch2_inode_unpack(inode, &u)); ++ ++ if (prev->bi_inum != u.bi_inum) ++ *prev = u; ++ ++ if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || ++ inode_d_type(prev) != inode_d_type(&u), c, + "inodes in different snapshots don't match")) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; +@@ -932,58 +913,61 @@ static int check_inodes(struct bch_fs *c, bool full) + { + struct btree_trans trans; + struct btree_iter iter; +- struct bkey_s_c k; +- struct bkey_s_c_inode inode; +- struct bch_inode_unpacked prev, u; ++ struct bch_inode_unpacked prev = { 0 }; + int ret; + +- memset(&prev, 0, sizeof(prev)); +- + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, +- BTREE_ITER_INTENT| +- BTREE_ITER_PREFETCH| +- BTREE_ITER_ALL_SNAPSHOTS, k, ret) { +- ret = check_key_has_snapshot(&trans, &iter, k); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ ++ do { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_inode(&trans, &iter, &prev, full)); + if (ret) + break; ++ } while (bch2_btree_iter_advance(&iter)); ++ bch2_trans_iter_exit(&trans, &iter); + +- /* +- * if snapshot id isn't a leaf node, skip it - deletion in +- * particular is not atomic, so on the internal snapshot nodes +- * we can see inodes marked for deletion after a clean shutdown +- */ +- if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) +- continue; ++ bch2_trans_exit(&trans); ++ return ret; ++} + +- if (k.k->type != KEY_TYPE_inode) +- continue; ++static int check_subvol(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ struct bkey_s_c k; ++ struct bkey_s_c_subvolume subvol; ++ int ret; + +- inode = bkey_s_c_to_inode(k); ++ k = bch2_btree_iter_peek(iter); ++ if (!k.k) ++ return 0; + +- if (!full && +- !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| +- BCH_INODE_I_SECTORS_DIRTY| +- BCH_INODE_UNLINKED))) +- continue; ++ ret = bkey_err(k); ++ if (ret) ++ return ret; + +- BUG_ON(bch2_inode_unpack(inode, &u)); ++ if (k.k->type != KEY_TYPE_subvolume) ++ return 0; + +- ret = check_inode(&trans, &iter, +- full && prev.bi_inum == u.bi_inum +- ? 
&prev : NULL, u); +- if (ret) +- break; ++ subvol = bkey_s_c_to_subvolume(k); + +- prev = u; ++ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ++ ret = bch2_subvolume_delete(trans, iter->pos.offset); ++ if (ret && ret != -EINTR) ++ bch_err(trans->c, "error deleting subvolume %llu: %i", ++ iter->pos.offset, ret); ++ if (ret) ++ return ret; + } +- bch2_trans_iter_exit(&trans, &iter); +- +- BUG_ON(ret == -EINTR); + +- bch2_trans_exit(&trans); +- return ret; ++ return 0; + } + + noinline_for_stack +@@ -991,30 +975,23 @@ static int check_subvols(struct bch_fs *c) + { + struct btree_trans trans; + struct btree_iter iter; +- struct bkey_s_c k; +- struct bkey_s_c_subvolume subvol; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN, +- 0, k, ret) { +- if (k.k->type != KEY_TYPE_subvolume) +- continue; +- +- subvol = bkey_s_c_to_subvolume(k); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, ++ POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH); + +- if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { +- ret = __bch2_trans_do(&trans, NULL, NULL, +- BTREE_INSERT_LAZY_RW, +- bch2_subvolume_delete(&trans, iter.pos.offset)); +- if (ret) { +- bch_err(c, "error deleting subvolume %llu: %i", +- iter.pos.offset, ret); +- break; +- } +- } +- } ++ do { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_subvol(&trans, &iter)); ++ if (ret) ++ break; ++ } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); +@@ -1174,7 +1151,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) +- return ret; ++ return ret < 0 ? 
ret : 0; + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) +@@ -1207,9 +1184,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + if (fsck_err_on(ret == INT_MAX, c, + "extent in missing inode:\n %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) +- return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, +- bch2_btree_delete_at(trans, iter, +- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + + if (ret == INT_MAX) + return 0; +@@ -1222,9 +1198,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) +- return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, +- bch2_btree_delete_at(trans, iter, +- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + + if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { +@@ -1284,7 +1259,9 @@ static int check_extents(struct bch_fs *c) + BTREE_ITER_ALL_SNAPSHOTS); + + do { +- ret = lockrestart_do(&trans, ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, + check_extent(&trans, &iter, &w, &s)); + if (ret) + break; +@@ -1343,6 +1320,7 @@ static int check_dirent_target(struct btree_trans *trans, + u32 target_snapshot) + { + struct bch_fs *c = trans->c; ++ struct bkey_i_dirent *n; + bool backpointer_exists = true; + char buf[200]; + int ret = 0; +@@ -1352,7 +1330,7 @@ static int check_dirent_target(struct btree_trans *trans, + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + +- ret = write_inode(trans, target, target_snapshot); ++ ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } +@@ -1369,7 +1347,7 @@ static int check_dirent_target(struct btree_trans *trans, + backpointer_exists, c, + "directory %llu with multiple links", + target->bi_inum)) { +- ret = remove_dirent(trans, d.k->p); ++ ret = __remove_dirent(trans, d.k->p); + if (ret) + goto err; + return 0; +@@ -1382,7 +1360,7 @@ static int check_dirent_target(struct btree_trans *trans, + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_UNLINKED; + +- ret = write_inode(trans, target, target_snapshot); ++ ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } +@@ -1399,34 +1377,30 @@ static int check_dirent_target(struct btree_trans *trans, + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + +- ret = write_inode(trans, target, target_snapshot); ++ ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + } + +- if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target->bi_mode), c, +- "incorrect d_type: should be %u:\n%s", +- mode_to_type(target->bi_mode), ++ if (fsck_err_on(d.v->d_type != inode_d_type(target), c, ++ "incorrect d_type: got %s, should be %s:\n%s", ++ bch2_d_type_str(d.v->d_type), ++ bch2_d_type_str(inode_d_type(target)), + (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { +- struct bkey_i_dirent *n; +- +- n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); +- if (!n) { +- ret = -ENOMEM; +- goto err; +- } ++ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; + + bkey_reassemble(&n->k_i, d.s_c); +- n->v.d_type = mode_to_type(target->bi_mode); ++ n->v.d_type = 
inode_d_type(target); + +- ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_trans_update(trans, iter, &n->k_i, 0)); +- kfree(n); ++ ret = bch2_trans_update(trans, iter, &n->k_i, 0); ++ if (ret) ++ return ret; + +- return ret ?: -EINTR; ++ d = dirent_i_to_s_c(n); + } + + if (d.v->d_type == DT_SUBVOL && +@@ -1435,24 +1409,19 @@ static int check_dirent_target(struct btree_trans *trans, + fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", + le32_to_cpu(d.v->d_parent_subvol), + target->bi_parent_subvol))) { +- struct bkey_i_dirent *n; +- +- n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); +- if (!n) { +- ret = -ENOMEM; +- goto err; +- } ++ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + +- ret = __bch2_trans_do(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW, +- bch2_trans_update(trans, iter, &n->k_i, 0)); +- kfree(n); ++ ret = bch2_trans_update(trans, iter, &n->k_i, 0); ++ if (ret) ++ return ret; + +- return ret ?: -EINTR; ++ d = dirent_i_to_s_c(n); + } + err: + fsck_err: +@@ -1482,7 +1451,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) +- return ret; ++ return ret < 0 ? ret : 0; + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) +@@ -1504,9 +1473,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + if (fsck_err_on(ret == INT_MAX, c, + "dirent in nonexisting directory:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) +- return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, +- bch2_btree_delete_at(trans, iter, +- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + + if (ret == INT_MAX) + return 0; +@@ -1515,11 +1483,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + ret = 0; + + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, +- "dirent in non directory inode type %u:\n%s", +- mode_to_type(i->inode.bi_mode), ++ "dirent in non directory inode type %s:\n%s", ++ bch2_d_type_str(inode_d_type(&i->inode)), + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) +- return __bch2_trans_do(trans, NULL, NULL, 0, +- bch2_btree_delete_at(trans, iter, 0)); ++ return bch2_btree_delete_at(trans, iter, 0); + + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); +@@ -1550,7 +1517,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + if (fsck_err_on(ret, c, + "dirent points to missing subvolume %llu", + le64_to_cpu(d.v->d_child_subvol))) +- return remove_dirent(trans, d.k->p); ++ return __remove_dirent(trans, d.k->p); + + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); +@@ -1570,7 +1537,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + target_inum, + subvol_root.bi_subvol, target_subvol)) { + subvol_root.bi_subvol = target_subvol; +- ret = write_inode(trans, &subvol_root, target_snapshot); ++ ret = __write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } +@@ -1588,7 +1555,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + "dirent points to missing inode:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { +- ret = remove_dirent(trans, 
d.k->p); ++ ret = __remove_dirent(trans, d.k->p); + if (ret) + return ret; + } +@@ -1636,7 +1603,9 @@ static int check_dirents(struct bch_fs *c) + BTREE_ITER_ALL_SNAPSHOTS); + + do { +- ret = lockrestart_do(&trans, ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, + check_dirent(&trans, &iter, &hash_info, + &dir, &target, &s)); + if (ret) +@@ -1651,17 +1620,58 @@ static int check_dirents(struct bch_fs *c) + return ret; + } + ++static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ++ struct bch_hash_info *hash_info, ++ struct inode_walker *inode) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ int ret; ++ ++ k = bch2_btree_iter_peek(iter); ++ if (!k.k) ++ return 0; ++ ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret) ++ return ret; ++ ++ ret = __walk_inode(trans, inode, k.k->p); ++ if (ret < 0) ++ return ret; ++ ++ if (fsck_err_on(ret == INT_MAX, c, ++ "xattr for missing inode %llu", ++ k.k->p.inode)) ++ return bch2_btree_delete_at(trans, iter, 0); ++ ++ if (ret == INT_MAX) ++ return 0; ++ ++ ret = 0; ++ ++ if (inode->first_this_inode) ++ *hash_info = bch2_hash_info_init(c, &inode->d[0].inode); ++ ++ ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); ++fsck_err: ++ return ret; ++} ++ + /* + * Walk xattrs: verify that they all have a corresponding inode + */ + noinline_for_stack + static int check_xattrs(struct bch_fs *c) + { +- struct inode_walker w = inode_walker_init(); ++ struct inode_walker inode = inode_walker_init(); + struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter iter; +- struct bkey_s_c k; + int ret = 0; + + bch_verbose(c, "checking xattrs"); +@@ -1673,65 +1683,31 @@ static int check_xattrs(struct bch_fs *c) + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); +-retry: +- bch2_trans_begin(&trans); +- +- while ((k = bch2_btree_iter_peek(&iter)).k && +- !(ret = bkey_err(k))) { +- ret = check_key_has_snapshot(&trans, &iter, k); +- if (ret) +- break; +- +- ret = walk_inode(&trans, &w, k.k->p); +- if (ret < 0) +- break; +- +- if (fsck_err_on(ret == INT_MAX, c, +- "xattr for missing inode %llu", +- k.k->p.inode)) { +- ret = bch2_btree_delete_at(&trans, &iter, 0); +- if (ret) +- break; +- continue; +- } +- +- if (ret == INT_MAX) +- goto next; +- ret = 0; +- +- if (w.first_this_inode) +- hash_info = bch2_hash_info_init(c, &w.d[0].inode); + +- ret = hash_check_key(&trans, bch2_xattr_hash_desc, +- &hash_info, &iter, k); ++ do { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_xattr(&trans, &iter, &hash_info, ++ &inode)); + if (ret) + break; +-next: +- bch2_btree_iter_advance(&iter); +- } +-fsck_err: +- if (ret == -EINTR) +- goto retry; +- ++ } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); ++ + bch2_trans_exit(&trans); + return ret; + } + +-/* Get root directory, create if it doesn't exist: */ +-static int check_root(struct bch_fs *c) ++static int check_root_trans(struct btree_trans *trans) + { +- struct btree_trans trans; ++ struct bch_fs *c = trans->c; + struct bch_inode_unpacked root_inode; + u32 snapshot; + u64 inum; + int ret; + +- bch2_trans_init(&trans, c, 0, 0); +- +- bch_verbose(c, "checking root directory"); +- +- ret = subvol_lookup(&trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); ++ ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); + if (ret && ret != -ENOENT) + 
return ret; + +@@ -1746,10 +1722,10 @@ static int check_root(struct bch_fs *c) + root_subvol.v.flags = 0; + root_subvol.v.snapshot = cpu_to_le32(snapshot); + root_subvol.v.inode = cpu_to_le64(inum); +- ret = __bch2_trans_do(&trans, NULL, NULL, ++ ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, +- __bch2_btree_insert(&trans, BTREE_ID_subvolumes, &root_subvol.k_i)); ++ __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); + if (ret) { + bch_err(c, "error writing root subvol: %i", ret); + goto err; +@@ -1757,7 +1733,7 @@ static int check_root(struct bch_fs *c) + + } + +- ret = lookup_inode(&trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); ++ ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + if (ret && ret != -ENOENT) + return ret; + +@@ -1768,16 +1744,27 @@ static int check_root(struct bch_fs *c) + 0, NULL); + root_inode.bi_inum = inum; + +- ret = write_inode(&trans, &root_inode, snapshot); ++ ret = __write_inode(trans, &root_inode, snapshot); + if (ret) + bch_err(c, "error writing root inode: %i", ret); + } + err: + fsck_err: +- bch2_trans_exit(&trans); + return ret; + } + ++/* Get root directory, create if it doesn't exist: */ ++noinline_for_stack ++static int check_root(struct bch_fs *c) ++{ ++ bch_verbose(c, "checking root directory"); ++ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ check_root_trans(&trans)); ++} ++ + struct pathbuf { + size_t nr; + size_t size; +@@ -1866,9 +1853,9 @@ static int check_path(struct btree_trans *trans, + } + + if (ret == -ENOENT) { +- if (fsck_err(c, "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu", ++ if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", + inode->bi_inum, snapshot, +- mode_to_type(inode->bi_mode), ++ bch2_d_type_str(inode_d_type(inode)), + inode->bi_nlink, + inode->bi_dir, + inode->bi_dir_offset)) +@@ -1909,7 +1896,9 @@ static int check_path(struct btree_trans *trans, + if (!fsck_err(c, "directory structure loop")) + return 0; + +- ret = lockrestart_do(trans, ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, + remove_backpointer(trans, inode)); + if (ret) { + bch_err(c, "error removing dirent: %i", ret); +@@ -1930,6 +1919,7 @@ fsck_err: + * After check_dirents(), if an inode backpointer doesn't exist that means it's + * unreachable: + */ ++noinline_for_stack + static int check_directory_structure(struct bch_fs *c) + { + struct btree_trans trans; +@@ -2277,6 +2267,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) + return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); + } + ++noinline_for_stack + static int fix_reflink_p(struct bch_fs *c) + { + struct btree_trans trans; +@@ -2287,6 +2278,8 @@ static int fix_reflink_p(struct bch_fs *c) + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) + return 0; + ++ bch_verbose(c, "fixing reflink_p keys"); ++ + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 9e84cddcc6cb..009b807cc167 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -134,6 +134,11 @@ static inline u8 mode_to_type(umode_t mode) + return (mode >> 12) & 15; + } + ++static inline u8 inode_d_type(struct bch_inode_unpacked *inode) ++{ ++ return inode->bi_subvol ? 
DT_SUBVOL : mode_to_type(inode->bi_mode); ++} ++ + /* i_nlink: */ + + static inline unsigned nlink_bias(umode_t mode) +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index d39d6a546ac4..b60bdfca27fd 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -20,6 +20,11 @@ extern const char * const bch2_cache_replacement_policies[]; + extern const char * const bch2_member_states[]; + extern const char * const bch2_d_types[]; + ++static inline const char *bch2_d_type_str(unsigned d_type) ++{ ++ return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; ++} ++ + /* + * Mount options; we also store defaults in the superblock. + * +-- +cgit v1.2.3 + + +From 1d74899c2679741591db73dde9c7a4e57cd83f7a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 28 Oct 2021 18:22:25 -0400 +Subject: bcachefs: Ensure journal doesn't get stuck in nochanges mode + +This tweaks the journal code to always act as if there's space available +in nochanges mode, when we're not going to be doing any writes. This +helps in recovering filesystems that won't mount because they need +journal replay and the journal has gotten stuck. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.h | 1 + + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/journal_reclaim.c | 6 ++++-- + fs/bcachefs/journal_types.h | 1 + + fs/bcachefs/super.c | 3 +++ + 5 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 1d556790b38e..99fd253648bf 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -446,6 +446,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, + ret = 0; + + if ((flags & JOURNAL_RES_GET_RESERVED) || ++ test_bit(JOURNAL_NOCHANGES, &j->flags) || + new.reserved + d < new.remaining) { + new.reserved += d; + ret = 1; +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 66a0e267b3f4..53aad1d0c9a9 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1515,7 +1515,7 @@ retry_alloc: + + w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + +- if (c->opts.nochanges) ++ if (test_bit(JOURNAL_NOCHANGES, &j->flags)) + goto no_io; + + for_each_rw_member(ca, c, i) +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index c468d597d427..a93f5b189248 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -34,8 +34,10 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, + struct journal_device *ja, + enum journal_space_from from) + { +- unsigned available = (journal_space_from(ja, from) - +- ja->cur_idx - 1 + ja->nr) % ja->nr; ++ unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags) ++ ? 
((journal_space_from(ja, from) - ++ ja->cur_idx - 1 + ja->nr) % ja->nr) ++ : ja->nr; + + /* + * Don't use the last bucket unless writing the new last_seq +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 61674ae1ab5f..cc10e1d7895c 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -154,6 +154,7 @@ enum { + JOURNAL_NEED_WRITE, + JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, ++ JOURNAL_NOCHANGES, + }; + + /* Embedded in struct bch_fs */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index e0c93cb520c3..dc8f641504be 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -793,6 +793,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_fs_fsio_init(c)) + goto err; + ++ if (c->opts.nochanges) ++ set_bit(JOURNAL_NOCHANGES, &c->journal.flags); ++ + mi = bch2_sb_get_members(c->disk_sb.sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && +-- +cgit v1.2.3 + + +From 25952e17958a3af2590a2cb8b8ff799c006a80ca Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 29 Oct 2021 18:21:05 -0400 +Subject: bcachefs: Fix bch2_mark_update() + +When the old or new key doesn't exist, we should still pass in a deleted +key with the correct pos. This fixes a bug in the ec code, when +bch2_mark_stripe() was looking up the wrong in-memory stripe. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 40084edd1376..bdbc374bb7a7 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1227,6 +1227,8 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + int ret; + ++ deleted.p = new.k->p; ++ + percpu_down_read(&c->mark_lock); + ret = bch2_mark_key_locked(c, old, new, 0, flags); + percpu_up_read(&c->mark_lock); +@@ -1244,6 +1246,8 @@ int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, + struct bkey unpacked; + int ret; + ++ _deleted.p = path->pos; ++ + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + +@@ -1841,6 +1845,8 @@ int bch2_trans_mark_update(struct btree_trans *trans, + struct bkey unpacked; + int ret; + ++ _deleted.p = path->pos; ++ + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + +-- +cgit v1.2.3 + + +From 423aa6c8e7c9a681d5723fadac079590fb203406 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 29 Oct 2021 16:29:13 -0400 +Subject: bcachefs: Assorted ec fixes + +- The backpointer that ec_stripe_update_ptrs() uses now needs to include + the snapshot ID, which means we have to change where we add the + backpointer to after getting the snapshot ID for the new extents + +- ec_stripe_update_ptrs() needs to be calling bch2_trans_begin() + +- improve error message in bch2_mark_stripe() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 6 ++++-- + fs/bcachefs/buckets.c | 9 +++++++-- + fs/bcachefs/ec.c | 23 +++++++++++------------ + fs/bcachefs/ec.h | 4 ++-- + fs/bcachefs/io.c | 8 ++++---- + fs/bcachefs/move.c | 4 ++++ + 6 files changed, 32 insertions(+), 22 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 54b3d0d97a3b..6e32d41863eb 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -699,6 +699,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_GC| + (initial ? 
BTREE_TRIGGER_NOATOMIC : 0); ++ char buf[200]; + int ret = 0; + + if (initial) { +@@ -717,8 +718,9 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, +- "superblock not marked as containing replicas (type %u)", +- k->k->type)) { ++ "superblock not marked as containing replicas\n" ++ " while marking %s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + ret = bch2_mark_bkey_replicas(c, *k); + if (ret) { + bch_err(c, "error marking bkey replicas: %i", ret); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index bdbc374bb7a7..daa3942139a4 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1003,8 +1003,13 @@ static int bch2_mark_stripe(struct bch_fs *c, + BUG_ON(gc && old_s); + + if (!m || (old_s && !m->alive)) { +- bch_err_ratelimited(c, "error marking nonexistent stripe %zu", +- idx); ++ char buf1[200], buf2[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf1), c, old); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, new); ++ bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" ++ "old %s\n" ++ "new %s", idx, buf1, buf2); + bch2_inconsistent_error(c); + return -1; + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 9f87e2bc4468..e93a40b49c6e 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -837,8 +837,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(pos), + BTREE_ITER_INTENT); +- +- while ((k = bch2_btree_iter_peek(&iter)).k && ++retry: ++ while (bch2_trans_begin(&trans), ++ (k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + struct bch_extent_ptr *ptr, *ec_ptr = NULL; +@@ -874,11 +875,11 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, + BTREE_INSERT_NOFAIL); + if (!ret) + bch2_btree_iter_set_pos(&iter, next_pos); +- if (ret == -EINTR) +- ret = 0; + if (ret) + break; + } ++ if (ret == -EINTR) ++ goto retry; + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); +@@ -1069,16 +1070,14 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); + } + +-void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, +- struct bpos pos, unsigned sectors) ++void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob, ++ struct bkey *k) + { +- struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); +- struct ec_stripe_new *ec; ++ struct ec_stripe_new *ec = ob->ec; + +- if (!ob) ++ if (!ec) + return; + +- ec = ob->ec; + mutex_lock(&ec->lock); + + if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, +@@ -1088,8 +1087,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, + } + + bkey_init(&ec->keys.top->k); +- ec->keys.top->k.p = pos; +- bch2_key_resize(&ec->keys.top->k, sectors); ++ ec->keys.top->k.p = k->p; ++ ec->keys.top->k.size = k->size; + bch2_keylist_push(&ec->keys); + + mutex_unlock(&ec->lock); +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index e79626b59509..eb16e140e2c8 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -193,8 +193,8 @@ struct ec_stripe_head { + int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); + + void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); +-void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, +- struct bpos, unsigned); ++void 
bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *, ++ struct bkey *); + + void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); + void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index c4c28559a49c..017fc689801a 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -460,6 +460,7 @@ int bch2_write_index_default(struct bch_write_op *op) + { + struct bch_fs *c = op->c; + struct bkey_buf sk; ++ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; +@@ -503,6 +504,9 @@ int bch2_write_index_default(struct bch_write_op *op) + if (ret) + break; + ++ if (ec_ob) ++ bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); ++ + if (bkey_cmp(iter.pos, k->k.p) >= 0) + bch2_keylist_pop_front(&op->insert_keys); + else +@@ -949,7 +953,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + struct bio *src = &op->wbio.bio, *dst = src; + struct bvec_iter saved_iter; + void *ec_buf; +- struct bpos ec_pos = op->pos; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; +@@ -1119,9 +1122,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + + dst->bi_iter.bi_size = total_output; + do_write: +- /* might have done a realloc... */ +- bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); +- + *_dst = dst; + return more; + csum_err: +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index fae260097950..d0c784012e88 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -8,6 +8,7 @@ + #include "btree_update_interior.h" + #include "buckets.h" + #include "disk_groups.h" ++#include "ec.h" + #include "inode.h" + #include "io.h" + #include "journal_reclaim.h" +@@ -136,6 +137,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + struct btree_iter iter; + struct migrate_write *m = + container_of(op, struct migrate_write, op); ++ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); + struct keylist *keys = &op->insert_keys; + struct bkey_buf _new, _insert; + int ret = 0; +@@ -253,6 +255,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + if (!ret) { + bch2_btree_iter_set_pos(&iter, next_pos); + atomic_long_inc(&c->extent_migrate_done); ++ if (ec_ob) ++ bch2_ob_add_backpointer(c, ec_ob, &insert->k); + } + err: + if (ret == -EINTR) +-- +cgit v1.2.3 + + +From cbc17aa2911b4b5767b5ec4b082b805c9a1ef55d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 29 Oct 2021 18:43:18 -0400 +Subject: bcachefs: Convert bch2_mark_key() to take a btree_trans * + +This helps to unify the interface between bch2_mark_key() and +bch2_trans_mark_key() - and it also gives access to the journal +reservation and journal seq in the mark_key path. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 8 ++- + fs/bcachefs/btree_gc.c | 85 ++++++++++++++++------------ + fs/bcachefs/btree_iter.c | 1 + + fs/bcachefs/buckets.c | 124 +++++++++++++++++++++++------------------ + fs/bcachefs/buckets.h | 2 +- + fs/bcachefs/ec.c | 14 +++-- + fs/bcachefs/recovery.c | 12 ++-- + fs/bcachefs/recovery.h | 4 +- + fs/bcachefs/subvolume.c | 9 +-- + fs/bcachefs/subvolume.h | 4 +- + 10 files changed, 152 insertions(+), 111 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index fe899c5c64d9..022c905dc8b4 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -261,8 +261,9 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + #undef x + } + +-static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k) ++static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) + { ++ struct bch_fs *c = trans->c; + struct bch_dev *ca; + struct bucket *g; + struct bkey_alloc_unpacked u; +@@ -289,11 +290,14 @@ static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k) + + int bch2_alloc_read(struct bch_fs *c) + { ++ struct btree_trans trans; + int ret; + ++ bch2_trans_init(&trans, c, 0, 0); + down_read(&c->gc_lock); +- ret = bch2_btree_and_journal_walk(c, BTREE_ID_alloc, bch2_alloc_read_fn); ++ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn); + up_read(&c->gc_lock); ++ bch2_trans_exit(&trans); + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); + return ret; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 6e32d41863eb..82316349698d 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -688,11 +688,12 @@ fsck_err: + + /* marking of btree keys/nodes: */ + +-static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, ++static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k, + u8 *max_stale, bool initial) + { ++ struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + unsigned flags = +@@ -740,7 +741,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + *max_stale = max(*max_stale, ptr_stale(ca, ptr)); + } + +- ret = bch2_mark_key(c, *k, flags); ++ ret = bch2_mark_key(trans, *k, flags); + fsck_err: + err: + if (ret) +@@ -748,9 +749,10 @@ err: + return ret; + } + +-static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, ++static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale, + bool initial) + { ++ struct bch_fs *c = trans->c; + struct btree_node_iter iter; + struct bkey unpacked; + struct bkey_s_c k; +@@ -768,7 +770,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + bkey_init(&prev.k->k); + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { +- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, + &k, max_stale, initial); + if (ret) + break; +@@ -790,10 +792,10 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + return ret; + } + +-static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, ++static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, + bool initial, bool metadata_only) + { +- struct btree_trans trans; ++ struct bch_fs *c = trans->c; + struct btree_iter 
iter; + struct btree *b; + unsigned depth = metadata_only ? 1 +@@ -803,35 +805,32 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + u8 max_stale = 0; + int ret = 0; + +- bch2_trans_init(&trans, c, 0, 0); +- + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); + +- __for_each_btree_node(&trans, iter, btree_id, POS_MIN, ++ __for_each_btree_node(trans, iter, btree_id, POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b, ret) { + bch2_verify_btree_nr_keys(b); + + gc_pos_set(c, gc_pos_btree_node(b)); + +- ret = btree_gc_mark_node(c, b, &max_stale, initial); ++ ret = btree_gc_mark_node(trans, b, &max_stale, initial); + if (ret) + break; + + if (!initial) { + if (max_stale > 64) +- bch2_btree_node_rewrite(&trans, &iter, b, ++ bch2_btree_node_rewrite(trans, &iter, b, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + else if (!bch2_btree_gc_rewrite_disabled && + (bch2_btree_gc_always_rewrite || max_stale > 16)) +- bch2_btree_node_rewrite(&trans, &iter, ++ bch2_btree_node_rewrite(trans, &iter, + b, BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + } + } +- bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_iter_exit(trans, &iter); + +- bch2_trans_exit(&trans); + if (ret) + return ret; + +@@ -840,7 +839,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + if (!btree_node_fake(b)) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + +- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, + &k, &max_stale, initial); + } + gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); +@@ -849,9 +848,10 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + return ret; + } + +-static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, ++static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, + unsigned target_depth) + { ++ struct bch_fs *c = trans->c; + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; +@@ -868,7 +868,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + +- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, + &k, &max_stale, true); + if (ret) { + bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); +@@ -935,7 +935,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + break; + } + +- ret = bch2_gc_btree_init_recurse(c, child, ++ ret = bch2_gc_btree_init_recurse(trans, child, + target_depth); + six_unlock_read(&child->c.lock); + +@@ -950,10 +950,11 @@ fsck_err: + return ret; + } + +-static int bch2_gc_btree_init(struct bch_fs *c, ++static int bch2_gc_btree_init(struct btree_trans *trans, + enum btree_id btree_id, + bool metadata_only) + { ++ struct bch_fs *c = trans->c; + struct btree *b; + unsigned target_depth = metadata_only ? 1 + : bch2_expensive_debug_checks ? 
0 +@@ -986,12 +987,12 @@ static int bch2_gc_btree_init(struct bch_fs *c, + } + + if (b->c.level >= target_depth) +- ret = bch2_gc_btree_init_recurse(c, b, target_depth); ++ ret = bch2_gc_btree_init_recurse(trans, b, target_depth); + + if (!ret) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + +- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, + &k, &max_stale, true); + } + fsck_err: +@@ -1010,21 +1011,26 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) + + static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) + { ++ struct btree_trans trans; + enum btree_id ids[BTREE_ID_NR]; + unsigned i; + int ret = 0; + ++ bch2_trans_init(&trans, c, 0, 0); ++ + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); + + for (i = 0; i < BTREE_ID_NR && !ret; i++) + ret = initial +- ? bch2_gc_btree_init(c, ids[i], metadata_only) +- : bch2_gc_btree(c, ids[i], initial, metadata_only); ++ ? bch2_gc_btree_init(&trans, ids[i], metadata_only) ++ : bch2_gc_btree(&trans, ids[i], initial, metadata_only); + + if (ret < 0) + bch_err(c, "%s: ret %i", __func__, ret); ++ ++ bch2_trans_exit(&trans); + return ret; + } + +@@ -1373,8 +1379,10 @@ static int bch2_gc_start(struct bch_fs *c, + return 0; + } + +-static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k) ++static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, ++ struct bkey_s_c k) + { ++ struct bch_fs *c = trans->c; + struct reflink_gc *r; + const __le64 *refcount = bkey_refcount_c(k); + char buf[200]; +@@ -1439,16 +1447,16 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + if (metadata_only) + return 0; + ++ bch2_trans_init(&trans, c, 0, 0); ++ + if (initial) { + c->reflink_gc_idx = 0; + +- ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink, ++ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, + bch2_gc_reflink_done_initial_fn); + goto out; + } + +- bch2_trans_init(&trans, c, 0, 0); +- + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); +@@ -1496,16 +1504,18 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + } + fsck_err: + bch2_trans_iter_exit(&trans, &iter); +- bch2_trans_exit(&trans); + out: + genradix_free(&c->reflink_gc_table); + c->reflink_gc_nr = 0; ++ bch2_trans_exit(&trans); + return ret; + } + +-static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k) ++static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans, ++ struct bkey_s_c k) + { + ++ struct bch_fs *c = trans->c; + struct reflink_gc *r; + const __le64 *refcount = bkey_refcount_c(k); + +@@ -1530,19 +1540,20 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + struct btree_iter iter; + struct bkey_s_c k; + struct reflink_gc *r; +- int ret; ++ int ret = 0; + + if (metadata_only) + return 0; + ++ bch2_trans_init(&trans, c, 0, 0); + genradix_free(&c->reflink_gc_table); + c->reflink_gc_nr = 0; + +- if (initial) +- return bch2_btree_and_journal_walk(c, BTREE_ID_reflink, +- bch2_gc_reflink_start_initial_fn); +- +- bch2_trans_init(&trans, c, 0, 0); ++ if (initial) { ++ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, ++ bch2_gc_reflink_start_initial_fn); ++ goto out; ++ } + + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { +@@ -1563,9 +1574,9 @@ 
static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + r->refcount = 0; + } + bch2_trans_iter_exit(&trans, &iter); +- ++out: + bch2_trans_exit(&trans); +- return 0; ++ return ret; + } + + /** +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f572a57a8038..398b4cf29bb1 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2657,6 +2657,7 @@ void bch2_trans_begin(struct btree_trans *trans) + trans_for_each_update(trans, i) + __btree_path_put(i->path, true); + ++ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->mem_top = 0; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index daa3942139a4..bb3e6767c70c 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -524,11 +524,13 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + BUG_ON(owned_by_allocator == old.owned_by_allocator); + } + +-static int bch2_mark_alloc(struct bch_fs *c, ++static int bch2_mark_alloc(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, +- u64 journal_seq, unsigned flags) ++ unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; + struct bkey_alloc_unpacked u; + struct bch_dev *ca; + struct bucket *g; +@@ -673,7 +675,8 @@ static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) + : sectors; + } + +-static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, ++static int check_bucket_ref(struct bch_fs *c, ++ struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 bucket_data_type, +@@ -747,10 +750,12 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + +-static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, +- unsigned ptr_idx, +- u64 journal_seq, unsigned flags) ++static int mark_stripe_bucket(struct btree_trans *trans, ++ struct bkey_s_c k, ++ unsigned ptr_idx, ++ u64 journal_seq, unsigned flags) + { ++ struct bch_fs *c = trans->c; + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; +@@ -794,7 +799,8 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + +-static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++static int __mark_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, +@@ -803,7 +809,7 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, + u16 *dst_sectors = !ptr->cached + ? 
dirty_sectors + : cached_sectors; +- int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type, ++ int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, + bucket_gen, *bucket_data_type, + *dirty_sectors, *cached_sectors); + +@@ -816,12 +822,15 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + +-static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++static int bch2_mark_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type, +- u64 journal_seq, unsigned flags) ++ unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; + struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); +@@ -834,7 +843,8 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, + new.v.counter = old.v.counter = v; + bucket_data_type = new.data_type; + +- ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen, ++ ret = __mark_pointer(trans, k, &p.ptr, sectors, ++ data_type, new.gen, + &bucket_data_type, + &new.dirty_sectors, + &new.cached_sectors); +@@ -863,13 +873,14 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + +-static int bch2_mark_stripe_ptr(struct bch_fs *c, ++static int bch2_mark_stripe_ptr(struct btree_trans *trans, + struct bch_extent_stripe_ptr p, + enum bch_data_type data_type, + s64 sectors, +- unsigned journal_seq, unsigned flags) ++ unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; ++ struct bch_fs *c = trans->c; + struct bch_replicas_padded r; + struct stripe *m; + unsigned i, blocks_nonempty = 0; +@@ -902,16 +913,18 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, + spin_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; +- update_replicas(c, &r.e, sectors, journal_seq, gc); ++ update_replicas(c, &r.e, sectors, trans->journal_res.seq, gc); + + return 0; + } + +-static int bch2_mark_extent(struct bch_fs *c, ++static int bch2_mark_extent(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, +- unsigned journal_seq, unsigned flags) ++ unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; +@@ -940,8 +953,8 @@ static int bch2_mark_extent(struct bch_fs *c, + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; + +- ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, +- journal_seq, flags); ++ ret = bch2_mark_pointer(trans, k, p, disk_sectors, ++ data_type, flags); + if (ret < 0) + return ret; + +@@ -959,8 +972,8 @@ static int bch2_mark_extent(struct bch_fs *c, + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { +- ret = bch2_mark_stripe_ptr(c, p.ec, data_type, +- disk_sectors, journal_seq, flags); ++ ret = bch2_mark_stripe_ptr(trans, p.ec, data_type, ++ disk_sectors, flags); + if (ret) + return ret; + +@@ -986,11 +999,13 @@ static int bch2_mark_extent(struct bch_fs *c, + return 0; + } + +-static int bch2_mark_stripe(struct bch_fs *c, +- struct bkey_s_c old, struct bkey_s_c new, +- u64 journal_seq, unsigned flags) ++static int bch2_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; + size_t idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; +@@ -1054,7 +1069,7 @@ static int bch2_mark_stripe(struct bch_fs *c, + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { +- ret = mark_stripe_bucket(c, new, i, journal_seq, flags); ++ ret = mark_stripe_bucket(trans, new, i, journal_seq, flags); + if (ret) + return ret; + } +@@ -1073,24 +1088,26 @@ static int bch2_mark_stripe(struct bch_fs *c, + return 0; + } + +-static int bch2_mark_inode(struct bch_fs *c, +- struct bkey_s_c old, struct bkey_s_c new, +- u64 journal_seq, unsigned flags) ++static int bch2_mark_inode(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) + { ++ struct bch_fs *c = trans->c; + struct bch_fs_usage __percpu *fs_usage; + + preempt_disable(); +- fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); ++ fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; + fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; + preempt_enable(); + return 0; + } + +-static int bch2_mark_reservation(struct bch_fs *c, +- struct bkey_s_c old, struct bkey_s_c new, +- u64 journal_seq, unsigned flags) ++static int bch2_mark_reservation(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) + { ++ struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; + struct bch_fs_usage __percpu *fs_usage; + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; +@@ -1101,7 +1118,7 @@ static int bch2_mark_reservation(struct bch_fs *c, + sectors *= replicas; + + preempt_disable(); +- fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); ++ fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(fs_usage->persistent_reserved)); + +@@ -1159,10 +1176,11 @@ fsck_err: + return ret; + } + +-static int bch2_mark_reflink_p(struct bch_fs *c, +- struct bkey_s_c old, struct bkey_s_c new, +- u64 journal_seq, unsigned flags) ++static int bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) + { ++ struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; +@@ -1193,10 +1211,10 @@ static int bch2_mark_reflink_p(struct bch_fs *c, + return ret; + } + +-static int bch2_mark_key_locked(struct bch_fs *c, ++static int bch2_mark_key_locked(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, +- u64 journal_seq, unsigned flags) ++ unsigned flags) + { + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + +@@ -1205,29 +1223,30 @@ static int bch2_mark_key_locked(struct bch_fs *c, + switch (k.k->type) { + case KEY_TYPE_alloc: + case KEY_TYPE_alloc_v2: +- return bch2_mark_alloc(c, old, new, journal_seq, flags); ++ return bch2_mark_alloc(trans, old, new, flags); + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: +- return bch2_mark_extent(c, old, new, journal_seq, flags); ++ return bch2_mark_extent(trans, old, new, flags); + case KEY_TYPE_stripe: +- return bch2_mark_stripe(c, old, new, journal_seq, flags); ++ return bch2_mark_stripe(trans, old, new, flags); + case KEY_TYPE_inode: +- return bch2_mark_inode(c, old, new, journal_seq, flags); ++ return bch2_mark_inode(trans, old, new, flags); + case KEY_TYPE_reservation: +- return bch2_mark_reservation(c, old, new, journal_seq, flags); ++ return bch2_mark_reservation(trans, old, new, flags); + case KEY_TYPE_reflink_p: +- return bch2_mark_reflink_p(c, old, new, journal_seq, flags); ++ return bch2_mark_reflink_p(trans, old, new, flags); + case KEY_TYPE_snapshot: +- return bch2_mark_snapshot(c, old, new, journal_seq, flags); ++ return bch2_mark_snapshot(trans, old, new, flags); + default: + return 0; + } + } + +-int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) ++int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) + { ++ struct bch_fs *c = trans->c; + struct bkey deleted = KEY(0, 0, 0); + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + int ret; +@@ -1235,7 +1254,7 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) + deleted.p = new.k->p; + + percpu_down_read(&c->mark_lock); +- ret = bch2_mark_key_locked(c, old, new, 0, flags); ++ ret = bch2_mark_key_locked(trans, old, new, flags); + percpu_up_read(&c->mark_lock); + + return ret; +@@ -1244,7 +1263,6 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) + int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *new, unsigned flags) + { +- struct bch_fs *c = trans->c; + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + 
struct bkey_s_c old; +@@ -1263,15 +1281,12 @@ int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, + + if (old.k->type == new->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { +- ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), +- trans->journal_res.seq, ++ ret = bch2_mark_key_locked(trans, old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { +- ret = bch2_mark_key_locked(c, deleted, bkey_i_to_s_c(new), +- trans->journal_res.seq, ++ ret = bch2_mark_key_locked(trans, deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: +- bch2_mark_key_locked(c, old, deleted, +- trans->journal_res.seq, ++ bch2_mark_key_locked(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + +@@ -1435,7 +1450,8 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + if (IS_ERR(a)) + return PTR_ERR(a); + +- ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, ++ ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, ++ u.gen, &u.data_type, + &u.dirty_sectors, &u.cached_sectors); + if (ret) + goto out; +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 61c2c0f9ff8f..8a9b2b565d48 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -226,7 +226,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + +-int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned); ++int bch2_mark_key(struct btree_trans *, struct bkey_s_c, unsigned); + + int bch2_mark_update(struct btree_trans *, struct btree_path *, + struct bkey_i *, unsigned); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index e93a40b49c6e..b493498fa587 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1634,13 +1634,14 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) + return ret; + } + +-static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) ++static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) + { ++ struct bch_fs *c = trans->c; + int ret = 0; + + if (k.k->type == KEY_TYPE_stripe) + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: +- bch2_mark_key(c, k, ++ bch2_mark_key(trans, k, + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_NOATOMIC); + +@@ -1649,8 +1650,13 @@ static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) + + int bch2_stripes_read(struct bch_fs *c) + { +- int ret = bch2_btree_and_journal_walk(c, BTREE_ID_stripes, +- bch2_stripes_read_fn); ++ struct btree_trans trans; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, ++ bch2_stripes_read_fn); ++ bch2_trans_exit(&trans); + if (ret) + bch_err(c, "error reading stripes: %i", ret); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index da9c3ea528e7..29fae6dbce76 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -337,10 +337,11 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, + bch2_bkey_buf_exit(&tmp, c); + } + +-static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, ++static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b, + enum btree_id btree_id, + btree_walk_key_fn key_fn) + { ++ struct bch_fs *c = trans->c; + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf tmp; +@@ -364,11 +365,11 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct 
btree *b + + btree_and_journal_iter_prefetch(c, b, iter); + +- ret = bch2_btree_and_journal_walk_recurse(c, child, ++ ret = bch2_btree_and_journal_walk_recurse(trans, child, + btree_id, key_fn); + six_unlock_read(&child->c.lock); + } else { +- ret = key_fn(c, k); ++ ret = key_fn(trans, k); + } + + if (ret) +@@ -382,9 +383,10 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b + return ret; + } + +-int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id, ++int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id, + btree_walk_key_fn key_fn) + { ++ struct bch_fs *c = trans->c; + struct btree *b = c->btree_roots[btree_id].b; + int ret = 0; + +@@ -392,7 +394,7 @@ int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id, + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); +- ret = bch2_btree_and_journal_walk_recurse(c, b, btree_id, key_fn); ++ ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn); + six_unlock_read(&b->c.lock); + + return ret; +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index e5565e4f335a..e45c70b3693f 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -45,9 +45,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, + struct btree *); + +-typedef int (*btree_walk_key_fn)(struct bch_fs *c, struct bkey_s_c k); ++typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c); + +-int bch2_btree_and_journal_walk(struct bch_fs *, enum btree_id, btree_walk_key_fn); ++int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn); + + void bch2_journal_keys_free(struct journal_keys *); + void bch2_journal_entries_free(struct list_head *); +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index 4d385c9e9268..0ef625d21672 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -61,10 +61,11 @@ const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k) + return NULL; + } + +-int bch2_mark_snapshot(struct bch_fs *c, ++int bch2_mark_snapshot(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, +- u64 journal_seq, unsigned flags) ++ unsigned flags) + { ++ struct bch_fs *c = trans->c; + struct snapshot_t *t; + + t = genradix_ptr_alloc(&c->snapshots, +@@ -308,7 +309,7 @@ int bch2_fs_snapshots_start(struct bch_fs *c) + if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) + have_deleted = true; + +- ret = bch2_mark_snapshot(c, bkey_s_c_null, k, 0, 0); ++ ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0); + if (ret) + break; + } +@@ -499,7 +500,7 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + + bch2_trans_update(trans, &iter, &n->k_i, 0); + +- ret = bch2_mark_snapshot(trans->c, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0, 0); ++ ret = bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) + break; + +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index b5067dc68fc7..dde755b45392 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -12,8 +12,8 @@ const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); + .val_to_text = bch2_snapshot_to_text, \ + } + +-int bch2_mark_snapshot(struct bch_fs *, struct bkey_s_c, +- struct bkey_s_c, u64, unsigned); ++int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, ++ struct bkey_s_c, unsigned); + + static inline struct snapshot_t *snapshot_t(struct 
bch_fs *c, u32 id) + { +-- +cgit v1.2.3 + + +From dc662a3bcc91698bd6892fa464f3b5bf282b0df6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 29 Oct 2021 18:58:50 -0400 +Subject: bcachefs: BTREE_TRIGGER_INSERT now only means insert + +This allows triggers to distinguish between a key entering the btree - +i.e. being called from the trans commit path - vs. being called on a key +that already exists, i.e. by GC. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 4 +--- + fs/bcachefs/buckets.c | 26 +++++--------------------- + fs/bcachefs/ec.c | 1 - + 3 files changed, 6 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 82316349698d..b4340df677b7 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -697,7 +697,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + unsigned flags = +- BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_GC| + (initial ? BTREE_TRIGGER_NOATOMIC : 0); + char buf[200]; +@@ -1117,8 +1116,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) + + for_each_pending_btree_node_free(c, as, d) + if (d->index_update_done) +- bch2_mark_key(c, bkey_i_to_s_c(&d->key), +- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_GC); ++ bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); + + mutex_unlock(&c->btree_interior_update_lock); + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index bb3e6767c70c..699a0865c9b8 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -925,7 +925,7 @@ static int bch2_mark_extent(struct btree_trans *trans, + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; +- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; +@@ -940,9 +940,6 @@ static int bch2_mark_extent(struct btree_trans *trans, + bool stale; + int ret; + +- BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == +- (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); +- + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; +@@ -1108,7 +1105,7 @@ static int bch2_mark_reservation(struct btree_trans *trans, + unsigned flags) + { + struct bch_fs *c = trans->c; +- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bch_fs_usage __percpu *fs_usage; + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; +@@ -1181,7 +1178,7 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, + unsigned flags) + { + struct bch_fs *c = trans->c; +- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; +@@ -1190,9 +1187,6 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, + le32_to_cpu(p.v->back_pad); + int ret = 0; + +- BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == +- (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); +- + l = 0; + r = c->reflink_gc_nr; + while (l < r) { +@@ -1216,9 +1210,7 @@ static int bch2_mark_key_locked(struct btree_trans *trans, + struct bkey_s_c new, + unsigned flags) + { +- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; +- +- BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + + switch (k.k->type) { + case KEY_TYPE_alloc: +@@ -1536,9 +1528,6 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + bool stale; + int ret; + +- BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == +- (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); +- + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; +@@ -1710,9 +1699,6 @@ static int bch2_trans_mark_reservation(struct btree_trans *trans, + s64 sectors = (s64) k.k->size; + struct replicas_delta_list *d; + +- BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == +- (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); +- + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; +@@ -1832,9 +1818,7 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, + struct bkey_s_c new, unsigned flags) + { +- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; +- +- BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index b493498fa587..bfa512d78538 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1642,7 +1642,6 @@ static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) + if (k.k->type == KEY_TYPE_stripe) + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: + bch2_mark_key(trans, k, +- BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_NOATOMIC); + + return ret; +-- +cgit v1.2.3 + + +From 682c495036c2aefc91a79077fc07b2cd34a7a482 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 30 Oct 2021 05:28:27 -0400 +Subject: bcachefs: Fix faulty assertion + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 398b4cf29bb1..c40e2cc942eb 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -667,7 +667,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans, + + for (i = 0; i < (!path->cached ? 
BTREE_MAX_DEPTH : 1); i++) { + if (!path->l[i].b) { +- BUG_ON(c->btree_roots[path->btree_id].b->c.level > i); ++ BUG_ON(!path->cached && ++ c->btree_roots[path->btree_id].b->c.level > i); + break; + } + +-- +cgit v1.2.3 + + +From 03270febf5e7fa8e247f5516b6b903900da23406 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Nov 2021 12:08:02 -0400 +Subject: bcachefs: Fix upgrade_readers() + +The bch2_btree_path_upgrade() call was failing and tripping an assert - +path->level + 1 is in this case not necessarily exactly what we want, +fix it by upgrading exactly the locks we want. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 ++-- + fs/bcachefs/btree_iter.h | 3 +++ + fs/bcachefs/btree_update_leaf.c | 11 ++++++++++- + 3 files changed, 15 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c40e2cc942eb..be55346b6dd1 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -183,8 +183,8 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, + } + } + +-static bool bch2_btree_node_upgrade(struct btree_trans *trans, +- struct btree_path *path, unsigned level) ++bool bch2_btree_node_upgrade(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) + { + struct btree *b = path->l[level].b; + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 61bbb7bc54b3..2dc588283252 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -166,6 +166,9 @@ static inline int btree_trans_restart(struct btree_trans *trans) + return -EINTR; + } + ++bool bch2_btree_node_upgrade(struct btree_trans *, ++ struct btree_path *, unsigned); ++ + bool __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index c405466733e2..5cddb572a55a 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -507,6 +507,15 @@ err: + return ret; + } + ++static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path) ++{ ++ unsigned l; ++ ++ for (l = 0; l < BTREE_MAX_DEPTH; l++) ++ if (btree_node_read_locked(path, l)) ++ BUG_ON(!bch2_btree_node_upgrade(trans, path, l)); ++} ++ + static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) + { + struct btree *b = path_l(path)->b; +@@ -514,7 +523,7 @@ static inline void upgrade_readers(struct btree_trans *trans, struct btree_path + do { + if (path->nodes_locked && + path->nodes_locked != path->nodes_intent_locked) +- BUG_ON(!bch2_btree_path_upgrade(trans, path, path->level + 1)); ++ path_upgrade_readers(trans, path); + } while ((path = prev_btree_path(trans, path)) && + path_l(path)->b == b); + } +-- +cgit v1.2.3 + + +From 0f23c8521f4c49a529f92aef07533572ca1ef12a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Nov 2021 17:23:03 -0400 +Subject: bcachefs: Fix trans_lock_write() + +On failure to get a write lock (because we had a conflicting read lock), +we need to make sure to upgrade the read lock to an intent lock - or we +could end up spinning. 
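As a rough illustration of the livelock the message above describes, here is a minimal userspace sketch: a path that holds a plain read lock on a node can block a writer indefinitely, so any such lock has to be upgraded to an intent lock (or the transaction restarted). The struct and helper names below are hypothetical, not the bcachefs API.

#include <stdbool.h>
#include <stdio.h>

struct path_state {
        unsigned nodes_locked;          /* a bit per btree level with any lock held */
        unsigned nodes_intent_locked;   /* a bit per level with an intent lock held */
};

static bool has_read_only_locks(const struct path_state *p)
{
        /* a level locked for read but not intent can block a writer forever */
        return (p->nodes_locked & ~p->nodes_intent_locked) != 0;
}

static bool upgrade_read_locks(struct path_state *p)
{
        /* pretend the upgrade always succeeds; the real code can fail and
         * must then restart the transaction instead of spinning */
        p->nodes_intent_locked = p->nodes_locked;
        return true;
}

int main(void)
{
        struct path_state p = { .nodes_locked = 0x3, .nodes_intent_locked = 0x1 };

        if (has_read_only_locks(&p) && !upgrade_read_locks(&p))
                printf("conflicting read lock: restart the transaction\n");
        else
                printf("all held locks are intent locks; the writer can proceed\n");
        return 0;
}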
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5cddb572a55a..b9c777cca23e 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -569,7 +569,8 @@ static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct + //if (path == pos) + // break; + +- if (path->nodes_locked != path->nodes_intent_locked) ++ if (path->nodes_locked != path->nodes_intent_locked && ++ !bch2_btree_path_upgrade(trans, path, path->level + 1)) + return true; + } + +-- +cgit v1.2.3 + + +From ce86f029a6697c8c0ccf8d04f7aebdd155b5baa7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Nov 2021 17:23:49 -0400 +Subject: bcachefs: Improve error message in bch2_write_super() + +It's helpful to know what the superblock write is for. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 3903b730bba3..33d832bc4d4a 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -807,7 +807,8 @@ int bch2_write_super(struct bch_fs *c) + !can_mount_with_written || + (can_mount_without_written && + !can_mount_with_written), c, +- "Unable to write superblock to sufficient devices")) ++ "Unable to write superblock to sufficient devices (from %ps)", ++ (void *) _RET_IP_)) + ret = -1; + out: + /* Make new options visible after they're persistent: */ +-- +cgit v1.2.3 + + +From d925633919ea9054ea53c015999f5f6d4ebf8c59 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Nov 2021 20:25:35 -0400 +Subject: bcachefs: Fix check_inodes() + +We were starting at the wrong btree position, and thus not actually +checking any inodes - oops. + +Also, make check_key_has_snapshot() a mustfix fsck error, since later +fsck code assumes that all keys have valid snapshot IDs. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 5bc04c7bbb83..8a9cfccf5ee8 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -669,7 +669,7 @@ static int check_key_has_snapshot(struct btree_trans *trans, + char buf[200]; + int ret = 0; + +- if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, ++ if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, +@@ -918,8 +918,7 @@ static int check_inodes(struct bch_fs *c, bool full) + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, +- POS(BCACHEFS_ROOT_INO, 0), ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); +-- +cgit v1.2.3 + + +From 57c75a5263ffa8f9e5552a5681ec8730f8924690 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Nov 2021 21:22:46 -0400 +Subject: bcachefs: Fix __remove_dirent() + +__lookup_inode() doesn't work for what __remove_dirent() wants - it just +wants the first inode at a given inode number, they all have the same +hash info. 
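A minimal userspace sketch of the "first inode at this inode number" idea: with snapshots there can be many versions of an inode under one number, all sharing the same hash seed, so the first match is sufficient for removing a dirent. The types, values and simplified signature here are illustrative only, not the on-disk format or the real helper.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct inode_key { uint64_t inum; uint32_t snapshot; uint64_t hash_seed; };

/* toy table standing in for the inodes btree, sorted by (inum, snapshot) */
static const struct inode_key keys[] = {
        { 4096, 1, 0xabc }, { 4097, 1, 0xdef }, { 4097, 7, 0xdef }, { 4098, 3, 0x123 },
};

static const struct inode_key *lookup_first_inode(uint64_t inum)
{
        for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
                if (keys[i].inum == inum)
                        return &keys[i];        /* any snapshot version will do */
        return NULL;
}

int main(void)
{
        const struct inode_key *k = lookup_first_inode(4097);

        if (k)
                printf("inum %llu snapshot %u hash_seed %llx\n",
                       (unsigned long long) k->inum, k->snapshot,
                       (unsigned long long) k->hash_seed);
        return 0;
}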
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 31 ++++++++++++++++++++++++++++++- + 1 file changed, 30 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 8a9cfccf5ee8..9519ced976f2 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -113,6 +113,35 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, + return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); + } + ++static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ POS(0, inode_nr), ++ BTREE_ITER_ALL_SNAPSHOTS); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); ++err: ++ if (ret && ret != -EINTR) ++ bch_err(trans->c, "error %i fetching inode %llu", ++ ret, inode_nr); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ + static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) +@@ -272,7 +301,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) + struct bch_hash_info dir_hash_info; + int ret; + +- ret = __lookup_inode(trans, pos.inode, &dir_inode, NULL); ++ ret = lookup_first_inode(trans, pos.inode, &dir_inode); + if (ret) + return ret; + +-- +cgit v1.2.3 + + +From 99787111309002c656a4c04b818dc0421b5ef352 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Nov 2021 22:35:34 -0400 +Subject: bcachefs: Update inode on every write + +This is going to be a performance regression until we get the btree key +cache re-enabled - but it's needed for fixing fsync. Upcoming patches +will record the journal_seq an inode was updated at in the inode itself. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 67 +++++++++++++++++++++++++++----------------------------- + 1 file changed, 32 insertions(+), 35 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 017fc689801a..701e9d0eab4f 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -269,6 +269,8 @@ int bch2_extent_update(struct btree_trans *trans, + { + /* this must live until after bch2_trans_commit(): */ + struct bkey_inode_buf inode_p; ++ struct btree_iter inode_iter; ++ struct bch_inode_unpacked inode_u; + struct bpos next_pos; + bool extending = false, usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; +@@ -313,49 +315,44 @@ int bch2_extent_update(struct btree_trans *trans, + ? min(k->k.p.offset << 9, new_i_size) + : 0; + +- if (i_sectors_delta || new_i_size) { +- struct btree_iter inode_iter; +- struct bch_inode_unpacked inode_u; +- +- ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, +- BTREE_ITER_INTENT); +- if (ret) +- return ret; ++ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, ++ BTREE_ITER_INTENT); ++ if (ret) ++ return ret; + +- /* +- * XXX: +- * writeback can race a bit with truncate, because truncate +- * first updates the inode then truncates the pagecache. This is +- * ugly, but lets us preserve the invariant that the in memory +- * i_size is always >= the on disk i_size. 
+- * +- BUG_ON(new_i_size > inode_u.bi_size && +- (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); +- */ +- BUG_ON(new_i_size > inode_u.bi_size && !extending); ++ /* ++ * XXX: ++ * writeback can race a bit with truncate, because truncate ++ * first updates the inode then truncates the pagecache. This is ++ * ugly, but lets us preserve the invariant that the in memory ++ * i_size is always >= the on disk i_size. ++ * ++ BUG_ON(new_i_size > inode_u.bi_size && ++ (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); ++ */ ++ BUG_ON(new_i_size > inode_u.bi_size && !extending); + +- if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && +- new_i_size > inode_u.bi_size) +- inode_u.bi_size = new_i_size; +- else +- new_i_size = 0; ++ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ new_i_size > inode_u.bi_size) ++ inode_u.bi_size = new_i_size; ++ else ++ new_i_size = 0; + +- inode_u.bi_sectors += i_sectors_delta; ++ inode_u.bi_sectors += i_sectors_delta; + +- if (i_sectors_delta || new_i_size) { +- bch2_inode_pack(trans->c, &inode_p, &inode_u); ++ if (i_sectors_delta || new_i_size) { ++ bch2_inode_pack(trans->c, &inode_p, &inode_u); + +- inode_p.inode.k.p.snapshot = iter->snapshot; ++ inode_p.inode.k.p.snapshot = iter->snapshot; + +- ret = bch2_trans_update(trans, &inode_iter, +- &inode_p.inode.k_i, 0); +- } ++ ret = bch2_trans_update(trans, &inode_iter, ++ &inode_p.inode.k_i, 0); ++ } + +- bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(trans, &inode_iter); + +- if (ret) +- return ret; +- } ++ if (ret) ++ return ret; + + next_pos = k->k.p; + +-- +cgit v1.2.3 + + +From 3832d6dd58f760053f8610acfb9ad5115b716522 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 29 Oct 2021 21:14:23 -0400 +Subject: bcachefs: Add journal_seq to inode & alloc keys + +Add fields to inode & alloc keys that record the journal sequence number +when they were most recently modified. + +For alloc keys, this is needed to know what journal sequence number we +have to flush before the bucket can be reused. Currently this is tracked +in memory, but we'll be getting rid of the in memory bucket array. + +For inodes, this is needed for fsync when the inode has been evicted +from the vfs cache. Currently we use a bloom filter per outstanding +journal buf - but that mechanism has been broken since we added the +ability to not issue a flush/fua for every journal write. 
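A rough model of what the new field buys us for alloc keys, with purely illustrative names rather than the bcachefs API: every update stamps the key with the sequence number of the journal entry that carries it, and a bucket may only be reused once the journal has been flushed past that point.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct alloc_key { uint8_t gen; uint64_t journal_seq; };

static uint64_t journal_seq_current = 100;      /* seq of the open journal entry */
static uint64_t journal_seq_flushed = 90;       /* last seq known to be on disk */

static void update_alloc_key(struct alloc_key *a)
{
        a->gen++;
        a->journal_seq = journal_seq_current;   /* stamped by the insert trigger */
}

static bool bucket_reusable(const struct alloc_key *a)
{
        return a->journal_seq <= journal_seq_flushed;
}

int main(void)
{
        struct alloc_key a = { .gen = 3, .journal_seq = 80 };

        printf("before update: reusable=%d\n", bucket_reusable(&a));
        update_alloc_key(&a);
        printf("after update:  reusable=%d (flush journal to %llu first)\n",
               bucket_reusable(&a), (unsigned long long) a.journal_seq);
        return 0;
}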
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 76 +++++++++++++-- + fs/bcachefs/alloc_background.h | 26 +++-- + fs/bcachefs/bcachefs_format.h | 31 +++++- + fs/bcachefs/bkey_methods.c | 4 +- + fs/bcachefs/btree_types.h | 7 +- + fs/bcachefs/buckets.c | 41 ++++++-- + fs/bcachefs/fs.c | 2 +- + fs/bcachefs/fsck.c | 58 +++++------ + fs/bcachefs/inode.c | 211 ++++++++++++++++++++--------------------- + fs/bcachefs/inode.h | 17 +++- + fs/bcachefs/move.c | 4 +- + fs/bcachefs/quota.c | 5 +- + fs/bcachefs/recovery.c | 7 +- + 13 files changed, 305 insertions(+), 184 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 022c905dc8b4..b2735c8591d6 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -147,10 +147,44 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, + return 0; + } + +-static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, ++static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); ++ const u8 *in = a.v->data; ++ const u8 *end = bkey_val_end(a); ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v; ++ ++ out->gen = a.v->gen; ++ out->oldest_gen = a.v->oldest_gen; ++ out->data_type = a.v->data_type; ++ out->journal_seq = le64_to_cpu(a.v->journal_seq); ++ ++#define x(_name, _bits) \ ++ if (fieldnr < a.v->nr_fields) { \ ++ ret = bch2_varint_decode_fast(in, end, &v); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v = 0; \ ++ } \ ++ out->_name = v; \ ++ if (v != out->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++ return 0; ++} ++ ++static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) + { +- struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k); ++ struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k); + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + u8 *out = a->v.data; + u8 *end = (void *) &dst[1]; +@@ -161,6 +195,7 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, + a->v.gen = src.gen; + a->v.oldest_gen = src.oldest_gen; + a->v.data_type = src.data_type; ++ a->v.journal_seq = cpu_to_le64(src.journal_seq); + + #define x(_name, _bits) \ + nr_fields++; \ +@@ -194,10 +229,17 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) + .gen = 0, + }; + +- if (k.k->type == KEY_TYPE_alloc_v2) +- bch2_alloc_unpack_v2(&ret, k); +- else if (k.k->type == KEY_TYPE_alloc) ++ switch (k.k->type) { ++ case KEY_TYPE_alloc: + bch2_alloc_unpack_v1(&ret, k); ++ break; ++ case KEY_TYPE_alloc_v2: ++ bch2_alloc_unpack_v2(&ret, k); ++ break; ++ case KEY_TYPE_alloc_v3: ++ bch2_alloc_unpack_v3(&ret, k); ++ break; ++ } + + return ret; + } +@@ -206,7 +248,7 @@ void bch2_alloc_pack(struct bch_fs *c, + struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) + { +- bch2_alloc_pack_v2(dst, src); ++ bch2_alloc_pack_v3(dst, src); + } + + static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) +@@ -249,13 +291,28 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) + return NULL; + } + ++const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_alloc_unpacked u; ++ ++ if (k.k->p.inode >= c->sb.nr_devices || ++ !c->devs[k.k->p.inode]) ++ return "invalid device"; ++ ++ if (bch2_alloc_unpack_v3(&u, k)) ++ return "unpack error"; ++ ++ return NULL; ++} ++ + void bch2_alloc_to_text(struct printbuf *out, struct 
bch_fs *c, + struct bkey_s_c k) + { + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + +- pr_buf(out, "gen %u oldest_gen %u data_type %s", +- u.gen, u.oldest_gen, bch2_data_types[u.data_type]); ++ pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", ++ u.gen, u.oldest_gen, bch2_data_types[u.data_type], ++ u.journal_seq); + #define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); + BCH_ALLOC_FIELDS_V2() + #undef x +@@ -268,8 +325,7 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) + struct bucket *g; + struct bkey_alloc_unpacked u; + +- if (k.k->type != KEY_TYPE_alloc && +- k.k->type != KEY_TYPE_alloc_v2) ++ if (!bkey_is_alloc(k.k)) + return 0; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index a4f6bf56b18f..370573f8e05d 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -9,6 +9,7 @@ + extern const char * const bch2_allocator_states[]; + + struct bkey_alloc_unpacked { ++ u64 journal_seq; + u64 bucket; + u8 dev; + u8 gen; +@@ -21,19 +22,11 @@ struct bkey_alloc_unpacked { + + struct bkey_alloc_buf { + struct bkey_i k; ++ struct bch_alloc_v3 v; + +- union { +- struct { + #define x(_name, _bits) + _bits / 8 +- u8 _pad[8 + BCH_ALLOC_FIELDS_V1()]; ++ u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; + #undef x +- } _v1; +- struct { +-#define x(_name, _bits) + 8 + _bits / 8 +- u8 _pad[8 + BCH_ALLOC_FIELDS_V2()]; +-#undef x +- } _v2; +- }; + } __attribute__((packed, aligned(8))); + + /* How out of date a pointer gen is allowed to be: */ +@@ -79,6 +72,7 @@ alloc_mem_to_key(struct btree_iter *iter, + + const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); + const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); ++const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_alloc (struct bkey_ops) { \ +@@ -91,6 +85,18 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + .val_to_text = bch2_alloc_to_text, \ + } + ++#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v3_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++} ++ ++static inline bool bkey_is_alloc(const struct bkey *k) ++{ ++ return k->type == KEY_TYPE_alloc || ++ k->type == KEY_TYPE_alloc_v2 || ++ k->type == KEY_TYPE_alloc_v3; ++} ++ + int bch2_alloc_read(struct bch_fs *); + + static inline void bch2_wake_allocator(struct bch_dev *ca) +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 9b1be7146c1c..52212ad1682f 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -344,7 +344,9 @@ static inline void bkey_init(struct bkey *k) + x(indirect_inline_data, 19) \ + x(alloc_v2, 20) \ + x(subvolume, 21) \ +- x(snapshot, 22) ++ x(snapshot, 22) \ ++ x(inode_v2, 23) \ ++ x(alloc_v3, 24) + + enum bch_bkey_type { + #define x(name, nr) KEY_TYPE_##name = nr, +@@ -681,6 +683,16 @@ struct bch_inode { + __u8 fields[0]; + } __attribute__((packed, aligned(8))); + ++struct bch_inode_v2 { ++ struct bch_val v; ++ ++ __le64 bi_journal_seq; ++ __le64 bi_hash_seed; ++ __le64 bi_flags; ++ __le16 bi_mode; ++ __u8 fields[0]; ++} __attribute__((packed, aligned(8))); ++ + struct bch_inode_generation { + struct bch_val v; + +@@ -772,6 +784,9 @@ LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); + 
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); + LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); + ++LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); ++LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); ++ + /* Dirents */ + + /* +@@ -866,6 +881,17 @@ struct bch_alloc_v2 { + x(stripe, 32) \ + x(stripe_redundancy, 8) + ++struct bch_alloc_v3 { ++ struct bch_val v; ++ __le64 journal_seq; ++ __le32 flags; ++ __u8 nr_fields; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ + enum { + #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +@@ -1272,7 +1298,8 @@ enum bcachefs_metadata_version { + bcachefs_metadata_version_snapshot_2 = 15, + bcachefs_metadata_version_reflink_p_fix = 16, + bcachefs_metadata_version_subvol_dirent = 17, +- bcachefs_metadata_version_max = 18, ++ bcachefs_metadata_version_inode_v2 = 18, ++ bcachefs_metadata_version_max = 19, + }; + + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 874defd8aff8..5c900cf8a8a2 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -113,6 +113,7 @@ static unsigned bch2_key_types_allowed[] = { + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_inode)| ++ (1U << KEY_TYPE_inode_v2)| + (1U << KEY_TYPE_inode_generation), + [BKEY_TYPE_dirents] = + (1U << KEY_TYPE_deleted)| +@@ -128,7 +129,8 @@ static unsigned bch2_key_types_allowed[] = { + [BKEY_TYPE_alloc] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_alloc)| +- (1U << KEY_TYPE_alloc_v2), ++ (1U << KEY_TYPE_alloc_v2)| ++ (1U << KEY_TYPE_alloc_v3), + [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_quota), +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 26aa3cd182d5..affc0e681de9 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -605,6 +605,7 @@ static inline bool btree_node_is_extents(struct btree *b) + + #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ + ((1U << BKEY_TYPE_alloc)| \ ++ (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_snapshots)) + +@@ -652,8 +653,12 @@ enum btree_update_flags { + #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + + #define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ +- ((1U << KEY_TYPE_stripe)| \ ++ ((1U << KEY_TYPE_alloc)| \ ++ (1U << KEY_TYPE_alloc_v2)| \ ++ (1U << KEY_TYPE_alloc_v3)| \ ++ (1U << KEY_TYPE_stripe)| \ + (1U << KEY_TYPE_inode)| \ ++ (1U << KEY_TYPE_inode_v2)| \ + (1U << KEY_TYPE_snapshot)) + + static inline bool btree_node_type_needs_gc(enum btree_node_type type) +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 699a0865c9b8..a762a31be35d 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -13,6 +13,7 @@ + #include "buckets.h" + #include "ec.h" + #include "error.h" ++#include "inode.h" + #include "movinggc.h" + #include "recovery.h" + #include "reflink.h" +@@ -537,8 +538,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, + struct bucket_mark old_m, m; + + /* We don't do anything for deletions - do we?: */ +- if (new.k->type != KEY_TYPE_alloc && +- new.k->type != KEY_TYPE_alloc_v2) ++ if (!bkey_is_alloc(new.k)) + return 0; + + /* +@@ -548,6 +548,15 @@ static int bch2_mark_alloc(struct btree_trans *trans, + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) + return 0; + ++ if (flags & 
BTREE_TRIGGER_INSERT) { ++ struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; ++ ++ BUG_ON(!journal_seq); ++ BUG_ON(new.k->type != KEY_TYPE_alloc_v3); ++ ++ v->journal_seq = cpu_to_le64(journal_seq); ++ } ++ + ca = bch_dev_bkey_exists(c, new.k->p.inode); + + if (new.k->p.offset >= ca->mi.nbuckets) +@@ -1091,12 +1100,24 @@ static int bch2_mark_inode(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct bch_fs_usage __percpu *fs_usage; ++ u64 journal_seq = trans->journal_res.seq; + +- preempt_disable(); +- fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); +- fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; +- fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; +- preempt_enable(); ++ if (flags & BTREE_TRIGGER_INSERT) { ++ struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v; ++ ++ BUG_ON(!journal_seq); ++ BUG_ON(new.k->type != KEY_TYPE_inode_v2); ++ ++ v->bi_journal_seq = cpu_to_le64(journal_seq); ++ } ++ ++ if (flags & BTREE_TRIGGER_GC) { ++ preempt_disable(); ++ fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); ++ fs_usage->nr_inodes += bkey_is_inode(new.k); ++ fs_usage->nr_inodes -= bkey_is_inode(old.k); ++ preempt_enable(); ++ } + return 0; + } + +@@ -1215,6 +1236,7 @@ static int bch2_mark_key_locked(struct btree_trans *trans, + switch (k.k->type) { + case KEY_TYPE_alloc: + case KEY_TYPE_alloc_v2: ++ case KEY_TYPE_alloc_v3: + return bch2_mark_alloc(trans, old, new, flags); + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: +@@ -1224,6 +1246,7 @@ static int bch2_mark_key_locked(struct btree_trans *trans, + case KEY_TYPE_stripe: + return bch2_mark_stripe(trans, old, new, flags); + case KEY_TYPE_inode: ++ case KEY_TYPE_inode_v2: + return bch2_mark_inode(trans, old, new, flags); + case KEY_TYPE_reservation: + return bch2_mark_reservation(trans, old, new, flags); +@@ -1680,8 +1703,7 @@ static int bch2_trans_mark_inode(struct btree_trans *trans, + struct bkey_s_c new, + unsigned flags) + { +- int nr = (new.k->type == KEY_TYPE_inode) - +- (old.k->type == KEY_TYPE_inode); ++ int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); + + if (nr) { + struct replicas_delta_list *d = +@@ -1829,6 +1851,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, + case KEY_TYPE_stripe: + return bch2_trans_mark_stripe(trans, old, new, flags); + case KEY_TYPE_inode: ++ case KEY_TYPE_inode_v2: + return bch2_trans_mark_inode(trans, old, new, flags); + case KEY_TYPE_reservation: + return bch2_trans_mark_reservation(trans, k, flags); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 316cb76e24b6..382fa3b9d4dd 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1206,7 +1206,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, + inode->v.i_size = bi->bi_size; + + inode->ei_flags = 0; +- inode->ei_journal_seq = 0; ++ inode->ei_journal_seq = bi->bi_journal_seq; + inode->ei_quota_reserved = 0; + inode->ei_qid = bch_qid(bi); + inode->ei_subvol = inum.subvol; +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 9519ced976f2..361dbf338023 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -133,7 +133,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + goto err; + } + +- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); ++ ret = bch2_inode_unpack(k, inode); + err: + if (ret && ret != -EINTR) + bch_err(trans->c, "error %i fetching inode %llu", +@@ -157,8 +157,8 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + if (ret) 
+ goto err; + +- ret = k.k->type == KEY_TYPE_inode +- ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) ++ ret = bkey_is_inode(k.k) ++ ? bch2_inode_unpack(k, inode) + : -ENOENT; + if (!ret) + *snapshot = iter.pos.snapshot; +@@ -261,7 +261,7 @@ retry: + if (ret) + goto err; + +- if (k.k->type != KEY_TYPE_inode) { ++ if (!bkey_is_inode(k.k)) { + bch2_fs_inconsistent(trans->c, + "inode %llu:%u not found when deleting", + inum, snapshot); +@@ -269,7 +269,7 @@ retry: + goto err; + } + +- bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); ++ bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? */ + if (inode_u.bi_subvol) { +@@ -581,7 +581,7 @@ static int inode_walker_realloc(struct inode_walker *w) + } + + static int add_inode(struct bch_fs *c, struct inode_walker *w, +- struct bkey_s_c_inode inode) ++ struct bkey_s_c inode) + { + struct bch_inode_unpacked u; + int ret; +@@ -623,8 +623,8 @@ static int __walk_inode(struct btree_trans *trans, + if (k.k->p.offset != pos.inode) + break; + +- if (k.k->type == KEY_TYPE_inode) +- add_inode(c, w, bkey_s_c_to_inode(k)); ++ if (bkey_is_inode(k.k)) ++ add_inode(c, w, k); + } + bch2_trans_iter_exit(trans, &iter); + +@@ -676,11 +676,11 @@ static int __get_visible_inodes(struct btree_trans *trans, + if (k.k->p.offset != inum) + break; + +- if (k.k->type != KEY_TYPE_inode) ++ if (!bkey_is_inode(k.k)) + continue; + + if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) { +- add_inode(c, w, bkey_s_c_to_inode(k)); ++ add_inode(c, w, k); + if (k.k->p.snapshot >= s->pos.snapshot) + break; + } +@@ -805,7 +805,6 @@ static int check_inode(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct bkey_s_c k; +- struct bkey_s_c_inode inode; + struct bch_inode_unpacked u; + bool do_update = false; + int ret; +@@ -830,19 +829,17 @@ static int check_inode(struct btree_trans *trans, + if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) + return 0; + +- if (k.k->type != KEY_TYPE_inode) ++ if (!bkey_is_inode(k.k)) + return 0; + +- inode = bkey_s_c_to_inode(k); ++ BUG_ON(bch2_inode_unpack(k, &u)); + + if (!full && +- !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| +- BCH_INODE_I_SECTORS_DIRTY| +- BCH_INODE_UNLINKED))) ++ !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| ++ BCH_INODE_I_SECTORS_DIRTY| ++ BCH_INODE_UNLINKED))) + return 0; + +- BUG_ON(bch2_inode_unpack(inode, &u)); +- + if (prev->bi_inum != u.bi_inum) + *prev = u; + +@@ -1963,10 +1960,10 @@ static int check_directory_structure(struct bch_fs *c) + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { +- if (k.k->type != KEY_TYPE_inode) ++ if (!bkey_is_inode(k.k)) + continue; + +- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); ++ ret = bch2_inode_unpack(k, &u); + if (ret) { + /* Should have been caught earlier in fsck: */ + bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); +@@ -2070,7 +2067,6 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; +- struct bkey_s_c_inode inode; + struct bch_inode_unpacked u; + int ret = 0; + +@@ -2081,21 +2077,19 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { +- if (k.k->type != KEY_TYPE_inode) ++ if (!bkey_is_inode(k.k)) + continue; + +- inode = bkey_s_c_to_inode(k); ++ /* Should never fail, checked by bch2_inode_invalid: */ ++ BUG_ON(bch2_inode_unpack(k, &u)); + + /* + * Backpointer and directory structure checks are sufficient for + * directories, since 
they can't have hardlinks: + */ +- if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) ++ if (S_ISDIR(le16_to_cpu(u.bi_mode))) + continue; + +- /* Should never fail, checked by bch2_inode_invalid: */ +- BUG_ON(bch2_inode_unpack(inode, &u)); +- + if (!u.bi_nlink) + continue; + +@@ -2169,7 +2163,6 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; +- struct bkey_s_c_inode inode; + struct bch_inode_unpacked u; + struct nlink *link = links->d; + int ret = 0; +@@ -2184,14 +2177,13 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, + if (k.k->p.offset >= range_end) + break; + +- if (k.k->type != KEY_TYPE_inode) ++ if (!bkey_is_inode(k.k)) + continue; + +- inode = bkey_s_c_to_inode(k); +- if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) +- continue; ++ BUG_ON(bch2_inode_unpack(k, &u)); + +- BUG_ON(bch2_inode_unpack(inode, &u)); ++ if (S_ISDIR(le16_to_cpu(u.bi_mode))) ++ continue; + + if (!u.bi_nlink) + continue; +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 462c1f43ae96..ef1866a7e96f 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -35,29 +35,6 @@ static const u8 bits_table[8] = { + 13 * 8 - 8, + }; + +-static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) +-{ +- __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; +- unsigned shift, bytes, bits = likely(!hi) +- ? fls64(lo) +- : fls64(hi) + 64; +- +- for (shift = 1; shift <= 8; shift++) +- if (bits < bits_table[shift - 1]) +- goto got_shift; +- +- BUG(); +-got_shift: +- bytes = byte_table[shift - 1]; +- +- BUG_ON(out + bytes > end); +- +- memcpy(out, (u8 *) in + 16 - bytes, bytes); +- *out |= (1 << 8) >> shift; +- +- return bytes; +-} +- + static int inode_decode_field(const u8 *in, const u8 *end, + u64 out[2], unsigned *out_bits) + { +@@ -92,42 +69,11 @@ static int inode_decode_field(const u8 *in, const u8 *end, + return bytes; + } + +-static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed, +- const struct bch_inode_unpacked *inode) +-{ +- struct bkey_i_inode *k = &packed->inode; +- u8 *out = k->v.fields; +- u8 *end = (void *) &packed[1]; +- u8 *last_nonzero_field = out; +- unsigned nr_fields = 0, last_nonzero_fieldnr = 0; +- unsigned bytes; +- +-#define x(_name, _bits) \ +- out += inode_encode_field(out, end, 0, inode->_name); \ +- nr_fields++; \ +- \ +- if (inode->_name) { \ +- last_nonzero_field = out; \ +- last_nonzero_fieldnr = nr_fields; \ +- } +- +- BCH_INODE_FIELDS() +-#undef x +- +- out = last_nonzero_field; +- nr_fields = last_nonzero_fieldnr; +- +- bytes = out - (u8 *) &packed->inode.v; +- set_bkey_val_bytes(&packed->inode.k, bytes); +- memset_u64s_tail(&packed->inode.v, 0, bytes); +- +- SET_INODE_NR_FIELDS(&k->v, nr_fields); +-} +- +-static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, +- const struct bch_inode_unpacked *inode) ++void bch2_inode_pack(struct bch_fs *c, ++ struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) + { +- struct bkey_i_inode *k = &packed->inode; ++ struct bkey_i_inode_v2 *k = &packed->inode; + u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; +@@ -135,6 +81,14 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, + unsigned bytes; + int ret; + ++ bkey_inode_v2_init(&packed->inode.k_i); ++ packed->inode.k.p.offset = inode->bi_inum; ++ packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); ++ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; ++ packed->inode.v.bi_flags = 
cpu_to_le64(inode->bi_flags); ++ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); ++ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ + #define x(_name, _bits) \ + nr_fields++; \ + \ +@@ -165,30 +119,12 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + +- SET_INODE_NR_FIELDS(&k->v, nr_fields); +-} +- +-void bch2_inode_pack(struct bch_fs *c, +- struct bkey_inode_buf *packed, +- const struct bch_inode_unpacked *inode) +-{ +- bkey_inode_init(&packed->inode.k_i); +- packed->inode.k.p.offset = inode->bi_inum; +- packed->inode.v.bi_hash_seed = inode->bi_hash_seed; +- packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); +- packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); +- +- if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) { +- SET_INODE_NEW_VARINT(&packed->inode.v, true); +- bch2_inode_pack_v2(packed, inode); +- } else { +- bch2_inode_pack_v1(packed, inode); +- } ++ SET_INODEv2_NR_FIELDS(&k->v, nr_fields); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bch_inode_unpacked unpacked; + +- int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), ++ int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), + &unpacked); + BUG_ON(ret); + BUG_ON(unpacked.bi_inum != inode->bi_inum); +@@ -237,17 +173,16 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, + return 0; + } + +-static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, +- struct bch_inode_unpacked *unpacked) ++static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, ++ const u8 *in, const u8 *end, ++ unsigned nr_fields) + { +- const u8 *in = inode.v->fields; +- const u8 *end = bkey_val_end(inode); + unsigned fieldnr = 0; + int ret; + u64 v[2]; + + #define x(_name, _bits) \ +- if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ ++ if (fieldnr < nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ +@@ -277,21 +212,43 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, + return 0; + } + +-int bch2_inode_unpack(struct bkey_s_c_inode inode, ++int bch2_inode_unpack(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) + { +- unpacked->bi_inum = inode.k->p.offset; +- unpacked->bi_hash_seed = inode.v->bi_hash_seed; +- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); +- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); +- +- if (INODE_NEW_VARINT(inode.v)) { +- return bch2_inode_unpack_v2(inode, unpacked); +- } else { +- return bch2_inode_unpack_v1(inode, unpacked); ++ switch (k.k->type) { ++ case KEY_TYPE_inode: { ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++ if (INODE_NEW_VARINT(inode.v)) { ++ return bch2_inode_unpack_v2(unpacked, inode.v->fields, ++ bkey_val_end(inode), ++ INODE_NR_FIELDS(inode.v)); ++ } else { ++ return bch2_inode_unpack_v1(inode, unpacked); ++ } ++ break; ++ } ++ case KEY_TYPE_inode_v2: { ++ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++ return bch2_inode_unpack_v2(unpacked, 
inode.v->fields, ++ bkey_val_end(inode), ++ INODEv2_NR_FIELDS(inode.v)); ++ } ++ default: ++ BUG(); + } +- +- return 0; + } + + int bch2_inode_peek(struct btree_trans *trans, +@@ -317,11 +274,11 @@ int bch2_inode_peek(struct btree_trans *trans, + if (ret) + goto err; + +- ret = k.k->type == KEY_TYPE_inode ? 0 : -ENOENT; ++ ret = bkey_is_inode(k.k) ? 0 : -ENOENT; + if (ret) + goto err; + +- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); ++ ret = bch2_inode_unpack(k, inode); + if (ret) + goto err; + +@@ -363,7 +320,43 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) + if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) + return "invalid str hash type"; + +- if (bch2_inode_unpack(inode, &unpacked)) ++ if (bch2_inode_unpack(k, &unpacked)) ++ return "invalid variable length fields"; ++ ++ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) ++ return "invalid data checksum type"; ++ ++ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) ++ return "invalid data checksum type"; ++ ++ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && ++ unpacked.bi_nlink != 0) ++ return "flagged as unlinked but bi_nlink != 0"; ++ ++ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) ++ return "subvolume root but not a directory"; ++ ++ return NULL; ++} ++ ++const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); ++ struct bch_inode_unpacked unpacked; ++ ++ if (k.k->p.inode) ++ return "nonzero k.p.inode"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) ++ return "incorrect value size"; ++ ++ if (k.k->p.offset < BLOCKDEV_INODE_MAX) ++ return "fs inode in blockdev range"; ++ ++ if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) ++ return "invalid str hash type"; ++ ++ if (bch2_inode_unpack(k, &unpacked)) + return "invalid variable length fields"; + + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) +@@ -384,10 +377,12 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) + + static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) + { +- pr_buf(out, "mode %o flags %x ", inode->bi_mode, inode->bi_flags); ++ pr_buf(out, "mode %o flags %x journal_seq %llu", ++ inode->bi_mode, inode->bi_flags, ++ inode->bi_journal_seq); + + #define x(_name, _bits) \ +- pr_buf(out, #_name " %llu ", (u64) inode->_name); ++ pr_buf(out, " "#_name " %llu", (u64) inode->_name); + BCH_INODE_FIELDS() + #undef x + } +@@ -401,15 +396,14 @@ void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked + void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) + { +- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); +- struct bch_inode_unpacked unpacked; ++ struct bch_inode_unpacked inode; + +- if (bch2_inode_unpack(inode, &unpacked)) { ++ if (bch2_inode_unpack(k, &inode)) { + pr_buf(out, "(unpack error)"); + return; + } + +- __bch2_inode_unpacked_to_text(out, &unpacked); ++ __bch2_inode_unpacked_to_text(out, &inode); + } + + const char *bch2_inode_generation_invalid(const struct bch_fs *c, +@@ -485,6 +479,7 @@ static inline u32 bkey_generation(struct bkey_s_c k) + { + switch (k.k->type) { + case KEY_TYPE_inode: ++ case KEY_TYPE_inode_v2: + BUG(); + case KEY_TYPE_inode_generation: + return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); +@@ -542,7 +537,7 @@ again: + } + + if (k.k->p.snapshot == snapshot && +- k.k->type != KEY_TYPE_inode && ++ !bkey_is_inode(k.k) && + 
!bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { + bch2_btree_iter_advance(iter); + continue; +@@ -585,7 +580,7 @@ found_slot: + } + + /* We may have raced while the iterator wasn't pointing at pos: */ +- if (k.k->type == KEY_TYPE_inode || ++ if (bkey_is_inode(k.k) || + bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) + goto again; + +@@ -698,7 +693,7 @@ retry: + if (ret) + goto err; + +- if (k.k->type != KEY_TYPE_inode) { ++ if (!bkey_is_inode(k.k)) { + bch2_fs_inconsistent(trans.c, + "inode %llu not found when deleting", + inum.inum); +@@ -706,7 +701,7 @@ retry: + goto err; + } + +- bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); ++ bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? */ + BUG_ON(inode_u.bi_subvol); +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 009b807cc167..d433d48de4e0 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -7,6 +7,7 @@ + extern const char * const bch2_inode_opts[]; + + const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); ++const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_inode (struct bkey_ops) { \ +@@ -14,6 +15,17 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + .val_to_text = bch2_inode_to_text, \ + } + ++#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_v2_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++} ++ ++static inline bool bkey_is_inode(const struct bkey *k) ++{ ++ return k->type == KEY_TYPE_inode || ++ k->type == KEY_TYPE_inode_v2; ++} ++ + const char *bch2_inode_generation_invalid(const struct bch_fs *, + struct bkey_s_c); + void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, +@@ -34,6 +46,7 @@ typedef u64 u96; + + struct bch_inode_unpacked { + u64 bi_inum; ++ u64 bi_journal_seq; + __le64 bi_hash_seed; + u32 bi_flags; + u16 bi_mode; +@@ -44,7 +57,7 @@ struct bch_inode_unpacked { + }; + + struct bkey_inode_buf { +- struct bkey_i_inode inode; ++ struct bkey_i_inode_v2 inode; + + #define x(_name, _bits) + 8 + _bits / 8 + u8 _pad[0 + BCH_INODE_FIELDS()]; +@@ -53,7 +66,7 @@ struct bkey_inode_buf { + + void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, + const struct bch_inode_unpacked *); +-int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); ++int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); + + void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index d0c784012e88..0152fbcde3c2 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -623,11 +623,11 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, + goto err; + } + +- ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; ++ ret = bkey_is_inode(k.k) ? 
0 : -EIO; + if (ret) + goto err; + +- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); ++ ret = bch2_inode_unpack(k, inode); + if (ret) + goto err; + err: +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 17fd5bf107bb..5f1216da76d0 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -439,9 +439,8 @@ int bch2_fs_quota_read(struct bch_fs *c) + + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { +- switch (k.k->type) { +- case KEY_TYPE_inode: +- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); ++ if (bkey_is_inode(k.k)) { ++ ret = bch2_inode_unpack(k, &u); + if (ret) + return ret; + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 29fae6dbce76..d8e511a0664e 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1015,13 +1015,13 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) + if (ret) + goto err; + +- if (k.k->type != KEY_TYPE_inode) { ++ if (!bkey_is_inode(k.k)) { + bch_err(c, "root inode not found"); + ret = -ENOENT; + goto err; + } + +- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode); ++ ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; +@@ -1093,6 +1093,9 @@ int bch2_fs_recovery(struct bch_fs *c) + bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); + c->opts.version_upgrade = true; + c->opts.fsck = true; ++ } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { ++ bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); ++ c->opts.version_upgrade = true; + } + } + +-- +cgit v1.2.3 + + +From 577298e1aaec09d58bd31bec3c692b20958d1d7a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 3 Nov 2021 22:33:32 -0400 +Subject: bcachefs: Kill journal buf bloom filter + +This was used for recording which inodes have been modified by in flight +journal writes, but was broken and has been superceded. 
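For context, a tiny userspace model of the bloom-filter scheme being removed here (hypothetical helpers, not the kernel's bitops): one bit per hashed inode number, so lookups can return false positives, and the information disappears as soon as the journal buf is written.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FILTER_BITS 1024
#define BITS_PER_WORD (8 * sizeof(unsigned long))

static unsigned long filter[FILTER_BITS / (8 * sizeof(unsigned long))];

/* single hash into 10 bits (0..1023) */
static unsigned hash_inum(uint64_t inum)
{
        return (inum * 0x9e3779b97f4a7c15ull) >> 54;
}

static void mark_inode_dirty(uint64_t inum)
{
        unsigned bit = hash_inum(inum);

        filter[bit / BITS_PER_WORD] |= 1ul << (bit % BITS_PER_WORD);
}

static bool maybe_dirty(uint64_t inum)
{
        unsigned bit = hash_inum(inum);

        return filter[bit / BITS_PER_WORD] & (1ul << (bit % BITS_PER_WORD));
}

int main(void)
{
        mark_inode_dirty(4096);
        printf("4096 maybe dirty: %d\n", maybe_dirty(4096)); /* always 1 */
        printf("8192 maybe dirty: %d\n", maybe_dirty(8192)); /* 0 unless its hash collides: collisions are false positives */
        return 0;
}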
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 3 --- + fs/bcachefs/fs.c | 4 ---- + fs/bcachefs/journal.c | 51 ----------------------------------------- + fs/bcachefs/journal.h | 13 ----------- + fs/bcachefs/journal_types.h | 2 -- + 5 files changed, 73 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index b9c777cca23e..51f65226d3bf 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -362,9 +362,6 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + i->level, + i->k); + +- bch2_journal_set_has_inode(j, &trans->journal_res, +- i->k->k.p.inode); +- + if (trans->journal_seq) + *trans->journal_seq = trans->journal_res.seq; + } +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 382fa3b9d4dd..5c07bbff56dc 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -58,8 +58,6 @@ static void journal_seq_copy(struct bch_fs *c, + if (old >= journal_seq) + break; + } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); +- +- bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); + } + + static void __pagecache_lock_put(struct pagecache_lock *lock, long i) +@@ -258,8 +256,6 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + + bch2_vfs_inode_init(c, inum, inode, &inode_u); + +- inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum); +- + unlock_new_inode(&inode->v); + + return &inode->v; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index ac4071fc4e80..a2b26d5b5236 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -88,8 +88,6 @@ static void bch2_journal_buf_init(struct journal *j) + buf->must_flush = false; + buf->separate_flush = false; + +- memset(buf->has_inode, 0, sizeof(buf->has_inode)); +- + memset(buf->data, 0, sizeof(*buf->data)); + buf->data->seq = cpu_to_le64(journal_cur_seq(j)); + buf->data->u64s = 0; +@@ -335,55 +333,6 @@ static void journal_write_work(struct work_struct *work) + journal_entry_close(j); + } + +-/* +- * Given an inode number, if that inode number has data in the journal that +- * hasn't yet been flushed, return the journal sequence number that needs to be +- * flushed: +- */ +-u64 bch2_inode_journal_seq(struct journal *j, u64 inode) +-{ +- size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); +- union journal_res_state s; +- unsigned i; +- u64 seq; +- +- +- spin_lock(&j->lock); +- seq = journal_cur_seq(j); +- s = READ_ONCE(j->reservations); +- i = s.idx; +- +- while (1) { +- if (test_bit(h, j->buf[i].has_inode)) +- goto out; +- +- if (i == s.unwritten_idx) +- break; +- +- i = (i - 1) & JOURNAL_BUF_MASK; +- seq--; +- } +- +- seq = 0; +-out: +- spin_unlock(&j->lock); +- +- return seq; +-} +- +-void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq) +-{ +- size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); +- struct journal_buf *buf; +- +- spin_lock(&j->lock); +- +- if ((buf = journal_seq_to_buf(j, seq))) +- set_bit(h, buf->has_inode); +- +- spin_unlock(&j->lock); +-} +- + static int __journal_res_get(struct journal *j, struct journal_res *res, + unsigned flags) + { +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 99fd253648bf..c39cbbf1bccd 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -141,7 +141,6 @@ static inline u64 journal_cur_seq(struct journal *j) + return j->pin.back - 1; + } + +-u64 bch2_inode_journal_seq(struct journal *, u64); + void 
bch2_journal_set_has_inum(struct journal *, u64, u64); + + static inline int journal_state_count(union journal_res_state s, int idx) +@@ -163,18 +162,6 @@ static inline void journal_state_inc(union journal_res_state *s) + s->buf3_count += s->idx == 3; + } + +-static inline void bch2_journal_set_has_inode(struct journal *j, +- struct journal_res *res, +- u64 inum) +-{ +- struct journal_buf *buf = &j->buf[res->idx]; +- unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); +- +- /* avoid atomic op if possible */ +- if (unlikely(!test_bit(bit, buf->has_inode))) +- set_bit(bit, buf->has_inode); +-} +- + /* + * Amount of space that will be taken up by some keys in the journal (i.e. + * including the jset header) +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index cc10e1d7895c..d484513289aa 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -34,8 +34,6 @@ struct journal_buf { + bool noflush; /* write has already been kicked off, and was noflush */ + bool must_flush; /* something wants a flush */ + bool separate_flush; +- /* bloom filter: */ +- unsigned long has_inode[1024 / sizeof(unsigned long)]; + }; + + /* +-- +cgit v1.2.3 + + +From d3cc926caa3148d9e31caa42fbb946684302bb93 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 4 Nov 2021 11:44:13 -0400 +Subject: bcachefs: Kill bucket quantiles sysfs code + +We're getting rid of code that uses the in memory bucket array - and we +now have better mechanisms for viewing most of what the bucket quantiles +code gave us (especially internal fragmentation). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/sysfs.c | 90 ----------------------------------------------------- + 1 file changed, 90 deletions(-) + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 51eb19b84a28..864be8601868 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -155,11 +155,6 @@ read_attribute(congested); + + read_attribute(btree_avg_write_size); + +-read_attribute(bucket_quantiles_last_read); +-read_attribute(bucket_quantiles_last_write); +-read_attribute(bucket_quantiles_fragmentation); +-read_attribute(bucket_quantiles_oldest_gen); +- + read_attribute(reserve_stats); + read_attribute(btree_cache_size); + read_attribute(compression_stats); +@@ -751,76 +746,6 @@ struct attribute *bch2_fs_time_stats_files[] = { + NULL + }; + +-typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, +- size_t, void *); +- +-static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, +- size_t b, void *private) +-{ +- int rw = (private ? 
1 : 0); +- +- return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; +-} +- +-static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, +- size_t b, void *private) +-{ +- struct bucket *g = bucket(ca, b); +- return bucket_sectors_used(g->mark); +-} +- +-static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, +- size_t b, void *private) +-{ +- return bucket_gc_gen(bucket(ca, b)); +-} +- +-static int unsigned_cmp(const void *_l, const void *_r) +-{ +- const unsigned *l = _l; +- const unsigned *r = _r; +- +- return cmp_int(*l, *r); +-} +- +-static int quantiles_to_text(struct printbuf *out, +- struct bch_fs *c, struct bch_dev *ca, +- bucket_map_fn *fn, void *private) +-{ +- size_t i, n; +- /* Compute 31 quantiles */ +- unsigned q[31], *p; +- +- down_read(&ca->bucket_lock); +- n = ca->mi.nbuckets; +- +- p = vzalloc(n * sizeof(unsigned)); +- if (!p) { +- up_read(&ca->bucket_lock); +- return -ENOMEM; +- } +- +- for (i = ca->mi.first_bucket; i < n; i++) +- p[i] = fn(c, ca, i, private); +- +- sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); +- up_read(&ca->bucket_lock); +- +- while (n && +- !p[n - 1]) +- --n; +- +- for (i = 0; i < ARRAY_SIZE(q); i++) +- q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; +- +- vfree(p); +- +- for (i = 0; i < ARRAY_SIZE(q); i++) +- pr_buf(out, "%u ", q[i]); +- pr_buf(out, "\n"); +- return 0; +-} +- + static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) + { + enum alloc_reserve i; +@@ -982,15 +907,6 @@ SHOW(bch2_dev) + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + +- if (attr == &sysfs_bucket_quantiles_last_read) +- return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; +- if (attr == &sysfs_bucket_quantiles_last_write) +- return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; +- if (attr == &sysfs_bucket_quantiles_fragmentation) +- return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; +- if (attr == &sysfs_bucket_quantiles_oldest_gen) +- return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; +- + if (attr == &sysfs_reserve_stats) { + reserve_stats_to_text(&out, ca); + return out.pos - buf; +@@ -1082,12 +998,6 @@ struct attribute *bch2_dev_files[] = { + &sysfs_io_latency_stats_write, + &sysfs_congested, + +- /* alloc info - other stats: */ +- &sysfs_bucket_quantiles_last_read, +- &sysfs_bucket_quantiles_last_write, +- &sysfs_bucket_quantiles_fragmentation, +- &sysfs_bucket_quantiles_oldest_gen, +- + &sysfs_reserve_stats, + + /* debug: */ +-- +cgit v1.2.3 + + +From 572bdc670113105434254f94cc10e42d98249347 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 5 Nov 2021 15:17:13 -0400 +Subject: bcachefs: Switch fsync to use bi_journal_seq + +Now that we're recording in each inode the journal sequence number of +the most recent update, fsync becomes a lot simpler and we can delete +all the plumbing for ei_journal_seq. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 3 +-- + fs/bcachefs/fs-io.c | 58 +++++++++++++++++++++++++-------------------------- + fs/bcachefs/fs.c | 52 +++++++-------------------------------------- + fs/bcachefs/fs.h | 1 - + fs/bcachefs/io.c | 9 ++++---- + fs/bcachefs/io.h | 10 ++------- + fs/bcachefs/reflink.c | 8 +++---- + fs/bcachefs/reflink.h | 2 +- + fs/bcachefs/xattr.c | 18 +++++++++++++++- + 9 files changed, 65 insertions(+), 96 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index d0c6878b003f..0cde2638d017 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -331,8 +331,7 @@ retry: + inode_u.bi_mode = mode; + + ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: +- bch2_trans_commit(&trans, NULL, +- &inode->ei_journal_seq, 0); ++ bch2_trans_commit(&trans, NULL, NULL, 0); + btree_err: + bch2_trans_iter_exit(&trans, &inode_iter); + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 83b18b881e6e..9eed6e81384f 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1135,7 +1135,6 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; +- op_journal_seq_set(op, &inode->ei_journal_seq); + op->nr_replicas = nr_replicas; + op->res.nr_replicas = nr_replicas; + op->write_point = writepoint_hashed(inode->ei_last_dirtied); +@@ -1945,7 +1944,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) + bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); + dio->op.end_io = bch2_dio_write_loop_async; + dio->op.target = dio->op.opts.foreground_target; +- op_journal_seq_set(&dio->op, &inode->ei_journal_seq); + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; +@@ -2177,29 +2175,36 @@ unlock: + + /* fsync: */ + +-int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ++/* ++ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an ++ * insert trigger: look up the btree inode instead ++ */ ++static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum) + { +- struct bch_inode_info *inode = file_bch_inode(file); +- struct bch_fs *c = inode->v.i_sb->s_fs_info; +- int ret, ret2; ++ struct bch_inode_unpacked inode; ++ int ret; + +- ret = file_write_and_wait_range(file, start, end); ++ if (c->opts.journal_flush_disabled) ++ return 0; ++ ++ ret = bch2_inode_find_by_inum(c, inum, &inode); + if (ret) + return ret; + +- if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) +- goto out; ++ return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq); ++} + +- ret = sync_inode_metadata(&inode->v, 1); +- if (ret) +- return ret; +-out: +- if (!c->opts.journal_flush_disabled) +- ret = bch2_journal_flush_seq(&c->journal, +- inode->ei_journal_seq); +- ret2 = file_check_and_advance_wb_err(file); ++int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret, ret2, ret3; ++ ++ ret = file_write_and_wait_range(file, start, end); ++ ret2 = sync_inode_metadata(&inode->v, 1); ++ ret3 = bch2_flush_inode(c, inode_inum(inode)); + +- return ret ?: ret2; ++ return ret ?: ret2 ?: ret3; + } + + /* truncate: */ +@@ -2461,7 +2466,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, + + ret = bch2_fpunch(c, inode_inum(inode), + round_up(iattr->ia_size, block_bytes(c)) >> 
9, +- U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); ++ U64_MAX, &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + + if (unlikely(ret)) +@@ -2521,7 +2526,6 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len + + ret = bch2_fpunch(c, inode_inum(inode), + discard_start, discard_end, +- &inode->ei_journal_seq, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + } +@@ -2600,7 +2604,6 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + + ret = bch2_fpunch(c, inode_inum(inode), + offset >> 9, (offset + len) >> 9, +- &inode->ei_journal_seq, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + +@@ -2704,8 +2707,7 @@ reassemble: + ret = bch2_btree_iter_traverse(&del) ?: + bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: + bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: +- bch2_trans_commit(&trans, &disk_res, +- &inode->ei_journal_seq, ++ bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(c, &disk_res); + +@@ -2816,7 +2818,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + + ret = bch2_extent_update(&trans, inode_inum(inode), &iter, + &reservation.k_i, +- &disk_res, &inode->ei_journal_seq, ++ &disk_res, NULL, + 0, &i_sectors_delta, true); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bkey_err: +@@ -3020,7 +3022,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + inode_inum(dst), pos_dst >> 9, + inode_inum(src), pos_src >> 9, + aligned_len >> 9, +- &dst->ei_journal_seq, + pos_dst + len, &i_sectors_delta); + if (ret < 0) + goto err; +@@ -3038,10 +3039,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + i_size_write(&dst->v, pos_dst + ret); + spin_unlock(&dst->v.i_lock); + +- if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || +- IS_SYNC(file_inode(file_dst))) && +- !c->opts.journal_flush_disabled) +- ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq); ++ if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || ++ IS_SYNC(file_inode(file_dst))) ++ ret = bch2_flush_inode(c, inode_inum(dst)); + err: + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 5c07bbff56dc..ba91135cd16a 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -41,25 +41,6 @@ static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum, + struct bch_inode_info *, + struct bch_inode_unpacked *); + +-static void journal_seq_copy(struct bch_fs *c, +- struct bch_inode_info *dst, +- u64 journal_seq) +-{ +- /* +- * atomic64_cmpxchg has a fallback for archs that don't support it, +- * cmpxchg does not: +- */ +- atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; +- u64 old, v = READ_ONCE(dst->ei_journal_seq); +- +- do { +- old = v; +- +- if (old >= journal_seq) +- break; +- } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); +-} +- + static void __pagecache_lock_put(struct pagecache_lock *lock, long i) + { + BUG_ON(atomic_long_read(&lock->v) == 0); +@@ -152,9 +133,7 @@ retry: + BTREE_ITER_INTENT) ?: + (set ? 
set(inode, &inode_u, p) : 0) ?: + bch2_inode_write(&trans, &iter, &inode_u) ?: +- bch2_trans_commit(&trans, NULL, +- &inode->ei_journal_seq, +- BTREE_INSERT_NOFAIL); ++ bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); + + /* + * the btree node lock protects inode->ei_inode, not ei_update_lock; +@@ -329,7 +308,6 @@ err_before_quota: + if (!(flags & BCH_CREATE_TMPFILE)) { + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); +- journal_seq_copy(c, dir, journal_seq); + mutex_unlock(&dir->ei_update_lock); + } + +@@ -337,7 +315,6 @@ err_before_quota: + inum.inum = inode_u.bi_inum; + + bch2_vfs_inode_init(c, inum, inode, &inode_u); +- journal_seq_copy(c, inode, journal_seq); + + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); + set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); +@@ -362,7 +339,6 @@ err_before_quota: + * We raced, another process pulled the new inode into cache + * before us: + */ +- journal_seq_copy(c, old, journal_seq); + make_bad_inode(&inode->v); + iput(&inode->v); + +@@ -446,7 +422,7 @@ static int __bch2_link(struct bch_fs *c, + mutex_lock(&inode->ei_update_lock); + bch2_trans_init(&trans, c, 4, 1024); + +- ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0, ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_link_trans(&trans, + inode_inum(dir), &dir_u, + inode_inum(inode), &inode_u, +@@ -455,7 +431,6 @@ static int __bch2_link(struct bch_fs *c, + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); + +- journal_seq_copy(c, inode, dir->ei_journal_seq); + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); +@@ -498,7 +473,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); + bch2_trans_init(&trans, c, 4, 1024); + +- ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, ++ ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, + inode_inum(dir), &dir_u, +@@ -508,7 +483,6 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); + +- journal_seq_copy(c, inode, dir->ei_journal_seq); + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, +@@ -550,8 +524,6 @@ static int bch2_symlink(struct user_namespace *mnt_userns, + if (unlikely(ret)) + goto err; + +- journal_seq_copy(c, dir, inode->ei_journal_seq); +- + ret = __bch2_link(c, inode, dir, dentry); + if (unlikely(ret)) + goto err; +@@ -586,7 +558,6 @@ static int bch2_rename2(struct user_namespace *mnt_userns, + ? BCH_RENAME_EXCHANGE + : dst_dentry->d_inode + ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; +- u64 journal_seq = 0; + int ret; + + if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) +@@ -626,7 +597,7 @@ static int bch2_rename2(struct user_namespace *mnt_userns, + goto err; + } + +- ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0, ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_rename_trans(&trans, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, +@@ -644,23 +615,17 @@ static int bch2_rename2(struct user_namespace *mnt_userns, + + bch2_inode_update_after_write(c, src_dir, &src_dir_u, + ATTR_MTIME|ATTR_CTIME); +- journal_seq_copy(c, src_dir, journal_seq); + +- if (src_dir != dst_dir) { ++ if (src_dir != dst_dir) + bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, + ATTR_MTIME|ATTR_CTIME); +- journal_seq_copy(c, dst_dir, journal_seq); +- } + + bch2_inode_update_after_write(c, src_inode, &src_inode_u, + ATTR_CTIME); +- journal_seq_copy(c, src_inode, journal_seq); + +- if (dst_inode) { ++ if (dst_inode) + bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, + ATTR_CTIME); +- journal_seq_copy(c, dst_inode, journal_seq); +- } + err: + bch2_trans_exit(&trans); + +@@ -767,8 +732,7 @@ retry: + } + + ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: +- bch2_trans_commit(&trans, NULL, +- &inode->ei_journal_seq, ++ bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + btree_err: + bch2_trans_iter_exit(&trans, &inode_iter); +@@ -1202,7 +1166,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, + inode->v.i_size = bi->bi_size; + + inode->ei_flags = 0; +- inode->ei_journal_seq = bi->bi_journal_seq; + inode->ei_quota_reserved = 0; + inode->ei_qid = bch_qid(bi); + inode->ei_subvol = inum.subvol; +@@ -1241,7 +1204,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) + mutex_init(&inode->ei_update_lock); + pagecache_lock_init(&inode->ei_pagecache_lock); + mutex_init(&inode->ei_quota_lock); +- inode->ei_journal_seq = 0; + + return &inode->v; + } +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index bf62e80fde59..40212b3da091 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -36,7 +36,6 @@ struct bch_inode_info { + unsigned long ei_flags; + + struct mutex ei_update_lock; +- u64 ei_journal_seq; + u64 ei_quota_reserved; + unsigned long ei_last_dirtied; + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 701e9d0eab4f..7c9ea91d8f5b 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -376,7 +376,7 @@ int bch2_extent_update(struct btree_trans *trans, + */ + int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + subvol_inum inum, u64 end, +- u64 *journal_seq, s64 *i_sectors_delta) ++ s64 *i_sectors_delta) + { + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); +@@ -414,7 +414,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + bch2_cut_back(end_pos, &delete); + + ret = bch2_extent_update(trans, inum, iter, &delete, +- &disk_res, journal_seq, ++ &disk_res, NULL, + 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); + btree_err: +@@ -433,7 +433,7 @@ btree_err: + } + + int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, +- u64 *journal_seq, s64 *i_sectors_delta) ++ s64 *i_sectors_delta) + { + struct btree_trans trans; + struct btree_iter iter; +@@ -444,8 +444,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + POS(inum.inum, start), + BTREE_ITER_INTENT); + +- ret = bch2_fpunch_at(&trans, &iter, inum, end, +- 
journal_seq, i_sectors_delta); ++ ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 38efd39c664e..fbe46660662b 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -48,12 +48,6 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) + ? op->journal_seq_p : &op->journal_seq; + } + +-static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) +-{ +- op->journal_seq_p = journal_seq; +- op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; +-} +- + static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) + { + return op->alloc_reserve == RESERVE_MOVINGGC +@@ -68,8 +62,8 @@ int bch2_extent_update(struct btree_trans *, subvol_inum, + struct disk_reservation *, u64 *, u64, s64 *, bool); + + int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, +- subvol_inum, u64, u64 *, s64 *); +-int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, u64 *, s64 *); ++ subvol_inum, u64, s64 *); ++int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); + + int bch2_write_index_default(struct bch_write_op *); + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 8e66e6390e62..d003f4088dfc 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -210,7 +210,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) + s64 bch2_remap_range(struct bch_fs *c, + subvol_inum dst_inum, u64 dst_offset, + subvol_inum src_inum, u64 src_offset, +- u64 remap_sectors, u64 *journal_seq, ++ u64 remap_sectors, + u64 new_i_size, s64 *i_sectors_delta) + { + struct btree_trans trans; +@@ -281,7 +281,7 @@ s64 bch2_remap_range(struct bch_fs *c, + min(dst_end.offset, + dst_iter.pos.offset + + src_iter.pos.offset - src_want.offset), +- journal_seq, i_sectors_delta); ++ i_sectors_delta); + continue; + } + +@@ -320,7 +320,7 @@ s64 bch2_remap_range(struct bch_fs *c, + dst_end.offset - dst_iter.pos.offset)); + + ret = bch2_extent_update(&trans, dst_inum, &dst_iter, +- new_dst.k, &disk_res, journal_seq, ++ new_dst.k, &disk_res, NULL, + new_i_size, i_sectors_delta, + true); + bch2_disk_reservation_put(c, &disk_res); +@@ -347,7 +347,7 @@ s64 bch2_remap_range(struct bch_fs *c, + inode_u.bi_size < new_i_size) { + inode_u.bi_size = new_i_size; + ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: +- bch2_trans_commit(&trans, NULL, journal_seq, 0); ++ bch2_trans_commit(&trans, NULL, NULL, 0); + } + + bch2_trans_iter_exit(&trans, &inode_iter); +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index 4c1b82860b0b..3745873fd88d 100644 +--- a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -58,6 +58,6 @@ static inline __le64 *bkey_refcount(struct bkey_i *k) + } + + s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, +- subvol_inum, u64, u64, u64 *, u64, s64 *); ++ subvol_inum, u64, u64, u64, s64 *); + + #endif /* _BCACHEFS_REFLINK_H */ +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index bb5da310e4d6..464ed68318e7 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -165,8 +165,24 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, + const char *name, const void *value, size_t size, + int type, int flags) + { ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_inode_unpacked inode_u; + int ret; + ++ /* ++ * We need to do an inode update so that bi_journal_sync gets updated ++ * and fsync works: ++ * ++ * Perhaps we should be updating 
bi_mtime too? ++ */ ++ ++ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: ++ bch2_inode_write(trans, &inode_iter, &inode_u); ++ bch2_trans_iter_exit(trans, &inode_iter); ++ ++ if (ret) ++ return ret; ++ + if (value) { + struct bkey_i_xattr *xattr; + unsigned namelen = strlen(name); +@@ -352,7 +368,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + +- return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, ++ return bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_set(&trans, inode_inum(inode), &hash, + name, value, size, + handler->flags, flags)); +-- +cgit v1.2.3 + + +From dcebf914a732d9d86791269ea81c714ae632553b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 6 Nov 2021 00:05:12 -0400 +Subject: bcachefs: Fix upgrade path for reflink_p fix + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index a762a31be35d..c3542d3c2eac 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1203,11 +1203,15 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; +- u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); +- u64 end_idx = le64_to_cpu(p.v->idx) + p.k->size + +- le32_to_cpu(p.v->back_pad); ++ u64 idx = le64_to_cpu(p.v->idx); ++ u64 end = le64_to_cpu(p.v->idx) + p.k->size; + int ret = 0; + ++ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { ++ idx -= le32_to_cpu(p.v->front_pad); ++ end += le32_to_cpu(p.v->back_pad); ++ } ++ + l = 0; + r = c->reflink_gc_nr; + while (l < r) { +@@ -1220,7 +1224,7 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, + r = m; + } + +- while (idx < end_idx && !ret) ++ while (idx < end && !ret) + ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); + + return ret; +-- +cgit v1.2.3 + + +From 28bf30f67823b2184b30e20a9ae7928ef98b9c07 Mon Sep 17 00:00:00 2001 +From: Chris Webb +Date: Thu, 4 Nov 2021 21:03:16 +0000 +Subject: bcachefs: Return -ENOKEY/EINVAL when mount decryption fails + +bch2_fs_encryption_init() correctly passes back -ENOKEY from request_key() +when no unlock key is found, or -EINVAL if superblock decryption fails +because of an invalid key. However, these get absorbed into a generic NULL +return from bch2_fs_alloc() and later returned to user space as -ENOMEM, +leading to a misleading error from mount(1): + + mount(2) system call failed: Out of memory. + +Return explicit error pointers out of bch2_fs_alloc() and handle them in +both callers, so the user instead sees + + mount(2) system call failed: Required key not available. + +when attempting to mount a filesystem which is still locked. 
+ +Signed-off-by: Chris Webb +--- + fs/bcachefs/super.c | 49 +++++++++++++++++++++++++++++++++---------------- + 1 file changed, 33 insertions(+), 16 deletions(-) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index dc8f641504be..9cd296feb312 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -638,12 +638,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + struct bch_fs *c; + unsigned i, iter_size; + const char *err; ++ int ret = 0; + + pr_verbose_init(opts, ""); + + c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); +- if (!c) ++ if (!c) { ++ c = ERR_PTR(-ENOMEM); + goto out; ++ } + + __module_get(THIS_MODULE); + +@@ -724,13 +727,16 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + mutex_init(&c->sectors_available_lock); + +- if (percpu_init_rwsem(&c->mark_lock)) ++ if (percpu_init_rwsem(&c->mark_lock)) { ++ ret = -ENOMEM; + goto err; ++ } + + mutex_lock(&c->sb_lock); + + if (bch2_sb_to_fs(c, sb)) { + mutex_unlock(&c->sb_lock); ++ ret = -ENOMEM; + goto err; + } + +@@ -745,8 +751,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + c->block_bits = ilog2(c->opts.block_size); + c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); + +- if (bch2_fs_init_fault("fs_alloc")) ++ if (bch2_fs_init_fault("fs_alloc")) { ++ ret = -ENOMEM; + goto err; ++ } + + iter_size = sizeof(struct sort_iter) + + (btree_blocks(c) + 1) * 2 * +@@ -787,10 +795,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_fs_btree_interior_update_init(c) || + bch2_fs_subvolumes_init(c) || + bch2_fs_io_init(c) || +- bch2_fs_encryption_init(c) || + bch2_fs_compress_init(c) || + bch2_fs_ec_init(c) || +- bch2_fs_fsio_init(c)) ++ bch2_fs_fsio_init(c)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_fs_encryption_init(c); ++ if (ret) + goto err; + + if (c->opts.nochanges) +@@ -799,8 +812,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + mi = bch2_sb_get_members(c->disk_sb.sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && +- bch2_dev_alloc(c, i)) ++ bch2_dev_alloc(c, i)) { ++ ret = -ENOMEM; + goto err; ++ } + + bch2_journal_entry_res_resize(&c->journal, + &c->btree_root_journal_res, +@@ -815,14 +830,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + mutex_unlock(&bch_fs_list_lock); + if (err) { + bch_err(c, "bch2_fs_online() error: %s", err); ++ ret = -ENOMEM; + goto err; + } + out: +- pr_verbose_init(opts, "ret %i", c ? 
0 : -ENOMEM); ++ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); + return c; + err: + bch2_fs_free(c); +- c = NULL; ++ c = ERR_PTR(ret); + goto out; + } + +@@ -1939,10 +1955,11 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + i++; + } + +- ret = -ENOMEM; + c = bch2_fs_alloc(sb[best_sb].sb, opts); +- if (!c) ++ if (IS_ERR(c)) { ++ ret = PTR_ERR(c); + goto err; ++ } + + err = "bch2_dev_online() error"; + down_write(&c->state_lock); +@@ -1973,7 +1990,7 @@ err_print: + devices[0], err); + ret = -EINVAL; + err: +- if (c) ++ if (!IS_ERR_OR_NULL(c)) + bch2_fs_stop(c); + for (i = 0; i < nr_devices; i++) + bch2_free_super(&sb[i]); +@@ -2002,12 +2019,12 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, + if (err) + goto err; + } else { ++ allocated_fs = true; + c = bch2_fs_alloc(sb->sb, opts); +- err = "cannot allocate memory"; +- if (!c) +- goto err; + +- allocated_fs = true; ++ err = "bch2_fs_alloc() error"; ++ if (IS_ERR(c)) ++ goto err; + } + + err = "bch2_dev_online() error"; +@@ -2033,7 +2050,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, + err: + mutex_unlock(&bch_fs_list_lock); + +- if (allocated_fs) ++ if (allocated_fs && !IS_ERR(c)) + bch2_fs_stop(c); + else if (c) + closure_put(&c->cl); +-- +cgit v1.2.3 + + +From c2482def9c601c50ae827fb1218e7e64990f2d4d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 5 Nov 2021 21:28:17 -0400 +Subject: bcachefs: Clean up error reporting in the startup path + +It used to be that error reporting in the startup path was done by +returning strings describing the error, but that turned out to be a +rather silly idea - if there's something we can describe about the +error, just print it right away. + +This converts a good chunk of code to returning error codes, as is more +typical style. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super.c | 177 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 87 insertions(+), 90 deletions(-) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 9cd296feb312..3744b6d519a7 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -588,48 +588,53 @@ void bch2_fs_stop(struct bch_fs *c) + bch2_fs_free(c); + } + +-static const char *bch2_fs_online(struct bch_fs *c) ++static int bch2_fs_online(struct bch_fs *c) + { + struct bch_dev *ca; +- const char *err = NULL; + unsigned i; +- int ret; ++ int ret = 0; + + lockdep_assert_held(&bch_fs_list_lock); + +- if (!list_empty(&c->list)) +- return NULL; +- +- if (__bch2_uuid_to_fs(c->sb.uuid)) +- return "filesystem UUID already open"; ++ if (__bch2_uuid_to_fs(c->sb.uuid)) { ++ bch_err(c, "filesystem UUID already open"); ++ return -EINVAL; ++ } + + ret = bch2_fs_chardev_init(c); +- if (ret) +- return "error creating character device"; ++ if (ret) { ++ bch_err(c, "error creating character device"); ++ return ret; ++ } + + bch2_fs_debug_init(c); + +- if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || +- kobject_add(&c->internal, &c->kobj, "internal") || +- kobject_add(&c->opts_dir, &c->kobj, "options") || +- kobject_add(&c->time_stats, &c->kobj, "time_stats") || +- bch2_opts_create_sysfs_files(&c->opts_dir)) +- return "error creating sysfs objects"; ++ ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: ++ kobject_add(&c->internal, &c->kobj, "internal") ?: ++ kobject_add(&c->opts_dir, &c->kobj, "options") ?: ++ kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: ++ bch2_opts_create_sysfs_files(&c->opts_dir); ++ if (ret) { ++ bch_err(c, "error creating sysfs objects"); ++ return ret; ++ } + + down_write(&c->state_lock); + +- err = "error creating sysfs objects"; +- for_each_member_device(ca, c, i) +- if (bch2_dev_sysfs_online(c, ca)) { ++ for_each_member_device(ca, c, i) { ++ ret = bch2_dev_sysfs_online(c, ca); ++ if (ret) { ++ bch_err(c, "error creating sysfs objects"); + percpu_ref_put(&ca->ref); + goto err; + } ++ } + ++ BUG_ON(!list_empty(&c->list)); + list_add(&c->list, &bch_fs_list); +- err = NULL; + err: + up_write(&c->state_lock); +- return err; ++ return ret; + } + + static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +@@ -637,7 +642,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + struct bch_sb_field_members *mi; + struct bch_fs *c; + unsigned i, iter_size; +- const char *err; + int ret = 0; + + pr_verbose_init(opts, ""); +@@ -727,20 +731,16 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + mutex_init(&c->sectors_available_lock); + +- if (percpu_init_rwsem(&c->mark_lock)) { +- ret = -ENOMEM; ++ ret = percpu_init_rwsem(&c->mark_lock); ++ if (ret) + goto err; +- } + + mutex_lock(&c->sb_lock); ++ ret = bch2_sb_to_fs(c, sb); ++ mutex_unlock(&c->sb_lock); + +- if (bch2_sb_to_fs(c, sb)) { +- mutex_unlock(&c->sb_lock); +- ret = -ENOMEM; ++ if (ret) + goto err; +- } +- +- mutex_unlock(&c->sb_lock); + + scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); + +@@ -752,7 +752,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); + + if (bch2_fs_init_fault("fs_alloc")) { +- ret = -ENOMEM; ++ bch_err(c, "fs_alloc fault injected"); ++ ret = -EFAULT; + goto err; + } + +@@ -784,25 +785,25 @@ static struct bch_fs *bch2_fs_alloc(struct 
bch_sb *sb, struct bch_opts opts) + btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || + !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, +- sizeof(u64), GFP_KERNEL)) || +- bch2_io_clock_init(&c->io_clock[READ]) || +- bch2_io_clock_init(&c->io_clock[WRITE]) || +- bch2_fs_journal_init(&c->journal) || +- bch2_fs_replicas_init(c) || +- bch2_fs_btree_cache_init(c) || +- bch2_fs_btree_key_cache_init(&c->btree_key_cache) || +- bch2_fs_btree_iter_init(c) || +- bch2_fs_btree_interior_update_init(c) || +- bch2_fs_subvolumes_init(c) || +- bch2_fs_io_init(c) || +- bch2_fs_compress_init(c) || +- bch2_fs_ec_init(c) || +- bch2_fs_fsio_init(c)) { ++ sizeof(u64), GFP_KERNEL))) { + ret = -ENOMEM; + goto err; + } + +- ret = bch2_fs_encryption_init(c); ++ ret = bch2_io_clock_init(&c->io_clock[READ]) ?: ++ bch2_io_clock_init(&c->io_clock[WRITE]) ?: ++ bch2_fs_journal_init(&c->journal) ?: ++ bch2_fs_replicas_init(c) ?: ++ bch2_fs_btree_cache_init(c) ?: ++ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: ++ bch2_fs_btree_iter_init(c) ?: ++ bch2_fs_btree_interior_update_init(c) ?: ++ bch2_fs_subvolumes_init(c) ?: ++ bch2_fs_io_init(c) ?: ++ bch2_fs_encryption_init(c) ?: ++ bch2_fs_compress_init(c) ?: ++ bch2_fs_ec_init(c) ?: ++ bch2_fs_fsio_init(c); + if (ret) + goto err; + +@@ -813,7 +814,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && + bch2_dev_alloc(c, i)) { +- ret = -ENOMEM; ++ ret = -EEXIST; + goto err; + } + +@@ -826,13 +827,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); + + mutex_lock(&bch_fs_list_lock); +- err = bch2_fs_online(c); ++ ret = bch2_fs_online(c); + mutex_unlock(&bch_fs_list_lock); +- if (err) { +- bch_err(c, "bch2_fs_online() error: %s", err); +- ret = -ENOMEM; ++ ++ if (ret) + goto err; +- } + out: + pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); + return c; +@@ -878,7 +877,6 @@ static void print_mount_opts(struct bch_fs *c) + + int bch2_fs_start(struct bch_fs *c) + { +- const char *err = "cannot allocate memory"; + struct bch_sb_field_members *mi; + struct bch_dev *ca; + time64_t now = ktime_get_real_seconds(); +@@ -914,10 +912,11 @@ int bch2_fs_start(struct bch_fs *c) + if (ret) + goto err; + +- err = "dynamic fault"; + ret = -EINVAL; +- if (bch2_fs_init_fault("fs_start")) ++ if (bch2_fs_init_fault("fs_start")) { ++ bch_err(c, "fs_start fault injected"); + goto err; ++ } + + set_bit(BCH_FS_STARTED, &c->flags); + +@@ -938,7 +937,6 @@ int bch2_fs_start(struct bch_fs *c) + if (c->opts.read_only || c->opts.nochanges) { + bch2_fs_read_only(c); + } else { +- err = "error going read write"; + ret = !test_bit(BCH_FS_RW, &c->flags) + ? 
bch2_fs_read_write(c) + : bch2_fs_read_write_late(c); +@@ -956,25 +954,22 @@ err: + case BCH_FSCK_ERRORS_NOT_FIXED: + bch_err(c, "filesystem contains errors: please report this to the developers"); + pr_cont("mount with -o fix_errors to repair\n"); +- err = "fsck error"; + break; + case BCH_FSCK_REPAIR_UNIMPLEMENTED: + bch_err(c, "filesystem contains errors: please report this to the developers"); + pr_cont("repair unimplemented: inform the developers so that it can be added\n"); +- err = "fsck error"; + break; + case BCH_FSCK_REPAIR_IMPOSSIBLE: + bch_err(c, "filesystem contains errors, but repair impossible"); +- err = "fsck error"; + break; + case BCH_FSCK_UNKNOWN_VERSION: +- err = "unknown metadata version";; ++ bch_err(c, "unknown metadata version"); + break; + case -ENOMEM: +- err = "cannot allocate memory"; ++ bch_err(c, "cannot allocate memory"); + break; + case -EIO: +- err = "IO error"; ++ bch_err(c, "IO error"); + break; + } + +@@ -1394,7 +1389,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) + bch2_copygc_start(c); + } + +-static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ++static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) + { + lockdep_assert_held(&c->state_lock); + +@@ -1403,10 +1398,7 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + +- if (bch2_dev_allocator_start(ca)) +- return "error starting allocator thread"; +- +- return NULL; ++ return bch2_dev_allocator_start(ca); + } + + int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, +@@ -1432,9 +1424,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +- if (new_state == BCH_MEMBER_STATE_rw && +- __bch2_dev_read_write(c, ca)) +- ret = -ENOMEM; ++ if (new_state == BCH_MEMBER_STATE_rw) ++ ret = __bch2_dev_read_write(c, ca); + + rebalance_wakeup(c); + +@@ -1718,8 +1709,8 @@ have_slot: + goto err_late; + + if (ca->mi.state == BCH_MEMBER_STATE_rw) { +- err = __bch2_dev_read_write(c, ca); +- if (err) ++ ret = __bch2_dev_read_write(c, ca); ++ if (ret) + goto err_late; + } + +@@ -1763,24 +1754,27 @@ int bch2_dev_online(struct bch_fs *c, const char *path) + dev_idx = sb.sb->dev_idx; + + err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); +- if (err) ++ if (err) { ++ bch_err(c, "error bringing %s online: %s", path, err); + goto err; ++ } + +- if (bch2_dev_attach_bdev(c, &sb)) { +- err = "bch2_dev_attach_bdev() error"; ++ ret = bch2_dev_attach_bdev(c, &sb); ++ if (ret) + goto err; +- } + + ca = bch_dev_locked(c, dev_idx); + +- if (bch2_trans_mark_dev_sb(c, ca)) { +- err = "bch2_trans_mark_dev_sb() error"; ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb", ++ path, ret); + goto err; + } + + if (ca->mi.state == BCH_MEMBER_STATE_rw) { +- err = __bch2_dev_read_write(c, ca); +- if (err) ++ ret = __bch2_dev_read_write(c, ca); ++ if (ret) + goto err; + } + +@@ -1798,7 +1792,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path) + err: + up_write(&c->state_lock); + bch2_free_super(&sb); +- bch_err(c, "error bringing %s online: %s", path, err); + return -EINVAL; + } + +@@ -1902,7 +1895,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + struct bch_sb_field_members *mi; + unsigned i, best_sb = 0; + const char *err; +- int ret = -ENOMEM; ++ int ret = 0; + + pr_verbose_init(opts, ""); + +@@ 
-1917,8 +1910,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + } + + sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); +- if (!sb) ++ if (!sb) { ++ ret = -ENOMEM; + goto err; ++ } + + for (i = 0; i < nr_devices; i++) { + ret = bch2_read_super(devices[i], &opts, &sb[i]); +@@ -1961,13 +1956,14 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + goto err; + } + +- err = "bch2_dev_online() error"; + down_write(&c->state_lock); +- for (i = 0; i < nr_devices; i++) +- if (bch2_dev_attach_bdev(c, &sb[i])) { ++ for (i = 0; i < nr_devices; i++) { ++ ret = bch2_dev_attach_bdev(c, &sb[i]); ++ if (ret) { + up_write(&c->state_lock); +- goto err_print; ++ goto err; + } ++ } + up_write(&c->state_lock); + + err = "insufficient devices"; +@@ -1992,8 +1988,9 @@ err_print: + err: + if (!IS_ERR_OR_NULL(c)) + bch2_fs_stop(c); +- for (i = 0; i < nr_devices; i++) +- bch2_free_super(&sb[i]); ++ if (sb) ++ for (i = 0; i < nr_devices; i++) ++ bch2_free_super(&sb[i]); + c = ERR_PTR(ret); + goto out; + } +-- +cgit v1.2.3 + + +From b46dc20adf85177e966fbba26fbcda3fdf95bed4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 7 Nov 2021 10:19:37 -0500 +Subject: bcachefs: path->should_be_locked fixes + + - We should only be clearing should_be_locked in btree_path_set_pos() - + it's the responsiblity of the btree_path code, not the btree_iter + code. + + - bch2_path_put() needs to pay attention to path->should_be_locked, to + ensure we don't drop locks we're supposed to be keeping. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 25 +++++++++++++++++-------- + fs/bcachefs/btree_iter.h | 2 -- + 2 files changed, 17 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index be55346b6dd1..0d3acba91bd4 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1655,19 +1655,19 @@ static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btr + return NULL; + } + +-static bool have_node_at_pos(struct btree_trans *trans, struct btree_path *path) ++static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) + { + struct btree_path *next; + + next = prev_btree_path(trans, path); +- if (next && path_l(next)->b == path_l(path)->b) +- return true; ++ if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) ++ return next; + + next = next_btree_path(trans, path); +- if (next && path_l(next)->b == path_l(path)->b) +- return true; ++ if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) ++ return next; + +- return false; ++ return NULL; + } + + static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) +@@ -1694,11 +1694,20 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte + (dup = have_path_at_pos(trans, path))) { + dup->preserve = true; + path->preserve = false; ++ goto free; + } + + if (!path->preserve && +- have_node_at_pos(trans, path)) +- __bch2_path_free(trans, path); ++ (dup = have_node_at_pos(trans, path))) ++ goto free; ++ return; ++free: ++ if (path->should_be_locked && ++ !btree_node_locked(dup, path->level)) ++ return; ++ ++ dup->should_be_locked |= path->should_be_locked; ++ __bch2_path_free(trans, path); + } + + noinline __cold +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 2dc588283252..5b1735ae7b43 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -227,8 +227,6 @@ 
static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos + iter->k.p.offset = iter->pos.offset = new_pos.offset; + iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; + iter->k.size = 0; +- if (iter->path->ref == 1) +- iter->path->should_be_locked = false; + } + + static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) +-- +cgit v1.2.3 + + +From ce17cdce909cb691a11841cc6119e3c091cce1e9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 6 Nov 2021 00:03:40 -0400 +Subject: bcachefs: bch2_assert_pos_locked() + +This adds a new assertion to be used by bch2_inode_update_after_write(), +which updates the VFS inode based on the update to the btree inode we +just did - we require that the btree inode still be locked when we do +that update. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/acl.c | 2 +- + fs/bcachefs/btree_iter.c | 45 ++++++++++++++++++++++++++++++++++--- + fs/bcachefs/btree_iter.h | 4 ++++ + fs/bcachefs/fs.c | 58 +++++++++++++++++++++++++++++------------------- + fs/bcachefs/fs.h | 2 +- + fs/bcachefs/inode.c | 6 ++--- + fs/bcachefs/inode.h | 2 ++ + 7 files changed, 88 insertions(+), 31 deletions(-) + +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index 0cde2638d017..5070caf8f349 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -340,7 +340,7 @@ btree_err: + if (unlikely(ret)) + goto err; + +- bch2_inode_update_after_write(c, inode, &inode_u, ++ bch2_inode_update_after_write(&trans, inode, &inode_u, + ATTR_CTIME|ATTR_MODE); + + set_cached_acl(&inode->v, type, acl); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 0d3acba91bd4..03357737e7c3 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -49,7 +49,7 @@ static inline int __btree_path_cmp(const struct btree_path *l, + unsigned r_level) + { + return cmp_int(l->btree_id, r_btree_id) ?: +- cmp_int(l->cached, r_cached) ?: ++ cmp_int((int) l->cached, (int) r_cached) ?: + bpos_cmp(l->pos, r_pos) ?: + -cmp_int(l->level, r_level); + } +@@ -768,6 +768,43 @@ out: + return ret; + } + ++void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, ++ struct bpos pos, bool key_cache) ++{ ++ struct btree_path *path; ++ unsigned idx; ++ char buf[100]; ++ ++ trans_for_each_path_inorder(trans, path, idx) { ++ int cmp = cmp_int(path->btree_id, id) ?: ++ cmp_int(path->cached, key_cache); ++ ++ if (cmp > 0) ++ break; ++ if (cmp < 0) ++ continue; ++ ++ if (!(path->nodes_locked & 1) || ++ !path->should_be_locked) ++ continue; ++ ++ if (!key_cache) { ++ if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && ++ bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) ++ return; ++ } else { ++ if (!bkey_cmp(pos, path->pos)) ++ return; ++ } ++ } ++ ++ bch2_dump_trans_paths_updates(trans); ++ panic("not locked: %s %s%s\n", ++ bch2_btree_ids[id], ++ (bch2_bpos_to_text(&PBUF(buf), pos), buf), ++ key_cache ? " cached" : ""); ++} ++ + #else + + static inline void bch2_btree_path_verify_level(struct btree_trans *trans, +@@ -1721,11 +1758,13 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + btree_trans_verify_sorted(trans); + + trans_for_each_path_inorder(trans, path, idx) +- printk(KERN_ERR "path: idx %u ref %u:%u%s btree %s pos %s %pS\n", ++ printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", + path->idx, path->ref, path->intent_ref, +- path->preserve ? " preserve" : "", ++ path->should_be_locked ? " S" : "", ++ path->preserve ? 
" P" : "", + bch2_btree_ids[path->btree_id], + (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), ++ path->nodes_locked, + #ifdef CONFIG_BCACHEFS_DEBUG + (void *) path->ip_allocated + #else +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 5b1735ae7b43..33a703c27f7a 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -140,9 +140,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bke + #ifdef CONFIG_BCACHEFS_DEBUG + void bch2_trans_verify_paths(struct btree_trans *); + void bch2_trans_verify_locks(struct btree_trans *); ++void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, ++ struct bpos, bool); + #else + static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} + static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} ++static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, ++ struct bpos pos, bool key_cache) {} + #endif + + void bch2_btree_path_fix_key_modified(struct btree_trans *trans, +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index ba91135cd16a..aad0cdb14282 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -37,7 +37,7 @@ + + static struct kmem_cache *bch2_inode_cache; + +-static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum, ++static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, + struct bch_inode_info *, + struct bch_inode_unpacked *); + +@@ -93,11 +93,19 @@ void bch2_pagecache_block_get(struct pagecache_lock *lock) + __pagecache_lock_get(lock, -1); + } + +-void bch2_inode_update_after_write(struct bch_fs *c, ++void bch2_inode_update_after_write(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + unsigned fields) + { ++ struct bch_fs *c = trans->c; ++ ++ BUG_ON(bi->bi_inum != inode->v.i_ino); ++ ++ bch2_assert_pos_locked(trans, BTREE_ID_inodes, ++ POS(0, bi->bi_inum), ++ 0 && c->opts.inodes_use_key_cache); ++ + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); + i_uid_write(&inode->v, bi->bi_uid); + i_gid_write(&inode->v, bi->bi_gid); +@@ -126,6 +134,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, + int ret; + + bch2_trans_init(&trans, c, 0, 512); ++ trans.ip = _RET_IP_; + retry: + bch2_trans_begin(&trans); + +@@ -140,7 +149,7 @@ retry: + * this is important for inode updates via bchfs_write_index_update + */ + if (!ret) +- bch2_inode_update_after_write(c, inode, &inode_u, fields); ++ bch2_inode_update_after_write(&trans, inode, &inode_u, fields); + + bch2_trans_iter_exit(&trans, &iter); + +@@ -215,6 +224,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + { + struct bch_inode_unpacked inode_u; + struct bch_inode_info *inode; ++ struct btree_trans trans; + int ret; + + inode = to_bch_ei(iget5_locked(c->vfs_sb, +@@ -227,14 +237,19 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + if (!(inode->v.i_state & I_NEW)) + return &inode->v; + +- ret = bch2_inode_find_by_inum(c, inum, &inode_u); ++ bch2_trans_init(&trans, c, 8, 0); ++ ret = lockrestart_do(&trans, ++ bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); ++ ++ if (!ret) ++ bch2_vfs_inode_init(&trans, inum, inode, &inode_u); ++ bch2_trans_exit(&trans); ++ + if (ret) { + iget_failed(&inode->v); + return ERR_PTR(ret); + } + +- bch2_vfs_inode_init(c, inum, inode, &inode_u); +- + unlock_new_inode(&inode->v); + + return &inode->v; +@@ -306,7 +321,7 @@ err_before_quota: + } + + if (!(flags & BCH_CREATE_TMPFILE)) { +- 
bch2_inode_update_after_write(c, dir, &dir_u, ++ bch2_inode_update_after_write(&trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&dir->ei_update_lock); + } +@@ -314,7 +329,8 @@ err_before_quota: + inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.inum = inode_u.bi_inum; + +- bch2_vfs_inode_init(c, inum, inode, &inode_u); ++ bch2_iget5_set(&inode->v, &inum); ++ bch2_vfs_inode_init(&trans, inum, inode, &inode_u); + + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); + set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); +@@ -429,11 +445,9 @@ static int __bch2_link(struct bch_fs *c, + &dentry->d_name)); + + if (likely(!ret)) { +- BUG_ON(inode_u.bi_inum != inode->v.i_ino); +- +- bch2_inode_update_after_write(c, dir, &dir_u, ++ bch2_inode_update_after_write(&trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); +- bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); ++ bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); + } + + bch2_trans_exit(&trans); +@@ -481,11 +495,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + deleting_snapshot)); + + if (likely(!ret)) { +- BUG_ON(inode_u.bi_inum != inode->v.i_ino); +- +- bch2_inode_update_after_write(c, dir, &dir_u, ++ bch2_inode_update_after_write(&trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); +- bch2_inode_update_after_write(c, inode, &inode_u, ++ bch2_inode_update_after_write(&trans, inode, &inode_u, + ATTR_MTIME); + } + +@@ -613,18 +625,18 @@ static int bch2_rename2(struct user_namespace *mnt_userns, + BUG_ON(dst_inode && + dst_inode->v.i_ino != dst_inode_u.bi_inum); + +- bch2_inode_update_after_write(c, src_dir, &src_dir_u, ++ bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, + ATTR_MTIME|ATTR_CTIME); + + if (src_dir != dst_dir) +- bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ++ bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, + ATTR_MTIME|ATTR_CTIME); + +- bch2_inode_update_after_write(c, src_inode, &src_inode_u, ++ bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, + ATTR_CTIME); + + if (dst_inode) +- bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ++ bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, + ATTR_CTIME); + err: + bch2_trans_exit(&trans); +@@ -742,7 +754,7 @@ btree_err: + if (unlikely(ret)) + goto err_trans; + +- bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); ++ bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); + + if (acl) + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); +@@ -1153,11 +1165,11 @@ static const struct export_operations bch_export_ops = { + //.get_parent = bch2_get_parent, + }; + +-static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, ++static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi) + { +- bch2_inode_update_after_write(c, inode, bi, ~0); ++ bch2_inode_update_after_write(trans, inode, bi, ~0); + + inode->v.i_blocks = bi->bi_sectors; + inode->v.i_ino = bi->bi_inum; +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index 40212b3da091..27aacd7e2864 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -173,7 +173,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); + typedef int (*inode_set_fn)(struct bch_inode_info *, + struct bch_inode_unpacked *, void *); + +-void bch2_inode_update_after_write(struct bch_fs *, ++void bch2_inode_update_after_write(struct btree_trans *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + 
unsigned); +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index ef1866a7e96f..968ccffeba4f 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -722,9 +722,9 @@ err: + return ret; + } + +-static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, +- subvol_inum inum, +- struct bch_inode_unpacked *inode) ++int bch2_inode_find_by_inum_trans(struct btree_trans *trans, ++ subvol_inum inum, ++ struct bch_inode_unpacked *inode) + { + struct btree_iter iter; + int ret; +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index d433d48de4e0..723186d8afb6 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -89,6 +89,8 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, + + int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); + ++int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *); + int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); + +-- +cgit v1.2.3 + + +From 74ede057a4ed82aeca8be6e42d39670fec265aa3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 8 Nov 2021 12:30:47 -0500 +Subject: bcachefs: Drop old maybe_extending optimization + +The extend update path had an optimization to avoid updating the inode +if we knew we were definitely not extending the file. But now that we're +updating inodes on every extent update - for fsync - that code can be +deleted. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 82 ++++++++---------------------------------------------- + fs/bcachefs/io.h | 2 +- + fs/bcachefs/move.c | 3 +- + 3 files changed, 13 insertions(+), 74 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 7c9ea91d8f5b..3bd1054aafa1 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -187,7 +187,6 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, +- bool *maybe_extending, + bool *usage_increasing, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) +@@ -199,7 +198,6 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); + int ret = 0; + +- *maybe_extending = true; + *usage_increasing = false; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; +@@ -226,31 +224,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *usage_increasing = true; + +- if (bkey_cmp(old.k->p, new->k.p) >= 0) { +- /* +- * Check if there's already data above where we're +- * going to be writing to - this means we're definitely +- * not extending the file: +- * +- * Note that it's not sufficient to check if there's +- * data up to the sector offset we're going to be +- * writing to, because i_size could be up to one block +- * less: +- */ +- if (!bkey_cmp(old.k->p, new->k.p)) { +- old = bch2_btree_iter_next(&iter); +- ret = bkey_err(old); +- if (ret) +- break; +- } +- +- if (old.k && !bkey_err(old) && +- old.k->p.inode == extent_iter->pos.inode && +- bkey_extent_is_data(old.k)) +- *maybe_extending = false; +- ++ if (bkey_cmp(old.k->p, new->k.p) >= 0) + break; +- } + } + + bch2_trans_iter_exit(trans, &iter); +@@ -267,12 +242,10 @@ int bch2_extent_update(struct btree_trans *trans, + s64 *i_sectors_delta_total, + bool check_enospc) + { +- /* this must live until after bch2_trans_commit(): */ +- struct bkey_inode_buf inode_p; + struct btree_iter inode_iter; 
+ struct bch_inode_unpacked inode_u; + struct bpos next_pos; +- bool extending = false, usage_increasing; ++ bool usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + +@@ -290,84 +263,51 @@ int bch2_extent_update(struct btree_trans *trans, + if (ret) + return ret; + ++ new_i_size = min(k->k.p.offset << 9, new_i_size); ++ next_pos = k->k.p; ++ + ret = bch2_sum_sector_overwrites(trans, iter, k, +- &extending, + &usage_increasing, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + return ret; + +- if (!usage_increasing) +- check_enospc = false; +- + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, +- !check_enospc ++ !check_enospc || !usage_increasing + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + } + +- new_i_size = extending +- ? min(k->k.p.offset << 9, new_i_size) +- : 0; +- + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, + BTREE_ITER_INTENT); + if (ret) + return ret; + +- /* +- * XXX: +- * writeback can race a bit with truncate, because truncate +- * first updates the inode then truncates the pagecache. This is +- * ugly, but lets us preserve the invariant that the in memory +- * i_size is always >= the on disk i_size. +- * +- BUG_ON(new_i_size > inode_u.bi_size && +- (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); +- */ +- BUG_ON(new_i_size > inode_u.bi_size && !extending); +- + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) + inode_u.bi_size = new_i_size; +- else +- new_i_size = 0; + + inode_u.bi_sectors += i_sectors_delta; + +- if (i_sectors_delta || new_i_size) { +- bch2_inode_pack(trans->c, &inode_p, &inode_u); +- +- inode_p.inode.k.p.snapshot = iter->snapshot; +- +- ret = bch2_trans_update(trans, &inode_iter, +- &inode_p.inode.k_i, 0); +- } +- +- bch2_trans_iter_exit(trans, &inode_iter); +- +- if (ret) +- return ret; +- +- next_pos = k->k.p; +- + ret = bch2_trans_update(trans, iter, k, 0) ?: ++ bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); +- BUG_ON(ret == -ENOSPC); ++ bch2_trans_iter_exit(trans, &inode_iter); ++ + if (ret) + return ret; + +- bch2_btree_iter_set_pos(iter, next_pos); +- + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; ++ bch2_btree_iter_set_pos(iter, next_pos); ++ + return 0; + } + +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index fbe46660662b..1aa422dccef7 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -56,7 +56,7 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) + } + + int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, +- struct bkey_i *, bool *, bool *, s64 *, s64 *); ++ struct bkey_i *, bool *, s64 *, s64 *); + int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64 *, u64, s64 *, bool); +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 0152fbcde3c2..64e39c10e34b 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -160,7 +160,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) + struct extent_ptr_decoded p; + struct bpos next_pos; + bool did_work = false; +- bool extending = false, should_check_enospc; ++ bool should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + + bch2_trans_begin(&trans); +@@ -226,7 +226,6 @@ static int 
bch2_migrate_index_update(struct bch_write_op *op) + op->opts.data_replicas); + + ret = bch2_sum_sector_overwrites(&trans, &iter, insert, +- &extending, + &should_check_enospc, + &i_sectors_delta, + &disk_sectors_delta); +-- +cgit v1.2.3 + + +From dfaaa4826f26758030d64502bac83bf7e0e07ebf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 7 Nov 2021 12:10:57 -0500 +Subject: bcachefs: Refactor bch2_fpunch_at() + +This cleans up the error hanlding and flow control a bit. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 23 +++++++++-------------- + 1 file changed, 9 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 3bd1054aafa1..3026daa7f9c2 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -325,26 +325,31 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + int ret = 0, ret2 = 0; + u32 snapshot; + +- while (1) { ++ while (!ret || ret == -EINTR) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + ++ if (ret) ++ ret2 = ret; ++ + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) +- goto btree_err; ++ continue; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + k = bch2_btree_iter_peek(iter); +- if (bkey_cmp(iter->pos, end_pos) >= 0) ++ if (bkey_cmp(iter->pos, end_pos) >= 0) { ++ bch2_btree_iter_set_pos(iter, end_pos); + break; ++ } + + ret = bkey_err(k); + if (ret) +- goto btree_err; ++ continue; + + bkey_init(&delete.k); + delete.k.p = iter->pos; +@@ -357,18 +362,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + &disk_res, NULL, + 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); +-btree_err: +- if (ret == -EINTR) { +- ret2 = ret; +- ret = 0; +- } +- if (ret) +- break; + } + +- if (bkey_cmp(iter->pos, end_pos) > 0) +- bch2_btree_iter_set_pos(iter, end_pos); +- + return ret ?: ret2; + } + +-- +cgit v1.2.3 + + +From 05a232a6dcf50642dc43781cdd94dde153da4c3f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 6 Nov 2021 13:39:42 -0400 +Subject: bcachefs: Fallocate fixes + +- fpunch wasn't always correctly updating i_size - when we drop buffered + writes that were extending a file, we become responsible for writing + i_size. + +- fzero was sometimes zeroing out more data that it should have - + block_start and block_end were being rounded in the wrong directions + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 201 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 100 insertions(+), 101 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 9eed6e81384f..ccff19eb5a89 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2309,6 +2309,14 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, + s->s[i].state = SECTOR_UNALLOCATED; + } + ++ /* ++ * Caller needs to know whether this page will be written out by ++ * writeback - doing an i_size update if necessary - or whether it will ++ * be responsible for the i_size update: ++ */ ++ ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), ++ PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; ++ + zero_user_segment(page, start_offset, end_offset); + + /* +@@ -2317,8 +2325,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, + * XXX: because we aren't currently tracking whether the page has actual + * data in it (vs. just 0s, or only partially written) this wrong. ick. 
+ */ +- ret = bch2_get_page_disk_reservation(c, inode, page, false); +- BUG_ON(ret); ++ BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false)); + + /* + * This removes any writeable userspace mappings; we need to force +@@ -2340,6 +2347,20 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) + from, round_up(from, PAGE_SIZE)); + } + ++static int bch2_truncate_pages(struct bch_inode_info *inode, ++ loff_t start, loff_t end) ++{ ++ int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, ++ start, end); ++ ++ if (ret >= 0 && ++ start >> PAGE_SHIFT != end >> PAGE_SHIFT) ++ ret = __bch2_truncate_page(inode, ++ end >> PAGE_SHIFT, ++ start, end); ++ return ret; ++} ++ + static int bch2_extend(struct user_namespace *mnt_userns, + struct bch_inode_info *inode, + struct bch_inode_unpacked *inode_u, +@@ -2430,7 +2451,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, + iattr->ia_valid &= ~ATTR_SIZE; + + ret = bch2_truncate_page(inode, iattr->ia_size); +- if (unlikely(ret)) ++ if (unlikely(ret < 0)) + goto err; + + /* +@@ -2496,48 +2517,39 @@ static int inode_update_times_fn(struct bch_inode_info *inode, + static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; +- u64 discard_start = round_up(offset, block_bytes(c)) >> 9; +- u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; ++ u64 end = offset + len; ++ u64 block_start = round_up(offset, block_bytes(c)); ++ u64 block_end = round_down(end, block_bytes(c)); ++ bool truncated_last_page; + int ret = 0; + +- inode_lock(&inode->v); +- inode_dio_wait(&inode->v); +- bch2_pagecache_block_get(&inode->ei_pagecache_lock); +- +- ret = __bch2_truncate_page(inode, +- offset >> PAGE_SHIFT, +- offset, offset + len); +- if (unlikely(ret)) ++ ret = bch2_truncate_pages(inode, offset, end); ++ if (unlikely(ret < 0)) + goto err; + +- if (offset >> PAGE_SHIFT != +- (offset + len) >> PAGE_SHIFT) { +- ret = __bch2_truncate_page(inode, +- (offset + len) >> PAGE_SHIFT, +- offset, offset + len); +- if (unlikely(ret)) +- goto err; +- } ++ truncated_last_page = ret; + +- truncate_pagecache_range(&inode->v, offset, offset + len - 1); ++ truncate_pagecache_range(&inode->v, offset, end - 1); + +- if (discard_start < discard_end) { ++ if (block_start < block_end ) { + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode_inum(inode), +- discard_start, discard_end, ++ block_start >> 9, block_end >> 9, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + } + + mutex_lock(&inode->ei_update_lock); +- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, +- ATTR_MTIME|ATTR_CTIME) ?: ret; ++ if (end >= inode->v.i_size && !truncated_last_page) { ++ ret = bch2_write_inode_size(c, inode, inode->v.i_size, ++ ATTR_MTIME|ATTR_CTIME); ++ } else { ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_MTIME|ATTR_CTIME); ++ } + mutex_unlock(&inode->ei_update_lock); + err: +- bch2_pagecache_block_put(&inode->ei_pagecache_lock); +- inode_unlock(&inode->v); +- + return ret; + } + +@@ -2557,31 +2569,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + if ((offset | len) & (block_bytes(c) - 1)) + return -EINVAL; + +- /* +- * We need i_mutex to keep the page cache consistent with the extents +- * btree, and the btree consistent with i_size - we don't need outside +- * locking for the extents btree itself, because we're using linked +- * iterators +- */ +- inode_lock(&inode->v); +- inode_dio_wait(&inode->v); 
+- bch2_pagecache_block_get(&inode->ei_pagecache_lock); +- + if (insert) { +- ret = -EFBIG; + if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) +- goto err; ++ return -EFBIG; + +- ret = -EINVAL; + if (offset >= inode->v.i_size) +- goto err; ++ return -EINVAL; + + src_start = U64_MAX; + shift = len; + } else { +- ret = -EINVAL; + if (offset + len >= inode->v.i_size) +- goto err; ++ return -EINVAL; + + src_start = offset + len; + shift = -len; +@@ -2591,7 +2590,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + + ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); + if (ret) +- goto err; ++ return ret; + + if (insert) { + i_size_write(&inode->v, new_size); +@@ -2608,7 +2607,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + i_sectors_acct(c, inode, NULL, i_sectors_delta); + + if (ret) +- goto err; ++ return ret; + } + + bch2_bkey_buf_init(©); +@@ -2721,18 +2720,19 @@ reassemble: + bch2_bkey_buf_exit(©, c); + + if (ret) +- goto err; ++ return ret; + ++ mutex_lock(&inode->ei_update_lock); + if (!insert) { + i_size_write(&inode->v, new_size); +- mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, new_size, + ATTR_MTIME|ATTR_CTIME); +- mutex_unlock(&inode->ei_update_lock); ++ } else { ++ /* We need an inode update to update bi_journal_seq for fsync: */ ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_MTIME|ATTR_CTIME); + } +-err: +- bch2_pagecache_block_put(&inode->ei_pagecache_lock); +- inode_unlock(&inode->v); ++ mutex_unlock(&inode->ei_update_lock); + return ret; + } + +@@ -2827,6 +2827,17 @@ bkey_err: + if (ret == -EINTR) + ret = 0; + } ++ ++ if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { ++ struct quota_res quota_res = { 0 }; ++ s64 i_sectors_delta = 0; ++ ++ bch2_fpunch_at(&trans, &iter, inode_inum(inode), ++ end_sector, &i_sectors_delta); ++ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++ bch2_quota_reservation_put(c, inode, "a_res); ++ } ++ + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +@@ -2835,77 +2846,58 @@ bkey_err: + static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + loff_t offset, loff_t len) + { +- struct address_space *mapping = inode->v.i_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; +- loff_t end = offset + len; +- loff_t block_start = round_down(offset, block_bytes(c)); +- loff_t block_end = round_up(end, block_bytes(c)); +- int ret; +- +- inode_lock(&inode->v); +- inode_dio_wait(&inode->v); +- bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ u64 end = offset + len; ++ u64 block_start = round_down(offset, block_bytes(c)); ++ u64 block_end = round_up(end, block_bytes(c)); ++ bool truncated_last_page = false; ++ int ret, ret2 = 0; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { + ret = inode_newsize_ok(&inode->v, end); + if (ret) +- goto err; ++ return ret; + } + + if (mode & FALLOC_FL_ZERO_RANGE) { +- ret = __bch2_truncate_page(inode, +- offset >> PAGE_SHIFT, +- offset, end); +- +- if (!ret && +- offset >> PAGE_SHIFT != end >> PAGE_SHIFT) +- ret = __bch2_truncate_page(inode, +- end >> PAGE_SHIFT, +- offset, end); ++ ret = bch2_truncate_pages(inode, offset, end); ++ if (unlikely(ret < 0)) ++ return ret; + +- if (unlikely(ret)) +- goto err; ++ truncated_last_page = ret; + + truncate_pagecache_range(&inode->v, offset, end - 1); ++ ++ block_start = round_up(offset, block_bytes(c)); ++ block_end = round_down(end, block_bytes(c)); + } + + ret = __bchfs_fallocate(inode, 
mode, block_start >> 9, block_end >> 9); +- if (ret) +- goto err; + + /* +- * Do we need to extend the file? +- * +- * If we zeroed up to the end of the file, we dropped whatever writes +- * were going to write out the current i_size, so we have to extend +- * manually even if FL_KEEP_SIZE was set: ++ * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, ++ * so that the VFS cache i_size is consistent with the btree i_size: + */ +- if (end >= inode->v.i_size && +- (!(mode & FALLOC_FL_KEEP_SIZE) || +- (mode & FALLOC_FL_ZERO_RANGE))) { ++ if (ret && ++ !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE))) ++ return ret; + +- /* +- * Sync existing appends before extending i_size, +- * as in bch2_extend(): +- */ +- ret = filemap_write_and_wait_range(mapping, +- inode->ei_inode.bi_size, S64_MAX); +- if (ret) +- goto err; ++ if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) ++ end = inode->v.i_size; + +- if (mode & FALLOC_FL_KEEP_SIZE) +- end = inode->v.i_size; +- else +- i_size_write(&inode->v, end); ++ if (end >= inode->v.i_size && ++ (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || ++ !(mode & FALLOC_FL_KEEP_SIZE))) { ++ spin_lock(&inode->v.i_lock); ++ i_size_write(&inode->v, end); ++ spin_unlock(&inode->v.i_lock); + + mutex_lock(&inode->ei_update_lock); +- ret = bch2_write_inode_size(c, inode, end, 0); ++ ret2 = bch2_write_inode_size(c, inode, end, 0); + mutex_unlock(&inode->ei_update_lock); + } +-err: +- bch2_pagecache_block_put(&inode->ei_pagecache_lock); +- inode_unlock(&inode->v); +- return ret; ++ ++ return ret ?: ret2; + } + + long bch2_fallocate_dispatch(struct file *file, int mode, +@@ -2918,6 +2910,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode, + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; + ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) + ret = bchfs_fallocate(inode, mode, offset, len); + else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) +@@ -2929,6 +2925,9 @@ long bch2_fallocate_dispatch(struct file *file, int mode, + else + ret = -EOPNOTSUPP; + ++ ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); + percpu_ref_put(&c->writes); + + return ret; +-- +cgit v1.2.3 + + +From a45bf96019d36118d6e276e885730af8a78a7c9f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 9 Nov 2021 17:20:06 -0500 +Subject: bcachefs: Inode updates should generally be BTREE_INSERT_NOFAIL + +This fixes a bug where i_size may become inconsistent between the VFS +cache and the btree, when the filesystem is nearly full. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/reflink.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index d003f4088dfc..22230f82b8b9 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -347,7 +347,8 @@ s64 bch2_remap_range(struct bch_fs *c, + inode_u.bi_size < new_i_size) { + inode_u.bi_size = new_i_size; + ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: +- bch2_trans_commit(&trans, NULL, NULL, 0); ++ bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); + } + + bch2_trans_iter_exit(&trans, &inode_iter); +-- +cgit v1.2.3 + + +From ae00e9c6507aefad3c5ddf31feb71d7e132f86f0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 10 Nov 2021 05:33:39 -0500 +Subject: Revert "block: rewrite bio_copy_data_iter to use bvec_kmap_local and + memcpy_to_bvec" + +This reverts commit f8b679a070c536600c64a78c83b96aa617f8fa71. +--- + block/bio.c | 28 ++++++++++++++++++++-------- + 1 file changed, 20 insertions(+), 8 deletions(-) + +diff --git a/block/bio.c b/block/bio.c +index f608f01cc60d..32c7e54ef604 100644 +--- a/block/bio.c ++++ b/block/bio.c +@@ -1279,15 +1279,27 @@ EXPORT_SYMBOL(__bio_advance); + void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + struct bio *src, struct bvec_iter *src_iter) + { ++ struct bio_vec src_bv, dst_bv; ++ void *src_p, *dst_p; ++ unsigned bytes; ++ + while (src_iter->bi_size && dst_iter->bi_size) { +- struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); +- struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); +- unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); +- void *src_buf; +- +- src_buf = bvec_kmap_local(&src_bv); +- memcpy_to_bvec(&dst_bv, src_buf); +- kunmap_local(src_buf); ++ src_bv = bio_iter_iovec(src, *src_iter); ++ dst_bv = bio_iter_iovec(dst, *dst_iter); ++ ++ bytes = min(src_bv.bv_len, dst_bv.bv_len); ++ ++ src_p = kmap_atomic(src_bv.bv_page); ++ dst_p = kmap_atomic(dst_bv.bv_page); ++ ++ memcpy(dst_p + dst_bv.bv_offset, ++ src_p + src_bv.bv_offset, ++ bytes); ++ ++ kunmap_atomic(dst_p); ++ kunmap_atomic(src_p); ++ ++ flush_dcache_page(dst_bv.bv_page); + + bio_advance_iter_single(src, src_iter, bytes); + bio_advance_iter_single(dst, dst_iter, bytes); +-- +cgit v1.2.3 + + +From aa20b3dca73ada4f55b9e07cd15c29a29dd1178c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 11 Nov 2021 13:02:03 -0500 +Subject: bcachefs: Don't check for -ENOSPC in page writeback + +If at all possible we'd prefer to not fail page writeback unless the +filesystem has been shutdown; allowing errors in page writeback means +things we'd like to assert about i_size consistency between the VFS and +the btree go out the window. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 29 ++++++++++++++--------------- + 1 file changed, 14 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index ccff19eb5a89..f6970a3318f7 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1181,16 +1181,16 @@ static int __bch2_writepage(struct page *page, + do_io: + s = bch2_page_state_create(page, __GFP_NOFAIL); + +- ret = bch2_get_page_disk_reservation(c, inode, page, true); +- if (ret) { +- SetPageError(page); +- mapping_set_error(page->mapping, ret); +- unlock_page(page); +- return 0; +- } ++ /* ++ * Things get really hairy with errors during writeback: ++ */ ++ ret = bch2_get_page_disk_reservation(c, inode, page, false); ++ BUG_ON(ret); + + /* Before unlocking the page, get copy of reservations: */ ++ spin_lock(&s->lock); + orig = *s; ++ spin_unlock(&s->lock); + + for (i = 0; i < PAGE_SECTORS; i++) { + if (s->s[i].state < SECTOR_DIRTY) +@@ -1223,7 +1223,7 @@ do_io: + + offset = 0; + while (1) { +- unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; ++ unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; + u64 sector; + + while (offset < PAGE_SECTORS && +@@ -1233,16 +1233,15 @@ do_io: + if (offset == PAGE_SECTORS) + break; + +- sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; +- + while (offset + sectors < PAGE_SECTORS && +- orig.s[offset + sectors].state >= SECTOR_DIRTY) ++ orig.s[offset + sectors].state >= SECTOR_DIRTY) { ++ reserved_sectors += orig.s[offset + sectors].replicas_reserved; ++ dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; + sectors++; +- +- for (i = offset; i < offset + sectors; i++) { +- reserved_sectors += orig.s[i].replicas_reserved; +- dirty_sectors += orig.s[i].state == SECTOR_DIRTY; + } ++ BUG_ON(!sectors); ++ ++ sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; + + if (w->io && + (w->io->op.res.nr_replicas != nr_replicas_this_write || +-- +cgit v1.2.3 + + +From df4b2f41f11d3259fe92ace229e1c4619d97b5b3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 11 Nov 2021 15:50:22 -0500 +Subject: bcachefs: Fix infinite loop in bch2_btree_cache_scan() + +When attempting to free btree nodes, we might not be able to free all +the nodes that were requested. But the code was looping until it had +freed _all_ the nodes requested, when it should have only been +attempting to free nr nodes. + +Also, tweak journal reclaim to ensure the btree node cache isn't more +than half dirty so that memory reclaim can always make progress - the +same as we do for the btree key cache. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 4 ++-- + fs/bcachefs/journal_reclaim.c | 3 +++ + 2 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index e894b8cab7af..d31aedb49416 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -310,7 +310,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + + touched++; + +- if (freed >= nr) ++ if (touched >= nr) + break; + + if (!btree_node_reclaim(c, b)) { +@@ -324,7 +324,7 @@ restart: + list_for_each_entry_safe(b, t, &bc->live, list) { + touched++; + +- if (freed >= nr) { ++ if (touched >= nr) { + /* Save position */ + if (&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index a93f5b189248..ca482c6743c3 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -646,6 +646,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + if (fifo_free(&j->pin) <= 32) + min_nr = 1; + ++ if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) ++ min_nr = 1; ++ + trace_journal_reclaim_start(c, + min_nr, + j->prereserved.reserved, +-- +cgit v1.2.3 + + +From bfa49e5739030368687f7c14046543be50a175c5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 13 Nov 2021 12:57:00 -0500 +Subject: bcachefs: Fix an exiting of uninitialized iterator + +bch2_dirent_lookup had an error path where we'd exit a btree_iter that +hadn't been properly initialized. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 11 +++++------ + 1 file changed, 5 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 9267eea810f8..5db1426faaf3 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -418,16 +418,15 @@ int __bch2_dirent_lookup_trans(struct btree_trans *trans, + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); +- if (ret) { +- bch2_trans_iter_exit(trans, iter); +- return ret; +- } ++ if (ret) ++ goto err; + + d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, inum); + if (ret > 0) + ret = -ENOENT; ++err: + if (ret) + bch2_trans_iter_exit(trans, iter); + +@@ -448,10 +447,10 @@ retry: + + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + name, inum, 0); +- +- bch2_trans_iter_exit(&trans, &iter); + if (ret == -EINTR) + goto retry; ++ if (!ret) ++ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; + } +-- +cgit v1.2.3 + + +From 5f9237e0b33d8b04fde5269719ad6e2c850b097a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 13 Nov 2021 13:36:26 -0500 +Subject: bcachefs: Tweak vfs cache shrinker behaviour + +In bcachefs, inodes and dentries are also cached - more compactly - by +the btree node cache, they don't require seeks to recreate. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index aad0cdb14282..7f0ecee411a7 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1668,6 +1668,8 @@ got_sb: + sb->s_flags |= SB_POSIXACL; + #endif + ++ sb->s_shrink.seeks = 0; ++ + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); + if (IS_ERR(vinode)) { + bch_err(c, "error mounting: error getting root inode %i", +-- +cgit v1.2.3 + + +From 915652547ff0a63dbdb60e9b4f9ae0beff08a79b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 11 Nov 2021 12:11:33 -0500 +Subject: bcachefs: More enum strings + +This patch converts more enums in the on disk format to our standard +x-macro-with-strings deal - to enable better pretty-printing. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 60 ++++++++++++++++++++++---------------- + fs/bcachefs/checksum.c | 68 +++++++++++++++++++++---------------------- + fs/bcachefs/checksum.h | 20 ++++++------- + fs/bcachefs/ec.c | 2 +- + fs/bcachefs/extents.c | 6 ++-- + fs/bcachefs/io.c | 2 +- + fs/bcachefs/opts.c | 15 ++++++++++ + fs/bcachefs/opts.h | 5 +++- + fs/bcachefs/str_hash.h | 34 +++++++++++----------- + 9 files changed, 120 insertions(+), 92 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 52212ad1682f..b115bd1fa5a3 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1443,7 +1443,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); + * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist + * reflink: gates KEY_TYPE_reflink + * inline_data: gates KEY_TYPE_inline_data +- * new_siphash: gates BCH_STR_HASH_SIPHASH ++ * new_siphash: gates BCH_STR_HASH_siphash + * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE + */ + #define BCH_SB_FEATURES() \ +@@ -1519,12 +1519,17 @@ enum bch_error_actions { + BCH_ON_ERROR_NR + }; + ++#define BCH_STR_HASH_TYPES() \ ++ x(crc32c, 0) \ ++ x(crc64, 1) \ ++ x(siphash_old, 2) \ ++ x(siphash, 3) ++ + enum bch_str_hash_type { +- BCH_STR_HASH_CRC32C = 0, +- BCH_STR_HASH_CRC64 = 1, +- BCH_STR_HASH_SIPHASH_OLD = 2, +- BCH_STR_HASH_SIPHASH = 3, +- BCH_STR_HASH_NR = 4, ++#define x(t, n) BCH_STR_HASH_##t = n, ++ BCH_STR_HASH_TYPES() ++#undef x ++ BCH_STR_HASH_NR + }; + + #define BCH_STR_HASH_OPTS() \ +@@ -1539,34 +1544,39 @@ enum bch_str_hash_opts { + BCH_STR_HASH_OPT_NR + }; + ++#define BCH_CSUM_TYPES() \ ++ x(none, 0) \ ++ x(crc32c_nonzero, 1) \ ++ x(crc64_nonzero, 2) \ ++ x(chacha20_poly1305_80, 3) \ ++ x(chacha20_poly1305_128, 4) \ ++ x(crc32c, 5) \ ++ x(crc64, 6) \ ++ x(xxhash, 7) ++ + enum bch_csum_type { +- BCH_CSUM_NONE = 0, +- BCH_CSUM_CRC32C_NONZERO = 1, +- BCH_CSUM_CRC64_NONZERO = 2, +- BCH_CSUM_CHACHA20_POLY1305_80 = 3, +- BCH_CSUM_CHACHA20_POLY1305_128 = 4, +- BCH_CSUM_CRC32C = 5, +- BCH_CSUM_CRC64 = 6, +- BCH_CSUM_XXHASH = 7, +- BCH_CSUM_NR = 8, ++#define x(t, n) BCH_CSUM_##t = n, ++ BCH_CSUM_TYPES() ++#undef x ++ BCH_CSUM_NR + }; + + static const unsigned bch_crc_bytes[] = { +- [BCH_CSUM_NONE] = 0, +- [BCH_CSUM_CRC32C_NONZERO] = 4, +- [BCH_CSUM_CRC32C] = 4, +- [BCH_CSUM_CRC64_NONZERO] = 8, +- [BCH_CSUM_CRC64] = 8, +- [BCH_CSUM_XXHASH] = 8, +- [BCH_CSUM_CHACHA20_POLY1305_80] = 10, +- [BCH_CSUM_CHACHA20_POLY1305_128] = 16, ++ [BCH_CSUM_none] = 0, ++ [BCH_CSUM_crc32c_nonzero] = 4, ++ [BCH_CSUM_crc32c] = 4, ++ [BCH_CSUM_crc64_nonzero] = 8, ++ [BCH_CSUM_crc64] = 8, ++ [BCH_CSUM_xxhash] = 8, ++ 
[BCH_CSUM_chacha20_poly1305_80] = 10, ++ [BCH_CSUM_chacha20_poly1305_128] = 16, + }; + + static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) + { + switch (type) { +- case BCH_CSUM_CHACHA20_POLY1305_80: +- case BCH_CSUM_CHACHA20_POLY1305_128: ++ case BCH_CSUM_chacha20_poly1305_80: ++ case BCH_CSUM_chacha20_poly1305_128: + return true; + default: + return false; +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +index d20924e579bf..fbe8603cfb30 100644 +--- a/fs/bcachefs/checksum.c ++++ b/fs/bcachefs/checksum.c +@@ -35,18 +35,18 @@ struct bch2_checksum_state { + static void bch2_checksum_init(struct bch2_checksum_state *state) + { + switch (state->type) { +- case BCH_CSUM_NONE: +- case BCH_CSUM_CRC32C: +- case BCH_CSUM_CRC64: ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_crc64: + state->seed = 0; + break; +- case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_crc32c_nonzero: + state->seed = U32_MAX; + break; +- case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_crc64_nonzero: + state->seed = U64_MAX; + break; +- case BCH_CSUM_XXHASH: ++ case BCH_CSUM_xxhash: + xxh64_reset(&state->h64state, 0); + break; + default: +@@ -57,15 +57,15 @@ static void bch2_checksum_init(struct bch2_checksum_state *state) + static u64 bch2_checksum_final(const struct bch2_checksum_state *state) + { + switch (state->type) { +- case BCH_CSUM_NONE: +- case BCH_CSUM_CRC32C: +- case BCH_CSUM_CRC64: ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_crc64: + return state->seed; +- case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_crc32c_nonzero: + return state->seed ^ U32_MAX; +- case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_crc64_nonzero: + return state->seed ^ U64_MAX; +- case BCH_CSUM_XXHASH: ++ case BCH_CSUM_xxhash: + return xxh64_digest(&state->h64state); + default: + BUG(); +@@ -75,17 +75,17 @@ static u64 bch2_checksum_final(const struct bch2_checksum_state *state) + static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) + { + switch (state->type) { +- case BCH_CSUM_NONE: ++ case BCH_CSUM_none: + return; +- case BCH_CSUM_CRC32C_NONZERO: +- case BCH_CSUM_CRC32C: ++ case BCH_CSUM_crc32c_nonzero: ++ case BCH_CSUM_crc32c: + state->seed = crc32c(state->seed, data, len); + break; +- case BCH_CSUM_CRC64_NONZERO: +- case BCH_CSUM_CRC64: ++ case BCH_CSUM_crc64_nonzero: ++ case BCH_CSUM_crc64: + state->seed = crc64_be(state->seed, data, len); + break; +- case BCH_CSUM_XXHASH: ++ case BCH_CSUM_xxhash: + xxh64_update(&state->h64state, data, len); + break; + default: +@@ -161,12 +161,12 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, + struct nonce nonce, const void *data, size_t len) + { + switch (type) { +- case BCH_CSUM_NONE: +- case BCH_CSUM_CRC32C_NONZERO: +- case BCH_CSUM_CRC64_NONZERO: +- case BCH_CSUM_CRC32C: +- case BCH_CSUM_XXHASH: +- case BCH_CSUM_CRC64: { ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c_nonzero: ++ case BCH_CSUM_crc64_nonzero: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_xxhash: ++ case BCH_CSUM_crc64: { + struct bch2_checksum_state state; + + state.type = type; +@@ -177,8 +177,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, + return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; + } + +- case BCH_CSUM_CHACHA20_POLY1305_80: +- case BCH_CSUM_CHACHA20_POLY1305_128: { ++ case BCH_CSUM_chacha20_poly1305_80: ++ case BCH_CSUM_chacha20_poly1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; 
+@@ -212,13 +212,13 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct bio_vec bv; + + switch (type) { +- case BCH_CSUM_NONE: ++ case BCH_CSUM_none: + return (struct bch_csum) { 0 }; +- case BCH_CSUM_CRC32C_NONZERO: +- case BCH_CSUM_CRC64_NONZERO: +- case BCH_CSUM_CRC32C: +- case BCH_CSUM_XXHASH: +- case BCH_CSUM_CRC64: { ++ case BCH_CSUM_crc32c_nonzero: ++ case BCH_CSUM_crc64_nonzero: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_xxhash: ++ case BCH_CSUM_crc64: { + struct bch2_checksum_state state; + + state.type = type; +@@ -238,8 +238,8 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, + return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; + } + +- case BCH_CSUM_CHACHA20_POLY1305_80: +- case BCH_CSUM_CHACHA20_POLY1305_128: { ++ case BCH_CSUM_chacha20_poly1305_80: ++ case BCH_CSUM_chacha20_poly1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +index 6841fb16568a..f5c1a609c5c4 100644 +--- a/fs/bcachefs/checksum.h ++++ b/fs/bcachefs/checksum.h +@@ -13,9 +13,9 @@ static inline bool bch2_checksum_mergeable(unsigned type) + { + + switch (type) { +- case BCH_CSUM_NONE: +- case BCH_CSUM_CRC32C: +- case BCH_CSUM_CRC64: ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_crc64: + return true; + default: + return false; +@@ -78,13 +78,13 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, + { + switch (type) { + case BCH_CSUM_OPT_none: +- return BCH_CSUM_NONE; ++ return BCH_CSUM_none; + case BCH_CSUM_OPT_crc32c: +- return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; ++ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; + case BCH_CSUM_OPT_crc64: +- return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; ++ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; + case BCH_CSUM_OPT_xxhash: +- return BCH_CSUM_XXHASH; ++ return BCH_CSUM_xxhash; + default: + BUG(); + } +@@ -95,8 +95,8 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, + { + if (c->sb.encryption_type) + return c->opts.wide_macs +- ? BCH_CSUM_CHACHA20_POLY1305_128 +- : BCH_CSUM_CHACHA20_POLY1305_80; ++ ? 
BCH_CSUM_chacha20_poly1305_128 ++ : BCH_CSUM_chacha20_poly1305_80; + + return bch2_csum_opt_to_type(opt, true); + } +@@ -104,7 +104,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, + static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) + { + if (c->sb.encryption_type) +- return BCH_CSUM_CHACHA20_POLY1305_128; ++ return BCH_CSUM_chacha20_poly1305_128; + + return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index bfa512d78538..bca1b8a7b673 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1154,7 +1154,7 @@ static void ec_stripe_key_init(struct bch_fs *c, + s->v.nr_blocks = nr_data + nr_parity; + s->v.nr_redundant = nr_parity; + s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); +- s->v.csum_type = BCH_CSUM_CRC32C; ++ s->v.csum_type = BCH_CSUM_crc32c; + s->v.pad = 0; + + while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 194fbe21c97f..89b5be907eea 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -969,12 +969,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + +- pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", ++ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, +- crc.csum_type, +- crc.compression_type); ++ bch2_csum_types[crc.csum_type], ++ bch2_compression_types[crc.compression_type]); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 3026daa7f9c2..3a6b4446706d 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -2039,7 +2039,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || +- (pick.crc.csum_type != BCH_CSUM_NONE && ++ (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_USER_MAPPED)) || +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index ff99c6d24abd..a955ef2008c9 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -31,17 +31,32 @@ const char * const bch2_btree_ids[] = { + NULL + }; + ++const char * const bch2_csum_types[] = { ++ BCH_CSUM_TYPES() ++ NULL ++}; ++ + const char * const bch2_csum_opts[] = { + BCH_CSUM_OPTS() + NULL + }; + ++const char * const bch2_compression_types[] = { ++ BCH_COMPRESSION_TYPES() ++ NULL ++}; ++ + const char * const bch2_compression_opts[] = { + BCH_COMPRESSION_OPTS() + NULL + }; + + const char * const bch2_str_hash_types[] = { ++ BCH_STR_HASH_TYPES() ++ NULL ++}; ++ ++const char * const bch2_str_hash_opts[] = { + BCH_STR_HASH_OPTS() + NULL + }; +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index b60bdfca27fd..5d9c00af5973 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -12,9 +12,12 @@ extern const char * const bch2_error_actions[]; + extern const char * const bch2_sb_features[]; + extern const char * const bch2_sb_compat[]; + extern const char * const bch2_btree_ids[]; ++extern const char * const bch2_csum_types[]; + extern const char * const bch2_csum_opts[]; ++extern const char * const 
bch2_compression_types[]; + extern const char * const bch2_compression_opts[]; + extern const char * const bch2_str_hash_types[]; ++extern const char * const bch2_str_hash_opts[]; + extern const char * const bch2_data_types[]; + extern const char * const bch2_cache_replacement_policies[]; + extern const char * const bch2_member_states[]; +@@ -140,7 +143,7 @@ enum opt_type { + NULL, NULL) \ + x(str_hash, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +- OPT_STR(bch2_str_hash_types), \ ++ OPT_STR(bch2_str_hash_opts), \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ + NULL, "Hash function for directory entries and xattrs")\ + x(metadata_target, u16, \ +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 789dde7c6ac6..57d636740d2f 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -20,13 +20,13 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) + { + switch (opt) { + case BCH_STR_HASH_OPT_crc32c: +- return BCH_STR_HASH_CRC32C; ++ return BCH_STR_HASH_crc32c; + case BCH_STR_HASH_OPT_crc64: +- return BCH_STR_HASH_CRC64; ++ return BCH_STR_HASH_crc64; + case BCH_STR_HASH_OPT_siphash: + return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) +- ? BCH_STR_HASH_SIPHASH +- : BCH_STR_HASH_SIPHASH_OLD; ++ ? BCH_STR_HASH_siphash ++ : BCH_STR_HASH_siphash_old; + default: + BUG(); + } +@@ -51,7 +51,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) + .siphash_key = { .k0 = bi->bi_hash_seed } + }; + +- if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { ++ if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { + SHASH_DESC_ON_STACK(desc, c->sha256); + u8 digest[SHA256_DIGEST_SIZE]; + +@@ -77,16 +77,16 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) + { + switch (info->type) { +- case BCH_STR_HASH_CRC32C: ++ case BCH_STR_HASH_crc32c: + ctx->crc32c = crc32c(~0, &info->siphash_key.k0, + sizeof(info->siphash_key.k0)); + break; +- case BCH_STR_HASH_CRC64: ++ case BCH_STR_HASH_crc64: + ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, + sizeof(info->siphash_key.k0)); + break; +- case BCH_STR_HASH_SIPHASH_OLD: +- case BCH_STR_HASH_SIPHASH: ++ case BCH_STR_HASH_siphash_old: ++ case BCH_STR_HASH_siphash: + SipHash24_Init(&ctx->siphash, &info->siphash_key); + break; + default: +@@ -99,14 +99,14 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, + const void *data, size_t len) + { + switch (info->type) { +- case BCH_STR_HASH_CRC32C: ++ case BCH_STR_HASH_crc32c: + ctx->crc32c = crc32c(ctx->crc32c, data, len); + break; +- case BCH_STR_HASH_CRC64: ++ case BCH_STR_HASH_crc64: + ctx->crc64 = crc64_be(ctx->crc64, data, len); + break; +- case BCH_STR_HASH_SIPHASH_OLD: +- case BCH_STR_HASH_SIPHASH: ++ case BCH_STR_HASH_siphash_old: ++ case BCH_STR_HASH_siphash: + SipHash24_Update(&ctx->siphash, data, len); + break; + default: +@@ -118,12 +118,12 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) + { + switch (info->type) { +- case BCH_STR_HASH_CRC32C: ++ case BCH_STR_HASH_crc32c: + return ctx->crc32c; +- case BCH_STR_HASH_CRC64: ++ case BCH_STR_HASH_crc64: + return ctx->crc64 >> 1; +- case BCH_STR_HASH_SIPHASH_OLD: +- case BCH_STR_HASH_SIPHASH: ++ case BCH_STR_HASH_siphash_old: ++ case BCH_STR_HASH_siphash: + return SipHash24_End(&ctx->siphash) >> 1; + default: + BUG(); +-- +cgit v1.2.3 + + +From 6ec168b97d1e947dcdfb04163c0592fb4199ee9a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 13 
Nov 2021 17:44:13 -0500 +Subject: bcachefs: Improve bch2_reflink_p_to_text() + +.to_text methods generally ought to print all the value fields. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/reflink.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 22230f82b8b9..8dcac7815c9f 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -44,7 +44,10 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + +- pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); ++ pr_buf(out, "idx %llu front_pad %u back_pad %u", ++ le64_to_cpu(p.v->idx), ++ le32_to_cpu(p.v->front_pad), ++ le32_to_cpu(p.v->back_pad)); + } + + bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +-- +cgit v1.2.3 + + +From 99ae3988e7dc6f6e686eead0880d44b667f03fe8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 13 Nov 2021 17:53:55 -0500 +Subject: bcachefs: Convert journal BUG_ON() to a warning + +It's definitely indicative of a bug if we request to flush a journal +sequence number that hasn't happened yet, but it's more useful if we +warn and print out the relevant sequence numbers instead of just dying. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index a2b26d5b5236..f5d5cbb22da9 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -551,7 +551,10 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + + spin_lock(&j->lock); + +- BUG_ON(seq > journal_cur_seq(j)); ++ if (WARN_ONCE(seq > journal_cur_seq(j), ++ "requested to flush journal seq %llu, but currently at %llu", ++ seq, journal_cur_seq(j))) ++ goto out; + + /* Recheck under lock: */ + if (j->err_seq && seq >= j->err_seq) { +-- +cgit v1.2.3 + + +From 2e80bfa1c9f1ab03b887511e2634415ccaa1a707 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 13 Nov 2021 17:57:52 -0500 +Subject: bcachefs: Fix missing field initialization + +When unpacking v1 inodes, we were failing to initialize the journal_seq +field, leading to a BUG_ON() when fsync tries to flush a garbage journal +sequence number. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/inode.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 968ccffeba4f..ffce68a80490 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -220,6 +220,7 @@ int bch2_inode_unpack(struct bkey_s_c k, + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + + unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_journal_seq= 0; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); +-- +cgit v1.2.3 + + +From e0d56ef16576c597ce372d907fe178e286923613 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 26 Oct 2021 17:35:58 -0400 +Subject: bcachefs: Refactor journal replay code + +This consolidates duplicated code in journal replay - it's only a few +flags that are different for replaying alloc keys. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 53 ++++++++++++++++---------------------------------- + 1 file changed, 17 insertions(+), 36 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index d8e511a0664e..373e309299bb 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -518,57 +518,38 @@ static void replay_now_at(struct journal *j, u64 seq) + } + + static int __bch2_journal_replay_key(struct btree_trans *trans, +- enum btree_id id, unsigned level, +- struct bkey_i *k) ++ struct journal_key *k) + { + struct btree_iter iter; ++ unsigned iter_flags = ++ BTREE_ITER_INTENT| ++ BTREE_ITER_NOT_EXTENTS; + int ret; + +- bch2_trans_node_iter_init(trans, &iter, id, k->k.p, +- BTREE_MAX_DEPTH, level, +- BTREE_ITER_INTENT| +- BTREE_ITER_NOT_EXTENTS); ++ if (!k->level && k->btree_id == BTREE_ID_alloc) ++ iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; ++ ++ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, ++ BTREE_MAX_DEPTH, k->level, ++ iter_flags); + ret = bch2_btree_iter_traverse(&iter) ?: +- bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_exit(trans, &iter); + return ret; + } + + static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) + { +- unsigned commit_flags = BTREE_INSERT_NOFAIL| +- BTREE_INSERT_LAZY_RW; ++ unsigned commit_flags = ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_JOURNAL_RESERVED; + + if (!k->allocated) + commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; + + return bch2_trans_do(c, NULL, NULL, commit_flags, +- __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k)); +-} +- +-static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) +-{ +- struct btree_iter iter; +- int ret; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, k->k.p, +- BTREE_ITER_CACHED| +- BTREE_ITER_CACHED_NOFILL| +- BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(&iter) ?: +- bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN); +- bch2_trans_iter_exit(trans, &iter); +- return ret; +-} +- +-static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) +-{ +- return bch2_trans_do(c, NULL, NULL, +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE| +- BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_JOURNAL_REPLAY, +- __bch2_alloc_replay_key(&trans, k)); ++ __bch2_journal_replay_key(&trans, k)); + } + + static int journal_sort_seq_cmp(const void *_l, const void *_r) +@@ -606,7 +587,7 @@ static int bch2_journal_replay(struct bch_fs *c, + + if (!i->level && i->btree_id == BTREE_ID_alloc) { + j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; +- ret = bch2_alloc_replay_key(c, i->k); ++ ret = bch2_journal_replay_key(c, i); + if (ret) + goto err; + } +-- +cgit v1.2.3 + + +From 9ffc11f9ad41d4bcaa631750a8e4375638f36ae1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 13 Nov 2021 19:49:14 -0500 +Subject: bcachefs: Update export_operations for snapshots + +When support for snapshots was merged, export operations weren't +updated yet. This patch adds new filehandle types for bcachefs that +include the subvolume ID and updates export operations for subvolumes - +and also .get_parent, support for which was added just prior to +snapshots. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 4 +- + fs/bcachefs/dirent.h | 3 + + fs/bcachefs/fs.c | 230 ++++++++++++++++++++++++++++++++++++++++++----- + include/linux/exportfs.h | 6 ++ + 4 files changed, 218 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 5db1426faaf3..4dfcc955675b 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -197,8 +197,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, + dst->v.d_type = src.v->d_type; + } + +-static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, +- struct bkey_s_c_dirent d, subvol_inum *target) ++int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, ++ struct bkey_s_c_dirent d, subvol_inum *target) + { + struct bch_subvolume s; + int ret = 0; +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index 8ae407765fe4..1bb4d802bc1d 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -29,6 +29,9 @@ static inline unsigned dirent_val_u64s(unsigned len) + sizeof(u64)); + } + ++int bch2_dirent_read_target(struct btree_trans *, subvol_inum, ++ struct bkey_s_c_dirent, subvol_inum *); ++ + int bch2_dirent_create(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, u8, + const struct qstr *, u64, u64 *, int); +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 7f0ecee411a7..7f7405914c3b 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1123,46 +1123,230 @@ static const struct address_space_operations bch_address_space_operations = { + .error_remove_page = generic_error_remove_page, + }; + +-#if 0 +-static struct inode *bch2_nfs_get_inode(struct super_block *sb, +- u64 ino, u32 generation) ++struct bcachefs_fid { ++ u64 inum; ++ u32 subvol; ++ u32 gen; ++} __packed; ++ ++struct bcachefs_fid_with_parent { ++ struct bcachefs_fid fid; ++ struct bcachefs_fid dir; ++} __packed; ++ ++static int bcachefs_fid_valid(int fh_len, int fh_type) + { +- struct bch_fs *c = sb->s_fs_info; +- struct inode *vinode; ++ switch (fh_type) { ++ case FILEID_BCACHEFS_WITHOUT_PARENT: ++ return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); ++ case FILEID_BCACHEFS_WITH_PARENT: ++ return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); ++ default: ++ return false; ++ } ++} ++ ++static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) ++{ ++ return (struct bcachefs_fid) { ++ .inum = inode->ei_inode.bi_inum, ++ .subvol = inode->ei_subvol, ++ .gen = inode->ei_inode.bi_generation, ++ }; ++} ++ ++static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, ++ struct inode *vdir) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ ++ if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) ++ return FILEID_INVALID; ++ ++ if (!S_ISDIR(inode->v.i_mode) && dir) { ++ struct bcachefs_fid_with_parent *fid = (void *) fh; ++ ++ fid->fid = bch2_inode_to_fid(inode); ++ fid->dir = bch2_inode_to_fid(dir); ++ ++ *len = sizeof(*fid) / sizeof(u32); ++ return FILEID_BCACHEFS_WITH_PARENT; ++ } else { ++ struct bcachefs_fid *fid = (void *) fh; + +- if (ino < BCACHEFS_ROOT_INO) +- return ERR_PTR(-ESTALE); ++ *fid = bch2_inode_to_fid(inode); + +- vinode = bch2_vfs_inode_get(c, ino); +- if (IS_ERR(vinode)) +- return ERR_CAST(vinode); +- if (generation && vinode->i_generation != generation) { +- /* we didn't find the right inode.. 
*/ ++ *len = sizeof(*fid) / sizeof(u32); ++ return FILEID_BCACHEFS_WITHOUT_PARENT; ++ } ++} ++ ++static struct inode *bch2_nfs_get_inode(struct super_block *sb, ++ struct bcachefs_fid fid) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { ++ .subvol = fid.subvol, ++ .inum = fid.inum, ++ }); ++ if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { + iput(vinode); +- return ERR_PTR(-ESTALE); ++ vinode = ERR_PTR(-ESTALE); + } + return vinode; + } + +-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, ++static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, + int fh_len, int fh_type) + { +- return generic_fh_to_dentry(sb, fid, fh_len, fh_type, +- bch2_nfs_get_inode); ++ struct bcachefs_fid *fid = (void *) _fid; ++ ++ if (!bcachefs_fid_valid(fh_len, fh_type)) ++ return NULL; ++ ++ return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); + } + +-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, ++static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, + int fh_len, int fh_type) + { +- return generic_fh_to_parent(sb, fid, fh_len, fh_type, +- bch2_nfs_get_inode); ++ struct bcachefs_fid_with_parent *fid = (void *) _fid; ++ ++ if (!bcachefs_fid_valid(fh_len, fh_type) || ++ fh_type != FILEID_BCACHEFS_WITH_PARENT) ++ return NULL; ++ ++ return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); ++} ++ ++static struct dentry *bch2_get_parent(struct dentry *child) ++{ ++ struct bch_inode_info *inode = to_bch_ei(child->d_inode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ subvol_inum parent_inum = { ++ .subvol = inode->ei_inode.bi_parent_subvol ?: ++ inode->ei_subvol, ++ .inum = inode->ei_inode.bi_dir, ++ }; ++ ++ if (!parent_inum.inum) ++ return NULL; ++ ++ return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); ++} ++ ++static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) ++{ ++ struct bch_inode_info *inode = to_bch_ei(child->d_inode); ++ struct bch_inode_info *dir = to_bch_ei(parent->d_inode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter1; ++ struct btree_iter iter2; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ struct bch_inode_unpacked inode_u; ++ subvol_inum target; ++ u32 snapshot; ++ unsigned name_len; ++ int ret; ++ ++ if (!S_ISDIR(dir->v.i_mode)) ++ return -EINVAL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, ++ POS(dir->ei_inode.bi_inum, 0), 0); ++ bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, ++ POS(dir->ei_inode.bi_inum, 0), 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_btree_iter_set_snapshot(&iter1, snapshot); ++ bch2_btree_iter_set_snapshot(&iter2, snapshot); ++ ++ ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); ++ if (ret) ++ goto err; ++ ++ if (inode_u.bi_dir == dir->ei_inode.bi_inum) { ++ bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); ++ ++ k = bch2_btree_iter_peek_slot(&iter1); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_dirent) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ d = bkey_s_c_to_dirent(k); ++ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); ++ if (ret > 0) ++ ret = -ENOENT; ++ if (ret) ++ goto err; ++ ++ if (target.subvol == 
inode->ei_subvol && ++ target.inum == inode->ei_inode.bi_inum) ++ goto found; ++ } else { ++ /* ++ * File with multiple hardlinks and our backref is to the wrong ++ * directory - linear search: ++ */ ++ for_each_btree_key_continue_norestart(iter2, 0, k, ret) { ++ if (k.k->p.inode > dir->ei_inode.bi_inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ d = bkey_s_c_to_dirent(k); ++ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); ++ if (ret < 0) ++ break; ++ if (ret) ++ continue; ++ ++ if (target.subvol == inode->ei_subvol && ++ target.inum == inode->ei_inode.bi_inum) ++ goto found; ++ } ++ } ++ ++ ret = -ENOENT; ++ goto err; ++found: ++ name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); ++ ++ memcpy(name, d.v->d_name, name_len); ++ name[name_len] = '\0'; ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ bch2_trans_iter_exit(&trans, &iter1); ++ bch2_trans_iter_exit(&trans, &iter2); ++ bch2_trans_exit(&trans); ++ ++ return ret; + } +-#endif + + static const struct export_operations bch_export_ops = { +- //.fh_to_dentry = bch2_fh_to_dentry, +- //.fh_to_parent = bch2_fh_to_parent, +- //.get_parent = bch2_get_parent, ++ .encode_fh = bch2_encode_fh, ++ .fh_to_dentry = bch2_fh_to_dentry, ++ .fh_to_parent = bch2_fh_to_parent, ++ .get_parent = bch2_get_parent, ++ .get_name = bch2_get_name, + }; + + static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, +diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h +index 3260fe714846..bac82bd72626 100644 +--- a/include/linux/exportfs.h ++++ b/include/linux/exportfs.h +@@ -98,6 +98,12 @@ enum fid_type { + */ + FILEID_FAT_WITH_PARENT = 0x72, + ++ /* ++ * 64 bit inode number, 32 bit subvolume, 32 bit generation number: ++ */ ++ FILEID_BCACHEFS_WITHOUT_PARENT = 0x80, ++ FILEID_BCACHEFS_WITH_PARENT = 0x81, ++ + /* + * 128 bit child FID (struct lu_fid) + * 128 bit parent FID (struct lu_fid) +-- +cgit v1.2.3 + + +From 7af2f80db6dfd2a7089bfc0c5dd3a4b957766fc4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Nov 2021 15:03:06 -0500 +Subject: bcachefs: Also log device name in userspace + +Change log messages in userspace to be closer to what they are in kernel +space, and include the device name - it's also useful in userspace. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 4 ++-- + fs/bcachefs/recovery.c | 3 ++- + 2 files changed, 4 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 131d0f7ba47d..aa9ccc4bc600 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -218,8 +218,8 @@ + #define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) + #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) + #else +-#define bch2_fmt(_c, fmt) fmt "\n" +-#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) ++#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name) ++#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum) + #endif + + #define bch_info(c, fmt, ...) 
\ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 373e309299bb..be8912605527 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1031,6 +1031,8 @@ int bch2_fs_recovery(struct bch_fs *c) + if (c->sb.clean) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); ++ else ++ bch_info(c, "recovering from unclean shutdown"); + + if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { + bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); +@@ -1049,7 +1051,6 @@ int bch2_fs_recovery(struct bch_fs *c) + bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); + ret = -EINVAL; + goto err; +- + } + + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { +-- +cgit v1.2.3 + + +From 39737737b777c8e9e4d7cac88bef1442567f36d4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Nov 2021 15:02:13 -0500 +Subject: bcachefs: Disk space accounting fix on brand-new fs + +The filesystem initialization path first marks superblock and journal +buckets non transactionally, since the btree isn't functional yet. That +path was updating the per-journal-buf percpu counters via +bch2_dev_usage_update(), and updating the wrong set of counters so those +updates didn't get written out until journal entry 4. + +The relevant code is going to get significantly rewritten in the future +as we transition away from the in memory bucket array, so this just +hacks around it for now. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/buckets.c | 11 +++++++++++ + fs/bcachefs/super-io.c | 8 ++++++++ + 3 files changed, 20 insertions(+) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index aa9ccc4bc600..fdf3a777ae16 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -495,6 +495,7 @@ struct bch_dev { + + enum { + /* startup: */ ++ BCH_FS_INITIALIZED, + BCH_FS_ALLOC_READ_DONE, + BCH_FS_ALLOC_CLEAN, + BCH_FS_ALLOCATOR_RUNNING, +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index c3542d3c2eac..92ea698580d9 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -117,6 +117,8 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, + unsigned journal_seq, + bool gc) + { ++ BUG_ON(!gc && !journal_seq); ++ + return this_cpu_ptr(gc + ? ca->usage_gc + : ca->usage[journal_seq & JOURNAL_BUF_MASK]); +@@ -142,6 +144,8 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + unsigned journal_seq, + bool gc) + { ++ BUG_ON(!gc && !journal_seq); ++ + return this_cpu_ptr(gc + ? 
c->usage_gc + : c->usage[journal_seq & JOURNAL_BUF_MASK]); +@@ -360,6 +364,13 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bch_fs_usage *fs_usage; + struct bch_dev_usage *u; + ++ /* ++ * Hack for bch2_fs_initialize path, where we're first marking sb and ++ * journal non-transactionally: ++ */ ++ if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) ++ journal_seq = 1; ++ + percpu_rwsem_assert_held(&c->mark_lock); + + preempt_disable(); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 33d832bc4d4a..802976333ec8 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -441,8 +441,16 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) + + if (BCH_SB_HAS_ERRORS(c->disk_sb.sb)) + set_bit(BCH_FS_ERROR, &c->flags); ++ else ++ clear_bit(BCH_FS_ERROR, &c->flags); ++ + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) + set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); ++ else ++ clear_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); ++ ++ if (BCH_SB_INITIALIZED(c->disk_sb.sb)) ++ set_bit(BCH_FS_INITIALIZED, &c->flags); + + ret = bch2_sb_replicas_to_cpu_replicas(c); + if (ret) +-- +cgit v1.2.3 + + +From 5b050977f6df2416d3384165fdba9faf289db4c4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 27 Oct 2021 12:51:12 -0400 +Subject: bcachefs: Run insert triggers before overwrite triggers + +Currently, btree triggers are run in natural key order, which presents a +problem for fallocate in INSERT_RANGE mode: since we're moving existing +extents to higher offsets, the trigger for deleting the old extent runs +before the trigger that adds the new extent, potentially leading to +indirect extents being deleted that shouldn't be when the delete causes +the refcount to hit 0. + +This changes the order we run triggers so that for a givin btree, we run +all insert triggers before overwrite triggers, nicely sidestepping this +issue. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 3 +- + fs/bcachefs/btree_update_leaf.c | 133 ++++++++++++++++++++++++++++++++-------- + fs/bcachefs/buckets.c | 35 ----------- + fs/bcachefs/buckets.h | 2 - + 4 files changed, 109 insertions(+), 64 deletions(-) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index affc0e681de9..0d0a719f738f 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -338,7 +338,8 @@ struct btree_insert_entry { + enum btree_id btree_id:8; + u8 level; + bool cached:1; +- bool trans_triggers_run:1; ++ bool insert_trigger_run:1; ++ bool overwrite_trigger_run:1; + struct bkey_i *k; + struct btree_path *path; + unsigned long ip_allocated; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 51f65226d3bf..112ac7caf579 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -815,10 +815,112 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + ++static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++{ ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; ++ struct bkey_s_c old; ++ struct bkey unpacked; ++ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; ++ bool trans_trigger_run; ++ unsigned btree_id = 0; ++ int ret = 0; ++ ++ /* ++ * ++ * For a given btree, this algorithm runs insert triggers before ++ * overwrite triggers: this is so that when extents are being moved ++ * (e.g. 
by FALLOCATE_FL_INSERT_RANGE), we don't drop references before ++ * they are re-added. ++ */ ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ while (btree_id_start < trans->updates + trans->nr_updates && ++ btree_id_start->btree_id < btree_id) ++ btree_id_start++; ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ for (i = btree_id_start; ++ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; ++ i++) { ++ if (i->insert_trigger_run || ++ (i->flags & BTREE_TRIGGER_NORUN) || ++ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) ++ continue; ++ ++ BUG_ON(i->overwrite_trigger_run); ++ ++ i->insert_trigger_run = true; ++ trans_trigger_run = true; ++ ++ old = bch2_btree_path_peek_slot(i->path, &unpacked); ++ _deleted.p = i->path->pos; ++ ++ if (old.k->type == i->k->k.type && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ i->overwrite_trigger_run = true; ++ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); ++ } else { ++ ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), ++ BTREE_TRIGGER_INSERT|i->flags); ++ } ++ ++ if (ret == -EINTR) ++ trace_trans_restart_mark(trans->ip, _RET_IP_, ++ i->btree_id, &i->path->pos); ++ if (ret) ++ return ret; ++ } ++ } while (trans_trigger_run); ++ ++ do { ++ trans_trigger_run = false; ++ ++ for (i = btree_id_start; ++ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; ++ i++) { ++ if (i->overwrite_trigger_run || ++ (i->flags & BTREE_TRIGGER_NORUN) || ++ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) ++ continue; ++ ++ BUG_ON(!i->insert_trigger_run); ++ ++ i->overwrite_trigger_run = true; ++ trans_trigger_run = true; ++ ++ old = bch2_btree_path_peek_slot(i->path, &unpacked); ++ _deleted.p = i->path->pos; ++ ++ ret = bch2_trans_mark_key(trans, old, deleted, ++ BTREE_TRIGGER_OVERWRITE|i->flags); ++ ++ if (ret == -EINTR) ++ trace_trans_restart_mark(trans->ip, _RET_IP_, ++ i->btree_id, &i->path->pos); ++ if (ret) ++ return ret; ++ } ++ } while (trans_trigger_run); ++ } ++ ++ trans_for_each_update(trans, i) ++ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && ++ (!i->insert_trigger_run || !i->overwrite_trigger_run)); ++ ++ return 0; ++} ++ + int __bch2_trans_commit(struct btree_trans *trans) + { + struct btree_insert_entry *i = NULL; +- bool trans_trigger_run; + unsigned u64s; + int ret = 0; + +@@ -853,30 +955,9 @@ int __bch2_trans_commit(struct btree_trans *trans) + i->btree_id, i->k->k.p); + #endif + +- /* +- * Running triggers will append more updates to the list of updates as +- * we're walking it: +- */ +- do { +- trans_trigger_run = false; +- +- trans_for_each_update(trans, i) { +- if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && +- !i->trans_triggers_run) { +- i->trans_triggers_run = true; +- trans_trigger_run = true; +- +- ret = bch2_trans_mark_update(trans, i->path, +- i->k, i->flags); +- if (unlikely(ret)) { +- if (ret == -EINTR) +- trace_trans_restart_mark(trans->ip, _RET_IP_, +- i->btree_id, &i->path->pos); +- goto out; +- } +- } +- } +- } while (trans_trigger_run); ++ ret = bch2_trans_commit_run_triggers(trans); ++ if (ret) ++ goto out; + + trans_for_each_update(trans, i) { + BUG_ON(!i->path->should_be_locked); +@@ -1285,7 +1366,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + + if 
(i < trans->updates + trans->nr_updates && + !btree_insert_entry_cmp(&n, i)) { +- BUG_ON(i->trans_triggers_run); ++ BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + + /* + * This is a hack to ensure that inode creates update the btree, +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 92ea698580d9..6fc93b56bcb2 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1877,41 +1877,6 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, + } + } + +-int bch2_trans_mark_update(struct btree_trans *trans, +- struct btree_path *path, +- struct bkey_i *new, +- unsigned flags) +-{ +- struct bkey _deleted = KEY(0, 0, 0); +- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; +- struct bkey_s_c old; +- struct bkey unpacked; +- int ret; +- +- _deleted.p = path->pos; +- +- if (unlikely(flags & BTREE_TRIGGER_NORUN)) +- return 0; +- +- if (!btree_node_type_needs_gc(path->btree_id)) +- return 0; +- +- old = bch2_btree_path_peek_slot(path, &unpacked); +- +- if (old.k->type == new->k.type && +- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { +- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), +- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); +- } else { +- ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new), +- BTREE_TRIGGER_INSERT|flags) ?: +- bch2_trans_mark_key(trans, old, deleted, +- BTREE_TRIGGER_OVERWRITE|flags); +- } +- +- return ret; +-} +- + static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 8a9b2b565d48..5ed9441cb115 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -233,8 +233,6 @@ int bch2_mark_update(struct btree_trans *, struct btree_path *, + + int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); +-int bch2_trans_mark_update(struct btree_trans *, struct btree_path *, +- struct bkey_i *, unsigned); + void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + + int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, +-- +cgit v1.2.3 + + +From 20e38ffa221f995d8d4db06f0620e5e47fd8078f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 15 Nov 2021 17:30:11 -0500 +Subject: bcachefs: Fix error reporting from bch2_journal_flush_seq + +- bch2_journal_halt() was unconditionally overwriting j->err_seq, the + sequence number that we failed to write +- journal_write_done was updating seq_ondisk and flushed_seq_ondisk even + for writes that errored, which broke the way bch2_journal_flush_seq_async() + locklessly checked for completions. 
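The fix described above reduces to two invariants: j->err_seq should record only the first sequence number that failed, and the on-disk watermarks (seq_ondisk, flushed_seq_ondisk) should only advance for writes that actually completed. The following standalone C sketch models that behaviour in userspace; it is illustrative only, not part of the patch, and the struct and helpers are simplified stand-ins for the journal code.

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Toy model of the journal completion bookkeeping: keep the earliest error,
 * and never advance the on-disk watermarks for a write that errored.
 */
struct journal_model {
	uint64_t seq_ondisk;         /* highest seq known to be on disk */
	uint64_t flushed_seq_ondisk; /* highest durable (flushed) seq */
	uint64_t err_seq;            /* first seq that failed, 0 = none */
};

static void write_done(struct journal_model *j, uint64_t seq, bool flush, int err)
{
	if (!err) {
		j->seq_ondisk = seq;
		if (flush)
			j->flushed_seq_ondisk = seq;
	} else if (!j->err_seq || seq < j->err_seq) {
		j->err_seq = seq; /* keep the earliest failure */
	}
}

/* A flush waiter can then check completion vs. error without locks: */
static int flush_seq_result(const struct journal_model *j, uint64_t seq)
{
	if (j->err_seq && seq >= j->err_seq)
		return -EIO;
	return seq <= j->flushed_seq_ondisk ? 1 : 0; /* 1 = done, 0 = keep waiting */
}

int main(void)
{
	struct journal_model j = {0};

	write_done(&j, 1, true, 0);
	write_done(&j, 2, true, -EIO); /* this write failed */
	write_done(&j, 3, true, 0);

	printf("seq 1 -> %d, seq 2 -> %d\n",
	       flush_seq_result(&j, 1), flush_seq_result(&j, 2));
	return 0;
}

In this model seq 1 still reports completion while seq 2 reports the failure; advancing flushed_seq_ondisk for an errored write, as the old code did, is what let a lockless waiter mistake a failed flush for a completed one.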
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 7 ++++++- + fs/bcachefs/journal_io.c | 15 ++++++++------- + fs/bcachefs/recovery.c | 2 +- + 3 files changed, 15 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index f5d5cbb22da9..14bea8a2535e 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -107,7 +107,12 @@ void bch2_journal_halt(struct journal *j) + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + +- j->err_seq = journal_cur_seq(j); ++ /* ++ * XXX: we're not using j->lock here because this can be called from ++ * interrupt context, this can race with journal_write_done() ++ */ ++ if (!j->err_seq) ++ j->err_seq = journal_cur_seq(j); + journal_wake(j); + closure_wake_up(&journal_cur_buf(j)->wait); + } +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 53aad1d0c9a9..5c8304e05abd 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1259,14 +1259,15 @@ static void journal_write_done(struct closure *cl) + if (seq >= j->pin.front) + journal_seq_pin(j, seq)->devs = w->devs_written; + +- j->seq_ondisk = seq; +- if (err && (!j->err_seq || seq < j->err_seq)) +- j->err_seq = seq; ++ if (!err) { ++ j->seq_ondisk = seq; + +- if (!JSET_NO_FLUSH(w->data)) { +- j->flushed_seq_ondisk = seq; +- j->last_seq_ondisk = w->last_seq; +- } ++ if (!JSET_NO_FLUSH(w->data)) { ++ j->flushed_seq_ondisk = seq; ++ j->last_seq_ondisk = w->last_seq; ++ } ++ } else if (!j->err_seq || seq < j->err_seq) ++ j->err_seq = seq; + + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index be8912605527..c3b4d116275c 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1480,7 +1480,7 @@ int bch2_fs_initialize(struct bch_fs *c) + } + + err = "error writing first journal entry"; +- ret = bch2_journal_meta(&c->journal); ++ ret = bch2_journal_flush(&c->journal); + if (ret) + goto err; + +-- +cgit v1.2.3 + + +From 1a57b2da3530d002b59e77c92a2bbc43e25c457e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 20 Nov 2021 22:59:25 -0500 +Subject: bcachefs: Add a bit of missing repair code + +This adds repair code to drop very stale pointers. 
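The interesting part of the check added below is the generation comparison: bucket and pointer generations are small wrapping counters, so "too stale" has to be decided with wraparound-safe arithmetic. Here is a self-contained sketch of that idea; the threshold constant and the exact form of gen_cmp() are assumptions for illustration (the kernel idiom is a signed 8-bit cast of the difference), not copied from the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GC_GEN_MAX 96 /* stand-in for BUCKET_GC_GEN_MAX; value assumed */

/* Wraparound-safe ordering of two 8-bit generation counters: */
static int gen_cmp(uint8_t a, uint8_t b)
{
	uint8_t d = (uint8_t)(a - b); /* difference modulo 256 */

	return d < 128 ? d : (int)d - 256; /* signed 8-bit distance */
}

/* A pointer is "too stale" when the bucket's generation has run more than
 * GC_GEN_MAX steps ahead of the generation stored in the pointer: */
static bool ptr_too_stale(uint8_t bucket_gen, uint8_t ptr_gen)
{
	return gen_cmp(bucket_gen, ptr_gen) > GC_GEN_MAX;
}

int main(void)
{
	printf("%d\n", ptr_too_stale(44, 200)); /* bucket ran far ahead, across wraparound: 1 */
	printf("%d\n", ptr_too_stale(12, 10));  /* only slightly ahead: 0 */
	return 0;
}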
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index b4340df677b7..3ec43ca563be 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -498,6 +498,10 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + char buf[200]; + int ret = 0; + ++ /* ++ * XXX ++ * use check_bucket_ref here ++ */ + bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); +@@ -553,6 +557,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + } + } + ++ if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ do_update = true; ++ + if (fsck_err_on(!p.ptr.cached && + gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" +@@ -644,6 +657,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!ptr->cached && + gen_cmp(ptr->gen, g->mark.gen) < 0) || ++ gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX || + (g->mark.data_type && + g->mark.data_type != data_type); + })); +-- +cgit v1.2.3 + + +From 6e8b8444ba7d16647c6de0dd35a9803209b61859 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Nov 2021 16:15:48 -0500 +Subject: bcachefs: Fix BCH_FS_ERROR flag handling + +We were setting BCH_FS_ERROR on startup if the superblock was marked as +containing errors, which is not what we wanted - BCH_FS_ERROR indicates +whether errors have been found, so that after a successful fsck we're +able to clear the error bit in the superblock. 
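Put differently, the superblock bit is the persistent record that errors were seen at some point, while BCH_FS_ERROR should only reflect errors found during the current run, which is what lets a clean fsck clear the superblock bit afterwards. A toy model of that lifecycle, with hypothetical helper names:

#include <stdbool.h>
#include <stdio.h>

struct fs_model {
	bool sb_has_errors; /* persistent superblock bit */
	bool fs_error;      /* in-memory: errors found during this run */
};

static void fs_start(struct fs_model *fs)
{
	fs->fs_error = false; /* do not inherit the superblock bit */
}

static void fs_found_error(struct fs_model *fs)
{
	fs->fs_error = true;
	fs->sb_has_errors = true;
}

static void fsck_done(struct fs_model *fs)
{
	if (!fs->fs_error) /* clean pass: safe to clear the persistent bit */
		fs->sb_has_errors = false;
}

int main(void)
{
	struct fs_model clean = { .sb_has_errors = true };
	struct fs_model dirty = { .sb_has_errors = true };

	fs_start(&clean);
	fsck_done(&clean);

	fs_start(&dirty);
	fs_found_error(&dirty);
	fsck_done(&dirty);

	printf("clean fsck: sb bit %d, errored fsck: sb bit %d\n",
	       clean.sb_has_errors, dirty.sb_has_errors);
	return 0;
}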
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/super-io.c | 10 ---------- + 2 files changed, 1 insertion(+), 11 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 3ec43ca563be..091bddee575d 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1632,7 +1632,7 @@ again: + + bch2_mark_superblocks(c); + +- if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) && ++ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && + c->opts.fix_errors != FSCK_OPT_NO) { + bch_info(c, "starting topology repair pass"); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 802976333ec8..88a8e54fbd7a 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -439,16 +439,6 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) + + __copy_super(&c->disk_sb, src); + +- if (BCH_SB_HAS_ERRORS(c->disk_sb.sb)) +- set_bit(BCH_FS_ERROR, &c->flags); +- else +- clear_bit(BCH_FS_ERROR, &c->flags); +- +- if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) +- set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); +- else +- clear_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); +- + if (BCH_SB_INITIALIZED(c->disk_sb.sb)) + set_bit(BCH_FS_INITIALIZED, &c->flags); + +-- +cgit v1.2.3 + + +From ca8f5c848bb77fd6eb40cb2c4822ef08b2fbfa59 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 21 Nov 2021 22:34:26 -0500 +Subject: bcachefs: Fix an i_sectors accounting bug + +We weren't checking for errors before calling i_sectors_acct() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index f6970a3318f7..f38414aee70b 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2819,6 +2819,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + &reservation.k_i, + &disk_res, NULL, + 0, &i_sectors_delta, true); ++ if (ret) ++ goto bkey_err; + i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bkey_err: + bch2_quota_reservation_put(c, inode, "a_res); +-- +cgit v1.2.3 + + +From 2560ce92b090a47f36eb395e5bf0128afda7df9a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 22 Nov 2021 12:47:20 -0500 +Subject: bcachefs: Fix i_sectors_leak in bch2_truncate_page + +When bch2_truncate_page() discards dirty sectors in the page cache, we +need to account for that - we don't need to account for allocated +sectors because that'll be done by the bch2_fpunch() call when it +updates the btree. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index f38414aee70b..dca3b0270779 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2254,6 +2254,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, + unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + unsigned i; + struct page *page; ++ s64 i_sectors_delta = 0; + int ret = 0; + + /* Page boundary? 
Nothing to do */ +@@ -2305,9 +2306,13 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, + i < round_down(end_offset, block_bytes(c)) >> 9; + i++) { + s->s[i].nr_replicas = 0; ++ if (s->s[i].state == SECTOR_DIRTY) ++ i_sectors_delta--; + s->s[i].state = SECTOR_UNALLOCATED; + } + ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ + /* + * Caller needs to know whether this page will be written out by + * writeback - doing an i_size update if necessary - or whether it will +-- +cgit v1.2.3 + + +From 6f7601aae2025f4e0c9c3fb3c746a30d9ff3dcf2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 23 Nov 2021 17:05:56 -0500 +Subject: bcachefs: SECTOR_DIRTY_RESERVED + +This fixes another i_sectors accounting bug - we need to differentiate +between dirty writes that overwrite a reservation and dirty writes to +unallocated space - dirty writes to unallocated space increase +i_sectors, dirty writes over a reservation do not. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 97 +++++++++++++++++++++++++++++++++-------------------- + 1 file changed, 60 insertions(+), 37 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index dca3b0270779..f6eb90598bd5 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -232,6 +232,9 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + return; + + mutex_lock(&inode->ei_quota_lock); ++ BUG_ON((s64) inode->v.i_blocks + sectors < 0); ++ inode->v.i_blocks += sectors; ++ + #ifdef CONFIG_BCACHEFS_QUOTA + if (quota_res && sectors > 0) { + BUG_ON(sectors > quota_res->sectors); +@@ -243,7 +246,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); + } + #endif +- inode->v.i_blocks += sectors; + mutex_unlock(&inode->ei_quota_lock); + } + +@@ -252,19 +254,20 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + /* stored in page->private: */ + + struct bch_page_sector { +- /* Uncompressed, fully allocated replicas: */ +- unsigned nr_replicas:3; ++ /* Uncompressed, fully allocated replicas (or on disk reservation): */ ++ unsigned nr_replicas:4; + +- /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ +- unsigned replicas_reserved:3; ++ /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ ++ unsigned replicas_reserved:4; + + /* i_sectors: */ + enum { + SECTOR_UNALLOCATED, + SECTOR_RESERVED, + SECTOR_DIRTY, ++ SECTOR_DIRTY_RESERVED, + SECTOR_ALLOCATED, +- } state:2; ++ } state:8; + }; + + struct bch_page_state { +@@ -320,6 +323,36 @@ static struct bch_page_state *bch2_page_state_create(struct page *page, + return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); + } + ++static unsigned bkey_to_sector_state(const struct bkey *k) ++{ ++ if (k->type == KEY_TYPE_reservation) ++ return SECTOR_RESERVED; ++ if (bkey_extent_is_allocation(k)) ++ return SECTOR_ALLOCATED; ++ return SECTOR_UNALLOCATED; ++} ++ ++static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) ++{ ++ struct bvec_iter iter; ++ struct bio_vec bv; ++ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ++ ? 
0 : bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = bkey_to_sector_state(k.k); ++ ++ bio_for_each_segment(bv, bio, iter) { ++ struct bch_page_state *s = bch2_page_state(bv.bv_page); ++ unsigned i; ++ ++ for (i = bv.bv_offset >> 9; ++ i < (bv.bv_offset + bv.bv_len) >> 9; ++ i++) { ++ s->s[i].nr_replicas = nr_ptrs; ++ s->s[i].state = state; ++ } ++ } ++} ++ + static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) + { + /* XXX: this should not be open coded */ +@@ -458,16 +491,23 @@ static void bch2_clear_page_bits(struct page *page) + disk_res.sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + +- if (s->s[i].state == SECTOR_DIRTY) { +- dirty_sectors++; ++ switch (s->s[i].state) { ++ case SECTOR_DIRTY: + s->s[i].state = SECTOR_UNALLOCATED; ++ --dirty_sectors; ++ break; ++ case SECTOR_DIRTY_RESERVED: ++ s->s[i].state = SECTOR_RESERVED; ++ break; ++ default: ++ break; + } + } + + bch2_disk_reservation_put(c, &disk_res); + + if (dirty_sectors) +- i_sectors_acct(c, inode, NULL, -dirty_sectors); ++ i_sectors_acct(c, inode, NULL, dirty_sectors); + + bch2_page_state_release(page); + } +@@ -500,10 +540,17 @@ static void bch2_set_page_dirty(struct bch_fs *c, + s->s[i].replicas_reserved += sectors; + res->disk.sectors -= sectors; + +- if (s->s[i].state == SECTOR_UNALLOCATED) ++ switch (s->s[i].state) { ++ case SECTOR_UNALLOCATED: ++ s->s[i].state = SECTOR_DIRTY; + dirty_sectors++; +- +- s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); ++ break; ++ case SECTOR_RESERVED: ++ s->s[i].state = SECTOR_DIRTY_RESERVED; ++ break; ++ default: ++ break; ++ } + } + + spin_unlock(&s->lock); +@@ -712,29 +759,6 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter) + return iter->pages[iter->idx]; + } + +-static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) +-{ +- struct bvec_iter iter; +- struct bio_vec bv; +- unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v +- ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); +- unsigned state = k.k->type == KEY_TYPE_reservation +- ? SECTOR_RESERVED +- : SECTOR_ALLOCATED; +- +- bio_for_each_segment(bv, bio, iter) { +- struct bch_page_state *s = bch2_page_state(bv.bv_page); +- unsigned i; +- +- for (i = bv.bv_offset >> 9; +- i < (bv.bv_offset + bv.bv_len) >> 9; +- i++) { +- s->s[i].nr_replicas = nr_ptrs; +- s->s[i].state = state; +- } +- } +-} +- + static bool extent_partial_reads_expensive(struct bkey_s_c k) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +@@ -873,8 +897,7 @@ retry: + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + +- if (bkey_extent_is_allocation(k.k)) +- bch2_add_page_sectors(&rbio->bio, k); ++ bch2_bio_page_state_set(&rbio->bio, k); + + bch2_read_extent(trans, rbio, iter.pos, + data_btree, k, offset_into_extent, flags); +-- +cgit v1.2.3 + + +From 9642ae21f9211b255cfe71e05d815887bbad6efd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 27 Oct 2021 13:05:56 -0400 +Subject: bcachefs: Fix quota support for snapshots + +Quota support was disabled when snapshots were released, because of some +tricky interactions with snpashots. We're sidestepping that for now - +we're simply disabling quota accounting on snapshot subvolumes. 
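On the accounting side this is implemented by resolving each inode key's snapshot ID to its subvolume and skipping inodes that live in a snapshot subvolume, which is what bch2_snapshot_get_subvol() and the BCH_SUBVOLUME_SNAP() check in the hunks below do. The userspace sketch here only models the walk; the lookup is a trivial stub rather than the real btree lookup, and the types are made up for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct inode_rec {
	uint32_t snapshot; /* snapshot ID the inode key lives in */
	uint64_t sectors;
	uint32_t uid;
};

/* Stub for the snapshot -> subvolume lookup; in the patch this is
 * snapshot_lookup() followed by bch2_subvolume_get(): */
static bool subvol_is_snapshot(uint32_t snapshot)
{
	return snapshot >= 100; /* arbitrary rule, just for the sketch */
}

static void quota_acct(uint32_t uid, long long sectors, long long inodes)
{
	printf("uid %u: %+lld sectors, %+lld inodes\n", uid, sectors, inodes);
}

static void quota_read_one(const struct inode_rec *inode)
{
	/* Inodes in snapshot subvolumes are not quota-accounted: */
	if (subvol_is_snapshot(inode->snapshot))
		return;

	quota_acct(inode->uid, (long long)inode->sectors, 1);
}

int main(void)
{
	const struct inode_rec inodes[] = {
		{ .snapshot = 1,   .sectors = 8, .uid = 1000 },
		{ .snapshot = 101, .sectors = 8, .uid = 1000 }, /* skipped */
	};

	for (unsigned i = 0; i < 2; i++)
		quota_read_one(&inodes[i]);
	return 0;
}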
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 28 ++++++++++++++------ + fs/bcachefs/fs.h | 6 +++++ + fs/bcachefs/opts.h | 12 ++++----- + fs/bcachefs/quota.c | 69 +++++++++++++++++++++++++++++++++++++------------ + fs/bcachefs/subvolume.c | 9 +++++++ + fs/bcachefs/subvolume.h | 2 ++ + 6 files changed, 96 insertions(+), 30 deletions(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 7f7405914c3b..26b4ae4b4651 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -39,7 +39,8 @@ static struct kmem_cache *bch2_inode_cache; + + static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, + struct bch_inode_info *, +- struct bch_inode_unpacked *); ++ struct bch_inode_unpacked *, ++ struct bch_subvolume *); + + static void __pagecache_lock_put(struct pagecache_lock *lock, long i) + { +@@ -225,6 +226,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + struct bch_inode_unpacked inode_u; + struct bch_inode_info *inode; + struct btree_trans trans; ++ struct bch_subvolume subvol; + int ret; + + inode = to_bch_ei(iget5_locked(c->vfs_sb, +@@ -239,10 +241,11 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + + bch2_trans_init(&trans, c, 8, 0); + ret = lockrestart_do(&trans, ++ bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); + + if (!ret) +- bch2_vfs_inode_init(&trans, inum, inode, &inode_u); ++ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); + bch2_trans_exit(&trans); + + if (ret) { +@@ -268,6 +271,7 @@ __bch2_create(struct user_namespace *mnt_userns, + struct bch_inode_unpacked inode_u; + struct posix_acl *default_acl = NULL, *acl = NULL; + subvol_inum inum; ++ struct bch_subvolume subvol; + u64 journal_seq = 0; + int ret; + +@@ -310,7 +314,12 @@ retry: + if (unlikely(ret)) + goto err_before_quota; + +- ret = bch2_trans_commit(&trans, NULL, &journal_seq, 0); ++ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; ++ inum.inum = inode_u.bi_inum; ++ ++ ret = bch2_subvolume_get(&trans, inum.subvol, true, ++ BTREE_ITER_WITH_UPDATES, &subvol) ?: ++ bch2_trans_commit(&trans, NULL, &journal_seq, 0); + if (unlikely(ret)) { + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +@@ -326,11 +335,8 @@ err_before_quota: + mutex_unlock(&dir->ei_update_lock); + } + +- inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; +- inum.inum = inode_u.bi_inum; +- + bch2_iget5_set(&inode->v, &inum); +- bch2_vfs_inode_init(&trans, inum, inode, &inode_u); ++ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); + + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); + set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); +@@ -1351,10 +1357,16 @@ static const struct export_operations bch_export_ops = { + + static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_info *inode, +- struct bch_inode_unpacked *bi) ++ struct bch_inode_unpacked *bi, ++ struct bch_subvolume *subvol) + { + bch2_inode_update_after_write(trans, inode, bi, ~0); + ++ if (BCH_SUBVOLUME_SNAP(subvol)) ++ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); ++ else ++ clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); ++ + inode->v.i_blocks = bi->bi_sectors; + inode->v.i_ino = bi->bi_inum; + inode->v.i_rdev = bi->bi_dev; +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index 27aacd7e2864..b2211ec7f302 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -64,6 +64,12 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) + */ + 
#define EI_INODE_ERROR 0 + ++/* ++ * Set in the inode is in a snapshot subvolume - we don't do quota accounting in ++ * those: ++ */ ++#define EI_INODE_SNAPSHOT 1 ++ + #define to_bch_ei(_inode) \ + container_of_or_null(_inode, struct bch_inode_info, v) + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 5d9c00af5973..afb1bb2a62d2 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -223,19 +223,19 @@ enum opt_type { + BCH_SB_POSIX_ACL, true, \ + NULL, "Enable POSIX acls") \ + x(usrquota, u8, \ +- 0, \ ++ OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH_SB_USRQUOTA, false, \ + NULL, "Enable user quotas") \ + x(grpquota, u8, \ +- 0, \ ++ OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH_SB_GRPQUOTA, false, \ + NULL, "Enable group quotas") \ + x(prjquota, u8, \ +- 0, \ ++ OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH_SB_PRJQUOTA, false, \ + NULL, "Enable project quotas") \ + x(degraded, u8, \ + OPT_MOUNT, \ +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 5f1216da76d0..8f8f4b0accd6 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -3,6 +3,7 @@ + #include "btree_update.h" + #include "inode.h" + #include "quota.h" ++#include "subvolume.h" + #include "super-io.h" + + static const char *bch2_sb_validate_quota(struct bch_sb *sb, +@@ -415,14 +416,55 @@ static void bch2_sb_quota_read(struct bch_fs *c) + } + } + ++static int bch2_fs_quota_read_inode(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ struct bch_subvolume subvolume; ++ struct bkey_s_c k; ++ int ret; ++ ++ k = bch2_btree_iter_peek(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ if (!k.k) ++ return 1; ++ ++ ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); ++ if (ret) ++ return ret; ++ ++ /* ++ * We don't do quota accounting in snapshots: ++ */ ++ if (BCH_SUBVOLUME_SNAP(&subvolume)) ++ goto advance; ++ ++ if (!bkey_is_inode(k.k)) ++ goto advance; ++ ++ ret = bch2_inode_unpack(k, &u); ++ if (ret) ++ return ret; ++ ++ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, ++ KEY_TYPE_QUOTA_NOCHECK); ++ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, ++ KEY_TYPE_QUOTA_NOCHECK); ++advance: ++ bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1)); ++ return 0; ++} ++ + int bch2_fs_quota_read(struct bch_fs *c) + { + unsigned i, qtypes = enabled_qtypes(c); + struct bch_memquota_type *q; + struct btree_trans trans; + struct btree_iter iter; +- struct bch_inode_unpacked u; +- struct bkey_s_c k; + int ret; + + mutex_lock(&c->sb_lock); +@@ -437,23 +479,18 @@ int bch2_fs_quota_read(struct bch_fs *c) + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, +- BTREE_ITER_PREFETCH, k, ret) { +- if (bkey_is_inode(k.k)) { +- ret = bch2_inode_unpack(k, &u); +- if (ret) +- return ret; +- +- bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, +- KEY_TYPE_QUOTA_NOCHECK); +- bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, +- KEY_TYPE_QUOTA_NOCHECK); +- } +- } ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ do { ++ ret = lockrestart_do(&trans, ++ bch2_fs_quota_read_inode(&trans, &iter)); ++ } while (!ret); + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); +- return ret; ++ return ret < 0 ? 
ret : 0; + } + + /* Enable/disable/delete quotas for an entire filesystem: */ +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index 0ef625d21672..7e909a118189 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -789,6 +789,15 @@ int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, + return ret; + } + ++int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, ++ struct bch_subvolume *subvol) ++{ ++ struct bch_snapshot snap; ++ ++ return snapshot_lookup(trans, snapshot, &snap) ?: ++ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); ++} ++ + int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, + u32 *snapid) + { +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index dde755b45392..e4c3fdcdf22f 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -118,6 +118,8 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c) + + int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); ++int bch2_snapshot_get_subvol(struct btree_trans *, u32, ++ struct bch_subvolume *); + int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + + int bch2_subvolume_delete(struct btree_trans *, u32); +-- +cgit v1.2.3 + + +From f86e518ce77758b20b91d7dd06c02fd2d268d8ca Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 23 Nov 2021 19:00:23 -0500 +Subject: bcachefs: Apply workaround for too many btree iters to read path + +Reading from cached data, which calls bch2_bucket_io_time_reset(), is +leading to transaction iterator overflows - this standardizes the +workaround. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.h | 8 +++++++- + fs/bcachefs/dirent.c | 5 ++--- + fs/bcachefs/fs-io.c | 4 ++++ + fs/bcachefs/io.c | 4 ++++ + 4 files changed, 17 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 33a703c27f7a..31d2dda7ca05 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -302,13 +302,19 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, + : bch2_btree_iter_peek(iter); + } + ++static inline int btree_trans_too_many_iters(struct btree_trans *trans) ++{ ++ return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 ++ ? 
-EINTR : 0; ++} ++ + static inline struct bkey_s_c + __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) + { + struct bkey_s_c k; + +- while ((hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) || ++ while (btree_trans_too_many_iters(trans) || + (k = __bch2_btree_iter_peek(iter, flags), + bkey_err(k) == -EINTR)) + bch2_trans_begin(trans); +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 4dfcc955675b..fe4a85a6a8cb 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -531,10 +531,9 @@ retry: + * read_target looks up subvolumes, we can overflow paths if the + * directory has many subvolumes in it + */ +- if (hweight64(trans.paths_allocated) > BTREE_ITER_MAX / 2) { +- ret = -EINTR; ++ ret = btree_trans_too_many_iters(&trans); ++ if (ret) + break; +- } + } + bch2_trans_iter_exit(&trans, &iter); + err: +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index f6eb90598bd5..b46270f94c7f 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -907,6 +907,10 @@ retry: + + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); ++ ++ ret = btree_trans_too_many_iters(trans); ++ if (ret) ++ break; + } + err: + bch2_trans_iter_exit(trans, &iter); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 3a6b4446706d..5a3c9eff1b50 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -2323,6 +2323,10 @@ retry: + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); ++ ++ ret = btree_trans_too_many_iters(&trans); ++ if (ret) ++ break; + } + err: + bch2_trans_iter_exit(&trans, &iter); +-- +cgit v1.2.3 + + +From 0960062ae9a997a8c89fb8ba27f40658c7a331d4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 23 Nov 2021 20:00:34 -0500 +Subject: bcachefs: Kill PAGE_SECTOR_SHIFT + +Replace it with the new, standard PAGE_SECTORS_SHIFT + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 12 ++++++------ + fs/bcachefs/util.h | 2 -- + 2 files changed, 6 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index b46270f94c7f..2992913fced0 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -778,7 +778,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, + { + while (bio_sectors(bio) < sectors_this_extent && + bio->bi_vcnt < bio->bi_max_vecs) { +- pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; ++ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; + struct page *page = readpage_iter_next(iter); + int ret; + +@@ -958,7 +958,7 @@ void bch2_readahead(struct readahead_control *ractl) + readpages_iter.idx++; + + bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); +- rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; ++ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; + rbio->bio.bi_end_io = bch2_readpages_end_io; + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + +@@ -981,7 +981,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, + + bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); + rbio->bio.bi_iter.bi_sector = +- (sector_t) page->index << PAGE_SECTOR_SHIFT; ++ (sector_t) page->index << PAGE_SECTORS_SHIFT; + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + + bch2_trans_init(&trans, c, 0, 0); +@@ -1268,7 +1268,7 @@ do_io: + } + BUG_ON(!sectors); + +- sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; ++ sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset; + + if 
(w->io && + (w->io->op.res.nr_replicas != nr_replicas_this_write || +@@ -2300,8 +2300,8 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, + * page + */ + ret = range_has_data(c, inode->ei_subvol, +- POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), +- POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); ++ POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT), ++ POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT)); + if (ret <= 0) + return ret; + +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index bec84d8aabed..80402b398442 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -18,8 +18,6 @@ + #include + #include + +-#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) +- + struct closure; + + #ifdef CONFIG_BCACHEFS_DEBUG +-- +cgit v1.2.3 + + +From 50c99bebb7076ee32496f39e1b5f31b060fa9d50 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 23 Nov 2021 18:17:04 -0500 +Subject: bcachefs: Fix page state when reading into !PageUptodate pages + +This patch adds code to read page state before writing to pages that +aren't uptodate, which corrects i_sectors being tempororarily too large +and means we may not need to get a disk reservation. + +Signed-off-by: Kent Overstreet + +# Conflicts: +# fs/bcachefs/fs-io.c +--- + fs/bcachefs/fs-io.c | 131 ++++++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 111 insertions(+), 20 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 2992913fced0..4e298c740b96 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -273,6 +273,7 @@ struct bch_page_sector { + struct bch_page_state { + spinlock_t lock; + atomic_t write_count; ++ bool uptodate; + struct bch_page_sector s[PAGE_SECTORS]; + }; + +@@ -332,6 +333,86 @@ static unsigned bkey_to_sector_state(const struct bkey *k) + return SECTOR_UNALLOCATED; + } + ++static void __bch2_page_state_set(struct page *page, ++ unsigned pg_offset, unsigned pg_len, ++ unsigned nr_ptrs, unsigned state) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL); ++ unsigned i; ++ ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ spin_lock(&s->lock); ++ ++ for (i = pg_offset; i < pg_offset + pg_len; i++) { ++ s->s[i].nr_replicas = nr_ptrs; ++ s->s[i].state = state; ++ } ++ ++ if (i == PAGE_SECTORS) ++ s->uptodate = true; ++ ++ spin_unlock(&s->lock); ++} ++ ++static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum, ++ struct page **pages, unsigned nr_pages) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT; ++ unsigned pg_idx = 0; ++ u32 snapshot; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, ++ SPOS(inum.inum, offset, snapshot), ++ BTREE_ITER_SLOTS, k, ret) { ++ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = bkey_to_sector_state(k.k); ++ ++ while (pg_idx < nr_pages) { ++ struct page *page = pages[pg_idx]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start; ++ unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start; ++ ++ BUG_ON(k.k->p.offset < pg_start); ++ BUG_ON(bkey_start_offset(k.k) > pg_end); ++ ++ if 
(!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) ++ __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state); ++ ++ if (k.k->p.offset < pg_end) ++ break; ++ pg_idx++; ++ } ++ ++ if (pg_idx == nr_pages) ++ break; ++ } ++ ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ + static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) + { + struct bvec_iter iter; +@@ -340,17 +421,9 @@ static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k.k); + +- bio_for_each_segment(bv, bio, iter) { +- struct bch_page_state *s = bch2_page_state(bv.bv_page); +- unsigned i; +- +- for (i = bv.bv_offset >> 9; +- i < (bv.bv_offset + bv.bv_len) >> 9; +- i++) { +- s->s[i].nr_replicas = nr_ptrs; +- s->s[i].state = state; +- } +- } ++ bio_for_each_segment(bv, bio, iter) ++ __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, ++ bv.bv_len >> 9, nr_ptrs, state); + } + + static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) +@@ -437,6 +510,8 @@ static int bch2_page_reservation_get(struct bch_fs *c, + if (!s) + return -ENOMEM; + ++ BUG_ON(!s->uptodate); ++ + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { +@@ -610,7 +685,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) + struct bch2_page_reservation res; + unsigned len; + loff_t isize; +- int ret = VM_FAULT_LOCKED; ++ int ret; + + bch2_page_reservation_init(c, inode, &res); + +@@ -636,6 +711,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) + + len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); + ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ++ if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) { ++ unlock_page(page); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ } ++ + if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; +@@ -646,6 +729,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) + bch2_page_reservation_put(c, inode, &res); + + wait_for_stable_page(page); ++ ret = VM_FAULT_LOCKED; + out: + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + sb_end_pagefault(inode->v.i_sb); +@@ -1385,6 +1469,12 @@ readpage: + if (ret) + goto err; + out: ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ++ ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); ++ if (ret) ++ goto out; ++ } ++ + ret = bch2_page_reservation_get(c, inode, page, res, + offset, len, true); + if (ret) { +@@ -1514,20 +1604,21 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, + } + + while (reserved < len) { +- struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; ++ unsigned i = (offset + reserved) >> PAGE_SHIFT; ++ struct page *page = pages[i]; + unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); + unsigned pg_len = min_t(unsigned, len - reserved, + PAGE_SIZE - pg_offset); +-retry_reservation: +- ret = bch2_page_reservation_get(c, inode, page, &res, +- pg_offset, pg_len, true); + +- if (ret && !PageUptodate(page)) { +- ret = bch2_read_single_page(page, mapping); +- if (!ret) +- goto retry_reservation; ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ++ ret = bch2_page_state_set(c, inode_inum(inode), ++ pages + i, nr_pages - i); ++ if (ret) ++ goto out; + } + ++ ret = 
bch2_page_reservation_get(c, inode, page, &res, ++ pg_offset, pg_len, true); + if (ret) + goto out; + +-- +cgit v1.2.3 + + +From 6263beb739cc1bd491b45f2b4465452cecef0b73 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 23 Nov 2021 18:21:09 -0500 +Subject: bcachefs: Fix page state after fallocate + +This tweaks the fallocate code to also update the page cache to reflect +the new on disk reservations, giving us better i_sectors consistency. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 155 ++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 113 insertions(+), 42 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 4e298c740b96..e29f160a6da0 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -426,6 +426,110 @@ static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) + bv.bv_len >> 9, nr_ptrs, state); + } + ++static void mark_pagecache_unallocated(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct pagevec pvec; ++ ++ if (end <= start) ++ return; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(start, pg_start) - pg_start; ++ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; ++ struct bch_page_state *s; ++ ++ BUG_ON(end <= pg_start); ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = pg_offset; j < pg_offset + pg_len; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++} ++ ++static void mark_pagecache_reserved(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct pagevec pvec; ++ s64 i_sectors_delta = 0; ++ ++ if (end <= start) ++ return; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(start, pg_start) - pg_start; ++ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; ++ struct bch_page_state *s; ++ ++ BUG_ON(end <= pg_start); ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = pg_offset; j < pg_offset + pg_len; j++) ++ switch (s->s[j].state) { ++ case SECTOR_UNALLOCATED: ++ s->s[j].state = SECTOR_RESERVED; ++ break; ++ case SECTOR_DIRTY: ++ s->s[j].state = SECTOR_DIRTY_RESERVED; ++ i_sectors_delta--; ++ break; ++ default: ++ break; ++ } ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++ ++ i_sectors_acct(c, inode, NULL, 
i_sectors_delta); ++} ++ + static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) + { + /* XXX: this should not be open coded */ +@@ -581,8 +685,7 @@ static void bch2_clear_page_bits(struct page *page) + + bch2_disk_reservation_put(c, &disk_res); + +- if (dirty_sectors) +- i_sectors_acct(c, inode, NULL, dirty_sectors); ++ i_sectors_acct(c, inode, NULL, dirty_sectors); + + bch2_page_state_release(page); + } +@@ -630,8 +733,7 @@ static void bch2_set_page_dirty(struct bch_fs *c, + + spin_unlock(&s->lock); + +- if (dirty_sectors) +- i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ i_sectors_acct(c, inode, &res->quota, dirty_sectors); + + if (!PageDirty(page)) + __set_page_dirty_nobuffers(page); +@@ -2612,6 +2714,8 @@ int bch2_truncate(struct user_namespace *mnt_userns, + U64_MAX, &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + ++ BUG_ON(!inode->v.i_size && inode->v.i_blocks); ++ + if (unlikely(ret)) + goto err; + +@@ -2952,6 +3056,9 @@ bkey_err: + ret = 0; + } + ++ bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ ++ mark_pagecache_reserved(inode, start_sector, iter.pos.offset); ++ + if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; +@@ -3057,43 +3164,6 @@ long bch2_fallocate_dispatch(struct file *file, int mode, + return ret; + } + +-static void mark_range_unallocated(struct bch_inode_info *inode, +- loff_t start, loff_t end) +-{ +- pgoff_t index = start >> PAGE_SHIFT; +- pgoff_t end_index = (end - 1) >> PAGE_SHIFT; +- struct pagevec pvec; +- +- pagevec_init(&pvec); +- +- do { +- unsigned nr_pages, i, j; +- +- nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, +- &index, end_index); +- if (nr_pages == 0) +- break; +- +- for (i = 0; i < nr_pages; i++) { +- struct page *page = pvec.pages[i]; +- struct bch_page_state *s; +- +- lock_page(page); +- s = bch2_page_state(page); +- +- if (s) { +- spin_lock(&s->lock); +- for (j = 0; j < PAGE_SECTORS; j++) +- s->s[j].nr_replicas = 0; +- spin_unlock(&s->lock); +- } +- +- unlock_page(page); +- } +- pagevec_release(&pvec); +- } while (index <= end_index); +-} +- + loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) +@@ -3139,7 +3209,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + if (ret) + goto err; + +- mark_range_unallocated(src, pos_src, pos_src + aligned_len); ++ mark_pagecache_unallocated(src, pos_src >> 9, ++ (pos_src + aligned_len) >> 9); + + ret = bch2_remap_range(c, + inode_inum(dst), pos_dst >> 9, +-- +cgit v1.2.3 + + +From b984ab7348b20da389261b0c3370c913b40c2bb1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 27 Nov 2021 16:13:41 -0500 +Subject: bcachefs: Improve tracing of btree_path leaks + +This patch plumbs the btree_path->ip_allocated field back to where the +btree_iter that owns it was first initialized - meaning it will be much +easier to figure out which btree_iter wasn't exited properly. 
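The underlying pattern is simple: capture the caller's address once, at the public entry point, and thread it down through the internal helpers instead of sampling it deep inside, where every path looks like it was allocated by the same function. Below is a minimal userspace illustration of that pattern using the GCC/Clang builtin behind the kernel's _RET_IP_; the handle type and helpers are invented for the example, not taken from the patch.

#include <stdio.h>

struct handle {
	unsigned long ip_allocated; /* call site that created this handle */
	int live;
};

/* Internal helper takes the ip as an argument instead of capturing it: */
static void handle_init(struct handle *h, unsigned long ip)
{
	h->ip_allocated = ip;
	h->live = 1;
}

/* Public entry point records its caller's address exactly once: */
static void __attribute__((noinline)) handle_get(struct handle *h)
{
	handle_init(h, (unsigned long)__builtin_return_address(0));
}

static void report_leak(const struct handle *h)
{
	if (h->live)
		printf("handle leaked, allocated at ip %#lx\n", h->ip_allocated);
}

int main(void)
{
	struct handle h;

	handle_get(&h); /* never released, so the report points at this call */
	report_leak(&h);
	return 0;
}

In the patch itself the address is captured with _RET_IP_ at the bch2_trans_iter_init() entry points and with _THIS_IP_ for paths created internally, but the plumbing is the same.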
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 60 +++++++++++++++++++++++++------------ + fs/bcachefs/btree_iter.h | 6 ++-- + fs/bcachefs/btree_types.h | 3 ++ + fs/bcachefs/btree_update_interior.c | 7 +++-- + 4 files changed, 52 insertions(+), 24 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 03357737e7c3..32435a24147f 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -25,6 +25,15 @@ static inline void btree_path_list_remove(struct btree_trans *, struct btree_pat + static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, + struct btree_path *); + ++static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ return iter->ip_allocated; ++#else ++ return 0; ++#endif ++} ++ + static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); + + /* +@@ -1609,14 +1618,15 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr + + inline struct btree_path * __must_check + bch2_btree_path_make_mut(struct btree_trans *trans, +- struct btree_path *path, bool intent) ++ struct btree_path *path, bool intent, ++ unsigned long ip) + { + if (path->ref > 1 || path->preserve) { + __btree_path_put(path, intent); + path = btree_path_clone(trans, path, intent); + path->preserve = false; + #ifdef CONFIG_BCACHEFS_DEBUG +- path->ip_allocated = _RET_IP_; ++ path->ip_allocated = ip; + #endif + btree_trans_verify_sorted(trans); + } +@@ -1627,7 +1637,7 @@ bch2_btree_path_make_mut(struct btree_trans *trans, + static struct btree_path * __must_check + btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, +- bool intent) ++ bool intent, unsigned long ip) + { + int cmp = bpos_cmp(new_pos, path->pos); + unsigned l = path->level; +@@ -1638,7 +1648,7 @@ btree_path_set_pos(struct btree_trans *trans, + if (!cmp) + return path; + +- path = bch2_btree_path_make_mut(trans, path, intent); ++ path = bch2_btree_path_make_mut(trans, path, intent, ip); + + path->pos = new_pos; + path->should_be_locked = false; +@@ -1814,7 +1824,7 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans, + struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, +- bool intent) ++ bool intent, unsigned long ip) + { + struct btree_path *path, *path_pos = NULL; + int i; +@@ -1837,7 +1847,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + path_pos->btree_id == btree_id && + path_pos->level == level) { + __btree_path_get(path_pos, intent); +- path = btree_path_set_pos(trans, path_pos, pos, intent); ++ path = btree_path_set_pos(trans, path_pos, pos, intent, ip); + path->preserve = true; + } else { + path = btree_path_alloc(trans, path_pos); +@@ -1857,7 +1867,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + for (i = 0; i < ARRAY_SIZE(path->l); i++) + path->l[i].b = BTREE_ITER_NO_NODE_INIT; + #ifdef CONFIG_BCACHEFS_DEBUG +- path->ip_allocated = _RET_IP_; ++ path->ip_allocated = ip; + #endif + btree_trans_verify_sorted(trans); + } +@@ -1935,7 +1945,8 @@ bch2_btree_iter_traverse(struct btree_iter *iter) + + iter->path = btree_path_set_pos(iter->trans, iter->path, + btree_iter_search_key(iter), +- iter->flags & BTREE_ITER_INTENT); ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = 
bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + if (ret) +@@ -1970,7 +1981,8 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + iter->k.p = iter->pos = b->key.k.p; + + iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, +- iter->flags & BTREE_ITER_INTENT); ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); + out: +@@ -2029,7 +2041,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + */ + path = iter->path = + btree_path_set_pos(trans, path, bpos_successor(iter->pos), +- iter->flags & BTREE_ITER_INTENT); ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + path->level = iter->min_depth; + +@@ -2051,7 +2064,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + iter->k.p = iter->pos = b->key.k.p; + + iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, +- iter->flags & BTREE_ITER_INTENT); ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); + out: +@@ -2110,7 +2124,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + + while (1) { + iter->path = btree_path_set_pos(trans, iter->path, search_key, +- iter->flags & BTREE_ITER_INTENT); ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { +@@ -2186,7 +2201,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + cmp = bpos_cmp(k.k->p, iter->path->pos); + if (cmp) { + iter->path = bch2_btree_path_make_mut(trans, iter->path, +- iter->flags & BTREE_ITER_INTENT); ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + iter->path->pos = k.k->p; + btree_path_check_sort(trans, iter->path, cmp); + } +@@ -2238,7 +2254,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + + while (1) { + iter->path = btree_path_set_pos(trans, iter->path, search_key, +- iter->flags & BTREE_ITER_INTENT); ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { +@@ -2368,7 +2385,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + + search_key = btree_iter_search_key(iter); + iter->path = btree_path_set_pos(trans, iter->path, search_key, +- iter->flags & BTREE_ITER_INTENT); ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) +@@ -2582,7 +2600,8 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, +- unsigned flags) ++ unsigned flags, ++ unsigned long ip) + { + EBUG_ON(trans->restarted); + +@@ -2608,6 +2627,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + iter->k.type = KEY_TYPE_deleted; + iter->k.p = pos; + iter->k.size = 0; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ iter->ip_allocated = ip; ++#endif + + iter->path = bch2_path_get(trans, + flags & BTREE_ITER_CACHED, +@@ -2615,7 +2637,7 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + iter->pos, + locks_want, + depth, +- flags & BTREE_ITER_INTENT); ++ flags & BTREE_ITER_INTENT, ip); + } + + void bch2_trans_iter_init(struct btree_trans *trans, +@@ -2624,7 +2646,7 @@ void bch2_trans_iter_init(struct btree_trans *trans, + 
unsigned flags) + { + __bch2_trans_iter_init(trans, iter, btree_id, pos, +- 0, 0, flags); ++ 0, 0, flags, _RET_IP_); + } + + void bch2_trans_node_iter_init(struct btree_trans *trans, +@@ -2639,7 +2661,7 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, + BTREE_ITER_NOT_EXTENTS| + __BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_ALL_SNAPSHOTS| +- flags); ++ flags, _RET_IP_); + BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(iter->path->level != depth); + BUG_ON(iter->min_depth != depth); +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 31d2dda7ca05..26eb90a7eab8 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -130,11 +130,13 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b, + (_path)->idx + 1)) + + struct btree_path * __must_check +-bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool); ++bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, ++ bool, unsigned long); + int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); + struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id, +- struct bpos, unsigned, unsigned, bool); ++ struct bpos, unsigned, unsigned, bool, ++ unsigned long); + inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); + + #ifdef CONFIG_BCACHEFS_DEBUG +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 0d0a719f738f..2c2e2f794b8f 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -291,6 +291,9 @@ struct btree_iter { + * bch2_btree_iter_next_slot() can correctly advance pos. + */ + struct bkey k; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned long ip_allocated; ++#endif + }; + + struct btree_key_cache { +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 61c7757bd3ca..dfff972551ee 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1590,8 +1590,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + ? bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); + +- sib_path = bch2_path_get(trans, false, path->btree_id, +- sib_pos, U8_MAX, level, true); ++ sib_path = bch2_path_get(trans, false, path->btree_id, sib_pos, ++ U8_MAX, level, true, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, sib_path, false); + if (ret) + goto err; +@@ -1888,7 +1888,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + bch2_trans_copy_iter(&iter2, iter); + + iter2.path = bch2_btree_path_make_mut(trans, iter2.path, +- iter2.flags & BTREE_ITER_INTENT); ++ iter2.flags & BTREE_ITER_INTENT, ++ _THIS_IP_); + + BUG_ON(iter2.path->level != b->c.level); + BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); +-- +cgit v1.2.3 + + +From f93933b1c37b5f881a80781adc78affe30623039 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 28 Nov 2021 13:42:05 -0500 +Subject: bcachefs: Convert bucket_alloc_ret to negative error codes + +Start a new header, errcode.h, for bcachefs-private error codes - more +error codes will be converted later. + +This patch just converts bucket_alloc_ret so that they can be mixed with +standard error codes and passed as ERR_PTR errors - the ec.c code was +doing this already, but incorrectly. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 35 ++++++++++++++++------------------- + fs/bcachefs/alloc_foreground.h | 10 +--------- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/ec.c | 9 ++++----- + fs/bcachefs/errcode.h | 12 ++++++++++++ + 5 files changed, 34 insertions(+), 33 deletions(-) + create mode 100644 fs/bcachefs/errcode.h + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 412fed479482..2bb107b8b0b9 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -348,8 +348,7 @@ static void add_new_bucket(struct bch_fs *c, + ob_push(c, ptrs, ob); + } + +-enum bucket_alloc_ret +-bch2_bucket_alloc_set(struct bch_fs *c, ++int bch2_bucket_alloc_set(struct bch_fs *c, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, +@@ -363,7 +362,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); + struct bch_dev *ca; +- enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; ++ int ret = -INSUFFICIENT_DEVICES; + unsigned i; + + BUG_ON(*nr_effective >= nr_replicas); +@@ -381,7 +380,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, + ob = bch2_bucket_alloc(c, ca, reserve, + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); + if (IS_ERR(ob)) { +- ret = -PTR_ERR(ob); ++ ret = PTR_ERR(ob); + + if (cl) + return ret; +@@ -394,7 +393,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, + bch2_dev_stripe_increment(ca, stripe); + + if (*nr_effective >= nr_replicas) +- return ALLOC_SUCCESS; ++ return 0; + } + + return ret; +@@ -408,8 +407,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, + * it's to a device we don't want: + */ + +-static enum bucket_alloc_ret +-bucket_alloc_from_stripe(struct bch_fs *c, ++static int bucket_alloc_from_stripe(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, +@@ -505,8 +503,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c, + wp->ptrs = ptrs_skip; + } + +-static enum bucket_alloc_ret +-open_bucket_add_buckets(struct bch_fs *c, ++static int open_bucket_add_buckets(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, +@@ -522,7 +519,7 @@ open_bucket_add_buckets(struct bch_fs *c, + struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; +- enum bucket_alloc_ret ret; ++ int ret; + unsigned i; + + rcu_read_lock(); +@@ -550,8 +547,8 @@ open_bucket_add_buckets(struct bch_fs *c, + target, erasure_code, + nr_replicas, nr_effective, + have_cache, flags, _cl); +- if (ret == FREELIST_EMPTY || +- ret == OPEN_BUCKETS_EMPTY) ++ if (ret == -FREELIST_EMPTY || ++ ret == -OPEN_BUCKETS_EMPTY) + return ret; + if (*nr_effective >= nr_replicas) + return 0; +@@ -575,7 +572,7 @@ retry_blocking: + ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + reserve, flags, cl); +- if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { ++ if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) { + cl = _cl; + goto retry_blocking; + } +@@ -772,7 +769,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, + unsigned nr_effective, write_points_nr; + unsigned ob_flags = 0; + bool have_cache; +- enum bucket_alloc_ret ret; ++ int ret; + int i; + + if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) +@@ -821,7 +818,7 @@ alloc_done: + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", 
ret); + +- if (ret == INSUFFICIENT_DEVICES && ++ if (ret == -INSUFFICIENT_DEVICES && + nr_effective >= nr_replicas_required) + ret = 0; + +@@ -854,15 +851,15 @@ err: + + mutex_unlock(&wp->lock); + +- if (ret == FREELIST_EMPTY && ++ if (ret == -FREELIST_EMPTY && + try_decrease_writepoints(c, write_points_nr)) + goto retry; + + switch (ret) { +- case OPEN_BUCKETS_EMPTY: +- case FREELIST_EMPTY: ++ case -OPEN_BUCKETS_EMPTY: ++ case -FREELIST_EMPTY: + return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); +- case INSUFFICIENT_DEVICES: ++ case -INSUFFICIENT_DEVICES: + return ERR_PTR(-EROFS); + default: + BUG(); +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index c658295cb8e0..2e81712ba8d1 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -12,13 +12,6 @@ struct bch_dev; + struct bch_fs; + struct bch_devs_List; + +-enum bucket_alloc_ret { +- ALLOC_SUCCESS, +- OPEN_BUCKETS_EMPTY, +- FREELIST_EMPTY, /* Allocator thread not keeping up */ +- INSUFFICIENT_DEVICES, +-}; +- + struct dev_alloc_list { + unsigned nr; + u8 devs[BCH_SB_MEMBERS_MAX]; +@@ -98,8 +91,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, + } + } + +-enum bucket_alloc_ret +-bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, ++int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, + struct dev_stripe_state *, struct bch_devs_mask *, + unsigned, unsigned *, bool *, enum alloc_reserve, + unsigned, struct closure *); +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index fdf3a777ae16..0439f3e0d8d7 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -200,6 +200,7 @@ + #include + + #include "bcachefs_format.h" ++#include "errcode.h" + #include "fifo.h" + #include "opts.h" + #include "util.h" +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index bca1b8a7b673..bae3e3b28aa3 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1272,16 +1272,15 @@ found: + return h; + } + +-static enum bucket_alloc_ret +-new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, +- struct closure *cl) ++static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, ++ struct closure *cl) + { + struct bch_devs_mask devs = h->devs; + struct open_bucket *ob; + struct open_buckets buckets; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; + bool have_cache = true; +- enum bucket_alloc_ret ret = ALLOC_SUCCESS; ++ int ret = 0; + + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (test_bit(i, h->s->blocks_gotten)) { +@@ -1516,7 +1515,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + + err: + bch2_ec_stripe_head_put(c, h); +- return ERR_PTR(-ret); ++ return ERR_PTR(ret); + } + + void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h +new file mode 100644 +index 000000000000..f7d12915c1cc +--- /dev/null ++++ b/fs/bcachefs/errcode.h +@@ -0,0 +1,12 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERRCODE_H ++#define _BCACHEFS_ERRCODE_H ++ ++enum { ++ /* Bucket allocator: */ ++ OPEN_BUCKETS_EMPTY = 2048, ++ FREELIST_EMPTY, /* Allocator thread not keeping up */ ++ INSUFFICIENT_DEVICES, ++}; ++ ++#endif /* _BCACHFES_ERRCODE_H */ +-- +cgit v1.2.3 + + +From a61a88da6760876b7baa0f12c555cd04431af337 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 29 Nov 2021 16:36:50 -0500 +Subject: bcachefs: Fix reflink path for snapshots + +make_extent_indirect() was missing the 
+BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE - it's updating the extent in the +original snapshot, not the curret one. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/reflink.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 8dcac7815c9f..c8d6d73681e0 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -184,7 +184,8 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + +- ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); ++ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + err: + c->reflink_hint = reflink_iter.pos.offset; + bch2_trans_iter_exit(trans, &reflink_iter); +-- +cgit v1.2.3 + + +From c4c124ef35dab585e1cb06413d3095ca2a1df8d4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 28 Nov 2021 14:08:58 -0500 +Subject: bcachefs: Kill bch2_replicas_delta_list_marked() + +This changes bch2_trans_fs_usage_apply() to handle failure (replicas +entry missing) by reverting the changes it made - meaning we can make +the main transaction commit path a bit slimmer, and perhaps also +simplify some locking in upcoming patches. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 30 ++++++++++++------------------ + fs/bcachefs/buckets.c | 17 +++++++++++++---- + fs/bcachefs/buckets.h | 2 +- + fs/bcachefs/replicas.c | 14 -------------- + fs/bcachefs/replicas.h | 1 - + 5 files changed, 26 insertions(+), 38 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 112ac7caf579..22398f6a0cc1 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -437,17 +437,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + marking = true; + } + +- if (marking) { +- percpu_down_read(&c->mark_lock); +- } +- +- /* Must be called under mark_lock: */ +- if (marking && trans->fs_usage_deltas && +- !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) { +- ret = BTREE_INSERT_NEED_MARK_REPLICAS; +- goto err; +- } +- + /* + * Don't get journal reservation until after we know insert will + * succeed: +@@ -456,7 +445,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + ret = bch2_trans_journal_res_get(trans, + JOURNAL_RES_GET_NONBLOCK); + if (ret) +- goto err; ++ return ret; + } else { + trans->journal_res.seq = c->journal.replay_journal_seq; + } +@@ -484,22 +473,27 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + i->k->k.version = MAX_VERSION; + } + ++ if (marking) ++ percpu_down_read(&c->mark_lock); ++ ++ if (marking && trans->fs_usage_deltas && ++ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) { ++ percpu_up_read(&c->mark_lock); ++ return BTREE_INSERT_NEED_MARK_REPLICAS; ++ } ++ + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) + bch2_mark_update(trans, i->path, i->k, i->flags); + +- if (marking && trans->fs_usage_deltas) +- bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); +- + if (unlikely(c->gc_pos.phase)) + bch2_trans_mark_gc(trans); + + trans_for_each_update(trans, i) + do_btree_insert_one(trans, i); +-err: +- if (marking) { ++ ++ if (marking) + percpu_up_read(&c->mark_lock); +- } + + return ret; + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 6fc93b56bcb2..4d661593e5bc 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1359,14 +1359,14 @@ void 
fs_usage_apply_warn(struct btree_trans *trans, + __WARN(); + } + +-void bch2_trans_fs_usage_apply(struct btree_trans *trans, +- struct replicas_delta_list *deltas) ++int bch2_trans_fs_usage_apply(struct btree_trans *trans, ++ struct replicas_delta_list *deltas) + { + struct bch_fs *c = trans->c; + static int warned_disk_usage = 0; + bool warn = false; + unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; +- struct replicas_delta *d = deltas->d; ++ struct replicas_delta *d = deltas->d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + s64 added = 0, should_not_have_added; +@@ -1385,7 +1385,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + added += d->delta; + } + +- BUG_ON(__update_replicas(c, dst, &d->r, d->delta)); ++ if (__update_replicas(c, dst, &d->r, d->delta)) ++ goto need_mark; + } + + dst->nr_inodes += deltas->nr_inodes; +@@ -1423,6 +1424,14 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); ++ return 0; ++need_mark: ++ /* revert changes: */ ++ for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) ++ BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); ++ ++ preempt_enable(); ++ return -1; + } + + /* trans_mark: */ +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 5ed9441cb115..5ef66e729212 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -233,7 +233,7 @@ int bch2_mark_update(struct btree_trans *, struct btree_path *, + + int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); +-void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); ++int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + + int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, + size_t, enum bch_data_type, unsigned); +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 002006593044..e72b5afccbe7 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -468,20 +468,6 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, + + /* replicas delta list: */ + +-bool bch2_replicas_delta_list_marked(struct bch_fs *c, +- struct replicas_delta_list *r) +-{ +- struct replicas_delta *d = r->d; +- struct replicas_delta *top = (void *) r->d + r->used; +- +- percpu_rwsem_assert_held(&c->mark_lock); +- +- for (d = r->d; d != top; d = replicas_delta_next(d)) +- if (bch2_replicas_entry_idx(c, &d->r) < 0) +- return false; +- return true; +-} +- + int bch2_replicas_delta_list_mark(struct bch_fs *c, + struct replicas_delta_list *r) + { +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +index 72ac544f16d8..66ca88deb0c0 100644 +--- a/fs/bcachefs/replicas.h ++++ b/fs/bcachefs/replicas.h +@@ -48,7 +48,6 @@ replicas_delta_next(struct replicas_delta *d) + return (void *) d + replicas_entry_bytes(&d->r) + 8; + } + +-bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *); + int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); + + void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); +-- +cgit v1.2.3 + + +From e9fcb7d5b1871e52c9e62ab2ec2be9e84f4b3803 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 28 Nov 2021 14:31:19 -0500 +Subject: bcachefs: Push c->mark_lock usage down to where it is needed + +This changes 
the bch2_mark_key() and related paths to take mark lock +where it is needed, instead of taking it in the upper transaction commit +path - by pushing down locking we'll be able to handle fsck errors +locally instead of requiring a separate check in the btree_gc code for +replicas being marked. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 6 +- + fs/bcachefs/btree_update_leaf.c | 12 +--- + fs/bcachefs/buckets.c | 124 +++++++++++++++++++++++----------------- + fs/bcachefs/buckets.h | 2 +- + fs/bcachefs/ec.c | 6 +- + 5 files changed, 84 insertions(+), 66 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 091bddee575d..06fb1f3f772b 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -710,12 +710,16 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; ++ struct bkey deleted = KEY(0, 0, 0); ++ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + unsigned flags = + BTREE_TRIGGER_GC| + (initial ? BTREE_TRIGGER_NOATOMIC : 0); + char buf[200]; + int ret = 0; + ++ deleted.p = k->k->p; ++ + if (initial) { + BUG_ON(bch2_journal_seq_verify && + k->k->version.lo > journal_cur_seq(&c->journal)); +@@ -754,7 +758,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + *max_stale = max(*max_stale, ptr_stale(ca, ptr)); + } + +- ret = bch2_mark_key(trans, *k, flags); ++ ret = bch2_mark_key(trans, old, *k, flags); + fsck_err: + err: + if (ret) +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 22398f6a0cc1..131fd4c1e736 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -473,14 +473,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + i->k->k.version = MAX_VERSION; + } + +- if (marking) +- percpu_down_read(&c->mark_lock); +- +- if (marking && trans->fs_usage_deltas && +- bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) { +- percpu_up_read(&c->mark_lock); ++ if (trans->fs_usage_deltas && ++ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) + return BTREE_INSERT_NEED_MARK_REPLICAS; +- } + + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) +@@ -492,9 +487,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + trans_for_each_update(trans, i) + do_btree_insert_one(trans, i); + +- if (marking) +- percpu_up_read(&c->mark_lock); +- + return ret; + } + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 4d661593e5bc..5ef122174762 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -144,6 +144,7 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + unsigned journal_seq, + bool gc) + { ++ percpu_rwsem_assert_held(&c->mark_lock); + BUG_ON(!gc && !journal_seq); + + return this_cpu_ptr(gc +@@ -371,8 +372,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) + journal_seq = 1; + +- percpu_rwsem_assert_held(&c->mark_lock); +- + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + u = dev_usage_ptr(ca, journal_seq, gc); +@@ -423,17 +422,24 @@ static inline int update_replicas(struct bch_fs *c, + unsigned journal_seq, bool gc) + { + struct bch_fs_usage __percpu *fs_usage; +- int idx = bch2_replicas_entry_idx(c, r); ++ int idx, ret = 0; + +- if (idx < 0) +- return -1; ++ 
percpu_down_read(&c->mark_lock); ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ if (idx < 0) { ++ ret = -1; ++ goto err; ++ } + + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; + preempt_enable(); +- return 0; ++err: ++ percpu_up_read(&c->mark_lock); ++ return ret; + } + + static inline int update_cached_sectors(struct bch_fs *c, +@@ -547,6 +553,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, + struct bch_dev *ca; + struct bucket *g; + struct bucket_mark old_m, m; ++ int ret = 0; + + /* We don't do anything for deletions - do we?: */ + if (!bkey_is_alloc(new.k)) +@@ -573,6 +580,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, + if (new.k->p.offset >= ca->mi.nbuckets) + return 0; + ++ percpu_down_read(&c->mark_lock); + g = __bucket(ca, new.k->p.offset, gc); + u = bch2_alloc_unpack(new); + +@@ -597,6 +605,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, + g->gen_valid = 1; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; ++ percpu_up_read(&c->mark_lock); + + /* + * need to know if we're getting called from the invalidate path or +@@ -605,10 +614,11 @@ static int bch2_mark_alloc(struct btree_trans *trans, + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_m.cached_sectors) { +- if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, +- journal_seq, gc)) { ++ ret = update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, ++ journal_seq, gc); ++ if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); +- return -1; ++ return ret; + } + + trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), +@@ -782,24 +792,28 @@ static int mark_stripe_bucket(struct btree_trans *trans, + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; + bool gc = flags & BTREE_TRIGGER_GC; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, gc); ++ struct bucket *g; + struct bucket_mark new, old; + char buf[200]; +- int ret; ++ int ret = 0; ++ ++ percpu_down_read(&c->mark_lock); ++ g = PTR_BUCKET(ca, ptr, gc); + + if (g->stripe && g->stripe != k.k->p.offset) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + + old = bucket_cmpxchg(g, new, ({ + ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, + new.dirty_sectors, new.cached_sectors); + if (ret) +- return ret; ++ goto err; + + if (parity) { + new.data_type = BCH_DATA_parity; +@@ -816,6 +830,9 @@ static int mark_stripe_bucket(struct btree_trans *trans, + g->stripe_redundancy = s->nr_redundant; + + bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); ++err: ++ percpu_up_read(&c->mark_lock); ++ + return 0; + } + +@@ -853,10 +870,13 @@ static int bch2_mark_pointer(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); +- struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); ++ struct bucket *g; + u8 bucket_data_type; + u64 v; +- int ret; ++ int ret = 0; ++ ++ percpu_down_read(&c->mark_lock); ++ g = PTR_BUCKET(ca, &p.ptr, gc); + + v = atomic64_read(&g->_mark.v); + do { +@@ -869,7 +889,7 @@ static int bch2_mark_pointer(struct btree_trans *trans, + &new.dirty_sectors, + &new.cached_sectors); + 
if (ret) +- return ret; ++ goto err; + + new.data_type = bucket_data_type; + +@@ -889,8 +909,10 @@ static int bch2_mark_pointer(struct btree_trans *trans, + bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); + + BUG_ON(!gc && bucket_became_unavailable(old, new)); ++err: ++ percpu_up_read(&c->mark_lock); + +- return 0; ++ return ret; + } + + static int bch2_mark_stripe_ptr(struct btree_trans *trans, +@@ -978,13 +1000,14 @@ static int bch2_mark_extent(struct btree_trans *trans, + stale = ret > 0; + + if (p.ptr.cached) { +- if (!stale) +- if (update_cached_sectors(c, p.ptr.dev, disk_sectors, +- journal_seq, gc)) { ++ if (!stale) { ++ ret = update_cached_sectors(c, p.ptr.dev, disk_sectors, ++ journal_seq, gc); ++ if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); +- return -1; +- ++ return ret; + } ++ } + } else if (!p.has_ec) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; +@@ -1004,12 +1027,13 @@ static int bch2_mark_extent(struct btree_trans *trans, + } + + if (r.e.nr_devs) { +- if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) { ++ ret = update_replicas(c, &r.e, dirty_sectors, journal_seq, gc); ++ if (ret) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf); +- return -1; ++ return ret; + } + } + +@@ -1091,14 +1115,15 @@ static int bch2_mark_stripe(struct btree_trans *trans, + return ret; + } + +- if (update_replicas(c, &m->r.e, +- ((s64) m->sectors * m->nr_redundant), +- journal_seq, gc)) { ++ ret = update_replicas(c, &m->r.e, ++ ((s64) m->sectors * m->nr_redundant), ++ journal_seq, gc); ++ if (ret) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf); +- return -1; ++ return ret; + } + } + +@@ -1123,11 +1148,15 @@ static int bch2_mark_inode(struct btree_trans *trans, + } + + if (flags & BTREE_TRIGGER_GC) { ++ percpu_down_read(&c->mark_lock); + preempt_disable(); ++ + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += bkey_is_inode(new.k); + fs_usage->nr_inodes -= bkey_is_inode(old.k); ++ + preempt_enable(); ++ percpu_up_read(&c->mark_lock); + } + return 0; + } +@@ -1146,14 +1175,18 @@ static int bch2_mark_reservation(struct btree_trans *trans, + sectors = -sectors; + sectors *= replicas; + ++ percpu_down_read(&c->mark_lock); + preempt_disable(); ++ + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(fs_usage->persistent_reserved)); + + fs_usage->reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; ++ + preempt_enable(); ++ percpu_up_read(&c->mark_lock); + + return 0; + } +@@ -1241,10 +1274,10 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, + return ret; + } + +-static int bch2_mark_key_locked(struct btree_trans *trans, +- struct bkey_s_c old, +- struct bkey_s_c new, +- unsigned flags) ++int bch2_mark_key(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_s_c new, ++ unsigned flags) + { + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; + +@@ -1274,22 +1307,6 @@ static int bch2_mark_key_locked(struct btree_trans *trans, + } + } + +-int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) +-{ +- struct bch_fs *c = trans->c; +- struct bkey deleted = KEY(0, 0, 0); +- struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; +- int ret; +- +- deleted.p = new.k->p; +- +- percpu_down_read(&c->mark_lock); +- ret = bch2_mark_key_locked(trans, old, new, flags); +- percpu_up_read(&c->mark_lock); +- +- return ret; +-} +- + int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *new, unsigned flags) + { +@@ -1311,12 +1328,12 @@ int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, + + if (old.k->type == new->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { +- ret = bch2_mark_key_locked(trans, old, bkey_i_to_s_c(new), ++ ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { +- ret = bch2_mark_key_locked(trans, deleted, bkey_i_to_s_c(new), ++ ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: +- bch2_mark_key_locked(trans, old, deleted, ++ bch2_mark_key(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + +@@ -1372,8 +1389,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, + s64 added = 0, should_not_have_added; + unsigned i; + +- percpu_rwsem_assert_held(&c->mark_lock); +- ++ percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + +@@ -1421,6 +1437,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, + } + + preempt_enable(); ++ percpu_up_read(&c->mark_lock); + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); +@@ -1431,6 +1448,7 @@ need_mark: + BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); + + preempt_enable(); ++ percpu_up_read(&c->mark_lock); + return -1; + } + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 5ef66e729212..ac9b554acd86 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -226,7 +226,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + +-int bch2_mark_key(struct btree_trans *, struct bkey_s_c, unsigned); ++int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); + + int bch2_mark_update(struct btree_trans *, struct btree_path *, + struct bkey_i *, unsigned); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index bae3e3b28aa3..0df3bbec2e7c 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1635,12 +1635,16 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) + + static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) + { ++ struct bkey deleted = KEY(0, 0, 0); ++ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + struct bch_fs *c = trans->c; + int ret = 0; + ++ deleted.p = k.k->p; ++ + if (k.k->type == KEY_TYPE_stripe) + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: +- bch2_mark_key(trans, k, ++ bch2_mark_key(trans, old, k, + BTREE_TRIGGER_NOATOMIC); + + return ret; +-- +cgit v1.2.3 + + +From 1591cfcdb26aec2328047c4ae677f75323aeda9e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 28 Nov 2021 15:13:54 -0500 +Subject: bcachefs: Handle replica marking fsck errors locally + +This simplifies the code quite a bit and 
eliminates an inconsistency - a +given bkey doesn't necessarily translate to a single replicas entry for +disk space accounting. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 13 ------------- + fs/bcachefs/buckets.c | 38 ++++++++++++++++++++++++++---------- + fs/bcachefs/replicas.c | 52 -------------------------------------------------- + fs/bcachefs/replicas.h | 2 -- + 4 files changed, 28 insertions(+), 77 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 06fb1f3f772b..e7e0a56af085 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -715,7 +715,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + unsigned flags = + BTREE_TRIGGER_GC| + (initial ? BTREE_TRIGGER_NOATOMIC : 0); +- char buf[200]; + int ret = 0; + + deleted.p = k->k->p; +@@ -733,18 +732,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + k->k->version.lo, + atomic64_read(&c->key_version))) + atomic64_set(&c->key_version, k->k->version.lo); +- +- if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || +- fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, +- "superblock not marked as containing replicas\n" +- " while marking %s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { +- ret = bch2_mark_bkey_replicas(c, *k); +- if (ret) { +- bch_err(c, "error marking bkey replicas: %i", ret); +- goto err; +- } +- } + } + + ptrs = bch2_bkey_ptrs_c(*k); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 5ef122174762..f8333eb11ce2 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -417,16 +417,30 @@ static inline int __update_replicas(struct bch_fs *c, + return 0; + } + +-static inline int update_replicas(struct bch_fs *c, ++static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, + struct bch_replicas_entry *r, s64 sectors, + unsigned journal_seq, bool gc) + { + struct bch_fs_usage __percpu *fs_usage; + int idx, ret = 0; ++ char buf[200]; + + percpu_down_read(&c->mark_lock); + + idx = bch2_replicas_entry_idx(c, r); ++ if (idx < 0 && ++ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err(c, "no replicas entry\n" ++ " while marking %s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) { ++ percpu_up_read(&c->mark_lock); ++ ret = bch2_mark_replicas(c, r); ++ if (ret) ++ return ret; ++ ++ percpu_down_read(&c->mark_lock); ++ idx = bch2_replicas_entry_idx(c, r); ++ } + if (idx < 0) { + ret = -1; + goto err; +@@ -438,11 +452,13 @@ static inline int update_replicas(struct bch_fs *c, + fs_usage->replicas[idx] += sectors; + preempt_enable(); + err: ++fsck_err: + percpu_up_read(&c->mark_lock); + return ret; + } + + static inline int update_cached_sectors(struct bch_fs *c, ++ struct bkey_s_c k, + unsigned dev, s64 sectors, + unsigned journal_seq, bool gc) + { +@@ -450,7 +466,7 @@ static inline int update_cached_sectors(struct bch_fs *c, + + bch2_replicas_entry_cached(&r.e, dev); + +- return update_replicas(c, &r.e, sectors, journal_seq, gc); ++ return update_replicas(c, k, &r.e, sectors, journal_seq, gc); + } + + static struct replicas_delta_list * +@@ -614,8 +630,9 @@ static int bch2_mark_alloc(struct btree_trans *trans, + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_m.cached_sectors) { +- ret = update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, +- journal_seq, gc); ++ ret = update_cached_sectors(c, new, ca->dev_idx, ++ -old_m.cached_sectors, ++ journal_seq, gc); + if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_alloc(): no 
replicas entry while updating cached sectors"); + return ret; +@@ -916,6 +933,7 @@ err: + } + + static int bch2_mark_stripe_ptr(struct btree_trans *trans, ++ struct bkey_s_c k, + struct bch_extent_stripe_ptr p, + enum bch_data_type data_type, + s64 sectors, +@@ -955,7 +973,7 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, + spin_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; +- update_replicas(c, &r.e, sectors, trans->journal_res.seq, gc); ++ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc); + + return 0; + } +@@ -1001,8 +1019,8 @@ static int bch2_mark_extent(struct btree_trans *trans, + + if (p.ptr.cached) { + if (!stale) { +- ret = update_cached_sectors(c, p.ptr.dev, disk_sectors, +- journal_seq, gc); ++ ret = update_cached_sectors(c, k, p.ptr.dev, ++ disk_sectors, journal_seq, gc); + if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); + return ret; +@@ -1012,7 +1030,7 @@ static int bch2_mark_extent(struct btree_trans *trans, + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { +- ret = bch2_mark_stripe_ptr(trans, p.ec, data_type, ++ ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, + disk_sectors, flags); + if (ret) + return ret; +@@ -1027,7 +1045,7 @@ static int bch2_mark_extent(struct btree_trans *trans, + } + + if (r.e.nr_devs) { +- ret = update_replicas(c, &r.e, dirty_sectors, journal_seq, gc); ++ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, gc); + if (ret) { + char buf[200]; + +@@ -1115,7 +1133,7 @@ static int bch2_mark_stripe(struct btree_trans *trans, + return ret; + } + +- ret = update_replicas(c, &m->r.e, ++ ret = update_replicas(c, new, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + journal_seq, gc); + if (ret) { +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index e72b5afccbe7..6c5ea78d6762 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -427,45 +427,6 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) + return __bch2_mark_replicas(c, r, false); + } + +-static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, +- bool check) +-{ +- struct bch_replicas_padded search; +- struct bch_devs_list cached = bch2_bkey_cached_devs(k); +- unsigned i; +- int ret; +- +- memset(&search, 0, sizeof(search)); +- +- for (i = 0; i < cached.nr; i++) { +- bch2_replicas_entry_cached(&search.e, cached.devs[i]); +- +- ret = __bch2_mark_replicas(c, &search.e, check); +- if (ret) +- return ret; +- } +- +- bch2_bkey_to_replicas(&search.e, k); +- +- ret = __bch2_mark_replicas(c, &search.e, check); +- if (ret) +- return ret; +- +- if (search.e.data_type == BCH_DATA_parity) { +- search.e.data_type = BCH_DATA_cached; +- ret = __bch2_mark_replicas(c, &search.e, check); +- if (ret) +- return ret; +- +- search.e.data_type = BCH_DATA_user; +- ret = __bch2_mark_replicas(c, &search.e, check); +- if (ret) +- return ret; +- } +- +- return 0; +-} +- + /* replicas delta list: */ + + int bch2_replicas_delta_list_mark(struct bch_fs *c, +@@ -480,19 +441,6 @@ int bch2_replicas_delta_list_mark(struct bch_fs *c, + return ret; + } + +-/* bkey replicas: */ +- +-bool bch2_bkey_replicas_marked(struct bch_fs *c, +- struct bkey_s_c k) +-{ +- return __bch2_mark_bkey_replicas(c, k, true) == 0; +-} +- +-int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) +-{ +- return __bch2_mark_bkey_replicas(c, k, false); +-} +- + /* + * Old replicas_gc mechanism: only used for journal replicas 
entries now, should + * die at some point: +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +index 66ca88deb0c0..d237d7c51ccb 100644 +--- a/fs/bcachefs/replicas.h ++++ b/fs/bcachefs/replicas.h +@@ -51,8 +51,6 @@ replicas_delta_next(struct replicas_delta *d) + int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); + + void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); +-bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); +-int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); + + static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, + unsigned dev) +-- +cgit v1.2.3 + + +From 488045ef2ba59e0505b3776133410e142503a7a1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 29 Nov 2021 16:38:27 -0500 +Subject: bcachefs: Erasure coding fixes + +When we added the stripe and stripe_redundancy fields to alloc keys, we +neglected to add them to the functions that convert back and forth with +the in-memory types. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 3 ++ + fs/bcachefs/alloc_background.h | 2 + + fs/bcachefs/btree_gc.c | 16 +++--- + fs/bcachefs/buckets.c | 119 +++++++++++++++++++++++++++++------------ + fs/bcachefs/ec.c | 39 +++++++++++--- + 5 files changed, 130 insertions(+), 49 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index b2735c8591d6..bf3611e76912 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -336,6 +336,9 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; + g->_mark.cached_sectors = u.cached_sectors; ++ g->_mark.stripe = u.stripe != 0; ++ g->stripe = u.stripe; ++ g->stripe_redundancy = u.stripe_redundancy; + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; + g->oldest_gen = u.oldest_gen; +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 370573f8e05d..b1efc1494dc4 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -65,6 +65,8 @@ alloc_mem_to_key(struct btree_iter *iter, + .cached_sectors = m.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], ++ .stripe = g->stripe, ++ .stripe_redundancy = g->stripe_redundancy, + }; + } + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index e7e0a56af085..4deb87f91d08 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1176,14 +1176,14 @@ static int bch2_gc_done(struct bch_fs *c, + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } + #define copy_bucket_field(_f) \ +- if (dst->b[b].mark._f != src->b[b].mark._f) { \ ++ if (dst->b[b]._f != src->b[b]._f) { \ + if (verify) \ + fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", dev, b, \ + dst->b[b].mark.gen, \ + bch2_data_types[dst->b[b].mark.data_type],\ +- dst->b[b].mark._f, src->b[b].mark._f); \ +- dst->b[b]._mark._f = src->b[b].mark._f; \ ++ dst->b[b]._f, src->b[b]._f); \ ++ dst->b[b]._f = src->b[b]._f; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } + #define copy_dev_field(_f, _msg, ...) 
\ +@@ -1229,11 +1229,13 @@ static int bch2_gc_done(struct bch_fs *c, + size_t b; + + for (b = 0; b < src->nbuckets; b++) { +- copy_bucket_field(gen); +- copy_bucket_field(data_type); ++ copy_bucket_field(_mark.gen); ++ copy_bucket_field(_mark.data_type); ++ copy_bucket_field(_mark.stripe); ++ copy_bucket_field(_mark.dirty_sectors); ++ copy_bucket_field(_mark.cached_sectors); ++ copy_bucket_field(stripe_redundancy); + copy_bucket_field(stripe); +- copy_bucket_field(dirty_sectors); +- copy_bucket_field(cached_sectors); + + dst->b[b].oldest_gen = src->b[b].oldest_gen; + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index f8333eb11ce2..9d1a22fd1c2c 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -806,6 +806,8 @@ static int mark_stripe_bucket(struct btree_trans *trans, + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; ++ enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; ++ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; + bool gc = flags & BTREE_TRIGGER_GC; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +@@ -814,10 +816,13 @@ static int mark_stripe_bucket(struct btree_trans *trans, + char buf[200]; + int ret = 0; + ++ /* * XXX doesn't handle deletion */ ++ + percpu_down_read(&c->mark_lock); + g = PTR_BUCKET(ca, ptr, gc); + +- if (g->stripe && g->stripe != k.k->p.offset) { ++ if (g->mark.dirty_sectors || ++ (g->stripe && g->stripe != k.k->p.offset)) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, +@@ -827,20 +832,22 @@ static int mark_stripe_bucket(struct btree_trans *trans, + } + + old = bucket_cmpxchg(g, new, ({ +- ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, ++ ret = check_bucket_ref(c, k, ptr, sectors, data_type, ++ new.gen, new.data_type, + new.dirty_sectors, new.cached_sectors); + if (ret) + goto err; + +- if (parity) { +- new.data_type = BCH_DATA_parity; +- new.dirty_sectors = le16_to_cpu(s->sectors); +- } ++ new.dirty_sectors += sectors; ++ if (data_type) ++ new.data_type = data_type; + + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } ++ ++ new.stripe = true; + })); + + g->stripe = k.k->p.offset; +@@ -1120,6 +1127,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, + } + + if (gc) { ++ /* ++ * This will be wrong when we bring back runtime gc: we should ++ * be unmarking the old key and then marking the new key ++ */ ++ + /* + * gc recalculates this field from stripe ptr + * references: +@@ -1651,50 +1663,75 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + return 0; + } + +-static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, +- struct bkey_s_c_stripe s, +- unsigned idx, bool deleting) ++static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, ++ struct bkey_s_c_stripe s, ++ unsigned idx, bool deleting) + { + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct bkey_alloc_buf *a; + struct btree_iter iter; + struct bkey_alloc_unpacked u; +- bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; ++ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant ++ ? BCH_DATA_parity : 0; ++ s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; + int ret = 0; + ++ if (deleting) ++ sectors = -sectors; ++ + a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); + +- if (parity) { +- s64 sectors = le16_to_cpu(s.v->sectors); +- +- if (deleting) +- sectors = -sectors; +- +- u.dirty_sectors += sectors; +- u.data_type = u.dirty_sectors +- ? BCH_DATA_parity +- : 0; +- } ++ ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, ++ u.gen, u.data_type, ++ u.dirty_sectors, u.cached_sectors); ++ if (ret) ++ goto err; + + if (!deleting) { +- if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, +- "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", ++ if (bch2_fs_inconsistent_on(u.stripe || ++ u.stripe_redundancy, c, ++ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", + iter.pos.inode, iter.pos.offset, u.gen, ++ bch2_data_types[u.data_type], ++ u.dirty_sectors, + u.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + ++ if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c, ++ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", ++ iter.pos.inode, iter.pos.offset, u.gen, ++ bch2_data_types[u.data_type], ++ u.dirty_sectors, ++ s.k->p.offset)) { ++ ret = -EIO; ++ goto err; ++ } ++ + u.stripe = s.k->p.offset; + u.stripe_redundancy = s.v->nr_redundant; + } else { ++ if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset || ++ u.stripe_redundancy != s.v->nr_redundant, c, ++ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", ++ iter.pos.inode, iter.pos.offset, u.gen, ++ s.k->p.offset, u.stripe)) { ++ ret = -EIO; ++ goto err; ++ } ++ + u.stripe = 0; + u.stripe_redundancy = 0; + } + ++ u.dirty_sectors += sectors; ++ if (data_type) ++ u.data_type = !deleting ? data_type : 0; ++ + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, &iter, &a->k, 0); + err: +@@ -1709,7 +1746,7 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + struct bkey_s_c_stripe old_s = { .k = NULL }; + struct bkey_s_c_stripe new_s = { .k = NULL }; + struct bch_replicas_padded r; +- unsigned i; ++ unsigned i, nr_blocks; + int ret = 0; + + if (old.k->type == KEY_TYPE_stripe) +@@ -1727,18 +1764,17 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; + ++ BUG_ON(new_s.k && old_s.k && ++ (new_s.v->nr_blocks != old_s.v->nr_blocks || ++ new_s.v->nr_redundant != old_s.v->nr_redundant)); ++ ++ nr_blocks = new_s.k ? 
new_s.v->nr_blocks : old_s.v->nr_blocks; ++ + if (new_s.k) { + s64 sectors = le16_to_cpu(new_s.v->sectors); + + bch2_bkey_to_replicas(&r.e, new); + update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); +- +- for (i = 0; i < new_s.v->nr_blocks; i++) { +- ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, +- i, false); +- if (ret) +- return ret; +- } + } + + if (old_s.k) { +@@ -1746,12 +1782,25 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + + bch2_bkey_to_replicas(&r.e, old); + update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); ++ } ++ ++ for (i = 0; i < nr_blocks; i++) { ++ if (new_s.k && old_s.k && ++ !memcmp(&new_s.v->ptrs[i], ++ &old_s.v->ptrs[i], ++ sizeof(new_s.v->ptrs[i]))) ++ continue; + +- for (i = 0; i < old_s.v->nr_blocks; i++) { +- ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, +- i, true); ++ if (new_s.k) { ++ ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false); + if (ret) +- return ret; ++ break; ++ } ++ ++ if (old_s.k) { ++ ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true); ++ if (ret) ++ break; + } + } + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 0df3bbec2e7c..71d85c934741 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -15,6 +15,7 @@ + #include "io.h" + #include "keylist.h" + #include "recovery.h" ++#include "replicas.h" + #include "super-io.h" + #include "util.h" + +@@ -1635,17 +1636,41 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) + + static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) + { +- struct bkey deleted = KEY(0, 0, 0); +- struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; ++ const struct bch_stripe *s; + struct bch_fs *c = trans->c; ++ struct stripe *m; ++ unsigned i; + int ret = 0; + +- deleted.p = k.k->p; ++ if (k.k->type != KEY_TYPE_stripe) ++ return 0; ++ ++ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); ++ if (ret) ++ return ret; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ ++ m = genradix_ptr(&c->stripes[0], k.k->p.offset); ++ m->alive = true; ++ m->sectors = le16_to_cpu(s->sectors); ++ m->algorithm = s->algorithm; ++ m->nr_blocks = s->nr_blocks; ++ m->nr_redundant = s->nr_redundant; ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < s->nr_blocks; i++) { ++ m->block_sectors[i] = ++ stripe_blockcount_get(s, i); ++ m->blocks_nonempty += !!m->block_sectors[i]; ++ m->ptrs[i] = s->ptrs[i]; ++ } ++ ++ bch2_bkey_to_replicas(&m->r.e, k); + +- if (k.k->type == KEY_TYPE_stripe) +- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: +- bch2_mark_key(trans, old, k, +- BTREE_TRIGGER_NOATOMIC); ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_update(c, m, k.k->p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); + + return ret; + } +-- +cgit v1.2.3 + + +From 43adfb564e3c15f87a657b1988c0412f5a383c63 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 1 Dec 2021 03:47:54 -0500 +Subject: bcachefs: Fix btree_path leaks in bch2_trans_update() + +bch2_trans_update() had some dodgy gets() and puts() - this fixes a few +leaks. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 131fd4c1e736..39917d104cb2 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1324,8 +1324,6 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + .ip_allocated = _RET_IP_, + }; + +- __btree_path_get(n.path, true); +- + #ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && +@@ -1362,16 +1360,17 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + if (n.cached && !i->cached) { + i->k = n.k; + i->flags = n.flags; +- +- __btree_path_get(n.path, false); +- } else { +- bch2_path_put(trans, i->path, true); +- *i = n; ++ return 0; + } ++ ++ bch2_path_put(trans, i->path, true); ++ *i = n; + } else + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + ++ __btree_path_get(n.path, true); ++ + return 0; + } + +-- +cgit v1.2.3 + + +From b0f59f9cd359d06d84317d7340c8bfbc86013b83 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Dec 2021 20:07:19 -0500 +Subject: bcachefs: Convert journal sysfs params to regular options + +This converts journal_write_delay, journal_flush_disabled, and +journal_reclaim_delay to normal filesystems options, and also adds them +to the superblock. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 3 +++ + fs/bcachefs/journal.c | 5 +---- + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/journal_reclaim.c | 5 +++-- + fs/bcachefs/journal_types.h | 2 -- + fs/bcachefs/opts.h | 12 +++++++++++- + fs/bcachefs/super.c | 9 +++++++++ + fs/bcachefs/sysfs.c | 20 -------------------- + 8 files changed, 28 insertions(+), 30 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index b115bd1fa5a3..495f4d19ddcb 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1436,6 +1436,9 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); + LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); + LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); ++LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); ++LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); ++LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); + + /* + * Features: +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 14bea8a2535e..f23a53136108 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -311,7 +311,7 @@ static int journal_entry_open(struct journal *j) + + mod_delayed_work(c->io_complete_wq, + &j->write_work, +- msecs_to_jiffies(j->write_delay_ms)); ++ msecs_to_jiffies(c->opts.journal_flush_delay)); + journal_wake(j); + return 0; + } +@@ -1101,9 +1101,6 @@ int bch2_fs_journal_init(struct journal *j) + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + +- j->write_delay_ms = 1000; +- j->reclaim_delay_ms = 100; +- + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 5c8304e05abd..37abfb1885a6 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1398,7 
+1398,7 @@ void bch2_journal_write(struct closure *cl) + spin_lock(&j->lock); + if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && + !w->must_flush && +- (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) && ++ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index ca482c6743c3..ab9a6d966d5e 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -637,7 +637,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + * make sure to flush at least one journal pin: + */ + if (time_after(jiffies, j->last_flushed + +- msecs_to_jiffies(j->reclaim_delay_ms))) ++ msecs_to_jiffies(c->opts.journal_reclaim_delay))) + min_nr = 1; + + if (j->prereserved.reserved * 4 > j->prereserved.remaining) +@@ -686,6 +686,7 @@ int bch2_journal_reclaim(struct journal *j) + static int bch2_journal_reclaim_thread(void *arg) + { + struct journal *j = arg; ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned long delay, now; + int ret = 0; + +@@ -703,7 +704,7 @@ static int bch2_journal_reclaim_thread(void *arg) + mutex_unlock(&j->reclaim_lock); + + now = jiffies; +- delay = msecs_to_jiffies(j->reclaim_delay_ms); ++ delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); + j->next_reclaim = j->last_flushed + delay; + + if (!time_in_range(j->next_reclaim, now, now + delay)) +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index d484513289aa..66b1707a6697 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -262,8 +262,6 @@ struct journal { + struct mutex discard_lock; + bool can_discard; + +- unsigned write_delay_ms; +- unsigned reclaim_delay_ms; + unsigned long last_flush_write; + + u64 res_get_blocked_start; +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index afb1bb2a62d2..893ad4864ff7 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -257,13 +257,23 @@ enum opt_type { + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Extra debugging information during mount/recovery")\ ++ x(journal_flush_delay, u32, \ ++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(0, U32_MAX), \ ++ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ ++ NULL, "Delay in milliseconds before automatic journal commits")\ + x(journal_flush_disabled, u8, \ + OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ + NULL, "Disable journal flush on sync/fsync\n" \ + "If enabled, writes can be lost, but only since the\n"\ + "last journal write (default 1 second)") \ ++ x(journal_reclaim_delay, u32, \ ++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(0, U32_MAX), \ ++ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ ++ NULL, "Delay in milliseconds before automatic journal reclaim")\ + x(fsck, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 3744b6d519a7..f673efed2f47 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -744,6 +744,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); + ++ /* Compat: */ ++ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); ++ ++ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ 
!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); ++ + c->opts = bch2_opts_default; + bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); + bch2_opts_apply(&c->opts, opts); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 864be8601868..fae2356061b0 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -140,8 +140,6 @@ rw_attribute(gc_gens_pos); + read_attribute(uuid); + read_attribute(minor); + read_attribute(bucket_size); +-read_attribute(block_size); +-read_attribute(btree_node_size); + read_attribute(first_bucket); + read_attribute(nbuckets); + read_attribute(durability); +@@ -178,9 +176,6 @@ read_attribute(read_realloc_races); + read_attribute(extent_migrate_done); + read_attribute(extent_migrate_raced); + +-rw_attribute(journal_write_delay_ms); +-rw_attribute(journal_reclaim_delay_ms); +- + rw_attribute(discard); + rw_attribute(cache_replacement_policy); + rw_attribute(label); +@@ -357,11 +352,6 @@ SHOW(bch2_fs) + sysfs_print(minor, c->minor); + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + +- sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); +- sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); +- +- sysfs_print(block_size, block_bytes(c)); +- sysfs_print(btree_node_size, btree_bytes(c)); + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); + sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); + +@@ -475,9 +465,6 @@ STORE(bch2_fs) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + +- sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); +- sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); +- + if (attr == &sysfs_btree_gc_periodic) { + ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) + ?: (ssize_t) size; +@@ -564,14 +551,9 @@ SYSFS_OPS(bch2_fs); + + struct attribute *bch2_fs_files[] = { + &sysfs_minor, +- &sysfs_block_size, +- &sysfs_btree_node_size, + &sysfs_btree_cache_size, + &sysfs_btree_avg_write_size, + +- &sysfs_journal_write_delay_ms, +- &sysfs_journal_reclaim_delay_ms, +- + &sysfs_promote_whole_extents, + + &sysfs_compression_stats, +@@ -846,7 +828,6 @@ SHOW(bch2_dev) + sysfs_printf(uuid, "%pU\n", ca->uuid.b); + + sysfs_print(bucket_size, bucket_bytes(ca)); +- sysfs_print(block_size, block_bytes(c)); + sysfs_print(first_bucket, ca->mi.first_bucket); + sysfs_print(nbuckets, ca->mi.nbuckets); + sysfs_print(durability, ca->mi.durability); +@@ -978,7 +959,6 @@ SYSFS_OPS(bch2_dev); + struct attribute *bch2_dev_files[] = { + &sysfs_uuid, + &sysfs_bucket_size, +- &sysfs_block_size, + &sysfs_first_bucket, + &sysfs_nbuckets, + &sysfs_durability, +-- +cgit v1.2.3 + + +From dde0381ca36396498f2d41cca1843a3cad664264 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Dec 2021 21:52:09 -0500 +Subject: bcachefs: Fix copygc sectors_to_move calculation + +With erasure coding, copygc's count of sectors to move was off, which +matters for the debug statement it prints out when it's not able to move +all the data it tried to. 
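To make the accounting concrete, here is a minimal standalone sketch of the corrected arithmetic — heap_entry and copygc_totals are invented names for illustration only, the actual change is in movinggc.c below. The point: data is counted once as "sectors to move", but every copy (including parity) counts toward "sectors to write", and it is the latter that has to fit inside the copygc reserve.

    #include <stdint.h>
    #include <stdio.h>

    /* Invented, cut-down heap entry: just the two fields the arithmetic needs. */
    struct heap_entry {
        uint32_t sectors;   /* logical data sectors in the bucket */
        uint32_t replicas;  /* copies (including parity) each sector becomes */
    };

    /* Corrected accounting: data is counted once for "to move", every
     * replica counts toward "to write", and only the latter is checked
     * against the copygc reservation. */
    static void copygc_totals(const struct heap_entry *e, unsigned nr,
                              uint64_t *sectors_to_move, uint64_t *sectors_to_write)
    {
        unsigned i;

        *sectors_to_move = *sectors_to_write = 0;
        for (i = 0; i < nr; i++) {
            *sectors_to_move  += e[i].sectors;
            *sectors_to_write += (uint64_t) e[i].sectors * e[i].replicas;
        }
    }

    int main(void)
    {
        struct heap_entry h[] = { { 256, 2 }, { 128, 3 } };
        uint64_t move, write;

        copygc_totals(h, 2, &move, &write);
        printf("to move %llu, to write %llu\n",
               (unsigned long long) move, (unsigned long long) write);
        return 0;   /* prints: to move 384, to write 896 */
    }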
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 3 +-- + fs/bcachefs/movinggc.c | 21 +++++++++++---------- + 2 files changed, 12 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 64e39c10e34b..f0495451e20f 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -767,8 +767,7 @@ static int __bch2_move_data(struct bch_fs *c, + if (rate) + bch2_ratelimit_increment(rate, k.k->size); + next: +- atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), +- &stats->sectors_seen); ++ atomic64_add(k.k->size, &stats->sectors_seen); + next_nondata: + bch2_btree_iter_advance(&iter); + } +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 5c9eafc026c9..7b7eee9b1773 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -139,7 +139,7 @@ static int bch2_copygc(struct bch_fs *c) + struct copygc_heap_entry e, *i; + struct bucket_array *buckets; + struct bch_move_stats move_stats; +- u64 sectors_to_move = 0, sectors_not_moved = 0; ++ u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; + u64 sectors_reserved = 0; + u64 buckets_to_move, buckets_not_moved = 0; + struct bch_dev *ca; +@@ -205,22 +205,23 @@ static int bch2_copygc(struct bch_fs *c) + up_read(&ca->bucket_lock); + } + ++ /* ++ * Our btree node allocations also come out of RESERVE_MOVINGGC: ++ */ ++ sectors_reserved = (sectors_reserved * 3) / 4; + if (!sectors_reserved) { + bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); + return -1; + } + +- /* +- * Our btree node allocations also come out of RESERVE_MOVINGGC: +- */ +- sectors_to_move = (sectors_to_move * 3) / 4; +- +- for (i = h->data; i < h->data + h->used; i++) +- sectors_to_move += i->sectors * i->replicas; ++ for (i = h->data; i < h->data + h->used; i++) { ++ sectors_to_move += i->sectors; ++ sectors_to_write += i->sectors * i->replicas; ++ } + +- while (sectors_to_move > sectors_reserved) { ++ while (sectors_to_write > sectors_reserved) { + BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); +- sectors_to_move -= e.sectors * e.replicas; ++ sectors_to_write -= e.sectors * e.replicas; + } + + buckets_to_move = h->used; +-- +cgit v1.2.3 + + +From c8db03d82e66c085755233404248213249c57387 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Dec 2021 21:53:13 -0500 +Subject: bcachefs: Specify filesystem options + +We've got three types of options now - filesystem, device and inode, and +a given option may belong to more than one of those types. + +This patch changes the options to specify explicitly when they're a +filesystem option - in the future we'll probably be adding more device +options. 
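A minimal sketch of how the reworked mode flags compose — the enum values mirror the opts.h hunk below, while main() is purely illustrative:

    #include <stdio.h>

    /* Values mirror the enum in the opts.h hunk below. */
    enum opt_mode {
        OPT_FS      = (1 << 0),  /* filesystem option */
        OPT_DEVICE  = (1 << 1),  /* device option */
        OPT_INODE   = (1 << 2),  /* inode option */
        OPT_FORMAT  = (1 << 3),  /* may be specified at format time */
        OPT_MOUNT   = (1 << 4),  /* may be specified at mount time */
        OPT_RUNTIME = (1 << 5),  /* may be specified at runtime */
    };

    int main(void)
    {
        /* e.g. data_replicas after this patch: a filesystem and inode option,
         * settable at format, mount and run time: */
        unsigned mode = OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME;

        /* The sysfs hunk below reduces its filter to exactly this test: */
        if (mode & OPT_FS)
            printf("exposed in the filesystem's sysfs options\n");

        return 0;
    }

With the option's type split out from the "when can it be set" flags, the sysfs code no longer has to infer "filesystem option" from OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME.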
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/opts.h | 107 ++++++++++++++++++++++++++-------------------------- + fs/bcachefs/sysfs.c | 2 +- + 2 files changed, 55 insertions(+), 54 deletions(-) + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 893ad4864ff7..e2eb9b3fb275 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -45,11 +45,12 @@ LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); + + /* When can be set: */ + enum opt_mode { +- OPT_FORMAT = (1 << 0), +- OPT_MOUNT = (1 << 1), +- OPT_RUNTIME = (1 << 2), +- OPT_INODE = (1 << 3), +- OPT_DEVICE = (1 << 4), ++ OPT_FS = (1 << 0), /* Filesystem option */ ++ OPT_DEVICE = (1 << 1), /* Device option */ ++ OPT_INODE = (1 << 2), /* Inode option */ ++ OPT_FORMAT = (1 << 3), /* May be specified at format time */ ++ OPT_MOUNT = (1 << 4), /* May be specified at mount time */ ++ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ + }; + + enum opt_type { +@@ -87,226 +88,226 @@ enum opt_type { + + #define BCH_OPTS() \ + x(block_size, u16, \ +- OPT_FORMAT, \ ++ OPT_FS|OPT_FORMAT, \ + OPT_SECTORS(1, 128), \ + BCH_SB_BLOCK_SIZE, 8, \ + "size", NULL) \ + x(btree_node_size, u16, \ +- OPT_FORMAT, \ ++ OPT_FS|OPT_FORMAT, \ + OPT_SECTORS(1, 512), \ + BCH_SB_BTREE_NODE_SIZE, 512, \ + "size", "Btree node size, default 256k") \ + x(errors, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_error_actions), \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + NULL, "Action to take on filesystem error") \ + x(metadata_replicas, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_WANT, 1, \ + "#", "Number of metadata replicas") \ + x(data_replicas, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_WANT, 1, \ + "#", "Number of data replicas") \ + x(metadata_replicas_required, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(data_replicas_required, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(metadata_checksum, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(data_checksum, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(compression, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_compression_opts), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(background_compression, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_compression_opts), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(str_hash, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_str_hash_opts), \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ + NULL, "Hash function for directory entries and xattrs")\ + x(metadata_target, 
u16, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_METADATA_TARGET, 0, \ + "(target)", "Device or disk group for metadata writes") \ + x(foreground_target, u16, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_FOREGROUND_TARGET, 0, \ + "(target)", "Device or disk group for foreground writes") \ + x(background_target, u16, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_BACKGROUND_TARGET, 0, \ + "(target)", "Device or disk group to move data to in the background")\ + x(promote_target, u16, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_PROMOTE_TARGET, 0, \ + "(target)", "Device or disk group to promote data to on read")\ + x(erasure_code, u16, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_ERASURE_CODE, false, \ + NULL, "Enable erasure coding (DO NOT USE YET)") \ + x(inodes_32bit, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_INODE_32BIT, true, \ + NULL, "Constrain inode numbers to 32 bits") \ + x(shard_inode_numbers, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_SHARD_INUMS, true, \ + NULL, "Shard new inode numbers by CPU id") \ + x(inodes_use_key_cache, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_INODES_USE_KEY_CACHE, true, \ + NULL, "Use the btree key cache for the inodes btree") \ + x(btree_node_mem_ptr_optimization, u8, \ +- OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + NO_SB_OPT, true, \ + NULL, "Stash pointer to in memory btree node in btree ptr")\ + x(gc_reserve_percent, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(5, 21), \ + BCH_SB_GC_RESERVE, 8, \ + "%", "Percentage of disk space to reserve for copygc")\ + x(gc_reserve_bytes, u64, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_SECTORS(0, U64_MAX), \ + BCH_SB_GC_RESERVE_BYTES, 0, \ + "%", "Amount of disk space to reserve for copygc\n" \ + "Takes precedence over gc_reserve_percent if set")\ + x(root_reserve_percent, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(0, 100), \ + BCH_SB_ROOT_RESERVE, 0, \ + "%", "Percentage of disk space to reserve for superuser")\ + x(wide_macs, u8, \ +- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_128_BIT_MACS, false, \ + NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ + x(inline_data, u8, \ +- OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + NO_SB_OPT, true, \ + NULL, "Enable inline data extents") \ + x(acl, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_POSIX_ACL, true, \ + NULL, "Enable POSIX acls") \ + x(usrquota, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_USRQUOTA, false, \ + NULL, "Enable user quotas") \ + x(grpquota, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, 
\ + OPT_BOOL(), \ + BCH_SB_GRPQUOTA, false, \ + NULL, "Enable group quotas") \ + x(prjquota, u8, \ +- OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false, \ + NULL, "Enable project quotas") \ + x(degraded, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Allow mounting in degraded mode") \ + x(very_degraded, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Allow mounting in when data will be missing") \ + x(discard, u8, \ +- OPT_MOUNT|OPT_DEVICE, \ ++ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Enable discard/TRIM support") \ + x(verbose, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Extra debugging information during mount/recovery")\ + x(journal_flush_delay, u32, \ +- OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U32_MAX), \ + BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ + NULL, "Delay in milliseconds before automatic journal commits")\ + x(journal_flush_disabled, u8, \ +- OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ + NULL, "Disable journal flush on sync/fsync\n" \ + "If enabled, writes can be lost, but only since the\n"\ + "last journal write (default 1 second)") \ + x(journal_reclaim_delay, u32, \ +- OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U32_MAX), \ + BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ + NULL, "Delay in milliseconds before automatic journal reclaim")\ + x(fsck, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Run fsck on mount") \ + x(fix_errors, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Fix errors during fsck without asking") \ + x(ratelimit_errors, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, RATELIMIT_ERRORS, \ + NULL, "Ratelimit error messages during fsck") \ + x(nochanges, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Super read only mode - no writes at all will be issued,\n"\ + "even if we have to replay the journal") \ + x(norecovery, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't replay the journal") \ + x(rebuild_replicas, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Rebuild the superblock replicas section") \ + x(keep_journal, u8, \ +- OPT_MOUNT, \ ++ 0, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't free journal entries/keys after startup")\ +@@ -316,7 +317,7 @@ enum opt_type { + NO_SB_OPT, false, \ + NULL, "Read all journal entries, not just dirty ones")\ + x(noexcl, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't open device in exclusive mode") \ +@@ -326,7 +327,7 @@ enum opt_type { + NO_SB_OPT, BCH_SB_SECTOR, \ + "offset", "Sector offset of superblock") \ + x(read_only, u8, \ +- 0, \ ++ OPT_FS, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, NULL) \ +@@ -336,12 +337,12 @@ enum opt_type { + NO_SB_OPT, false, \ + NULL, "Don\'t start filesystem, only open devices") \ + x(reconstruct_alloc, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Reconstruct alloc btree") \ + x(version_upgrade, u8, \ +- OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, 
"Set superblock to latest version,\n" \ +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index fae2356061b0..d5d32bf16d68 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -685,7 +685,7 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) + for (i = bch2_opt_table; + i < bch2_opt_table + bch2_opts_nr; + i++) { +- if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) ++ if (!(i->mode & OPT_FS)) + continue; + + ret = sysfs_create_file(kobj, &i->attr); +-- +cgit v1.2.3 + + +From 76b5d416f0f3a84e2b4a908a59a35f776236067b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Dec 2021 22:03:07 -0500 +Subject: bcachefs: Make __bch2_journal_debug_to_text() more readable + +Switch to one line of output per pr_buf() call - longer lines but quite +a bit more readable. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 83 +++++++++++++++++---------------------------------- + 1 file changed, 28 insertions(+), 55 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index f23a53136108..268f3ea4bdd2 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1132,44 +1132,29 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union journal_res_state s; + struct bch_dev *ca; ++ unsigned long now = jiffies; + unsigned i; + + rcu_read_lock(); + s = READ_ONCE(j->reservations); + +- pr_buf(out, +- "active journal entries:\t%llu\n" +- "seq:\t\t\t%llu\n" +- "last_seq:\t\t%llu\n" +- "last_seq_ondisk:\t%llu\n" +- "flushed_seq_ondisk:\t%llu\n" +- "prereserved:\t\t%u/%u\n" +- "each entry reserved:\t%u\n" +- "nr flush writes:\t%llu\n" +- "nr noflush writes:\t%llu\n" +- "nr direct reclaim:\t%llu\n" +- "nr background reclaim:\t%llu\n" +- "reclaim kicked:\t\t%u\n" +- "reclaim runs in:\t%u ms\n" +- "current entry sectors:\t%u\n" +- "current entry error:\t%u\n" +- "current entry:\t\t", +- fifo_used(&j->pin), +- journal_cur_seq(j), +- journal_last_seq(j), +- j->last_seq_ondisk, +- j->flushed_seq_ondisk, +- j->prereserved.reserved, +- j->prereserved.remaining, +- j->entry_u64s_reserved, +- j->nr_flush_writes, +- j->nr_noflush_writes, +- j->nr_direct_reclaim, +- j->nr_background_reclaim, +- j->reclaim_kicked, +- jiffies_to_msecs(j->next_reclaim - jiffies), +- j->cur_entry_sectors, +- j->cur_entry_error); ++ pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin)); ++ pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); ++ pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); ++ pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); ++ pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); ++ pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); ++ pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); ++ pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); ++ pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); ++ pr_buf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); ++ pr_buf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); ++ pr_buf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); ++ pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ++ ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); ++ pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); ++ pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error); ++ pr_buf(out, "current entry:\t\t"); + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: +@@ -1179,15 +1164,11 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + pr_buf(out, "closed\n"); + break; + default: +- pr_buf(out, "%u/%u\n", +- s.cur_entry_offset, +- j->cur_entry_u64s); ++ pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); + break; + } + +- pr_buf(out, +- "current entry:\t\tidx %u refcount %u\n", +- s.idx, journal_state_count(s, s.idx)); ++ pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx)); + + i = s.idx; + while (i != s.unwritten_idx) { +@@ -1227,22 +1208,14 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + if (!ja->nr) + continue; + +- pr_buf(out, +- "dev %u:\n" +- "\tnr\t\t%u\n" +- "\tbucket size\t%u\n" +- "\tavailable\t%u:%u\n" +- "\tdiscard_idx\t%u\n" +- "\tdirty_ondisk\t%u (seq %llu)\n" +- "\tdirty_idx\t%u (seq %llu)\n" +- "\tcur_idx\t\t%u (seq %llu)\n", +- i, ja->nr, ca->mi.bucket_size, +- bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), +- ja->sectors_free, +- ja->discard_idx, +- ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], +- ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], +- ja->cur_idx, ja->bucket_seq[ja->cur_idx]); ++ pr_buf(out, "dev %u:\n", i); ++ pr_buf(out, "\tnr\t\t%u\n", ja->nr); ++ pr_buf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); ++ pr_buf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); ++ pr_buf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); ++ pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); ++ pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); ++ pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + } + + rcu_read_unlock(); +-- +cgit v1.2.3 + + +From 3cd6d36d5f931b2c27656a77a35b4b4c10e17553 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 5 Dec 2021 00:30:49 -0500 +Subject: bcachefs: bch2_trans_update() is now __must_check + +With snapshots, bch2_trans_update() has to check if we need a whitout, +which can cause a transaction restart, so this is important now. 
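The effect of the annotation can be shown in a few lines of standalone C; trans_update_sketch is a made-up stand-in, not the bcachefs function:

    #include <stdio.h>

    /* Userspace stand-in for the kernel's __must_check. */
    #define __must_check __attribute__((warn_unused_result))

    static int __must_check trans_update_sketch(int need_whiteout)
    {
        /* With snapshots the real call can fail (e.g. transaction restart),
         * so silently dropping the result is no longer safe. */
        return need_whiteout ? -1 : 0;
    }

    int main(void)
    {
        int ret = trans_update_sketch(0);   /* result consumed: fine */

        if (ret)
            return 1;

        /* trans_update_sketch(1); */       /* if uncommented, gcc/clang emit a
                                               -Wunused-result warning */
        return 0;
    }

The hunks below accordingly convert each caller to either "ret = bch2_trans_update(...); if (ret) goto err;" or to chaining the call with the next fallible one via the ?: idiom.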
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update.h | 4 ++-- + fs/bcachefs/btree_update_leaf.c | 4 ++-- + fs/bcachefs/buckets.c | 16 ++++++++++++---- + fs/bcachefs/dirent.c | 8 ++++++-- + fs/bcachefs/subvolume.c | 21 +++++++++++++-------- + fs/bcachefs/tests.c | 4 ++-- + 6 files changed, 37 insertions(+), 20 deletions(-) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 0268dd74f0ab..89f07e58f61b 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -73,8 +73,8 @@ int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + int bch2_btree_node_update_key_get_iter(struct btree_trans *, + struct btree *, struct bkey_i *, bool); + +-int bch2_trans_update(struct btree_trans *, struct btree_iter *, +- struct bkey_i *, enum btree_update_flags); ++int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_update_flags); + void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); + int __bch2_trans_commit(struct btree_trans *); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 39917d104cb2..10837a62f01c 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1300,8 +1300,8 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, + return ret; + } + +-int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, +- struct bkey_i *k, enum btree_update_flags flags) ++int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_update_flags flags) + { + struct btree_insert_entry *i, n; + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 9d1a22fd1c2c..ebf69effa4a0 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1544,7 +1544,9 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + goto out; + + bch2_alloc_pack(c, a, u); +- bch2_trans_update(trans, &iter, &a->k, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k, 0); ++ if (ret) ++ goto out; + out: + bch2_trans_iter_exit(trans, &iter); + return ret; +@@ -1595,7 +1597,9 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + + sectors); +- bch2_trans_update(trans, &iter, &s->k_i, 0); ++ ret = bch2_trans_update(trans, &iter, &s->k_i, 0); ++ if (ret) ++ goto err; + + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; +@@ -1733,7 +1737,9 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + u.data_type = !deleting ? 
data_type : 0; + + bch2_alloc_pack(c, a, u); +- bch2_trans_update(trans, &iter, &a->k, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k, 0); ++ if (ret) ++ goto err; + err: + bch2_trans_iter_exit(trans, &iter); + return ret; +@@ -2012,7 +2018,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + u.dirty_sectors = sectors; + + bch2_alloc_pack(c, a, u); +- bch2_trans_update(trans, &iter, &a->k, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k, 0); ++ if (ret) ++ goto out; + out: + bch2_trans_iter_exit(trans, &iter); + return ret; +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index fe4a85a6a8cb..a165d08c3668 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -367,7 +367,9 @@ int bch2_dirent_rename(struct btree_trans *trans, + } + } + +- bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); ++ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); ++ if (ret) ++ goto out; + out_set_src: + + /* +@@ -384,7 +386,9 @@ out_set_src: + src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + } + +- bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); ++ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); ++ if (ret) ++ goto out; + + if (mode == BCH_RENAME_EXCHANGE) + *src_offset = new_src->k.p.offset; +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index 7e909a118189..8aeb2e417a15 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -488,7 +488,7 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) +- return ret; ++ goto err; + + bkey_snapshot_init(&n->k_i); + n->k.p = iter.pos; +@@ -498,11 +498,10 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + n->v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + +- bch2_trans_update(trans, &iter, &n->k_i, 0); +- +- ret = bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: ++ bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) +- break; ++ goto err; + + new_snapids[i] = iter.pos.offset; + } +@@ -536,7 +535,9 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + n->v.children[0] = cpu_to_le32(new_snapids[0]); + n->v.children[1] = cpu_to_le32(new_snapids[1]); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); +- bch2_trans_update(trans, &iter, &n->k_i, 0); ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0); ++ if (ret) ++ goto err; + } + err: + bch2_trans_iter_exit(trans, &iter); +@@ -1049,7 +1050,9 @@ found_slot: + + if (src_subvolid) { + src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); +- bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); ++ ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); ++ if (ret) ++ goto err; + } + + new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); +@@ -1064,7 +1067,9 @@ found_slot: + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); + SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); + new_subvol->k.p = dst_iter.pos; +- bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); ++ ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); ++ if (ret) ++ goto err; + + *new_subvolid = new_subvol->k.p.offset; + *new_snapshotid = new_nodes[0]; +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index d6facb76a0a2..c42db4d1d6e3 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -584,10 
+584,10 @@ static int rand_mixed_trans(struct btree_trans *trans, + if (!(i & 3) && k.k) { + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter->pos; +- bch2_trans_update(trans, iter, &cookie->k_i, 0); ++ ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); + } + +- return 0; ++ return ret; + } + + static int rand_mixed(struct bch_fs *c, u64 nr) +-- +cgit v1.2.3 + + +From 12a45d72e3ded578248a63ab3702367ffe3ac06d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 8 Dec 2021 13:31:36 -0500 +Subject: bcachefs: Convert a BUG_ON() to a warning + +A user reported hitting this assertion, and we can't reproduce it yet, +but it shouldn't be fatal - so convert it to a warning. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index e29f160a6da0..3d053479b8cd 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1289,7 +1289,7 @@ static void bch2_writepage_io_done(struct closure *cl) + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + */ +- BUG_ON(io->op.i_sectors_delta > 0); ++ WARN_ON(io->op.i_sectors_delta > 0); + + /* + * (error (due to going RO) halfway through a page can screw that up +-- +cgit v1.2.3 + + +From 9f214f7fadc05de7855df60a60edeaccd1cd7b60 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 4 Dec 2021 23:07:33 -0500 +Subject: bcachefs: Split out struct gc_stripe from struct stripe + +We have two radix trees of stripes - one that mirrors some information +from the stripes btree in normal operation, and another that GC uses to +recalculate block usage counts. + +The normal one is now only used for finding partially empty stripes in +order to reuse them - the normal stripes radix tree and the GC stripes +radix tree are used significantly differently, so this patch splits them +into separate types. + +In an upcoming patch we'll be replacing c->stripes with a btree that +indexes stripes by the order we want to reuse them. 
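As a rough sketch of why GC wants its own copy (all types and names below are invented, not bcachefs code): GC zeroes its per-block counts, re-accumulates them from extent pointers, and then compares against what the stripe key claims, rewriting the key on a mismatch.

    #include <stdint.h>
    #include <stdio.h>

    /* Toy model of the GC-side shadow copy; all types and names are invented. */
    struct toy_gc_stripe {
        uint16_t block_sectors[4];  /* per-block usage, recounted by GC */
    };

    /* Roughly the check the new bch2_gc_stripes_done_initial_fn() below does:
     * do the counts in the stripe key match what GC just recounted? */
    static int stripe_counts_match(const uint16_t *ondisk,
                                   const struct toy_gc_stripe *gc,
                                   unsigned nr_blocks)
    {
        unsigned i;

        for (i = 0; i < nr_blocks; i++)
            if (ondisk[i] != gc->block_sectors[i])
                return 0;   /* mismatch: fsck rewrites the key from the GC copy */
        return 1;
    }

    int main(void)
    {
        uint16_t ondisk[4] = { 128, 128, 0, 64 };
        struct toy_gc_stripe gc = { { 128, 128, 0, 64 } };

        printf("%s\n", stripe_counts_match(ondisk, &gc, 4)
               ? "consistent" : "inconsistent");
        return 0;
    }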
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 +- + fs/bcachefs/btree_gc.c | 110 ++++++++++++++++++++++++------------ + fs/bcachefs/buckets.c | 116 +++++++++++++++++++------------------- + fs/bcachefs/ec.c | 147 ++++++++++--------------------------------------- + fs/bcachefs/ec.h | 3 +- + fs/bcachefs/ec_types.h | 9 +++ + fs/bcachefs/recovery.c | 3 +- + 7 files changed, 176 insertions(+), 215 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 0439f3e0d8d7..fee1fc58e13b 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -826,7 +826,8 @@ struct bch_fs { + struct mutex data_progress_lock; + + /* STRIPES: */ +- GENRADIX(struct stripe) stripes[2]; ++ GENRADIX(struct stripe) stripes; ++ GENRADIX(struct gc_stripe) gc_stripes; + + ec_stripes_heap ec_stripes_heap; + spinlock_t ec_stripes_heap_lock; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 4deb87f91d08..a36b0e60077e 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -597,7 +597,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + } + + if (p.has_ec) { +- struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + "pointer to nonexistent stripe %llu\n" +@@ -665,7 +665,7 @@ again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { +- struct stripe *m = genradix_ptr(&c->stripes[true], ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + +@@ -1132,7 +1132,8 @@ static void bch2_gc_free(struct bch_fs *c) + struct bch_dev *ca; + unsigned i; + +- genradix_free(&c->stripes[1]); ++ genradix_free(&c->reflink_gc_table); ++ genradix_free(&c->gc_stripes); + + for_each_member_device(ca, c, i) { + kvpfree(rcu_dereference_protected(ca->buckets[1], 1), +@@ -1191,35 +1192,6 @@ static int bch2_gc_done(struct bch_fs *c, + #define copy_fs_field(_f, _msg, ...) 
\ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + +- if (!metadata_only) { +- struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); +- struct stripe *dst, *src; +- +- while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { +- dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); +- +- if (dst->alive != src->alive || +- dst->sectors != src->sectors || +- dst->algorithm != src->algorithm || +- dst->nr_blocks != src->nr_blocks || +- dst->nr_redundant != src->nr_redundant) { +- bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused"); +- ret = -EINVAL; +- goto fsck_err; +- } +- +- for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) +- copy_stripe_field(block_sectors[i], +- "block_sectors[%u]", i); +- +- dst->blocks_nonempty = 0; +- for (i = 0; i < dst->nr_blocks; i++) +- dst->blocks_nonempty += dst->block_sectors[i] != 0; +- +- genradix_iter_advance(&iter, &c->stripes[1]); +- } +- } +- + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + +@@ -1510,12 +1482,82 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + fsck_err: + bch2_trans_iter_exit(&trans, &iter); + out: +- genradix_free(&c->reflink_gc_table); + c->reflink_gc_nr = 0; + bch2_trans_exit(&trans); + return ret; + } + ++static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct gc_stripe *m; ++ const struct bch_stripe *s; ++ char buf[200]; ++ unsigned i; ++ int ret = 0; ++ ++ if (k.k->type != KEY_TYPE_stripe) ++ return 0; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ ++ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) ++ goto inconsistent; ++ return 0; ++inconsistent: ++ if (fsck_err_on(true, c, ++ "stripe has wrong block sector count %u:\n" ++ " %s\n" ++ " should be %u", i, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ m ? m->block_sectors[i] : 0)) { ++ struct bkey_i_stripe *new; ++ ++ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); ++ if (!new) { ++ ret = -ENOMEM; ++ goto fsck_err; ++ } ++ ++ bkey_reassemble(&new->k_i, k); ++ ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); ++ ++ ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i); ++ if (ret) ++ kfree(new); ++ } ++fsck_err: ++ return ret; ++} ++ ++static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, ++ bool metadata_only) ++{ ++ struct btree_trans trans; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ if (initial) { ++ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, ++ bch2_gc_stripes_done_initial_fn); ++ } else { ++ BUG(); ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ + static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans, + struct bkey_s_c k) + { +@@ -1551,7 +1593,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + return 0; + + bch2_trans_init(&trans, c, 0, 0); +- genradix_free(&c->reflink_gc_table); + c->reflink_gc_nr = 0; + + if (initial) { +@@ -1685,6 +1726,7 @@ out: + + percpu_down_write(&c->mark_lock); + ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: ++ bch2_gc_stripes_done(c, initial, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only); + + bch2_journal_unblock(&c->journal); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index ebf69effa4a0..3d764599a23f 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -949,39 +949,34 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, + bool gc = flags & BTREE_TRIGGER_GC; + struct bch_fs *c = trans->c; + struct bch_replicas_padded r; +- struct stripe *m; +- unsigned i, blocks_nonempty = 0; + +- m = genradix_ptr(&c->stripes[gc], p.idx); ++ if (!gc) { ++ BUG(); ++ } else { ++ struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); + +- spin_lock(&c->ec_stripes_heap_lock); ++ if (!m) ++ return -ENOMEM; + +- if (!m || !m->alive) { +- spin_unlock(&c->ec_stripes_heap_lock); +- bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", +- (u64) p.idx); +- bch2_inconsistent_error(c); +- return -EIO; +- } ++ spin_lock(&c->ec_stripes_heap_lock); + +- m->block_sectors[p.block] += sectors; ++ if (!m || !m->alive) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ bch2_inconsistent_error(c); ++ return -EIO; ++ } + +- r = m->r; ++ m->block_sectors[p.block] += sectors; + +- for (i = 0; i < m->nr_blocks; i++) +- blocks_nonempty += m->block_sectors[i] != 0; ++ r = m->r; ++ spin_unlock(&c->ec_stripes_heap_lock); + +- if (m->blocks_nonempty != blocks_nonempty) { +- m->blocks_nonempty = blocks_nonempty; +- if (!gc) +- bch2_stripes_heap_update(c, m, p.idx); ++ r.e.data_type = data_type; ++ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc); + } + +- spin_unlock(&c->ec_stripes_heap_lock); +- +- r.e.data_type = data_type; +- update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc); +- + return 0; + } + +@@ -1077,67 +1072,69 @@ static int bch2_mark_stripe(struct btree_trans *trans, + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? 
bkey_s_c_to_stripe(new).v : NULL; +- struct stripe *m = genradix_ptr(&c->stripes[gc], idx); + unsigned i; + int ret; + + BUG_ON(gc && old_s); + +- if (!m || (old_s && !m->alive)) { +- char buf1[200], buf2[200]; ++ if (!gc) { ++ struct stripe *m = genradix_ptr(&c->stripes, idx); + +- bch2_bkey_val_to_text(&PBUF(buf1), c, old); +- bch2_bkey_val_to_text(&PBUF(buf2), c, new); +- bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" +- "old %s\n" +- "new %s", idx, buf1, buf2); +- bch2_inconsistent_error(c); +- return -1; +- } ++ if (!m || (old_s && !m->alive)) { ++ char buf1[200], buf2[200]; + +- if (!new_s) { +- spin_lock(&c->ec_stripes_heap_lock); +- bch2_stripes_heap_del(c, m, idx); +- spin_unlock(&c->ec_stripes_heap_lock); +- +- memset(m, 0, sizeof(*m)); +- } else { +- m->alive = true; +- m->sectors = le16_to_cpu(new_s->sectors); +- m->algorithm = new_s->algorithm; +- m->nr_blocks = new_s->nr_blocks; +- m->nr_redundant = new_s->nr_redundant; +- m->blocks_nonempty = 0; ++ bch2_bkey_val_to_text(&PBUF(buf1), c, old); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, new); ++ bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" ++ "old %s\n" ++ "new %s", idx, buf1, buf2); ++ bch2_inconsistent_error(c); ++ return -1; ++ } + +- for (i = 0; i < new_s->nr_blocks; i++) { +- m->block_sectors[i] = +- stripe_blockcount_get(new_s, i); +- m->blocks_nonempty += !!m->block_sectors[i]; ++ if (!new_s) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_del(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); + +- m->ptrs[i] = new_s->ptrs[i]; +- } ++ memset(m, 0, sizeof(*m)); ++ } else { ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->algorithm = new_s->algorithm; ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ m->blocks_nonempty = 0; + +- bch2_bkey_to_replicas(&m->r.e, new); ++ for (i = 0; i < new_s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + +- if (!gc) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); + } +- } ++ } else { ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, idx); + +- if (gc) { + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + */ ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) ++ m->ptrs[i] = new_s->ptrs[i]; ++ ++ bch2_bkey_to_replicas(&m->r.e, new); + + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); +- m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { + ret = mark_stripe_bucket(trans, new, i, journal_seq, flags); +@@ -1597,6 +1594,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + + sectors); ++ + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); + if (ret) + goto err; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 71d85c934741..f18399906af5 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -545,11 +545,11 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) + free_heap(&n); + } + +- if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) ++ if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) + return -ENOMEM; + + if 
(c->gc_pos.phase != GC_PHASE_NOT_RUNNING && +- !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) ++ !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) + return -ENOMEM; + + return 0; +@@ -594,13 +594,13 @@ static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, + { + struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); + +- genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; ++ genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; + } + + static void heap_verify_backpointer(struct bch_fs *c, size_t idx) + { + ec_stripes_heap *h = &c->ec_stripes_heap; +- struct stripe *m = genradix_ptr(&c->stripes[0], idx); ++ struct stripe *m = genradix_ptr(&c->stripes, idx); + + BUG_ON(!m->alive); + BUG_ON(m->heap_idx >= h->used); +@@ -692,7 +692,7 @@ static void ec_stripe_delete_work(struct work_struct *work) + break; + } + +- bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); ++ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); + spin_unlock(&c->ec_stripes_heap_lock); + + if (ec_stripe_delete(c, idx)) +@@ -702,22 +702,18 @@ static void ec_stripe_delete_work(struct work_struct *work) + + /* stripe creation: */ + +-static int ec_stripe_bkey_insert(struct bch_fs *c, ++static int ec_stripe_bkey_insert(struct btree_trans *trans, + struct bkey_i_stripe *stripe, + struct disk_reservation *res) + { +- struct btree_trans trans; ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + +- bch2_trans_init(&trans, c, 0, 0); +-retry: +- bch2_trans_begin(&trans); +- +- for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos, ++ for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (start_pos.offset) { +@@ -738,29 +734,24 @@ retry: + found_slot: + start_pos = iter.pos; + +- ret = ec_stripe_mem_alloc(&trans, &iter); ++ ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) + goto err; + + stripe->k.p = iter.pos; + +- ret = bch2_trans_update(&trans, &iter, &stripe->k_i, 0) ?: +- bch2_trans_commit(&trans, res, NULL, +- BTREE_INSERT_NOFAIL); +-err: +- bch2_trans_iter_exit(&trans, &iter); ++ ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); + +- if (ret == -EINTR) +- goto retry; +- +- c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1; +- bch2_trans_exit(&trans); ++ c->ec_stripe_hint = start_pos.offset; ++err: ++ bch2_trans_iter_exit(trans, &iter); + + return ret; + } + + static int ec_stripe_bkey_update(struct btree_trans *trans, +- struct bkey_i_stripe *new) ++ struct bkey_i_stripe *new, ++ struct disk_reservation *res) + { + struct btree_iter iter; + struct bkey_s_c k; +@@ -947,10 +938,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) + goto err_put_writes; + } + +- ret = s->have_existing_stripe +- ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, +- ec_stripe_bkey_update(&trans, &s->new_stripe.key)) +- : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res); ++ ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, ++ s->have_existing_stripe ++ ? 
ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res) ++ : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res)); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_put_writes; +@@ -965,7 +956,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) + } + + spin_lock(&c->ec_stripes_heap_lock); +- m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); ++ m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); + + BUG_ON(m->on_heap); + bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); +@@ -1381,7 +1372,7 @@ static s64 get_existing_stripe(struct bch_fs *c, + continue; + + stripe_idx = h->data[heap_idx].idx; +- m = genradix_ptr(&c->stripes[0], stripe_idx); ++ m = genradix_ptr(&c->stripes, stripe_idx); + + if (m->algorithm == head->algo && + m->nr_redundant == head->redundancy && +@@ -1555,85 +1546,11 @@ void bch2_stripes_heap_start(struct bch_fs *c) + struct genradix_iter iter; + struct stripe *m; + +- genradix_for_each(&c->stripes[0], iter, m) ++ genradix_for_each(&c->stripes, iter, m) + if (m->alive) + bch2_stripes_heap_insert(c, m, iter.pos); + } + +-static int __bch2_stripe_write_key(struct btree_trans *trans, +- struct btree_iter *iter, +- struct stripe *m, +- size_t idx, +- struct bkey_i_stripe *new_key) +-{ +- const struct bch_stripe *v; +- struct bkey_s_c k; +- unsigned i; +- int ret; +- +- bch2_btree_iter_set_pos(iter, POS(0, idx)); +- +- k = bch2_btree_iter_peek_slot(iter); +- ret = bkey_err(k); +- if (ret) +- return ret; +- +- if (k.k->type != KEY_TYPE_stripe) +- return -EIO; +- +- v = bkey_s_c_to_stripe(k).v; +- for (i = 0; i < v->nr_blocks; i++) +- if (m->block_sectors[i] != stripe_blockcount_get(v, i)) +- goto write; +- return 0; +-write: +- bkey_reassemble(&new_key->k_i, k); +- +- for (i = 0; i < new_key->v.nr_blocks; i++) +- stripe_blockcount_set(&new_key->v, i, +- m->block_sectors[i]); +- +- return bch2_trans_update(trans, iter, &new_key->k_i, 0); +-} +- +-int bch2_stripes_write(struct bch_fs *c, unsigned flags) +-{ +- struct btree_trans trans; +- struct btree_iter iter; +- struct genradix_iter giter; +- struct bkey_i_stripe *new_key; +- struct stripe *m; +- int ret = 0; +- +- new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); +- BUG_ON(!new_key); +- +- bch2_trans_init(&trans, c, 0, 0); +- +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS_MIN, +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- +- genradix_for_each(&c->stripes[0], giter, m) { +- if (!m->alive) +- continue; +- +- ret = __bch2_trans_do(&trans, NULL, NULL, +- BTREE_INSERT_NOFAIL|flags, +- __bch2_stripe_write_key(&trans, &iter, m, +- giter.pos, new_key)); +- +- if (ret) +- break; +- } +- bch2_trans_iter_exit(&trans, &iter); +- +- bch2_trans_exit(&trans); +- +- kfree(new_key); +- +- return ret; +-} +- + static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) + { + const struct bch_stripe *s; +@@ -1651,7 +1568,7 @@ static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) + + s = bkey_s_c_to_stripe(k).v; + +- m = genradix_ptr(&c->stripes[0], k.k->p.offset); ++ m = genradix_ptr(&c->stripes, k.k->p.offset); + m->alive = true; + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; +@@ -1659,14 +1576,8 @@ static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; + +- for (i = 0; i < s->nr_blocks; i++) { +- m->block_sectors[i] = +- stripe_blockcount_get(s, i); +- m->blocks_nonempty += 
!!m->block_sectors[i]; +- m->ptrs[i] = s->ptrs[i]; +- } +- +- bch2_bkey_to_replicas(&m->r.e, k); ++ for (i = 0; i < s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(s, i); + + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, k.k->p.offset); +@@ -1722,7 +1633,9 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) + ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); + #else + for (i = 0; i < idx; i++) +- if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) ++ if (!gc ++ ? !genradix_ptr_alloc(&c->stripes, i, GFP_KERNEL) ++ : !genradix_ptr_alloc(&c->gc_stripes, i, GFP_KERNEL)) + return -ENOMEM; + #endif + return 0; +@@ -1736,7 +1649,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) + + spin_lock(&c->ec_stripes_heap_lock); + for (i = 0; i < min_t(size_t, h->used, 20); i++) { +- m = genradix_ptr(&c->stripes[0], h->data[i].idx); ++ m = genradix_ptr(&c->stripes, h->data[i].idx); + + pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, + h->data[i].blocks_nonempty, +@@ -1794,7 +1707,7 @@ void bch2_fs_ec_exit(struct bch_fs *c) + BUG_ON(!list_empty(&c->ec_stripe_new_list)); + + free_heap(&c->ec_stripes_heap); +- genradix_free(&c->stripes[0]); ++ genradix_free(&c->stripes); + bioset_exit(&c->ec_bioset); + } + +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index eb16e140e2c8..468141072bb4 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -108,7 +108,7 @@ static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, + le16_to_cpu(s->sectors)); + } + +-static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m, ++static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, + struct extent_ptr_decoded p) + { + unsigned nr_data = m->nr_blocks - m->nr_redundant; +@@ -216,7 +216,6 @@ void bch2_ec_flush_new_stripes(struct bch_fs *); + void bch2_stripes_heap_start(struct bch_fs *); + + int bch2_stripes_read(struct bch_fs *); +-int bch2_stripes_write(struct bch_fs *, unsigned); + + int bch2_ec_mem_alloc(struct bch_fs *, bool); + +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +index 3fc31222459a..edd93da663c1 100644 +--- a/fs/bcachefs/ec_types.h ++++ b/fs/bcachefs/ec_types.h +@@ -21,6 +21,15 @@ struct stripe { + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ + unsigned on_heap:1; + u8 blocks_nonempty; ++}; ++ ++struct gc_stripe { ++ u16 sectors; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; /* does a corresponding key exist in stripes btree? */ + u16 block_sectors[BCH_BKEY_PTRS_MAX]; + struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index c3b4d116275c..460b1ba22c8e 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1238,8 +1238,7 @@ use_clean: + */ + bch_verbose(c, "writing allocation info"); + err = "error writing out alloc info"; +- ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?: +- bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); ++ ret = bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); + if (ret) { + bch_err(c, "error writing alloc info"); + goto err; +-- +cgit v1.2.3 + + +From f317e99ec61e2a227ab3e32c0f55efb3eb507c7a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 9 Dec 2021 14:19:18 -0500 +Subject: bcachefs: Don't erasure code cached ptrs + +It doesn't make much sense to be erasure coding cached pointers, we +should be erasure coding one of the dirty pointers in an extent. 
This +patch makes sure we're passing BCH_WRITE_CACHED when we expect the new +pointer to be a cached pointer, and tweaks the write path to not +allocate from a stripe when BCH_WRITE_CACHED is set - and fixes an +assertion we were hitting in the ec path where when adding the stripe to +an extent and deleting the other pointers the pointer to the stripe +didn't exist (because dropping all dirty pointers from an extent turns +it into a KEY_TYPE_error key). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/ec.c | 21 ++++++++++++++------- + fs/bcachefs/io.c | 2 +- + fs/bcachefs/move.c | 12 ++++++++---- + 3 files changed, 23 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index f18399906af5..033ded886875 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -143,8 +143,8 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, + } + + /* returns blocknr in stripe that we matched: */ +-static int bkey_matches_stripe(struct bch_stripe *s, +- struct bkey_s_c k) ++static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, ++ struct bkey_s_c k, unsigned *block) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; +@@ -153,10 +153,12 @@ static int bkey_matches_stripe(struct bch_stripe *s, + bkey_for_each_ptr(ptrs, ptr) + for (i = 0; i < nr_data; i++) + if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, +- le16_to_cpu(s->sectors))) +- return i; ++ le16_to_cpu(s->sectors))) { ++ *block = i; ++ return ptr; ++ } + +- return -1; ++ return NULL; + } + + static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) +@@ -834,6 +836,7 @@ retry: + (k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { ++ const struct bch_extent_ptr *ptr_c; + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { +@@ -841,8 +844,12 @@ retry: + continue; + } + +- block = bkey_matches_stripe(&s->key.v, k); +- if (block < 0) { ++ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); ++ /* ++ * It doesn't generally make sense to erasure code cached ptrs: ++ * XXX: should we be incrementing a counter? 
++ */ ++ if (!ptr_c || ptr_c->cached) { + bch2_btree_iter_advance(&iter); + continue; + } +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 5a3c9eff1b50..a9ca81ecaf68 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1111,7 +1111,7 @@ again: + */ + wp = bch2_alloc_sectors_start(c, + op->target, +- op->opts.erasure_code, ++ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->write_point, + &op->devs_have, + op->nr_replicas, +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index f0495451e20f..f73be9cb7ac3 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -394,10 +394,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + unsigned compressed_sectors = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- if (p.ptr.dev == data_opts.rewrite_dev && +- !p.ptr.cached && +- crc_is_compressed(p.crc)) +- compressed_sectors += p.crc.compressed_size; ++ if (p.ptr.dev == data_opts.rewrite_dev) { ++ if (p.ptr.cached) ++ m->op.flags |= BCH_WRITE_CACHED; ++ ++ if (!p.ptr.cached && ++ crc_is_compressed(p.crc)) ++ compressed_sectors += p.crc.compressed_size; ++ } + + if (compressed_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, +-- +cgit v1.2.3 + + +From 7ed5ac132bdfad4a1c57c97d32ab0d99b1b18a2f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 9 Dec 2021 15:21:26 -0500 +Subject: bcachefs: Fix null ptr deref in fsck_inode_rm() + +bch2_btree_delete_range() can split compressed extents, thus needs to +pass in a disk reservation when we're operating on extents btrees. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 10837a62f01c..295942e7356e 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1440,6 +1440,8 @@ retry: + (k = bch2_btree_iter_peek(&iter)).k) && + !(ret = bkey_err(k)) && + bkey_cmp(iter.pos, end) < 0) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(trans->c, 0); + struct bkey_i delete; + + bkey_init(&delete.k); +@@ -1474,8 +1476,9 @@ retry: + } + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: +- bch2_trans_commit(trans, NULL, journal_seq, ++ bch2_trans_commit(trans, &disk_res, journal_seq, + BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(trans->c, &disk_res); + if (ret) + break; + } +-- +cgit v1.2.3 + + +From c0d381857cc9c11c8e6e152df167d7f0efc44b24 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 10 Dec 2021 14:03:42 -0500 +Subject: bcachefs: Print out OPT_SECTORS options in bytes + +This matches the conversion the parsing code does. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/opts.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index a955ef2008c9..e81e07a383bb 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -291,7 +291,7 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, + pr_buf(out, "%lli", v); + break; + case BCH_OPT_SECTORS: +- bch2_hprint(out, v); ++ bch2_hprint(out, v << 9); + break; + case BCH_OPT_STR: + if (flags & OPT_SHOW_FULL_LIST) +-- +cgit v1.2.3 + + +From 5b70e787c121e16cfaf1513dd80f005b2956a705 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 10 Dec 2021 15:41:38 -0500 +Subject: bcachefs: Add more time_stats + +This adds more latency/event measurements and breaks some apart into +more events. 
Journal writes are broken apart into flush writes and +noflush writes, btree compactions are broken out from btree splits, +btree mergers are added, as well as btree_interior_updates - foreground +and total. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 8 ++++++-- + fs/bcachefs/btree_gc.c | 3 +++ + fs/bcachefs/btree_update_interior.c | 25 +++++++++++++++++++++++-- + fs/bcachefs/btree_update_interior.h | 1 + + fs/bcachefs/fs-common.c | 1 + + fs/bcachefs/journal.c | 6 ++++++ + fs/bcachefs/journal_io.c | 4 +++- + fs/bcachefs/journal_types.h | 4 ++-- + fs/bcachefs/opts.h | 6 +++--- + fs/bcachefs/super.c | 8 ++++---- + 10 files changed, 52 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index fee1fc58e13b..5c01f0564752 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -321,8 +321,12 @@ BCH_DEBUG_PARAMS_DEBUG() + #define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_node_split) \ ++ x(btree_node_compact) \ ++ x(btree_node_merge) \ + x(btree_node_sort) \ + x(btree_node_read) \ ++ x(btree_interior_update_foreground) \ ++ x(btree_interior_update_total) \ + x(btree_gc) \ + x(btree_lock_contended_read) \ + x(btree_lock_contended_intent) \ +@@ -330,8 +334,8 @@ BCH_DEBUG_PARAMS_DEBUG() + x(data_write) \ + x(data_read) \ + x(data_promote) \ +- x(journal_write) \ +- x(journal_delay) \ ++ x(journal_flush_write) \ ++ x(journal_noflush_write) \ + x(journal_flush_seq) \ + x(blocked_journal) \ + x(blocked_allocate) \ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index a36b0e60077e..91c69a9f96ae 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1849,6 +1849,7 @@ int bch2_gc_gens(struct bch_fs *c) + struct bch_dev *ca; + struct bucket_array *buckets; + struct bucket *g; ++ u64 start_time = local_clock(); + unsigned i; + int ret; + +@@ -1892,6 +1893,8 @@ int bch2_gc_gens(struct bch_fs *c) + c->gc_gens_pos = POS_MIN; + + c->gc_count++; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + err: + up_read(&c->gc_lock); + return ret; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index dfff972551ee..d895d4eff0a9 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -455,15 +455,23 @@ static void bch2_btree_update_free(struct btree_update *as) + bch2_disk_reservation_put(c, &as->disk_res); + bch2_btree_reserve_put(as); + ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], ++ as->start_time); ++ + mutex_lock(&c->btree_interior_update_lock); + list_del(&as->unwritten_list); + list_del(&as->list); +- mutex_unlock(&c->btree_interior_update_lock); + + closure_debug_destroy(&as->cl); + mempool_free(as, &c->btree_interior_update_pool); + ++ /* ++ * Have to do the wakeup with btree_interior_update_lock still held, ++ * since being on btree_interior_update_list is our ref on @c: ++ */ + closure_wake_up(&c->btree_interior_update_wait); ++ ++ mutex_unlock(&c->btree_interior_update_lock); + } + + static void btree_update_will_delete_key(struct btree_update *as, +@@ -902,6 +910,9 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, + + static void bch2_btree_update_done(struct btree_update *as) + { ++ struct bch_fs *c = as->c; ++ u64 start_time = as->start_time; ++ + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + + if (as->took_gc_lock) +@@ -912,6 +923,9 @@ static void bch2_btree_update_done(struct btree_update *as) + + 
continue_at(&as->cl, btree_update_set_nodes_written, + as->c->btree_interior_update_worker); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], ++ start_time); + } + + static struct btree_update * +@@ -921,6 +935,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + struct bch_fs *c = trans->c; + struct btree_update *as; + struct closure cl; ++ u64 start_time = local_clock(); + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0; + int journal_flags = 0; +@@ -960,6 +975,7 @@ retry: + memset(as, 0, sizeof(*as)); + closure_init(&as->cl, NULL); + as->c = c; ++ as->start_time = start_time; + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->btree_id = path->btree_id; +@@ -1452,7 +1468,9 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, + + bch2_trans_verify_locks(trans); + +- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], ++ bch2_time_stats_update(&c->times[n2 ++ ? BCH_TIME_btree_node_split ++ : BCH_TIME_btree_node_compact], + start_time); + } + +@@ -1573,6 +1591,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + struct btree *b, *m, *n, *prev, *next, *parent; + struct bpos sib_pos; + size_t sib_u64s; ++ u64 start_time = local_clock(); + int ret = 0; + + BUG_ON(!path->should_be_locked); +@@ -1710,6 +1729,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); + out: + err: + bch2_path_put(trans, sib_path, true); +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 8e03bd987d6d..d4574161a733 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -35,6 +35,7 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, + struct btree_update { + struct closure cl; + struct bch_fs *c; ++ u64 start_time; + + struct list_head list; + struct list_head unwritten_list; +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 5f3429e99115..d543480be111 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -329,6 +329,7 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, + bool ret = false; + + for (id = 0; id < Inode_opt_nr; id++) { ++ /* Skip attributes that were explicitly set on this inode */ + if (dst_u->bi_fields_set & (1 << id)) + continue; + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 268f3ea4bdd2..ff8b81fa6772 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -626,6 +626,12 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) + u64 start_time = local_clock(); + int ret, ret2; + ++ /* ++ * Don't update time_stats when @seq is already flushed: ++ */ ++ if (seq <= j->flushed_seq_ondisk) ++ return 0; ++ + ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + + if (!ret) +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 37abfb1885a6..80e0dd311ffd 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1238,7 +1238,9 @@ static void journal_write_done(struct closure *cl) + u64 v, seq; + int err = 0; + +- bch2_time_stats_update(j->write_time, j->write_start_time); ++ bch2_time_stats_update(!JSET_NO_FLUSH(w->data) ++ ? 
j->flush_write_time ++ : j->noflush_write_time, j->write_start_time); + + if (!w->devs_written.nr) { + bch_err(c, "unable to write journal to sufficient devices"); +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 66b1707a6697..54cc69bde1bb 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -271,8 +271,8 @@ struct journal { + u64 nr_flush_writes; + u64 nr_noflush_writes; + +- struct time_stats *write_time; +- struct time_stats *delay_time; ++ struct time_stats *flush_write_time; ++ struct time_stats *noflush_write_time; + struct time_stats *blocked_time; + struct time_stats *flush_seq_time; + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index e2eb9b3fb275..871142778763 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -81,9 +81,9 @@ enum opt_type { + */ + + #ifdef __KERNEL__ +-#define RATELIMIT_ERRORS true ++#define RATELIMIT_ERRORS_DEFAULT true + #else +-#define RATELIMIT_ERRORS false ++#define RATELIMIT_ERRORS_DEFAULT false + #endif + + #define BCH_OPTS() \ +@@ -288,7 +288,7 @@ enum opt_type { + x(ratelimit_errors, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, RATELIMIT_ERRORS, \ ++ NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ + NULL, "Ratelimit error messages during fsck") \ + x(nochanges, u8, \ + OPT_FS|OPT_MOUNT, \ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index f673efed2f47..505e559b48a6 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -722,10 +722,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + c->rebalance.enabled = 1; + c->promote_whole_extents = true; + +- c->journal.write_time = &c->times[BCH_TIME_journal_write]; +- c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; +- c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; +- c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; ++ c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; ++ c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; ++ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; ++ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; + + bch2_fs_btree_cache_init_early(&c->btree_cache); + +-- +cgit v1.2.3 + + +From 2d56033b7f5af4c014794298b87b4dad6b12de8b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 10 Dec 2021 20:58:44 -0500 +Subject: bcachefs: bch2_alloc_write() + +This adds a new helper that much like the one we have for inode updates, +that allocates the packed alloc key, packs it and calls +bch2_trans_update. 
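+
+As a rough sketch of the new interface (the hunks below have the actual
+implementation and call sites):
+
+	int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
+			     struct bkey_alloc_unpacked *u, unsigned trigger_flags);
+
+so callers that used to open-code the allocate/pack/update sequence,
+roughly
+
+	a = bch2_trans_kmalloc(trans, sizeof(*a));
+	ret = PTR_ERR_OR_ZERO(a);
+	if (ret)
+		goto err;
+	bch2_alloc_pack(c, a, u);
+	ret = bch2_trans_update(trans, &iter, &a->k, flags);
+
+now reduce to a single call:
+
+	ret = bch2_alloc_write(trans, &iter, &u, flags);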
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 54 +++++++++++++++++++++++------------------- + fs/bcachefs/alloc_background.h | 15 +++--------- + fs/bcachefs/buckets.c | 43 ++++++++++++--------------------- + fs/bcachefs/recovery.c | 2 +- + 4 files changed, 49 insertions(+), 65 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index bf3611e76912..95788aa152e1 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -38,6 +38,15 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { + #undef x + }; + ++struct bkey_alloc_buf { ++ struct bkey_i k; ++ struct bch_alloc_v3 v; ++ ++#define x(_name, _bits) + _bits / 8 ++ u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; ++#undef x ++} __attribute__((packed, aligned(8))); ++ + /* Persistent alloc info: */ + + static inline u64 alloc_field_v1_get(const struct bch_alloc *a, +@@ -244,13 +253,26 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) + return ret; + } + +-void bch2_alloc_pack(struct bch_fs *c, +- struct bkey_alloc_buf *dst, +- const struct bkey_alloc_unpacked src) ++static void bch2_alloc_pack(struct bch_fs *c, ++ struct bkey_alloc_buf *dst, ++ const struct bkey_alloc_unpacked src) + { + bch2_alloc_pack_v3(dst, src); + } + ++int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_alloc_unpacked *u, unsigned trigger_flags) ++{ ++ struct bkey_alloc_buf *a; ++ ++ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); ++ ++ bch2_alloc_pack(trans->c, a, *u); ++ return bch2_trans_update(trans, iter, &a->k, trigger_flags); ++} ++ + static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) + { + unsigned i, bytes = offsetof(struct bch_alloc, data); +@@ -375,7 +397,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + struct bucket *g; + struct bucket_mark m; + struct bkey_alloc_unpacked old_u, new_u; +- struct bkey_alloc_buf a; + int ret; + retry: + bch2_trans_begin(trans); +@@ -402,8 +423,7 @@ retry: + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + return 0; + +- bch2_alloc_pack(c, &a, new_u); +- ret = bch2_trans_update(trans, iter, &a.k, ++ ret = bch2_alloc_write(trans, iter, &new_u, + BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags); +@@ -413,7 +433,7 @@ err: + return ret; + } + +-int bch2_alloc_write(struct bch_fs *c, unsigned flags) ++int bch2_alloc_write_all(struct bch_fs *c, unsigned flags) + { + struct btree_trans trans; + struct btree_iter iter; +@@ -453,7 +473,6 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + struct btree_iter iter; + struct bucket *g; +- struct bkey_alloc_buf *a; + struct bkey_alloc_unpacked u; + u64 *time, now; + int ret = 0; +@@ -466,11 +485,6 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + if (ret) + goto out; + +- a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); +- ret = PTR_ERR_OR_ZERO(a); +- if (ret) +- goto out; +- + percpu_down_read(&c->mark_lock); + g = bucket(ca, bucket_nr); + u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark)); +@@ -483,8 +497,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + + *time = now; + +- bch2_alloc_pack(c, a, u); +- ret = bch2_trans_update(trans, &iter, &a->k, 0) ?: ++ ret = bch2_alloc_write(trans, &iter, &u, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + out: + bch2_trans_iter_exit(trans, &iter); 
+@@ -752,7 +765,6 @@ static int bucket_invalidate_btree(struct btree_trans *trans, + struct bch_dev *ca, u64 b) + { + struct bch_fs *c = trans->c; +- struct bkey_alloc_buf *a; + struct bkey_alloc_unpacked u; + struct bucket *g; + struct bucket_mark m; +@@ -765,11 +777,6 @@ static int bucket_invalidate_btree(struct btree_trans *trans, + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + +- a = bch2_trans_kmalloc(trans, sizeof(*a)); +- ret = PTR_ERR_OR_ZERO(a); +- if (ret) +- goto err; +- + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto err; +@@ -787,9 +794,8 @@ static int bucket_invalidate_btree(struct btree_trans *trans, + u.read_time = atomic64_read(&c->io_clock[READ].now); + u.write_time = atomic64_read(&c->io_clock[WRITE].now); + +- bch2_alloc_pack(c, a, u); +- ret = bch2_trans_update(trans, &iter, &a->k, +- BTREE_TRIGGER_BUCKET_INVALIDATE); ++ ret = bch2_alloc_write(trans, &iter, &u, ++ BTREE_TRIGGER_BUCKET_INVALIDATE); + err: + bch2_trans_iter_exit(trans, &iter); + return ret; +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index b1efc1494dc4..6698d9c75d07 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -20,15 +20,6 @@ struct bkey_alloc_unpacked { + #undef x + }; + +-struct bkey_alloc_buf { +- struct bkey_i k; +- struct bch_alloc_v3 v; +- +-#define x(_name, _bits) + _bits / 8 +- u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; +-#undef x +-} __attribute__((packed, aligned(8))); +- + /* How out of date a pointer gen is allowed to be: */ + #define BUCKET_GC_GEN_MAX 96U + +@@ -46,8 +37,8 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, + } + + struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); +-void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *, +- const struct bkey_alloc_unpacked); ++int bch2_alloc_write(struct btree_trans *, struct btree_iter *, ++ struct bkey_alloc_unpacked *, unsigned); + + int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + +@@ -137,7 +128,7 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); + void bch2_dev_allocator_stop(struct bch_dev *); + int bch2_dev_allocator_start(struct bch_dev *); + +-int bch2_alloc_write(struct bch_fs *, unsigned); ++int bch2_alloc_write_all(struct bch_fs *, unsigned); + void bch2_fs_allocator_background_init(struct bch_fs *); + + void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 3d764599a23f..10b05f7b7ada 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1481,8 +1481,7 @@ need_mark: + + /* trans_mark: */ + +-static struct bkey_alloc_buf * +-bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, ++static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, + const struct bch_extent_ptr *ptr, + struct bkey_alloc_unpacked *u) + { +@@ -1490,14 +1489,9 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); + struct bucket *g; +- struct bkey_alloc_buf *a; + struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos); + int ret; + +- a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); +- if (IS_ERR(a)) +- return a; +- + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| +@@ -1505,7 +1499,7 @@ 
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter + ret = bch2_btree_iter_traverse(iter); + if (ret) { + bch2_trans_iter_exit(trans, iter); +- return ERR_PTR(ret); ++ return ret; + } + + if (update && !bpos_cmp(update->k.p, pos)) { +@@ -1517,22 +1511,20 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter + percpu_up_read(&c->mark_lock); + } + +- return a; ++ return 0; + } + + static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) + { +- struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_alloc_unpacked u; +- struct bkey_alloc_buf *a; + int ret; + +- a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); +- if (IS_ERR(a)) +- return PTR_ERR(a); ++ ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); ++ if (ret) ++ return ret; + + ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, + u.gen, &u.data_type, +@@ -1540,8 +1532,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + if (ret) + goto out; + +- bch2_alloc_pack(c, a, u); +- ret = bch2_trans_update(trans, &iter, &a->k, 0); ++ ret = bch2_alloc_write(trans, &iter, &u, 0); + if (ret) + goto out; + out: +@@ -1671,7 +1662,6 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; +- struct bkey_alloc_buf *a; + struct btree_iter iter; + struct bkey_alloc_unpacked u; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant +@@ -1682,9 +1672,9 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + if (deleting) + sectors = -sectors; + +- a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); +- if (IS_ERR(a)) +- return PTR_ERR(a); ++ ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); ++ if (ret) ++ return ret; + + ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, + u.gen, u.data_type, +@@ -1734,8 +1724,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + if (data_type) + u.data_type = !deleting ? 
data_type : 0; + +- bch2_alloc_pack(c, a, u); +- ret = bch2_trans_update(trans, &iter, &a->k, 0); ++ ret = bch2_alloc_write(trans, &iter, &u, 0); + if (ret) + goto err; + err: +@@ -1983,7 +1972,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_alloc_unpacked u; +- struct bkey_alloc_buf *a; + struct bch_extent_ptr ptr = { + .dev = ca->dev_idx, + .offset = bucket_to_sector(ca, b), +@@ -1996,9 +1984,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + if (b >= ca->mi.nbuckets) + return 0; + +- a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); +- if (IS_ERR(a)) +- return PTR_ERR(a); ++ ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); ++ if (ret) ++ return ret; + + if (u.data_type && u.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +@@ -2015,8 +2003,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + u.data_type = type; + u.dirty_sectors = sectors; + +- bch2_alloc_pack(c, a, u); +- ret = bch2_trans_update(trans, &iter, &a->k, 0); ++ ret = bch2_alloc_write(trans, &iter, &u, 0); + if (ret) + goto out; + out: +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 460b1ba22c8e..29fe6260ace5 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1238,7 +1238,7 @@ use_clean: + */ + bch_verbose(c, "writing allocation info"); + err = "error writing out alloc info"; +- ret = bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); ++ ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW); + if (ret) { + bch_err(c, "error writing alloc info"); + goto err; +-- +cgit v1.2.3 + + +From b4a2e1844f2ec8b70351ad76d9eeca6e60a2968c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 10 Dec 2021 21:24:36 -0500 +Subject: bcachefs: Improve alloc_mem_to_key() + +This moves some common code into alloc_mem_to_key(), which translates +from the in-memory format for a bucket to the btree key format. 
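+
+Roughly, the helper now takes the mark_lock and does the bucket lookup
+itself, so callers go from
+
+	percpu_down_read(&c->mark_lock);
+	g = bucket(ca, iter.pos.offset);
+	u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark));
+	percpu_up_read(&c->mark_lock);
+
+to just
+
+	u = alloc_mem_to_key(c, &iter);
+
+(sketch of the pattern; the exact call sites are in the hunks below).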
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 28 ++++------------------------ + fs/bcachefs/alloc_background.h | 25 ++++++++++++++++++------- + fs/bcachefs/buckets.c | 12 +++--------- + 3 files changed, 25 insertions(+), 40 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 95788aa152e1..ed919b428a06 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -393,9 +393,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct bkey_s_c k; +- struct bch_dev *ca; +- struct bucket *g; +- struct bucket_mark m; + struct bkey_alloc_unpacked old_u, new_u; + int ret; + retry: +@@ -411,14 +408,8 @@ retry: + if (ret) + goto err; + +- old_u = bch2_alloc_unpack(k); +- +- percpu_down_read(&c->mark_lock); +- ca = bch_dev_bkey_exists(c, iter->pos.inode); +- g = bucket(ca, iter->pos.offset); +- m = READ_ONCE(g->mark); +- new_u = alloc_mem_to_key(iter, g, m); +- percpu_up_read(&c->mark_lock); ++ old_u = bch2_alloc_unpack(k); ++ new_u = alloc_mem_to_key(c, iter); + + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + return 0; +@@ -470,9 +461,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) + { + struct bch_fs *c = trans->c; +- struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + struct btree_iter iter; +- struct bucket *g; + struct bkey_alloc_unpacked u; + u64 *time, now; + int ret = 0; +@@ -485,10 +474,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + if (ret) + goto out; + +- percpu_down_read(&c->mark_lock); +- g = bucket(ca, bucket_nr); +- u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark)); +- percpu_up_read(&c->mark_lock); ++ u = alloc_mem_to_key(c, &iter); + + time = rw == READ ? 
&u.read_time : &u.write_time; + now = atomic64_read(&c->io_clock[rw].now); +@@ -766,8 +752,6 @@ static int bucket_invalidate_btree(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct bkey_alloc_unpacked u; +- struct bucket *g; +- struct bucket_mark m; + struct btree_iter iter; + int ret; + +@@ -781,11 +765,7 @@ static int bucket_invalidate_btree(struct btree_trans *trans, + if (ret) + goto err; + +- percpu_down_read(&c->mark_lock); +- g = bucket(ca, b); +- m = READ_ONCE(g->mark); +- u = alloc_mem_to_key(&iter, g, m); +- percpu_up_read(&c->mark_lock); ++ u = alloc_mem_to_key(c, &iter); + + u.gen++; + u.data_type = 0; +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 6698d9c75d07..e3cdb8bc1dd8 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -4,7 +4,9 @@ + + #include "bcachefs.h" + #include "alloc_types.h" ++#include "buckets.h" + #include "debug.h" ++#include "super.h" + + extern const char * const bch2_allocator_states[]; + +@@ -43,22 +45,31 @@ int bch2_alloc_write(struct btree_trans *, struct btree_iter *, + int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + + static inline struct bkey_alloc_unpacked +-alloc_mem_to_key(struct btree_iter *iter, +- struct bucket *g, struct bucket_mark m) ++alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter) + { +- return (struct bkey_alloc_unpacked) { ++ struct bch_dev *ca; ++ struct bucket *g; ++ struct bkey_alloc_unpacked ret; ++ ++ percpu_down_read(&c->mark_lock); ++ ca = bch_dev_bkey_exists(c, iter->pos.inode); ++ g = bucket(ca, iter->pos.offset); ++ ret = (struct bkey_alloc_unpacked) { + .dev = iter->pos.inode, + .bucket = iter->pos.offset, +- .gen = m.gen, ++ .gen = g->mark.gen, + .oldest_gen = g->oldest_gen, +- .data_type = m.data_type, +- .dirty_sectors = m.dirty_sectors, +- .cached_sectors = m.cached_sectors, ++ .data_type = g->mark.data_type, ++ .dirty_sectors = g->mark.dirty_sectors, ++ .cached_sectors = g->mark.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + .stripe = g->stripe, + .stripe_redundancy = g->stripe_redundancy, + }; ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; + } + + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 10b05f7b7ada..f951f9f3ecf2 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1488,7 +1488,6 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); +- struct bucket *g; + struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos); + int ret; + +@@ -1502,14 +1501,9 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree + return ret; + } + +- if (update && !bpos_cmp(update->k.p, pos)) { +- *u = bch2_alloc_unpack(bkey_i_to_s_c(update)); +- } else { +- percpu_down_read(&c->mark_lock); +- g = bucket(ca, pos.offset); +- *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); +- percpu_up_read(&c->mark_lock); +- } ++ *u = update && !bpos_cmp(update->k.p, pos) ++ ? 
bch2_alloc_unpack(bkey_i_to_s_c(update)) ++ : alloc_mem_to_key(c, iter); + + return 0; + } +-- +cgit v1.2.3 + + +From e81829d94a53cb9d8305ceffc08715997f3d748f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 14 Dec 2021 00:08:06 -0500 +Subject: bcachefs: Add missing bch2_trans_iter_exit() call + +This fixes a bug where the filesystem goes read only when reading from +debugfs. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/debug.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 294e4baf4deb..666635f7c7d2 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -406,6 +406,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + if (!i->size) + break; + } ++ bch2_trans_iter_exit(&trans, &iter); ++ + bch2_trans_exit(&trans); + + return err < 0 ? err : i->ret; +-- +cgit v1.2.3 + + +From a5e416a282197f57470d029e72720d4635ba5393 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 14 Dec 2021 16:05:47 -0500 +Subject: bcachefs: Fix debug build in userspace + +This fixes some compiler warnings that only trigger in userspace - dead +code, a maybe uninitialed variable, a maybe null ptr passed to printk. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 3 ++- + fs/bcachefs/dirent.c | 2 +- + fs/bcachefs/inode.c | 10 ---------- + 3 files changed, 3 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index f11fcab61902..c19c3acbc2b9 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -566,7 +566,8 @@ enum btree_validate_ret { + \ + switch (write) { \ + case READ: \ +- bch_err(c, "%s", _buf2); \ ++ if (_buf2) \ ++ bch_err(c, "%s", _buf2); \ + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index a165d08c3668..6f699b736b34 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -230,7 +230,7 @@ int bch2_dirent_rename(struct btree_trans *trans, + { + struct btree_iter src_iter = { NULL }; + struct btree_iter dst_iter = { NULL }; +- struct bkey_s_c old_src, old_dst; ++ struct bkey_s_c old_src, old_dst = bkey_s_c_null; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index ffce68a80490..99b2a77ef9a8 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -24,16 +24,6 @@ const char * const bch2_inode_opts[] = { + }; + + static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; +-static const u8 bits_table[8] = { +- 1 * 8 - 1, +- 2 * 8 - 2, +- 3 * 8 - 3, +- 4 * 8 - 4, +- 6 * 8 - 5, +- 8 * 8 - 6, +- 10 * 8 - 7, +- 13 * 8 - 8, +-}; + + static int inode_decode_field(const u8 *in, const u8 *end, + u64 out[2], unsigned *out_bits) +-- +cgit v1.2.3 + + +From aab6e815c8195e88fab92a2c43dee6bf652cc7d9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 15 Dec 2021 20:35:45 -0500 +Subject: bcachefs: Fix an assertion in bch2_truncate() + +We recently added an assertion that when we truncate a file to 0, +i_blocks should also go to 0 - but that's not necessarily true if we're +doing an emergency shutdown, lots of invariants no longer hold true in +that case. 
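+
+Concretely, the hard assertion is downgraded to a warning that is
+suppressed once the journal has gone into an error state (sketch of the
+resulting check, matching the hunk below):
+
+	WARN_ON(!inode->v.i_size && inode->v.i_blocks &&
+		!bch2_journal_error(&c->journal));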
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 3d053479b8cd..57619d09b00a 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2714,7 +2714,8 @@ int bch2_truncate(struct user_namespace *mnt_userns, + U64_MAX, &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + +- BUG_ON(!inode->v.i_size && inode->v.i_blocks); ++ WARN_ON(!inode->v.i_size && inode->v.i_blocks && ++ !bch2_journal_error(&c->journal)); + + if (unlikely(ret)) + goto err; +-- +cgit v1.2.3 + + +From b5c558ec4c0058a638fb4aedbce4a25db35bb6cf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 16 Dec 2021 20:36:26 -0500 +Subject: bcachefs: Sysfs internal/btree_transactions is now always enabled + +This highly-useful debugging feature helps with debugging deadlocks, and +it doesn't cost much, so let's have it always enabled. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 13 +------------ + fs/bcachefs/btree_types.h | 2 -- + 2 files changed, 1 insertion(+), 14 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 32435a24147f..f8aab9b4b5ce 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -377,19 +377,16 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + if (six_trylock_type(&b->c.lock, type)) + return true; + +-#ifdef CONFIG_BCACHEFS_DEBUG + trans->locking_path_idx = path->idx; + trans->locking_pos = pos; + trans->locking_btree_id = path->btree_id; + trans->locking_level = level; + trans->locking = b; +-#endif + + ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; + +-#ifdef CONFIG_BCACHEFS_DEBUG + trans->locking = NULL; +-#endif ++ + if (ret) + bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], + start_time); +@@ -2805,12 +2802,10 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + +-#ifdef CONFIG_BCACHEFS_DEBUG + trans->pid = current->pid; + mutex_lock(&c->btree_trans_lock); + list_add(&trans->list, &c->btree_trans_list); + mutex_unlock(&c->btree_trans_lock); +-#endif + } + + static void check_btree_paths_leaked(struct btree_trans *trans) +@@ -2849,11 +2844,9 @@ void bch2_trans_exit(struct btree_trans *trans) + + check_btree_paths_leaked(trans); + +-#ifdef CONFIG_BCACHEFS_DEBUG + mutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + mutex_unlock(&c->btree_trans_lock); +-#endif + + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + +@@ -2897,7 +2890,6 @@ bch2_btree_path_node_to_text(struct printbuf *out, + bch2_bpos_to_text(out, btree_node_pos(_b, cached)); + } + +-#ifdef CONFIG_BCACHEFS_DEBUG + static bool trans_has_locks(struct btree_trans *trans) + { + struct btree_path *path; +@@ -2907,11 +2899,9 @@ static bool trans_has_locks(struct btree_trans *trans) + return true; + return false; + } +-#endif + + void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + { +-#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_trans *trans; + struct btree_path *path; + struct btree *b; +@@ -2965,7 +2955,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + } + } + mutex_unlock(&c->btree_trans_lock); +-#endif + } + + void bch2_fs_btree_iter_exit(struct bch_fs *c) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 2c2e2f794b8f..22dbbe365bbe 100644 +--- a/fs/bcachefs/btree_types.h ++++ 
b/fs/bcachefs/btree_types.h +@@ -366,7 +366,6 @@ struct btree_trans_commit_hook { + + struct btree_trans { + struct bch_fs *c; +-#ifdef CONFIG_BCACHEFS_DEBUG + struct list_head list; + struct btree *locking; + unsigned locking_path_idx; +@@ -374,7 +373,6 @@ struct btree_trans { + u8 locking_btree_id; + u8 locking_level; + pid_t pid; +-#endif + unsigned long ip; + int srcu_idx; + +-- +cgit v1.2.3 + + +From e91f31cbd92a04a343adcfa2d503ade3fcdeeaaa Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 19 Dec 2021 18:59:22 -0500 +Subject: bcachefs: Kill bch2_sort_repack_merge() + +The main function of bch2_sort_repack_merge() was to call .key_normalize +on every key, which drops stale (cached) pointers - it hasn't actually +merged extents in quite some time. + +But bch2_gc_gens() now works on individual keys - we used to gc old gens +by rewriting entire btree nodes. With that gone, there's no need for +internal btree code to be calling .key_normalize anymore. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 58 ------------------------------------------------- + fs/bcachefs/bkey_sort.h | 5 ----- + fs/bcachefs/btree_io.c | 14 ++++-------- + 3 files changed, 4 insertions(+), 73 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index 537ab7919e88..da0b7a63b146 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -117,23 +117,6 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + return nr; + } + +-static void extent_sort_append(struct bch_fs *c, +- struct bkey_format *f, +- struct btree_nr_keys *nr, +- struct bkey_packed **out, +- struct bkey_s k) +-{ +- if (!bkey_deleted(k.k)) { +- if (!bch2_bkey_pack_key(*out, k.k, f)) +- memcpy_u64s_small(*out, k.k, BKEY_U64s); +- +- memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); +- +- btree_keys_account_key_add(nr, 0, *out); +- *out = bkey_next(*out); +- } +-} +- + /* Sort + repack in a new format: */ + struct btree_nr_keys + bch2_sort_repack(struct bset *dst, struct btree *src, +@@ -165,47 +148,6 @@ bch2_sort_repack(struct bset *dst, struct btree *src, + return nr; + } + +-/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ +-struct btree_nr_keys +-bch2_sort_repack_merge(struct bch_fs *c, +- struct bset *dst, struct btree *src, +- struct btree_node_iter *iter, +- struct bkey_format *out_f, +- bool filter_whiteouts) +-{ +- struct bkey_packed *out = vstruct_last(dst), *k_packed; +- struct bkey_buf k; +- struct btree_nr_keys nr; +- +- memset(&nr, 0, sizeof(nr)); +- bch2_bkey_buf_init(&k); +- +- while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { +- if (filter_whiteouts && bkey_deleted(k_packed)) +- continue; +- +- /* +- * NOTE: +- * bch2_bkey_normalize may modify the key we pass it (dropping +- * stale pointers) and we don't have a write lock on the src +- * node; we have to make a copy of the entire key before calling +- * normalize +- */ +- bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s); +- bch2_bkey_unpack(src, k.k, k_packed); +- +- if (filter_whiteouts && +- bch2_bkey_normalize(c, bkey_i_to_s(k.k))) +- continue; +- +- extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); +- } +- +- dst->u64s = cpu_to_le16((u64 *) out - dst->_data); +- bch2_bkey_buf_exit(&k, c); +- return nr; +-} +- + static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h +index 1059996dac78..79cf11d1b4e7 100644 +--- 
a/fs/bcachefs/bkey_sort.h ++++ b/fs/bcachefs/bkey_sort.h +@@ -37,11 +37,6 @@ struct btree_nr_keys + bch2_sort_repack(struct bset *, struct btree *, + struct btree_node_iter *, + struct bkey_format *, bool); +-struct btree_nr_keys +-bch2_sort_repack_merge(struct bch_fs *, +- struct bset *, struct btree *, +- struct btree_node_iter *, +- struct bkey_format *, bool); + + unsigned bch2_sort_keys(struct bkey_packed *, + struct sort_iter *, bool); +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c19c3acbc2b9..9b22c5e3fe87 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -391,16 +391,10 @@ void bch2_btree_sort_into(struct bch_fs *c, + + bch2_btree_node_iter_init_from_start(&src_iter, src); + +- if (btree_node_is_extents(src)) +- nr = bch2_sort_repack_merge(c, btree_bset_first(dst), +- src, &src_iter, +- &dst->format, +- true); +- else +- nr = bch2_sort_repack(btree_bset_first(dst), +- src, &src_iter, +- &dst->format, +- true); ++ nr = bch2_sort_repack(btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); +-- +cgit v1.2.3 + + +From 447c8e3316ec83a7b5eea0229053c17e1fd1135b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 19 Dec 2021 19:01:41 -0500 +Subject: bcachefs: Don't call bch2_bkey_transform() unnecessarily + +If the packed format isn't changing, there's no need to call +bch2_bkey_transform(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey_sort.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +index da0b7a63b146..b1385a77da11 100644 +--- a/fs/bcachefs/bkey_sort.c ++++ b/fs/bcachefs/bkey_sort.c +@@ -127,6 +127,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, + struct bkey_format *in_f = &src->format; + struct bkey_packed *in, *out = vstruct_last(dst); + struct btree_nr_keys nr; ++ bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); + + memset(&nr, 0, sizeof(nr)); + +@@ -134,8 +135,10 @@ bch2_sort_repack(struct bset *dst, struct btree *src, + if (filter_whiteouts && bkey_deleted(in)) + continue; + +- if (bch2_bkey_transform(out_f, out, bkey_packed(in) +- ? in_f : &bch2_bkey_format_current, in)) ++ if (!transform) ++ bkey_copy(out, in); ++ else if (bch2_bkey_transform(out_f, out, bkey_packed(in) ++ ? in_f : &bch2_bkey_format_current, in)) + out->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(src, (void *) out, in); +-- +cgit v1.2.3 + + +From 5e98e54f15d2208e4456707791bbe5da79ba6441 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 14 Dec 2021 14:24:04 -0500 +Subject: bcachefs: Kill some obsolete sysfs code + +fs internal/alloc_debug doesn't show anything bcachefs fs usage shows. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/sysfs.c | 34 +++++++--------------------------- + 1 file changed, 7 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index d5d32bf16d68..3f51eda749f0 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -262,21 +262,6 @@ static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) + return ret; + } + +-static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) +-{ +- struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); +- +- if (!fs_usage) +- return -ENOMEM; +- +- bch2_fs_usage_to_text(out, c, fs_usage); +- +- percpu_up_read(&c->mark_lock); +- +- kfree(fs_usage); +- return 0; +-} +- + static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) + { + struct btree_trans trans; +@@ -386,9 +371,6 @@ SHOW(bch2_fs) + + /* Debugging: */ + +- if (attr == &sysfs_alloc_debug) +- return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; +- + if (attr == &sysfs_journal_debug) { + bch2_journal_debug_to_text(&out, &c->journal); + return out.pos - buf; +@@ -580,7 +562,6 @@ STORE(bch2_fs_internal) + SYSFS_OPS(bch2_fs_internal); + + struct attribute *bch2_fs_internal_files[] = { +- &sysfs_alloc_debug, + &sysfs_journal_debug, + &sysfs_journal_pins, + &sysfs_btree_updates, +@@ -588,17 +569,21 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_btree_cache, + &sysfs_btree_key_cache, + &sysfs_btree_transactions, ++ &sysfs_new_stripes, + &sysfs_stripes_heap, + &sysfs_open_buckets, ++ &sysfs_io_timers_read, ++ &sysfs_io_timers_write, ++ ++ &sysfs_trigger_journal_flush, ++ &sysfs_trigger_gc, ++ &sysfs_prune_cache, + + &sysfs_read_realloc_races, + &sysfs_extent_migrate_done, + &sysfs_extent_migrate_raced, + +- &sysfs_trigger_journal_flush, +- &sysfs_trigger_gc, + &sysfs_gc_gens_pos, +- &sysfs_prune_cache, + + &sysfs_copy_gc_enabled, + &sysfs_copy_gc_wait, +@@ -607,11 +592,6 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_rebalance_work, + sysfs_pd_controller_files(rebalance), + +- &sysfs_new_stripes, +- +- &sysfs_io_timers_read, +- &sysfs_io_timers_write, +- + &sysfs_data_op_data_progress, + + &sysfs_internal_uuid, +-- +cgit v1.2.3 + + +From 4554c9f9d0cbf9d76290061962d5f78bd3ba9794 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 20 Dec 2021 12:53:06 -0500 +Subject: bcachefs: Make sure bch2_bucket_alloc_new_fs() obeys buckets_nouse + +This fixes the filesystem migrate tool. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 2bb107b8b0b9..dce77cc27cbe 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -152,6 +152,7 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) + + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) + if (is_available_bucket(buckets->b[b].mark) && ++ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)) && + !buckets->b[b].mark.owned_by_allocator) + goto success; + b = -1; +-- +cgit v1.2.3 + + +From 80e2e3b26da3ae9fc907e95ab6f79f81037d125e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 19 Dec 2021 19:02:50 -0500 +Subject: bcachefs: Optimize memory accesses in bch2_btree_node_get() + +This puts a load behind some branches before where it's used, so that it +can execute in parallel with other loads. 
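+
+In sketch form (the hunk below is the real change), the pointer load is
+hoisted out of the option check so the CPU can start it before the
+branches resolve:
+
+	/* before: load only issued after the option branch */
+	if (c->opts.btree_node_mem_ptr_optimization) {
+		b = btree_node_mem_ptr(k);
+		if (b && b->hash_val == btree_ptr_hash_val(k))
+			goto lock_node;
+	}
+
+	/* after: load issued unconditionally, overlapping the checks */
+	b = btree_node_mem_ptr(k);
+	if (likely(c->opts.btree_node_mem_ptr_optimization &&
+		   b && b->hash_val == btree_ptr_hash_val(k)))
+		goto lock_node;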
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 19 ++++++++++--------- + 1 file changed, 10 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index d31aedb49416..b02f93bdfd1f 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -776,16 +776,17 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * + + EBUG_ON(level >= BTREE_MAX_DEPTH); + +- if (c->opts.btree_node_mem_ptr_optimization) { +- b = btree_node_mem_ptr(k); +- /* +- * Check b->hash_val _before_ calling btree_node_lock() - this +- * might not be the node we want anymore, and trying to lock the +- * wrong node could cause an unneccessary transaction restart: +- */ +- if (b && b->hash_val == btree_ptr_hash_val(k)) ++ b = btree_node_mem_ptr(k); ++ ++ /* ++ * Check b->hash_val _before_ calling btree_node_lock() - this might not ++ * be the node we want anymore, and trying to lock the wrong node could ++ * cause an unneccessary transaction restart: ++ */ ++ if (likely(c->opts.btree_node_mem_ptr_optimization && ++ b && ++ b->hash_val == btree_ptr_hash_val(k))) + goto lock_node; +- } + retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { +-- +cgit v1.2.3 + + +From 96a04b2ac948d2cd7c06c0d3257a9e3ceada838d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 20 Dec 2021 18:18:35 -0500 +Subject: bcachefs: Fix some shutdown path bugs + +This fixes some bugs when we hit an error very early in the filesystem +startup path, before most things have been initialized. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/btree_iter.c | 13 +++++++++---- + fs/bcachefs/btree_key_cache.c | 11 ++++++----- + 3 files changed, 16 insertions(+), 9 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 5c01f0564752..540492b04457 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -705,6 +705,7 @@ struct bch_fs { + struct btree_path_buf __percpu *btree_paths_bufs; + + struct srcu_struct btree_trans_barrier; ++ bool btree_trans_barrier_initialized; + + struct btree_key_cache btree_key_cache; + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f8aab9b4b5ce..cf93cfb54a28 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2959,22 +2959,27 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + + void bch2_fs_btree_iter_exit(struct bch_fs *c) + { ++ if (c->btree_trans_barrier_initialized) ++ cleanup_srcu_struct(&c->btree_trans_barrier); + mempool_exit(&c->btree_trans_mem_pool); + mempool_exit(&c->btree_paths_pool); +- cleanup_srcu_struct(&c->btree_trans_barrier); + } + + int bch2_fs_btree_iter_init(struct bch_fs *c) + { + unsigned nr = BTREE_ITER_MAX; ++ int ret; + + INIT_LIST_HEAD(&c->btree_trans_list); + mutex_init(&c->btree_trans_lock); + +- return init_srcu_struct(&c->btree_trans_barrier) ?: +- mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, ++ ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, + sizeof(struct btree_path) * nr + + sizeof(struct btree_insert_entry) * nr) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, +- BTREE_TRANS_MEM_MAX); ++ BTREE_TRANS_MEM_MAX) ?: ++ init_srcu_struct(&c->btree_trans_barrier); ++ if (!ret) ++ c->btree_trans_barrier_initialized = true; ++ return ret; + } +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 4f1bc1d165aa..230a920ae32a 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ 
b/fs/bcachefs/btree_key_cache.c +@@ -662,11 +662,12 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) + + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); +- for (i = 0; i < tbl->size; i++) +- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { +- bkey_cached_evict(bc, ck); +- list_add(&ck->list, &bc->freed); +- } ++ if (tbl) ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ bkey_cached_evict(bc, ck); ++ list_add(&ck->list, &bc->freed); ++ } + rcu_read_unlock(); + + list_for_each_entry_safe(ck, n, &bc->freed, list) { +-- +cgit v1.2.3 + + +From a3eed82eb97561011b5996250f4391a8e46a49c9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 21 Dec 2021 20:48:26 -0500 +Subject: bcachefs: BTREE_ITER_NOPRESERVE + +This adds a flag to not mark the initial btree_path as preserve, for +paths that we expect to be cheap to reconstitute if necessary - this +solves a btree_path overflow caused by need_whiteout_for_snapshot(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 20 +++++++++----------- + fs/bcachefs/btree_iter.h | 5 ++--- + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update_interior.c | 4 ++-- + fs/bcachefs/btree_update_leaf.c | 3 ++- + 5 files changed, 16 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index cf93cfb54a28..76c6fa96e3f9 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1818,12 +1818,14 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans, + return path; + } + +-struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, ++struct btree_path *bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, +- bool intent, unsigned long ip) ++ unsigned flags, unsigned long ip) + { + struct btree_path *path, *path_pos = NULL; ++ bool cached = flags & BTREE_ITER_CACHED; ++ bool intent = flags & BTREE_ITER_INTENT; + int i; + + BUG_ON(trans->restarted); +@@ -1845,7 +1847,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + path_pos->level == level) { + __btree_path_get(path_pos, intent); + path = btree_path_set_pos(trans, path_pos, pos, intent, ip); +- path->preserve = true; + } else { + path = btree_path_alloc(trans, path_pos); + path_pos = NULL; +@@ -1854,7 +1855,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + path->pos = pos; + path->btree_id = btree_id; + path->cached = cached; +- path->preserve = true; + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + path->should_be_locked = false; + path->level = level; +@@ -1869,6 +1869,9 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + btree_trans_verify_sorted(trans); + } + ++ if (!(flags & BTREE_ITER_NOPRESERVE)) ++ path->preserve = true; ++ + if (path->intent_ref) + locks_want = max(locks_want, level + 1); + +@@ -2628,13 +2631,8 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + iter->ip_allocated = ip; + #endif + +- iter->path = bch2_path_get(trans, +- flags & BTREE_ITER_CACHED, +- btree_id, +- iter->pos, +- locks_want, +- depth, +- flags & BTREE_ITER_INTENT, ip); ++ iter->path = bch2_path_get(trans, btree_id, iter->pos, ++ locks_want, depth, flags, ip); + } + + void bch2_trans_iter_init(struct btree_trans *trans, +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 26eb90a7eab8..4c903b9dd716 100644 +--- a/fs/bcachefs/btree_iter.h ++++ 
b/fs/bcachefs/btree_iter.h +@@ -134,9 +134,8 @@ bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, + bool, unsigned long); + int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); +-struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id, +- struct bpos, unsigned, unsigned, bool, +- unsigned long); ++struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned, unsigned long); + inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); + + #ifdef CONFIG_BCACHEFS_DEBUG +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 22dbbe365bbe..c84bba7bcda5 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -210,6 +210,7 @@ struct btree_node_iter { + #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) + #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) + #define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) ++#define BTREE_ITER_NOPRESERVE (1 << 14) + + enum btree_path_uptodate { + BTREE_ITER_UPTODATE = 0, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index d895d4eff0a9..f5d879dee423 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1609,8 +1609,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + ? bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); + +- sib_path = bch2_path_get(trans, false, path->btree_id, sib_pos, +- U8_MAX, level, true, _THIS_IP_); ++ sib_path = bch2_path_get(trans, path->btree_id, sib_pos, ++ U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, sib_path, false); + if (ret) + goto err; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 295942e7356e..1966441b1a62 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1285,7 +1285,8 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, + pos.snapshot++; + + for_each_btree_key_norestart(trans, iter, btree_id, pos, +- BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_NOPRESERVE, k, ret) { + if (bkey_cmp(k.k->p, pos)) + break; + +-- +cgit v1.2.3 + + +From 77a376fd38b4a0fe7f5b1806cb301cb102840f48 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 21 Dec 2021 21:57:10 -0500 +Subject: bcachefs: Fix debugfs -bfloat-failed + +It wasn't updated for snapshots - it's iterating across keys in all +snapshots, so needs to be specifying BTREE_ITER_ALL_SNAPSHOTS. 
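+
+The fix is just the extra iterator flag (matching the hunk below):
+
+	bch2_trans_iter_init(&trans, &iter, i->id, i->from,
+			     BTREE_ITER_PREFETCH|
+			     BTREE_ITER_ALL_SNAPSHOTS);
+
+so the iterator walks keys in every snapshot rather than filtering to a
+single snapshot id.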
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/debug.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 666635f7c7d2..ee5b7f696796 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -373,7 +373,9 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + + bch2_trans_init(&trans, i->c, 0, 0); + +- bch2_trans_iter_init(&trans, &iter, i->id, i->from, BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(err = bkey_err(k))) { +-- +cgit v1.2.3 + + +From 2cadfce52cb9492f627168a4a95913eba97967ff Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 14 Dec 2021 14:24:41 -0500 +Subject: bcachefs: Option improvements + +This adds flags for options that must be a power of two (block size and +btree node size), and options that are stored in the superblock as a +power of two (encoded extent max). + +Also: options are now stored in memory in the same units they're +displayed in (bytes): we now convert when getting and setting from the +superblock. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 18 +++- + fs/bcachefs/btree_cache.h | 4 +- + fs/bcachefs/btree_io.c | 18 ++-- + fs/bcachefs/btree_update_interior.c | 8 +- + fs/bcachefs/btree_update_interior.h | 2 +- + fs/bcachefs/buckets.c | 6 +- + fs/bcachefs/compress.c | 2 +- + fs/bcachefs/extents.c | 2 +- + fs/bcachefs/fs.c | 6 +- + fs/bcachefs/io.c | 4 +- + fs/bcachefs/journal_io.c | 4 +- + fs/bcachefs/opts.c | 171 +++++++++++++++++++++++++----------- + fs/bcachefs/opts.h | 40 +++++---- + fs/bcachefs/super-io.c | 17 ++-- + fs/bcachefs/super.c | 11 ++- + fs/bcachefs/sysfs.c | 12 +-- + fs/bcachefs/xattr.c | 2 +- + 17 files changed, 205 insertions(+), 122 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 540492b04457..02074a7f59fb 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -928,10 +928,20 @@ static inline unsigned bucket_bytes(const struct bch_dev *ca) + + static inline unsigned block_bytes(const struct bch_fs *c) + { +- return c->opts.block_size << 9; ++ return c->opts.block_size; + } + +-static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time) ++static inline unsigned block_sectors(const struct bch_fs *c) ++{ ++ return c->opts.block_size >> 9; ++} ++ ++static inline size_t btree_sectors(const struct bch_fs *c) ++{ ++ return c->opts.btree_node_size >> 9; ++} ++ ++static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) + { + struct timespec64 t; + s32 rem; +@@ -943,13 +953,13 @@ static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time + return t; + } + +-static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) ++static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) + { + return (ts.tv_sec * c->sb.time_units_per_sec + + (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; + } + +-static inline s64 bch2_current_time(struct bch_fs *c) ++static inline s64 bch2_current_time(const struct bch_fs *c) + { + struct timespec64 now; + +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 402cec1802bc..f7e10986f317 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -71,7 +71,7 @@ static inline bool btree_node_hashed(struct btree *b) + + static inline 
size_t btree_bytes(struct bch_fs *c) + { +- return c->opts.btree_node_size << 9; ++ return c->opts.btree_node_size; + } + + static inline size_t btree_max_u64s(struct bch_fs *c) +@@ -86,7 +86,7 @@ static inline size_t btree_pages(struct bch_fs *c) + + static inline unsigned btree_blocks(struct bch_fs *c) + { +- return c->opts.btree_node_size >> c->block_bits; ++ return btree_sectors(c) >> c->block_bits; + } + + #define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 9b22c5e3fe87..1455dc787190 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -687,7 +687,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + BTREE_ERR_FATAL, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + +- if (btree_err_on(offset + sectors > c->opts.btree_node_size, ++ if (btree_err_on(offset + sectors > btree_sectors(c), + BTREE_ERR_FIXABLE, c, ca, b, i, + "bset past end of btree node")) { + i->u64s = 0; +@@ -901,7 +901,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + b->data->keys.seq, bp->seq); + } + +- while (b->written < (ptr_written ?: c->opts.btree_node_size)) { ++ while (b->written < (ptr_written ?: btree_sectors(c))) { + unsigned sectors, whiteout_u64s = 0; + struct nonce nonce; + struct bch_csum csum; +@@ -1210,7 +1210,7 @@ static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return 0; + +- while (offset < c->opts.btree_node_size) { ++ while (offset < btree_sectors(c)) { + if (!offset) { + offset += vstruct_sectors(bn, c->block_bits); + } else { +@@ -1232,7 +1232,7 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void * + if (!offset) + return false; + +- while (offset < c->opts.btree_node_size) { ++ while (offset < btree_sectors(c)) { + bne = data + (offset << 9); + if (bne->keys.seq == bn->keys.seq) + return true; +@@ -1302,7 +1302,7 @@ fsck_err: + if (ra->err[i]) + continue; + +- while (offset < c->opts.btree_node_size) { ++ while (offset < btree_sectors(c)) { + if (!offset) { + sectors = vstruct_sectors(bn, c->block_bits); + } else { +@@ -1319,7 +1319,7 @@ fsck_err: + offset += sectors; + } + +- while (offset < c->opts.btree_node_size) { ++ while (offset < btree_sectors(c)) { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq == bn->keys.seq) { + if (!gap) +@@ -1797,8 +1797,8 @@ do_write: + BUG_ON(btree_node_fake(b)); + BUG_ON((b->will_make_reachable != 0) != !b->written); + +- BUG_ON(b->written >= c->opts.btree_node_size); +- BUG_ON(b->written & (c->opts.block_size - 1)); ++ BUG_ON(b->written >= btree_sectors(c)); ++ BUG_ON(b->written & (block_sectors(c) - 1)); + BUG_ON(bset_written(b, btree_bset_last(b))); + BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); + BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); +@@ -1871,7 +1871,7 @@ do_write: + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + +- BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); ++ BUG_ON(b->written + sectors_to_write > btree_sectors(c)); + BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); + BUG_ON(i->seq != b->data->keys.seq); + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index f5d879dee423..9dca694b6ee3 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -223,12 +223,12 @@ retry: + if (IS_ERR(wp)) + return ERR_CAST(wp); + +- if 
(wp->sectors_free < c->opts.btree_node_size) { ++ if (wp->sectors_free < btree_sectors(c)) { + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) +- if (ob->sectors_free < c->opts.btree_node_size) ++ if (ob->sectors_free < btree_sectors(c)) + ob->sectors_free = 0; + + bch2_alloc_sectors_done(c, wp); +@@ -236,7 +236,7 @@ retry: + } + + bkey_btree_ptr_v2_init(&tmp.k); +- bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); ++ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c)); + + bch2_open_bucket_get(c, wp, &ob); + bch2_alloc_sectors_done(c, wp); +@@ -1029,7 +1029,7 @@ retry: + } + + ret = bch2_disk_reservation_get(c, &as->disk_res, +- nr_nodes * c->opts.btree_node_size, ++ nr_nodes * btree_sectors(c), + c->opts.metadata_replicas, + disk_res_flags); + if (ret) +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index d4574161a733..8cf59cee6e4e 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -218,7 +218,7 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, + { + ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + + b->whiteout_u64s; +- ssize_t total = c->opts.btree_node_size << 6; ++ ssize_t total = c->opts.btree_node_size >> 3; + + /* Always leave one extra u64 for bch2_varint_decode: */ + used++; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index f951f9f3ecf2..bbcc472f592e 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -996,7 +996,7 @@ static int bch2_mark_extent(struct btree_trans *trans, + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) +- ? c->opts.btree_node_size ++ ? btree_sectors(c) + : k.k->size; + s64 dirty_sectors = 0; + bool stale; +@@ -1604,7 +1604,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) +- ? c->opts.btree_node_size ++ ? 
btree_sectors(c) + : k.k->size; + s64 dirty_sectors = 0; + bool stale; +@@ -2179,7 +2179,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + alloc_heap alloc_heap; + + size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, +- ca->mi.bucket_size / c->opts.btree_node_size); ++ ca->mi.bucket_size / btree_sectors(c)); + /* XXX: these should be tunable */ + size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); + size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 78757dcede36..2d5dc2394bab 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -376,7 +376,7 @@ static unsigned __bio_compress(struct bch_fs *c, + BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + + /* If it's only one block, don't bother trying to compress: */ +- if (bio_sectors(src) <= c->opts.block_size) ++ if (src->bi_iter.bi_size <= c->opts.block_size) + return 0; + + dst_data = bio_map_or_bounce(c, dst, WRITE); +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 89b5be907eea..145b3868e522 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -1038,7 +1038,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) + + if (k.k->type == KEY_TYPE_btree_ptr || + k.k->type == KEY_TYPE_btree_ptr_v2) +- size_ondisk = c->opts.btree_node_size; ++ size_ondisk = btree_sectors(c); + + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 26b4ae4b4651..ba3462e27221 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -867,8 +867,8 @@ static int bch2_fill_extent(struct bch_fs *c, + else + offset += p.crc.offset; + +- if ((offset & (c->opts.block_size - 1)) || +- (k.k->size & (c->opts.block_size - 1))) ++ if ((offset & (block_sectors(c) - 1)) || ++ (k.k->size & (block_sectors(c) - 1))) + flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; + + ret = fiemap_fill_next_extent(info, +@@ -1682,7 +1682,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + +- if (!(opt->mode & OPT_MOUNT)) ++ if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index a9ca81ecaf68..6e64a04d7f91 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1289,7 +1289,7 @@ void bch2_write(struct closure *cl) + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + +- if (bio_sectors(bio) & (c->opts.block_size - 1)) { ++ if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { + bch_err_inum_ratelimited(c, op->pos.inode, + "misaligned write"); + op->error = -EIO; +@@ -2365,7 +2365,7 @@ int bch2_fs_io_init(struct bch_fs *c) + BIOSET_NEED_BVECS) || + mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, +- c->opts.btree_node_size, ++ btree_sectors(c), + c->sb.encoded_extent_max) / + PAGE_SECTORS, 0) || + rhashtable_init(&c->promote_table, &bch_promote_params)) +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 80e0dd311ffd..1fcc7ed5a776 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -710,7 +710,7 @@ reread: + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; +- sectors = c->opts.block_size; ++ sectors = block_sectors(c); + goto next_block; + case JOURNAL_ENTRY_BAD: + 
saw_bad = true; +@@ -719,7 +719,7 @@ reread: + * field of the journal entry we read, so try reading + * again at next block boundary: + */ +- sectors = c->opts.block_size; ++ sectors = block_sectors(c); + break; + default: + return ret; +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index e81e07a383bb..9b75c852bac8 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -141,41 +141,27 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) + } + } + +-/* +- * Initial options from superblock - here we don't want any options undefined, +- * any options the superblock doesn't specify are set to 0: +- */ +-struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) +-{ +- struct bch_opts opts = bch2_opts_empty(); +- +-#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ +- if (_sb_opt != NO_SB_OPT) \ +- opt_set(opts, _name, _sb_opt(sb)); +- BCH_OPTS() +-#undef x +- +- return opts; +-} +- + const struct bch_option bch2_opt_table[] = { +-#define OPT_BOOL() .type = BCH_OPT_BOOL +-#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max +-#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max +-#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices ++#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 ++#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ ++ .min = _min, .max = _max ++#define OPT_STR(_choices) .type = BCH_OPT_STR, \ ++ .min = 0, .max = ARRAY_SIZE(_choices),\ ++ .choices = _choices + #define OPT_FN(_fn) .type = BCH_OPT_FN, \ + .parse = _fn##_parse, \ + .to_text = _fn##_to_text + +-#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ ++#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ + [Opt_##_name] = { \ + .attr = { \ + .name = #_name, \ +- .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444, \ ++ .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ + }, \ +- .mode = _mode, \ ++ .flags = _flags, \ + .hint = _hint, \ + .help = _help, \ ++ .get_sb = _sb_opt, \ + .set_sb = SET_##_sb_opt, \ + _type \ + }, +@@ -218,7 +204,41 @@ static int bch2_mount_opt_lookup(const char *name) + return bch2_opt_lookup(name); + } + +-int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, ++static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v) ++{ ++ if (v < opt->min) { ++ if (msg) ++ pr_err("invalid %s%s: too small (min %llu)", ++ msg, opt->attr.name, opt->min); ++ return -ERANGE; ++ } ++ ++ if (opt->max && v >= opt->max) { ++ if (msg) ++ pr_err("invalid %s%s: too big (max %llu)", ++ msg, opt->attr.name, opt->max); ++ return -ERANGE; ++ } ++ ++ if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { ++ if (msg) ++ pr_err("invalid %s %s: not a multiple of 512", ++ msg, opt->attr.name); ++ return -EINVAL; ++ } ++ ++ if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { ++ if (msg) ++ pr_err("invalid %s%s: must be a power of two", ++ msg, opt->attr.name); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_opt_parse(struct bch_fs *c, const char *msg, ++ const struct bch_option *opt, + const char *val, u64 *res) + { + ssize_t ret; +@@ -228,30 +248,13 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, + ret = kstrtou64(val, 10, res); + if (ret < 0) + return ret; +- +- if (*res > 1) +- return -ERANGE; + break; + case BCH_OPT_UINT: +- ret = kstrtou64(val, 10, res); ++ ret = opt->flags & OPT_HUMAN_READABLE ++ ? 
bch2_strtou64_h(val, res) ++ : kstrtou64(val, 10, res); + if (ret < 0) + return ret; +- +- if (*res < opt->min || *res >= opt->max) +- return -ERANGE; +- break; +- case BCH_OPT_SECTORS: +- ret = bch2_strtou64_h(val, res); +- if (ret < 0) +- return ret; +- +- if (*res & 511) +- return -EINVAL; +- +- *res >>= 9; +- +- if (*res < opt->min || *res >= opt->max) +- return -ERANGE; + break; + case BCH_OPT_STR: + ret = match_string(opt->choices, -1, val); +@@ -264,10 +267,12 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, + if (!c) + return 0; + +- return opt->parse(c, val, res); ++ ret = opt->parse(c, val, res); ++ if (ret < 0) ++ return ret; + } + +- return 0; ++ return bch2_opt_validate(opt, msg, *res); + } + + void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, +@@ -288,10 +293,10 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, + switch (opt->type) { + case BCH_OPT_BOOL: + case BCH_OPT_UINT: +- pr_buf(out, "%lli", v); +- break; +- case BCH_OPT_SECTORS: +- bch2_hprint(out, v << 9); ++ if (opt->flags & OPT_HUMAN_READABLE) ++ bch2_hprint(out, v); ++ else ++ pr_buf(out, "%lli", v); + break; + case BCH_OPT_STR: + if (flags & OPT_SHOW_FULL_LIST) +@@ -365,7 +370,8 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, + if (id < 0) + goto bad_opt; + +- ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v); ++ ret = bch2_opt_parse(c, "mount option ", ++ &bch2_opt_table[id], val, &v); + if (ret < 0) + goto bad_val; + } else { +@@ -385,7 +391,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, + goto no_val; + } + +- if (!(bch2_opt_table[id].mode & OPT_MOUNT)) ++ if (!(bch2_opt_table[id].flags & OPT_MOUNT)) + goto bad_opt; + + if (id == Opt_acl && +@@ -420,6 +426,65 @@ out: + return ret; + } + ++/* ++ * Initial options from superblock - here we don't want any options undefined, ++ * any options the superblock doesn't specify are set to 0: ++ */ ++int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) ++{ ++ unsigned id; ++ int ret; ++ ++ for (id = 0; id < bch2_opts_nr; id++) { ++ const struct bch_option *opt = bch2_opt_table + id; ++ u64 v; ++ ++ if (opt->get_sb == NO_SB_OPT) ++ continue; ++ ++ v = opt->get_sb(sb); ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = 1ULL << v; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v <<= 9; ++ ++ ret = bch2_opt_validate(opt, "superblock option ", v); ++ if (ret) ++ return ret; ++ ++ bch2_opt_set_by_id(opts, id, v); ++ } ++ ++ return 0; ++} ++ ++void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) ++{ ++ if (opt->set_sb == SET_NO_SB_OPT) ++ return; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v >>= 9; ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = ilog2(v); ++ ++ opt->set_sb(sb, v); ++} ++ ++void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) ++{ ++ if (opt->set_sb == SET_NO_SB_OPT) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ __bch2_opt_set_sb(c->disk_sb.sb, opt, v); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++} ++ + /* io opts: */ + + struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 871142778763..60fe0301c4b7 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -44,19 +44,22 @@ static inline const char *bch2_d_type_str(unsigned d_type) + LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); + + /* When can be set: */ +-enum opt_mode { ++enum opt_flags { + OPT_FS = (1 << 0), /* Filesystem option */ + OPT_DEVICE = (1 << 
1), /* Device option */ + OPT_INODE = (1 << 2), /* Inode option */ + OPT_FORMAT = (1 << 3), /* May be specified at format time */ + OPT_MOUNT = (1 << 4), /* May be specified at mount time */ + OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ ++ OPT_HUMAN_READABLE = (1 << 6), ++ OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ ++ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ ++ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ + }; + + enum opt_type { + BCH_OPT_BOOL, + BCH_OPT_UINT, +- BCH_OPT_SECTORS, + BCH_OPT_STR, + BCH_OPT_FN, + }; +@@ -88,13 +91,15 @@ enum opt_type { + + #define BCH_OPTS() \ + x(block_size, u16, \ +- OPT_FS|OPT_FORMAT, \ +- OPT_SECTORS(1, 128), \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(512, 1U << 16), \ + BCH_SB_BLOCK_SIZE, 8, \ + "size", NULL) \ +- x(btree_node_size, u16, \ +- OPT_FS|OPT_FORMAT, \ +- OPT_SECTORS(1, 512), \ ++ x(btree_node_size, u32, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(512, 1U << 20), \ + BCH_SB_BTREE_NODE_SIZE, 512, \ + "size", "Btree node size, default 256k") \ + x(errors, u8, \ +@@ -198,8 +203,9 @@ enum opt_type { + BCH_SB_GC_RESERVE, 8, \ + "%", "Percentage of disk space to reserve for copygc")\ + x(gc_reserve_bytes, u64, \ +- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +- OPT_SECTORS(0, U64_MAX), \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ ++ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(0, U64_MAX), \ + BCH_SB_GC_RESERVE_BYTES, 0, \ + "%", "Amount of disk space to reserve for copygc\n" \ + "Takes precedence over gc_reserve_percent if set")\ +@@ -354,12 +360,12 @@ enum opt_type { + NULL, NULL) \ + x(fs_size, u64, \ + OPT_DEVICE, \ +- OPT_SECTORS(0, S64_MAX), \ ++ OPT_UINT(0, S64_MAX), \ + NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(bucket, u32, \ + OPT_DEVICE, \ +- OPT_SECTORS(0, S64_MAX), \ ++ OPT_UINT(0, S64_MAX), \ + NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(durability, u8, \ +@@ -418,13 +424,14 @@ struct printbuf; + + struct bch_option { + struct attribute attr; ++ u64 (*get_sb)(const struct bch_sb *); + void (*set_sb)(struct bch_sb *, u64); +- enum opt_mode mode; + enum opt_type type; ++ enum opt_flags flags; ++ u64 min, max; + + union { + struct { +- u64 min, max; + }; + struct { + const char * const *choices; +@@ -446,10 +453,13 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); + u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); + void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); + +-struct bch_opts bch2_opts_from_sb(struct bch_sb *); ++int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); ++void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); ++void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); + + int bch2_opt_lookup(const char *); +-int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); ++int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *, ++ const char *, u64 *); + + #define OPT_SHOW_FULL_LIST (1 << 0) + #define OPT_SHOW_MOUNT_STYLE (1 << 1) +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 88a8e54fbd7a..3df4c977061e 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -261,8 +261,7 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) + + block_size = le16_to_cpu(sb->block_size); + +- 
if (!is_power_of_2(block_size) || +- block_size > PAGE_SECTORS) ++ if (block_size > PAGE_SECTORS) + return "Bad block size"; + + if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) +@@ -304,9 +303,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) + if (!BCH_SB_BTREE_NODE_SIZE(sb)) + return "Btree node size not set"; + +- if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) +- return "Btree node size not a power of two"; +- + if (BCH_SB_GC_RESERVE(sb) < 5) + return "gc reserve percentage too small"; + +@@ -621,8 +617,12 @@ got_super: + err = "Superblock block size smaller than device block size"; + ret = -EINVAL; + if (le16_to_cpu(sb->sb->block_size) << 9 < +- bdev_logical_block_size(sb->bdev)) +- goto err; ++ bdev_logical_block_size(sb->bdev)) { ++ pr_err("error reading superblock: Superblock block size (%u) smaller than device block size (%u)", ++ le16_to_cpu(sb->sb->block_size) << 9, ++ bdev_logical_block_size(sb->bdev)); ++ goto err_no_print; ++ } + + ret = 0; + sb->have_layout = true; +@@ -630,8 +630,9 @@ out: + pr_verbose_init(*opts, "ret %i", ret); + return ret; + err: +- bch2_free_super(sb); + pr_err("error reading superblock: %s", err); ++err_no_print: ++ bch2_free_super(sb); + goto out; + } + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 505e559b48a6..58bc29032a8a 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -754,10 +754,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); + + c->opts = bch2_opts_default; +- bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); ++ ret = bch2_opts_from_sb(&c->opts, sb); ++ if (ret) ++ goto err; ++ + bch2_opts_apply(&c->opts, opts); + +- c->block_bits = ilog2(c->opts.block_size); ++ c->block_bits = ilog2(block_sectors(c)); + c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); + + if (bch2_fs_init_fault("fs_alloc")) { +@@ -869,7 +872,7 @@ static void print_mount_opts(struct bch_fs *c) + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + +- if (!(opt->mode & OPT_MOUNT)) ++ if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) +@@ -995,7 +998,7 @@ static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) + if (!sb_mi) + return "Invalid superblock: member info area missing"; + +- if (le16_to_cpu(sb->block_size) != c->opts.block_size) ++ if (le16_to_cpu(sb->block_size) != block_sectors(c)) + return "mismatched block size"; + + if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 3f51eda749f0..0a0798bae4d6 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -626,7 +626,7 @@ STORE(bch2_fs_opts_dir) + if (!tmp) + return -ENOMEM; + +- ret = bch2_opt_parse(c, opt, strim(tmp), &v); ++ ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v); + kfree(tmp); + + if (ret < 0) +@@ -636,13 +636,7 @@ STORE(bch2_fs_opts_dir) + if (ret < 0) + return ret; + +- if (opt->set_sb != SET_NO_SB_OPT) { +- mutex_lock(&c->sb_lock); +- opt->set_sb(c->disk_sb.sb, v); +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); +- } +- ++ bch2_opt_set_sb(c, opt, v); + bch2_opt_set_by_id(&c->opts, id, v); + + if ((id == Opt_background_target || +@@ -665,7 +659,7 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) + for (i = bch2_opt_table; + i < bch2_opt_table + bch2_opts_nr; + i++) { +- if (!(i->mode & OPT_FS)) ++ if (!(i->flags & OPT_FS)) + continue; + + ret = 
sysfs_create_file(kobj, &i->attr); +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 464ed68318e7..4d7db64e3ef3 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -525,7 +525,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, + memcpy(buf, value, size); + buf[size] = '\0'; + +- ret = bch2_opt_parse(c, opt, buf, &v); ++ ret = bch2_opt_parse(c, NULL, opt, buf, &v); + kfree(buf); + + if (ret < 0) +-- +cgit v1.2.3 + + +From 68f97cc77399216592c9e6f7d24178bc81f87f37 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 14 Dec 2021 14:34:03 -0500 +Subject: bcachefs: Turn encoded_extent_max into a regular option + +It'll now be handled at format time and in sysfs like other options - it +still can only be set at format time, though. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/compress.c | 21 ++++++++++----------- + fs/bcachefs/ec.c | 2 +- + fs/bcachefs/extents.c | 2 +- + fs/bcachefs/io.c | 14 +++++++------- + fs/bcachefs/opts.h | 6 ++++++ + fs/bcachefs/super-io.c | 1 - + 7 files changed, 25 insertions(+), 22 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 02074a7f59fb..943487f2c6f5 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -634,7 +634,6 @@ struct bch_fs { + + u16 version; + u16 version_min; +- u16 encoded_extent_max; + + u8 nr_devices; + u8 clean; +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 2d5dc2394bab..f692f35a6a98 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -26,7 +26,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) + { + void *b; + +- BUG_ON(size > c->sb.encoded_extent_max << 9); ++ BUG_ON(size > c->opts.encoded_extent_max); + + b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); + if (b) +@@ -68,7 +68,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + struct page **pages = NULL; + void *data; + +- BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); ++ BUG_ON(start.bi_size > c->opts.encoded_extent_max); + + if (!PageHighMem(bio_iter_page(bio, start)) && + bio_phys_contig(bio, start)) +@@ -231,8 +231,8 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, + BUG_ON(!bio->bi_vcnt); + BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); + +- if (crc->uncompressed_size > c->sb.encoded_extent_max || +- crc->compressed_size > c->sb.encoded_extent_max) { ++ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || ++ crc->compressed_size << 9 > c->opts.encoded_extent_max) { + bch_err(c, "error rewriting existing data: extent too big"); + return -EIO; + } +@@ -272,8 +272,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, + size_t dst_len = crc.uncompressed_size << 9; + int ret = -ENOMEM; + +- if (crc.uncompressed_size > c->sb.encoded_extent_max || +- crc.compressed_size > c->sb.encoded_extent_max) ++ if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || ++ crc.compressed_size << 9 > c->opts.encoded_extent_max) + return -EIO; + + dst_data = dst_len == dst_iter.bi_size +@@ -466,7 +466,7 @@ unsigned bch2_bio_compress(struct bch_fs *c, + + /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ + src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, +- c->sb.encoded_extent_max << 9); ++ c->opts.encoded_extent_max); + /* Don't generate a bigger output than input: */ + dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + +@@ -544,10 +544,9 @@ 
void bch2_fs_compress_exit(struct bch_fs *c) + + static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) + { +- size_t max_extent = c->sb.encoded_extent_max << 9; + size_t decompress_workspace_size = 0; + bool decompress_workspace_needed; +- ZSTD_parameters params = zstd_get_params(0, max_extent); ++ ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max); + struct { + unsigned feature; + unsigned type; +@@ -579,14 +578,14 @@ have_compressed: + + if (!mempool_initialized(&c->compression_bounce[READ])) { + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], +- 1, max_extent); ++ 1, c->opts.encoded_extent_max); + if (ret) + goto out; + } + + if (!mempool_initialized(&c->compression_bounce[WRITE])) { + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], +- 1, max_extent); ++ 1, c->opts.encoded_extent_max); + if (ret) + goto out; + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 033ded886875..7d78672dd017 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1152,7 +1152,7 @@ static void ec_stripe_key_init(struct bch_fs *c, + s->v.algorithm = 0; + s->v.nr_blocks = nr_data + nr_parity; + s->v.nr_redundant = nr_parity; +- s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); ++ s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); + s->v.csum_type = BCH_CSUM_crc32c; + s->v.pad = 0; + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 145b3868e522..44c584e9adaa 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -303,7 +303,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) + + if (lp.crc.csum_type && + lp.crc.uncompressed_size + +- rp.crc.uncompressed_size > c->sb.encoded_extent_max) ++ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) + return false; + + if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 6e64a04d7f91..1b954dff5d15 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -738,7 +738,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, + */ + bch2_bio_alloc_pages_pool(c, bio, + min_t(unsigned, output_available, +- c->sb.encoded_extent_max << 9)); ++ c->opts.encoded_extent_max)); + + if (bio->bi_iter.bi_size < output_available) + *page_alloc_failed = +@@ -935,8 +935,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + size_t dst_len, src_len; + + if (page_alloc_failed && +- bio_sectors(dst) < wp->sectors_free && +- bio_sectors(dst) < c->sb.encoded_extent_max) ++ dst->bi_iter.bi_size < (wp->sectors_free << 9) && ++ dst->bi_iter.bi_size < c->opts.encoded_extent_max) + break; + + BUG_ON(op->compression_type && +@@ -956,7 +956,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + + if (op->csum_type) + dst_len = min_t(unsigned, dst_len, +- c->sb.encoded_extent_max << 9); ++ c->opts.encoded_extent_max); + + if (bounce) { + swap(dst->bi_iter.bi_size, dst_len); +@@ -2365,9 +2365,9 @@ int bch2_fs_io_init(struct bch_fs *c) + BIOSET_NEED_BVECS) || + mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, +- btree_sectors(c), +- c->sb.encoded_extent_max) / +- PAGE_SECTORS, 0) || ++ c->opts.btree_node_size, ++ c->opts.encoded_extent_max) / ++ PAGE_SIZE, 0) || + rhashtable_init(&c->promote_table, &bch_promote_params)) + return -ENOMEM; + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 60fe0301c4b7..aadd3958f53d 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ 
-127,6 +127,12 @@ enum opt_type { + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_REQ, 1, \ + "#", NULL) \ ++ x(encoded_extent_max, u32, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ ++ OPT_UINT(4096, 2U << 20), \ ++ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ ++ "size", "Maximum size of checksummed/compressed extents")\ + x(metadata_checksum, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 3df4c977061e..b8d2cf66a630 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -362,7 +362,6 @@ static void bch2_sb_update(struct bch_fs *c) + c->sb.nr_devices = src->nr_devices; + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); +- c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); + + c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); + c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; +-- +cgit v1.2.3 + + +From 5a3649b0d557e463b2e48e359f3f5a6449e30b53 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 22 Dec 2021 22:39:50 -0500 +Subject: bcachefs: Fix a null ptr deref in bch2_inode_delete_keys() + +Similarly to bch2_btree_delete_range_trans(), bch2_inode_delete_keys() +may sometimes split compressed extents, and needs to pass in a disk +reservation. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/inode.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 99b2a77ef9a8..ef6da53567b8 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -4,6 +4,7 @@ + #include "btree_key_cache.h" + #include "bkey_methods.h" + #include "btree_update.h" ++#include "buckets.h" + #include "error.h" + #include "extents.h" + #include "extent_update.h" +@@ -588,6 +589,8 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, + int ret = 0; + + while (!ret || ret == -EINTR) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(trans->c, 0); + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; +@@ -630,8 +633,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, + } + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: +- bch2_trans_commit(trans, NULL, NULL, ++ bch2_trans_commit(trans, &disk_res, NULL, + BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(trans->c, &disk_res); + err: + offset = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); +-- +cgit v1.2.3 + + +From 62e03b8854c6a0702eac0c453b3154424bcacda2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 24 Dec 2021 02:55:11 -0500 +Subject: bcachefs: Kill non-lru cache replacement policies + +Prep work for persistent LRUs and getting rid of the in memory bucket +array. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 82 +----------------------------------------- + fs/bcachefs/bcachefs_format.h | 15 +------- + fs/bcachefs/opts.c | 5 --- + fs/bcachefs/opts.h | 1 - + fs/bcachefs/super-io.h | 1 - + fs/bcachefs/super_types.h | 1 - + fs/bcachefs/sysfs.c | 26 -------------- + 7 files changed, 2 insertions(+), 129 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index ed919b428a06..a35ed656c6b0 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -628,76 +628,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) + up_read(&ca->bucket_lock); + } + +-static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) +-{ +- struct bucket_array *buckets = bucket_array(ca); +- struct bucket_mark m; +- size_t b, start; +- +- if (ca->fifo_last_bucket < ca->mi.first_bucket || +- ca->fifo_last_bucket >= ca->mi.nbuckets) +- ca->fifo_last_bucket = ca->mi.first_bucket; +- +- start = ca->fifo_last_bucket; +- +- do { +- ca->fifo_last_bucket++; +- if (ca->fifo_last_bucket == ca->mi.nbuckets) +- ca->fifo_last_bucket = ca->mi.first_bucket; +- +- b = ca->fifo_last_bucket; +- m = READ_ONCE(buckets->b[b].mark); +- +- if (bch2_can_invalidate_bucket(ca, b, m)) { +- struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; +- +- heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); +- if (heap_full(&ca->alloc_heap)) +- break; +- } +- +- cond_resched(); +- } while (ca->fifo_last_bucket != start); +-} +- +-static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) +-{ +- struct bucket_array *buckets = bucket_array(ca); +- struct bucket_mark m; +- size_t checked, i; +- +- for (checked = 0; +- checked < ca->mi.nbuckets / 2; +- checked++) { +- size_t b = bch2_rand_range(ca->mi.nbuckets - +- ca->mi.first_bucket) + +- ca->mi.first_bucket; +- +- m = READ_ONCE(buckets->b[b].mark); +- +- if (bch2_can_invalidate_bucket(ca, b, m)) { +- struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; +- +- heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); +- if (heap_full(&ca->alloc_heap)) +- break; +- } +- +- cond_resched(); +- } +- +- sort(ca->alloc_heap.data, +- ca->alloc_heap.used, +- sizeof(ca->alloc_heap.data[0]), +- bucket_idx_cmp, NULL); +- +- /* remove duplicates: */ +- for (i = 0; i + 1 < ca->alloc_heap.used; i++) +- if (ca->alloc_heap.data[i].bucket == +- ca->alloc_heap.data[i + 1].bucket) +- ca->alloc_heap.data[i].nr = 0; +-} +- + static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) + { + size_t i, nr = 0; +@@ -705,17 +635,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) + ca->inc_gen_needs_gc = 0; + ca->inc_gen_really_needs_gc = 0; + +- switch (ca->mi.replacement) { +- case BCH_CACHE_REPLACEMENT_lru: +- find_reclaimable_buckets_lru(c, ca); +- break; +- case BCH_CACHE_REPLACEMENT_fifo: +- find_reclaimable_buckets_fifo(c, ca); +- break; +- case BCH_CACHE_REPLACEMENT_random: +- find_reclaimable_buckets_random(c, ca); +- break; +- } ++ find_reclaimable_buckets_lru(c, ca); + + heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 495f4d19ddcb..a053fca7886d 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1063,8 +1063,7 @@ struct bch_member { + }; + + LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) +-/* 4-10 unused, was TIER, 
HAS_(META)DATA */ +-LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) ++/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ + LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) + LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) + LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) +@@ -1088,18 +1087,6 @@ enum bch_member_state { + BCH_MEMBER_STATE_NR + }; + +-#define BCH_CACHE_REPLACEMENT_POLICIES() \ +- x(lru, 0) \ +- x(fifo, 1) \ +- x(random, 2) +- +-enum bch_cache_replacement_policies { +-#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n, +- BCH_CACHE_REPLACEMENT_POLICIES() +-#undef x +- BCH_CACHE_REPLACEMENT_NR +-}; +- + struct bch_sb_field_members { + struct bch_sb_field field; + struct bch_member members[0]; +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index 9b75c852bac8..d9ca69f2ecde 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -66,11 +66,6 @@ const char * const bch2_data_types[] = { + NULL + }; + +-const char * const bch2_cache_replacement_policies[] = { +- BCH_CACHE_REPLACEMENT_POLICIES() +- NULL +-}; +- + const char * const bch2_member_states[] = { + BCH_MEMBER_STATES() + NULL +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index aadd3958f53d..661eb5764f68 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -19,7 +19,6 @@ extern const char * const bch2_compression_opts[]; + extern const char * const bch2_str_hash_types[]; + extern const char * const bch2_str_hash_opts[]; + extern const char * const bch2_data_types[]; +-extern const char * const bch2_cache_replacement_policies[]; + extern const char * const bch2_member_states[]; + extern const char * const bch2_d_types[]; + +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +index b64ac2fbbf8b..5c264875acb4 100644 +--- a/fs/bcachefs/super-io.h ++++ b/fs/bcachefs/super-io.h +@@ -110,7 +110,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) + .bucket_size = le16_to_cpu(mi->bucket_size), + .group = BCH_MEMBER_GROUP(mi), + .state = BCH_MEMBER_STATE(mi), +- .replacement = BCH_MEMBER_REPLACEMENT(mi), + .discard = BCH_MEMBER_DISCARD(mi), + .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), + .durability = BCH_MEMBER_DURABILITY(mi) +diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h +index 96023f37afea..d8b159a5b7f7 100644 +--- a/fs/bcachefs/super_types.h ++++ b/fs/bcachefs/super_types.h +@@ -29,7 +29,6 @@ struct bch_member_cpu { + u16 bucket_size; /* sectors */ + u16 group; + u8 state; +- u8 replacement; + u8 discard; + u8 data_allowed; + u8 durability; +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 0a0798bae4d6..341ba3fdd6fc 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -177,7 +177,6 @@ read_attribute(extent_migrate_done); + read_attribute(extent_migrate_raced); + + rw_attribute(discard); +-rw_attribute(cache_replacement_policy); + rw_attribute(label); + + rw_attribute(copy_gc_enabled); +@@ -826,14 +825,6 @@ SHOW(bch2_dev) + return out.pos - buf; + } + +- if (attr == &sysfs_cache_replacement_policy) { +- bch2_string_opt_to_text(&out, +- bch2_cache_replacement_policies, +- ca->mi.replacement); +- pr_buf(&out, "\n"); +- return out.pos - buf; +- } +- + if (attr == &sysfs_state_rw) { + bch2_string_opt_to_text(&out, bch2_member_states, + ca->mi.state); +@@ -893,22 +884,6 @@ STORE(bch2_dev) + mutex_unlock(&c->sb_lock); + } + +- if (attr == &sysfs_cache_replacement_policy) { +- ssize_t v = 
__sysfs_match_string(bch2_cache_replacement_policies, -1, buf); +- +- if (v < 0) +- return v; +- +- mutex_lock(&c->sb_lock); +- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; +- +- if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { +- SET_BCH_MEMBER_REPLACEMENT(mi, v); +- bch2_write_super(c); +- } +- mutex_unlock(&c->sb_lock); +- } +- + if (attr == &sysfs_label) { + char *tmp; + int ret; +@@ -939,7 +914,6 @@ struct attribute *bch2_dev_files[] = { + + /* settings: */ + &sysfs_discard, +- &sysfs_cache_replacement_policy, + &sysfs_state_rw, + &sysfs_label, + +-- +cgit v1.2.3 + + +From 1f9892d585393785eec545e03c7a8a0418e11660 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 24 Dec 2021 04:22:20 -0500 +Subject: bcachefs: Rewrite bch2_bucket_alloc_new_fs() + +This changes bch2_bucket_alloc_new_fs() to a simple bump allocator that +doesn't need to use the in memory bucket array, part of a larger patch +series to entirely get rid of the in memory bucket array, except for +gc/fsck. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 12 ++++++++++++ + fs/bcachefs/alloc_foreground.c | 22 ++++++++-------------- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/recovery.c | 2 ++ + fs/bcachefs/super.c | 2 ++ + fs/bcachefs/super.h | 21 +++++++++++++++++++++ + 6 files changed, 46 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index a35ed656c6b0..ee5becb1c820 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -513,6 +513,18 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + test_bit(b, ca->buckets_nouse)) + return false; + ++ if (ca->new_fs_bucket_idx) { ++ /* ++ * Device or filesystem is still being initialized, and we ++ * haven't fully marked superblocks & journal: ++ */ ++ if (is_superblock_bucket(ca, b)) ++ return false; ++ ++ if (b < ca->new_fs_bucket_idx) ++ return false; ++ } ++ + gc_gen = bucket_gc_gen(bucket(ca, b)); + + ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index dce77cc27cbe..4603328eea52 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -144,21 +144,15 @@ static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) + /* _only_ for allocating the journal on a new device: */ + long bch2_bucket_alloc_new_fs(struct bch_dev *ca) + { +- struct bucket_array *buckets; +- ssize_t b; ++ while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { ++ u64 b = ca->new_fs_bucket_idx++; + +- rcu_read_lock(); +- buckets = bucket_array(ca); +- +- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) +- if (is_available_bucket(buckets->b[b].mark) && +- (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)) && +- !buckets->b[b].mark.owned_by_allocator) +- goto success; +- b = -1; +-success: +- rcu_read_unlock(); +- return b; ++ if (!is_superblock_bucket(ca, b) && ++ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) ++ return b; ++ } ++ ++ return -1; + } + + static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 943487f2c6f5..c243d9239892 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -453,6 +453,7 @@ struct bch_dev { + struct bch_dev_usage __percpu *usage_gc; + + /* Allocator: */ ++ u64 new_fs_bucket_idx; + struct task_struct __rcu *alloc_thread; + + /* +diff --git 
a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 29fe6260ace5..bd552a942ac6 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1429,6 +1429,8 @@ int bch2_fs_initialize(struct bch_fs *c) + percpu_ref_put(&ca->ref); + goto err; + } ++ ++ ca->new_fs_bucket_idx = 0; + } + + err = "error creating root snapshot node"; +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 58bc29032a8a..b0dee3406ab3 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1720,6 +1720,8 @@ have_slot: + if (ret) + goto err_late; + ++ ca->new_fs_bucket_idx = 0; ++ + if (ca->mi.state == BCH_MEMBER_STATE_rw) { + ret = __bch2_dev_read_write(c, ca); + if (ret) +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +index 739e8fd18176..c3273e9c711d 100644 +--- a/fs/bcachefs/super.h ++++ b/fs/bcachefs/super.h +@@ -194,6 +194,27 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) + return devs; + } + ++static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 b_offset = bucket_to_sector(ca, b); ++ u64 b_end = bucket_to_sector(ca, b + 1); ++ unsigned i; ++ ++ if (!b) ++ return true; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ u64 end = offset + (1 << layout->sb_max_size_bits); ++ ++ if (!(offset >= b_end || end <= b_offset)) ++ return true; ++ } ++ ++ return false; ++} ++ + struct bch_fs *bch2_dev_to_fs(dev_t); + struct bch_fs *bch2_uuid_to_fs(uuid_le); + +-- +cgit v1.2.3 + + +From 2b52808cf9eb95d072031134211e61cb1c02dff9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 24 Dec 2021 04:27:01 -0500 +Subject: bcachefs: bch2_bucket_alloc_new_fs() no longer depends on bucket + marks + +Now that bch2_bucket_alloc_new_fs() isn't looking at bucket marks to +decide what buckets are eligible to allocate, we can clean up the +filesystem initialization and device add paths. Previously, we had to +use ancient code to mark superblock/journal buckets in the in memory +bucket marks as we allocated them, and then zero that out and re-do that +marking using the newer transational bucket mark paths. Now, we can +simply delete the in-memory bucket marking. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 17 ++----------- + fs/bcachefs/btree_gc.h | 1 - + fs/bcachefs/buckets.c | 66 ++++++++++++-------------------------------------- + fs/bcachefs/journal.c | 12 +-------- + fs/bcachefs/recovery.c | 3 --- + fs/bcachefs/super.c | 26 -------------------- + 6 files changed, 19 insertions(+), 106 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 91c69a9f96ae..6d8d61e8cf46 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1056,23 +1056,13 @@ static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, + } while (start < end); + } + +-void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, +- unsigned flags) ++static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ++ unsigned flags) + { + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; + u64 b; + +- /* +- * This conditional is kind of gross, but we may be called from the +- * device add path, before the new device has actually been added to the +- * running filesystem: +- */ +- if (c) { +- lockdep_assert_held(&c->sb_lock); +- percpu_down_read(&c->mark_lock); +- } +- + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + +@@ -1091,9 +1081,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), flags); + } +- +- if (c) +- percpu_up_read(&c->mark_lock); + } + + static void bch2_mark_superblocks(struct bch_fs *c) +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index 59dfb069e699..0665f5941fcc 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -8,7 +8,6 @@ int bch2_gc(struct bch_fs *, bool, bool); + int bch2_gc_gens(struct bch_fs *); + void bch2_gc_thread_stop(struct bch_fs *); + int bch2_gc_thread_start(struct bch_fs *); +-void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); + + /* + * For concurrent mark and sweep (with other index updates), we define a total +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index bbcc472f592e..2e2f12563e47 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -365,13 +365,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bch_fs_usage *fs_usage; + struct bch_dev_usage *u; + +- /* +- * Hack for bch2_fs_initialize path, where we're first marking sb and +- * journal non-transactionally: +- */ +- if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) +- journal_seq = 1; +- + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + u = dev_usage_ptr(ca, journal_seq, gc); +@@ -532,19 +525,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, + update_replicas_list(trans, &r.e, sectors); + } + +-#define do_mark_fn(fn, c, pos, flags, ...) 
\ +-({ \ +- int gc, ret = 0; \ +- \ +- percpu_rwsem_assert_held(&c->mark_lock); \ +- \ +- for (gc = 0; gc < 2 && !ret; gc++) \ +- if (!gc == !(flags & BTREE_TRIGGER_GC) || \ +- (gc && gc_visited(c, pos))) \ +- ret = fn(c, __VA_ARGS__, gc); \ +- ret; \ +-}) +- + void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator) + { +@@ -655,17 +635,27 @@ static int bch2_mark_alloc(struct btree_trans *trans, + overflow; \ + }) + +-static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, +- size_t b, enum bch_data_type data_type, +- unsigned sectors, bool gc) ++void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type data_type, ++ unsigned sectors, struct gc_pos pos, ++ unsigned flags) + { +- struct bucket *g = __bucket(ca, b, gc); ++ struct bucket *g; + struct bucket_mark old, new; + bool overflow; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); + BUG_ON(data_type != BCH_DATA_sb && + data_type != BCH_DATA_journal); + ++ /* ++ * Backup superblock might be past the end of our normal usable space: ++ */ ++ if (b >= ca->mi.nbuckets) ++ return; ++ ++ percpu_down_read(&c->mark_lock); ++ g = __bucket(ca, b, true); + old = bucket_cmpxchg(g, new, ({ + new.data_type = data_type; + overflow = checked_add(new.dirty_sectors, sectors); +@@ -683,32 +673,8 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + bch2_data_types[old.data_type ?: data_type], + old.dirty_sectors, sectors); + +- if (c) +- bch2_dev_usage_update(c, ca, old, new, 0, gc); +- +- return 0; +-} +- +-void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, +- size_t b, enum bch_data_type type, +- unsigned sectors, struct gc_pos pos, +- unsigned flags) +-{ +- BUG_ON(type != BCH_DATA_sb && +- type != BCH_DATA_journal); +- +- /* +- * Backup superblock might be past the end of our normal usable space: +- */ +- if (b >= ca->mi.nbuckets) +- return; +- +- if (likely(c)) { +- do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, +- ca, b, type, sectors); +- } else { +- __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); +- } ++ bch2_dev_usage_update(c, ca, old, new, 0, true); ++ percpu_up_read(&c->mark_lock); + } + + static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index ff8b81fa6772..6c1771b5828a 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -770,11 +770,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + long b; + + if (new_fs) { +- if (c) +- percpu_down_read(&c->mark_lock); + b = bch2_bucket_alloc_new_fs(ca); + if (b < 0) { +- percpu_up_read(&c->mark_lock); + ret = -ENOSPC; + goto err; + } +@@ -822,14 +819,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + if (c) + spin_unlock(&c->journal.lock); + +- if (new_fs) { +- bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, +- ca->mi.bucket_size, +- gc_phase(GC_PHASE_SB), +- 0); +- if (c) +- percpu_up_read(&c->mark_lock); +- } else { ++ if (!new_fs) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_mark_metadata_bucket(&trans, ca, + b, BCH_DATA_journal, +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index bd552a942ac6..9916fad292be 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1383,9 +1383,6 @@ int bch2_fs_initialize(struct bch_fs *c) + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + bch2_write_super(c); + } +- +- 
for_each_online_member(ca, c, i) +- bch2_mark_dev_superblock(c, ca, 0); + mutex_unlock(&c->sb_lock); + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index b0dee3406ab3..425cb9bdd991 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1600,8 +1600,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + struct bch_dev *ca = NULL; + struct bch_sb_field_members *mi; + struct bch_member dev_mi; +- struct bucket_array *buckets; +- struct bucket *g; + unsigned dev_idx, nr_devices, u64s; + int ret; + +@@ -1631,20 +1629,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + return ret; + } + +- /* +- * We want to allocate journal on the new device before adding the new +- * device to the filesystem because allocating after we attach requires +- * spinning up the allocator thread, and the allocator thread requires +- * doing btree writes, which if the existing devices are RO isn't going +- * to work +- * +- * So we have to mark where the superblocks are, but marking allocated +- * data normally updates the filesystem usage too, so we have to mark, +- * allocate the journal, reset all the marks, then remark after we +- * attach... +- */ +- bch2_mark_dev_superblock(NULL, ca, 0); +- + err = "journal alloc failed"; + ret = bch2_dev_journal_alloc(ca); + if (ret) +@@ -1705,16 +1689,6 @@ have_slot: + + bch2_dev_usage_journal_reserve(c); + +- /* +- * Clear marks before marking transactionally in the btree, so that +- * per-device accounting gets done correctly: +- */ +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); +- for_each_bucket(g, buckets) +- atomic64_set(&g->_mark.v, 0); +- up_read(&ca->bucket_lock); +- + err = "error marking superblock"; + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) +-- +cgit v1.2.3 + + +From b987065fd8d4bc09f3bfb6d77aab3275412bc4b6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 30 Dec 2021 19:30:42 -0500 +Subject: bcachefs: Don't start allocator threads too early + +If the allocator threads start before journal replay has finished +replaying alloc keys, journal replay might overwrite the allocator's +btree updates. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 3 ++- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/recovery.c | 9 ++++++++- + 3 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index ee5becb1c820..bde9cf17e224 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -900,7 +900,8 @@ static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) + static bool allocator_thread_running(struct bch_dev *ca) + { + unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && +- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) ++ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) && ++ test_bit(BCH_FS_ALLOC_REPLAY_DONE, &ca->fs->flags) + ? 
ALLOCATOR_running + : ALLOCATOR_stopped; + alloc_thread_set_state(ca, state); +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index c243d9239892..c067141b04be 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -509,6 +509,7 @@ enum { + BCH_FS_INITIAL_GC_DONE, + BCH_FS_INITIAL_GC_UNFIXED, + BCH_FS_TOPOLOGY_REPAIR_DONE, ++ BCH_FS_ALLOC_REPLAY_DONE, + BCH_FS_BTREE_INTERIOR_REPLAY_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 9916fad292be..d0ceac0f2b39 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -567,9 +567,10 @@ static int bch2_journal_replay(struct bch_fs *c, + struct journal_keys keys) + { + struct journal *j = &c->journal; ++ struct bch_dev *ca; + struct journal_key *i; + u64 seq; +- int ret; ++ int ret, idx; + + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); + +@@ -593,6 +594,11 @@ static int bch2_journal_replay(struct bch_fs *c, + } + } + ++ /* Now we can start the allocator threads: */ ++ set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); ++ for_each_member_device(ca, c, idx) ++ bch2_wake_allocator(ca); ++ + /* + * Next replay updates to interior btree nodes: + */ +@@ -1391,6 +1397,7 @@ int bch2_fs_initialize(struct bch_fs *c) + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + ++ set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); + set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); + set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); + +-- +cgit v1.2.3 + + +From 965b309febddc7b030bcc2f12fb1089f3a46b612 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 24 Dec 2021 03:08:06 -0500 +Subject: bcachefs: Kill ptr_bucket_mark() + +Only used in one place, we can just delete it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.h | 20 +++++++------------- + 1 file changed, 7 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index ac9b554acd86..50e28fb78e9b 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -91,18 +91,6 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k, + return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; + } + +-static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, +- const struct bch_extent_ptr *ptr) +-{ +- struct bucket_mark m; +- +- rcu_read_lock(); +- m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); +- rcu_read_unlock(); +- +- return m; +-} +- + static inline int gen_cmp(u8 a, u8 b) + { + return (s8) (a - b); +@@ -122,7 +110,13 @@ static inline int gen_after(u8 a, u8 b) + static inline u8 ptr_stale(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) + { +- return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); ++ u8 ret; ++ ++ rcu_read_lock(); ++ ret = gen_after(PTR_BUCKET(ca, ptr, 0)->mark.gen, ptr->gen); ++ rcu_read_unlock(); ++ ++ return ret; + } + + /* bucket gc marks */ +-- +cgit v1.2.3 + + +From 569ba3e5ce9ea1f7ec094edaa892a32f02e2ff6b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 23 Dec 2021 21:35:28 -0500 +Subject: bcachefs: bch2_journal_key_insert() no longer transfers ownership + +bch2_journal_key_insert() used to assume that the key passed to it was +allocated with kmalloc(), and on success took ownership. This patch +deletes that behaviour, making it more similar to +bch2_trans_update()/bch2_trans_commit(). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 12 +++++------- + fs/bcachefs/buckets.c | 18 ++++++------------ + fs/bcachefs/recovery.c | 35 +++++++++++++++++++++-------------- + fs/bcachefs/recovery.h | 2 ++ + 4 files changed, 34 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 6d8d61e8cf46..271e0b15b151 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -169,7 +169,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) + new->v.min_key = new_min; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + +- ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); + return ret; +@@ -198,7 +198,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) + new->k.p = new_max; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + +- ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); + return ret; +@@ -690,7 +690,7 @@ found: + } + } + +- ret = bch2_journal_key_insert(c, btree_id, level, new); ++ ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) + kfree(new); + else +@@ -1390,8 +1390,7 @@ static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, + } + + ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); +- if (ret) +- kfree(new); ++ kfree(new); + } + fsck_err: + return ret; +@@ -1516,8 +1515,7 @@ inconsistent: + stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); + + ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i); +- if (ret) +- kfree(new); ++ kfree(new); + } + fsck_err: + return ret; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 2e2f12563e47..600bea4c274b 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1213,19 +1213,13 @@ not_found: + */ + if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { +- struct bkey_i_error *new; ++ struct bkey_i_error new; + +- new = kmalloc(sizeof(*new), GFP_KERNEL); +- if (!new) { +- bch_err(c, "%s: error allocating new key", __func__); +- return -ENOMEM; +- } +- +- bkey_init(&new->k); +- new->k.type = KEY_TYPE_error; +- new->k.p = p.k->p; +- new->k.size = p.k->size; +- ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); ++ bkey_init(&new.k); ++ new.k.type = KEY_TYPE_error; ++ new.k.p = p.k->p; ++ new.k.size = p.k->size; ++ ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i); + } + fsck_err: + return ret; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index d0ceac0f2b39..118d536b4376 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -109,8 +109,8 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign + iter->idx++; + } + +-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, +- unsigned level, struct bkey_i *k) ++int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) + { + struct journal_key n = { + .btree_id = id, +@@ -157,27 +157,34 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + return 0; + } + +-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, +- unsigned level, struct bpos pos) ++int 
bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) + { +- struct bkey_i *whiteout = +- kmalloc(sizeof(struct bkey), GFP_KERNEL); ++ struct bkey_i *n; + int ret; + +- if (!whiteout) { +- bch_err(c, "%s: error allocating new key", __func__); ++ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); ++ if (!n) + return -ENOMEM; +- } +- +- bkey_init(&whiteout->k); +- whiteout->k.p = pos; + +- ret = bch2_journal_key_insert(c, id, level, whiteout); ++ bkey_copy(n, k); ++ ret = bch2_journal_key_insert_take(c, id, level, n); + if (ret) +- kfree(whiteout); ++ kfree(n); + return ret; + } + ++int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bpos pos) ++{ ++ struct bkey_i whiteout; ++ ++ bkey_init(&whiteout.k); ++ whiteout.k.p = pos; ++ ++ return bch2_journal_key_insert(c, id, level, &whiteout); ++} ++ + static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) + { + struct journal_key *k = iter->idx - iter->keys->nr +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index e45c70b3693f..1504e0bdb940 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -31,6 +31,8 @@ struct btree_and_journal_iter { + } last; + }; + ++int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); + int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); + int bch2_journal_key_delete(struct bch_fs *, enum btree_id, +-- +cgit v1.2.3 + + +From e8cbf1475721402618d9e8bcc587a1391444759f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 18:40:15 -0500 +Subject: bcachefs: Fix bch2_journal_meta() + +This patch ensures that the journal entry written gets written as flush +entry, which is important for the shutdown path - the last entry written +needs to be a flush entry. 
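A toy model of the flush requirement described above, under the assumption stated in the commit message (the names here are simplified stand-ins; the real fields are journal_buf.must_flush and journal.flushed_seq_ondisk touched in the hunks below):

#include <stdbool.h>
#include <stdint.h>

struct jbuf { uint64_t seq; bool must_flush; };

static uint64_t flushed_seq_ondisk;

/*
 * Only a flush entry advances the on-disk flush point; a no-flush entry
 * leaves flushed_seq_ondisk where it was.
 */
static void journal_write(const struct jbuf *buf)
{
	if (buf->must_flush)
		flushed_seq_ondisk = buf->seq;
}

/*
 * A metadata-only write must therefore force a flush, so that the last
 * entry written before shutdown is a flush entry.
 */
static void journal_meta(struct jbuf *buf)
{
	buf->must_flush = true;
	journal_write(buf);
}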
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 5 +++++ + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/sysfs.c | 5 ----- + 3 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 6c1771b5828a..415cc53cf1db 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -642,6 +642,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) + + int bch2_journal_meta(struct journal *j) + { ++ struct journal_buf *buf; + struct journal_res res; + int ret; + +@@ -651,6 +652,10 @@ int bch2_journal_meta(struct journal *j) + if (ret) + return ret; + ++ buf = j->buf + (res.seq & JOURNAL_BUF_MASK); ++ buf->must_flush = true; ++ set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 1fcc7ed5a776..e161e86e48c4 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1448,7 +1448,7 @@ void bch2_journal_write(struct closure *cl) + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + +- if (journal_entry_empty(jset)) ++ if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) + j->last_empty_seq = le64_to_cpu(jset->seq); + + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 341ba3fdd6fc..1d1e2c6fc2e2 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -131,7 +131,6 @@ do { \ + return strtoi_h(buf, &var) ?: (ssize_t) size; \ + } while (0) + +-write_attribute(trigger_journal_flush); + write_attribute(trigger_gc); + write_attribute(prune_cache); + rw_attribute(btree_gc_periodic); +@@ -482,9 +481,6 @@ STORE(bch2_fs) + + /* Debugging: */ + +- if (attr == &sysfs_trigger_journal_flush) +- bch2_journal_meta(&c->journal); +- + if (attr == &sysfs_trigger_gc) { + /* + * Full gc is currently incompatible with btree key cache: +@@ -574,7 +570,6 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_io_timers_read, + &sysfs_io_timers_write, + +- &sysfs_trigger_journal_flush, + &sysfs_trigger_gc, + &sysfs_prune_cache, + +-- +cgit v1.2.3 + + +From fe8027bd0fe16a497ff84c3cc20f2af39900fbcc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 20:13:47 -0500 +Subject: bcachefs: Use BTREE_ITER_NOPRESERVE in bch2_btree_iter_verify_ret() + +This fixes a transaction path overflow. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 76c6fa96e3f9..a6cc0ca51293 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -746,6 +746,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k + k.k->p.snapshot)); + + bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, ++ BTREE_ITER_NOPRESERVE| + BTREE_ITER_ALL_SNAPSHOTS); + prev = bch2_btree_iter_prev(©); + if (!prev.k) +-- +cgit v1.2.3 + + +From 22a4523205ad99bf24c194b4d4b098fd6eec8f81 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 20 Dec 2021 16:55:49 -0500 +Subject: bcachefs: Journal initialization fixes + +This fixes a rare bug when mounting & unmounting RO - flushing a clean +filesystem that never went RO should be a no op. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 415cc53cf1db..2082aa52089f 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -990,10 +990,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + j->replay_journal_seq = last_seq; + j->replay_journal_seq_end = cur_seq; + j->last_seq_ondisk = last_seq; ++ j->flushed_seq_ondisk = cur_seq - 1; + j->pin.front = last_seq; + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + ++ if (list_empty(journal_entries)) ++ j->last_empty_seq = cur_seq - 1; ++ + fifo_for_each_entry_ptr(p, &j->pin, seq) + journal_pin_list_init(p, 1); + +@@ -1006,6 +1010,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + if (seq < last_seq) + continue; + ++ if (journal_entry_empty(&i->j)) ++ j->last_empty_seq = le64_to_cpu(i->j.seq); ++ + p = journal_seq_pin(j, seq); + + p->devs.nr = 0; +@@ -1013,6 +1020,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + } + ++ if (list_empty(journal_entries)) ++ j->last_empty_seq = cur_seq; ++ + spin_lock(&j->lock); + + set_bit(JOURNAL_STARTED, &j->flags); +-- +cgit v1.2.3 + + +From 98125935c1237ac329439802b4808856fc78a88f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 26 Dec 2021 16:59:36 -0500 +Subject: bcachefs: Delete some obsolete journal_seq_blacklist code + +Since metadata version bcachefs_metadata_version_btree_ptr_sectors_written, +we haven't needed the journal seq blacklist mechanism for ignoring +blacklisted btree node writes - we now only need it for ignoring journal +entries that were written after the newest flush journal entry, and then +we only need to keep those blacklist entries around until journal replay +is finished. + +That means we can delete the code for scanning btree nodes to GC +journal_seq_blacklist entries. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/journal_seq_blacklist.c | 78 ------------------------------------- + fs/bcachefs/journal_seq_blacklist.h | 2 - + fs/bcachefs/recovery.c | 26 ++++++------- + fs/bcachefs/super.c | 5 --- + 5 files changed, 11 insertions(+), 101 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index c067141b04be..0a062df0d67a 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -750,7 +750,6 @@ struct bch_fs { + /* JOURNAL SEQ BLACKLIST */ + struct journal_seq_blacklist_table * + journal_seq_blacklist_table; +- struct work_struct journal_seq_blacklist_gc_work; + + /* ALLOCATOR */ + spinlock_t freelist_lock; +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index 79bc0e49389b..10bd23e969d2 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -235,81 +235,3 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { + .validate = bch2_sb_journal_seq_blacklist_validate, + .to_text = bch2_sb_journal_seq_blacklist_to_text + }; +- +-void bch2_blacklist_entries_gc(struct work_struct *work) +-{ +- struct bch_fs *c = container_of(work, struct bch_fs, +- journal_seq_blacklist_gc_work); +- struct journal_seq_blacklist_table *t; +- struct bch_sb_field_journal_seq_blacklist *bl; +- struct journal_seq_blacklist_entry *src, *dst; +- struct btree_trans trans; +- unsigned i, nr, new_nr; +- int ret; +- +- bch2_trans_init(&trans, c, 0, 0); +- +- for (i = 0; i < BTREE_ID_NR; i++) { +- struct btree_iter iter; +- struct btree *b; +- +- bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, +- 0, 0, BTREE_ITER_PREFETCH); +-retry: +- bch2_trans_begin(&trans); +- +- b = bch2_btree_iter_peek_node(&iter); +- +- while (!(ret = PTR_ERR_OR_ZERO(b)) && +- b && +- !test_bit(BCH_FS_STOPPING, &c->flags)) +- b = bch2_btree_iter_next_node(&iter); +- +- if (ret == -EINTR) +- goto retry; +- +- bch2_trans_iter_exit(&trans, &iter); +- } +- +- bch2_trans_exit(&trans); +- if (ret) +- return; +- +- mutex_lock(&c->sb_lock); +- bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); +- if (!bl) +- goto out; +- +- nr = blacklist_nr_entries(bl); +- dst = bl->start; +- +- t = c->journal_seq_blacklist_table; +- BUG_ON(nr != t->nr); +- +- for (src = bl->start, i = eytzinger0_first(t->nr); +- src < bl->start + nr; +- src++, i = eytzinger0_next(i, nr)) { +- BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); +- BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); +- +- if (t->entries[i].dirty) +- *dst++ = *src; +- } +- +- new_nr = dst - bl->start; +- +- bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); +- +- if (new_nr != nr) { +- bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, +- new_nr ? 
sb_blacklist_u64s(new_nr) : 0); +- BUG_ON(new_nr && !bl); +- +- if (!new_nr) +- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); +- +- bch2_write_super(c); +- } +-out: +- mutex_unlock(&c->sb_lock); +-} +diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h +index afb886ec8e25..b4f876a04586 100644 +--- a/fs/bcachefs/journal_seq_blacklist.h ++++ b/fs/bcachefs/journal_seq_blacklist.h +@@ -17,6 +17,4 @@ int bch2_blacklist_table_initialize(struct bch_fs *); + + extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; + +-void bch2_blacklist_entries_gc(struct work_struct *); +- + #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 118d536b4376..ffa8ab933a11 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1162,16 +1162,6 @@ use_clean: + if (ret) + goto err; + +- /* +- * After an unclean shutdown, skip then next few journal sequence +- * numbers as they may have been referenced by btree writes that +- * happened before their corresponding journal writes - those btree +- * writes need to be ignored, by skipping and blacklisting the next few +- * journal sequence numbers: +- */ +- if (!c->sb.clean) +- journal_seq += 8; +- + if (blacklist_seq != journal_seq) { + ret = bch2_journal_seq_blacklist_add(c, + blacklist_seq, journal_seq); +@@ -1309,7 +1299,8 @@ use_clean: + } + + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || +- !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { ++ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || ++ le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { + struct bch_move_stats stats; + + bch_move_stats_init(&stats, "recovery"); +@@ -1326,6 +1317,15 @@ use_clean: + } + + mutex_lock(&c->sb_lock); ++ /* ++ * With journal replay done, we can clear the journal seq blacklist ++ * table: ++ */ ++ BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); ++ BUG_ON(le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written); ++ ++ bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0); ++ + if (c->opts.version_upgrade) { + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); +@@ -1349,10 +1349,6 @@ use_clean: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +- if (c->journal_seq_blacklist_table && +- c->journal_seq_blacklist_table->nr > 128) +- queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); +- + ret = 0; + out: + set_bit(BCH_FS_FSCK_DONE, &c->flags); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 425cb9bdd991..df6bffeffe06 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -528,8 +528,6 @@ void __bch2_fs_stop(struct bch_fs *c) + + set_bit(BCH_FS_STOPPING, &c->flags); + +- cancel_work_sync(&c->journal_seq_blacklist_gc_work); +- + down_write(&c->state_lock); + bch2_fs_read_only(c); + up_write(&c->state_lock); +@@ -692,9 +690,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + spin_lock_init(&c->btree_write_error_lock); + +- INIT_WORK(&c->journal_seq_blacklist_gc_work, +- bch2_blacklist_entries_gc); +- + INIT_LIST_HEAD(&c->journal_entries); + INIT_LIST_HEAD(&c->journal_iters); + +-- +cgit v1.2.3 + + +From 6c213f23796212b00da40b05967e913a8c293d4e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 
21:14:49 -0500 +Subject: bcachefs: Use bch2_alloc_sectors_append_ptrs() + +This code was duplicated in init_append_extent(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 14 ++++++++------ + fs/bcachefs/alloc_foreground.h | 2 +- + fs/bcachefs/btree_update_interior.c | 2 +- + fs/bcachefs/io.c | 22 ++-------------------- + 4 files changed, 12 insertions(+), 28 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 4603328eea52..590453cfa5bf 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -866,7 +866,8 @@ err: + * as allocated out of @ob + */ + void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, +- struct bkey_i *k, unsigned sectors) ++ struct bkey_i *k, unsigned sectors, ++ bool cached) + + { + struct open_bucket *ob; +@@ -877,13 +878,14 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); +- struct bch_extent_ptr tmp = ob->ptr; ++ struct bch_extent_ptr ptr = ob->ptr; + +- tmp.cached = !ca->mi.durability && +- wp->type == BCH_DATA_user; ++ ptr.cached = cached || ++ (!ca->mi.durability && ++ wp->type == BCH_DATA_user); + +- tmp.offset += ca->mi.bucket_size - ob->sectors_free; +- bch2_bkey_append_ptr(k, tmp); ++ ptr.offset += ca->mi.bucket_size - ob->sectors_free; ++ bch2_bkey_append_ptr(k, ptr); + + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index 2e81712ba8d1..d8888785676d 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -106,7 +106,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *, + struct closure *); + + void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, +- struct bkey_i *, unsigned); ++ struct bkey_i *, unsigned, bool); + void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); + + void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 9dca694b6ee3..6872e56b5c41 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -236,7 +236,7 @@ retry: + } + + bkey_btree_ptr_v2_init(&tmp.k); +- bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c)); ++ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); + + bch2_open_bucket_get(c, wp, &ob); + bch2_alloc_sectors_done(c, wp); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 1b954dff5d15..50b90b728a6d 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -665,11 +665,7 @@ static void init_append_extent(struct bch_write_op *op, + { + struct bch_fs *c = op->c; + struct bkey_i_extent *e; +- struct open_bucket *ob; +- unsigned i; + +- BUG_ON(crc.compressed_size > wp->sectors_free); +- wp->sectors_free -= crc.compressed_size; + op->pos.offset += crc.uncompressed_size; + + e = bkey_extent_init(op->insert_keys.top); +@@ -682,22 +678,8 @@ static void init_append_extent(struct bch_write_op *op, + crc.nonce) + bch2_extent_crc_append(&e->k_i, crc); + +- open_bucket_for_each(c, &wp->ptrs, ob, i) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); +- union bch_extent_entry *end = +- bkey_val_end(bkey_i_to_s(&e->k_i)); +- +- end->ptr = ob->ptr; +- end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; +- end->ptr.cached = 
!ca->mi.durability || +- (op->flags & BCH_WRITE_CACHED) != 0; +- end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; +- +- e->k.u64s++; +- +- BUG_ON(crc.compressed_size > ob->sectors_free); +- ob->sectors_free -= crc.compressed_size; +- } ++ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size, ++ op->flags & BCH_WRITE_CACHED); + + bch2_keylist_push(&op->insert_keys); + } +-- +cgit v1.2.3 + + +From ab81ef4cd0bc330eff7281650bb462e6fc4be795 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 21:21:46 -0500 +Subject: bcachefs: Refactor open_bucket code + +Prep work for adding a hash table of open buckets - instead of embedding +a bch_extent_ptr, we need to refer to the bucket directly so that we're +not calling sector_to_bucket() in the hash table lookup code, which has +an expensive divide. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 21 +-------- + fs/bcachefs/alloc_background.h | 2 - + fs/bcachefs/alloc_foreground.c | 100 ++++++++++++++++++++++++++--------------- + fs/bcachefs/alloc_foreground.h | 5 ++- + fs/bcachefs/alloc_types.h | 9 ++-- + fs/bcachefs/ec.c | 8 ++-- + fs/bcachefs/journal.c | 2 +- + fs/bcachefs/sysfs.c | 3 +- + 8 files changed, 83 insertions(+), 67 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index bde9cf17e224..37297bc66edb 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1066,7 +1066,7 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list && +- ob->ptr.dev == ca->dev_idx) ++ ob->dev == ca->dev_idx) + ret = true; + spin_unlock(&ob->lock); + } +@@ -1213,22 +1213,3 @@ void bch2_fs_allocator_background_init(struct bch_fs *c) + { + spin_lock_init(&c->freelist_lock); + } +- +-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +-{ +- struct open_bucket *ob; +- +- for (ob = c->open_buckets; +- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); +- ob++) { +- spin_lock(&ob->lock); +- if (ob->valid && !ob->on_partial_list) { +- pr_buf(out, "%zu ref %u type %s\n", +- ob - c->open_buckets, +- atomic_read(&ob->pin), +- bch2_data_types[ob->type]); +- } +- spin_unlock(&ob->lock); +- } +- +-} +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index e3cdb8bc1dd8..86b64177b3d0 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -142,6 +142,4 @@ int bch2_dev_allocator_start(struct bch_dev *); + int bch2_alloc_write_all(struct bch_fs *, unsigned); + void bch2_fs_allocator_background_init(struct bch_fs *); + +-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +- + #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 590453cfa5bf..87189f692a1a 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -45,7 +45,7 @@ + + void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) + { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (ob->ec) { + bch2_ec_bucket_written(c, ob); +@@ -55,9 +55,9 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) + percpu_down_read(&c->mark_lock); + spin_lock(&ob->lock); + +- bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false); ++ bch2_mark_alloc_bucket(c, ca, ob->bucket, false); + ob->valid 
= false; +- ob->type = 0; ++ ob->data_type = 0; + + spin_unlock(&ob->lock); + percpu_up_read(&c->mark_lock); +@@ -81,8 +81,7 @@ void bch2_open_bucket_write_error(struct bch_fs *c, + unsigned i; + + open_bucket_for_each(c, obs, ob, i) +- if (ob->ptr.dev == dev && +- ob->ec) ++ if (ob->dev == dev && ob->ec) + bch2_ec_bucket_cancel(c, ob); + } + +@@ -95,18 +94,19 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); +- ob->type = 0; ++ ob->data_type = 0; + + c->open_buckets_nr_free--; + return ob; + } + ++ + static void open_bucket_free_unused(struct bch_fs *c, + struct write_point *wp, + struct open_bucket *ob) + { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); +- bool may_realloc = wp->type == BCH_DATA_user; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ bool may_realloc = wp->data_type == BCH_DATA_user; + + BUG_ON(ca->open_buckets_partial_nr > + ARRAY_SIZE(ca->open_buckets_partial)); +@@ -133,11 +133,13 @@ static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) + struct open_bucket *ob; + unsigned i; + ++ rcu_read_lock(); + open_bucket_for_each(c, obs, ob, i) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + +- BUG_ON(ptr_stale(ca, &ob->ptr)); ++ BUG_ON(bucket(ca, ob->bucket)->mark.gen != ob->gen); + } ++ rcu_read_unlock(); + #endif + } + +@@ -246,13 +248,9 @@ out: + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->alloc_reserve = reserve; +- ob->ptr = (struct bch_extent_ptr) { +- .type = 1 << BCH_EXTENT_ENTRY_ptr, +- .gen = bucket(ca, b)->mark.gen, +- .offset = bucket_to_sector(ca, b), +- .dev = ca->dev_idx, +- }; +- ++ ob->dev = ca->dev_idx; ++ ob->gen = bucket(ca, b)->mark.gen; ++ ob->bucket = b; + spin_unlock(&ob->lock); + + if (c->blocked_allocate_open_bucket) { +@@ -333,9 +331,9 @@ static void add_new_bucket(struct bch_fs *c, + struct open_bucket *ob) + { + unsigned durability = +- bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; ++ bch_dev_bkey_exists(c, ob->dev)->mi.durability; + +- __clear_bit(ob->ptr.dev, devs_may_alloc->d); ++ __clear_bit(ob->dev, devs_may_alloc->d); + *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) + ? 
durability : 1; + *have_cache |= !durability; +@@ -445,13 +443,13 @@ static int bucket_alloc_from_stripe(struct bch_fs *c, + continue; + + ob = c->open_buckets + h->s->blocks[ec_idx]; +- if (ob->ptr.dev == devs_sorted.devs[i] && ++ if (ob->dev == devs_sorted.devs[i] && + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + goto got_bucket; + } + goto out_put_head; + got_bucket: +- ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ca = bch_dev_bkey_exists(c, ob->dev); + + ob->ec_idx = ec_idx; + ob->ec = h->s; +@@ -481,12 +479,12 @@ static void get_buckets_from_writepoint(struct bch_fs *c, + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (*nr_effective < nr_replicas && +- test_bit(ob->ptr.dev, devs_may_alloc->d) && ++ test_bit(ob->dev, devs_may_alloc->d) && + (ca->mi.durability || +- (wp->type == BCH_DATA_user && !*have_cache)) && ++ (wp->data_type == BCH_DATA_user && !*have_cache)) && + (ob->ec || !need_ec)) { + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, +@@ -518,7 +516,7 @@ static int open_bucket_add_buckets(struct bch_fs *c, + unsigned i; + + rcu_read_lock(); +- devs = target_rw_devs(c, wp->type, target); ++ devs = target_rw_devs(c, wp->data_type, target); + rcu_read_unlock(); + + /* Don't allocate from devices we already have pointers to: */ +@@ -526,7 +524,7 @@ static int open_bucket_add_buckets(struct bch_fs *c, + __clear_bit(devs_have->devs[i], devs.d); + + open_bucket_for_each(c, ptrs, ob, i) +- __clear_bit(ob->ptr.dev, devs.d); ++ __clear_bit(ob->dev, devs.d); + + if (erasure_code) { + if (!ec_open_bucket(c, ptrs)) { +@@ -586,7 +584,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, + unsigned i, j; + + open_bucket_for_each(c, obs, ob, i) { +- bool drop = !ca || ob->ptr.dev == ca->dev_idx; ++ bool drop = !ca || ob->dev == ca->dev_idx; + + if (!drop && ob->ec) { + mutex_lock(&ob->ec->lock); +@@ -595,7 +593,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, + continue; + + ob2 = c->open_buckets + ob->ec->blocks[j]; +- drop |= ob2->ptr.dev == ca->dev_idx; ++ drop |= ob2->dev == ca->dev_idx; + } + mutex_unlock(&ob->ec->lock); + } +@@ -779,11 +777,11 @@ retry: + + wp = writepoint_find(c, write_point.v); + +- if (wp->type == BCH_DATA_user) ++ if (wp->data_type == BCH_DATA_user) + ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; + + /* metadata may not allocate on cache devices: */ +- if (wp->type != BCH_DATA_user) ++ if (wp->data_type != BCH_DATA_user) + have_cache = true; + + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { +@@ -861,6 +859,20 @@ err: + } + } + ++struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ return (struct bch_extent_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_ptr, ++ .gen = ob->gen, ++ .dev = ob->dev, ++ .offset = bucket_to_sector(ca, ob->bucket) + ++ ca->mi.bucket_size - ++ ob->sectors_free, ++ }; ++} ++ + /* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob +@@ -877,14 +889,13 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + wp->sectors_free -= sectors; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); +- struct bch_extent_ptr ptr = ob->ptr; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ struct bch_extent_ptr ptr = 
bch2_ob_ptr(c, ob); + + ptr.cached = cached || + (!ca->mi.durability && +- wp->type == BCH_DATA_user); ++ wp->data_type == BCH_DATA_user); + +- ptr.offset += ca->mi.bucket_size - ob->sectors_free; + bch2_bkey_append_ptr(k, ptr); + + BUG_ON(sectors > ob->sectors_free); +@@ -915,7 +926,7 @@ static inline void writepoint_init(struct write_point *wp, + enum bch_data_type type) + { + mutex_init(&wp->lock); +- wp->type = type; ++ wp->data_type = type; + } + + void bch2_fs_allocator_foreground_init(struct bch_fs *c) +@@ -952,3 +963,22 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) + writepoint_hash(c, wp->write_point)); + } + } ++ ++void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list) { ++ pr_buf(out, "%zu ref %u type %s\n", ++ ob - c->open_buckets, ++ atomic_read(&ob->pin), ++ bch2_data_types[ob->data_type]); ++ } ++ spin_unlock(&ob->lock); ++ } ++ ++} +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index d8888785676d..39d8ae5bbb96 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -85,7 +85,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { +- ob->type = wp->type; ++ ob->data_type = wp->data_type; + atomic_inc(&ob->pin); + ob_push(c, ptrs, ob); + } +@@ -105,6 +105,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *, + unsigned, + struct closure *); + ++struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); + void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, + struct bkey_i *, unsigned, bool); + void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); +@@ -127,4 +128,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp + + void bch2_fs_allocator_foreground_init(struct bch_fs *); + ++void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); ++ + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index 4a1cd8b73d16..bd173c7c334b 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -49,12 +49,15 @@ struct open_bucket { + * the block in the stripe this open_bucket corresponds to: + */ + u8 ec_idx; +- u8 type; ++ enum bch_data_type data_type:3; + unsigned valid:1; + unsigned on_partial_list:1; + int alloc_reserve:3; ++ + unsigned sectors_free; +- struct bch_extent_ptr ptr; ++ u8 dev; ++ u8 gen; ++ u64 bucket; + struct ec_stripe_new *ec; + }; + +@@ -74,7 +77,7 @@ struct write_point { + struct mutex lock; + u64 last_used; + unsigned long write_point; +- enum bch_data_type type; ++ enum bch_data_type data_type; + + /* calculated based on how many pointers we're actually going to use: */ + unsigned sectors_free; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 7d78672dd017..20e44e572288 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1063,7 +1063,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) + if (!ob) + return NULL; + +- ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ca = bch_dev_bkey_exists(c, ob->dev); + offset = ca->mi.bucket_size - ob->sectors_free; + + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); +@@ -1318,7 +1318,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct 
ec_stripe_head *h, + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; +- h->s->new_stripe.key.v.ptrs[j] = ob->ptr; ++ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + +@@ -1346,7 +1346,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; +- h->s->new_stripe.key.v.ptrs[j] = ob->ptr; ++ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + +@@ -1535,7 +1535,7 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) + continue; + + ob = c->open_buckets + h->s->blocks[i]; +- if (ob->ptr.dev == ca->dev_idx) ++ if (ob->dev == ca->dev_idx) + goto found; + } + goto unlock; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 2082aa52089f..e0017dcf3312 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -790,7 +790,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + goto err; + } + +- b = sector_to_bucket(ca, ob->ptr.offset); ++ b = ob->bucket; + } + + if (c) +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 1d1e2c6fc2e2..07e9b214bcb5 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -10,6 +10,7 @@ + + #include "bcachefs.h" + #include "alloc_background.h" ++#include "alloc_foreground.h" + #include "sysfs.h" + #include "btree_cache.h" + #include "btree_io.h" +@@ -723,7 +724,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + memset(nr, 0, sizeof(nr)); + + for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) +- nr[c->open_buckets[i].type]++; ++ nr[c->open_buckets[i].data_type]++; + + pr_buf(out, + "\t\t buckets\t sectors fragmented\n" +-- +cgit v1.2.3 + + +From c6f9146875b581466a9110e25214357e4be8724c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 21:43:29 -0500 +Subject: bcachefs: Put open_buckets in a hashtable + +This is so that the copygc code doesn't have to refer to +bucket_mark.owned_by_allocator - assisting in getting rid of the in +memory bucket array. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 30 ++++++++++++++++++++++++++++-- + fs/bcachefs/alloc_foreground.h | 24 ++++++++++++++++++++++++ + fs/bcachefs/alloc_types.h | 4 ++++ + fs/bcachefs/bcachefs.h | 2 ++ + 4 files changed, 58 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 87189f692a1a..e9b0fd5f89a2 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -43,6 +43,29 @@ + * reference _after_ doing the index update that makes its allocation reachable. 
+ */ + ++static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) ++{ ++ open_bucket_idx_t idx = ob - c->open_buckets; ++ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); ++ ++ ob->hash = *slot; ++ *slot = idx; ++} ++ ++static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) ++{ ++ open_bucket_idx_t idx = ob - c->open_buckets; ++ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); ++ ++ while (*slot != idx) { ++ BUG_ON(!*slot); ++ slot = &c->open_buckets[*slot].hash; ++ } ++ ++ *slot = ob->hash; ++ ob->hash = 0; ++} ++ + void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) + { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); +@@ -63,6 +86,8 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) + percpu_up_read(&c->mark_lock); + + spin_lock(&c->freelist_lock); ++ bch2_open_bucket_hash_remove(c, ob); ++ + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + +@@ -100,7 +125,6 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) + return ob; + } + +- + static void open_bucket_free_unused(struct bch_fs *c, + struct write_point *wp, + struct open_bucket *ob) +@@ -253,6 +277,9 @@ out: + ob->bucket = b; + spin_unlock(&ob->lock); + ++ ca->nr_open_buckets++; ++ bch2_open_bucket_hash_add(c, ob); ++ + if (c->blocked_allocate_open_bucket) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate_open_bucket], +@@ -267,7 +294,6 @@ out: + c->blocked_allocate = 0; + } + +- ca->nr_open_buckets++; + spin_unlock(&c->freelist_lock); + + bch2_wake_allocator(ca); +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index 39d8ae5bbb96..d466bda9afc8 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -91,6 +91,30 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, + } + } + ++static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, ++ unsigned dev, u64 bucket) ++{ ++ return c->open_buckets_hash + ++ (jhash_3words(dev, bucket, bucket >> 32, 0) & ++ (OPEN_BUCKETS_COUNT - 1)); ++} ++ ++static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) ++{ ++ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); ++ ++ while (slot) { ++ struct open_bucket *ob = &c->open_buckets[slot]; ++ ++ if (ob->dev == dev && ob->bucket == bucket) ++ return true; ++ ++ slot = ob->hash; ++ } ++ ++ return false; ++} ++ + int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, + struct dev_stripe_state *, struct bch_devs_mask *, + unsigned, unsigned *, bool *, enum alloc_reserve, +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index bd173c7c334b..409232e3d998 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -37,12 +37,16 @@ typedef FIFO(long) alloc_fifo; + #define WRITE_POINT_HASH_NR 32 + #define WRITE_POINT_MAX 32 + ++/* ++ * 0 is never a valid open_bucket_idx_t: ++ */ + typedef u16 open_bucket_idx_t; + + struct open_bucket { + spinlock_t lock; + atomic_t pin; + open_bucket_idx_t freelist; ++ open_bucket_idx_t hash; + + /* + * When an open bucket has an ec_stripe attached, this is the index of +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 0a062df0d67a..88a3eb17c686 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -756,10 +756,12 @@ struct bch_fs { + struct closure_waitlist freelist_wait; + u64 blocked_allocate; + u64 
blocked_allocate_open_bucket; ++ + open_bucket_idx_t open_buckets_freelist; + open_bucket_idx_t open_buckets_nr_free; + struct closure_waitlist open_buckets_wait; + struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; + + struct write_point btree_write_point; + struct write_point rebalance_write_point; +-- +cgit v1.2.3 + + +From 4ed7bd8fe1e2e6b20a1e93bd56ad0173c9d041bd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 22:37:19 -0500 +Subject: bcachefs: Separate out gc_bucket() + +Since the main in memory bucket array is going away, we don't want to be +calling bucket() or __bucket() when what we want is the GC in-memory +bucket. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 14 +++++----- + fs/bcachefs/buckets.c | 76 ++++++++++++++++++++++++-------------------------- + fs/bcachefs/buckets.h | 18 +++++++++--- + 3 files changed, 57 insertions(+), 51 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 271e0b15b151..88fbb0ead39d 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -504,8 +504,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + */ + bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); +- struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); +- struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); ++ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); ++ struct bucket *g2 = PTR_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + + if (fsck_err_on(!g->gen_valid, c, +@@ -643,14 +643,14 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); + + ptr->gen = g->mark.gen; + } + } else { + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); + + (ptr->cached && +@@ -737,7 +737,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + ptrs = bch2_bkey_ptrs_c(*k); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); + + if (gen_after(g->oldest_gen, ptr->gen)) + g->oldest_gen = ptr->gen; +@@ -1753,7 +1753,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ struct bucket *g = PTR_BUCKET(ca, ptr); + + if (gen_after(g->mark.gen, ptr->gen) > 16) { + percpu_up_read(&c->mark_lock); +@@ -1763,7 +1763,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ struct bucket *g = PTR_BUCKET(ca, ptr); + + if (gen_after(g->gc_gen, ptr->gen)) + g->gc_gen = ptr->gen; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 600bea4c274b..98f7b6924e59 100644 +--- 
a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -340,13 +340,6 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m) + : m.data_type; + } + +-static bool bucket_became_unavailable(struct bucket_mark old, +- struct bucket_mark new) +-{ +- return is_available_bucket(old) && +- !is_available_bucket(new); +-} +- + static inline void account_bucket(struct bch_fs_usage *fs_usage, + struct bch_dev_usage *dev_usage, + enum bch_data_type type, +@@ -655,7 +648,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + return; + + percpu_down_read(&c->mark_lock); +- g = __bucket(ca, b, true); ++ g = gc_bucket(ca, b); + old = bucket_cmpxchg(g, new, ({ + new.data_type = data_type; + overflow = checked_add(new.dirty_sectors, sectors); +@@ -775,17 +768,18 @@ static int mark_stripe_bucket(struct btree_trans *trans, + enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; + s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; +- bool gc = flags & BTREE_TRIGGER_GC; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g; + struct bucket_mark new, old; + char buf[200]; + int ret = 0; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + /* * XXX doesn't handle deletion */ + + percpu_down_read(&c->mark_lock); +- g = PTR_BUCKET(ca, ptr, gc); ++ g = PTR_GC_BUCKET(ca, ptr); + + if (g->mark.dirty_sectors || + (g->stripe && g->stripe != k.k->p.offset)) { +@@ -819,7 +813,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + +- bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); ++ bch2_dev_usage_update(c, ca, old, new, journal_seq, true); + err: + percpu_up_read(&c->mark_lock); + +@@ -855,7 +849,6 @@ static int bch2_mark_pointer(struct btree_trans *trans, + s64 sectors, enum bch_data_type data_type, + unsigned flags) + { +- bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + struct bucket_mark old, new; +@@ -865,8 +858,10 @@ static int bch2_mark_pointer(struct btree_trans *trans, + u64 v; + int ret = 0; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + percpu_down_read(&c->mark_lock); +- g = PTR_BUCKET(ca, &p.ptr, gc); ++ g = PTR_GC_BUCKET(ca, &p.ptr); + + v = atomic64_read(&g->_mark.v); + do { +@@ -896,9 +891,7 @@ static int bch2_mark_pointer(struct btree_trans *trans, + old.v.counter, + new.v.counter)) != old.v.counter); + +- bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); +- +- BUG_ON(!gc && bucket_became_unavailable(old, new)); ++ bch2_dev_usage_update(c, ca, old, new, journal_seq, true); + err: + percpu_up_read(&c->mark_lock); + +@@ -912,37 +905,35 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, + s64 sectors, + unsigned flags) + { +- bool gc = flags & BTREE_TRIGGER_GC; + struct bch_fs *c = trans->c; + struct bch_replicas_padded r; ++ struct gc_stripe *m; + +- if (!gc) { +- BUG(); +- } else { +- struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); +- +- if (!m) +- return -ENOMEM; ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); + +- spin_lock(&c->ec_stripes_heap_lock); ++ m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); + +- if (!m || !m->alive) { +- spin_unlock(&c->ec_stripes_heap_lock); +- bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", +- (u64) p.idx); +- bch2_inconsistent_error(c); +- return -EIO; +- } ++ if (!m) ++ return -ENOMEM; + +- m->block_sectors[p.block] += sectors; ++ 
spin_lock(&c->ec_stripes_heap_lock); + +- r = m->r; ++ if (!m || !m->alive) { + spin_unlock(&c->ec_stripes_heap_lock); +- +- r.e.data_type = data_type; +- update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc); ++ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ bch2_inconsistent_error(c); ++ return -EIO; + } + ++ m->block_sectors[p.block] += sectors; ++ ++ r = m->r; ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ r.e.data_type = data_type; ++ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); ++ + return 0; + } + +@@ -950,7 +941,6 @@ static int bch2_mark_extent(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) + { +- bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; +@@ -968,6 +958,8 @@ static int bch2_mark_extent(struct btree_trans *trans, + bool stale; + int ret; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; +@@ -988,7 +980,7 @@ static int bch2_mark_extent(struct btree_trans *trans, + if (p.ptr.cached) { + if (!stale) { + ret = update_cached_sectors(c, k, p.ptr.dev, +- disk_sectors, journal_seq, gc); ++ disk_sectors, journal_seq, true); + if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); + return ret; +@@ -1013,7 +1005,7 @@ static int bch2_mark_extent(struct btree_trans *trans, + } + + if (r.e.nr_devs) { +- ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, gc); ++ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); + if (ret) { + char buf[200]; + +@@ -1164,6 +1156,8 @@ static int bch2_mark_reservation(struct btree_trans *trans, + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; +@@ -1238,6 +1232,8 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + int ret = 0; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { + idx -= le32_to_cpu(p.v->front_pad); + end += le32_to_cpu(p.v->back_pad); +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 50e28fb78e9b..d2d82394a86d 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -53,6 +53,11 @@ static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) + return buckets->b + b; + } + ++static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) ++{ ++ return __bucket(ca, b, true); ++} ++ + static inline struct bucket *bucket(struct bch_dev *ca, size_t b) + { + return __bucket(ca, b, false); +@@ -75,10 +80,15 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, + } + + static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, +- const struct bch_extent_ptr *ptr, +- bool gc) ++ const struct bch_extent_ptr *ptr) ++{ ++ return bucket(ca, PTR_BUCKET_NR(ca, ptr)); ++} ++ ++static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) + { +- return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); ++ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); + } + + static inline enum bch_data_type ptr_data_type(const struct bkey *k, +@@ -113,7 +123,7 @@ static inline u8 ptr_stale(struct bch_dev *ca, + u8 ret; 
+ + rcu_read_lock(); +- ret = gen_after(PTR_BUCKET(ca, ptr, 0)->mark.gen, ptr->gen); ++ ret = gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen); + rcu_read_unlock(); + + return ret; +-- +cgit v1.2.3 + + +From 86fd06c4fb5d98491ed91918684383d2015dd73c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 19:55:34 -0500 +Subject: bcachefs: New in-memory array for bucket gens + +The main in-memory bucket array is going away, but we'll still need to +keep bucket generations in memory, at least for now - ptr_stale() needs +to be an efficient operation. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 ++ + fs/bcachefs/alloc_foreground.c | 4 ++-- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/buckets.c | 43 ++++++++++++++++++++++++++++++++++++++++-- + fs/bcachefs/buckets.h | 20 +++++++++++++++++++- + fs/bcachefs/buckets_types.h | 7 +++++++ + 6 files changed, 72 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 37297bc66edb..ba518581a433 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -354,6 +354,7 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) + g = bucket(ca, k.k->p.offset); + u = bch2_alloc_unpack(k); + ++ *bucket_gen(ca, k.k->p.offset) = u.gen; + g->_mark.gen = u.gen; + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; +@@ -748,6 +749,7 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { + BUG_ON(m.data_type); + bucket_cmpxchg(g, m, m.gen++); ++ *bucket_gen(ca, b) = m.gen; + percpu_up_read(&c->mark_lock); + goto out; + } +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index e9b0fd5f89a2..0a634125dc90 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -161,7 +161,7 @@ static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) + open_bucket_for_each(c, obs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + +- BUG_ON(bucket(ca, ob->bucket)->mark.gen != ob->gen); ++ BUG_ON(*bucket_gen(ca, ob->bucket) != ob->gen); + } + rcu_read_unlock(); + #endif +@@ -273,7 +273,7 @@ out: + ob->sectors_free = ca->mi.bucket_size; + ob->alloc_reserve = reserve; + ob->dev = ca->dev_idx; +- ob->gen = bucket(ca, b)->mark.gen; ++ ob->gen = *bucket_gen(ca, b); + ob->bucket = b; + spin_unlock(&ob->lock); + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 88a3eb17c686..b21151ea73fb 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -445,6 +445,7 @@ struct bch_dev { + * Or rcu_read_lock(), but only for ptr_stale(): + */ + struct bucket_array __rcu *buckets[2]; ++ struct bucket_gens *bucket_gens; + unsigned long *buckets_nouse; + struct rw_semaphore bucket_lock; + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 98f7b6924e59..2b084c435011 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -531,6 +531,20 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + BUG_ON(owned_by_allocator == old.owned_by_allocator); + } + ++static inline u8 bkey_alloc_gen(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_alloc: ++ return bkey_s_c_to_alloc(k).v->gen; ++ case KEY_TYPE_alloc_v2: ++ return bkey_s_c_to_alloc_v2(k).v->gen; ++ case KEY_TYPE_alloc_v3: ++ return bkey_s_c_to_alloc_v3(k).v->gen; ++ default: ++ return 0; ++ } ++} ++ + static 
int bch2_mark_alloc(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +@@ -569,9 +583,13 @@ static int bch2_mark_alloc(struct btree_trans *trans, + if (new.k->p.offset >= ca->mi.nbuckets) + return 0; + ++ u = bch2_alloc_unpack(new); ++ + percpu_down_read(&c->mark_lock); ++ if (!gc && u.gen != bkey_alloc_gen(old)) ++ *bucket_gen(ca, new.k->p.offset) = u.gen; ++ + g = __bucket(ca, new.k->p.offset, gc); +- u = bch2_alloc_unpack(new); + + old_m = bucket_cmpxchg(g, m, ({ + m.gen = u.gen; +@@ -2126,9 +2144,18 @@ static void buckets_free_rcu(struct rcu_head *rcu) + buckets->nbuckets * sizeof(struct bucket)); + } + ++static void bucket_gens_free_rcu(struct rcu_head *rcu) ++{ ++ struct bucket_gens *buckets = ++ container_of(rcu, struct bucket_gens, rcu); ++ ++ kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets); ++} ++ + int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + { + struct bucket_array *buckets = NULL, *old_buckets = NULL; ++ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; + unsigned long *buckets_nouse = NULL; + alloc_fifo free[RESERVE_NR]; + alloc_fifo free_inc; +@@ -2152,6 +2179,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + + nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO)) || ++ !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, ++ GFP_KERNEL|__GFP_ZERO)) || + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)) || +@@ -2164,6 +2193,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = nbuckets; ++ bucket_gens->first_bucket = ca->mi.first_bucket; ++ bucket_gens->nbuckets = nbuckets; + + bch2_copygc_stop(c); + +@@ -2174,6 +2205,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + } + + old_buckets = bucket_array(ca); ++ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); + + if (resize) { + size_t n = min(buckets->nbuckets, old_buckets->nbuckets); +@@ -2181,13 +2213,18 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + memcpy(buckets->b, + old_buckets->b, + n * sizeof(struct bucket)); ++ memcpy(bucket_gens->b, ++ old_bucket_gens->b, ++ n); + memcpy(buckets_nouse, + ca->buckets_nouse, + BITS_TO_LONGS(n) * sizeof(unsigned long)); + } + + rcu_assign_pointer(ca->buckets[0], buckets); +- buckets = old_buckets; ++ rcu_assign_pointer(ca->bucket_gens, bucket_gens); ++ buckets = old_buckets; ++ bucket_gens = old_bucket_gens; + + swap(ca->buckets_nouse, buckets_nouse); + +@@ -2221,6 +2258,8 @@ err: + free_fifo(&free[i]); + kvpfree(buckets_nouse, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); ++ if (bucket_gens) ++ call_rcu(&old_buckets->rcu, bucket_gens_free_rcu); + if (buckets) + call_rcu(&old_buckets->rcu, buckets_free_rcu); + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index d2d82394a86d..45c6d230f242 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -63,6 +63,24 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) + return __bucket(ca, b, false); + } + ++static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) ++{ ++ return rcu_dereference_check(ca->bucket_gens, ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ 
lockdep_is_held(&ca->bucket_lock)); ++ ++} ++ ++static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) ++{ ++ struct bucket_gens *gens = bucket_gens(ca); ++ ++ BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); ++ return gens->b + b; ++} ++ + /* + * bucket_gc_gen() returns the difference between the bucket's current gen and + * the oldest gen of any pointer into that bucket in the btree. +@@ -123,7 +141,7 @@ static inline u8 ptr_stale(struct bch_dev *ca, + u8 ret; + + rcu_read_lock(); +- ret = gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen); ++ ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + rcu_read_unlock(); + + return ret; +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index b2de2995c5e7..18bca269b750 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -52,6 +52,13 @@ struct bucket_array { + struct bucket b[]; + }; + ++struct bucket_gens { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ u8 b[]; ++}; ++ + struct bch_dev_usage { + u64 buckets_ec; + u64 buckets_unavailable; +-- +cgit v1.2.3 + + +From cd9337c22f83c2976797b3ec38a60f2561c8ce8c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 26 Dec 2021 21:41:09 -0500 +Subject: bcachefs: Fix allocator + journal interaction + +The allocator needs to wait until the last update touching a bucket has +been commited before writing to it again. However, the code was checking +against the last dirty journal sequence number, not the last flushed +journal sequence number. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 +- + fs/bcachefs/buckets.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index ba518581a433..c24fb4b1d7b2 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -594,7 +594,7 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) + buckets = bucket_array(ca); + ca->alloc_heap.used = 0; + now = atomic64_read(&c->io_clock[READ].now); +- last_seq_ondisk = c->journal.last_seq_ondisk; ++ last_seq_ondisk = c->journal.flushed_seq_ondisk; + + /* + * Find buckets with lowest read priority, by building a maxheap sorted +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 2b084c435011..eb6cf4b56890 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -50,7 +50,7 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, + void bch2_bucket_seq_cleanup(struct bch_fs *c) + { + u64 journal_seq = atomic64_read(&c->journal.seq); +- u16 last_seq_ondisk = c->journal.last_seq_ondisk; ++ u16 last_seq_ondisk = c->journal.flushed_seq_ondisk; + struct bch_dev *ca; + struct bucket_array *buckets; + struct bucket *g; +-- +cgit v1.2.3 + + +From 7ee29f9b8f214b774fa9340dbc23a911c9f2a6ff Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 26 Dec 2021 22:27:10 -0500 +Subject: bcachefs: Kill bch2_ec_mem_alloc() + +bch2_ec_mem_alloc() was only used by GC, and there's no real need to +preallocate the stripes radix tree since we can cope fine with memory +allocation failure when we use the radix tree. This deletes a fair bit +of code, and it's also needed for the upcoming patch because +bch2_btree_iter_peek_prev() won't be working before journal replay +completes (and using it was incorrect previously, as well). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 7 ------- + fs/bcachefs/buckets.c | 6 +++++- + fs/bcachefs/ec.c | 40 ---------------------------------------- + fs/bcachefs/ec.h | 2 -- + 4 files changed, 5 insertions(+), 50 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 88fbb0ead39d..9e3213b90439 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1270,7 +1270,6 @@ static int bch2_gc_start(struct bch_fs *c, + { + struct bch_dev *ca = NULL; + unsigned i; +- int ret; + + BUG_ON(c->usage_gc); + +@@ -1302,12 +1301,6 @@ static int bch2_gc_start(struct bch_fs *c, + } + } + +- ret = bch2_ec_mem_alloc(c, true); +- if (ret) { +- bch_err(c, "error allocating ec gc mem"); +- return ret; +- } +- + percpu_down_write(&c->mark_lock); + + /* +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index eb6cf4b56890..4d20312c1d73 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1090,7 +1090,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, + spin_unlock(&c->ec_stripes_heap_lock); + } + } else { +- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, idx); ++ struct gc_stripe *m = ++ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); ++ ++ if (!m) ++ return -ENOMEM; + + /* + * This will be wrong when we bring back runtime gc: we should +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 20e44e572288..3cccd1faade5 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1608,46 +1608,6 @@ int bch2_stripes_read(struct bch_fs *c) + return ret; + } + +-int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) +-{ +- struct btree_trans trans; +- struct btree_iter iter; +- struct bkey_s_c k; +- size_t i, idx = 0; +- int ret = 0; +- +- bch2_trans_init(&trans, c, 0, 0); +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0); +- +- k = bch2_btree_iter_prev(&iter); +- ret = bkey_err(k); +- if (!ret && k.k) +- idx = k.k->p.offset + 1; +- +- bch2_trans_iter_exit(&trans, &iter); +- bch2_trans_exit(&trans); +- if (ret) +- return ret; +- +- if (!idx) +- return 0; +- +- if (!gc && +- !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), +- GFP_KERNEL)) +- return -ENOMEM; +-#if 0 +- ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); +-#else +- for (i = 0; i < idx; i++) +- if (!gc +- ? 
!genradix_ptr_alloc(&c->stripes, i, GFP_KERNEL) +- : !genradix_ptr_alloc(&c->gc_stripes, i, GFP_KERNEL)) +- return -ENOMEM; +-#endif +- return 0; +-} +- + void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) + { + ec_stripes_heap *h = &c->ec_stripes_heap; +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 468141072bb4..78d468c7680a 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -217,8 +217,6 @@ void bch2_stripes_heap_start(struct bch_fs *); + + int bch2_stripes_read(struct bch_fs *); + +-int bch2_ec_mem_alloc(struct bch_fs *, bool); +- + void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); + void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); + +-- +cgit v1.2.3 + + +From fddee0759b81f580e6a0ee9ee0d576daf3e6d530 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Dec 2021 19:58:12 -0500 +Subject: bcachefs: Update sysfs compression_stats for snapshots + + - BTREE_ITER_ALL_SNAPSHOTS flag is required here + - change it to also walk the reflink btree + - change it to accumulate stats for all pointers in an extent + - change it to account for incompressible data + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/sysfs.c | 85 +++++++++++++++++++++++++++++++++++------------------ + 1 file changed, 57 insertions(+), 28 deletions(-) + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 07e9b214bcb5..6d1596322ee2 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -266,8 +266,12 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; +- u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, ++ enum btree_id id; ++ u64 nr_uncompressed_extents = 0, + nr_compressed_extents = 0, ++ nr_incompressible_extents = 0, ++ uncompressed_sectors = 0, ++ incompressible_sectors = 0, + compressed_sectors_compressed = 0, + compressed_sectors_uncompressed = 0; + int ret; +@@ -277,47 +281,72 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c + + bch2_trans_init(&trans, c, 0, 0); + +- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) +- if (k.k->type == KEY_TYPE_extent) { +- struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ if (!((1U << id) & BTREE_ID_HAS_PTRS)) ++ continue; ++ ++ for_each_btree_key(&trans, iter, id, POS_MIN, ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; +- +- extent_for_each_ptr_decode(e, p, entry) { +- if (!crc_is_compressed(p.crc)) { +- nr_uncompressed_extents++; +- uncompressed_sectors += e.k->size; +- } else { +- nr_compressed_extents++; ++ bool compressed = false, uncompressed = false, incompressible = false; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ switch (p.crc.compression_type) { ++ case BCH_COMPRESSION_TYPE_none: ++ uncompressed = true; ++ uncompressed_sectors += k.k->size; ++ break; ++ case BCH_COMPRESSION_TYPE_incompressible: ++ incompressible = true; ++ incompressible_sectors += k.k->size; ++ break; ++ default: + compressed_sectors_compressed += + p.crc.compressed_size; + compressed_sectors_uncompressed += + p.crc.uncompressed_size; ++ compressed = true; ++ break; + } +- +- /* only looking at the first ptr */ +- break; + } ++ ++ if (incompressible) ++ nr_incompressible_extents++; ++ else if (uncompressed) ++ nr_uncompressed_extents++; ++ else if (compressed) ++ 
nr_compressed_extents++; + } +- bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_iter_exit(&trans, &iter); ++ } + + bch2_trans_exit(&trans); ++ + if (ret) + return ret; + +- pr_buf(out, +- "uncompressed data:\n" +- " nr extents: %llu\n" +- " size (bytes): %llu\n" +- "compressed data:\n" +- " nr extents: %llu\n" +- " compressed size (bytes): %llu\n" +- " uncompressed size (bytes): %llu\n", +- nr_uncompressed_extents, +- uncompressed_sectors << 9, +- nr_compressed_extents, +- compressed_sectors_compressed << 9, +- compressed_sectors_uncompressed << 9); ++ pr_buf(out, "uncompressed:\n"); ++ pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents); ++ pr_buf(out, " size: "); ++ bch2_hprint(out, uncompressed_sectors << 9); ++ pr_buf(out, "\n"); ++ ++ pr_buf(out, "compressed:\n"); ++ pr_buf(out, " nr extents: %llu\n", nr_compressed_extents); ++ pr_buf(out, " compressed size: "); ++ bch2_hprint(out, compressed_sectors_compressed << 9); ++ pr_buf(out, "\n"); ++ pr_buf(out, " uncompressed size: "); ++ bch2_hprint(out, compressed_sectors_uncompressed << 9); ++ pr_buf(out, "\n"); ++ ++ pr_buf(out, "incompressible:\n"); ++ pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents); ++ pr_buf(out, " size: "); ++ bch2_hprint(out, incompressible_sectors << 9); ++ pr_buf(out, "\n"); + return 0; + } + +-- +cgit v1.2.3 + + +From a55d368f18587b9ba8675e3a8bc83f40b14c7410 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Dec 2021 20:05:07 -0500 +Subject: bcachefs: Run scan_old_btree_nodes after version upgrade + +In the recovery path, we scan for old btree nodes if we don't have +certain compat bits set. If we do this, we should be doing it after we +upgraded to the newest on disk format. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 41 ++++++++++++++++++++--------------------- + 1 file changed, 20 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index ffa8ab933a11..f6dc557b7439 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1298,33 +1298,14 @@ use_clean: + bch_verbose(c, "quotas done"); + } + +- if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || +- !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || +- le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { +- struct bch_move_stats stats; +- +- bch_move_stats_init(&stats, "recovery"); +- +- bch_info(c, "scanning for old btree nodes"); +- ret = bch2_fs_read_write(c); +- if (ret) +- goto err; +- +- ret = bch2_scan_old_btree_nodes(c, &stats); +- if (ret) +- goto err; +- bch_info(c, "scanning for old btree nodes done"); +- } +- + mutex_lock(&c->sb_lock); + /* + * With journal replay done, we can clear the journal seq blacklist + * table: + */ + BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); +- BUG_ON(le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written); +- +- bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0); ++ if (le16_to_cpu(c->sb.version_min) >= bcachefs_metadata_version_btree_ptr_sectors_written) ++ bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0); + + if (c->opts.version_upgrade) { + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); +@@ -1349,6 +1330,24 @@ use_clean: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || ++ le16_to_cpu(c->sb.version_min) < 
bcachefs_metadata_version_btree_ptr_sectors_written) { ++ struct bch_move_stats stats; ++ ++ bch_move_stats_init(&stats, "recovery"); ++ ++ bch_info(c, "scanning for old btree nodes"); ++ ret = bch2_fs_read_write(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_scan_old_btree_nodes(c, &stats); ++ if (ret) ++ goto err; ++ bch_info(c, "scanning for old btree nodes done"); ++ } ++ + ret = 0; + out: + set_bit(BCH_FS_FSCK_DONE, &c->flags); +-- +cgit v1.2.3 + + +From ac6c771fa048e8a5eae2a6419638e11c36beabff Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Dec 2021 20:45:07 -0500 +Subject: bcachefs: Add a tracepoint for the btree cache shrinker + +This is to help with diagnosing why the btree node can doesn't seem to +be shrinking - we've had issues in the past with granularity/batch size, +since btree nodes are so big. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 11 +++++++++-- + include/trace/events/bcachefs.h | 28 ++++++++++++++++++++++++++++ + 2 files changed, 37 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index b02f93bdfd1f..2788ba17e031 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -276,6 +276,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + unsigned long touched = 0; + unsigned long freed = 0; + unsigned i, flags; ++ unsigned long ret = SHRINK_STOP; + + if (bch2_btree_shrinker_disabled) + return SHRINK_STOP; +@@ -284,7 +285,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + if (sc->gfp_mask & __GFP_FS) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) +- return -1; ++ goto out_norestore; + + flags = memalloc_nofs_save(); + +@@ -359,8 +360,14 @@ restart: + + mutex_unlock(&bc->lock); + out: ++ ret = (unsigned long) freed * btree_pages(c); + memalloc_nofs_restore(flags); +- return (unsigned long) freed * btree_pages(c); ++out_norestore: ++ trace_btree_cache_scan(sc->nr_to_scan, ++ sc->nr_to_scan / btree_pages(c), ++ btree_cache_can_free(bc), ++ ret); ++ return ret; + } + + static unsigned long bch2_btree_cache_count(struct shrinker *shrink, +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index fce3146378f9..5a409ee19d93 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -318,6 +318,34 @@ DEFINE_EVENT(btree_node, btree_set_root, + TP_ARGS(c, b) + ); + ++TRACE_EVENT(btree_cache_scan, ++ TP_PROTO(unsigned long nr_to_scan_pages, ++ unsigned long nr_to_scan_nodes, ++ unsigned long can_free_nodes, ++ long ret), ++ TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, nr_to_scan_pages ) ++ __field(unsigned long, nr_to_scan_nodes ) ++ __field(unsigned long, can_free_nodes ) ++ __field(long, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->nr_to_scan_pages = nr_to_scan_pages; ++ __entry->nr_to_scan_nodes = nr_to_scan_nodes; ++ __entry->can_free_nodes = can_free_nodes; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li", ++ __entry->nr_to_scan_pages, ++ __entry->nr_to_scan_nodes, ++ __entry->can_free_nodes, ++ __entry->ret) ++); ++ + /* Garbage collection */ + + DEFINE_EVENT(btree_node, btree_gc_rewrite_node, +-- +cgit v1.2.3 + + +From ecd724b243285e11ecfcdd1ba7effcab2fff908b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Dec 2021 23:51:48 -0500 +Subject: bcachefs: bch2_journal_noflush_seq() + +Add 
bch2_journal_noflush_seq(), for telling the journal that entries +before a given sequence number should not be flushes - to be used by an +upcoming allocator optimization. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 38 ++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/journal.h | 1 + + fs/bcachefs/journal_io.c | 7 ++++--- + 3 files changed, 43 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index e0017dcf3312..158df42e5e10 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -705,6 +705,44 @@ int bch2_journal_flush(struct journal *j) + return bch2_journal_flush_seq(j, seq); + } + ++/* ++ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before ++ * @seq ++ */ ++bool bch2_journal_noflush_seq(struct journal *j, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ u64 unwritten_seq; ++ bool ret = false; ++ ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) ++ return false; ++ ++ if (seq <= c->journal.flushed_seq_ondisk) ++ return false; ++ ++ spin_lock(&j->lock); ++ if (seq <= c->journal.flushed_seq_ondisk) ++ goto out; ++ ++ for (unwritten_seq = last_unwritten_seq(j); ++ unwritten_seq < seq; ++ unwritten_seq++) { ++ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); ++ ++ /* journal write is already in flight, and was a flush write: */ ++ if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush) ++ goto out; ++ ++ buf->noflush = true; ++ } ++ ++ ret = true; ++out: ++ spin_unlock(&j->lock); ++ return ret; ++} ++ + /* block/unlock the journal: */ + + void bch2_journal_unblock(struct journal *j) +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index c39cbbf1bccd..b298873212d2 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -477,6 +477,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *); + + int bch2_journal_flush_seq(struct journal *, u64); + int bch2_journal_flush(struct journal *); ++bool bch2_journal_noflush_seq(struct journal *, u64); + int bch2_journal_meta(struct journal *); + + void bch2_journal_halt(struct journal *); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index e161e86e48c4..77201a0ee21d 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1399,9 +1399,10 @@ void bch2_journal_write(struct closure *cl) + + spin_lock(&j->lock); + if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && +- !w->must_flush && +- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && +- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { ++ (w->noflush || ++ (!w->must_flush && ++ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && ++ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); + jset->last_seq = 0; +-- +cgit v1.2.3 + + +From e045c90f119d51717952332eed8c04c1d420b8de Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 12 Feb 2022 02:32:11 -0500 +Subject: bcachefs: Always check for bucket reuse after read + +Since dirty extents can be moved or overwritten, it's not just cached +data that we need the ptr_stale() check in bc2h_read_endio for - this +fixes data checksum errors seen in the tiering ktest tests. 
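+
+The check itself is a generation-number comparison: every extent pointer
+records the bucket generation it was created against, the bucket's gen is
+bumped when the bucket is reused, and wraparound is handled with a signed
+8-bit difference. A minimal model of that comparison (plain C sketch, not
+the in-tree helpers):
+
+  #include <stdbool.h>
+  #include <stdint.h>
+
+  /* true if generation @a is newer than @b, allowing for u8 wraparound */
+  static inline bool gen_is_after(uint8_t a, uint8_t b)
+  {
+      return (int8_t) (a - b) > 0;
+  }
+
+  /* the read raced with bucket reuse if the bucket gen moved past ptr_gen */
+  static inline bool read_ptr_is_stale(uint8_t bucket_gen, uint8_t ptr_gen)
+  {
+      return gen_is_after(bucket_gen, ptr_gen);
+  }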
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 50b90b728a6d..cab65e44efa9 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1893,9 +1893,8 @@ static void bch2_read_endio(struct bio *bio) + return; + } + +- if (rbio->pick.ptr.cached && +- (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || +- ptr_stale(ca, &rbio->pick.ptr))) { ++ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ ptr_stale(ca, &rbio->pick.ptr)) { + atomic_long_inc(&c->read_realloc_races); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) +-- +cgit v1.2.3 + + +From 51680f12df456a1045972e201c35d8745f2e3434 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Dec 2021 23:56:13 -0500 +Subject: bcachefs: Optimize bucket reuse + +If the btree updates pointing to a bucket were never flushed by the +journal before the bucket became empty again, we can reuse the bucket +without a journal flush. + +This tweaks the tracking of journal sequence numbers in alloc keys to +implement this optimization: now, we only update the journal sequence +number in alloc keys on transitions to and from empty. When a bucket +becomes empty, we check if we can tell the journal not to flush entries +starting from when the bucket was used. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 64 ++++++++++++++++++++++----------------------------- + 1 file changed, 28 insertions(+), 36 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 4d20312c1d73..e6c804f48e6c 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -531,20 +531,6 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + BUG_ON(owned_by_allocator == old.owned_by_allocator); + } + +-static inline u8 bkey_alloc_gen(struct bkey_s_c k) +-{ +- switch (k.k->type) { +- case KEY_TYPE_alloc: +- return bkey_s_c_to_alloc(k).v->gen; +- case KEY_TYPE_alloc_v2: +- return bkey_s_c_to_alloc_v2(k).v->gen; +- case KEY_TYPE_alloc_v3: +- return bkey_s_c_to_alloc_v3(k).v->gen; +- default: +- return 0; +- } +-} +- + static int bch2_mark_alloc(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +@@ -552,16 +538,13 @@ static int bch2_mark_alloc(struct btree_trans *trans, + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; +- struct bkey_alloc_unpacked u; ++ struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); ++ struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); + struct bch_dev *ca; + struct bucket *g; + struct bucket_mark old_m, m; + int ret = 0; + +- /* We don't do anything for deletions - do we?: */ +- if (!bkey_is_alloc(new.k)) +- return 0; +- + /* + * alloc btree is read in by bch2_alloc_read, not gc: + */ +@@ -569,13 +552,24 @@ static int bch2_mark_alloc(struct btree_trans *trans, + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) + return 0; + +- if (flags & BTREE_TRIGGER_INSERT) { ++ if ((flags & BTREE_TRIGGER_INSERT) && ++ !old_u.data_type != !new_u.data_type && ++ new.k->type == KEY_TYPE_alloc_v3) { + struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; ++ u64 old_journal_seq = le64_to_cpu(v->journal_seq); + + BUG_ON(!journal_seq); +- BUG_ON(new.k->type != KEY_TYPE_alloc_v3); + +- v->journal_seq = cpu_to_le64(journal_seq); ++ /* ++ * If the btree updates referring to a bucket weren't flushed ++ * before the bucket became empty again, then the we don't have 
++ * to wait on a journal flush before we can reuse the bucket: ++ */ ++ new_u.journal_seq = !new_u.data_type && ++ (journal_seq == old_journal_seq || ++ bch2_journal_noflush_seq(&c->journal, old_journal_seq)) ++ ? 0 : journal_seq; ++ v->journal_seq = cpu_to_le64(new_u.journal_seq); + } + + ca = bch_dev_bkey_exists(c, new.k->p.inode); +@@ -583,20 +577,18 @@ static int bch2_mark_alloc(struct btree_trans *trans, + if (new.k->p.offset >= ca->mi.nbuckets) + return 0; + +- u = bch2_alloc_unpack(new); +- + percpu_down_read(&c->mark_lock); +- if (!gc && u.gen != bkey_alloc_gen(old)) +- *bucket_gen(ca, new.k->p.offset) = u.gen; ++ if (!gc && new_u.gen != old_u.gen) ++ *bucket_gen(ca, new.k->p.offset) = new_u.gen; + + g = __bucket(ca, new.k->p.offset, gc); + + old_m = bucket_cmpxchg(g, m, ({ +- m.gen = u.gen; +- m.data_type = u.data_type; +- m.dirty_sectors = u.dirty_sectors; +- m.cached_sectors = u.cached_sectors; +- m.stripe = u.stripe != 0; ++ m.gen = new_u.gen; ++ m.data_type = new_u.data_type; ++ m.dirty_sectors = new_u.dirty_sectors; ++ m.cached_sectors = new_u.cached_sectors; ++ m.stripe = new_u.stripe != 0; + + if (journal_seq) { + m.journal_seq_valid = 1; +@@ -606,12 +598,12 @@ static int bch2_mark_alloc(struct btree_trans *trans, + + bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); + +- g->io_time[READ] = u.read_time; +- g->io_time[WRITE] = u.write_time; +- g->oldest_gen = u.oldest_gen; ++ g->io_time[READ] = new_u.read_time; ++ g->io_time[WRITE] = new_u.write_time; ++ g->oldest_gen = new_u.oldest_gen; + g->gen_valid = 1; +- g->stripe = u.stripe; +- g->stripe_redundancy = u.stripe_redundancy; ++ g->stripe = new_u.stripe; ++ g->stripe_redundancy = new_u.stripe_redundancy; + percpu_up_read(&c->mark_lock); + + /* +-- +cgit v1.2.3 + + +From 03d15c0bc297b9581fef0c5839331a83fdec77fd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 28 Dec 2021 16:01:25 -0500 +Subject: bcachefs: bch2_hprint(): don't print decimal if conversion was exact + +There's places where we parse these numbers, and our parsing doesn't +cope with decimals currently - this is a hack to get the device_add path +working again where for the device blocksize there doesn't ever need to +be a decimal. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/util.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index 52de7c49cacb..0bbea332fcaa 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -114,7 +114,7 @@ void bch2_hprint(struct printbuf *buf, s64 v) + * 103 is magic: t is in the range [-1023, 1023] and we want + * to turn it into [-9, 9] + */ +- if (u && v < 100 && v > -100) ++ if (u && t && v < 100 && v > -100) + pr_buf(buf, ".%i", t / 103); + if (u) + pr_buf(buf, "%c", si_units[u]); +-- +cgit v1.2.3 + + +From e0d1bddf6f450d608155cd372e57cafd67f68650 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 28 Dec 2021 16:31:57 -0500 +Subject: bcachefs: Improve error messages in device add path + +This converts the error messages in the device add to a better style, +and adds some missing ones. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super.c | 44 +++++++++++++++++++++++++++----------------- + 1 file changed, 27 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index df6bffeffe06..1dbbf5231567 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1599,18 +1599,24 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + int ret; + + ret = bch2_read_super(path, &opts, &sb); +- if (ret) ++ if (ret) { ++ bch_err(c, "device add error: error reading super: %i", ret); + return ret; ++ } + + err = bch2_sb_validate(&sb); +- if (err) ++ if (err) { ++ bch_err(c, "device add error: error validating super: %s", err); + return -EINVAL; ++ } + + dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + + err = bch2_dev_may_add(sb.sb, c); +- if (err) ++ if (err) { ++ bch_err(c, "device add error: %s", err); + return -EINVAL; ++ } + + ca = __bch2_dev_alloc(c, &dev_mi); + if (!ca) { +@@ -1624,24 +1630,27 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + return ret; + } + +- err = "journal alloc failed"; + ret = bch2_dev_journal_alloc(ca); +- if (ret) ++ if (ret) { ++ bch_err(c, "device add error: journal alloc failed"); + goto err; ++ } + + down_write(&c->state_lock); + mutex_lock(&c->sb_lock); + +- err = "insufficient space in new superblock"; + ret = bch2_sb_from_fs(c, ca); +- if (ret) ++ if (ret) { ++ bch_err(c, "device add error: new device superblock too small"); + goto err_unlock; ++ } + + mi = bch2_sb_get_members(ca->disk_sb.sb); + + if (!bch2_sb_resize_members(&ca->disk_sb, + le32_to_cpu(mi->field.u64s) + + sizeof(dev_mi) / sizeof(u64))) { ++ bch_err(c, "device add error: new device superblock too small"); + ret = -ENOSPC; + goto err_unlock; + } +@@ -1654,7 +1663,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) + goto have_slot; + no_slot: +- err = "no slots available in superblock"; ++ bch_err(c, "device add error: already have maximum number of devices"); + ret = -ENOSPC; + goto err_unlock; + +@@ -1663,12 +1672,12 @@ have_slot: + u64s = (sizeof(struct bch_sb_field_members) + + sizeof(struct bch_member) * nr_devices) / sizeof(u64); + +- err = "no space in superblock for member info"; +- ret = -ENOSPC; +- + mi = bch2_sb_resize_members(&c->disk_sb, u64s); +- if (!mi) ++ if (!mi) { ++ bch_err(c, "device add error: no room in superblock for member info"); ++ ret = -ENOSPC; + goto err_unlock; ++ } + + /* success: */ + +@@ -1684,17 +1693,20 @@ have_slot: + + bch2_dev_usage_journal_reserve(c); + +- err = "error marking superblock"; + ret = bch2_trans_mark_dev_sb(c, ca); +- if (ret) ++ if (ret) { ++ bch_err(c, "device add error: error marking new superblock: %i", ret); + goto err_late; ++ } + + ca->new_fs_bucket_idx = 0; + + if (ca->mi.state == BCH_MEMBER_STATE_rw) { + ret = __bch2_dev_read_write(c, ca); +- if (ret) ++ if (ret) { ++ bch_err(c, "device add error: error going RW on new device: %i", ret); + goto err_late; ++ } + } + + up_write(&c->state_lock); +@@ -1707,11 +1719,9 @@ err: + if (ca) + bch2_dev_free(ca); + bch2_free_super(&sb); +- bch_err(c, "Unable to add device: %s", err); + return ret; + err_late: + up_write(&c->state_lock); +- bch_err(c, "Error going rw after adding device: %s", err); + return -EINVAL; + } + +-- +cgit v1.2.3 + + +From 262b1a10b68bcad49410b0c9caac31cb92b10f1f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 29 Dec 2021 11:27:47 -0500 +Subject: bcachefs: Fix keylist size in btree_update + +This fixes a buffer overrun, 
fortunately caught by a BUG_ON(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 8cf59cee6e4e..8dc86fa636d6 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -82,12 +82,12 @@ struct btree_update { + /* Nodes being freed: */ + struct keylist old_keys; + u64 _old_keys[BTREE_UPDATE_NODES_MAX * +- BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ BKEY_BTREE_PTR_U64s_MAX]; + + /* Nodes being added: */ + struct keylist new_keys; + u64 _new_keys[BTREE_UPDATE_NODES_MAX * +- BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ BKEY_BTREE_PTR_U64s_MAX]; + + /* New nodes, that will be made reachable by this update: */ + struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; +-- +cgit v1.2.3 + + +From e93c9acdb66fc9b7b57a135debe10d22115174f1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Dec 2021 21:28:50 -0500 +Subject: bcachefs: Add an error message for copygc spinning + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/movinggc.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 7b7eee9b1773..7cd1b0cf27e4 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -205,6 +205,11 @@ static int bch2_copygc(struct bch_fs *c) + up_read(&ca->bucket_lock); + } + ++ if (!h->used) { ++ bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!"); ++ return 0; ++ } ++ + /* + * Our btree node allocations also come out of RESERVE_MOVINGGC: + */ +-- +cgit v1.2.3 + + +From a1a427ce3c1bd9772ad9fe3946da76283462757f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 29 Dec 2021 13:49:34 -0500 +Subject: bcachefs: Add iter_flags arg to bch2_btree_delete_range() + +Will be used by the new snapshot tests, to pass in +BTREE_ITER_ALL_SNAPSHOTS. 
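+
+With the extra argument, a caller that genuinely wants to span every
+snapshot now has to say so explicitly. The call shape ends up as in the
+tests.c hunk below (repeated here only to illustrate the new parameter
+ordering):
+
+  ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
+                                POS_MIN, SPOS_MAX,
+                                BTREE_ITER_ALL_SNAPSHOTS,
+                                NULL);
+  if (ret)
+      bch_err(c, "error in seq_delete: %i", ret);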
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update.h | 2 +- + fs/bcachefs/btree_update_leaf.c | 6 ++++-- + fs/bcachefs/ec.c | 2 +- + fs/bcachefs/quota.c | 6 +++--- + fs/bcachefs/super.c | 2 +- + fs/bcachefs/tests.c | 12 +++++++----- + 6 files changed, 17 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 89f07e58f61b..16ebf1a2b1f9 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -63,7 +63,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, + int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + struct bpos, struct bpos, unsigned, u64 *); + int bch2_btree_delete_range(struct bch_fs *, enum btree_id, +- struct bpos, struct bpos, u64 *); ++ struct bpos, struct bpos, unsigned, u64 *); + + int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + struct btree *, unsigned); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 1966441b1a62..4573a7c2feb7 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1463,7 +1463,7 @@ retry: + */ + delete.k.p = iter.pos; + +- if (btree_node_type_is_extents(id)) { ++ if (iter.flags & BTREE_ITER_IS_EXTENTS) { + unsigned max_sectors = + KEY_SIZE_MAX & (~0 << trans->c->block_bits); + +@@ -1500,8 +1500,10 @@ retry: + */ + int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, ++ unsigned iter_flags, + u64 *journal_seq) + { + return bch2_trans_do(c, NULL, journal_seq, 0, +- bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq)); ++ bch2_btree_delete_range_trans(&trans, id, start, end, ++ iter_flags, journal_seq)); + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 3cccd1faade5..9a1751d4465b 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -677,7 +677,7 @@ static int ec_stripe_delete(struct bch_fs *c, size_t idx) + return bch2_btree_delete_range(c, BTREE_ID_stripes, + POS(0, idx), + POS(0, idx + 1), +- NULL); ++ 0, NULL); + } + + static void ec_stripe_delete_work(struct work_struct *work) +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 8f8f4b0accd6..54bb2a454a5e 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -570,7 +570,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_USR, 0), + POS(QTYP_USR + 1, 0), +- NULL); ++ 0, NULL); + if (ret) + return ret; + } +@@ -582,7 +582,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_GRP, 0), + POS(QTYP_GRP + 1, 0), +- NULL); ++ 0, NULL); + if (ret) + return ret; + } +@@ -594,7 +594,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_PRJ, 0), + POS(QTYP_PRJ + 1, 0), +- NULL); ++ 0, NULL); + if (ret) + return ret; + } +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 1dbbf5231567..3afa7ebd7ad8 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1478,7 +1478,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) + return bch2_btree_delete_range(c, BTREE_ID_alloc, + POS(ca->dev_idx, 0), + POS(ca->dev_idx + 1, 0), +- NULL); ++ 0, NULL); + } + + int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 
c42db4d1d6e3..16d67eb6d1c2 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -14,14 +14,14 @@ static void delete_test_keys(struct bch_fs *c) + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_extents, +- SPOS(0, 0, U32_MAX), +- SPOS(0, U64_MAX, U32_MAX), ++ POS_MIN, SPOS_MAX, ++ BTREE_ITER_ALL_SNAPSHOTS, + NULL); + BUG_ON(ret); + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, +- SPOS(0, 0, U32_MAX), +- SPOS(0, U64_MAX, U32_MAX), ++ POS_MIN, SPOS_MAX, ++ BTREE_ITER_ALL_SNAPSHOTS, + NULL); + BUG_ON(ret); + } +@@ -749,7 +749,9 @@ static int seq_delete(struct bch_fs *c, u64 nr) + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, +- SPOS(0, 0, U32_MAX), POS_MAX, NULL); ++ POS_MIN, SPOS_MAX, ++ BTREE_ITER_ALL_SNAPSHOTS, ++ NULL); + if (ret) + bch_err(c, "error in seq_delete: %i", ret); + return ret; +-- +cgit v1.2.3 + + +From 6a00ce91ff224c3a6bbcbbfb7d61624fa663b975 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 29 Dec 2021 15:55:25 -0500 +Subject: bcachefs: Journal replay does't resort main list of keys + +The upcoming BTREE_ITER_WITH_JOURNAL patch will require journal keys to +stay in sorted order, so the btree iterator code can overlay them over +btree keys. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 67 +++++++++++++++++++++++++++++++++----------------- + 1 file changed, 44 insertions(+), 23 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index f6dc557b7439..0b923037d236 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -561,8 +561,8 @@ static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) + + static int journal_sort_seq_cmp(const void *_l, const void *_r) + { +- const struct journal_key *l = _l; +- const struct journal_key *r = _r; ++ const struct journal_key *l = *((const struct journal_key **)_l); ++ const struct journal_key *r = *((const struct journal_key **)_r); + + return cmp_int(r->level, l->level) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: +@@ -570,19 +570,30 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) + bpos_cmp(l->k->k.p, r->k->k.p); + } + +-static int bch2_journal_replay(struct bch_fs *c, +- struct journal_keys keys) ++static int bch2_journal_replay(struct bch_fs *c) + { ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_key **keys_sorted, *k; + struct journal *j = &c->journal; + struct bch_dev *ca; +- struct journal_key *i; ++ unsigned idx; ++ size_t i; + u64 seq; +- int ret, idx; ++ int ret; ++ ++ keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); ++ if (!keys_sorted) ++ return -ENOMEM; + +- sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); ++ for (i = 0; i < keys->nr; i++) ++ keys_sorted[i] = &keys->d[i]; + +- if (keys.nr) +- replay_now_at(j, keys.journal_seq_base); ++ sort(keys_sorted, keys->nr, ++ sizeof(keys_sorted[0]), ++ journal_sort_seq_cmp, NULL); ++ ++ if (keys->nr) ++ replay_now_at(j, keys->journal_seq_base); + + seq = j->replay_journal_seq; + +@@ -590,12 +601,14 @@ static int bch2_journal_replay(struct bch_fs *c, + * First replay updates to the alloc btree - these will only update the + * btree key cache: + */ +- for_each_journal_key(keys, i) { ++ for (i = 0; i < keys->nr; i++) { ++ k = keys_sorted[i]; ++ + cond_resched(); + +- if (!i->level && i->btree_id == BTREE_ID_alloc) { +- j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; +- ret = bch2_journal_replay_key(c, i); ++ if (!k->level && k->btree_id == BTREE_ID_alloc) { ++ 
j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; ++ ret = bch2_journal_replay_key(c, k); + if (ret) + goto err; + } +@@ -609,12 +622,14 @@ static int bch2_journal_replay(struct bch_fs *c, + /* + * Next replay updates to interior btree nodes: + */ +- for_each_journal_key(keys, i) { ++ for (i = 0; i < keys->nr; i++) { ++ k = keys_sorted[i]; ++ + cond_resched(); + +- if (i->level) { +- j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; +- ret = bch2_journal_replay_key(c, i); ++ if (k->level) { ++ j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; ++ ret = bch2_journal_replay_key(c, k); + if (ret) + goto err; + } +@@ -634,15 +649,17 @@ static int bch2_journal_replay(struct bch_fs *c, + /* + * Now replay leaf node updates: + */ +- for_each_journal_key(keys, i) { ++ for (i = 0; i < keys->nr; i++) { ++ k = keys_sorted[i]; ++ + cond_resched(); + +- if (i->level || i->btree_id == BTREE_ID_alloc) ++ if (k->level || k->btree_id == BTREE_ID_alloc) + continue; + +- replay_now_at(j, keys.journal_seq_base + i->journal_seq); ++ replay_now_at(j, keys->journal_seq_base + k->journal_seq); + +- ret = bch2_journal_replay_key(c, i); ++ ret = bch2_journal_replay_key(c, k); + if (ret) + goto err; + } +@@ -652,10 +669,14 @@ static int bch2_journal_replay(struct bch_fs *c, + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); ++ kfree(keys_sorted); ++ + return bch2_journal_error(j); + err: + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", +- ret, bch2_btree_ids[i->btree_id], i->level); ++ ret, bch2_btree_ids[k->btree_id], k->level); ++ kfree(keys_sorted); ++ + return ret; + } + +@@ -1227,7 +1248,7 @@ use_clean: + + bch_verbose(c, "starting journal replay"); + err = "journal replay failed"; +- ret = bch2_journal_replay(c, c->journal_keys); ++ ret = bch2_journal_replay(c); + if (ret) + goto err; + bch_verbose(c, "journal replay done"); +-- +cgit v1.2.3 + + +From ba2a0c0616e59866af6668559c9c0b177880ed92 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 30 Dec 2021 20:14:52 -0500 +Subject: bcachefs: Add error messages for memory allocation failures + +This adds some missing diagnostics from rare but annoying to debug +runtime allocation failure paths. 
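+
+The shape repeated across these call sites is the usual grow-by-doubling
+realloc, now logging what was being grown and to what size before returning
+-ENOMEM. A userspace sketch of that shape (hypothetical names, not the fsck
+helpers themselves):
+
+  #include <errno.h>
+  #include <stdio.h>
+  #include <stdlib.h>
+
+  struct table { int *d; size_t nr, size; };
+
+  static int table_realloc(struct table *t)
+  {
+      size_t new_size = t->size ? t->size * 2 : 8;
+      int *d = realloc(t->d, new_size * sizeof(*d));
+
+      if (!d) {
+          /* say what failed and how big we tried to go */
+          fprintf(stderr, "error allocating table, size %zu\n", new_size);
+          return -ENOMEM;
+      }
+
+      t->d = d;
+      t->size = new_size;
+      return 0;
+  }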
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 28 +++++++++++++++++----------- + fs/bcachefs/btree_update_leaf.c | 8 ++++++-- + fs/bcachefs/buckets.c | 16 ++++++++++------ + fs/bcachefs/fsck.c | 25 +++++++++++++++++-------- + 4 files changed, 50 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 230a920ae32a..80ed79b06f21 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -146,19 +146,23 @@ bkey_cached_reuse(struct btree_key_cache *c) + } + + static struct bkey_cached * +-btree_key_cache_create(struct btree_key_cache *c, ++btree_key_cache_create(struct bch_fs *c, + enum btree_id btree_id, + struct bpos pos) + { ++ struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck; + bool was_new = true; + +- ck = bkey_cached_alloc(c); ++ ck = bkey_cached_alloc(bc); + + if (unlikely(!ck)) { +- ck = bkey_cached_reuse(c); +- if (unlikely(!ck)) ++ ck = bkey_cached_reuse(bc); ++ if (unlikely(!ck)) { ++ bch_err(c, "error allocating memory for key cache item, btree %s", ++ bch2_btree_ids[btree_id]); + return ERR_PTR(-ENOMEM); ++ } + + was_new = false; + } +@@ -175,7 +179,7 @@ btree_key_cache_create(struct btree_key_cache *c, + ck->valid = false; + ck->flags = 1U << BKEY_CACHED_ACCESSED; + +- if (unlikely(rhashtable_lookup_insert_fast(&c->table, ++ if (unlikely(rhashtable_lookup_insert_fast(&bc->table, + &ck->hash, + bch2_btree_key_cache_params))) { + /* We raced with another fill: */ +@@ -185,15 +189,15 @@ btree_key_cache_create(struct btree_key_cache *c, + six_unlock_intent(&ck->c.lock); + kfree(ck); + } else { +- mutex_lock(&c->lock); +- bkey_cached_free(c, ck); +- mutex_unlock(&c->lock); ++ mutex_lock(&bc->lock); ++ bkey_cached_free(bc, ck); ++ mutex_unlock(&bc->lock); + } + + return NULL; + } + +- atomic_long_inc(&c->nr_keys); ++ atomic_long_inc(&bc->nr_keys); + + six_unlock_write(&ck->c.lock); + +@@ -204,6 +208,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + struct bkey_cached *ck) + { ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + unsigned new_u64s = 0; +@@ -233,6 +238,8 @@ static int btree_key_cache_fill(struct btree_trans *trans, + new_u64s = roundup_pow_of_two(new_u64s); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { ++ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[ck->key.btree_id], new_u64s); + ret = -ENOMEM; + goto err; + } +@@ -293,8 +300,7 @@ retry: + return 0; + } + +- ck = btree_key_cache_create(&c->btree_key_cache, +- path->btree_id, path->pos); ++ ck = btree_key_cache_create(c, path->btree_id, path->pos); + ret = PTR_ERR_OR_ZERO(ck); + if (ret) + goto err; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 4573a7c2feb7..f561e09cd3ef 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -308,6 +308,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, + struct btree_path *path, + unsigned u64s) + { ++ struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; + unsigned new_u64s; + struct bkey_i *new_k; +@@ -315,7 +316,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, + EBUG_ON(path->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && +- bch2_btree_key_cache_must_wait(trans->c) && ++ bch2_btree_key_cache_must_wait(c) && + !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return 
BTREE_INSERT_NEED_JOURNAL_RECLAIM; + +@@ -330,8 +331,11 @@ btree_key_can_insert_cached(struct btree_trans *trans, + + new_u64s = roundup_pow_of_two(u64s); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); +- if (!new_k) ++ if (!new_k) { ++ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[path->btree_id], new_u64s); + return -ENOMEM; ++ } + + ck->u64s = new_u64s; + ck->k = new_k; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index e6c804f48e6c..a51453fcdd64 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -922,9 +922,11 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); +- +- if (!m) ++ if (!m) { ++ bch_err(c, "error allocating memory for gc_stripes, idx %llu", ++ (u64) p.idx); + return -ENOMEM; ++ } + + spin_lock(&c->ec_stripes_heap_lock); + +@@ -1035,7 +1037,7 @@ static int bch2_mark_stripe(struct btree_trans *trans, + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; +- size_t idx = new.k->p.offset; ++ u64 idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe +@@ -1053,7 +1055,7 @@ static int bch2_mark_stripe(struct btree_trans *trans, + + bch2_bkey_val_to_text(&PBUF(buf1), c, old); + bch2_bkey_val_to_text(&PBUF(buf2), c, new); +- bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" ++ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" + "old %s\n" + "new %s", idx, buf1, buf2); + bch2_inconsistent_error(c); +@@ -1085,9 +1087,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, + struct gc_stripe *m = + genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + +- if (!m) ++ if (!m) { ++ bch_err(c, "error allocating memory for gc_stripes, idx %llu", ++ idx); + return -ENOMEM; +- ++ } + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 361dbf338023..43b6159be01b 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -564,14 +564,17 @@ static struct inode_walker inode_walker_init(void) + return (struct inode_walker) { 0, }; + } + +-static int inode_walker_realloc(struct inode_walker *w) ++static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w) + { + if (w->nr == w->size) { + size_t new_size = max_t(size_t, 8UL, w->size * 2); + void *d = krealloc(w->d, new_size * sizeof(w->d[0]), + GFP_KERNEL); +- if (!d) ++ if (!d) { ++ bch_err(c, "fsck: error allocating memory for inode_walker, size %zu", ++ new_size); + return -ENOMEM; ++ } + + w->d = d; + w->size = new_size; +@@ -586,7 +589,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bch_inode_unpacked u; + int ret; + +- ret = inode_walker_realloc(w); ++ ret = inode_walker_realloc(c, w); + if (ret) + return ret; + +@@ -647,7 +650,7 @@ found: + while (i && w->d[i - 1].snapshot > pos.snapshot) + --i; + +- ret = inode_walker_realloc(w); ++ ret = inode_walker_realloc(c, w); + if (ret) + return ret; + +@@ -1812,7 +1815,8 @@ static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) + return false; + } + +-static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) ++static int path_down(struct bch_fs *c, 
struct pathbuf *p, ++ u64 inum, u32 snapshot) + { + if (p->nr == p->size) { + size_t new_size = max_t(size_t, 256UL, p->size * 2); +@@ -1820,6 +1824,8 @@ static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) + new_size * sizeof(p->entries[0]), + GFP_KERNEL); + if (!n) { ++ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", ++ new_size); + return -ENOMEM; + } + +@@ -1893,7 +1899,7 @@ static int check_path(struct btree_trans *trans, + if (!S_ISDIR(inode->bi_mode)) + break; + +- ret = path_down(p, inode->bi_inum, snapshot); ++ ret = path_down(c, p, inode->bi_inum, snapshot); + if (ret) { + bch_err(c, "memory allocation failure"); + return ret; +@@ -1998,12 +2004,15 @@ struct nlink_table { + } *d; + }; + +-static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot) ++static int add_nlink(struct bch_fs *c, struct nlink_table *t, ++ u64 inum, u32 snapshot) + { + if (t->nr == t->size) { + size_t new_size = max_t(size_t, 128UL, t->size * 2); + void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); + if (!d) { ++ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", ++ new_size); + return -ENOMEM; + } + +@@ -2093,7 +2102,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, + if (!u.bi_nlink) + continue; + +- ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot); ++ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); + if (ret) { + *end = k.k->p.offset; + ret = 0; +-- +cgit v1.2.3 + + +From 3db7e26c8c9d88955ef9a6d34a1c308ed47e6722 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 31 Dec 2021 16:12:54 -0500 +Subject: bcachefs: BCH_JSET_ENTRY_log + +Add a journal entry type for logging messages, and add an option to use +it to log the transaction name - this makes for a very handy debugging +tool, as with it we can use the 'bcachefs list_journal' command to see +not only what updates were done, but what was doing them. 
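+
+The log entry payload is just bytes, but journal entries are sized in
+u64s, so the message is formatted into a fixed number of u64s and the tail
+is zero padded. Roughly, ignoring the journal reservation and entry header
+details (userspace sketch, names invented):
+
+  #include <stdint.h>
+  #include <stdio.h>
+  #include <string.h>
+
+  #define LOG_ENTRY_U64S  4   /* one header u64 plus three payload u64s */
+
+  struct log_entry {
+      uint64_t header;        /* stands in for struct jset_entry */
+      char     d[(LOG_ENTRY_U64S - 1) * sizeof(uint64_t)];
+  };
+
+  static void log_entry_set(struct log_entry *l, const char *msg)
+  {
+      int b = snprintf(l->d, sizeof(l->d), "%s", msg);
+
+      /* pad with zeroes so the on-disk entry is fully initialized */
+      if (b < 0)
+          b = 0;
+      if ((size_t) b < sizeof(l->d))
+          memset(l->d + b, 0, sizeof(l->d) - b);
+  }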
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 9 ++++++++- + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update_leaf.c | 44 ++++++++++++++++++++++++++++++++++++----- + fs/bcachefs/journal_io.c | 8 ++++++++ + fs/bcachefs/opts.h | 5 +++++ + 5 files changed, 61 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index a053fca7886d..e16247256e24 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1426,6 +1426,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); + LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); + LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); + LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); ++LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); + + /* + * Features: +@@ -1660,7 +1661,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) + x(usage, 5) \ + x(data_usage, 6) \ + x(clock, 7) \ +- x(dev_usage, 8) ++ x(dev_usage, 8) \ ++ x(log, 9) + + enum { + #define x(f, nr) BCH_JSET_ENTRY_##f = nr, +@@ -1732,6 +1734,11 @@ struct jset_entry_dev_usage { + struct jset_entry_dev_usage_type d[]; + } __attribute__((packed)); + ++struct jset_entry_log { ++ struct jset_entry entry; ++ u8 d[]; ++} __attribute__((packed)); ++ + /* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index c84bba7bcda5..08c49ae3b338 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -382,6 +382,7 @@ struct btree_trans { + bool used_mempool:1; + bool in_traverse_all:1; + bool restarted:1; ++ bool journal_transaction_names:1; + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f561e09cd3ef..5310787dec00 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -290,6 +290,31 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, + return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; + } + ++#define JSET_ENTRY_LOG_U64s 4 ++ ++static noinline void journal_transaction_name(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res); ++ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); ++ unsigned u64s = JSET_ENTRY_LOG_U64s - 1; ++ unsigned b, buflen = u64s * sizeof(u64); ++ ++ l->entry.u64s = cpu_to_le16(u64s); ++ l->entry.btree_id = 0; ++ l->entry.level = 0; ++ l->entry.type = BCH_JSET_ENTRY_log; ++ l->entry.pad[0] = 0; ++ l->entry.pad[1] = 0; ++ l->entry.pad[2] = 0; ++ b = snprintf(l->d, buflen, "%ps", (void *) trans->ip); ++ while (b < buflen) ++ l->d[b++] = '\0'; ++ ++ trans->journal_res.offset += JSET_ENTRY_LOG_U64s; ++ trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s; ++} ++ + static inline enum btree_insert_ret + btree_key_can_insert(struct btree_trans *trans, + struct btree *b, +@@ -450,6 +475,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + JOURNAL_RES_GET_NONBLOCK); + if (ret) + return ret; ++ ++ if (unlikely(trans->journal_transaction_names)) ++ journal_transaction_name(trans); + } else { + trans->journal_res.seq = c->journal.replay_journal_seq; + } +@@ -910,6 +938,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) + + int __bch2_trans_commit(struct btree_trans *trans) + { ++ struct bch_fs *c = trans->c; + struct btree_insert_entry *i = NULL; + unsigned u64s; + int ret = 0; +@@ -919,15 +948,20 @@ int __bch2_trans_commit(struct btree_trans *trans) + goto out_reset; + + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) +- lockdep_assert_held(&trans->c->gc_lock); ++ lockdep_assert_held(&c->gc_lock); + + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + + trans->journal_u64s = trans->extra_journal_entry_u64s; + trans->journal_preres_u64s = 0; + ++ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); ++ ++ if (trans->journal_transaction_names) ++ trans->journal_u64s += JSET_ENTRY_LOG_U64s; ++ + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && +- unlikely(!percpu_ref_tryget(&trans->c->writes))) { ++ unlikely(!percpu_ref_tryget(&c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); + if (ret) + goto out_reset; +@@ -969,7 +1003,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + } + + if (trans->extra_journal_res) { +- ret = bch2_disk_reservation_add(trans->c, trans->disk_res, ++ ret = bch2_disk_reservation_add(c, trans->disk_res, + trans->extra_journal_res, + (trans->flags & BTREE_INSERT_NOFAIL) + ? 
BCH_DISK_RESERVATION_NOFAIL : 0); +@@ -988,10 +1022,10 @@ retry: + if (ret) + goto err; + out: +- bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ bch2_journal_preres_put(&c->journal, &trans->journal_preres); + + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) +- percpu_ref_put(&trans->c->writes); ++ percpu_ref_put(&c->writes); + out_reset: + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 77201a0ee21d..b3f293fada32 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -479,6 +479,14 @@ fsck_err: + return ret; + } + ++static int journal_entry_validate_log(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ return 0; ++} ++ + struct jset_entry_ops { + int (*validate)(struct bch_fs *, const char *, + struct jset_entry *, unsigned, int, int); +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 661eb5764f68..dcb843f3256f 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -327,6 +327,11 @@ enum opt_type { + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Read all journal entries, not just dirty ones")\ ++ x(journal_transaction_names, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ ++ NULL, "Log transaction function names in journal") \ + x(noexcl, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +-- +cgit v1.2.3 + + +From e3d7331896e857d92d4b99bad293714c1716f1ce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 31 Dec 2021 17:06:29 -0500 +Subject: bcachefs: bch2_journal_entry_to_text() + +This adds a _to_text() pretty printer for journal entries - including +every subtype - which will shortly be used by the 'bcachefs +list_journal' subcommand. 
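+
+Structurally this extends the existing x-macro list of journal entry types
+into an ops table carrying both the validate and to_text callbacks, so a
+new subtype cannot silently miss its pretty printer. Reduced to a toy
+(names invented, only the technique matches the bcachefs code):
+
+  #include <stdio.h>
+
+  #define ENTRY_TYPES()     \
+      x(btree_keys, 0)      \
+      x(blacklist,  1)      \
+      x(log,        2)
+
+  enum {
+  #define x(f, nr) ENTRY_##f = nr,
+      ENTRY_TYPES()
+  #undef x
+      ENTRY_NR
+  };
+
+  #define x(f, nr) static void f##_to_text(void) { printf("%s\n", #f); }
+  ENTRY_TYPES()
+  #undef x
+
+  struct entry_ops { void (*to_text)(void); };
+
+  static const struct entry_ops ops[] = {
+  #define x(f, nr) [ENTRY_##f] = { .to_text = f##_to_text },
+      ENTRY_TYPES()
+  #undef x
+  };
+
+  static void entry_to_text(unsigned type)
+  {
+      if (type < ENTRY_NR && ops[type].to_text)
+          ops[type].to_text();
+      else
+          printf("(unknown type %u)\n", type);
+  }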
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 20 ++++-- + fs/bcachefs/journal_io.c | 141 ++++++++++++++++++++++++++++++++++++++---- + fs/bcachefs/journal_io.h | 6 +- + fs/bcachefs/opts.c | 10 +++ + fs/bcachefs/opts.h | 2 + + fs/bcachefs/recovery.c | 11 ++-- + fs/bcachefs/super-io.c | 6 +- + 7 files changed, 169 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index e16247256e24..5153f0e42054 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -76,6 +76,7 @@ + #include + #include + #include ++#include "vstructs.h" + + #define LE_BITMASK(_bits, name, type, field, offset, end) \ + static const unsigned name##_OFFSET = offset; \ +@@ -1692,11 +1693,16 @@ struct jset_entry_blacklist_v2 { + __le64 end; + }; + ++#define BCH_FS_USAGE_TYPES() \ ++ x(reserved, 0) \ ++ x(inodes, 1) \ ++ x(key_version, 2) ++ + enum { +- FS_USAGE_RESERVED = 0, +- FS_USAGE_INODES = 1, +- FS_USAGE_KEY_VERSION = 2, +- FS_USAGE_NR = 3 ++#define x(f, nr) BCH_FS_USAGE_##f = nr, ++ BCH_FS_USAGE_TYPES() ++#undef x ++ BCH_FS_USAGE_NR + }; + + struct jset_entry_usage { +@@ -1734,6 +1740,12 @@ struct jset_entry_dev_usage { + struct jset_entry_dev_usage_type d[]; + } __attribute__((packed)); + ++static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) ++{ ++ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / ++ sizeof(struct jset_entry_dev_usage_type); ++} ++ + struct jset_entry_log { + struct jset_entry entry; + u8 d[]; +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index b3f293fada32..faf82bb4daf2 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -274,7 +274,7 @@ fsck_err: + return ret; + } + +-static int journal_entry_validate_btree_keys(struct bch_fs *c, ++static int journal_entry_btree_keys_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +@@ -295,7 +295,18 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, + return 0; + } + +-static int journal_entry_validate_btree_root(struct bch_fs *c, ++static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct bkey_i *k; ++ ++ pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); ++ ++ vstruct_for_each(entry, k) ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); ++} ++ ++static int journal_entry_btree_root_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +@@ -323,7 +334,13 @@ fsck_err: + return ret; + } + +-static int journal_entry_validate_prio_ptrs(struct bch_fs *c, ++static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ journal_entry_btree_keys_to_text(out, c, entry); ++} ++ ++static int journal_entry_prio_ptrs_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +@@ -332,7 +349,12 @@ static int journal_entry_validate_prio_ptrs(struct bch_fs *c, + return 0; + } + +-static int journal_entry_validate_blacklist(struct bch_fs *c, ++static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++} ++ ++static int journal_entry_blacklist_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int 
big_endian, int write) +@@ -347,7 +369,16 @@ fsck_err: + return ret; + } + +-static int journal_entry_validate_blacklist_v2(struct bch_fs *c, ++static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_blacklist *bl = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq)); ++} ++ ++static int journal_entry_blacklist_v2_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +@@ -373,7 +404,18 @@ fsck_err: + return ret; + } + +-static int journal_entry_validate_usage(struct bch_fs *c, ++static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_blacklist_v2 *bl = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ pr_buf(out, "start=%llu end=%llu", ++ le64_to_cpu(bl->start), ++ le64_to_cpu(bl->end)); ++} ++ ++static int journal_entry_usage_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +@@ -394,7 +436,18 @@ fsck_err: + return ret; + } + +-static int journal_entry_validate_data_usage(struct bch_fs *c, ++static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ pr_buf(out, "type=%s v=%llu", ++ bch2_fs_usage_types[u->entry.btree_id], ++ le64_to_cpu(u->v)); ++} ++ ++static int journal_entry_data_usage_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +@@ -416,7 +469,17 @@ fsck_err: + return ret; + } + +-static int journal_entry_validate_clock(struct bch_fs *c, ++static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ bch2_replicas_entry_to_text(out, &u->r); ++ pr_buf(out, "=%llu", le64_to_cpu(u->v)); ++} ++ ++static int journal_entry_clock_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +@@ -442,7 +505,16 @@ fsck_err: + return ret; + } + +-static int journal_entry_validate_dev_usage(struct bch_fs *c, ++static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ ++ pr_buf(out, "%s=%llu", clock->rw ? 
"write" : "read", le64_to_cpu(clock->time)); ++} ++ ++static int journal_entry_dev_usage_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +@@ -479,7 +551,32 @@ fsck_err: + return ret; + } + +-static int journal_entry_validate_log(struct bch_fs *c, ++static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ++ ++ pr_buf(out, "dev=%u", le32_to_cpu(u->dev)); ++ ++ for (i = 0; i < nr_types; i++) { ++ if (i < BCH_DATA_NR) ++ pr_buf(out, " %s", bch2_data_types[i]); ++ else ++ pr_buf(out, " (unknown data type %u)", i); ++ pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu", ++ le64_to_cpu(u->d[i].buckets), ++ le64_to_cpu(u->d[i].sectors), ++ le64_to_cpu(u->d[i].fragmented)); ++ } ++ ++ pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu", ++ le64_to_cpu(u->buckets_ec), ++ le64_to_cpu(u->buckets_unavailable)); ++} ++ ++static int journal_entry_log_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +@@ -487,15 +584,26 @@ static int journal_entry_validate_log(struct bch_fs *c, + return 0; + } + ++static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); ++ unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); ++ ++ bch_scnmemcpy(out, l->d, strnlen(l->d, bytes)); ++} ++ + struct jset_entry_ops { + int (*validate)(struct bch_fs *, const char *, + struct jset_entry *, unsigned, int, int); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); + }; + + static const struct jset_entry_ops bch2_jset_entry_ops[] = { + #define x(f, nr) \ + [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ +- .validate = journal_entry_validate_##f, \ ++ .validate = journal_entry_##f##_validate, \ ++ .to_text = journal_entry_##f##_to_text, \ + }, + BCH_JSET_ENTRY_TYPES() + #undef x +@@ -511,6 +619,17 @@ int bch2_journal_entry_validate(struct bch_fs *c, const char *where, + : 0; + } + ++void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ if (entry->type < BCH_JSET_ENTRY_NR) { ++ pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); ++ bch2_jset_entry_ops[entry->type].to_text(out, c, entry); ++ } else { ++ pr_buf(out, "(unknown type %u)", entry->type); ++ } ++} ++ + static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + int write) + { +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +index f34281a28f12..d8425fe0d67b 100644 +--- a/fs/bcachefs/journal_io.h ++++ b/fs/bcachefs/journal_io.h +@@ -40,8 +40,10 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ + vstruct_for_each_safe(entry, k, _n) + +-int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *, +- unsigned, int, int); ++int bch2_journal_entry_validate(struct bch_fs *, const char *, ++ struct jset_entry *, unsigned, int, int); ++void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, ++ struct jset_entry *); + + int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); + +diff --git 
a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index d9ca69f2ecde..71bf26eb13d5 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -71,6 +71,16 @@ const char * const bch2_member_states[] = { + NULL + }; + ++const char * const bch2_jset_entry_types[] = { ++ BCH_JSET_ENTRY_TYPES() ++ NULL ++}; ++ ++const char * const bch2_fs_usage_types[] = { ++ BCH_FS_USAGE_TYPES() ++ NULL ++}; ++ + #undef x + + const char * const bch2_d_types[BCH_DT_MAX] = { +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index dcb843f3256f..c325a094ae43 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -20,6 +20,8 @@ extern const char * const bch2_str_hash_types[]; + extern const char * const bch2_str_hash_opts[]; + extern const char * const bch2_data_types[]; + extern const char * const bch2_member_states[]; ++extern const char * const bch2_jset_entry_types[]; ++extern const char * const bch2_fs_usage_types[]; + extern const char * const bch2_d_types[]; + + static inline const char *bch2_d_type_str(unsigned d_type) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 0b923037d236..d332fd16517b 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -714,15 +714,15 @@ static int journal_replay_entry_early(struct bch_fs *c, + container_of(entry, struct jset_entry_usage, entry); + + switch (entry->btree_id) { +- case FS_USAGE_RESERVED: ++ case BCH_FS_USAGE_reserved: + if (entry->level < BCH_REPLICAS_MAX) + c->usage_base->persistent_reserved[entry->level] = + le64_to_cpu(u->v); + break; +- case FS_USAGE_INODES: ++ case BCH_FS_USAGE_inodes: + c->usage_base->nr_inodes = le64_to_cpu(u->v); + break; +- case FS_USAGE_KEY_VERSION: ++ case BCH_FS_USAGE_key_version: + atomic64_set(&c->key_version, + le64_to_cpu(u->v)); + break; +@@ -742,10 +742,7 @@ static int journal_replay_entry_early(struct bch_fs *c, + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); +- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); +- unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / +- sizeof(struct jset_entry_dev_usage_type); +- unsigned i; ++ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + + ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); + ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index b8d2cf66a630..bbed24b702fd 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -1027,7 +1027,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; +- u->entry.btree_id = FS_USAGE_INODES; ++ u->entry.btree_id = BCH_FS_USAGE_inodes; + u->v = cpu_to_le64(c->usage_base->nr_inodes); + } + +@@ -1037,7 +1037,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; +- u->entry.btree_id = FS_USAGE_KEY_VERSION; ++ u->entry.btree_id = BCH_FS_USAGE_key_version; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); + } + +@@ -1047,7 +1047,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; +- u->entry.btree_id = FS_USAGE_RESERVED; ++ u->entry.btree_id = BCH_FS_USAGE_reserved; + u->entry.level = i; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); + } +-- +cgit v1.2.3 + + +From 
ee16cae5520fbce251eca91439d914568a23a33f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 31 Dec 2021 17:54:13 -0500 +Subject: bcachefs: Fix race between btree updates & journal replay + +Add a flag to indicate whether a journal replay key has been +overwritten, and set/test it with appropriate btree locks held. + +This fixes a race between the allocator - invalidating buckets, and +doing btree updates - and journal replay, which before this patch could +clobber the allocator thread's update with an older version of the key +from the journal. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/btree_update_leaf.c | 12 ++++++++++++ + fs/bcachefs/recovery.c | 25 +++++++++++++++++++++++-- + fs/bcachefs/recovery.h | 2 ++ + 4 files changed, 38 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index b21151ea73fb..696c7c93c919 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -561,6 +561,7 @@ struct journal_keys { + enum btree_id btree_id:8; + unsigned level:8; + bool allocated; ++ bool overwritten; + struct bkey_i *k; + u32 journal_seq; + u32 journal_offset; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5310787dec00..e18ec78edee5 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -15,6 +15,7 @@ + #include "journal.h" + #include "journal_reclaim.h" + #include "keylist.h" ++#include "recovery.h" + #include "subvolume.h" + #include "replicas.h" + +@@ -624,6 +625,14 @@ fail: + return btree_trans_restart(trans); + } + ++static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); ++} ++ + /* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +@@ -701,6 +710,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + + ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + ++ if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) ++ bch2_drop_overwrites_from_journal(trans); ++ + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index d332fd16517b..fcacf166f900 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -185,6 +185,19 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + return bch2_journal_key_insert(c, id, level, &whiteout); + } + ++void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, ++ unsigned level, struct bpos pos) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = journal_key_search(keys, btree, level, pos); ++ ++ if (idx < keys->nr && ++ keys->d[idx].btree_id == btree && ++ keys->d[idx].level == level && ++ !bpos_cmp(keys->d[idx].k->k.p, pos)) ++ keys->d[idx].overwritten = true; ++} ++ + static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) + { + struct journal_key *k = iter->idx - iter->keys->nr +@@ -539,8 +552,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + iter_flags); +- ret = bch2_btree_iter_traverse(&iter) ?: +- bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); ++ ret = 
bch2_btree_iter_traverse(&iter); ++ if (ret) ++ goto out; ++ ++ /* Must be checked with btree locked: */ ++ if (k->overwritten) ++ goto out; ++ ++ ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); ++out: + bch2_trans_iter_exit(trans, &iter); + return ret; + } +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index 1504e0bdb940..a7a9496afb95 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -37,6 +37,8 @@ int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); + int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); ++void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); + + void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); + struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); +-- +cgit v1.2.3 + + +From 49c419feec4cf1058c6095d55ad71905ce8f8c89 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 1 Jan 2022 18:27:50 -0500 +Subject: bcachefs: Log what we're doing when repairing + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 9e3213b90439..f2a9348d5069 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -691,10 +691,16 @@ found: + } + + ret = bch2_journal_key_insert_take(c, btree_id, level, new); ++ + if (ret) + kfree(new); +- else ++ else { ++ bch2_bkey_val_to_text(&PBUF(buf), c, *k); ++ bch_info(c, "updated %s", buf); ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); ++ bch_info(c, "new key %s", buf); + *k = bkey_i_to_s_c(new); ++ } + } + fsck_err: + return ret; +-- +cgit v1.2.3 + + +From 8ffdc76cf760460d77e054b503e6577237009d0d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 1 Jan 2022 19:04:33 -0500 +Subject: bcachefs: Improve error messages in superblock write path + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index bbed24b702fd..8e28a13aaf95 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -752,11 +752,24 @@ int bch2_write_super(struct bch_fs *c) + closure_sync(cl); + + for_each_online_member(ca, c, i) { +- if (!ca->sb_write_error && +- ca->disk_sb.seq != +- le64_to_cpu(ca->sb_read_scratch->seq)) { ++ if (ca->sb_write_error) ++ continue; ++ ++ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { ++ bch2_fs_fatal_error(c, ++ "Superblock write was silently dropped! 
(seq %llu expected %llu)", ++ le64_to_cpu(ca->sb_read_scratch->seq), ++ ca->disk_sb.seq); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ ++ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { + bch2_fs_fatal_error(c, +- "Superblock modified by another process"); ++ "Superblock modified by another process (seq %llu expected %llu)", ++ le64_to_cpu(ca->sb_read_scratch->seq), ++ ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -EROFS; + goto out; +-- +cgit v1.2.3 + + +From c0310b3c7260cf2c203e4805f534b9815ae70e1b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 1 Jan 2022 19:46:12 -0500 +Subject: bcachefs: Make sure BCH_FS_FSCK_DONE gets set + +If we're not running fsck we still want to set BCH_FS_FSCK_DONE, so that +bch2_fsck_err() calls are interpreted as bch2_inconsistent_error() +calls(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index fcacf166f900..39b5b97704b7 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -613,8 +613,10 @@ static int bch2_journal_replay(struct bch_fs *c) + sizeof(keys_sorted[0]), + journal_sort_seq_cmp, NULL); + +- if (keys->nr) ++ if (keys->nr) { ++ bch_verbose(c, "starting journal replay, %zu keys", keys->nr); + replay_now_at(j, keys->journal_seq_base); ++ } + + seq = j->replay_journal_seq; + +@@ -1235,6 +1237,13 @@ use_clean: + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + ++ /* ++ * If we're not running fsck, this ensures bch2_fsck_err() calls are ++ * instead interpreted as bch2_inconsistent_err() calls: ++ */ ++ if (!c->opts.fsck) ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ + if (c->opts.fsck || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || +@@ -1434,6 +1443,7 @@ int bch2_fs_initialize(struct bch_fs *c) + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); + + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); +-- +cgit v1.2.3 + + +From 75f74cb7ed64f5d9d9f12261e3d8dd02710d09f2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 1 Jan 2022 20:45:30 -0500 +Subject: bcachefs: Tweak journal reclaim order + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 39b5b97704b7..219351654564 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -116,12 +116,19 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + .btree_id = id, + .level = level, + .k = k, +- .allocated = true ++ .allocated = true, ++ /* ++ * Ensure these keys are done last by journal replay, to unblock ++ * journal reclaim: ++ */ ++ .journal_seq = U32_MAX, + }; + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + unsigned idx = journal_key_search(keys, id, level, k->k.p); + ++ BUG_ON(test_bit(BCH_FS_RW, &c->flags)); ++ + if (idx < keys->nr && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) +-- +cgit v1.2.3 + + +From 1d715b8ef4c2f364ac10c30eb6a0d15e381b3bbf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 20:07:00 -0500 +Subject: bcachefs: BTREE_ITER_WITH_JOURNAL + +This adds a new btree iterator flag, BTREE_ITER_WITH_JOURNAL, that is +automatically enabled 
when initializing a btree iterator before journal +replay has completed - it overlays the contents of the journal with the +btree. + +This lets us delete bch2_btree_and_journal_walk() and just use the +normal btree iterator interface instead - which also lets us delete a +significant amount of duplicated code. + +Note that BTREE_ITER_WITH_JOURNAL is still unoptimized in this patch - +we're redoing the binary search over keys in the journal every time we +call bch2_btree_iter_peek(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 60 +++++------ + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/btree_gc.c | 185 +++++++++------------------------- + fs/bcachefs/btree_iter.c | 196 ++++++++++++++++++++++++++++++------ + fs/bcachefs/btree_types.h | 9 +- + fs/bcachefs/btree_update_interior.c | 4 + + fs/bcachefs/ec.c | 60 ++++++----- + fs/bcachefs/recovery.c | 158 ++++++++--------------------- + fs/bcachefs/recovery.h | 10 +- + 9 files changed, 326 insertions(+), 357 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index c24fb4b1d7b2..4f1a3f6eb9d7 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -340,46 +340,46 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + #undef x + } + +-static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) ++int bch2_alloc_read(struct bch_fs *c) + { +- struct bch_fs *c = trans->c; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; + struct bch_dev *ca; + struct bucket *g; + struct bkey_alloc_unpacked u; +- +- if (!bkey_is_alloc(k.k)) +- return 0; +- +- ca = bch_dev_bkey_exists(c, k.k->p.inode); +- g = bucket(ca, k.k->p.offset); +- u = bch2_alloc_unpack(k); +- +- *bucket_gen(ca, k.k->p.offset) = u.gen; +- g->_mark.gen = u.gen; +- g->_mark.data_type = u.data_type; +- g->_mark.dirty_sectors = u.dirty_sectors; +- g->_mark.cached_sectors = u.cached_sectors; +- g->_mark.stripe = u.stripe != 0; +- g->stripe = u.stripe; +- g->stripe_redundancy = u.stripe_redundancy; +- g->io_time[READ] = u.read_time; +- g->io_time[WRITE] = u.write_time; +- g->oldest_gen = u.oldest_gen; +- g->gen_valid = 1; +- +- return 0; +-} +- +-int bch2_alloc_read(struct bch_fs *c) +-{ +- struct btree_trans trans; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + down_read(&c->gc_lock); +- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (!bkey_is_alloc(k.k)) ++ continue; ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ g = bucket(ca, k.k->p.offset); ++ u = bch2_alloc_unpack(k); ++ ++ *bucket_gen(ca, k.k->p.offset) = u.gen; ++ g->_mark.gen = u.gen; ++ g->_mark.data_type = u.data_type; ++ g->_mark.dirty_sectors = u.dirty_sectors; ++ g->_mark.cached_sectors = u.cached_sectors; ++ g->_mark.stripe = u.stripe != 0; ++ g->stripe = u.stripe; ++ g->stripe_redundancy = u.stripe_redundancy; ++ g->io_time[READ] = u.read_time; ++ g->io_time[WRITE] = u.write_time; ++ g->oldest_gen = u.oldest_gen; ++ g->gen_valid = 1; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ + up_read(&c->gc_lock); + bch2_trans_exit(&trans); ++ + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); + return ret; +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 696c7c93c919..49cb68b4b61d 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -860,7 +860,6 @@ struct bch_fs { + u64 reflink_hint; + 
reflink_gc_table reflink_gc_table; + size_t reflink_gc_nr; +- size_t reflink_gc_idx; + + /* VFS IO PATH - fs-io.c */ + struct bio_set writepage_bioset; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index f2a9348d5069..db453aa61d25 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1342,59 +1342,6 @@ static int bch2_gc_start(struct bch_fs *c, + return 0; + } + +-static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, +- struct bkey_s_c k) +-{ +- struct bch_fs *c = trans->c; +- struct reflink_gc *r; +- const __le64 *refcount = bkey_refcount_c(k); +- char buf[200]; +- int ret = 0; +- +- if (!refcount) +- return 0; +- +- r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++); +- if (!r) +- return -ENOMEM; +- +- if (!r || +- r->offset != k.k->p.offset || +- r->size != k.k->size) { +- bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); +- return -EINVAL; +- } +- +- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, +- "reflink key has wrong refcount:\n" +- " %s\n" +- " should be %u", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), +- r->refcount)) { +- struct bkey_i *new; +- +- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); +- if (!new) { +- ret = -ENOMEM; +- goto fsck_err; +- } +- +- bkey_reassemble(new, k); +- +- if (!r->refcount) { +- new->k.type = KEY_TYPE_deleted; +- new->k.size = 0; +- } else { +- *bkey_refcount(new) = cpu_to_le64(r->refcount); +- } +- +- ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); +- kfree(new); +- } +-fsck_err: +- return ret; +-} +- + static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + bool metadata_only) + { +@@ -1411,14 +1358,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + + bch2_trans_init(&trans, c, 0, 0); + +- if (initial) { +- c->reflink_gc_idx = 0; +- +- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, +- bch2_gc_reflink_done_initial_fn); +- goto out; +- } +- + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); +@@ -1426,7 +1365,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + if (!refcount) + continue; + +- r = genradix_ptr(&c->reflink_gc_table, idx); ++ r = genradix_ptr(&c->reflink_gc_table, idx++); + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { +@@ -1456,7 +1395,9 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + else + *bkey_refcount(new) = cpu_to_le64(r->refcount); + +- ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = initial ++ ? 
bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new) ++ : __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); + kfree(new); + +@@ -1466,104 +1407,74 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + } + fsck_err: + bch2_trans_iter_exit(&trans, &iter); +-out: + c->reflink_gc_nr = 0; + bch2_trans_exit(&trans); + return ret; + } + +-static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans, +- struct bkey_s_c k) ++static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, ++ bool metadata_only) + { +- struct bch_fs *c = trans->c; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; + struct gc_stripe *m; + const struct bch_stripe *s; + char buf[200]; + unsigned i; + int ret = 0; + +- if (k.k->type != KEY_TYPE_stripe) ++ if (metadata_only) + return 0; + +- s = bkey_s_c_to_stripe(k).v; ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->type != KEY_TYPE_stripe) ++ continue; + +- m = genradix_ptr(&c->gc_stripes, k.k->p.offset); ++ s = bkey_s_c_to_stripe(k).v; ++ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); + +- for (i = 0; i < s->nr_blocks; i++) +- if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) +- goto inconsistent; +- return 0; ++ for (i = 0; i < s->nr_blocks; i++) ++ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) ++ goto inconsistent; ++ continue; + inconsistent: +- if (fsck_err_on(true, c, +- "stripe has wrong block sector count %u:\n" +- " %s\n" +- " should be %u", i, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), +- m ? m->block_sectors[i] : 0)) { +- struct bkey_i_stripe *new; ++ if (fsck_err_on(true, c, ++ "stripe has wrong block sector count %u:\n" ++ " %s\n" ++ " should be %u", i, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ m ? m->block_sectors[i] : 0)) { ++ struct bkey_i_stripe *new; + +- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); +- if (!new) { +- ret = -ENOMEM; +- goto fsck_err; +- } ++ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); ++ if (!new) { ++ ret = -ENOMEM; ++ break; ++ } + +- bkey_reassemble(&new->k_i, k); ++ bkey_reassemble(&new->k_i, k); + +- for (i = 0; i < new->v.nr_blocks; i++) +- stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); + +- ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i); +- kfree(new); ++ ret = initial ++ ? 
bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i) ++ : __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); ++ kfree(new); ++ } + } + fsck_err: +- return ret; +-} +- +-static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, +- bool metadata_only) +-{ +- struct btree_trans trans; +- int ret = 0; +- +- if (metadata_only) +- return 0; +- +- bch2_trans_init(&trans, c, 0, 0); +- +- if (initial) { +- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, +- bch2_gc_stripes_done_initial_fn); +- } else { +- BUG(); +- } ++ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; + } + +-static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans, +- struct bkey_s_c k) +-{ +- +- struct bch_fs *c = trans->c; +- struct reflink_gc *r; +- const __le64 *refcount = bkey_refcount_c(k); +- +- if (!refcount) +- return 0; +- +- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, +- GFP_KERNEL); +- if (!r) +- return -ENOMEM; +- +- r->offset = k.k->p.offset; +- r->size = k.k->size; +- r->refcount = 0; +- return 0; +-} +- + static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + bool metadata_only) + { +@@ -1579,12 +1490,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + bch2_trans_init(&trans, c, 0, 0); + c->reflink_gc_nr = 0; + +- if (initial) { +- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, +- bch2_gc_reflink_start_initial_fn); +- goto out; +- } +- + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); +@@ -1604,7 +1509,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + r->refcount = 0; + } + bch2_trans_iter_exit(&trans, &iter); +-out: ++ + bch2_trans_exit(&trans); + return ret; + } +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index a6cc0ca51293..d549c466362b 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -12,6 +12,7 @@ + #include "error.h" + #include "extents.h" + #include "journal.h" ++#include "recovery.h" + #include "replicas.h" + #include "subvolume.h" + +@@ -1077,6 +1078,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path, + static void btree_path_verify_new_node(struct btree_trans *trans, + struct btree_path *path, struct btree *b) + { ++ struct bch_fs *c = trans->c; + struct btree_path_level *l; + unsigned plevel; + bool parent_locked; +@@ -1085,6 +1087,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans, + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + return; + ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ return; ++ + plevel = b->c.level + 1; + if (!btree_path_node(path, plevel)) + return; +@@ -1105,7 +1110,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans, + char buf4[100]; + struct bkey uk = bkey_unpack_key(b, k); + +- bch2_dump_btree_node(trans->c, l->b); ++ bch2_dump_btree_node(c, l->b); + bch2_bpos_to_text(&PBUF(buf1), path->pos); + bch2_bkey_to_text(&PBUF(buf2), &uk); + bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); +@@ -1296,6 +1301,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat + return ret; + } + ++static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, ++ struct btree_and_journal_iter *jiter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct bkey_buf tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? 
(path->level > 1 ? 0 : 2) ++ : (path->level > 1 ? 1 : 16); ++ bool was_locked = btree_node_locked(path, path->level); ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&tmp); ++ ++ while (nr && !ret) { ++ if (!bch2_btree_node_relock(trans, path, path->level)) ++ break; ++ ++ bch2_btree_and_journal_iter_advance(jiter); ++ k = bch2_btree_and_journal_iter_peek(jiter); ++ if (!k.k) ++ break; ++ ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, ++ path->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(path, path->level); ++ ++ bch2_bkey_buf_exit(&tmp, c); ++ return ret; ++} ++ + static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + struct btree_path *path, + unsigned plevel, struct btree *b) +@@ -1318,6 +1358,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + btree_node_unlock(path, plevel); + } + ++static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, ++ struct bkey_buf *out) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l = path_l(path); ++ struct btree_and_journal_iter jiter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); ++ ++ k = bch2_btree_and_journal_iter_peek(&jiter); ++ ++ bch2_bkey_buf_reassemble(out, c, k); ++ ++ if (flags & BTREE_ITER_PREFETCH) ++ ret = btree_path_prefetch_j(trans, path, &jiter); ++ ++ bch2_btree_and_journal_iter_exit(&jiter); ++ return ret; ++} ++ + static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, +@@ -1328,14 +1392,28 @@ static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree *b; + unsigned level = path->level - 1; + enum six_lock_type lock_type = __btree_lock_want(path, level); ++ bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + struct bkey_buf tmp; + int ret; + + EBUG_ON(!btree_node_locked(path, path->level)); + + bch2_bkey_buf_init(&tmp); +- bch2_bkey_buf_unpack(&tmp, c, l->b, +- bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ if (unlikely(!replay_done)) { ++ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); ++ if (ret) ++ goto err; ++ } else { ++ bch2_bkey_buf_unpack(&tmp, c, l->b, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ if (flags & BTREE_ITER_PREFETCH) { ++ ret = btree_path_prefetch(trans, path); ++ if (ret) ++ goto err; ++ } ++ } + + b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); +@@ -1345,13 +1423,10 @@ static __always_inline int btree_path_down(struct btree_trans *trans, + mark_btree_node_locked(path, level, lock_type); + btree_path_level_init(trans, path, b); + +- if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && ++ if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && + unlikely(b != btree_node_mem_ptr(tmp.k))) + btree_node_mem_ptr_set(trans, path, level + 1, b); + +- if (flags & BTREE_ITER_PREFETCH) +- ret = btree_path_prefetch(trans, path); +- + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(path, level + 1); + path->level = level; +@@ -2107,6 +2182,42 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + return ret; + } + ++static noinline ++struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ struct journal_keys *keys = &trans->c->journal_keys; ++ size_t idx = 
bch2_journal_key_search(keys, path->btree_id, ++ path->level, path->pos); ++ ++ while (idx < keys->nr && keys->d[idx].overwritten) ++ idx++; ++ ++ return (idx < keys->nr && ++ keys->d[idx].btree_id == path->btree_id && ++ keys->d[idx].level == path->level) ++ ? keys->d[idx].k ++ : NULL; ++} ++ ++static noinline ++struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_i *next_journal = ++ __btree_trans_peek_journal(trans, iter->path); ++ ++ if (next_journal && ++ bpos_cmp(next_journal->k.p, ++ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { ++ iter->k = next_journal->k; ++ k = bkey_i_to_s_c(next_journal); ++ } ++ ++ return k; ++} ++ + /** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position +@@ -2117,7 +2228,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_i *next_update; + struct bkey_s_c k; +- int ret, cmp; ++ int ret; + + EBUG_ON(iter->path->cached || iter->path->level); + bch2_btree_iter_verify(iter); +@@ -2136,19 +2247,14 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + goto out; + } + +- next_update = iter->flags & BTREE_ITER_WITH_UPDATES +- ? btree_trans_peek_updates(trans, iter->btree_id, search_key) +- : NULL; + k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + +- /* * In the btree, deleted keys sort before non deleted: */ +- if (k.k && bkey_deleted(k.k) && +- (!next_update || +- bpos_cmp(k.k->p, next_update->k.p) <= 0)) { +- search_key = k.k->p; +- continue; +- } ++ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) ++ k = btree_trans_peek_journal(trans, iter, k); + ++ next_update = iter->flags & BTREE_ITER_WITH_UPDATES ++ ? btree_trans_peek_updates(trans, iter->btree_id, search_key) ++ : NULL; + if (next_update && + bpos_cmp(next_update->k.p, + k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { +@@ -2156,6 +2262,20 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + k = bkey_i_to_s_c(next_update); + } + ++ if (k.k && bkey_deleted(k.k)) { ++ /* ++ * If we've got a whiteout, and it's after the search ++ * key, advance the search key to the whiteout instead ++ * of just after the whiteout - it might be a btree ++ * whiteout, with a real key at the same position, since ++ * in the btree deleted keys sort before non deleted. ++ */ ++ search_key = bpos_cmp(search_key, k.k->p) ++ ? 
k.k->p ++ : bpos_successor(k.k->p); ++ continue; ++ } ++ + if (likely(k.k)) { + /* + * We can never have a key in a leaf node at POS_MAX, so +@@ -2199,14 +2319,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + iter->pos.snapshot = iter->snapshot; + +- cmp = bpos_cmp(k.k->p, iter->path->pos); +- if (cmp) { +- iter->path = bch2_btree_path_make_mut(trans, iter->path, +- iter->flags & BTREE_ITER_INTENT, +- btree_iter_ip_allocated(iter)); +- iter->path->pos = k.k->p; +- btree_path_check_sort(trans, iter->path, cmp); +- } ++ iter->path = btree_path_set_pos(trans, iter->path, k.k->p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ BUG_ON(!iter->path->nodes_locked); + out: + iter->path->should_be_locked = true; + +@@ -2247,6 +2363,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + + EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); ++ ++ if (iter->flags & BTREE_ITER_WITH_JOURNAL) ++ return bkey_s_c_err(-EIO); ++ + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + +@@ -2397,17 +2517,24 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + struct bkey_i *next_update; + +- next_update = iter->flags & BTREE_ITER_WITH_UPDATES +- ? btree_trans_peek_updates(trans, iter->btree_id, search_key) +- : NULL; ++ if ((iter->flags & BTREE_ITER_WITH_UPDATES) && ++ (next_update = btree_trans_peek_updates(trans, ++ iter->btree_id, search_key)) && ++ !bpos_cmp(next_update->k.p, iter->pos)) { ++ iter->k = next_update->k; ++ k = bkey_i_to_s_c(next_update); ++ goto out; ++ } + +- if (next_update && ++ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && ++ (next_update = __btree_trans_peek_journal(trans, iter->path)) && + !bpos_cmp(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); +- } else { +- k = bch2_btree_path_peek_slot(iter->path, &iter->k); ++ goto out; + } ++ ++ k = bch2_btree_path_peek_slot(iter->path, &iter->k); + } else { + struct bpos next; + +@@ -2451,7 +2578,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + k = (struct bkey_s_c) { &iter->k, NULL }; + } + } +- ++out: + iter->path->should_be_locked = true; + + bch2_btree_iter_verify_entry_exit(iter); +@@ -2618,6 +2745,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; + ++ if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) ++ flags |= BTREE_ITER_WITH_JOURNAL; ++ + iter->trans = trans; + iter->path = NULL; + iter->btree_id = btree_id; +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 08c49ae3b338..1ace76048a21 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -207,10 +207,11 @@ struct btree_node_iter { + #define BTREE_ITER_CACHED_NOFILL (1 << 8) + #define BTREE_ITER_CACHED_NOCREATE (1 << 9) + #define BTREE_ITER_WITH_UPDATES (1 << 10) +-#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) +-#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +-#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) +-#define BTREE_ITER_NOPRESERVE (1 << 14) ++#define BTREE_ITER_WITH_JOURNAL (1 << 11) ++#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) ++#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) ++#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) ++#define BTREE_ITER_NOPRESERVE (1 << 15) + + enum 
btree_path_uptodate { + BTREE_ITER_UPTODATE = 0, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 6872e56b5c41..8fb2ec3884c6 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -16,6 +16,7 @@ + #include "journal.h" + #include "journal_reclaim.h" + #include "keylist.h" ++#include "recovery.h" + #include "replicas.h" + #include "super-io.h" + +@@ -1146,6 +1147,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && + !btree_ptr_sectors_written(insert)); + ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) ++ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); ++ + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); + if (invalid) { +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 9a1751d4465b..9b45640e75dc 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1558,50 +1558,48 @@ void bch2_stripes_heap_start(struct bch_fs *c) + bch2_stripes_heap_insert(c, m, iter.pos); + } + +-static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) ++int bch2_stripes_read(struct bch_fs *c) + { ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; + const struct bch_stripe *s; +- struct bch_fs *c = trans->c; + struct stripe *m; + unsigned i; +- int ret = 0; ++ int ret; + +- if (k.k->type != KEY_TYPE_stripe) +- return 0; ++ bch2_trans_init(&trans, c, 0, 0); + +- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); +- if (ret) +- return ret; ++ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->type != KEY_TYPE_stripe) ++ continue; + +- s = bkey_s_c_to_stripe(k).v; ++ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); ++ if (ret) ++ break; + +- m = genradix_ptr(&c->stripes, k.k->p.offset); +- m->alive = true; +- m->sectors = le16_to_cpu(s->sectors); +- m->algorithm = s->algorithm; +- m->nr_blocks = s->nr_blocks; +- m->nr_redundant = s->nr_redundant; +- m->blocks_nonempty = 0; ++ s = bkey_s_c_to_stripe(k).v; + +- for (i = 0; i < s->nr_blocks; i++) +- m->blocks_nonempty += !!stripe_blockcount_get(s, i); ++ m = genradix_ptr(&c->stripes, k.k->p.offset); ++ m->alive = true; ++ m->sectors = le16_to_cpu(s->sectors); ++ m->algorithm = s->algorithm; ++ m->nr_blocks = s->nr_blocks; ++ m->nr_redundant = s->nr_redundant; ++ m->blocks_nonempty = 0; + +- spin_lock(&c->ec_stripes_heap_lock); +- bch2_stripes_heap_update(c, m, k.k->p.offset); +- spin_unlock(&c->ec_stripes_heap_lock); +- +- return ret; +-} ++ for (i = 0; i < s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(s, i); + +-int bch2_stripes_read(struct bch_fs *c) +-{ +- struct btree_trans trans; +- int ret; ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_update(c, m, k.k->p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ bch2_trans_iter_exit(&trans, &iter); + +- bch2_trans_init(&trans, c, 0, 0); +- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, +- bch2_stripes_read_fn); + bch2_trans_exit(&trans); ++ + if (ret) + bch_err(c, "error reading stripes: %i", ret); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 219351654564..57311ad283c7 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -59,23 +59,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys) + static int 
__journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, +- struct journal_key *r) ++ const struct journal_key *r) + { + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); + } + +-static int journal_key_cmp(struct journal_key *l, struct journal_key *r) ++static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) + { +- return (cmp_int(l->btree_id, r->btree_id) ?: +- cmp_int(l->level, r->level) ?: +- bpos_cmp(l->k->k.p, r->k->k.p)); ++ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); + } + +-static size_t journal_key_search(struct journal_keys *journal_keys, +- enum btree_id id, unsigned level, +- struct bpos pos) ++size_t bch2_journal_key_search(struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) + { + size_t l = 0, r = journal_keys->nr, m; + +@@ -125,7 +123,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + }; + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; +- unsigned idx = journal_key_search(keys, id, level, k->k.p); ++ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + +@@ -164,6 +162,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + return 0; + } + ++/* ++ * Can only be used from the recovery thread while we're still RO - can't be ++ * used once we've got RW, as journal_keys is at that point used by multiple ++ * threads: ++ */ + int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) + { +@@ -196,7 +199,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) + { + struct journal_keys *keys = &c->journal_keys; +- size_t idx = journal_key_search(keys, btree, level, pos); ++ size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (idx < keys->nr && + keys->d[idx].btree_id == btree && +@@ -207,15 +210,18 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + + static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) + { +- struct journal_key *k = iter->idx - iter->keys->nr +- ? 
iter->keys->d + iter->idx : NULL; ++ struct journal_key *k = iter->keys->d + iter->idx; + +- if (k && +- k->btree_id == iter->btree_id && +- k->level == iter->level) +- return k->k; ++ while (k < iter->keys->d + iter->keys->nr && ++ k->btree_id == iter->btree_id && ++ k->level == iter->level) { ++ if (!k->overwritten) ++ return k->k; ++ ++ iter->idx++; ++ k = iter->keys->d + iter->idx; ++ } + +- iter->idx = iter->keys->nr; + return NULL; + } + +@@ -238,8 +244,7 @@ static void bch2_journal_iter_init(struct bch_fs *c, + iter->btree_id = id; + iter->level = level; + iter->keys = &c->journal_keys; +- iter->idx = journal_key_search(&c->journal_keys, id, level, pos); +- list_add(&iter->list, &c->journal_iters); ++ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); + } + + static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) +@@ -325,106 +330,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) + bch2_journal_iter_exit(&iter->journal); + } + +-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, +- struct bch_fs *c, +- struct btree *b) ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b, ++ struct btree_node_iter node_iter, ++ struct bpos pos) + { + memset(iter, 0, sizeof(*iter)); + + iter->b = b; +- bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); +- bch2_journal_iter_init(c, &iter->journal, +- b->c.btree_id, b->c.level, b->data->min_key); +-} +- +-/* Walk btree, overlaying keys from the journal: */ +- +-static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, +- struct btree_and_journal_iter iter) +-{ +- unsigned i = 0, nr = b->c.level > 1 ? 
2 : 16; +- struct bkey_s_c k; +- struct bkey_buf tmp; +- +- BUG_ON(!b->c.level); +- +- bch2_bkey_buf_init(&tmp); +- +- while (i < nr && +- (k = bch2_btree_and_journal_iter_peek(&iter)).k) { +- bch2_bkey_buf_reassemble(&tmp, c, k); +- +- bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, +- b->c.btree_id, b->c.level - 1); +- +- bch2_btree_and_journal_iter_advance(&iter); +- i++; +- } +- +- bch2_bkey_buf_exit(&tmp, c); +-} +- +-static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b, +- enum btree_id btree_id, +- btree_walk_key_fn key_fn) +-{ +- struct bch_fs *c = trans->c; +- struct btree_and_journal_iter iter; +- struct bkey_s_c k; +- struct bkey_buf tmp; +- struct btree *child; +- int ret = 0; +- +- bch2_bkey_buf_init(&tmp); +- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); +- +- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { +- if (b->c.level) { +- bch2_bkey_buf_reassemble(&tmp, c, k); +- +- child = bch2_btree_node_get_noiter(c, tmp.k, +- b->c.btree_id, b->c.level - 1, +- false); +- +- ret = PTR_ERR_OR_ZERO(child); +- if (ret) +- break; +- +- btree_and_journal_iter_prefetch(c, b, iter); +- +- ret = bch2_btree_and_journal_walk_recurse(trans, child, +- btree_id, key_fn); +- six_unlock_read(&child->c.lock); +- } else { +- ret = key_fn(trans, k); +- } +- +- if (ret) +- break; +- +- bch2_btree_and_journal_iter_advance(&iter); +- } +- +- bch2_btree_and_journal_iter_exit(&iter); +- bch2_bkey_buf_exit(&tmp, c); +- return ret; ++ iter->node_iter = node_iter; ++ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); ++ INIT_LIST_HEAD(&iter->journal.list); + } + +-int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id, +- btree_walk_key_fn key_fn) ++/* ++ * this version is used by btree_gc before filesystem has gone RW and ++ * multithreaded, so uses the journal_iters list: ++ */ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b) + { +- struct bch_fs *c = trans->c; +- struct btree *b = c->btree_roots[btree_id].b; +- int ret = 0; +- +- if (btree_node_fake(b)) +- return 0; +- +- six_lock_read(&b->c.lock, NULL, NULL); +- ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn); +- six_unlock_read(&b->c.lock); ++ struct btree_node_iter node_iter; + +- return ret; ++ bch2_btree_node_iter_init_from_start(&node_iter, b); ++ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); ++ list_add(&iter->journal.list, &c->journal_iters); + } + + /* sort and dedup all keys in the journal: */ +@@ -449,9 +381,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) + const struct journal_key *l = _l; + const struct journal_key *r = _r; + +- return cmp_int(l->btree_id, r->btree_id) ?: +- cmp_int(l->level, r->level) ?: +- bpos_cmp(l->k->k.p, r->k->k.p) ?: ++ return journal_key_cmp(l, r) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); + } +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index a7a9496afb95..21bdad9db249 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -31,6 +31,9 @@ struct btree_and_journal_iter { + } last; + }; + ++size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, ++ unsigned, struct bpos); ++ + int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); + int bch2_journal_key_insert(struct bch_fs *, enum btree_id, +@@ -45,14 +48,13 @@ 
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * + struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); + + void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, struct btree *, ++ struct btree_node_iter, struct bpos); + void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, + struct btree *); + +-typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c); +- +-int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn); +- + void bch2_journal_keys_free(struct journal_keys *); + void bch2_journal_entries_free(struct list_head *); + +-- +cgit v1.2.3 + + +From 5ee154f69ee1aeb96703b635f5f4281cae1bbd17 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Dec 2021 23:10:06 -0500 +Subject: bcachefs: Simplify journal replay + +With BTREE_ITER_WITH_JOURNAL, there's no longer any restrictions on the +order we have to replay keys from the journal in, and we can also start +up journal reclaim right away - and delete a bunch of code. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 3 +- + fs/bcachefs/bcachefs.h | 2 - + fs/bcachefs/btree_key_cache.h | 3 +- + fs/bcachefs/btree_update_interior.c | 5 +- + fs/bcachefs/btree_update_leaf.c | 3 - + fs/bcachefs/journal_reclaim.c | 5 -- + fs/bcachefs/journal_types.h | 1 - + fs/bcachefs/recovery.c | 114 ++++++------------------------------ + 8 files changed, 22 insertions(+), 114 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 4f1a3f6eb9d7..688a53b4ca58 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -902,8 +902,7 @@ static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) + static bool allocator_thread_running(struct bch_dev *ca) + { + unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && +- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) && +- test_bit(BCH_FS_ALLOC_REPLAY_DONE, &ca->fs->flags) ++ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) + ? 
ALLOCATOR_running + : ALLOCATOR_stopped; + alloc_thread_set_state(ca, state); +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 49cb68b4b61d..63c55eebc95c 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -510,8 +510,6 @@ enum { + BCH_FS_INITIAL_GC_DONE, + BCH_FS_INITIAL_GC_UNFIXED, + BCH_FS_TOPOLOGY_REPAIR_DONE, +- BCH_FS_ALLOC_REPLAY_DONE, +- BCH_FS_BTREE_INTERIOR_REPLAY_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, + BCH_FS_RW, +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index 0768ef3ca776..b3d241b13453 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -16,8 +16,7 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + +- return nr_dirty > max_dirty && +- test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); ++ return nr_dirty > max_dirty; + } + + int bch2_btree_key_cache_journal_flush(struct journal *, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 8fb2ec3884c6..e1a5e34e21c1 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -45,7 +45,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + + BUG_ON(!b->c.level); + +- if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; + + bch2_btree_node_iter_init_from_start(&iter, b); +@@ -1851,9 +1851,6 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) + { + struct async_btree_rewrite *a; + +- if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) +- return; +- + if (!percpu_ref_tryget(&c->writes)) + return; + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e18ec78edee5..e2e878b8860e 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -206,9 +206,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + +- EBUG_ON(!insert->level && +- !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); +- + if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, + &insert_l(insert)->iter, insert->k))) + return false; +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index ab9a6d966d5e..52a3935cff53 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -489,9 +489,6 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, + u64 seq; + int err; + +- if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) +- return 0; +- + lockdep_assert_held(&j->reclaim_lock); + + while (1) { +@@ -692,8 +689,6 @@ static int bch2_journal_reclaim_thread(void *arg) + + set_freezable(); + +- kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); +- + j->last_flushed = jiffies; + + while (!ret && !kthread_should_stop()) { +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 54cc69bde1bb..d6d751214116 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -148,7 +148,6 @@ enum journal_space_from { + enum { + JOURNAL_REPLAY_DONE, + JOURNAL_STARTED, +- JOURNAL_RECLAIM_STARTED, + JOURNAL_NEED_WRITE, + JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 
57311ad283c7..cb0ba84711aa 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -474,8 +474,8 @@ static void replay_now_at(struct journal *j, u64 seq) + bch2_journal_pin_put(j, j->replay_journal_seq++); + } + +-static int __bch2_journal_replay_key(struct btree_trans *trans, +- struct journal_key *k) ++static int bch2_journal_replay_key(struct btree_trans *trans, ++ struct journal_key *k) + { + struct btree_iter iter; + unsigned iter_flags = +@@ -484,7 +484,7 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, + int ret; + + if (!k->level && k->btree_id == BTREE_ID_alloc) +- iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; ++ iter_flags |= BTREE_ITER_CACHED; + + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, +@@ -503,29 +503,12 @@ out: + return ret; + } + +-static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) +-{ +- unsigned commit_flags = +- BTREE_INSERT_LAZY_RW| +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_JOURNAL_RESERVED; +- +- if (!k->allocated) +- commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; +- +- return bch2_trans_do(c, NULL, NULL, commit_flags, +- __bch2_journal_replay_key(&trans, k)); +-} +- + static int journal_sort_seq_cmp(const void *_l, const void *_r) + { + const struct journal_key *l = *((const struct journal_key **)_l); + const struct journal_key *r = *((const struct journal_key **)_r); + +- return cmp_int(r->level, l->level) ?: +- cmp_int(l->journal_seq, r->journal_seq) ?: +- cmp_int(l->btree_id, r->btree_id) ?: +- bpos_cmp(l->k->k.p, r->k->k.p); ++ return cmp_int(l->journal_seq, r->journal_seq); + } + + static int bch2_journal_replay(struct bch_fs *c) +@@ -533,10 +516,7 @@ static int bch2_journal_replay(struct bch_fs *c) + struct journal_keys *keys = &c->journal_keys; + struct journal_key **keys_sorted, *k; + struct journal *j = &c->journal; +- struct bch_dev *ca; +- unsigned idx; + size_t i; +- u64 seq; + int ret; + + keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); +@@ -555,73 +535,25 @@ static int bch2_journal_replay(struct bch_fs *c) + replay_now_at(j, keys->journal_seq_base); + } + +- seq = j->replay_journal_seq; +- +- /* +- * First replay updates to the alloc btree - these will only update the +- * btree key cache: +- */ +- for (i = 0; i < keys->nr; i++) { +- k = keys_sorted[i]; +- +- cond_resched(); +- +- if (!k->level && k->btree_id == BTREE_ID_alloc) { +- j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; +- ret = bch2_journal_replay_key(c, k); +- if (ret) +- goto err; +- } +- } +- +- /* Now we can start the allocator threads: */ +- set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); +- for_each_member_device(ca, c, idx) +- bch2_wake_allocator(ca); +- +- /* +- * Next replay updates to interior btree nodes: +- */ +- for (i = 0; i < keys->nr; i++) { +- k = keys_sorted[i]; +- +- cond_resched(); +- +- if (k->level) { +- j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; +- ret = bch2_journal_replay_key(c, k); +- if (ret) +- goto err; +- } +- } +- +- /* +- * Now that the btree is in a consistent state, we can start journal +- * reclaim (which will be flushing entries from the btree key cache back +- * to the btree: +- */ +- set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); +- set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); +- journal_reclaim_kick(j); +- +- j->replay_journal_seq = seq; +- +- /* +- * Now replay leaf node updates: +- */ + for (i = 0; i < keys->nr; i++) { + k = keys_sorted[i]; + + cond_resched(); + +- if 
(k->level || k->btree_id == BTREE_ID_alloc) +- continue; +- +- replay_now_at(j, keys->journal_seq_base + k->journal_seq); ++ if (!k->allocated) ++ replay_now_at(j, keys->journal_seq_base + k->journal_seq); + +- ret = bch2_journal_replay_key(c, k); +- if (ret) ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_JOURNAL_RESERVED| ++ (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0), ++ bch2_journal_replay_key(&trans, k)); ++ if (ret) { ++ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", ++ ret, bch2_btree_ids[k->btree_id], k->level); + goto err; ++ } + } + + replay_now_at(j, j->replay_journal_seq_end); +@@ -629,14 +561,9 @@ static int bch2_journal_replay(struct bch_fs *c) + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); +- kfree(keys_sorted); +- +- return bch2_journal_error(j); ++ ret = bch2_journal_error(j); + err: +- bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", +- ret, bch2_btree_ids[k->btree_id], k->level); + kfree(keys_sorted); +- + return ret; + } + +@@ -1215,7 +1142,8 @@ use_clean: + ret = bch2_journal_replay(c); + if (ret) + goto err; +- bch_verbose(c, "journal replay done"); ++ if (c->opts.verbose || !c->sb.clean) ++ bch_info(c, "journal replay done"); + + if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && + !c->opts.nochanges) { +@@ -1385,10 +1313,6 @@ int bch2_fs_initialize(struct bch_fs *c) + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + +- set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); +- set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); +- set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); +- + err = "unable to allocate journal buckets"; + for_each_online_member(ca, c, i) { + ret = bch2_dev_journal_alloc(ca); +-- +cgit v1.2.3 + + +From 412d2d4014082630214927a677fdceebbc47f044 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 2 Jan 2022 21:45:35 -0500 +Subject: bcachefs: bch_dev->dev + +Add a field to bch_dev for the dev_t of the underlying block device - +this fixes a null ptr deref in tracepoints. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/chardev.c | 5 ++++- + fs/bcachefs/super.c | 4 +++- + include/trace/events/bcachefs.h | 6 +++--- + 4 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 63c55eebc95c..38bdbbfc8622 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -432,6 +432,7 @@ struct bch_dev { + struct bch_sb_handle disk_sb; + struct bch_sb *sb_read_scratch; + int sb_write_error; ++ dev_t dev; + + struct bch_devs_mask self; + +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index db68a78276cf..aa26588ed5ed 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -568,8 +568,11 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + ++ if (!dev) ++ return -EINVAL; ++ + for_each_online_member(ca, c, i) +- if (ca->disk_sb.bdev->bd_dev == dev) { ++ if (ca->dev == dev) { + percpu_ref_put(&ca->io_ref); + return i; + } +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 3afa7ebd7ad8..cda334319bda 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1251,6 +1251,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) + ca->disk_sb.bdev->bd_holder = ca; + memset(sb, 0, sizeof(*sb)); + ++ ca->dev = ca->disk_sb.bdev->bd_dev; ++ + percpu_ref_reinit(&ca->io_ref); + + return 0; +@@ -1869,7 +1871,7 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) +- if (ca->disk_sb.bdev->bd_dev == dev) ++ if (ca->dev == dev) + goto found; + ca = ERR_PTR(-ENOENT); + found: +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 5a409ee19d93..e4e0780bf4e5 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -387,7 +387,7 @@ TRACE_EVENT(alloc_scan, + ), + + TP_fast_assign( +- __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->dev = ca->dev; + __entry->found = found; + __entry->inc_gen = inc_gen; + __entry->inc_gen_skipped = inc_gen_skipped; +@@ -409,7 +409,7 @@ TRACE_EVENT(invalidate, + ), + + TP_fast_assign( +- __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->dev = ca->dev; + __entry->offset = offset, + __entry->sectors = sectors; + ), +@@ -431,7 +431,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, + ), + + TP_fast_assign( +- __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->dev = ca->dev; + __entry->reserve = reserve; + ), + +-- +cgit v1.2.3 + + +From 001a48491968f5cf3c199d200ac0cf8813947815 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 2 Jan 2022 22:24:43 -0500 +Subject: bcachefs: Fix an assertion + +bch2_trans_commit() can legitimately return -ENOSPC with +BTREE_INSERT_NOFAIL set if BTREE_INSERT_NOWAIT was also set. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e2e878b8860e..1ce8ab0b51fe 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -815,7 +815,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, + } + + BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); +- BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL)); ++ BUG_ON(ret == -ENOSPC && ++ !(trans->flags & BTREE_INSERT_NOWAIT) && ++ (trans->flags & BTREE_INSERT_NOFAIL)); + + return ret; + } +-- +cgit v1.2.3 + + +From fbe461875bb29eec451afea6e02d86eb275d708b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 3 Jan 2022 00:14:39 -0500 +Subject: bcachefs: Kill bch2_bset_fix_invalidated_key() + +Was dead code, so delete it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 127 +++-------------------------------------------------- + fs/bcachefs/bset.h | 1 - + 2 files changed, 7 insertions(+), 121 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index a4e0d149e1dc..ac909d1229a1 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -607,10 +607,10 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, + } + + __always_inline +-static inline void __make_bfloat(struct btree *b, struct bset_tree *t, +- unsigned j, +- struct bkey_packed *min_key, +- struct bkey_packed *max_key) ++static inline void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) + { + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); +@@ -679,34 +679,6 @@ static inline void __make_bfloat(struct btree *b, struct bset_tree *t, + f->mantissa = mantissa; + } + +-static void make_bfloat(struct btree *b, struct bset_tree *t, +- unsigned j, +- struct bkey_packed *min_key, +- struct bkey_packed *max_key) +-{ +- struct bkey_i *k; +- +- if (is_power_of_2(j) && +- !min_key->u64s) { +- if (!bkey_pack_pos(min_key, b->data->min_key, b)) { +- k = (void *) min_key; +- bkey_init(&k->k); +- k->k.p = b->data->min_key; +- } +- } +- +- if (is_power_of_2(j + 1) && +- !max_key->u64s) { +- if (!bkey_pack_pos(max_key, b->data->max_key, b)) { +- k = (void *) max_key; +- bkey_init(&k->k); +- k->k.p = b->data->max_key; +- } +- } +- +- __make_bfloat(b, t, j, min_key, max_key); +-} +- + /* bytes remaining - only valid for last bset: */ + static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) + { +@@ -796,9 +768,9 @@ retry: + + /* Then we build the tree */ + eytzinger1_for_each(j, t->size) +- __make_bfloat(b, t, j, +- bkey_to_packed(&min_key), +- bkey_to_packed(&max_key)); ++ make_bfloat(b, t, j, ++ bkey_to_packed(&min_key), ++ bkey_to_packed(&max_key)); + } + + static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +@@ -943,91 +915,6 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + + /* Insert */ + +-static void rw_aux_tree_fix_invalidated_key(struct btree *b, +- struct bset_tree *t, +- struct bkey_packed *k) +-{ +- unsigned offset = __btree_node_key_to_offset(b, k); +- unsigned j = rw_aux_tree_bsearch(b, t, offset); +- +- if (j < t->size && +- rw_aux_tree(b, t)[j].offset == offset) +- rw_aux_tree_set(b, t, j, k); +- +- bch2_bset_verify_rw_aux_tree(b, t); +-} +- +-static void ro_aux_tree_fix_invalidated_key(struct btree *b, +- struct 
bset_tree *t, +- struct bkey_packed *k) +-{ +- struct bkey_packed min_key, max_key; +- unsigned inorder, j; +- +- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); +- +- /* signal to make_bfloat() that they're uninitialized: */ +- min_key.u64s = max_key.u64s = 0; +- +- if (bkey_next(k) == btree_bkey_last(b, t)) { +- for (j = 1; j < t->size; j = j * 2 + 1) +- make_bfloat(b, t, j, &min_key, &max_key); +- } +- +- inorder = bkey_to_cacheline(b, t, k); +- +- if (inorder && +- inorder < t->size) { +- j = __inorder_to_eytzinger1(inorder, t->size, t->extra); +- +- if (k == tree_to_bkey(b, t, j)) { +- /* Fix the node this key corresponds to */ +- make_bfloat(b, t, j, &min_key, &max_key); +- +- /* Children for which this key is the right boundary */ +- for (j = eytzinger1_left_child(j); +- j < t->size; +- j = eytzinger1_right_child(j)) +- make_bfloat(b, t, j, &min_key, &max_key); +- } +- } +- +- if (inorder + 1 < t->size) { +- j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); +- +- if (k == tree_to_prev_bkey(b, t, j)) { +- make_bfloat(b, t, j, &min_key, &max_key); +- +- /* Children for which this key is the left boundary */ +- for (j = eytzinger1_right_child(j); +- j < t->size; +- j = eytzinger1_left_child(j)) +- make_bfloat(b, t, j, &min_key, &max_key); +- } +- } +-} +- +-/** +- * bch2_bset_fix_invalidated_key() - given an existing key @k that has been +- * modified, fix any auxiliary search tree by remaking all the nodes in the +- * auxiliary search tree that @k corresponds to +- */ +-void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) +-{ +- struct bset_tree *t = bch2_bkey_to_bset(b, k); +- +- switch (bset_aux_tree_type(t)) { +- case BSET_NO_AUX_TREE: +- break; +- case BSET_RO_AUX_TREE: +- ro_aux_tree_fix_invalidated_key(b, t, k); +- break; +- case BSET_RW_AUX_TREE: +- rw_aux_tree_fix_invalidated_key(b, t, k); +- break; +- } +-} +- + static void bch2_bset_fix_lookup_table(struct btree *b, + struct bset_tree *t, + struct bkey_packed *_where, +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index e42f866cf2ec..0d46534c3dcd 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -361,7 +361,6 @@ void bch2_bset_init_first(struct btree *, struct bset *); + void bch2_bset_init_next(struct bch_fs *, struct btree *, + struct btree_node_entry *); + void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); +-void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); + + void bch2_bset_insert(struct btree *, struct btree_node_iter *, + struct bkey_packed *, struct bkey_i *, unsigned); +-- +cgit v1.2.3 + + +From 2e213ac23f8a9bb508394facf5db71187f59db2f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 3 Jan 2022 00:22:29 -0500 +Subject: bcachefs: Make eytzinger size parameter more conventional + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bset.c | 14 +++++++------- + fs/bcachefs/eytzinger.h | 48 ++++++++++++++++++++++-------------------------- + 2 files changed, 29 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index ac909d1229a1..6000a8796bc5 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -473,7 +473,7 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b, + unsigned j) + { + return cacheline_to_bkey(b, t, +- __eytzinger1_to_inorder(j, t->size, t->extra), ++ __eytzinger1_to_inorder(j, t->size - 1, t->extra), + bkey_float(b, t, j)->key_offset); + } + +@@ -735,7 +735,7 @@ retry: + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 
1; + + /* First we figure out where the first key in each cacheline is */ +- eytzinger1_for_each(j, t->size) { ++ eytzinger1_for_each(j, t->size - 1) { + while (bkey_to_cacheline(b, t, k) < cacheline) + prev = k, k = bkey_next(k); + +@@ -767,7 +767,7 @@ retry: + } + + /* Then we build the tree */ +- eytzinger1_for_each(j, t->size) ++ eytzinger1_for_each(j, t->size - 1) + make_bfloat(b, t, j, + bkey_to_packed(&min_key), + bkey_to_packed(&max_key)); +@@ -869,7 +869,7 @@ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, + do { + p = j ? tree_to_bkey(b, t, + __inorder_to_eytzinger1(j--, +- t->size, t->extra)) ++ t->size - 1, t->extra)) + : btree_bkey_first(b, t); + } while (p >= k); + break; +@@ -1149,7 +1149,7 @@ slowpath: + n = n * 2 + (cmp < 0); + } while (n < t->size); + +- inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); ++ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); + + /* + * n would have been the node we recursed to - the low bit tells us if +@@ -1160,7 +1160,7 @@ slowpath: + if (unlikely(!inorder)) + return btree_bkey_first(b, t); + +- f = &base->f[eytzinger1_prev(n >> 1, t->size)]; ++ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; + } + + return cacheline_to_bkey(b, t, inorder, f->key_offset); +@@ -1577,7 +1577,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + if (!inorder || inorder >= t->size) + return; + +- j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); + if (k != tree_to_bkey(b, t, j)) + return; + +diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h +index 26d5cad7e6a5..05429c9631cd 100644 +--- a/fs/bcachefs/eytzinger.h ++++ b/fs/bcachefs/eytzinger.h +@@ -17,10 +17,6 @@ + * + * With one based indexing each level of the tree starts at a power of two - + * good for cacheline alignment: +- * +- * Size parameter is treated as if we were using 0 based indexing, however: +- * valid nodes, and inorder indices, are in the range [1..size) - that is, there +- * are actually size - 1 elements + */ + + static inline unsigned eytzinger1_child(unsigned i, unsigned child) +@@ -42,12 +38,12 @@ static inline unsigned eytzinger1_right_child(unsigned i) + + static inline unsigned eytzinger1_first(unsigned size) + { +- return rounddown_pow_of_two(size - 1); ++ return rounddown_pow_of_two(size); + } + + static inline unsigned eytzinger1_last(unsigned size) + { +- return rounddown_pow_of_two(size) - 1; ++ return rounddown_pow_of_two(size + 1) - 1; + } + + /* +@@ -62,13 +58,13 @@ static inline unsigned eytzinger1_last(unsigned size) + + static inline unsigned eytzinger1_next(unsigned i, unsigned size) + { +- EBUG_ON(i >= size); ++ EBUG_ON(i > size); + +- if (eytzinger1_right_child(i) < size) { ++ if (eytzinger1_right_child(i) <= size) { + i = eytzinger1_right_child(i); + +- i <<= __fls(size) - __fls(i); +- i >>= i >= size; ++ i <<= __fls(size + 1) - __fls(i); ++ i >>= i > size; + } else { + i >>= ffz(i) + 1; + } +@@ -78,14 +74,14 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) + + static inline unsigned eytzinger1_prev(unsigned i, unsigned size) + { +- EBUG_ON(i >= size); ++ EBUG_ON(i > size); + +- if (eytzinger1_left_child(i) < size) { ++ if (eytzinger1_left_child(i) <= size) { + i = eytzinger1_left_child(i) + 1; + +- i <<= __fls(size) - __fls(i); ++ i <<= __fls(size + 1) - __fls(i); + i -= 1; +- i >>= i >= size; ++ i >>= i > size; + } else { + i >>= __ffs(i) + 1; + } +@@ -95,17 +91,17 @@ static inline unsigned 
eytzinger1_prev(unsigned i, unsigned size) + + static inline unsigned eytzinger1_extra(unsigned size) + { +- return (size - rounddown_pow_of_two(size - 1)) << 1; ++ return (size + 1 - rounddown_pow_of_two(size)) << 1; + } + + static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, + unsigned extra) + { + unsigned b = __fls(i); +- unsigned shift = __fls(size - 1) - b; ++ unsigned shift = __fls(size) - b; + int s; + +- EBUG_ON(!i || i >= size); ++ EBUG_ON(!i || i > size); + + i ^= 1U << b; + i <<= 1; +@@ -130,7 +126,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, + unsigned shift; + int s; + +- EBUG_ON(!i || i >= size); ++ EBUG_ON(!i || i > size); + + /* + * sign bit trick: +@@ -144,7 +140,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, + shift = __ffs(i); + + i >>= shift + 1; +- i |= 1U << (__fls(size - 1) - shift); ++ i |= 1U << (__fls(size) - shift); + + return i; + } +@@ -185,39 +181,39 @@ static inline unsigned eytzinger0_right_child(unsigned i) + + static inline unsigned eytzinger0_first(unsigned size) + { +- return eytzinger1_first(size + 1) - 1; ++ return eytzinger1_first(size) - 1; + } + + static inline unsigned eytzinger0_last(unsigned size) + { +- return eytzinger1_last(size + 1) - 1; ++ return eytzinger1_last(size) - 1; + } + + static inline unsigned eytzinger0_next(unsigned i, unsigned size) + { +- return eytzinger1_next(i + 1, size + 1) - 1; ++ return eytzinger1_next(i + 1, size) - 1; + } + + static inline unsigned eytzinger0_prev(unsigned i, unsigned size) + { +- return eytzinger1_prev(i + 1, size + 1) - 1; ++ return eytzinger1_prev(i + 1, size) - 1; + } + + static inline unsigned eytzinger0_extra(unsigned size) + { +- return eytzinger1_extra(size + 1); ++ return eytzinger1_extra(size); + } + + static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, + unsigned extra) + { +- return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; ++ return __eytzinger1_to_inorder(i + 1, size, extra) - 1; + } + + static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, + unsigned extra) + { +- return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; ++ return __inorder_to_eytzinger1(i + 1, size, extra) - 1; + } + + static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) +-- +cgit v1.2.3 + + +From a7eabf32f5ac7dd64625917458577f0193f60116 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 3 Jan 2022 04:17:02 -0500 +Subject: bcachefs: Use kvmalloc() for array of sorted keys in journal replay + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index cb0ba84711aa..e4ba3f0aef4a 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -519,7 +519,7 @@ static int bch2_journal_replay(struct bch_fs *c) + size_t i; + int ret; + +- keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); ++ keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); + if (!keys_sorted) + return -ENOMEM; + +@@ -563,7 +563,7 @@ static int bch2_journal_replay(struct bch_fs *c) + bch2_journal_flush_all_pins(j); + ret = bch2_journal_error(j); + err: +- kfree(keys_sorted); ++ kvfree(keys_sorted); + return ret; + } + +-- +cgit v1.2.3 + + +From 79a6bb82530005ea20e7149f5c8e9fc04a099d79 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 3 Jan 2022 23:38:50 -0500 +Subject: bcachefs: Improved 
superblock-related error messages + +This patch converts bch2_sb_validate() and the .validate methods for the +various superblock sections to take printbuf, to which they can print +detailed error messages, including printing the entire section that was +invalid. + +This is a great improvement over the previous situation, where we could +only return static strings that didn't have precise information about +what was wrong. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/disk_groups.c | 62 ++--- + fs/bcachefs/journal_seq_blacklist.c | 37 +-- + fs/bcachefs/quota.c | 12 +- + fs/bcachefs/replicas.c | 137 +++++------ + fs/bcachefs/super-io.c | 466 ++++++++++++++++++++++-------------- + fs/bcachefs/super-io.h | 7 +- + fs/bcachefs/super.c | 126 +++------- + fs/bcachefs/super.h | 1 - + 8 files changed, 450 insertions(+), 398 deletions(-) + +diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c +index c52b6faac9b4..6c84297ef265 100644 +--- a/fs/bcachefs/disk_groups.c ++++ b/fs/bcachefs/disk_groups.c +@@ -17,24 +17,20 @@ static int group_cmp(const void *_l, const void *_r) + strncmp(l->label, r->label, sizeof(l->label)); + } + +-static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, +- struct bch_sb_field *f) ++static int bch2_sb_disk_groups_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) + { + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g, *sorted = NULL; +- struct bch_sb_field_members *mi; +- struct bch_member *m; +- unsigned i, nr_groups, len; +- const char *err = NULL; +- +- mi = bch2_sb_get_members(sb); +- groups = bch2_sb_get_disk_groups(sb); +- nr_groups = disk_groups_nr(groups); ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ unsigned nr_groups = disk_groups_nr(groups); ++ unsigned i, len; ++ int ret = -EINVAL; + +- for (m = mi->members; +- m < mi->members + sb->nr_devices; +- m++) { ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; + unsigned g; + + if (!BCH_MEMBER_GROUP(m)) +@@ -42,45 +38,53 @@ static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, + + g = BCH_MEMBER_GROUP(m) - 1; + +- if (g >= nr_groups || +- BCH_GROUP_DELETED(&groups->entries[g])) +- return "disk has invalid group"; ++ if (g >= nr_groups) { ++ pr_buf(err, "disk %u has invalid label %u (have %u)", ++ i, g, nr_groups); ++ return -EINVAL; ++ } ++ ++ if (BCH_GROUP_DELETED(&groups->entries[g])) { ++ pr_buf(err, "disk %u has deleted label %u", i, g); ++ return -EINVAL; ++ } + } + + if (!nr_groups) +- return NULL; ++ return 0; ++ ++ for (i = 0; i < nr_groups; i++) { ++ g = groups->entries + i; + +- for (g = groups->entries; +- g < groups->entries + nr_groups; +- g++) { + if (BCH_GROUP_DELETED(g)) + continue; + + len = strnlen(g->label, sizeof(g->label)); + if (!len) { +- err = "group with empty label"; +- goto err; ++ pr_buf(err, "label %u empty", i); ++ return -EINVAL; + } + } + + sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); + if (!sorted) +- return "cannot allocate memory"; ++ return -ENOMEM; + + memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); + sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); + +- for (i = 0; i + 1 < nr_groups; i++) +- if (!BCH_GROUP_DELETED(sorted + i) && +- !group_cmp(sorted + i, sorted + i + 1)) { +- err = "duplicate groups"; ++ for (g = sorted; g + 1 < sorted + nr_groups; g++) ++ if (!BCH_GROUP_DELETED(g) && ++ !group_cmp(&g[0], &g[1])) { ++ pr_buf(err, "duplicate label %llu.", 
BCH_GROUP_PARENT(g)); ++ bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label))); + goto err; + } + +- err = NULL; ++ ret = 0; + err: + kfree(sorted); +- return err; ++ return 0; + } + + static void bch2_sb_disk_groups_to_text(struct printbuf *out, +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index 10bd23e969d2..428377e73a8d 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -189,27 +189,34 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) + return 0; + } + +-static const char * +-bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, +- struct bch_sb_field *f) ++static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) + { + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); +- struct journal_seq_blacklist_entry *i; +- unsigned nr = blacklist_nr_entries(bl); ++ unsigned i, nr = blacklist_nr_entries(bl); + +- for (i = bl->start; i < bl->start + nr; i++) { +- if (le64_to_cpu(i->start) >= +- le64_to_cpu(i->end)) +- return "entry start >= end"; +- +- if (i + 1 < bl->start + nr && +- le64_to_cpu(i[0].end) > +- le64_to_cpu(i[1].start)) +- return "entries out of order"; ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = bl->start + i; ++ ++ if (le64_to_cpu(e->start) >= ++ le64_to_cpu(e->end)) { ++ pr_buf(err, "entry %u start >= end (%llu >= %llu)", ++ i, le64_to_cpu(e->start), le64_to_cpu(e->end)); ++ return -EINVAL; ++ } ++ ++ if (i + 1 < nr && ++ le64_to_cpu(e[0].end) > ++ le64_to_cpu(e[1].start)) { ++ pr_buf(err, "entry %u out of order with next entry (%llu > %llu)", ++ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); ++ return -EINVAL; ++ } + } + +- return NULL; ++ return 0; + } + + static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 54bb2a454a5e..6fb8224f565e 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -6,15 +6,17 @@ + #include "subvolume.h" + #include "super-io.h" + +-static const char *bch2_sb_validate_quota(struct bch_sb *sb, +- struct bch_sb_field *f) ++static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) + { + struct bch_sb_field_quota *q = field_to_type(f, quota); + +- if (vstruct_bytes(&q->field) != sizeof(*q)) +- return "invalid field quota: wrong size"; ++ if (vstruct_bytes(&q->field) < sizeof(*q)) { ++ pr_buf(err, "wrong size (got %llu should be %zu)", ++ vstruct_bytes(&q->field), sizeof(*q)); ++ } + +- return NULL; ++ return 0; + } + + const struct bch_sb_field_ops bch_sb_field_ops_quota = { +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 6c5ea78d6762..a08f1e084a9d 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -41,18 +41,19 @@ void bch2_replicas_entry_to_text(struct printbuf *out, + { + unsigned i; + +- pr_buf(out, "%s: %u/%u [", +- bch2_data_types[e->data_type], +- e->nr_required, +- e->nr_devs); ++ if (e->data_type < BCH_DATA_NR) ++ pr_buf(out, "%s", bch2_data_types[e->data_type]); ++ else ++ pr_buf(out, "(invalid data type %u)", e->data_type); + ++ pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs); + for (i = 0; i < e->nr_devs; i++) + pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); + pr_buf(out, "]"); + } + + void bch2_cpu_replicas_to_text(struct printbuf *out, +- struct bch_replicas_cpu *r) ++ struct bch_replicas_cpu *r) + { + struct bch_replicas_entry *e; + bool first = true; +@@ -808,67 +809,78 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, + return 0; + } + +-static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) ++static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, ++ struct bch_sb *sb, ++ struct printbuf *err) + { +- unsigned i; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ unsigned i, j; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + +- for (i = 0; i + 1 < cpu_r->nr; i++) { +- struct bch_replicas_entry *l = ++ for (i = 0; i < cpu_r->nr; i++) { ++ struct bch_replicas_entry *e = + cpu_replicas_entry(cpu_r, i); +- struct bch_replicas_entry *r = +- cpu_replicas_entry(cpu_r, i + 1); +- +- BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); + +- if (!memcmp(l, r, cpu_r->entry_size)) +- return "duplicate replicas entry"; +- } ++ if (e->data_type >= BCH_DATA_NR) { ++ pr_buf(err, "invalid data type in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } + +- return NULL; +-} ++ if (!e->nr_devs) { ++ pr_buf(err, "no devices in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } + +-static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) +-{ +- struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); +- struct bch_sb_field_members *mi = bch2_sb_get_members(sb); +- struct bch_replicas_cpu cpu_r = { .entries = NULL }; +- struct bch_replicas_entry *e; +- const char *err; +- unsigned i; ++ if (e->nr_required > 1 && ++ e->nr_required >= e->nr_devs) { ++ pr_buf(err, "bad nr_required in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } + +- for_each_replicas_entry(sb_r, e) { +- err = "invalid replicas entry: invalid data type"; +- if (e->data_type >= BCH_DATA_NR) +- goto err; ++ for (j = 0; j < e->nr_devs; j++) ++ if (!bch2_dev_exists(sb, mi, e->devs[j])) { ++ pr_buf(err, "invalid device %u in entry ", e->devs[j]); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } + +- err = "invalid replicas entry: no devices"; +- if (!e->nr_devs) +- goto err; ++ if (i + 1 < cpu_r->nr) { ++ struct bch_replicas_entry *n = ++ cpu_replicas_entry(cpu_r, i + 1); + +- err = "invalid replicas entry: bad nr_required"; +- if (e->nr_required > 1 && +- e->nr_required >= e->nr_devs) +- goto err; ++ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); + +- err = "invalid replicas entry: invalid device"; +- for (i = 0; i < e->nr_devs; i++) +- if (!bch2_dev_exists(sb, mi, e->devs[i])) +- goto err; ++ if (!memcmp(e, n, cpu_r->entry_size)) { ++ pr_buf(err, "duplicate replicas entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ } + } + +- err = "cannot allocate memory"; ++ return 0; ++} ++ ++static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); ++ struct bch_replicas_cpu cpu_r; ++ int ret; ++ + if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) +- goto err; ++ return -ENOMEM; + +- err = check_dup_replicas_entries(&cpu_r); +-err: ++ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); +- return err; ++ return ret; + } + + static void bch2_sb_replicas_to_text(struct printbuf *out, +@@ -893,38 
+905,19 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = { + .to_text = bch2_sb_replicas_to_text, + }; + +-static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) ++static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) + { + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); +- struct bch_sb_field_members *mi = bch2_sb_get_members(sb); +- struct bch_replicas_cpu cpu_r = { .entries = NULL }; +- struct bch_replicas_entry_v0 *e; +- const char *err; +- unsigned i; ++ struct bch_replicas_cpu cpu_r; ++ int ret; + +- for_each_replicas_entry_v0(sb_r, e) { +- err = "invalid replicas entry: invalid data type"; +- if (e->data_type >= BCH_DATA_NR) +- goto err; +- +- err = "invalid replicas entry: no devices"; +- if (!e->nr_devs) +- goto err; +- +- err = "invalid replicas entry: invalid device"; +- for (i = 0; i < e->nr_devs; i++) +- if (!bch2_dev_exists(sb, mi, e->devs[i])) +- goto err; +- } +- +- err = "cannot allocate memory"; + if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) +- goto err; ++ return -ENOMEM; + +- err = check_dup_replicas_entries(&cpu_r); +-err: ++ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); +- return err; ++ return ret; + } + + const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 8e28a13aaf95..49dafdad77cd 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -27,8 +27,8 @@ const char * const bch2_sb_fields[] = { + NULL + }; + +-static const char *bch2_sb_field_validate(struct bch_sb *, +- struct bch_sb_field *); ++static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, ++ struct printbuf *); + + struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, + enum bch_sb_field_type type) +@@ -202,22 +202,31 @@ static inline void __bch2_sb_layout_size_assert(void) + BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); + } + +-static const char *validate_sb_layout(struct bch_sb_layout *layout) ++static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) + { + u64 offset, prev_offset, max_sectors; + unsigned i; + +- if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) +- return "Not a bcachefs superblock layout"; ++ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) { ++ pr_buf(out, "Not a bcachefs superblock layout"); ++ return -EINVAL; ++ } + +- if (layout->layout_type != 0) +- return "Invalid superblock layout type"; ++ if (layout->layout_type != 0) { ++ pr_buf(out, "Invalid superblock layout type %u", ++ layout->layout_type); ++ return -EINVAL; ++ } + +- if (!layout->nr_superblocks) +- return "Invalid superblock layout: no superblocks"; ++ if (!layout->nr_superblocks) { ++ pr_buf(out, "Invalid superblock layout: no superblocks"); ++ return -EINVAL; ++ } + +- if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) +- return "Invalid superblock layout: too many superblocks"; ++ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { ++ pr_buf(out, "Invalid superblock layout: too many superblocks"); ++ return -EINVAL; ++ } + + max_sectors = 1 << layout->sb_max_size_bits; + +@@ -226,122 +235,134 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) + for (i = 1; i < layout->nr_superblocks; i++) { + offset = le64_to_cpu(layout->sb_offset[i]); + +- if (offset < prev_offset + max_sectors) +- return "Invalid superblock layout: superblocks overlap"; ++ if (offset < prev_offset + 
max_sectors) { ++ pr_buf(out, "Invalid superblock layout: superblocks overlap\n" ++ " (sb %u ends at %llu next starts at %llu", ++ i - 1, prev_offset + max_sectors, offset); ++ return -EINVAL; ++ } + prev_offset = offset; + } + +- return NULL; ++ return 0; + } + +-const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) ++static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) + { + struct bch_sb *sb = disk_sb->sb; + struct bch_sb_field *f; + struct bch_sb_field_members *mi; +- const char *err; + u32 version, version_min; + u16 block_size; ++ int ret; + + version = le16_to_cpu(sb->version); + version_min = version >= bcachefs_metadata_version_new_versioning + ? le16_to_cpu(sb->version_min) + : version; + +- if (version >= bcachefs_metadata_version_max || +- version_min < bcachefs_metadata_version_min) +- return "Unsupported superblock version"; ++ if (version >= bcachefs_metadata_version_max) { ++ pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", ++ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } + +- if (version_min > version) +- return "Bad minimum version"; ++ if (version_min < bcachefs_metadata_version_min) { ++ pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", ++ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min > version) { ++ pr_buf(out, "Bad minimum version %u, greater than version field %u", ++ version_min, version); ++ return -EINVAL; ++ } + + if (sb->features[1] || +- (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) +- return "Filesystem has incompatible features"; ++ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { ++ pr_buf(out, "Filesystem has incompatible features"); ++ return -EINVAL; ++ } + + block_size = le16_to_cpu(sb->block_size); + +- if (block_size > PAGE_SECTORS) +- return "Bad block size"; ++ if (block_size > PAGE_SECTORS) { ++ pr_buf(out, "Block size too big (got %u, max %u)", ++ block_size, PAGE_SECTORS); ++ return -EINVAL; ++ } + +- if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) +- return "Bad user UUID"; ++ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) { ++ pr_buf(out, "Bad user UUID (got zeroes)"); ++ return -EINVAL; ++ } + +- if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) +- return "Bad internal UUID"; ++ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) { ++ pr_buf(out, "Bad intenal UUID (got zeroes)"); ++ return -EINVAL; ++ } + + if (!sb->nr_devices || +- sb->nr_devices <= sb->dev_idx || +- sb->nr_devices > BCH_SB_MEMBERS_MAX) +- return "Bad number of member devices"; +- +- if (!BCH_SB_META_REPLICAS_WANT(sb) || +- BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) +- return "Invalid number of metadata replicas"; +- +- if (!BCH_SB_META_REPLICAS_REQ(sb) || +- BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) +- return "Invalid number of metadata replicas"; +- +- if (!BCH_SB_DATA_REPLICAS_WANT(sb) || +- BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) +- return "Invalid number of data replicas"; +- +- if (!BCH_SB_DATA_REPLICAS_REQ(sb) || +- BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) +- return "Invalid number of data replicas"; +- +- if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) +- return "Invalid metadata checksum type"; +- +- if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) +- return "Invalid metadata checksum type"; +- +- if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) +- return "Invalid compression type"; +- +- if 
(!BCH_SB_BTREE_NODE_SIZE(sb)) +- return "Btree node size not set"; ++ sb->nr_devices > BCH_SB_MEMBERS_MAX) { ++ pr_buf(out, "Bad number of member devices %u (max %u)", ++ sb->nr_devices, BCH_SB_MEMBERS_MAX); ++ return -EINVAL; ++ } + +- if (BCH_SB_GC_RESERVE(sb) < 5) +- return "gc reserve percentage too small"; ++ if (sb->dev_idx >= sb->nr_devices) { ++ pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)", ++ sb->dev_idx, sb->nr_devices); ++ return -EINVAL; ++ } + + if (!sb->time_precision || +- le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) +- return "invalid time precision"; ++ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { ++ pr_buf(out, "Invalid time precision: %u (min 1, max %lu)", ++ le32_to_cpu(sb->time_precision), NSEC_PER_SEC); ++ return -EINVAL; ++ } + + /* validate layout */ +- err = validate_sb_layout(&sb->layout); +- if (err) +- return err; ++ ret = validate_sb_layout(&sb->layout, out); ++ if (ret) ++ return ret; + + vstruct_for_each(sb, f) { +- if (!f->u64s) +- return "Invalid superblock: invalid optional field"; ++ if (!f->u64s) { ++ pr_buf(out, "Invalid superblock: optional with size 0 (type %u)", ++ le32_to_cpu(f->type)); ++ return -EINVAL; ++ } + +- if (vstruct_next(f) > vstruct_last(sb)) +- return "Invalid superblock: invalid optional field"; ++ if (vstruct_next(f) > vstruct_last(sb)) { ++ pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", ++ le32_to_cpu(f->type)); ++ return -EINVAL; ++ } + } + + /* members must be validated first: */ + mi = bch2_sb_get_members(sb); +- if (!mi) +- return "Invalid superblock: member info area missing"; ++ if (!mi) { ++ pr_buf(out, "Invalid superblock: member info area missing"); ++ return -EINVAL; ++ } + +- err = bch2_sb_field_validate(sb, &mi->field); +- if (err) +- return err; ++ ret = bch2_sb_field_validate(sb, &mi->field, out); ++ if (ret) ++ return ret; + + vstruct_for_each(sb, f) { + if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) + continue; + +- err = bch2_sb_field_validate(sb, f); +- if (err) +- return err; ++ ret = bch2_sb_field_validate(sb, f, out); ++ if (ret) ++ return ret; + } + +- return NULL; ++ return 0; + } + + /* device open: */ +@@ -470,10 +491,12 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) + + /* read superblock: */ + +-static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) ++static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) + { + struct bch_csum csum; ++ u32 version, version_min; + size_t bytes; ++ int ret; + reread: + bio_reset(sb->bio); + bio_set_dev(sb->bio, sb->bdev); +@@ -481,40 +504,65 @@ reread: + bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); + bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); + +- if (submit_bio_wait(sb->bio)) +- return "IO error"; ++ ret = submit_bio_wait(sb->bio); ++ if (ret) { ++ pr_buf(err, "IO error: %i", ret); ++ return ret; ++ } + +- if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) +- return "Not a bcachefs superblock"; ++ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) { ++ pr_buf(err, "Not a bcachefs superblock"); ++ return -EINVAL; ++ } + +- if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || +- le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) +- return "Unsupported superblock version"; ++ version = le16_to_cpu(sb->sb->version); ++ version_min = version >= bcachefs_metadata_version_new_versioning ++ ? 
le16_to_cpu(sb->sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max) { ++ pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", ++ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min < bcachefs_metadata_version_min) { ++ pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", ++ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } + + bytes = vstruct_bytes(sb->sb); + +- if (bytes > 512 << sb->sb->layout.sb_max_size_bits) +- return "Bad superblock: too big"; ++ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { ++ pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", ++ bytes, 512UL << sb->sb->layout.sb_max_size_bits); ++ return -EINVAL; ++ } + + if (bytes > sb->buffer_size) { + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) +- return "cannot allocate memory"; ++ return -ENOMEM; + goto reread; + } + +- if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) +- return "unknown csum type"; ++ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { ++ pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); ++ return -EINVAL; ++ } + + /* XXX: verify MACs */ + csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), + null_nonce(), sb->sb); + +- if (bch2_crc_cmp(csum, sb->sb->csum)) +- return "bad checksum reading superblock"; ++ if (bch2_crc_cmp(csum, sb->sb->csum)) { ++ pr_buf(err, "bad checksum"); ++ return -EINVAL; ++ } + + sb->seq = le64_to_cpu(sb->sb->seq); + +- return NULL; ++ return 0; + } + + int bch2_read_super(const char *path, struct bch_opts *opts, +@@ -522,10 +570,16 @@ int bch2_read_super(const char *path, struct bch_opts *opts, + { + u64 offset = opt_get(*opts, sb); + struct bch_sb_layout layout; +- const char *err; ++ char *_err; ++ struct printbuf err; + __le64 *i; + int ret; + ++ _err = kmalloc(4096, GFP_KERNEL); ++ if (!_err) ++ return -ENOMEM; ++ err = _PBUF(_err, 4096); ++ + pr_verbose_init(*opts, ""); + + memset(sb, 0, sizeof(*sb)); +@@ -554,25 +608,28 @@ int bch2_read_super(const char *path, struct bch_opts *opts, + goto out; + } + +- err = "cannot allocate memory"; + ret = bch2_sb_realloc(sb, 0); +- if (ret) ++ if (ret) { ++ pr_buf(&err, "error allocating memory for superblock"); + goto err; ++ } + +- ret = -EFAULT; +- err = "dynamic fault"; +- if (bch2_fs_init_fault("read_super")) ++ if (bch2_fs_init_fault("read_super")) { ++ pr_buf(&err, "dynamic fault"); ++ ret = -EFAULT; + goto err; ++ } + +- ret = -EINVAL; +- err = read_one_super(sb, offset); +- if (!err) ++ ret = read_one_super(sb, offset, &err); ++ if (!ret) + goto got_super; + + if (opt_defined(*opts, sb)) + goto err; + +- pr_err("error reading default superblock: %s", err); ++ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", ++ path, _err); ++ err = _PBUF(_err, 4096); + + /* + * Error reading primary superblock - read location of backup +@@ -588,13 +645,15 @@ int bch2_read_super(const char *path, struct bch_opts *opts, + */ + bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); + +- err = "IO error"; +- if (submit_bio_wait(sb->bio)) ++ ret = submit_bio_wait(sb->bio); ++ if (ret) { ++ pr_buf(&err, "IO error: %i", ret); + goto err; ++ } + + memcpy(&layout, sb->sb, sizeof(layout)); +- err = validate_sb_layout(&layout); +- if (err) ++ ret = validate_sb_layout(&layout, &err); ++ if (ret) + goto err; + + for (i = layout.sb_offset; +@@ -604,32 +663,39 @@ int bch2_read_super(const char *path, struct bch_opts 
*opts, + if (offset == opt_get(*opts, sb)) + continue; + +- err = read_one_super(sb, offset); +- if (!err) ++ ret = read_one_super(sb, offset, &err); ++ if (!ret) + goto got_super; + } + +- ret = -EINVAL; + goto err; + + got_super: +- err = "Superblock block size smaller than device block size"; +- ret = -EINVAL; + if (le16_to_cpu(sb->sb->block_size) << 9 < + bdev_logical_block_size(sb->bdev)) { +- pr_err("error reading superblock: Superblock block size (%u) smaller than device block size (%u)", ++ pr_buf(&err, "block size (%u) smaller than device block size (%u)", + le16_to_cpu(sb->sb->block_size) << 9, + bdev_logical_block_size(sb->bdev)); +- goto err_no_print; ++ ret = -EINVAL; ++ goto err; + } + + ret = 0; + sb->have_layout = true; ++ ++ ret = bch2_sb_validate(sb, &err); ++ if (ret) { ++ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", ++ path, _err); ++ goto err_no_print; ++ } + out: + pr_verbose_init(*opts, "ret %i", ret); ++ kfree(_err); + return ret; + err: +- pr_err("error reading superblock: %s", err); ++ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", ++ path, _err); + err_no_print: + bch2_free_super(sb); + goto out; +@@ -704,7 +770,6 @@ int bch2_write_super(struct bch_fs *c) + struct closure *cl = &c->sb_write; + struct bch_dev *ca; + unsigned i, sb = 0, nr_wrote; +- const char *err; + struct bch_devs_mask sb_written; + bool wrote, can_mount_without_written, can_mount_with_written; + unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; +@@ -731,10 +796,19 @@ int bch2_write_super(struct bch_fs *c) + bch2_sb_from_fs(c, ca); + + for_each_online_member(ca, c, i) { +- err = bch2_sb_validate(&ca->disk_sb); +- if (err) { +- bch2_fs_inconsistent(c, "sb invalid before write: %s", err); +- ret = -1; ++ struct printbuf buf = { NULL, NULL }; ++ ++ ret = bch2_sb_validate(&ca->disk_sb, &buf); ++ if (ret) { ++ char *_buf = kmalloc(4096, GFP_NOFS); ++ if (_buf) { ++ buf = _PBUF(_buf, 4096); ++ bch2_sb_validate(&ca->disk_sb, &buf); ++ } ++ ++ bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf); ++ kfree(_buf); ++ percpu_ref_put(&ca->io_ref); + goto out; + } + } +@@ -847,54 +921,57 @@ static int u64_cmp(const void *_l, const void *_r) + return l < r ? -1 : l > r ? 
1 : 0; + } + +-static const char *bch2_sb_validate_journal(struct bch_sb *sb, +- struct bch_sb_field *f) ++static int bch2_sb_validate_journal(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) + { + struct bch_sb_field_journal *journal = field_to_type(f, journal); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; +- const char *err; ++ int ret = -EINVAL; + unsigned nr; + unsigned i; + u64 *b; + +- journal = bch2_sb_get_journal(sb); +- if (!journal) +- return NULL; +- + nr = bch2_nr_journal_buckets(journal); + if (!nr) +- return NULL; ++ return 0; + + b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); + if (!b) +- return "cannot allocate memory"; ++ return -ENOMEM; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + +- err = "journal bucket at sector 0"; +- if (!b[0]) ++ if (!b[0]) { ++ pr_buf(err, "journal bucket at sector 0"); + goto err; ++ } + +- err = "journal bucket before first bucket"; +- if (m && b[0] < le16_to_cpu(m->first_bucket)) ++ if (b[0] < le16_to_cpu(m->first_bucket)) { ++ pr_buf(err, "journal bucket %llu before first bucket %u", ++ b[0], le16_to_cpu(m->first_bucket)); + goto err; ++ } + +- err = "journal bucket past end of device"; +- if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) ++ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { ++ pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1], le64_to_cpu(m->nbuckets)); + goto err; ++ } + +- err = "duplicate journal buckets"; + for (i = 0; i + 1 < nr; i++) +- if (b[i] == b[i + 1]) ++ if (b[i] == b[i + 1]) { ++ pr_buf(err, "duplicate journal buckets %llu", b[i]); + goto err; ++ } + +- err = NULL; ++ ret = 0; + err: + kfree(b); +- return err; ++ return ret; + } + + static const struct bch_sb_field_ops bch_sb_field_ops_journal = { +@@ -903,39 +980,54 @@ static const struct bch_sb_field_ops bch_sb_field_ops_journal = { + + /* BCH_SB_FIELD_members: */ + +-static const char *bch2_sb_validate_members(struct bch_sb *sb, +- struct bch_sb_field *f) ++static int bch2_sb_validate_members(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) + { + struct bch_sb_field_members *mi = field_to_type(f, members); +- struct bch_member *m; ++ unsigned i; + + if ((void *) (mi->members + sb->nr_devices) > +- vstruct_end(&mi->field)) +- return "Invalid superblock: bad member info"; ++ vstruct_end(&mi->field)) { ++ pr_buf(err, "too many devices for section size"); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; + +- for (m = mi->members; +- m < mi->members + sb->nr_devices; +- m++) { + if (!bch2_member_exists(m)) + continue; + +- if (le64_to_cpu(m->nbuckets) > LONG_MAX) +- return "Too many buckets"; ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) { ++ pr_buf(err, "device %u: too many buckets (got %llu, max %lu)", ++ i, le64_to_cpu(m->nbuckets), LONG_MAX); ++ return -EINVAL; ++ } + + if (le64_to_cpu(m->nbuckets) - +- le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) +- return "Not enough buckets"; ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { ++ pr_buf(err, "device %u: not enough buckets (got %llu, max %u)", ++ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); ++ return -EINVAL; ++ } + + if (le16_to_cpu(m->bucket_size) < +- le16_to_cpu(sb->block_size)) +- return "bucket size smaller than block size"; ++ le16_to_cpu(sb->block_size)) { ++ pr_buf(err, "device %u: bucket size %u smaller than block size %u", ++ i, 
le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); ++ return -EINVAL; ++ } + + if (le16_to_cpu(m->bucket_size) < +- BCH_SB_BTREE_NODE_SIZE(sb)) +- return "bucket size smaller than btree node size"; ++ BCH_SB_BTREE_NODE_SIZE(sb)) { ++ pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu", ++ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); ++ return -EINVAL; ++ } + } + +- return NULL; ++ return 0; + } + + static const struct bch_sb_field_ops bch_sb_field_ops_members = { +@@ -944,18 +1036,24 @@ static const struct bch_sb_field_ops bch_sb_field_ops_members = { + + /* BCH_SB_FIELD_crypt: */ + +-static const char *bch2_sb_validate_crypt(struct bch_sb *sb, +- struct bch_sb_field *f) ++static int bch2_sb_validate_crypt(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) + { + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + +- if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) +- return "invalid field crypt: wrong size"; ++ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { ++ pr_buf(err, "wrong size (got %llu should be %zu)", ++ vstruct_bytes(&crypt->field), sizeof(*crypt)); ++ return -EINVAL; ++ } + +- if (BCH_CRYPT_KDF_TYPE(crypt)) +- return "invalid field crypt: bad kdf type"; ++ if (BCH_CRYPT_KDF_TYPE(crypt)) { ++ pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { +@@ -1164,15 +1262,19 @@ out: + mutex_unlock(&c->sb_lock); + } + +-static const char *bch2_sb_validate_clean(struct bch_sb *sb, +- struct bch_sb_field *f) ++static int bch2_sb_validate_clean(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) + { + struct bch_sb_field_clean *clean = field_to_type(f, clean); + +- if (vstruct_bytes(&clean->field) < sizeof(*clean)) +- return "invalid field crypt: wrong size"; ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) { ++ pr_buf(err, "wrong size (got %llu should be %zu)", ++ vstruct_bytes(&clean->field), sizeof(*clean)); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + static const struct bch_sb_field_ops bch_sb_field_ops_clean = { +@@ -1186,14 +1288,26 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { + #undef x + }; + +-static const char *bch2_sb_field_validate(struct bch_sb *sb, +- struct bch_sb_field *f) ++static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *orig_err) + { + unsigned type = le32_to_cpu(f->type); ++ struct printbuf err = *orig_err; ++ int ret; + +- return type < BCH_SB_FIELD_NR +- ? 
bch2_sb_field_ops[type]->validate(sb, f) +- : NULL; ++ if (type >= BCH_SB_FIELD_NR) ++ return 0; ++ ++ pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]); ++ ++ ret = bch2_sb_field_ops[type]->validate(sb, f, &err); ++ if (ret) { ++ pr_buf(&err, "\n"); ++ bch2_sb_field_to_text(&err, sb, f); ++ *orig_err = err; ++ } ++ ++ return ret; + } + + void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +index 5c264875acb4..3b425bed17c4 100644 +--- a/fs/bcachefs/super-io.h ++++ b/fs/bcachefs/super-io.h +@@ -38,9 +38,8 @@ BCH_SB_FIELDS() + extern const char * const bch2_sb_fields[]; + + struct bch_sb_field_ops { +- const char * (*validate)(struct bch_sb *, struct bch_sb_field *); +- void (*to_text)(struct printbuf *, struct bch_sb *, +- struct bch_sb_field *); ++ int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); ++ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); + }; + + static inline __le64 bch2_sb_magic(struct bch_fs *c) +@@ -66,8 +65,6 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); + void bch2_free_super(struct bch_sb_handle *); + int bch2_sb_realloc(struct bch_sb_handle *, unsigned); + +-const char *bch2_sb_validate(struct bch_sb_handle *); +- + int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); + int bch2_write_super(struct bch_fs *); + void __bch2_check_set_feature(struct bch_fs *, unsigned); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index cda334319bda..fee4e69a5a98 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1598,18 +1598,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + struct bch_sb_field_members *mi; + struct bch_member dev_mi; + unsigned dev_idx, nr_devices, u64s; ++ char *_errbuf; ++ struct printbuf errbuf; + int ret; + ++ _errbuf = kmalloc(4096, GFP_KERNEL); ++ if (!_errbuf) ++ return -ENOMEM; ++ ++ errbuf = _PBUF(_errbuf, 4096); ++ + ret = bch2_read_super(path, &opts, &sb); + if (ret) { + bch_err(c, "device add error: error reading super: %i", ret); +- return ret; +- } +- +- err = bch2_sb_validate(&sb); +- if (err) { +- bch_err(c, "device add error: error validating super: %s", err); +- return -EINVAL; ++ goto err; + } + + dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; +@@ -1617,19 +1619,21 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + err = bch2_dev_may_add(sb.sb, c); + if (err) { + bch_err(c, "device add error: %s", err); +- return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + + ca = __bch2_dev_alloc(c, &dev_mi); + if (!ca) { + bch2_free_super(&sb); +- return -ENOMEM; ++ ret = -ENOMEM; ++ goto err; + } + + ret = __bch2_dev_attach_bdev(ca, &sb); + if (ret) { + bch2_dev_free(ca); +- return ret; ++ goto err; + } + + ret = bch2_dev_journal_alloc(ca); +@@ -1721,10 +1725,12 @@ err: + if (ca) + bch2_dev_free(ca); + bch2_free_super(&sb); ++ kfree(_errbuf); + return ret; + err_late: + up_write(&c->state_lock); +- return -EINVAL; ++ ca = NULL; ++ goto err; + } + + /* Hot add existing device to running filesystem: */ +@@ -1890,20 +1896,28 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + struct bch_sb_field_members *mi; + unsigned i, best_sb = 0; + const char *err; ++ char *_errbuf = NULL; ++ struct printbuf errbuf; + int ret = 0; + ++ if (!try_module_get(THIS_MODULE)) ++ return ERR_PTR(-ENODEV); ++ + pr_verbose_init(opts, ""); + + if (!nr_devices) { +- c = ERR_PTR(-EINVAL); +- goto out2; ++ ret = -EINVAL; ++ 
goto err; + } + +- if (!try_module_get(THIS_MODULE)) { +- c = ERR_PTR(-ENODEV); +- goto out2; ++ _errbuf = kmalloc(4096, GFP_KERNEL); ++ if (!_errbuf) { ++ ret = -ENOMEM; ++ goto err; + } + ++ errbuf = _PBUF(_errbuf, 4096); ++ + sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); + if (!sb) { + ret = -ENOMEM; +@@ -1915,9 +1929,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + if (ret) + goto err; + +- err = bch2_sb_validate(&sb[i]); +- if (err) +- goto err_print; + } + + for (i = 1; i < nr_devices; i++) +@@ -1972,8 +1983,8 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + } + out: + kfree(sb); ++ kfree(_errbuf); + module_put(THIS_MODULE); +-out2: + pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); + return c; + err_print: +@@ -1990,81 +2001,6 @@ err: + goto out; + } + +-static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, +- struct bch_opts opts) +-{ +- const char *err; +- struct bch_fs *c; +- bool allocated_fs = false; +- int ret; +- +- err = bch2_sb_validate(sb); +- if (err) +- return err; +- +- mutex_lock(&bch_fs_list_lock); +- c = __bch2_uuid_to_fs(sb->sb->uuid); +- if (c) { +- closure_get(&c->cl); +- +- err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); +- if (err) +- goto err; +- } else { +- allocated_fs = true; +- c = bch2_fs_alloc(sb->sb, opts); +- +- err = "bch2_fs_alloc() error"; +- if (IS_ERR(c)) +- goto err; +- } +- +- err = "bch2_dev_online() error"; +- +- mutex_lock(&c->sb_lock); +- if (bch2_dev_attach_bdev(c, sb)) { +- mutex_unlock(&c->sb_lock); +- goto err; +- } +- mutex_unlock(&c->sb_lock); +- +- if (!c->opts.nostart && bch2_fs_may_start(c)) { +- err = "error starting filesystem"; +- ret = bch2_fs_start(c); +- if (ret) +- goto err; +- } +- +- closure_put(&c->cl); +- mutex_unlock(&bch_fs_list_lock); +- +- return NULL; +-err: +- mutex_unlock(&bch_fs_list_lock); +- +- if (allocated_fs && !IS_ERR(c)) +- bch2_fs_stop(c); +- else if (c) +- closure_put(&c->cl); +- +- return err; +-} +- +-const char *bch2_fs_open_incremental(const char *path) +-{ +- struct bch_sb_handle sb; +- struct bch_opts opts = bch2_opts_empty(); +- const char *err; +- +- if (bch2_read_super(path, &opts, &sb)) +- return "error reading superblock"; +- +- err = __bch2_fs_open_incremental(&sb, opts); +- bch2_free_super(&sb); +- +- return err; +-} +- + /* Global interfaces/init */ + + static void bcachefs_exit(void) +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +index c3273e9c711d..3f24ca5a853d 100644 +--- a/fs/bcachefs/super.h ++++ b/fs/bcachefs/super.h +@@ -254,6 +254,5 @@ void bch2_fs_stop(struct bch_fs *); + + int bch2_fs_start(struct bch_fs *); + struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); +-const char *bch2_fs_open_incremental(const char *path); + + #endif /* _BCACHEFS_SUPER_H */ +-- +cgit v1.2.3 + + +From a629647a7644987fd25c70ce280c948fc55c5ae0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Jan 2022 00:06:49 -0500 +Subject: bcachefs: Add verbose log messages for journal read + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 4 ++++ + fs/bcachefs/journal_io.c | 2 ++ + fs/bcachefs/recovery.c | 1 + + 3 files changed, 7 insertions(+) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 38bdbbfc8622..58d5637c68d7 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -177,7 +177,11 @@ + */ + + #undef pr_fmt ++#ifdef __KERNEL__ + #define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ ++#else ++#define pr_fmt(fmt) "%s() " fmt "\n", __func__ 
++#endif + + #include + #include +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index faf82bb4daf2..9794ac6f0436 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -893,6 +893,7 @@ static void bch2_journal_read_device(struct closure *cl) + struct journal_device *ja = + container_of(cl, struct journal_device, read); + struct bch_dev *ca = container_of(ja, struct bch_dev, journal); ++ struct bch_fs *c = ca->fs; + struct journal_list *jlist = + container_of(cl->parent, struct journal_list, cl); + struct journal_read_buf buf = { NULL, 0 }; +@@ -944,6 +945,7 @@ static void bch2_journal_read_device(struct closure *cl) + ja->discard_idx = ja->dirty_idx_ondisk = + ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; + out: ++ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); + kvpfree(buf.data, buf.size); + percpu_ref_put(&ca->io_ref); + closure_return(cl); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index e4ba3f0aef4a..d11457c229ac 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1008,6 +1008,7 @@ int bch2_fs_recovery(struct bch_fs *c) + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + struct journal_replay *i; + ++ bch_verbose(c, "starting journal read"); + ret = bch2_journal_read(c, &c->journal_entries, + &blacklist_seq, &journal_seq); + if (ret) +-- +cgit v1.2.3 + + +From 4a11d15710a23537f85bc70ed2366c0ca9c63109 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Jan 2022 00:07:23 -0500 +Subject: bcachefs: Fix bch2_journal_seq_blacklist_add() + +The old code correctly handled the case where we were blacklisting a +range that exactly matched an existing entry, but not the case where the +new range partially overlaps an existing entry. 
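For illustration only, here is a minimal userspace sketch of the overlap-then-widen logic the new code implements, using plain uint64_t instead of the on-disk __le64 fields and made-up helper names (the real add path additionally calls blacklist_entry_try_merge() on the neighbouring entries afterwards):

#include <stdbool.h>
#include <stdint.h>

struct bl_entry { uint64_t start, end; };

/* Mergeable when the two ranges touch or overlap, i.e. are not strictly disjoint: */
static bool bl_contig_or_overlaps(const struct bl_entry *e,
				  uint64_t start, uint64_t end)
{
	return !(end < e->start || e->end < start);
}

/* Widen an existing entry so it also covers the new range: */
static void bl_widen(struct bl_entry *e, uint64_t start, uint64_t end)
{
	if (start < e->start)
		e->start = start;
	if (end > e->end)
		e->end = end;
}

With an existing entry covering [10, 20], adding [15, 30] now widens it to [10, 30]; previously only an exact match or a new range fully containing the entry was recognised, so a partially overlapping range was not merged into the existing entry.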
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_seq_blacklist.c | 43 ++++++++++++++++++------------------- + 1 file changed, 21 insertions(+), 22 deletions(-) + +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index 428377e73a8d..e10b2c7c7bae 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -66,6 +66,12 @@ blacklist_entry_try_merge(struct bch_fs *c, + return bl; + } + ++static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, ++ u64 start, u64 end) ++{ ++ return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); ++} ++ + int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) + { + struct bch_sb_field_journal_seq_blacklist *bl; +@@ -76,28 +82,21 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + nr = blacklist_nr_entries(bl); + +- if (bl) { +- for (i = 0; i < nr; i++) { +- struct journal_seq_blacklist_entry *e = +- bl->start + i; +- +- if (start == le64_to_cpu(e->start) && +- end == le64_to_cpu(e->end)) +- goto out; +- +- if (start <= le64_to_cpu(e->start) && +- end >= le64_to_cpu(e->end)) { +- e->start = cpu_to_le64(start); +- e->end = cpu_to_le64(end); +- +- if (i + 1 < nr) +- bl = blacklist_entry_try_merge(c, +- bl, i); +- if (i) +- bl = blacklist_entry_try_merge(c, +- bl, i - 1); +- goto out_write_sb; +- } ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = ++ bl->start + i; ++ ++ if (bl_entry_contig_or_overlaps(e, start, end)) { ++ e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); ++ e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); ++ ++ if (i + 1 < nr) ++ bl = blacklist_entry_try_merge(c, ++ bl, i); ++ if (i) ++ bl = blacklist_entry_try_merge(c, ++ bl, i - 1); ++ goto out_write_sb; + } + } + +-- +cgit v1.2.3 + + +From 9291b682f83517514cbdfa4af4f9f79ba8ebd6e4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Jan 2022 00:33:52 -0500 +Subject: bcachefs: Switch to __func__for recording where btree_trans was + initialized + +Symbol decoding, via %ps, isn't supported in userspace - this will also +be faster when we're using trans->fn in the fast path, as with the new +BCH_JSET_ENTRY_log journal messages. 
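As a standalone sketch of the pattern (hypothetical names, not the bcachefs API): a variadic wrapper macro appends __func__ at each call site, so the transaction carries a plain string that prints with %s in both kernel and userspace builds, instead of a return address that needs %ps/%pS symbol decoding. This mirrors the bch2_trans_init() wrapper added to btree_iter.h below.

#include <stdio.h>

struct trans {
	const char *fn;		/* name of the function that initialized us */
};

static void __trans_init(struct trans *t, unsigned flags, const char *fn)
{
	(void)flags;		/* stand-in for the real setup arguments */
	t->fn = fn;
}

/* Call sites just write trans_init(&t, flags); the macro supplies __func__. */
#define trans_init(...) __trans_init(__VA_ARGS__, __func__)

static void do_work(void)
{
	struct trans t;

	trans_init(&t, 0);
	printf("btree_trans initialized by %s\n", t.fn);	/* prints "do_work" */
}

int main(void)
{
	do_work();
	return 0;
}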
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 2 +- + fs/bcachefs/btree_iter.c | 21 +++--- + fs/bcachefs/btree_iter.h | 5 +- + fs/bcachefs/btree_key_cache.c | 6 +- + fs/bcachefs/btree_types.h | 2 +- + fs/bcachefs/btree_update_interior.c | 4 +- + fs/bcachefs/btree_update_leaf.c | 30 ++++---- + fs/bcachefs/fs.c | 1 - + include/trace/events/bcachefs.h | 138 ++++++++++++++++-------------------- + 9 files changed, 100 insertions(+), 109 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 2788ba17e031..fc6c4d4cd02f 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -858,7 +858,7 @@ lock_node: + if (bch2_btree_node_relock(trans, path, level + 1)) + goto retry; + +- trace_trans_restart_btree_node_reused(trans->ip, ++ trace_trans_restart_btree_node_reused(trans->fn, + trace_ip, + path->btree_id, + &path->pos); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index d549c466362b..7df9e4744f64 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -363,7 +363,7 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + } + + if (unlikely(deadlock_path)) { +- trace_trans_restart_would_deadlock(trans->ip, ip, ++ trace_trans_restart_would_deadlock(trans->fn, ip, + trans->in_traverse_all, reason, + deadlock_path->btree_id, + deadlock_path->cached, +@@ -548,7 +548,7 @@ bool bch2_trans_relock(struct btree_trans *trans) + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock(trans, path, _RET_IP_)) { +- trace_trans_restart_relock(trans->ip, _RET_IP_, ++ trace_trans_restart_relock(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + BUG_ON(!trans->restarted); + return false; +@@ -1519,7 +1519,7 @@ out: + + trans->in_traverse_all = false; + +- trace_trans_traverse_all(trans->ip, trace_ip); ++ trace_trans_traverse_all(trans->fn, trace_ip); + return ret; + } + +@@ -2826,7 +2826,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) + trans->mem_bytes = new_bytes; + + if (old_bytes) { +- trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); ++ trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } +@@ -2908,14 +2908,15 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) + trans->updates = p; p += updates_bytes; + } + +-void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, +- unsigned expected_nr_iters, +- size_t expected_mem_bytes) ++void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, ++ unsigned expected_nr_iters, ++ size_t expected_mem_bytes, ++ const char *fn) + __acquires(&c->btree_trans_barrier) + { + memset(trans, 0, sizeof(*trans)); + trans->c = c; +- trans->ip = _RET_IP_; ++ trans->fn = fn; + + bch2_trans_alloc_paths(trans, c); + +@@ -2948,7 +2949,7 @@ static void check_btree_paths_leaked(struct btree_trans *trans) + goto leaked; + return; + leaked: +- bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip); ++ bch_err(c, "btree paths leaked from %s!", trans->fn); + trans_for_each_path(trans, path) + if (path->ref) + printk(KERN_ERR " btree %s %pS\n", +@@ -3041,7 +3042,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + if (!trans_has_locks(trans)) + continue; + +- pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); ++ pr_buf(out, "%i %s\n", trans->pid, trans->fn); + + trans_for_each_path(trans, path) { + if (!path->nodes_locked) +diff --git 
a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 4c903b9dd716..eceec5d55f9b 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -354,9 +354,12 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + /* new multiple iterator interface: */ + + void bch2_dump_trans_paths_updates(struct btree_trans *); +-void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); ++void __bch2_trans_init(struct btree_trans *, struct bch_fs *, ++ unsigned, size_t, const char *); + void bch2_trans_exit(struct btree_trans *); + ++#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__) ++ + void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); + + void bch2_fs_btree_iter_exit(struct bch_fs *); +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 80ed79b06f21..e8bf8d2cdabe 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -223,7 +223,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, + goto err; + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { +- trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ trace_transaction_restart_ip(trans->fn, _THIS_IP_); + ret = btree_trans_restart(trans); + goto err; + } +@@ -318,7 +318,7 @@ retry: + if (!trans->restarted) + goto retry; + +- trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ trace_transaction_restart_ip(trans->fn, _THIS_IP_); + ret = -EINTR; + goto err; + } +@@ -338,7 +338,7 @@ fill: + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + if (!path->locks_want && + !__bch2_btree_path_upgrade(trans, path, 1)) { +- trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ trace_transaction_restart_ip(trans->fn, _THIS_IP_); + ret = btree_trans_restart(trans); + goto err; + } +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 1ace76048a21..914d536cd29e 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -368,6 +368,7 @@ struct btree_trans_commit_hook { + + struct btree_trans { + struct bch_fs *c; ++ const char *fn; + struct list_head list; + struct btree *locking; + unsigned locking_path_idx; +@@ -375,7 +376,6 @@ struct btree_trans { + u8 locking_btree_id; + u8 locking_level; + pid_t pid; +- unsigned long ip; + int srcu_idx; + + u8 nr_sorted; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index e1a5e34e21c1..47568a0bc5f1 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -955,7 +955,7 @@ retry: + * instead of locking/reserving all the way to the root: + */ + if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { +- trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, ++ trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + ret = btree_trans_restart(trans); + return ERR_PTR(ret); +@@ -1019,7 +1019,7 @@ retry: + BTREE_UPDATE_JOURNAL_RES, + journal_flags); + if (ret) { +- trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); ++ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); + goto err; + } + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 1ce8ab0b51fe..ca98e6855195 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -266,7 +266,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, + return ret; + + if (!bch2_trans_relock(trans)) { +- trace_trans_restart_journal_preres_get(trans->ip, trace_ip); ++ 
trace_trans_restart_journal_preres_get(trans->fn, trace_ip); + return -EINTR; + } + +@@ -305,7 +305,8 @@ static noinline void journal_transaction_name(struct btree_trans *trans) + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; +- b = snprintf(l->d, buflen, "%ps", (void *) trans->ip); ++ b = min_t(unsigned, strlen(trans->fn), buflen); ++ memcpy(l->d, trans->fn, b); + while (b < buflen) + l->d[b++] = '\0'; + +@@ -425,7 +426,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + int ret; + + if (race_fault()) { +- trace_trans_restart_fault_inject(trans->ip, trace_ip); ++ trace_trans_restart_fault_inject(trans->fn, trace_ip); + trans->restarted = true; + return -EINTR; + } +@@ -618,7 +619,7 @@ fail: + bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); + } + +- trace_trans_restart_would_deadlock_write(trans->ip); ++ trace_trans_restart_would_deadlock_write(trans->fn); + return btree_trans_restart(trans); + } + +@@ -649,9 +650,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); +- bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", +- buf, (void *) trans->ip, +- (void *) i->ip_allocated, invalid); ++ bch_err(c, "invalid bkey %s on insert from %s -> %ps: %s\n", ++ buf, trans->fn, (void *) i->ip_allocated, invalid); + bch2_fatal_error(c); + return -EINVAL; + } +@@ -757,7 +757,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + return 0; + + if (ret == -EINTR) +- trace_trans_restart_btree_node_split(trans->ip, trace_ip, ++ trace_trans_restart_btree_node_split(trans->fn, trace_ip, + i->btree_id, &i->path->pos); + break; + case BTREE_INSERT_NEED_MARK_REPLICAS: +@@ -770,7 +770,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + if (bch2_trans_relock(trans)) + return 0; + +- trace_trans_restart_mark_replicas(trans->ip, trace_ip); ++ trace_trans_restart_mark_replicas(trans->fn, trace_ip); + ret = -EINTR; + break; + case BTREE_INSERT_NEED_JOURNAL_RES: +@@ -790,13 +790,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, + if (bch2_trans_relock(trans)) + return 0; + +- trace_trans_restart_journal_res_get(trans->ip, trace_ip); ++ trace_trans_restart_journal_res_get(trans->fn, trace_ip); + ret = -EINTR; + break; + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); + +- trace_trans_blocked_journal_reclaim(trans->ip, trace_ip); ++ trace_trans_blocked_journal_reclaim(trans->fn, trace_ip); + + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); +@@ -806,7 +806,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + if (bch2_trans_relock(trans)) + return 0; + +- trace_trans_restart_journal_reclaim(trans->ip, trace_ip); ++ trace_trans_restart_journal_reclaim(trans->fn, trace_ip); + ret = -EINTR; + break; + default: +@@ -901,7 +901,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) + } + + if (ret == -EINTR) +- trace_trans_restart_mark(trans->ip, _RET_IP_, ++ trace_trans_restart_mark(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); + if (ret) + return ret; +@@ -931,7 +931,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) + BTREE_TRIGGER_OVERWRITE|i->flags); + + if (ret == -EINTR) +- trace_trans_restart_mark(trans->ip, _RET_IP_, ++ trace_trans_restart_mark(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); + if (ret) + return ret; +@@ -998,7 +998,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + 
BUG_ON(!i->path->should_be_locked); + + if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { +- trace_trans_restart_upgrade(trans->ip, _RET_IP_, ++ trace_trans_restart_upgrade(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); + ret = btree_trans_restart(trans); + goto out; +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index ba3462e27221..9653f199dc44 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -135,7 +135,6 @@ int __must_check bch2_write_inode(struct bch_fs *c, + int ret; + + bch2_trans_init(&trans, c, 0, 512); +- trans.ip = _RET_IP_; + retry: + bch2_trans_begin(&trans); + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index e4e0780bf4e5..295dcd60e704 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -546,94 +546,81 @@ TRACE_EVENT(copygc_wait, + __entry->wait_amount, __entry->until) + ); + +-TRACE_EVENT(transaction_restart_ip, +- TP_PROTO(unsigned long caller, unsigned long ip), +- TP_ARGS(caller, ip), +- +- TP_STRUCT__entry( +- __field(unsigned long, caller ) +- __field(unsigned long, ip ) +- ), +- +- TP_fast_assign( +- __entry->caller = caller; +- __entry->ip = ip; +- ), +- +- TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip) +-); +- + DECLARE_EVENT_CLASS(transaction_restart, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), +- TP_ARGS(trans_ip, caller_ip), ++ TP_ARGS(trans_fn, caller_ip), + + TP_STRUCT__entry( +- __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( +- __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + +- TP_printk("%ps %pS", +- (void *) __entry->trans_ip, +- (void *) __entry->caller_ip) ++ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) ++); ++ ++DEFINE_EVENT(transaction_restart, transaction_restart_ip, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) + ); + + DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), +- TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) + ); + + DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), +- TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) + ); + + DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), +- TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) + ); + + DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), +- TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) + ); + + DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), +- TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) + ); + + DEFINE_EVENT(transaction_restart, trans_traverse_all, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), +- TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) + ); + + 
DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), +- TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) + ); + + DECLARE_EVENT_CLASS(transaction_restart_iter, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos), + + TP_STRUCT__entry( +- __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, pos_inode ) +@@ -642,7 +629,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, + ), + + TP_fast_assign( +- __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; +@@ -650,8 +637,8 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, + __entry->pos_snapshot = pos->snapshot; + ), + +- TP_printk("%ps %pS btree %u pos %llu:%llu:%u", +- (void *) __entry->trans_ip, ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u", ++ __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->btree_id, + __entry->pos_inode, +@@ -660,63 +647,63 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, + ); + + DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) + ); + + DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) + ); + + DEFINE_EVENT(transaction_restart_iter, trans_restart_mark, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) + ); + + DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) + ); + + DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) + ); + + DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) + ); + + DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), +- TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ 
TP_ARGS(trans_fn, caller_ip, btree_id, pos) + ); + + TRACE_EVENT(trans_restart_would_deadlock, +- TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + bool in_traverse_all, + unsigned reason, +@@ -726,12 +713,12 @@ TRACE_EVENT(trans_restart_would_deadlock, + enum btree_id want_btree_id, + unsigned want_iter_type, + struct bpos *want_pos), +- TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason, ++ TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason, + have_btree_id, have_iter_type, have_pos, + want_btree_id, want_iter_type, want_pos), + + TP_STRUCT__entry( +- __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(u8, in_traverse_all ) + __field(u8, reason ) +@@ -749,7 +736,7 @@ TRACE_EVENT(trans_restart_would_deadlock, + ), + + TP_fast_assign( +- __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->in_traverse_all = in_traverse_all; + __entry->reason = reason; +@@ -767,8 +754,8 @@ TRACE_EVENT(trans_restart_would_deadlock, + __entry->want_pos_snapshot = want_pos->snapshot; + ), + +- TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", +- (void *) __entry->trans_ip, ++ TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", ++ __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->in_traverse_all, + __entry->reason, +@@ -785,39 +772,40 @@ TRACE_EVENT(trans_restart_would_deadlock, + ); + + TRACE_EVENT(trans_restart_would_deadlock_write, +- TP_PROTO(unsigned long trans_ip), +- TP_ARGS(trans_ip), ++ TP_PROTO(const char *trans_fn), ++ TP_ARGS(trans_fn), + + TP_STRUCT__entry( +- __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + ), + + TP_fast_assign( +- __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + ), + +- TP_printk("%ps", (void *) __entry->trans_ip) ++ TP_printk("%s", __entry->trans_fn) + ); + + TRACE_EVENT(trans_restart_mem_realloced, +- TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, + unsigned long bytes), +- TP_ARGS(trans_ip, caller_ip, bytes), ++ TP_ARGS(trans_fn, caller_ip, bytes), + + TP_STRUCT__entry( +- __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(unsigned long, bytes ) + ), + + TP_fast_assign( +- __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; + ), + +- TP_printk("%ps %pS bytes %lu", +- (void *) __entry->trans_ip, ++ TP_printk("%s %pS bytes %lu", ++ __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->bytes) + ); +-- +cgit v1.2.3 + + +From 23085986a49faed2fde6fc49ed05248136929352 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Jan 2022 18:24:55 -0500 +Subject: bcachefs: BTREE_ITER_FILTER_SNAPSHOTS is selected automatically + +It doesn't have to be specified - this patch deletes the two instances +where it was. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 2 +- + fs/bcachefs/io.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 57619d09b00a..e257f15f067d 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1033,7 +1033,7 @@ retry: + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), +- BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); ++ BTREE_ITER_SLOTS); + while (1) { + struct bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index cab65e44efa9..f0e93de4680d 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -2240,7 +2240,7 @@ retry: + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), +- BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); ++ BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; +-- +cgit v1.2.3 + + +From 200faa972760c3abe7cdb0a05882c44f79670b34 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Jan 2022 19:05:08 -0500 +Subject: bcachefs: Log & error message improvements + + - Add a shim uuid_unparse_lower() in the kernel, since %pU doesn't work + in userspace + + - We don't need to print the bcachefs: or the filesystem name prefix in + userspace + + - Improve a few error messages + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 4 ++-- + fs/bcachefs/btree_io.c | 12 ++++++++---- + fs/bcachefs/btree_key_cache.c | 3 +-- + fs/bcachefs/checksum.c | 25 +++++++++++++------------ + fs/bcachefs/recovery.c | 9 +++------ + fs/bcachefs/super.c | 2 +- + fs/bcachefs/util.h | 9 +++++++++ + 7 files changed, 37 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 58d5637c68d7..8dddb365a676 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -223,8 +223,8 @@ + #define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) + #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) + #else +-#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name) +-#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum) ++#define bch2_fmt(_c, fmt) fmt "\n" ++#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) + #endif + + #define bch_info(c, fmt, ...) 
\ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 1455dc787190..a3651325a022 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -972,19 +972,23 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + +- b->written += sectors; +- + blacklisted = bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(i->journal_seq), + true); + + btree_err_on(blacklisted && first, + BTREE_ERR_FIXABLE, c, ca, b, i, +- "first btree node bset has blacklisted journal seq"); ++ "first btree node bset has blacklisted journal seq (%llu)", ++ le64_to_cpu(i->journal_seq)); + + btree_err_on(blacklisted && ptr_written, + BTREE_ERR_FIXABLE, c, ca, b, i, +- "found blacklisted bset in btree node with sectors_written"); ++ "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", ++ le64_to_cpu(i->journal_seq), ++ b->written, b->written + sectors, ptr_written); ++ ++ b->written += sectors; ++ + if (blacklisted && !first) + continue; + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index e8bf8d2cdabe..1d7b101224f1 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -208,7 +208,6 @@ static int btree_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + struct bkey_cached *ck) + { +- struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + unsigned new_u64s = 0; +@@ -238,7 +237,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, + new_u64s = roundup_pow_of_two(new_u64s); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { +- bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[ck->key.btree_id], new_u64s); + ret = -ENOMEM; + goto err; +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +index fbe8603cfb30..a1d89923d361 100644 +--- a/fs/bcachefs/checksum.c ++++ b/fs/bcachefs/checksum.c +@@ -407,16 +407,12 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, + } + + #ifdef __KERNEL__ +-int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++static int __bch2_request_key(char *key_description, struct bch_key *key) + { +- char key_description[60]; + struct key *keyring_key; + const struct user_key_payload *ukp; + int ret; + +- snprintf(key_description, sizeof(key_description), +- "bcachefs:%pUb", &sb->user_uuid); +- + keyring_key = request_key(&key_type_logon, key_description, NULL); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); +@@ -436,16 +432,10 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) + } + #else + #include +-#include + +-int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++static int __bch2_request_key(char *key_description, struct bch_key *key) + { + key_serial_t key_id; +- char key_description[60]; +- char uuid[40]; +- +- uuid_unparse_lower(sb->user_uuid.b, uuid); +- sprintf(key_description, "bcachefs:%s", uuid); + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_KEYRING); +@@ -459,6 +449,17 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) + } + #endif + ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ char key_description[60]; ++ char uuid[40]; ++ ++ uuid_unparse_lower(sb->user_uuid.b, uuid); ++ sprintf(key_description, "bcachefs:%s", uuid); ++ ++ return __bch2_request_key(key_description, key); ++} ++ + int 
bch2_decrypt_sb_key(struct bch_fs *c, + struct bch_sb_field_crypt *crypt, + struct bch_key *key) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index d11457c229ac..7003cf77fdcd 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -530,10 +530,8 @@ static int bch2_journal_replay(struct bch_fs *c) + sizeof(keys_sorted[0]), + journal_sort_seq_cmp, NULL); + +- if (keys->nr) { +- bch_verbose(c, "starting journal replay, %zu keys", keys->nr); ++ if (keys->nr) + replay_now_at(j, keys->journal_seq_base); +- } + + for (i = 0; i < keys->nr; i++) { + k = keys_sorted[i]; +@@ -901,7 +899,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c) + + static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) + { +- struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; +@@ -915,7 +912,7 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) + goto err; + + if (!bkey_is_inode(k.k)) { +- bch_err(c, "root inode not found"); ++ bch_err(trans->c, "root inode not found"); + ret = -ENOENT; + goto err; + } +@@ -1138,7 +1135,7 @@ use_clean: + if (c->opts.norecovery) + goto out; + +- bch_verbose(c, "starting journal replay"); ++ bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; + ret = bch2_journal_replay(c); + if (ret) +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index fee4e69a5a98..44547b917fc4 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -737,7 +737,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + if (ret) + goto err; + +- scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); ++ uuid_unparse_lower(c->sb.user_uuid.b, c->name); + + /* Compat: */ + if (sb->version <= bcachefs_metadata_version_inode_v2 && +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 80402b398442..3196bc303182 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -746,4 +746,13 @@ static inline int u8_cmp(u8 l, u8 r) + return cmp_int(l, r); + } + ++#ifdef __KERNEL__ ++static inline void uuid_unparse_lower(u8 *uuid, char *out) ++{ ++ sprintf(out, "%plU", uuid); ++} ++#else ++#include ++#endif ++ + #endif /* _BCACHEFS_UTIL_H */ +-- +cgit v1.2.3 + + +From cff79470a689c753430da52a4dbd9194f3b544d6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Jan 2022 19:41:23 -0500 +Subject: Revert "bcachefs: Delete some obsolete journal_seq_blacklist code" + +This reverts commit f95b61228efd04c9c158123da5827c96e9773b29. + +It turns out, we're seeing filesystems in the wild end up with +blacklisted btree node bsets - this should not be happening, and until +we understand why and fix it we need to keep this code around. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/journal_seq_blacklist.c | 78 +++++++++++++++++++++++++++++++++++++ + fs/bcachefs/journal_seq_blacklist.h | 2 + + fs/bcachefs/recovery.c | 22 +++++++---- + fs/bcachefs/super.c | 5 +++ + 5 files changed, 100 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 8dddb365a676..c64db2bfd2a5 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -755,6 +755,7 @@ struct bch_fs { + /* JOURNAL SEQ BLACKLIST */ + struct journal_seq_blacklist_table * + journal_seq_blacklist_table; ++ struct work_struct journal_seq_blacklist_gc_work; + + /* ALLOCATOR */ + spinlock_t freelist_lock; +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index e10b2c7c7bae..3cc63fc202ab 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -241,3 +241,81 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { + .validate = bch2_sb_journal_seq_blacklist_validate, + .to_text = bch2_sb_journal_seq_blacklist_to_text + }; ++ ++void bch2_blacklist_entries_gc(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ journal_seq_blacklist_gc_work); ++ struct journal_seq_blacklist_table *t; ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ struct journal_seq_blacklist_entry *src, *dst; ++ struct btree_trans trans; ++ unsigned i, nr, new_nr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_iter iter; ++ struct btree *b; ++ ++ bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, ++ 0, 0, BTREE_ITER_PREFETCH); ++retry: ++ bch2_trans_begin(&trans); ++ ++ b = bch2_btree_iter_peek_node(&iter); ++ ++ while (!(ret = PTR_ERR_OR_ZERO(b)) && ++ b && ++ !test_bit(BCH_FS_STOPPING, &c->flags)) ++ b = bch2_btree_iter_next_node(&iter); ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ } ++ ++ bch2_trans_exit(&trans); ++ if (ret) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ if (!bl) ++ goto out; ++ ++ nr = blacklist_nr_entries(bl); ++ dst = bl->start; ++ ++ t = c->journal_seq_blacklist_table; ++ BUG_ON(nr != t->nr); ++ ++ for (src = bl->start, i = eytzinger0_first(t->nr); ++ src < bl->start + nr; ++ src++, i = eytzinger0_next(i, nr)) { ++ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); ++ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); ++ ++ if (t->entries[i].dirty) ++ *dst++ = *src; ++ } ++ ++ new_nr = dst - bl->start; ++ ++ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); ++ ++ if (new_nr != nr) { ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ new_nr ? 
sb_blacklist_u64s(new_nr) : 0); ++ BUG_ON(new_nr && !bl); ++ ++ if (!new_nr) ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); ++ ++ bch2_write_super(c); ++ } ++out: ++ mutex_unlock(&c->sb_lock); ++} +diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h +index b4f876a04586..afb886ec8e25 100644 +--- a/fs/bcachefs/journal_seq_blacklist.h ++++ b/fs/bcachefs/journal_seq_blacklist.h +@@ -17,4 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *); + + extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; + ++void bch2_blacklist_entries_gc(struct work_struct *); ++ + #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 7003cf77fdcd..b818093eab39 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1065,6 +1065,16 @@ use_clean: + if (ret) + goto err; + ++ /* ++ * After an unclean shutdown, skip then next few journal sequence ++ * numbers as they may have been referenced by btree writes that ++ * happened before their corresponding journal writes - those btree ++ * writes need to be ignored, by skipping and blacklisting the next few ++ * journal sequence numbers: ++ */ ++ if (!c->sb.clean) ++ journal_seq += 8; ++ + if (blacklist_seq != journal_seq) { + ret = bch2_journal_seq_blacklist_add(c, + blacklist_seq, journal_seq); +@@ -1210,14 +1220,6 @@ use_clean: + } + + mutex_lock(&c->sb_lock); +- /* +- * With journal replay done, we can clear the journal seq blacklist +- * table: +- */ +- BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); +- if (le16_to_cpu(c->sb.version_min) >= bcachefs_metadata_version_btree_ptr_sectors_written) +- bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0); +- + if (c->opts.version_upgrade) { + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); +@@ -1259,6 +1261,10 @@ use_clean: + bch_info(c, "scanning for old btree nodes done"); + } + ++ if (c->journal_seq_blacklist_table && ++ c->journal_seq_blacklist_table->nr > 128) ++ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); ++ + ret = 0; + out: + set_bit(BCH_FS_FSCK_DONE, &c->flags); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 44547b917fc4..577b58e43b05 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -528,6 +528,8 @@ void __bch2_fs_stop(struct bch_fs *c) + + set_bit(BCH_FS_STOPPING, &c->flags); + ++ cancel_work_sync(&c->journal_seq_blacklist_gc_work); ++ + down_write(&c->state_lock); + bch2_fs_read_only(c); + up_write(&c->state_lock); +@@ -690,6 +692,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + spin_lock_init(&c->btree_write_error_lock); + ++ INIT_WORK(&c->journal_seq_blacklist_gc_work, ++ bch2_blacklist_entries_gc); ++ + INIT_LIST_HEAD(&c->journal_entries); + INIT_LIST_HEAD(&c->journal_iters); + +-- +cgit v1.2.3 + + +From 3f6de62b06b314bde5723bca2410abdfe5a75c1d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Jan 2022 19:45:39 -0500 +Subject: bcachefs: Fix an uninitialized variable + +Only userspace builds were complaining about it, oddly enough. 
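The mechanism, shown as a minimal sketch with made-up names rather than the real function: the verbose log added at the out: label earlier in this series reads ret, but the pre-existing early goto can reach that label before ret is ever assigned, so the declaration needs an initializer. How loudly this gets diagnosed depends on the compiler and warning flags in use, which would explain why only the userspace build noticed.

#include <stdio.h>

static int read_device(unsigned nr)
{
	int ret = 0;	/* without "= 0", the goto path below reads an indeterminate value */

	if (!nr)
		goto out;

	ret = 42;	/* stands in for the actual journal read work */
out:
	printf("journal read done, ret %i\n", ret);
	return ret;
}

int main(void)
{
	read_device(0);
	return 0;
}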
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 9794ac6f0436..df4d1a7ad533 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -899,7 +899,7 @@ static void bch2_journal_read_device(struct closure *cl) + struct journal_read_buf buf = { NULL, 0 }; + u64 min_seq = U64_MAX; + unsigned i; +- int ret; ++ int ret = 0; + + if (!ja->nr) + goto out; +-- +cgit v1.2.3 + + +From f15feeb2796dce12a383576a9a940bb8f80c173a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Jan 2022 18:35:00 -0500 +Subject: bcachefs: Fix bch2_check_fix_ptrs() + +The repair for for btree_ptrs was saying one thing and doing another - +fortunately, that code can just be deleted. + +Also, when we update a btree node pointer, we also have to update node +in memery, if it exists in the btree node cache - this fixes +bch2_check_fix_ptrs() to do that. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 62 ++++++++++++++++++++++++++++++++------------------ + 1 file changed, 40 insertions(+), 22 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index db453aa61d25..a201052e8259 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -155,6 +155,34 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) + } + } + ++static void bch2_btree_node_update_key_early(struct bch_fs *c, ++ enum btree_id btree, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new) ++{ ++ struct btree *b; ++ struct bkey_buf tmp; ++ int ret; ++ ++ bch2_bkey_buf_init(&tmp); ++ bch2_bkey_buf_reassemble(&tmp, c, old); ++ ++ b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); ++ if (!IS_ERR_OR_NULL(b)) { ++ mutex_lock(&c->btree_cache.lock); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ six_unlock_read(&b->c.lock); ++ } ++ ++ bch2_bkey_buf_exit(&tmp, c); ++} ++ + static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) + { + struct bkey_i_btree_ptr_v2 *new; +@@ -524,19 +552,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + } + } + +- if (fsck_err_on(data_type == BCH_DATA_btree && +- g->mark.gen != p.ptr.gen, c, +- "bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n" +- "while marking %s", +- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), +- bch2_data_types[ptr_data_type(k->k, &p.ptr)], +- p.ptr.gen, g->mark.gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { +- g2->_mark.data_type = g->_mark.data_type = data_type; +- g2->gen_valid = g->gen_valid = true; +- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); +- } +- + if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", +@@ -576,7 +591,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + do_update = true; + +- if (p.ptr.gen != g->mark.gen) ++ if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen) + continue; + + if (fsck_err_on(g->mark.data_type && +@@ -691,16 +706,19 @@ found: + } + + ret = bch2_journal_key_insert_take(c, btree_id, level, new); +- +- if (ret) ++ if (ret) { + kfree(new); +- else { +- bch2_bkey_val_to_text(&PBUF(buf), c, *k); +- bch_info(c, 
"updated %s", buf); +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); +- bch_info(c, "new key %s", buf); +- *k = bkey_i_to_s_c(new); ++ return ret; + } ++ ++ if (level) ++ bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, *k); ++ bch_info(c, "updated %s", buf); ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); ++ bch_info(c, "new key %s", buf); ++ *k = bkey_i_to_s_c(new); + } + fsck_err: + return ret; +-- +cgit v1.2.3 + + +From 9e883cf4b8d19920d3bc4fd5a917b4ec8fcfe87e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 1 Jan 2022 23:16:15 -0500 +Subject: bcachefs: Improve path for when btree_gc needs another pass + +btree_gc sometimes needs another pass when it corrects bucket generation +numbers or data types - when it finds multiple pointers of different +data types to the same bucket, it may want to keep the second one it +found. + +When this happens, we now clear out bucket sector counts _without_ +resetting the bucket generation/data types that we already found, +instead of resetting them to what we have in the alloc btree. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 150 ++++++++++++++++++++++++++++++------------------- + 1 file changed, 92 insertions(+), 58 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index a201052e8259..809c9a762303 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -604,8 +604,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + if (data_type == BCH_DATA_btree) { + g2->_mark.data_type = g->_mark.data_type = data_type; +- g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + do_update = true; + } +@@ -1327,12 +1327,6 @@ static int bch2_gc_start(struct bch_fs *c, + + percpu_down_write(&c->mark_lock); + +- /* +- * indicate to stripe code that we need to allocate for the gc stripes +- * radix tree, too +- */ +- gc_pos_set(c, gc_phase(GC_PHASE_START)); +- + for_each_member_device(ca, c, i) { + struct bucket_array *dst = __bucket_array(ca, 1); + struct bucket_array *src = __bucket_array(ca, 0); +@@ -1360,6 +1354,27 @@ static int bch2_gc_start(struct bch_fs *c, + return 0; + } + ++static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *buckets = __bucket_array(ca, true); ++ struct bucket *g; ++ ++ for_each_bucket(g, buckets) { ++ if (metadata_only && ++ (g->mark.data_type == BCH_DATA_user || ++ g->mark.data_type == BCH_DATA_cached || ++ g->mark.data_type == BCH_DATA_parity)) ++ continue; ++ g->_mark.dirty_sectors = 0; ++ g->_mark.cached_sectors = 0; ++ } ++ }; ++} ++ + static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + bool metadata_only) + { +@@ -1430,6 +1445,55 @@ fsck_err: + return ret; + } + ++static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, ++ bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct reflink_gc *r; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ c->reflink_gc_nr = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ const __le64 *refcount = bkey_refcount_c(k); ++ ++ if (!refcount) ++ continue; ++ ++ r = 
genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, ++ GFP_KERNEL); ++ if (!r) { ++ ret = -ENOMEM; ++ break; ++ } ++ ++ r->offset = k.k->p.offset; ++ r->size = k.k->size; ++ r->refcount = 0; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial, ++ bool metadata_only) ++{ ++ struct genradix_iter iter; ++ struct reflink_gc *r; ++ ++ genradix_for_each(&c->reflink_gc_table, iter, r) ++ r->refcount = 0; ++} ++ + static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, + bool metadata_only) + { +@@ -1493,43 +1557,10 @@ fsck_err: + return ret; + } + +-static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, +- bool metadata_only) ++static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial, ++ bool metadata_only) + { +- struct btree_trans trans; +- struct btree_iter iter; +- struct bkey_s_c k; +- struct reflink_gc *r; +- int ret = 0; +- +- if (metadata_only) +- return 0; +- +- bch2_trans_init(&trans, c, 0, 0); +- c->reflink_gc_nr = 0; +- +- for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, +- BTREE_ITER_PREFETCH, k, ret) { +- const __le64 *refcount = bkey_refcount_c(k); +- +- if (!refcount) +- continue; +- +- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, +- GFP_KERNEL); +- if (!r) { +- ret = -ENOMEM; +- break; +- } +- +- r->offset = k.k->p.offset; +- r->size = k.k->size; +- r->refcount = 0; +- } +- bch2_trans_iter_exit(&trans, &iter); +- +- bch2_trans_exit(&trans); +- return ret; ++ genradix_free(&c->gc_stripes); + } + + /** +@@ -1565,11 +1596,13 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) + /* flush interior btree updates: */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); +-again: ++ + ret = bch2_gc_start(c, metadata_only) ?: + bch2_gc_reflink_start(c, initial, metadata_only); + if (ret) + goto out; ++again: ++ gc_pos_set(c, gc_phase(GC_PHASE_START)); + + bch2_mark_superblocks(c); + +@@ -1607,25 +1640,26 @@ again: + + if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + (!iter && bch2_test_restart_gc)) { ++ if (iter++ > 2) { ++ bch_info(c, "Unable to fix bucket gens, looping"); ++ ret = -EINVAL; ++ goto out; ++ } ++ + /* + * XXX: make sure gens we fixed got saved + */ +- if (iter++ <= 2) { +- bch_info(c, "Second GC pass needed, restarting:"); +- clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); +- __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); +- +- percpu_down_write(&c->mark_lock); +- bch2_gc_free(c); +- percpu_up_write(&c->mark_lock); +- /* flush fsck errors, reset counters */ +- bch2_flush_fsck_errs(c); ++ bch_info(c, "Second GC pass needed, restarting:"); ++ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + +- goto again; +- } ++ bch2_gc_stripes_reset(c, initial, metadata_only); ++ bch2_gc_alloc_reset(c, initial, metadata_only); ++ bch2_gc_reflink_reset(c, initial, metadata_only); + +- bch_info(c, "Unable to fix bucket gens, looping"); +- ret = -EINVAL; ++ /* flush fsck errors, reset counters */ ++ bch2_flush_fsck_errs(c); ++ goto again; + } + out: + if (!ret) { +-- +cgit v1.2.3 + + +From a37e40f805e5096523982fb5dad125e4134c5885 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Feb 2022 18:15:35 -0500 +Subject: bcachefs: Also print out in-memory gen on stale dirty pointer + +We're trying to track down a bug that shows itself as newly-created +extents having stale dirty pointers - possibly 
due to the in memory gen +and the btree gen being inconsistent. This patch changes the error +message to also print out the in memory bucket gen when this happens. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 26 ++++++++++++++------------ + 1 file changed, 14 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index a51453fcdd64..c243f304a5cd 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -695,49 +695,51 @@ static int check_bucket_ref(struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, +- u8 bucket_gen, u8 bucket_data_type, ++ u8 b_gen, u8 bucket_data_type, + u16 dirty_sectors, u16 cached_sectors) + { +- size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); + u16 bucket_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + char buf[200]; + +- if (gen_after(ptr->gen, bucket_gen)) { ++ if (gen_after(ptr->gen, b_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" + "while marking %s", +- ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + +- if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { ++ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", +- ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + +- if (bucket_gen != ptr->gen && !ptr->cached) { ++ if (b_gen != ptr->gen && !ptr->cached) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" ++ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" + "while marking %s", +- ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, ++ *bucket_gen(ca, bucket_nr), + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EIO; + } + +- if (bucket_gen != ptr->gen) ++ if (b_gen != ptr->gen) + return 1; + + if (bucket_data_type && ptr_data_type && +@@ -745,7 +747,7 @@ static int check_bucket_ref(struct bch_fs *c, + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", +- ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type], + bch2_data_types[ptr_data_type], + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +@@ -756,7 +758,7 @@ static int check_bucket_ref(struct bch_fs *c, + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" + "while marking %s", +- ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + bucket_sectors, sectors, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +-- +cgit v1.2.3 + + +From a1d7f2123a0d3e8c522f695307dc2860149c029f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 4 Jan 2022 
22:32:09 -0500 +Subject: bcachefs: New data structure for buckets waiting on journal commit + +Implement a hash table, using cuckoo hashing, for empty buckets that are +waiting on a journal commit before they can be reused. + +This replaces the journal_seq field of bucket_mark, and is part of +eventually getting rid of the in memory bucket array. + +We may need to make bch2_bucket_needs_journal_commit() lockless, pending +profiling and testing. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/Makefile | 1 + + fs/bcachefs/alloc_background.c | 62 ++++----- + fs/bcachefs/bcachefs.h | 4 + + fs/bcachefs/btree_update_leaf.c | 28 ++-- + fs/bcachefs/buckets.c | 80 +++--------- + fs/bcachefs/buckets.h | 8 -- + fs/bcachefs/buckets_types.h | 9 -- + fs/bcachefs/buckets_waiting_for_journal.c | 167 ++++++++++++++++++++++++ + fs/bcachefs/buckets_waiting_for_journal.h | 15 +++ + fs/bcachefs/buckets_waiting_for_journal_types.h | 23 ++++ + fs/bcachefs/journal_io.c | 4 - + fs/bcachefs/super.c | 3 + + 12 files changed, 280 insertions(+), 124 deletions(-) + create mode 100644 fs/bcachefs/buckets_waiting_for_journal.c + create mode 100644 fs/bcachefs/buckets_waiting_for_journal.h + create mode 100644 fs/bcachefs/buckets_waiting_for_journal_types.h + +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index 71cda24e6d08..cf29fdaadc5b 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -16,6 +16,7 @@ bcachefs-y := \ + btree_update_interior.o \ + btree_update_leaf.o \ + buckets.o \ ++ buckets_waiting_for_journal.o \ + chardev.o \ + checksum.o \ + clock.o \ +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 688a53b4ca58..2147b624d27a 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -9,6 +9,7 @@ + #include "btree_update_interior.h" + #include "btree_gc.h" + #include "buckets.h" ++#include "buckets_waiting_for_journal.h" + #include "clock.h" + #include "debug.h" + #include "ec.h" +@@ -561,8 +562,7 @@ static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, + * keys when there's only a small difference, so that we can + * keep sequential buckets together: + */ +- return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| +- (bucket_gc_gen(g) >> 4); ++ return bucket_gc_gen(g) >> 4; + } + } + +@@ -611,6 +611,14 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) + if (!bch2_can_invalidate_bucket(ca, b, m)) + continue; + ++ if (!m.data_type && ++ bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ last_seq_ondisk, ++ ca->dev_idx, b)) { ++ ca->buckets_waiting_on_journal++; ++ continue; ++ } ++ + if (e.nr && e.bucket + e.nr == b && e.key == key) { + e.nr++; + } else { +@@ -647,6 +655,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) + + ca->inc_gen_needs_gc = 0; + ca->inc_gen_really_needs_gc = 0; ++ ca->buckets_waiting_on_journal = 0; + + find_reclaimable_buckets_lru(c, ca); + +@@ -658,28 +667,6 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) + return nr; + } + +-/* +- * returns sequence number of most recent journal entry that updated this +- * bucket: +- */ +-static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) +-{ +- if (m.journal_seq_valid) { +- u64 journal_seq = atomic64_read(&c->journal.seq); +- u64 bucket_seq = journal_seq; +- +- bucket_seq &= ~((u64) U16_MAX); +- bucket_seq |= m.journal_seq; +- +- if (bucket_seq > journal_seq) +- bucket_seq -= 1 << 16; +- +- return 
bucket_seq; +- } else { +- return 0; +- } +-} +- + static int bucket_invalidate_btree(struct btree_trans *trans, + struct bch_dev *ca, u64 b) + { +@@ -745,9 +732,10 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + * gen in memory here, the incremented gen will be updated in the btree + * by bch2_trans_mark_pointer(): + */ +- if (!m.cached_sectors && +- !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { +- BUG_ON(m.data_type); ++ if (!m.data_type && ++ !bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ ca->dev_idx, b)) { + bucket_cmpxchg(g, m, m.gen++); + *bucket_gen(ca, b) = m.gen; + percpu_up_read(&c->mark_lock); +@@ -781,13 +769,6 @@ out: + + if (!top->nr) + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); +- +- /* +- * Make sure we flush the last journal entry that updated this +- * bucket (i.e. deleting the last reference) before writing to +- * this bucket again: +- */ +- *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); + } else { + size_t b2; + +@@ -954,8 +935,14 @@ static int bch2_allocator_thread(void *arg) + gc_count = c->gc_count; + nr = find_reclaimable_buckets(c, ca); + +- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, +- ca->inc_gen_really_needs_gc); ++ if (!nr && ca->buckets_waiting_on_journal) { ++ ret = bch2_journal_flush(&c->journal); ++ if (ret) ++ goto stop; ++ } else if (nr < (ca->mi.nbuckets >> 6) && ++ ca->buckets_waiting_on_journal >= nr / 2) { ++ bch2_journal_flush_async(&c->journal, NULL); ++ } + + if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || + ca->inc_gen_really_needs_gc) && +@@ -963,6 +950,9 @@ static int bch2_allocator_thread(void *arg) + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); + } ++ ++ trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, ++ ca->inc_gen_really_needs_gc); + } + + ret = bch2_invalidate_buckets(c, ca); +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index c64db2bfd2a5..a28ddcd5d7b7 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -355,6 +355,7 @@ enum bch_time_stats { + #include "alloc_types.h" + #include "btree_types.h" + #include "buckets_types.h" ++#include "buckets_waiting_for_journal_types.h" + #include "clock_types.h" + #include "ec_types.h" + #include "journal_types.h" +@@ -482,6 +483,7 @@ struct bch_dev { + + size_t inc_gen_needs_gc; + size_t inc_gen_really_needs_gc; ++ size_t buckets_waiting_on_journal; + + enum allocator_states allocator_state; + +@@ -777,6 +779,8 @@ struct bch_fs { + struct mutex write_points_hash_lock; + unsigned write_points_nr; + ++ struct buckets_waiting_for_journal buckets_waiting_for_journal; ++ + /* GARBAGE COLLECTION */ + struct task_struct *gc_thread; + atomic_t kick_gc; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index ca98e6855195..3e1c89f33330 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -395,10 +395,11 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + } + } + +-static noinline void bch2_trans_mark_gc(struct btree_trans *trans) ++static noinline int bch2_trans_mark_gc(struct btree_trans *trans) + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; ++ int ret = 0; + + trans_for_each_update(trans, i) { + /* +@@ -407,10 +408,15 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) + */ + BUG_ON(i->cached || i->level); + +- if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) +- bch2_mark_update(trans, i->path, 
i->k, +- i->flags|BTREE_TRIGGER_GC); ++ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { ++ ret = bch2_mark_update(trans, i->path, i->k, ++ i->flags|BTREE_TRIGGER_GC); ++ if (ret) ++ break; ++ } + } ++ ++ return ret; + } + + static inline int +@@ -509,11 +515,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + return BTREE_INSERT_NEED_MARK_REPLICAS; + + trans_for_each_update(trans, i) +- if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) +- bch2_mark_update(trans, i->path, i->k, i->flags); ++ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ++ ret = bch2_mark_update(trans, i->path, i->k, i->flags); ++ if (ret) ++ return ret; ++ } + +- if (unlikely(c->gc_pos.phase)) +- bch2_trans_mark_gc(trans); ++ if (unlikely(c->gc_pos.phase)) { ++ ret = bch2_trans_mark_gc(trans); ++ if (ret) ++ return ret; ++ } + + trans_for_each_update(trans, i) + do_btree_insert_one(trans, i); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index c243f304a5cd..cb835e1652ff 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -11,6 +11,7 @@ + #include "btree_gc.h" + #include "btree_update.h" + #include "buckets.h" ++#include "buckets_waiting_for_journal.h" + #include "ec.h" + #include "error.h" + #include "inode.h" +@@ -43,43 +44,6 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, + } + } + +-/* +- * Clear journal_seq_valid for buckets for which it's not needed, to prevent +- * wraparound: +- */ +-void bch2_bucket_seq_cleanup(struct bch_fs *c) +-{ +- u64 journal_seq = atomic64_read(&c->journal.seq); +- u16 last_seq_ondisk = c->journal.flushed_seq_ondisk; +- struct bch_dev *ca; +- struct bucket_array *buckets; +- struct bucket *g; +- struct bucket_mark m; +- unsigned i; +- +- if (journal_seq - c->last_bucket_seq_cleanup < +- (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) +- return; +- +- c->last_bucket_seq_cleanup = journal_seq; +- +- for_each_member_device(ca, c, i) { +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); +- +- for_each_bucket(g, buckets) { +- bucket_cmpxchg(g, m, ({ +- if (!m.journal_seq_valid || +- bucket_needs_journal_commit(m, last_seq_ondisk)) +- break; +- +- m.journal_seq_valid = 0; +- })); +- } +- up_read(&ca->bucket_lock); +- } +-} +- + void bch2_fs_usage_initialize(struct bch_fs *c) + { + struct bch_fs_usage *usage; +@@ -572,16 +536,28 @@ static int bch2_mark_alloc(struct btree_trans *trans, + v->journal_seq = cpu_to_le64(new_u.journal_seq); + } + +- ca = bch_dev_bkey_exists(c, new.k->p.inode); ++ if (old_u.data_type && !new_u.data_type && new_u.journal_seq) { ++ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ new_u.dev, new_u.bucket, ++ new_u.journal_seq); ++ if (ret) { ++ bch2_fs_fatal_error(c, ++ "error setting bucket_needs_journal_commit: %i", ret); ++ return ret; ++ } ++ } ++ ++ ca = bch_dev_bkey_exists(c, new_u.dev); + +- if (new.k->p.offset >= ca->mi.nbuckets) ++ if (new_u.bucket >= ca->mi.nbuckets) + return 0; + + percpu_down_read(&c->mark_lock); + if (!gc && new_u.gen != old_u.gen) +- *bucket_gen(ca, new.k->p.offset) = new_u.gen; ++ *bucket_gen(ca, new_u.bucket) = new_u.gen; + +- g = __bucket(ca, new.k->p.offset, gc); ++ g = __bucket(ca, new_u.bucket, gc); + + old_m = bucket_cmpxchg(g, m, ({ + m.gen = new_u.gen; +@@ -589,11 +565,6 @@ static int bch2_mark_alloc(struct btree_trans *trans, + m.dirty_sectors = new_u.dirty_sectors; + m.cached_sectors = new_u.cached_sectors; + m.stripe = new_u.stripe != 0; +- +- if 
(journal_seq) { +- m.journal_seq_valid = 1; +- m.journal_seq = journal_seq; +- } + })); + + bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); +@@ -621,7 +592,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, + return ret; + } + +- trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), ++ trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket), + old_m.cached_sectors); + } + +@@ -771,9 +742,10 @@ static int check_bucket_ref(struct bch_fs *c, + static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, +- u64 journal_seq, unsigned flags) ++ unsigned flags) + { + struct bch_fs *c = trans->c; ++ u64 journal_seq = trans->journal_res.seq; + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; +@@ -814,11 +786,6 @@ static int mark_stripe_bucket(struct btree_trans *trans, + if (data_type) + new.data_type = data_type; + +- if (journal_seq) { +- new.journal_seq_valid = 1; +- new.journal_seq = journal_seq; +- } +- + new.stripe = true; + })); + +@@ -890,11 +857,6 @@ static int bch2_mark_pointer(struct btree_trans *trans, + + new.data_type = bucket_data_type; + +- if (journal_seq) { +- new.journal_seq_valid = 1; +- new.journal_seq = journal_seq; +- } +- + if (flags & BTREE_TRIGGER_NOATOMIC) { + g->_mark = new; + break; +@@ -1115,7 +1077,7 @@ static int bch2_mark_stripe(struct btree_trans *trans, + memset(m->block_sectors, 0, sizeof(m->block_sectors)); + + for (i = 0; i < new_s->nr_blocks; i++) { +- ret = mark_stripe_bucket(trans, new, i, journal_seq, flags); ++ ret = mark_stripe_bucket(trans, new, i, flags); + if (ret) + return ret; + } +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 45c6d230f242..27f7659ca754 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -159,13 +159,6 @@ static inline bool is_available_bucket(struct bucket_mark mark) + return !mark.dirty_sectors && !mark.stripe; + } + +-static inline bool bucket_needs_journal_commit(struct bucket_mark m, +- u16 last_seq_ondisk) +-{ +- return m.journal_seq_valid && +- ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); +-} +- + /* Device usage: */ + + struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); +@@ -240,7 +233,6 @@ bch2_fs_usage_read_short(struct bch_fs *); + + /* key/bucket marking: */ + +-void bch2_bucket_seq_cleanup(struct bch_fs *); + void bch2_fs_usage_initialize(struct bch_fs *); + + void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 18bca269b750..24139831226d 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -15,18 +15,9 @@ struct bucket_mark { + u8 gen; + u8 data_type:3, + owned_by_allocator:1, +- journal_seq_valid:1, + stripe:1; + u16 dirty_sectors; + u16 cached_sectors; +- +- /* +- * low bits of journal sequence number when this bucket was most +- * recently modified: if journal_seq_valid is set, this bucket can't be +- * reused until the journal sequence number written to disk is >= the +- * bucket's journal sequence number: +- */ +- u16 journal_seq; + }; + }; + }; +diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c +new file mode 100644 +index 000000000000..2e5b955080de +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal.c +@@ -0,0 +1,167 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include 
"buckets_waiting_for_journal.h" ++#include ++ ++static inline struct bucket_hashed * ++bucket_hash(struct buckets_waiting_for_journal_table *t, ++ unsigned hash_seed_idx, u64 dev_bucket) ++{ ++ unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]); ++ ++ BUG_ON(!is_power_of_2(t->size)); ++ ++ return t->d + (h & (t->size - 1)); ++} ++ ++static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size) ++{ ++ unsigned i; ++ ++ t->size = size; ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) ++ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); ++ memset(t->d, 0, sizeof(t->d[0]) * size); ++} ++ ++bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ++ u64 flushed_seq, ++ unsigned dev, u64 bucket) ++{ ++ struct buckets_waiting_for_journal_table *t; ++ u64 dev_bucket = (u64) dev << 56 | bucket; ++ bool ret = false; ++ unsigned i; ++ ++ mutex_lock(&b->lock); ++ t = b->t; ++ ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { ++ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); ++ ++ if (h->dev_bucket == dev_bucket) { ++ ret = h->journal_seq > flushed_seq; ++ break; ++ } ++ } ++ ++ mutex_unlock(&b->lock); ++ ++ return ret; ++} ++ ++static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, ++ struct bucket_hashed *new, ++ u64 flushed_seq) ++{ ++ struct bucket_hashed *last_evicted = NULL; ++ unsigned tries, i; ++ ++ for (tries = 0; tries < 10; tries++) { ++ struct bucket_hashed *old, *victim = NULL; ++ ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { ++ old = bucket_hash(t, i, new->dev_bucket); ++ ++ if (old->dev_bucket == new->dev_bucket || ++ old->journal_seq <= flushed_seq) { ++ *old = *new; ++ return true; ++ } ++ ++ if (last_evicted != old) ++ victim = old; ++ } ++ ++ /* hashed to same slot 3 times: */ ++ if (!victim) ++ break; ++ ++ /* Failed to find an empty slot: */ ++ swap(*new, *victim); ++ last_evicted = victim; ++ } ++ ++ return false; ++} ++ ++int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ++ u64 flushed_seq, ++ unsigned dev, u64 bucket, ++ u64 journal_seq) ++{ ++ struct buckets_waiting_for_journal_table *t, *n; ++ struct bucket_hashed tmp, new = { ++ .dev_bucket = (u64) dev << 56 | bucket, ++ .journal_seq = journal_seq, ++ }; ++ size_t i, new_size, nr_elements = 1, nr_rehashes = 0; ++ int ret = 0; ++ ++ mutex_lock(&b->lock); ++ ++ if (likely(bucket_table_insert(b->t, &new, flushed_seq))) ++ goto out; ++ ++ t = b->t; ++ for (i = 0; i < t->size; i++) ++ nr_elements += t->d[i].journal_seq > flushed_seq; ++ ++ new_size = nr_elements < t->size / 3 ? 
t->size : t->size * 2; ++ ++ n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++retry_rehash: ++ nr_rehashes++; ++ bucket_table_init(n, new_size); ++ ++ tmp = new; ++ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); ++ ++ for (i = 0; i < t->size; i++) { ++ if (t->d[i].journal_seq <= flushed_seq) ++ continue; ++ ++ tmp = t->d[i]; ++ if (!bucket_table_insert(n, &tmp, flushed_seq)) ++ goto retry_rehash; ++ } ++ ++ b->t = n; ++ kvfree(t); ++ ++ pr_debug("took %zu rehashes, table at %zu/%zu elements", ++ nr_rehashes, nr_elements, b->t->size); ++out: ++ mutex_unlock(&b->lock); ++ ++ return ret; ++} ++ ++void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) ++{ ++ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; ++ ++ kvfree(b->t); ++} ++ ++#define INITIAL_TABLE_SIZE 8 ++ ++int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) ++{ ++ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; ++ ++ mutex_init(&b->lock); ++ ++ b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL); ++ if (!b->t) ++ return -ENOMEM; ++ ++ bucket_table_init(b->t, INITIAL_TABLE_SIZE); ++ return 0; ++} +diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h +new file mode 100644 +index 000000000000..d2ae19cbe18c +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H ++#define _BUCKETS_WAITING_FOR_JOURNAL_H ++ ++#include "buckets_waiting_for_journal_types.h" ++ ++bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, ++ u64, unsigned, u64); ++int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, ++ u64, unsigned, u64, u64); ++ ++void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); ++int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); ++ ++#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ +diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h +new file mode 100644 +index 000000000000..fea7f944d0ed +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal_types.h +@@ -0,0 +1,23 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H ++#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H ++ ++#include ++ ++struct bucket_hashed { ++ u64 dev_bucket; ++ u64 journal_seq; ++}; ++ ++struct buckets_waiting_for_journal_table { ++ size_t size; ++ siphash_key_t hash_seeds[3]; ++ struct bucket_hashed d[]; ++}; ++ ++struct buckets_waiting_for_journal { ++ struct mutex lock; ++ struct buckets_waiting_for_journal_table *t; ++}; ++ ++#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index df4d1a7ad533..e566f8516052 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1671,13 +1671,9 @@ retry_alloc: + } + } + +- bch2_bucket_seq_cleanup(c); +- + continue_at(cl, do_journal_write, c->io_complete_wq); + return; + no_io: +- bch2_bucket_seq_cleanup(c); +- + continue_at(cl, journal_write_done, c->io_complete_wq); + return; + err: +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 577b58e43b05..586ba60d03ea 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -16,6 +16,7 @@ + #include "btree_key_cache.h" + #include "btree_update_interior.h" + #include "btree_io.h" ++#include 
"buckets_waiting_for_journal.h" + #include "chardev.h" + #include "checksum.h" + #include "clock.h" +@@ -468,6 +469,7 @@ static void __bch2_fs_free(struct bch_fs *c) + bch2_fs_ec_exit(c); + bch2_fs_encryption_exit(c); + bch2_fs_io_exit(c); ++ bch2_fs_buckets_waiting_for_journal_exit(c); + bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_iter_exit(c); + bch2_fs_btree_key_cache_exit(&c->btree_key_cache); +@@ -810,6 +812,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_interior_update_init(c) ?: ++ bch2_fs_buckets_waiting_for_journal_init(c); + bch2_fs_subvolumes_init(c) ?: + bch2_fs_io_init(c) ?: + bch2_fs_encryption_init(c) ?: +-- +cgit v1.2.3 + + +From e2d750c58a887d8d1f11a7e23b92f414de15c317 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 6 Jan 2022 00:04:56 -0500 +Subject: bcachefs: Fix check_pos_snapshot_overwritten for !snapshots + +It shouldn't run if the btree being checked doesn't have snapshots. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 3e1c89f33330..da9af5b4b19a 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1084,6 +1084,9 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + ++ if (!btree_type_has_snapshots(id)) ++ return 0; ++ + if (!snapshot_t(c, pos.snapshot)->children[0]) + return 0; + +-- +cgit v1.2.3 + + +From fb6244ab44fc16eb2a40bd1071a71c6b3bbc9a13 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 6 Jan 2022 21:38:08 -0500 +Subject: bcachefs: Rename data_op_data_progress -> data_jobs + +Mild refactoring. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/movinggc.c | 5 ++++- + fs/bcachefs/sysfs.c | 36 ++++++++++++------------------------ + 2 files changed, 16 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 7cd1b0cf27e4..4a8c4e3a15e0 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -231,8 +231,11 @@ static int bch2_copygc(struct bch_fs *c) + + buckets_to_move = h->used; + +- if (!buckets_to_move) ++ if (!buckets_to_move) { ++ bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!", ++ sectors_reserved); + return 0; ++ } + + eytzinger0_sort(h->data, h->used, + sizeof(h->data[0]), +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 6d1596322ee2..ed9a095063e8 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -192,7 +192,7 @@ read_attribute(new_stripes); + read_attribute(io_timers_read); + read_attribute(io_timers_write); + +-read_attribute(data_op_data_progress); ++read_attribute(data_jobs); + + #ifdef CONFIG_BCACHEFS_TESTS + write_attribute(perf_test); +@@ -230,32 +230,20 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c) + return nr ? 
div64_u64(sectors, nr) : 0; + } + +-static long stats_to_text(struct printbuf *out, struct bch_fs *c, +- struct bch_move_stats *stats) +-{ +- pr_buf(out, "%s: data type %s btree_id %s position: ", +- stats->name, +- bch2_data_types[stats->data_type], +- bch2_btree_ids[stats->btree_id]); +- bch2_bpos_to_text(out, stats->pos); +- pr_buf(out, "%s", "\n"); +- +- return 0; +-} +- + static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) + { + long ret = 0; +- struct bch_move_stats *iter; ++ struct bch_move_stats *stats; + + mutex_lock(&c->data_progress_lock); +- +- if (list_empty(&c->data_progress_list)) +- pr_buf(out, "%s", "no progress to report\n"); +- else +- list_for_each_entry(iter, &c->data_progress_list, list) { +- stats_to_text(out, c, iter); +- } ++ list_for_each_entry(stats, &c->data_progress_list, list) { ++ pr_buf(out, "%s: data type %s btree_id %s position: ", ++ stats->name, ++ bch2_data_types[stats->data_type], ++ bch2_btree_ids[stats->btree_id]); ++ bch2_bpos_to_text(out, stats->pos); ++ pr_buf(out, "%s", "\n"); ++ } + + mutex_unlock(&c->data_progress_lock); + return ret; +@@ -463,7 +451,7 @@ SHOW(bch2_fs) + return out.pos - buf; + } + +- if (attr == &sysfs_data_op_data_progress) { ++ if (attr == &sysfs_data_jobs) { + data_progress_to_text(&out, c); + return out.pos - buf; + } +@@ -616,7 +604,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_rebalance_work, + sysfs_pd_controller_files(rebalance), + +- &sysfs_data_op_data_progress, ++ &sysfs_data_jobs, + + &sysfs_internal_uuid, + NULL +-- +cgit v1.2.3 + + +From de0b2560b24e8b8b14fc686008948c1d9ea95cad Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 6 Jan 2022 01:20:12 -0500 +Subject: bcachefs: Refactor trigger code + +This breaks bch2_trans_commit_run_triggers() up into multiple functions, +and deletes a bit of duplication - prep work for triggers on alloc keys, +which will need to run last. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 144 +++++++++++++++++++++------------------- + 1 file changed, 76 insertions(+), 68 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index da9af5b4b19a..b06458c46756 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -856,28 +856,63 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + +-static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i, ++ bool overwrite) + { + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + struct bkey_s_c old; + struct bkey unpacked; +- struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; +- bool trans_trigger_run; +- unsigned btree_id = 0; + int ret = 0; + +- /* +- * +- * For a given btree, this algorithm runs insert triggers before +- * overwrite triggers: this is so that when extents are being moved +- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before +- * they are re-added. 
+- */ +- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { +- while (btree_id_start < trans->updates + trans->nr_updates && +- btree_id_start->btree_id < btree_id) +- btree_id_start++; ++ if ((i->flags & BTREE_TRIGGER_NORUN) || ++ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) ++ return 0; ++ ++ if (!overwrite) { ++ if (i->insert_trigger_run) ++ return 0; ++ ++ BUG_ON(i->overwrite_trigger_run); ++ i->insert_trigger_run = true; ++ } else { ++ if (i->overwrite_trigger_run) ++ return 0; ++ ++ BUG_ON(!i->insert_trigger_run); ++ i->overwrite_trigger_run = true; ++ } ++ ++ old = bch2_btree_path_peek_slot(i->path, &unpacked); ++ _deleted.p = i->path->pos; ++ ++ if (overwrite) { ++ ret = bch2_trans_mark_key(trans, old, deleted, ++ BTREE_TRIGGER_OVERWRITE|i->flags); ++ } else if (old.k->type == i->k->k.type && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ i->overwrite_trigger_run = true; ++ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); ++ } else { ++ ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), ++ BTREE_TRIGGER_INSERT|i->flags); ++ } ++ ++ if (ret == -EINTR) ++ trace_trans_restart_mark(trans->fn, _RET_IP_, ++ i->btree_id, &i->path->pos); ++ return ret ?: 1; ++} ++ ++static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, ++ struct btree_insert_entry *btree_id_start) ++{ ++ struct btree_insert_entry *i; ++ bool trans_trigger_run; ++ int ret, overwrite; ++ ++ for (overwrite = 0; overwrite < 2; overwrite++) { + + /* + * Running triggers will append more updates to the list of updates as +@@ -889,66 +924,39 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { +- if (i->insert_trigger_run || +- (i->flags & BTREE_TRIGGER_NORUN) || +- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) +- continue; +- +- BUG_ON(i->overwrite_trigger_run); +- +- i->insert_trigger_run = true; +- trans_trigger_run = true; +- +- old = bch2_btree_path_peek_slot(i->path, &unpacked); +- _deleted.p = i->path->pos; +- +- if (old.k->type == i->k->k.type && +- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { +- i->overwrite_trigger_run = true; +- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), +- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); +- } else { +- ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), +- BTREE_TRIGGER_INSERT|i->flags); +- } +- +- if (ret == -EINTR) +- trace_trans_restart_mark(trans->fn, _RET_IP_, +- i->btree_id, &i->path->pos); +- if (ret) ++ ret = run_one_trigger(trans, i, overwrite); ++ if (ret < 0) + return ret; ++ if (ret) ++ trans_trigger_run = true; + } + } while (trans_trigger_run); ++ } + +- do { +- trans_trigger_run = false; +- +- for (i = btree_id_start; +- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; +- i++) { +- if (i->overwrite_trigger_run || +- (i->flags & BTREE_TRIGGER_NORUN) || +- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) +- continue; +- +- BUG_ON(!i->insert_trigger_run); +- +- i->overwrite_trigger_run = true; +- trans_trigger_run = true; ++ return 0; ++} + +- old = bch2_btree_path_peek_slot(i->path, &unpacked); +- _deleted.p = i->path->pos; ++static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; ++ unsigned btree_id = 0; ++ int ret = 0; 
+ +- ret = bch2_trans_mark_key(trans, old, deleted, +- BTREE_TRIGGER_OVERWRITE|i->flags); ++ /* ++ * ++ * For a given btree, this algorithm runs insert triggers before ++ * overwrite triggers: this is so that when extents are being moved ++ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before ++ * they are re-added. ++ */ ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ while (btree_id_start < trans->updates + trans->nr_updates && ++ btree_id_start->btree_id < btree_id) ++ btree_id_start++; + +- if (ret == -EINTR) +- trace_trans_restart_mark(trans->fn, _RET_IP_, +- i->btree_id, &i->path->pos); +- if (ret) +- return ret; +- } +- } while (trans_trigger_run); ++ ret = run_btree_triggers(trans, btree_id, btree_id_start); ++ if (ret) ++ return ret; + } + + trans_for_each_update(trans, i) +-- +cgit v1.2.3 + + +From 3e528dbda29d5850cb5989d5c8dc986d5d59f2ee Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 8 Jan 2022 03:39:54 -0500 +Subject: bcachefs: Use BTREE_INSERT_USE_RESERVE in btree_update_key() + +bch2_btree_update_key() is used in the btree node write path - before +delivering the completion we have to update the parent pointer with the +number of sectors written. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 47568a0bc5f1..7b8ca1153efe 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1938,6 +1938,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_JOURNAL_RESERVED); + if (ret) +-- +cgit v1.2.3 + + +From 2d0c05096b30a91346626eb7cbfdfb12b86f8318 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 8 Jan 2022 19:07:32 -0500 +Subject: bcachefs: Fix an error path in bch2_snapshot_node_create() + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/subvolume.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index 8aeb2e417a15..012d8e8c52c4 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -522,7 +522,7 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) +- return ret; ++ goto err; + + bkey_reassemble(&n->k_i, k); + +-- +cgit v1.2.3 + + +From ba8e107304b63f21319b98340446cdae543eca03 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 29 Dec 2021 13:50:50 -0500 +Subject: bcachefs: New snapshot unit test + +This still needs to be expanded more, but this adds a basic test for +BTREE_ITER_FILTER_SNAPSHOTS. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/subvolume.c | 8 +++--- + fs/bcachefs/subvolume.h | 4 +++ + fs/bcachefs/tests.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++- + 3 files changed, 78 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index 012d8e8c52c4..69603327d93d 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -456,10 +456,10 @@ err: + return ret; + } + +-static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, +- u32 *new_snapids, +- u32 *snapshot_subvols, +- unsigned nr_snapids) ++int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) + { + struct btree_iter iter; + struct bkey_i_snapshot *n; +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index e4c3fdcdf22f..4abe53df2788 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -122,6 +122,10 @@ int bch2_snapshot_get_subvol(struct btree_trans *, u32, + struct bch_subvolume *); + int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + ++/* only exported for tests: */ ++int bch2_snapshot_node_create(struct btree_trans *, u32, ++ u32 *, u32 *, unsigned); ++ + int bch2_subvolume_delete(struct btree_trans *, u32); + int bch2_subvolume_unlink(struct btree_trans *, u32); + int bch2_subvolume_create(struct btree_trans *, u64, u32, +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 16d67eb6d1c2..de84ce834975 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -4,6 +4,7 @@ + #include "bcachefs.h" + #include "btree_update.h" + #include "journal_reclaim.h" ++#include "subvolume.h" + #include "tests.h" + + #include "linux/kthread.h" +@@ -461,6 +462,70 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) + __test_extent_overwrite(c, 32, 64, 32, 128); + } + ++/* snapshot unit tests */ ++ ++/* Test skipping over keys in unrelated snapshots: */ ++static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie cookie; ++ int ret; ++ ++ bkey_cookie_init(&cookie.k_i); ++ cookie.k.p.snapshot = snapid_hi; ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, snapid_lo), 0); ++ k = bch2_btree_iter_peek(&iter); ++ ++ BUG_ON(k.k->p.snapshot != U32_MAX); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_snapshots(struct bch_fs *c, u64 nr) ++{ ++ struct bkey_i_cookie cookie; ++ u32 snapids[2]; ++ u32 snapid_subvols[2] = { 1, 1 }; ++ int ret; ++ ++ bkey_cookie_init(&cookie.k_i); ++ cookie.k.p.snapshot = U32_MAX; ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_snapshot_node_create(&trans, U32_MAX, ++ snapids, ++ snapid_subvols, ++ 2)); ++ if (ret) ++ return ret; ++ ++ if (snapids[0] > snapids[1]) ++ swap(snapids[0], snapids[1]); ++ ++ ret = test_snapshot_filter(c, snapids[0], snapids[1]); ++ if (ret) { ++ bch_err(c, "err %i from test_snapshot_filter", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ + /* perf tests */ + + static u64 test_rand(void) +@@ -789,8 +854,10 @@ static int btree_perf_test_thread(void *data) + } + + ret = 
j->fn(j->c, div64_u64(j->nr, j->nr_threads)); +- if (ret) ++ if (ret) { ++ bch_err(j->c, "%ps: error %i", j->fn, ret); + j->ret = ret; ++ } + + if (atomic_dec_and_test(&j->done)) { + j->finish = sched_clock(); +@@ -843,6 +910,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + perf_test(test_extent_overwrite_middle); + perf_test(test_extent_overwrite_all); + ++ perf_test(test_snapshots); ++ + if (!j.fn) { + pr_err("unknown test %s", testname); + return -EINVAL; +-- +cgit v1.2.3 + + +From 06581c697be6bd54315a4241885b50463b8c3ec4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 8 Jan 2022 22:59:58 -0500 +Subject: bcachefs: Tracepoint improvements + +This improves the transaction restart tracepoints - adding distinct +tracepoints for all the locations and reasons a transaction might have +been restarted, and ensures that there's a tracepoint for every +transaction restart. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 4 ++ + fs/bcachefs/btree_iter.c | 33 +++++++++---- + fs/bcachefs/btree_key_cache.c | 3 +- + include/trace/events/bcachefs.h | 102 ++++++++++++++++++++++++++++++++++++++-- + 4 files changed, 127 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index fc6c4d4cd02f..986d08d708cc 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -666,6 +666,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + * been freed: + */ + if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { ++ trace_trans_restart_relock_parent_for_fill(trans->fn, ++ _THIS_IP_, btree_id, &path->pos); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } +@@ -713,6 +715,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) { ++ trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, ++ btree_id, &path->pos); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 7df9e4744f64..14290fd67d8c 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -178,19 +178,25 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, + int want = __btree_lock_want(path, level); + + if (!is_btree_node(path, level)) +- return false; ++ goto fail; + + if (race_fault()) +- return false; ++ goto fail; + + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, want))) { + mark_btree_node_locked(path, level, want); + return true; +- } else { +- return false; + } ++fail: ++ trace_btree_node_relock_fail(trans->fn, _RET_IP_, ++ path->btree_id, ++ &path->pos, ++ (unsigned long) b, ++ path->l[level].lock_seq, ++ is_btree_node(path, level) ? 
b->c.lock.state.seq : 0); ++ return false; + } + + bool bch2_btree_node_upgrade(struct btree_trans *trans, +@@ -237,7 +243,7 @@ success: + + static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, +- bool upgrade, unsigned long trace_ip) ++ bool upgrade) + { + unsigned l = path->level; + int fail_idx = -1; +@@ -440,6 +446,8 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans, + if (!bch2_btree_node_relock(trans, path, l)) { + __bch2_btree_path_unlock(path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos); + btree_trans_restart(trans); + return false; + } +@@ -452,10 +460,13 @@ __flatten + static bool bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) + { +- bool ret = btree_path_get_locks(trans, path, false, trace_ip); ++ bool ret = btree_path_get_locks(trans, path, false); + +- if (!ret) ++ if (!ret) { ++ trace_trans_restart_relock_path(trans->fn, trace_ip, ++ path->btree_id, &path->pos); + btree_trans_restart(trans); ++ } + return ret; + } + +@@ -469,7 +480,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, + + path->locks_want = new_locks_want; + +- if (btree_path_get_locks(trans, path, true, _THIS_IP_)) ++ if (btree_path_get_locks(trans, path, true)) + return true; + + /* +@@ -497,7 +508,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, + linked->btree_id == path->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; +- btree_path_get_locks(trans, linked, true, _THIS_IP_); ++ btree_path_get_locks(trans, linked, true); + } + + return false; +@@ -1962,7 +1973,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > path->locks_want) { + path->locks_want = locks_want; +- btree_path_get_locks(trans, path, true, _THIS_IP_); ++ btree_path_get_locks(trans, path, true); + } + + return path; +@@ -2099,6 +2110,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + __bch2_btree_path_unlock(path); + path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, ++ path->btree_id, &path->pos); + btree_trans_restart(trans); + ret = -EINTR; + goto err; +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 1d7b101224f1..faed51e7f4b8 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -222,7 +222,8 @@ static int btree_key_cache_fill(struct btree_trans *trans, + goto err; + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { +- trace_transaction_restart_ip(trans->fn, _THIS_IP_); ++ trace_trans_restart_relock_key_cache_fill(trans->fn, ++ _THIS_IP_, ck_path->btree_id, &ck_path->pos); + ret = btree_trans_restart(trans); + goto err; + } +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 295dcd60e704..8f10d13b27d5 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -346,6 +346,52 @@ TRACE_EVENT(btree_cache_scan, + __entry->ret) + ); + ++TRACE_EVENT(btree_node_relock_fail, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned long node, ++ u32 iter_lock_seq, ++ u32 node_lock_seq), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, 
iter_lock_seq, node_lock_seq), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __array(char, caller, 32 ) ++ __field(u8, btree_id ) ++ __field(u64, pos_inode ) ++ __field(u64, pos_offset ) ++ __field(u32, pos_snapshot ) ++ __field(unsigned long, node ) ++ __field(u32, iter_lock_seq ) ++ __field(u32, node_lock_seq ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip); ++ __entry->btree_id = btree_id; ++ __entry->pos_inode = pos->inode; ++ __entry->pos_offset = pos->offset; ++ __entry->pos_snapshot = pos->snapshot; ++ __entry->node = node; ++ __entry->iter_lock_seq = iter_lock_seq; ++ __entry->node_lock_seq = node_lock_seq; ++ ), ++ ++ TP_printk("%s %s btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", ++ __entry->trans_fn, ++ __entry->caller, ++ __entry->btree_id, ++ __entry->pos_inode, ++ __entry->pos_offset, ++ __entry->pos_snapshot, ++ __entry->node, ++ __entry->iter_lock_seq, ++ __entry->node_lock_seq) ++); ++ + /* Garbage collection */ + + DEFINE_EVENT(btree_node, btree_gc_rewrite_node, +@@ -621,7 +667,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, + + TP_STRUCT__entry( + __array(char, trans_fn, 24 ) +- __field(unsigned long, caller_ip ) ++ __array(char, caller, 32 ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) +@@ -630,16 +676,16 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, + + TP_fast_assign( + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); +- __entry->caller_ip = caller_ip; ++ snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip); + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; + __entry->pos_snapshot = pos->snapshot; + ), + +- TP_printk("%s %pS btree %u pos %llu:%llu:%u", ++ TP_printk("%s %s btree %u pos %llu:%llu:%u", + __entry->trans_fn, +- (void *) __entry->caller_ip, ++ __entry->caller, + __entry->btree_id, + __entry->pos_inode, + __entry->pos_offset, +@@ -694,6 +740,54 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, + TP_ARGS(trans_fn, caller_ip, btree_id, pos) + ); + ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, ++ TP_PROTO(const char *trans_fn, ++ unsigned long 
caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ + DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, +-- +cgit v1.2.3 + + +From ea16b90f6a96fac51d3e0766e86f8c2094c84331 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 9 Jan 2022 01:07:29 -0500 +Subject: bcachefs: Refactor bch2_btree_iter() + +This splits bch2_btree_iter() up into two functions: an inner function +that handles BTREE_ITER_WITH_JOURNAL, BTREE_ITER_WITH_UPDATES, and +iterating acrcoss leaf nodes, and an outer one that implements +BTREE_ITER_FILTER_SNAPHSOTS. + +This is prep work for remember a btree_path at our update position in +BTREE_ITER_FILTER_SNAPSHOTS mode. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 99 +++++++++++++++++++++++++++++------------------- + fs/bcachefs/btree_iter.h | 21 ++++++---- + 2 files changed, 72 insertions(+), 48 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 14290fd67d8c..066ae1bb6140 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -712,9 +712,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); + +- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && +- iter->pos.snapshot != iter->snapshot); +- + BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + +@@ -2231,21 +2228,15 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, + return k; + } + +-/** +- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's +- * current position +- */ +-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) + { + struct btree_trans *trans = iter->trans; +- struct bpos search_key = btree_iter_search_key(iter); + struct bkey_i *next_update; + struct bkey_s_c k; + int ret; + + EBUG_ON(iter->path->cached || iter->path->level); + bch2_btree_iter_verify(iter); +- bch2_btree_iter_verify_entry_exit(iter); + + while (1) { + iter->path = btree_path_set_pos(trans, iter->path, search_key, +@@ -2290,24 +2281,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + } + + if (likely(k.k)) { +- /* +- * We can never have a key in a leaf node at POS_MAX, so +- * we don't have to check these successor() calls: +- */ +- if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && +- !bch2_snapshot_is_ancestor(trans->c, +- iter->snapshot, +- k.k->p.snapshot)) { +- search_key = bpos_successor(k.k->p); +- continue; +- } +- +- if (bkey_whiteout(k.k) && +- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { +- search_key = bkey_successor(iter, k.k->p); +- continue; +- } +- + break; + } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ +@@ -2320,6 +2293,57 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + } + } + ++ iter->path = btree_path_set_pos(trans, iter->path, k.k->p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ BUG_ON(!iter->path->nodes_locked); ++out: ++ iter->path->should_be_locked = true; ++ ++ bch2_btree_iter_verify(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ struct 
btree_trans *trans = iter->trans; ++ struct bpos search_key = btree_iter_search_key(iter); ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ while (1) { ++ k = __bch2_btree_iter_peek(iter, search_key); ++ if (!k.k || bkey_err(k)) ++ goto out; ++ ++ /* ++ * We can never have a key in a leaf node at POS_MAX, so ++ * we don't have to check these successor() calls: ++ */ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ !bch2_snapshot_is_ancestor(trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } ++ ++ if (bkey_whiteout(k.k) && ++ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { ++ search_key = bkey_successor(iter, k.k->p); ++ continue; ++ } ++ ++ break; ++ } ++ + /* + * iter->pos should be mononotically increasing, and always be equal to + * the key we just returned - except extents can straddle iter->pos: +@@ -2328,22 +2352,17 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + iter->pos = k.k->p; + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); +- +- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++out: ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + iter->pos.snapshot = iter->snapshot; + +- iter->path = btree_path_set_pos(trans, iter->path, k.k->p, +- iter->flags & BTREE_ITER_INTENT, +- btree_iter_ip_allocated(iter)); +- BUG_ON(!iter->path->nodes_locked); +-out: +- iter->path->should_be_locked = true; ++ ret = bch2_btree_iter_verify_ret(iter, k); ++ if (unlikely(ret)) { ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ } + + bch2_btree_iter_verify_entry_exit(iter); +- bch2_btree_iter_verify(iter); +- ret = bch2_btree_iter_verify_ret(iter, k); +- if (unlikely(ret)) +- return bkey_s_c_err(ret); + + return k; + } +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index eceec5d55f9b..a4840ee40569 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -222,11 +222,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); + bool bch2_btree_iter_advance(struct btree_iter *); + bool bch2_btree_iter_rewind(struct btree_iter *); + +-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + { +- if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) +- new_pos.snapshot = iter->snapshot; +- + iter->k.type = KEY_TYPE_deleted; + iter->k.p.inode = iter->pos.inode = new_pos.inode; + iter->k.p.offset = iter->pos.offset = new_pos.offset; +@@ -234,6 +231,14 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos + iter->k.size = 0; + } + ++static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ new_pos.snapshot = iter->snapshot; ++ ++ __bch2_btree_iter_set_pos(iter, new_pos); ++} ++ + static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) + { + BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); +@@ -295,7 +300,7 @@ static inline int bkey_err(struct bkey_s_c k) + return PTR_ERR_OR_ZERO(k.k); + } + +-static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, ++static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, + unsigned flags) + { + return flags & BTREE_ITER_SLOTS +@@ -316,7 +321,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + struct bkey_s_c k; + + while 
(btree_trans_too_many_iters(trans) || +- (k = __bch2_btree_iter_peek(iter, flags), ++ (k = bch2_btree_iter_peek_type(iter, flags), + bkey_err(k) == -EINTR)) + bch2_trans_begin(trans); + +@@ -335,7 +340,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ +- (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +@@ -347,7 +352,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + + #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for (; \ +- (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +-- +cgit v1.2.3 + + +From 75ed23af6088b202d2d508fc8ce688d85c1ad2fd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 8 Jan 2022 21:22:31 -0500 +Subject: bcachefs: iter->update_path + +With BTREE_ITER_FILTER_SNAPSHOTS, we have to distinguish between the +path where the key was found, and the path for inserting into the +current snapshot. This adds a new field to struct btree_iter for saving +a path for the current snapshot, and plumbs it through +bch2_trans_update(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 69 ++++++++++++++++++++++++++++++++++++----- + fs/bcachefs/btree_iter.h | 5 +++ + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update.h | 6 ++++ + fs/bcachefs/btree_update_leaf.c | 64 ++++++++++++++++++++------------------ + 5 files changed, 107 insertions(+), 38 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 066ae1bb6140..95089a6fcf35 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -719,6 +719,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + ++ if (iter->update_path) ++ bch2_btree_path_verify(trans, iter->update_path); + bch2_btree_path_verify(trans, iter->path); + } + +@@ -2292,14 +2294,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp + goto out; + } + } +- +- iter->path = btree_path_set_pos(trans, iter->path, k.k->p, +- iter->flags & BTREE_ITER_INTENT, +- btree_iter_ip_allocated(iter)); +- BUG_ON(!iter->path->nodes_locked); + out: +- iter->path->should_be_locked = true; +- + bch2_btree_iter_verify(iter); + + return k; +@@ -2316,6 +2311,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + struct bkey_s_c k; + int ret; + ++ if (iter->update_path) { ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ } ++ + bch2_btree_iter_verify_entry_exit(iter); + + while (1) { +@@ -2323,6 +2324,42 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + if (!k.k || bkey_err(k)) + goto out; + ++ if (iter->update_path && ++ bkey_cmp(iter->update_path->pos, k.k->p)) { ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ } ++ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ (iter->flags & BTREE_ITER_INTENT) && ++ !(iter->flags & BTREE_ITER_IS_EXTENTS) && ++ !iter->update_path) { ++ struct bpos pos = k.k->p; ++ ++ if (pos.snapshot < iter->snapshot) { ++ search_key = 
bpos_successor(k.k->p); ++ continue; ++ } ++ ++ pos.snapshot = iter->snapshot; ++ ++ /* ++ * advance, same as on exit for iter->path, but only up ++ * to snapshot ++ */ ++ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = iter->path; ++ ++ iter->update_path = btree_path_set_pos(trans, ++ iter->update_path, pos, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ BUG_ON(!(iter->update_path->nodes_locked & 1)); ++ iter->update_path->should_be_locked = true; ++ } ++ + /* + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: +@@ -2352,7 +2389,18 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + iter->pos = k.k->p; + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); ++ ++ iter->path = btree_path_set_pos(trans, iter->path, k.k->p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ BUG_ON(!iter->path->nodes_locked); + out: ++ if (iter->update_path) { ++ BUG_ON(!(iter->update_path->nodes_locked & 1)); ++ iter->update_path->should_be_locked = true; ++ } ++ iter->path->should_be_locked = true; ++ + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + iter->pos.snapshot = iter->snapshot; + +@@ -2752,7 +2800,11 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); ++ if (iter->update_path) ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); + iter->path = NULL; ++ iter->update_path = NULL; + } + + static void __bch2_trans_iter_init(struct btree_trans *trans, +@@ -2782,6 +2834,7 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + + iter->trans = trans; + iter->path = NULL; ++ iter->update_path = NULL; + iter->btree_id = btree_id; + iter->min_depth = depth; + iter->flags = flags; +@@ -2830,6 +2883,8 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) + *dst = *src; + if (src->path) + __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); ++ if (src->update_path) ++ __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); + } + + void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index a4840ee40569..5205d53ce8dc 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -233,6 +233,11 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo + + static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + { ++ if (unlikely(iter->update_path)) ++ bch2_path_put(iter->trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 914d536cd29e..65f460e3c567 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -276,6 +276,7 @@ static inline struct btree_path_level *path_l(struct btree_path *path) + struct btree_iter { + struct btree_trans *trans; + struct btree_path *path; ++ struct btree_path *update_path; + + enum btree_id btree_id:4; + unsigned min_depth:4; +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 16ebf1a2b1f9..6b3d08406a81 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -73,8 +73,14 @@ 
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + int bch2_btree_node_update_key_get_iter(struct btree_trans *, + struct btree *, struct bkey_i *, bool); + ++int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_update_flags); ++ ++int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, ++ struct bkey_i *, enum btree_update_flags); + int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); ++ + void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); + int __bch2_trans_commit(struct btree_trans *); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index b06458c46756..fbb8fa0fe746 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1123,10 +1123,10 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, + return ret; + } + +-static int bch2_trans_update_extent(struct btree_trans *trans, +- struct btree_iter *orig_iter, +- struct bkey_i *insert, +- enum btree_update_flags flags) ++int bch2_trans_update_extent(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert, ++ enum btree_update_flags flags) + { + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; +@@ -1284,13 +1284,9 @@ nomerge1: + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); + +- bch2_trans_copy_iter(&update_iter, &iter); +- update_iter.pos = update->k.p; +- ret = bch2_trans_update(trans, &update_iter, update, ++ ret = bch2_trans_update_by_path(trans, iter.path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); +- bch2_trans_iter_exit(trans, &update_iter); +- + if (ret) + goto err; + goto out; +@@ -1373,26 +1369,23 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, + return ret; + } + +-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags) + { + struct btree_insert_entry *i, n; + +- BUG_ON(!iter->path->should_be_locked); +- +- if (iter->flags & BTREE_ITER_IS_EXTENTS) +- return bch2_trans_update_extent(trans, iter, k, flags); ++ BUG_ON(!path->should_be_locked); + + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); +- BUG_ON(bpos_cmp(k->k.p, iter->path->pos)); ++ BUG_ON(bpos_cmp(k->k.p, path->pos)); + + n = (struct btree_insert_entry) { + .flags = flags, +- .bkey_type = __btree_node_type(iter->path->level, iter->btree_id), +- .btree_id = iter->btree_id, +- .level = iter->path->level, +- .cached = iter->flags & BTREE_ITER_CACHED, +- .path = iter->path, ++ .bkey_type = __btree_node_type(path->level, path->btree_id), ++ .btree_id = path->btree_id, ++ .level = path->level, ++ .cached = path->cached, ++ .path = path, + .k = k, + .ip_allocated = _RET_IP_, + }; +@@ -1403,16 +1396,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter + btree_insert_entry_cmp(i - 1, i) >= 0); + #endif + +- if (bkey_deleted(&n.k->k) && +- (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { +- int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); +- if (unlikely(ret < 0)) +- return ret; +- +- if (ret) +- n.k->k.type = KEY_TYPE_whiteout; +- } +- + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: 
+@@ -1443,10 +1426,29 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter + i - trans->updates, n); + + __btree_path_get(n.path, true); +- + return 0; + } + ++int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return bch2_trans_update_extent(trans, iter, k, flags); ++ ++ if (bkey_deleted(&k->k) && ++ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { ++ int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ if (ret) ++ k->k.type = KEY_TYPE_whiteout; ++ } ++ ++ return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path, ++ k, flags); ++} ++ + void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) + { +-- +cgit v1.2.3 + + +From 8d990e1f10984bcc8f4d806b39e8791bdbd2e80a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 27 Dec 2021 18:25:23 -0500 +Subject: bcachefs: Simplify bch2_inode_delete_keys() + +Had a bug report that implies bch2_inode_delete_keys() returned -EINTR +before it completed, so this patch simplifies it and makes the flow +control a little more conventional. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/inode.c | 57 +++++++++++++++++++++-------------------------------- + 1 file changed, 22 insertions(+), 35 deletions(-) + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index ef6da53567b8..3a7c14684102 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -585,62 +585,49 @@ found_slot: + static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) + { +- u64 offset = 0; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i delete; ++ u32 snapshot; + int ret = 0; + +- while (!ret || ret == -EINTR) { +- struct disk_reservation disk_res = +- bch2_disk_reservation_init(trans->c, 0); +- struct btree_iter iter; +- struct bkey_s_c k; +- struct bkey_i delete; +- u32 snapshot; ++ /* ++ * We're never going to be deleting extents, no need to use an extent ++ * iterator: ++ */ ++ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); + ++ while (1) { + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) +- continue; ++ goto err; + +- bch2_trans_iter_init(trans, &iter, id, +- SPOS(inum.inum, offset, snapshot), +- BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek(&iter); +- +- if (!k.k || iter.pos.inode != inum.inum) { +- bch2_trans_iter_exit(trans, &iter); +- break; +- } ++ bch2_btree_iter_set_snapshot(&iter, snapshot); + ++ k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + ++ if (!k.k || iter.pos.inode != inum.inum) ++ break; ++ + bkey_init(&delete.k); + delete.k.p = iter.pos; + +- if (btree_node_type_is_extents(iter.btree_id)) { +- unsigned max_sectors = +- min_t(u64, U64_MAX - iter.pos.offset, +- KEY_SIZE_MAX & (~0 << trans->c->block_bits)); +- +- /* create the biggest key we can */ +- bch2_key_resize(&delete.k, max_sectors); +- +- ret = bch2_extent_trim_atomic(trans, &iter, &delete); +- if (ret) +- goto err; +- } +- + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: +- bch2_trans_commit(trans, &disk_res, NULL, ++ bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +- bch2_disk_reservation_put(trans->c, &disk_res); + err: +- offset = iter.pos.offset; +- bch2_trans_iter_exit(trans, 
&iter); ++ if (ret && ret != -EINTR) ++ break; + } + ++ bch2_trans_iter_exit(trans, &iter); + return ret; + } + +-- +cgit v1.2.3 + + +From 8a9c66974797d2b529b547797d062bf8fb699dfb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 9 Jan 2022 20:52:10 -0500 +Subject: bcachefs: Handle transaction restarts in __bch2_move_data() + +We weren't checking for -EINTR in the main loop in __bch2_move_data - +this code predates modern transaction restarts. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index f73be9cb7ac3..3e3dcec327a0 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -700,17 +700,20 @@ static int __bch2_move_data(struct bch_fs *c, + bch2_trans_begin(&trans); + + k = bch2_btree_iter_peek(&iter); +- +- stats->pos = iter.pos; +- + if (!k.k) + break; ++ + ret = bkey_err(k); ++ if (ret == -EINTR) ++ continue; + if (ret) + break; ++ + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + ++ stats->pos = iter.pos; ++ + if (!bkey_extent_is_direct_data(k.k)) + goto next_nondata; + +@@ -753,10 +756,8 @@ static int __bch2_move_data(struct bch_fs *c, + ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, + data_cmd, data_opts); + if (ret2) { +- if (ret2 == -EINTR) { +- bch2_trans_begin(&trans); ++ if (ret2 == -EINTR) + continue; +- } + + if (ret2 == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ +-- +cgit v1.2.3 + + +From 8e175ac6b0ca17b4c10d54bf91bcd05dfbe07a19 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 9 Jan 2022 20:55:58 -0500 +Subject: bcachefs: BTREE_INSERT_LAZY_RW is only for recovery path + +BTREE_INSERT_LAZY_RW shouldn't do anything after the filesystem has +finished starting up - otherwise, it might interfere with going +read-only as part of shutting down. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index fbb8fa0fe746..241335da2399 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -840,7 +840,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + struct bch_fs *c = trans->c; + int ret; + +- if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) ++ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || ++ test_bit(BCH_FS_STARTED, &c->flags)) + return -EROFS; + + bch2_trans_unlock(trans); +-- +cgit v1.2.3 + + +From f28eed2570a2b090fcbceb2fa45403f5c6a03cc4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 24 Dec 2021 00:34:48 -0500 +Subject: bcachefs: Kill allocator short-circuit invalidate + +The allocator thread invalidates buckets (increments their generation +number) prior to discarding them and putting them on freelists. We've +had a short circuit path for some time to only update the in-memory +bucket mark when doing the invalidate if we're not invalidating cached +data, but that short-circuit path hasn't really been needed for quite +some time (likely since the btree key cache code was added). + +We're deleting it now as part of deleting/converting code that uses the +in memory bucket array. 
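
For reference, the surviving (btree) invalidate path always rewrites the bucket's alloc key; a minimal sketch of what that amounts to, using the bkey_alloc_unpacked fields and helpers that bucket_invalidate_btree() below already uses (illustrative only, error handling elided):

static int sketch_bucket_invalidate(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_alloc_unpacked *u)
{
	struct bch_fs *c = trans->c;

	/* bump the generation: pointers into the old gen are now stale */
	u->gen++;

	/* the bucket is empty as far as the allocator is concerned: */
	u->data_type		= 0;
	u->dirty_sectors	= 0;
	u->cached_sectors	= 0;
	u->read_time		= atomic64_read(&c->io_clock[READ].now);
	u->write_time		= atomic64_read(&c->io_clock[WRITE].now);

	/* persist through the btree, running the bucket-invalidate trigger: */
	return bch2_alloc_write(trans, iter, u, BTREE_TRIGGER_BUCKET_INVALIDATE);
}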
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 78 +++++++++++++++++++----------------------- + 1 file changed, 35 insertions(+), 43 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 2147b624d27a..82274c64ce93 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -668,10 +668,10 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) + } + + static int bucket_invalidate_btree(struct btree_trans *trans, +- struct bch_dev *ca, u64 b) ++ struct bch_dev *ca, u64 b, ++ struct bkey_alloc_unpacked *u) + { + struct bch_fs *c = trans->c; +- struct bkey_alloc_unpacked u; + struct btree_iter iter; + int ret; + +@@ -685,16 +685,16 @@ static int bucket_invalidate_btree(struct btree_trans *trans, + if (ret) + goto err; + +- u = alloc_mem_to_key(c, &iter); ++ *u = alloc_mem_to_key(c, &iter); + +- u.gen++; +- u.data_type = 0; +- u.dirty_sectors = 0; +- u.cached_sectors = 0; +- u.read_time = atomic64_read(&c->io_clock[READ].now); +- u.write_time = atomic64_read(&c->io_clock[WRITE].now); ++ u->gen++; ++ u->data_type = 0; ++ u->dirty_sectors = 0; ++ u->cached_sectors = 0; ++ u->read_time = atomic64_read(&c->io_clock[READ].now); ++ u->write_time = atomic64_read(&c->io_clock[WRITE].now); + +- ret = bch2_alloc_write(trans, &iter, &u, ++ ret = bch2_alloc_write(trans, &iter, u, + BTREE_TRIGGER_BUCKET_INVALIDATE); + err: + bch2_trans_iter_exit(trans, &iter); +@@ -704,21 +704,24 @@ err: + static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 *journal_seq, unsigned flags) + { +- struct bucket *g; +- struct bucket_mark m; ++ struct bkey_alloc_unpacked u; + size_t b; ++ u64 commit_seq = 0; + int ret = 0; + ++ /* ++ * If the read-only path is trying to shut down, we can't be generating ++ * new btree updates: ++ */ ++ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) ++ return 1; ++ + BUG_ON(!ca->alloc_heap.used || + !ca->alloc_heap.data[0].nr); + b = ca->alloc_heap.data[0].bucket; + + /* first, put on free_inc and mark as owned by allocator: */ + percpu_down_read(&c->mark_lock); +- g = bucket(ca, b); +- m = READ_ONCE(g->mark); +- +- BUG_ON(m.dirty_sectors); + + bch2_mark_alloc_bucket(c, ca, b, true); + +@@ -727,39 +730,15 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + BUG_ON(!fifo_push(&ca->free_inc, b)); + spin_unlock(&c->freelist_lock); + +- /* +- * If we're not invalidating cached data, we only increment the bucket +- * gen in memory here, the incremented gen will be updated in the btree +- * by bch2_trans_mark_pointer(): +- */ +- if (!m.data_type && +- !bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, +- c->journal.flushed_seq_ondisk, +- ca->dev_idx, b)) { +- bucket_cmpxchg(g, m, m.gen++); +- *bucket_gen(ca, b) = m.gen; +- percpu_up_read(&c->mark_lock); +- goto out; +- } +- + percpu_up_read(&c->mark_lock); + +- /* +- * If the read-only path is trying to shut down, we can't be generating +- * new btree updates: +- */ +- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { +- ret = 1; +- goto out; +- } +- +- ret = bch2_trans_do(c, NULL, journal_seq, ++ ret = bch2_trans_do(c, NULL, &commit_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RESERVED| + flags, +- bucket_invalidate_btree(&trans, ca, b)); +-out: ++ bucket_invalidate_btree(&trans, ca, b, &u)); ++ + if (!ret) { + /* remove from alloc_heap: */ + struct alloc_heap_entry e, *top = ca->alloc_heap.data; +@@ -769,6 +748,19 
@@ out: + + if (!top->nr) + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ ++ /* ++ * If we invalidating cached data then we need to wait on the ++ * journal commit: ++ */ ++ if (u.data_type) ++ *journal_seq = max(*journal_seq, commit_seq); ++ ++ /* ++ * We already waiting on u.alloc_seq when we filtered out ++ * buckets that need journal commit: ++ */ ++ BUG_ON(*journal_seq > u.journal_seq); + } else { + size_t b2; + +-- +cgit v1.2.3 + + +From faff82eab4c329e6476fc8398ca16cb4ae8a10c9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 20:36:47 -0500 +Subject: bcachefs: Don't use in-memory bucket array for alloc updates + +More prep work for getting rid of the in-memory bucket array: now that +we have BTREE_ITER_WITH_JOURNAL, the allocator code can do ntree lookups +before journal replay is finished, and there's no longer any need for it +to get allocation information from the in-memory bucket array. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 15 ++++++++------- + fs/bcachefs/btree_iter.c | 17 +++++++++++++++++ + fs/bcachefs/btree_update.h | 17 ----------------- + fs/bcachefs/buckets.c | 16 +++++++--------- + 4 files changed, 32 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 82274c64ce93..f7d6cccc0725 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -464,19 +464,20 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + { + struct bch_fs *c = trans->c; + struct btree_iter iter; ++ struct bkey_s_c k; + struct bkey_alloc_unpacked u; + u64 *time, now; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), + BTREE_ITER_CACHED| +- BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(&iter); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); + if (ret) + goto out; + +- u = alloc_mem_to_key(c, &iter); ++ u = bch2_alloc_unpack(k); + + time = rw == READ ? 
&u.read_time : &u.write_time; + now = atomic64_read(&c->io_clock[rw].now); +@@ -673,20 +674,20 @@ static int bucket_invalidate_btree(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_iter iter; ++ struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ca->dev_idx, b), + BTREE_ITER_CACHED| +- BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + +- ret = bch2_btree_iter_traverse(&iter); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); + if (ret) + goto err; + +- *u = alloc_mem_to_key(c, &iter); +- ++ *u = bch2_alloc_unpack(k); + u->gen++; + u->data_type = 0; + u->dirty_sectors = 0; +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 95089a6fcf35..4ffdfa01bcd2 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2194,6 +2194,23 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + return ret; + } + ++static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ if ((cmp_int(btree_id, i->btree_id) ?: ++ bpos_cmp(pos, i->k->k.p)) <= 0) { ++ if (btree_id == i->btree_id) ++ return i->k; ++ break; ++ } ++ ++ return NULL; ++} ++ + static noinline + struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, + struct btree_path *path) +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 6b3d08406a81..5e5a1b5e750e 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -141,21 +141,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + (_i) < (_trans)->updates + (_trans)->nr_updates; \ + (_i)++) + +-static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, +- enum btree_id btree_id, +- struct bpos pos) +-{ +- struct btree_insert_entry *i; +- +- trans_for_each_update(trans, i) +- if ((cmp_int(btree_id, i->btree_id) ?: +- bpos_cmp(pos, i->k->k.p)) <= 0) { +- if (btree_id == i->btree_id) +- return i->k; +- break; +- } +- +- return NULL; +-} +- + #endif /* _BCACHEFS_BTREE_UPDATE_H */ +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index cb835e1652ff..c0e7b82df147 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1425,24 +1425,22 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree + { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); +- struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos); ++ struct bkey_s_c k; + int ret; + +- bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, ++ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, ++ POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)), ++ BTREE_ITER_WITH_UPDATES| + BTREE_ITER_CACHED| +- BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); +- ret = bch2_btree_iter_traverse(iter); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ret; + } + +- *u = update && !bpos_cmp(update->k.p, pos) +- ? 
bch2_alloc_unpack(bkey_i_to_s_c(update)) +- : alloc_mem_to_key(c, iter); +- ++ *u = bch2_alloc_unpack(k); + return 0; + } + +-- +cgit v1.2.3 + + +From 147d7a1f1b7e9fe3b76ebc94bb6e8ad2d1de69f2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 10 Jan 2022 19:46:39 -0500 +Subject: bcachefs: Ignore cached data when calculating fragmentation + +Previously, bucket fragmentation was considered to be bucket size - +total amount of live data, both dirty and cached. + +This meant that if a bucket was full but only a small amount of data in +it was dirty - the rest cached, we'd get stuck: copygc wouldn't move the +dirty data out of the bucket and the allocator wouldn't be able to +invalidate and drop the cached data. + +This changes fragmentation to exclude cached data, so that copygc will +evacuate these buckets and copygc/the allocator will always be able to +make forward progress. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 2 +- + fs/bcachefs/buckets.c | 4 ++-- + fs/bcachefs/buckets.h | 5 ----- + fs/bcachefs/movinggc.c | 21 ++++++++++++--------- + 4 files changed, 15 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index f7d6cccc0725..2b82ab7aab86 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -544,7 +544,7 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, + u64 now, u64 last_seq_ondisk) + { +- unsigned used = bucket_sectors_used(m); ++ unsigned used = m.cached_sectors; + + if (used) { + /* +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index c0e7b82df147..2be6b0fb967f 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -287,8 +287,8 @@ static inline int is_unavailable_bucket(struct bucket_mark m) + static inline int bucket_sectors_fragmented(struct bch_dev *ca, + struct bucket_mark m) + { +- return bucket_sectors_used(m) +- ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) ++ return m.dirty_sectors ++ ? 
max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors) + : 0; + } + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 27f7659ca754..d35c96bcf3a1 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -149,11 +149,6 @@ static inline u8 ptr_stale(struct bch_dev *ca, + + /* bucket gc marks */ + +-static inline unsigned bucket_sectors_used(struct bucket_mark mark) +-{ +- return mark.dirty_sectors + mark.cached_sectors; +-} +- + static inline bool is_available_bucket(struct bucket_mark mark) + { + return !mark.dirty_sectors && !mark.stripe; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 4a8c4e3a15e0..92f78907bcb6 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -69,10 +69,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + .dev = p.ptr.dev, + .offset = p.ptr.offset, + }; ++ ssize_t i; + +- ssize_t i = eytzinger0_find_le(h->data, h->used, +- sizeof(h->data[0]), +- bucket_offset_cmp, &search); ++ if (p.ptr.cached) ++ continue; ++ ++ i = eytzinger0_find_le(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, &search); + #if 0 + /* eytzinger search verify code: */ + ssize_t j = -1, k; +@@ -185,8 +189,7 @@ static int bch2_copygc(struct bch_fs *c) + + if (m.owned_by_allocator || + m.data_type != BCH_DATA_user || +- !bucket_sectors_used(m) || +- bucket_sectors_used(m) >= ca->mi.bucket_size) ++ m.dirty_sectors >= ca->mi.bucket_size) + continue; + + WARN_ON(m.stripe && !g->stripe_redundancy); +@@ -195,9 +198,9 @@ static int bch2_copygc(struct bch_fs *c) + .dev = dev_idx, + .gen = m.gen, + .replicas = 1 + g->stripe_redundancy, +- .fragmentation = bucket_sectors_used(m) * (1U << 15) ++ .fragmentation = m.dirty_sectors * (1U << 15) + / ca->mi.bucket_size, +- .sectors = bucket_sectors_used(m), ++ .sectors = m.dirty_sectors, + .offset = bucket_to_sector(ca, b), + }; + heap_add_or_replace(h, e, -fragmentation_cmp, NULL); +@@ -263,8 +266,8 @@ static int bch2_copygc(struct bch_fs *c) + m = READ_ONCE(buckets->b[b].mark); + + if (i->gen == m.gen && +- bucket_sectors_used(m)) { +- sectors_not_moved += bucket_sectors_used(m); ++ m.dirty_sectors) { ++ sectors_not_moved += m.dirty_sectors; + buckets_not_moved++; + } + } +-- +cgit v1.2.3 + + +From 5f24cd86f60e38c9628c551a667845834a4ea814 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 11 Jan 2022 00:19:52 -0500 +Subject: bcachefs: Delete some dead code + +__bch2_mark_replicas() is now only used in one place, so inline it into +the caller. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/replicas.c | 12 ++---------- + 1 file changed, 2 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index a08f1e084a9d..96994b7a75a5 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -414,18 +414,10 @@ err: + goto out; + } + +-static int __bch2_mark_replicas(struct bch_fs *c, +- struct bch_replicas_entry *r, +- bool check) +-{ +- return likely(bch2_replicas_marked(c, r)) ? 0 +- : check ? -1 +- : bch2_mark_replicas_slowpath(c, r); +-} +- + int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) + { +- return __bch2_mark_replicas(c, r, false); ++ return likely(bch2_replicas_marked(c, r)) ++ ? 
0 : bch2_mark_replicas_slowpath(c, r); + } + + /* replicas delta list: */ +-- +cgit v1.2.3 + + +From 22c12cd67bfa014c268655f1833653d5957d3c3a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 11 Jan 2022 22:08:44 -0500 +Subject: bcachefs: Log message improvements + +Change the error messages in bch2_inconsistent_error() and +bch2_fatal_error() so we can distinguish them. + +Also, prefer bch2_fs_fatal_error() (which also logs an error message) to +bch2_fatal_error(), and change a call to bch2_inconsistent_error() to +bch2_fatal_error() when we can't continue. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 5 ++--- + fs/bcachefs/error.c | 4 ++-- + fs/bcachefs/journal_io.c | 2 +- + 3 files changed, 5 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 241335da2399..a21d35f29b46 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -662,9 +662,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); +- bch_err(c, "invalid bkey %s on insert from %s -> %ps: %s\n", +- buf, trans->fn, (void *) i->ip_allocated, invalid); +- bch2_fatal_error(c); ++ bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", ++ buf, trans->fn, (void *) i->ip_allocated, invalid); + return -EINVAL; + } + btree_insert_entry_checks(trans, i); +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +index 2cea694575e9..8279a9ba76a5 100644 +--- a/fs/bcachefs/error.c ++++ b/fs/bcachefs/error.c +@@ -15,7 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) + return false; + case BCH_ON_ERROR_ro: + if (bch2_fs_emergency_read_only(c)) +- bch_err(c, "emergency read only"); ++ bch_err(c, "inconsistency detected - emergency read only"); + return true; + case BCH_ON_ERROR_panic: + panic(bch2_fmt(c, "panic after error")); +@@ -35,7 +35,7 @@ void bch2_topology_error(struct bch_fs *c) + void bch2_fatal_error(struct bch_fs *c) + { + if (bch2_fs_emergency_read_only(c)) +- bch_err(c, "emergency read only"); ++ bch_err(c, "fatal error - emergency read only"); + } + + void bch2_io_error_work(struct work_struct *work) +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index e566f8516052..651828b8bc97 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1677,6 +1677,6 @@ no_io: + continue_at(cl, journal_write_done, c->io_complete_wq); + return; + err: +- bch2_inconsistent_error(c); ++ bch2_fatal_error(c); + continue_at(cl, journal_write_done, c->io_complete_wq); + } +-- +cgit v1.2.3 + + +From 06a74c4ace01ca91243718f62e4af7dc136f0af4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 7 Feb 2022 01:19:39 -0500 +Subject: bcachefs: Don't keep nodes in btree_reserve locked + +These nodes aren't reachable by other threads, so there's no need to +keep it locked - and this fixes a bug with the assertion in +bch2_trans_unlock() firing on transaction restart. 
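
The resulting locking pattern, as a rough sketch (the six lock/unlock calls and their NULL wait arguments are the ones used in the hunks below; error handling elided):

static struct btree *sketch_prealloc_node(struct bch_fs *c)
{
	struct btree *b = bch2_btree_node_mem_alloc(c);

	/* not reachable by other threads yet, so don't sit on its locks: */
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
	return b;
}

static void sketch_take_prealloc_node(struct btree *b)
{
	/*
	 * locks are retaken only when the node is pulled back out of the
	 * prealloc array, to be set up for use or freed:
	 */
	six_lock_intent(&b->c.lock, NULL, NULL);
	six_lock_write(&b->c.lock, NULL, NULL);
}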
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 7b8ca1153efe..f0a0880d97b7 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -243,6 +243,8 @@ retry: + bch2_alloc_sectors_done(c, wp); + mem_alloc: + b = bch2_btree_node_mem_alloc(c); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + + /* we hold cannibalize_lock: */ + BUG_ON(IS_ERR(b)); +@@ -265,6 +267,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + + b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); ++ + set_btree_node_accessed(b); + set_btree_node_dirty(c, b); + set_btree_node_need_write(b); +@@ -378,7 +383,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) + while (as->nr_prealloc_nodes) { + struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + +- six_unlock_write(&b->c.lock); ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); + + if (c->btree_reserve_cache_nr < + ARRAY_SIZE(c->btree_reserve_cache)) { +@@ -392,10 +398,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) + bch2_open_buckets_put(c, &b->ob); + } + +- btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); +- + six_unlock_intent(&b->c.lock); + } + +-- +cgit v1.2.3 + + +From ca9cb0a91761c9914de2e6c77ca8414cb7c41e12 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 11 Jan 2022 23:24:43 -0500 +Subject: bcachefs: Don't call into the allocator with btree locks held + +This also simplifies bch2_btree_update_start() a fair amount, by always +dropping btree node locks before calling into external code that could +block, instead of having an unlock/block/retry path. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 80 +++++++++++++++++-------------------- + 1 file changed, 36 insertions(+), 44 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index f0a0880d97b7..a0f7a9f06b98 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -407,39 +407,52 @@ static void bch2_btree_reserve_put(struct btree_update *as) + } + + static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, +- unsigned flags, struct closure *cl) ++ unsigned flags) + { + struct bch_fs *c = as->c; ++ struct closure cl; + struct btree *b; + int ret; + ++ closure_init_stack(&cl); ++retry: ++ + BUG_ON(nr_nodes > BTREE_RESERVE_MAX); + + /* + * Protects reaping from the btree node cache and using the btree node + * open bucket reserve: ++ * ++ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not ++ * blocking on this lock: + */ +- ret = bch2_btree_cache_cannibalize_lock(c, cl); ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); + if (ret) +- return ret; ++ goto err; + + while (as->nr_prealloc_nodes < nr_nodes) { + b = __bch2_btree_node_alloc(c, &as->disk_res, + flags & BTREE_INSERT_NOWAIT +- ? NULL : cl, flags); ++ ? 
NULL : &cl, flags); + if (IS_ERR(b)) { + ret = PTR_ERR(b); +- goto err_free; ++ goto err; + } + + as->prealloc_nodes[as->nr_prealloc_nodes++] = b; + } + + bch2_btree_cache_cannibalize_unlock(c); ++ closure_sync(&cl); + return 0; +-err_free: ++err: + bch2_btree_cache_cannibalize_unlock(c); +- trace_btree_reserve_get_fail(c, nr_nodes, cl); ++ closure_sync(&cl); ++ ++ if (ret == -EAGAIN) ++ goto retry; ++ ++ trace_btree_reserve_get_fail(c, nr_nodes, &cl); + return ret; + } + +@@ -939,7 +952,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + { + struct bch_fs *c = trans->c; + struct btree_update *as; +- struct closure cl; + u64 start_time = local_clock(); + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0; +@@ -950,9 +962,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + + if (flags & BTREE_INSERT_JOURNAL_RESERVED) + journal_flags |= JOURNAL_RES_GET_RESERVED; +- +- closure_init_stack(&cl); +-retry: ++ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) ++ journal_flags |= JOURNAL_RES_GET_NONBLOCK; + + /* + * XXX: figure out how far we might need to split, +@@ -1007,30 +1018,16 @@ retry: + if (ret) + goto err; + ++ bch2_trans_unlock(trans); ++ + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, +- journal_flags|JOURNAL_RES_GET_NONBLOCK); +- if (ret == -EAGAIN) { +- bch2_trans_unlock(trans); +- +- if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { +- bch2_btree_update_free(as); +- btree_trans_restart(trans); +- return ERR_PTR(ret); +- } +- +- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, +- BTREE_UPDATE_JOURNAL_RES, +- journal_flags); +- if (ret) { +- trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); +- goto err; +- } +- +- if (!bch2_trans_relock(trans)) { +- ret = -EINTR; +- goto err; +- } ++ journal_flags); ++ if (ret) { ++ bch2_btree_update_free(as); ++ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); ++ btree_trans_restart(trans); ++ return ERR_PTR(ret); + } + + ret = bch2_disk_reservation_get(c, &as->disk_res, +@@ -1040,10 +1037,15 @@ retry: + if (ret) + goto err; + +- ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); ++ ret = bch2_btree_reserve_get(as, nr_nodes, flags); + if (ret) + goto err; + ++ if (!bch2_trans_relock(trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ + bch2_journal_pin_add(&c->journal, + atomic64_read(&c->journal.seq), + &as->journal, NULL); +@@ -1051,16 +1053,6 @@ retry: + return as; + err: + bch2_btree_update_free(as); +- +- if (ret == -EAGAIN) { +- bch2_trans_unlock(trans); +- closure_sync(&cl); +- ret = -EINTR; +- } +- +- if (ret == -EINTR && bch2_trans_relock(trans)) +- goto retry; +- + return ERR_PTR(ret); + } + +-- +cgit v1.2.3 + + +From bf9f2878caaef9dbcc1adf6abca113069d60e2e6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Feb 2022 19:20:36 -0500 +Subject: bcachefs: Fix freeing in bch2_dev_buckets_resize() + +We were double-freeing old_buckets and not freeing old_buckets_gens: +also, the code was supposed to free buckets, not old_buckets; +old_buckets is only needed because we have to use rcu_assign_pointer() +instead of swap(), and won't be set if we hit the error path. 
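
Put differently, the intended shape is the usual RCU resize pattern: publish the new array, then free the array it replaced after a grace period; on the error path nothing was published, so the caller just kvpfree()s its own fresh allocation. A sketch using the helpers from this patch (sketch_publish_bucket_gens is a hypothetical name; new_gens is the already-populated replacement):

static void sketch_publish_bucket_gens(struct bch_dev *ca,
				       struct bucket_gens *new_gens)
{
	struct bucket_gens *old_gens =
		rcu_dereference_protected(ca->bucket_gens, 1);

	rcu_assign_pointer(ca->bucket_gens, new_gens);

	/* free the array we just replaced, once readers are done with it: */
	if (old_gens)
		call_rcu(&old_gens->rcu, bucket_gens_free_rcu);
}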
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 2 +- + fs/bcachefs/buckets.c | 10 ++++++---- + 2 files changed, 7 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index a28ddcd5d7b7..b76770c265fc 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -451,7 +451,7 @@ struct bch_dev { + * Or rcu_read_lock(), but only for ptr_stale(): + */ + struct bucket_array __rcu *buckets[2]; +- struct bucket_gens *bucket_gens; ++ struct bucket_gens __rcu *bucket_gens; + unsigned long *buckets_nouse; + struct rw_semaphore bucket_lock; + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 2be6b0fb967f..6c5bfdc16648 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -2102,7 +2102,7 @@ static void buckets_free_rcu(struct rcu_head *rcu) + container_of(rcu, struct bucket_array, rcu); + + kvpfree(buckets, +- sizeof(struct bucket_array) + ++ sizeof(*buckets) + + buckets->nbuckets * sizeof(struct bucket)); + } + +@@ -2111,7 +2111,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) + struct bucket_gens *buckets = + container_of(rcu, struct bucket_gens, rcu); + +- kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets); ++ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); + } + + int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) +@@ -2221,9 +2221,9 @@ err: + kvpfree(buckets_nouse, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + if (bucket_gens) +- call_rcu(&old_buckets->rcu, bucket_gens_free_rcu); ++ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); + if (buckets) +- call_rcu(&old_buckets->rcu, buckets_free_rcu); ++ call_rcu(&buckets->rcu, buckets_free_rcu); + + return ret; + } +@@ -2238,6 +2238,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca) + free_fifo(&ca->free[i]); + kvpfree(ca->buckets_nouse, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); ++ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), ++ sizeof(struct bucket_gens) + ca->mi.nbuckets); + kvpfree(rcu_dereference_protected(ca->buckets[0], 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); +-- +cgit v1.2.3 + + +From 9a9308a9f95b2d6014937375958f6ac9a75e0a11 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 12 Jan 2022 00:49:23 -0500 +Subject: bcachefs: Improve btree_key_cache_flush_pos() + +btree_key_cache_flush_pos() uses BTREE_ITER_CACHED_NOFILL - but it +wasn't checking for !ck->valid. It does check for the entry being dirty, +so it shouldn't matter, but this refactor it a bit and adds and +assertion. 
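
With the refactor, the early checks read roughly as follows (condensed from the hunk below, same labels as in btree_key_cache_flush_pos()):

	ck = (void *) c_iter.path->l[0].b;
	if (!ck)
		goto out;			/* nothing cached at this position */

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;		/* clean entry, but caller wants it dropped */
		goto out;
	}

	BUG_ON(!ck->valid);			/* a dirty entry must be valid */

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;			/* not the flush this caller asked for */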
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index faed51e7f4b8..f43153bcbf2f 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -389,16 +389,20 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + goto out; + + ck = (void *) c_iter.path->l[0].b; +- if (!ck || +- (journal_seq && ck->journal.seq != journal_seq)) ++ if (!ck) + goto out; + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { +- if (!evict) +- goto out; +- goto evict; ++ if (evict) ++ goto evict; ++ goto out; + } + ++ BUG_ON(!ck->valid); ++ ++ if (journal_seq && ck->journal.seq != journal_seq) ++ goto out; ++ + /* + * Since journal reclaim depends on us making progress here, and the + * allocator/copygc depend on journal reclaim making progress, we need +-- +cgit v1.2.3 + + +From 1f5adb4e189918c1dc348efef72ea0f1c5c6d374 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 12 Jan 2022 02:13:21 -0500 +Subject: bcachefs: btree_id_cached() + +Add a new helper that returns true if the given btree ID uses the btree +key cache. This enables some new cleanups, since the helper can check +the options for whether caching is enabled on a given btree. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 6 ++++++ + fs/bcachefs/btree_iter.c | 3 +++ + fs/bcachefs/fs.c | 4 ++-- + fs/bcachefs/inode.c | 15 +++++---------- + fs/bcachefs/inode.h | 2 +- + fs/bcachefs/super.c | 7 +++++++ + 6 files changed, 24 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index b76770c265fc..b99bce436c20 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -716,6 +716,7 @@ struct bch_fs { + bool btree_trans_barrier_initialized; + + struct btree_key_cache btree_key_cache; ++ unsigned btree_key_cache_btrees; + + struct workqueue_struct *btree_update_wq; + struct workqueue_struct *btree_io_complete_wq; +@@ -952,6 +953,11 @@ static inline size_t btree_sectors(const struct bch_fs *c) + return c->opts.btree_node_size >> 9; + } + ++static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) ++{ ++ return c->btree_key_cache_btrees & (1U << btree); ++} ++ + static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) + { + struct timespec64 t; +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 4ffdfa01bcd2..3ce4b1cc494e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2849,6 +2849,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) + flags |= BTREE_ITER_WITH_JOURNAL; + ++ if (!btree_id_cached(trans->c, btree_id)) ++ flags &= ~BTREE_ITER_CACHED; ++ + iter->trans = trans; + iter->path = NULL; + iter->update_path = NULL; +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 9653f199dc44..0c3c271ca143 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -105,7 +105,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans, + + bch2_assert_pos_locked(trans, BTREE_ID_inodes, + POS(0, bi->bi_inum), +- 0 && c->opts.inodes_use_key_cache); ++ c->opts.inodes_use_key_cache); + + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); + i_uid_write(&inode->v, bi->bi_uid); +@@ -1472,7 +1472,7 @@ static void bch2_evict_inode(struct inode *vinode) + KEY_TYPE_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, 
Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +- bch2_inode_rm(c, inode_inum(inode), true); ++ bch2_inode_rm(c, inode_inum(inode)); + } + } + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 3a7c14684102..78e2db6c938b 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -252,15 +252,13 @@ int bch2_inode_peek(struct btree_trans *trans, + u32 snapshot; + int ret; + +- if (0 && trans->c->opts.inodes_use_key_cache) +- flags |= BTREE_ITER_CACHED; +- + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, +- SPOS(0, inum.inum, snapshot), flags); ++ SPOS(0, inum.inum, snapshot), ++ flags|BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) +@@ -631,20 +629,16 @@ err: + return ret; + } + +-int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached) ++int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) + { + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; +- unsigned iter_flags = BTREE_ITER_INTENT; + u32 snapshot; + int ret; + +- if (0 && cached && c->opts.inodes_use_key_cache) +- iter_flags |= BTREE_ITER_CACHED; +- + bch2_trans_init(&trans, c, 0, 1024); + + /* +@@ -668,7 +662,8 @@ retry: + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, +- SPOS(0, inum.inum, snapshot), iter_flags); ++ SPOS(0, inum.inum, snapshot), ++ BTREE_ITER_INTENT|BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + + ret = bkey_err(k); +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 723186d8afb6..77957cc7f9dd 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -87,7 +87,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + int bch2_inode_create(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, u32, u64); + +-int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); ++int bch2_inode_rm(struct bch_fs *, subvol_inum); + + int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 586ba60d03ea..d8b72d8dd7a8 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -762,6 +762,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + bch2_opts_apply(&c->opts, opts); + ++ /* key cache currently disabled for inodes, because of snapshots: */ ++ c->opts.inodes_use_key_cache = 0; ++ ++ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; ++ if (c->opts.inodes_use_key_cache) ++ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; ++ + c->block_bits = ilog2(block_sectors(c)); + c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); + +-- +cgit v1.2.3 + + +From 2cebc4055cdea6fa18a546fd8ed6d39152b8454b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Feb 2022 22:21:44 -0500 +Subject: bcachefs: bch2_btree_path_set_pos() + +bch2_btree_path_set_pos() is now available outside of btree_iter.c + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 24 ++++++++++++------------ + fs/bcachefs/btree_iter.h | 3 +++ + 2 files changed, 15 insertions(+), 12 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 3ce4b1cc494e..77f6c81eb7e4 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1717,8 +1717,8 @@ bch2_btree_path_make_mut(struct btree_trans *trans, + 
return path; + } + +-static struct btree_path * __must_check +-btree_path_set_pos(struct btree_trans *trans, ++struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent, unsigned long ip) + { +@@ -1932,7 +1932,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, + path_pos->btree_id == btree_id && + path_pos->level == level) { + __btree_path_get(path_pos, intent); +- path = btree_path_set_pos(trans, path_pos, pos, intent, ip); ++ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + } else { + path = btree_path_alloc(trans, path_pos); + path_pos = NULL; +@@ -2029,7 +2029,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) + { + int ret; + +- iter->path = btree_path_set_pos(iter->trans, iter->path, ++ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, + btree_iter_search_key(iter), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); +@@ -2066,7 +2066,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + +- iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + iter->path->should_be_locked = true; +@@ -2128,7 +2128,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + * the next child node + */ + path = iter->path = +- btree_path_set_pos(trans, path, bpos_successor(iter->pos), ++ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + +@@ -2151,7 +2151,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + +- iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + iter->path->should_be_locked = true; +@@ -2258,7 +2258,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp + bch2_btree_iter_verify(iter); + + while (1) { +- iter->path = btree_path_set_pos(trans, iter->path, search_key, ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + +@@ -2368,7 +2368,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); + iter->update_path = iter->path; + +- iter->update_path = btree_path_set_pos(trans, ++ iter->update_path = bch2_btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); +@@ -2407,7 +2407,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + +- iter->path = btree_path_set_pos(trans, iter->path, k.k->p, ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + BUG_ON(!iter->path->nodes_locked); +@@ -2471,7 +2471,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + search_key.snapshot = U32_MAX; + + while (1) { +- iter->path = btree_path_set_pos(trans, iter->path, search_key, ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, 
search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + +@@ -2602,7 +2602,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + } + + search_key = btree_iter_search_key(iter); +- iter->path = btree_path_set_pos(trans, iter->path, search_key, ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 5205d53ce8dc..aaeefb80cdc2 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -132,6 +132,9 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b, + struct btree_path * __must_check + bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, + bool, unsigned long); ++struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, ++ struct bpos, bool, unsigned long); + int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); + struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, +-- +cgit v1.2.3 + + +From 4417487f529328b4ae996f91051e5801eae11520 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Feb 2022 23:15:12 -0500 +Subject: bcachefs: BTREE_ITER_WITH_KEY_CACHE + +This is the start of cache coherency with the btree key cache - this +adds a btree iterator flag that causes lookups to also check the key +cache when we're iterating over the btree (not iterating over the key +cache). + +Note that we could still race with another thread creating at item in +the key cache and updating it, since we aren't holding the key cache +locked if it wasn't found. The next patch for the update path will +address this by causing the transaction to restart if the key cache is +found to be dirty. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 91 +++++++++++++++++++++++++++++++++++++------ + fs/bcachefs/btree_iter.h | 5 --- + fs/bcachefs/btree_key_cache.c | 18 +++++---- + fs/bcachefs/btree_types.h | 9 +++-- + 4 files changed, 96 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 77f6c81eb7e4..136d00b49be1 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1983,13 +1983,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct + + struct bkey_s_c k; + +- BUG_ON(path->uptodate != BTREE_ITER_UPTODATE); +- + if (!path->cached) { + struct btree_path_level *l = path_l(path); +- struct bkey_packed *_k = +- bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ struct bkey_packed *_k; ++ ++ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + ++ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; + + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); +@@ -1999,12 +1999,15 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct + } else { + struct bkey_cached *ck = (void *) path->l[0].b; + +- EBUG_ON(path->btree_id != ck->key.btree_id || +- bkey_cmp(path->pos, ck->key.pos)); ++ EBUG_ON(ck && ++ (path->btree_id != ck->key.btree_id || ++ bkey_cmp(path->pos, ck->key.pos))); + +- /* BTREE_ITER_CACHED_NOFILL? */ +- if (unlikely(!ck->valid)) +- goto hole; ++ /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? 
*/ ++ if (unlikely(!ck || !ck->valid)) ++ return bkey_s_c_null; ++ ++ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + + k = bkey_i_to_s_c(ck->k); + } +@@ -2247,11 +2250,45 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, + return k; + } + ++/* ++ * Checks btree key cache for key at iter->pos and returns it if present, or ++ * bkey_s_c_null: ++ */ ++static noinline ++struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; ++ struct bkey u; ++ int ret; ++ ++ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) ++ return bkey_s_c_null; ++ ++ if (!iter->key_cache_path) ++ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, ++ iter->flags & BTREE_ITER_INTENT, 0, ++ iter->flags|BTREE_ITER_CACHED, ++ _THIS_IP_); ++ ++ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ iter->key_cache_path->should_be_locked = true; ++ ++ return bch2_btree_path_peek_slot(iter->key_cache_path, &u); ++} ++ + static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) + { + struct btree_trans *trans = iter->trans; + struct bkey_i *next_update; +- struct bkey_s_c k; ++ struct bkey_s_c k, k2; + int ret; + + EBUG_ON(iter->path->cached || iter->path->level); +@@ -2270,8 +2307,24 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp + goto out; + } + ++ iter->path->should_be_locked = true; ++ + k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + ++ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && ++ k.k && ++ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { ++ ret = bkey_err(k2); ++ if (ret) { ++ k = k2; ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ goto out; ++ } ++ ++ k = k2; ++ iter->k = *k.k; ++ } ++ + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) + k = btree_trans_peek_journal(trans, iter, k); + +@@ -2631,6 +2684,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + goto out; + } + ++ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && ++ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { ++ if (!bkey_err(k)) ++ iter->k = *k.k; ++ goto out; ++ } ++ + k = bch2_btree_path_peek_slot(iter->path, &iter->k); + } else { + struct bpos next; +@@ -2820,8 +2880,12 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) + if (iter->update_path) + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); ++ if (iter->key_cache_path) ++ bch2_path_put(trans, iter->key_cache_path, ++ iter->flags & BTREE_ITER_INTENT); + iter->path = NULL; + iter->update_path = NULL; ++ iter->key_cache_path = NULL; + } + + static void __bch2_trans_iter_init(struct btree_trans *trans, +@@ -2849,12 +2913,16 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, + if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) + flags |= BTREE_ITER_WITH_JOURNAL; + +- if (!btree_id_cached(trans->c, btree_id)) ++ if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; ++ flags &= ~BTREE_ITER_WITH_KEY_CACHE; ++ } else if (!(flags & BTREE_ITER_CACHED)) ++ flags |= BTREE_ITER_WITH_KEY_CACHE; + + iter->trans = trans; + iter->path = NULL; + iter->update_path = 
NULL; ++ iter->key_cache_path = NULL; + iter->btree_id = btree_id; + iter->min_depth = depth; + iter->flags = flags; +@@ -2905,6 +2973,7 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) + __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); + if (src->update_path) + __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); ++ dst->key_cache_path = NULL; + } + + void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index aaeefb80cdc2..759c7b52f4a2 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -50,11 +50,6 @@ static inline struct btree *btree_node_parent(struct btree_path *path, + return btree_path_node(path, b->c.level + 1); + } + +-static inline int btree_iter_err(const struct btree_iter *iter) +-{ +- return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; +-} +- + /* Iterate over paths within a transaction: */ + + static inline struct btree_path * +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index f43153bcbf2f..8bfdbbdbf7c8 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -208,19 +208,21 @@ static int btree_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + struct bkey_cached *ck) + { +- struct btree_iter iter; ++ struct btree_path *path; + struct bkey_s_c k; + unsigned new_u64s = 0; + struct bkey_i *new_k = NULL; ++ struct bkey u; + int ret; + +- bch2_trans_iter_init(trans, &iter, ck->key.btree_id, +- ck->key.pos, BTREE_ITER_SLOTS); +- k = bch2_btree_iter_peek_slot(&iter); +- ret = bkey_err(k); ++ path = bch2_path_get(trans, ck->key.btree_id, ++ ck->key.pos, 0, 0, 0, _THIS_IP_); ++ ret = bch2_btree_path_traverse(trans, path, 0); + if (ret) + goto err; + ++ k = bch2_btree_path_peek_slot(path, &u); ++ + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + trace_trans_restart_relock_key_cache_fill(trans->fn, + _THIS_IP_, ck_path->btree_id, &ck_path->pos); +@@ -261,9 +263,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, + bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); + + /* We're not likely to need this iterator again: */ +- set_btree_iter_dontneed(&iter); ++ path->preserve = false; + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_path_put(trans, path, 0); + return ret; + } + +@@ -384,6 +386,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_CACHED_NOCREATE| + BTREE_ITER_INTENT); ++ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; ++ + ret = bch2_btree_iter_traverse(&c_iter); + if (ret) + goto out; +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 65f460e3c567..86962fd21d0c 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -202,10 +202,10 @@ struct btree_node_iter { + */ + #define BTREE_ITER_IS_EXTENTS (1 << 4) + #define BTREE_ITER_NOT_EXTENTS (1 << 5) +-#define BTREE_ITER_ERROR (1 << 6) +-#define BTREE_ITER_CACHED (1 << 7) +-#define BTREE_ITER_CACHED_NOFILL (1 << 8) +-#define BTREE_ITER_CACHED_NOCREATE (1 << 9) ++#define BTREE_ITER_CACHED (1 << 6) ++#define BTREE_ITER_CACHED_NOFILL (1 << 7) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 8) ++#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) + #define BTREE_ITER_WITH_UPDATES (1 << 10) + #define BTREE_ITER_WITH_JOURNAL (1 << 11) + #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +@@ -277,6 +277,7 @@ struct btree_iter { + struct btree_trans *trans; + struct btree_path *path; + struct 
btree_path *update_path; ++ struct btree_path *key_cache_path; + + enum btree_id btree_id:4; + unsigned min_depth:4; +-- +cgit v1.2.3 + + +From c925d6b1e77c5e7256ddb1faed6222ab75d2fbac Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 12 Jan 2022 01:14:47 -0500 +Subject: bcachefs: Btree key cache coherency + +Updates to non key cache iterators will now be transparently redirected +to the key cache for cached btrees. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 9 +---- + fs/bcachefs/btree_key_cache.h | 8 ----- + fs/bcachefs/btree_types.h | 2 ++ + fs/bcachefs/btree_update.h | 2 -- + fs/bcachefs/btree_update_leaf.c | 79 ++++++++++++++++++++++++++--------------- + include/trace/events/bcachefs.h | 6 ++++ + 6 files changed, 60 insertions(+), 46 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 8bfdbbdbf7c8..e6363592c417 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -414,6 +414,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + * */ + ret = bch2_btree_iter_traverse(&b_iter) ?: + bch2_trans_update(trans, &b_iter, ck->k, ++ BTREE_UPDATE_KEY_CACHE_RECLAIM| + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, +@@ -555,14 +556,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, + return true; + } + +-#ifdef CONFIG_BCACHEFS_DEBUG +-void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, +- enum btree_id id, struct bpos pos) +-{ +- BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); +-} +-#endif +- + static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) + { +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +index b3d241b13453..fd29c14c5626 100644 +--- a/fs/bcachefs/btree_key_cache.h ++++ b/fs/bcachefs/btree_key_cache.h +@@ -32,14 +32,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *, + struct btree_path *, struct bkey_i *); + int bch2_btree_key_cache_flush(struct btree_trans *, + enum btree_id, struct bpos); +-#ifdef CONFIG_BCACHEFS_DEBUG +-void bch2_btree_key_cache_verify_clean(struct btree_trans *, +- enum btree_id, struct bpos); +-#else +-static inline void +-bch2_btree_key_cache_verify_clean(struct btree_trans *trans, +- enum btree_id id, struct bpos pos) {} +-#endif + + void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); + void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 86962fd21d0c..989129f9f76c 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -637,6 +637,7 @@ static inline bool btree_type_has_snapshots(enum btree_id id) + + enum btree_update_flags { + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, ++ __BTREE_UPDATE_KEY_CACHE_RECLAIM, + + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + +@@ -649,6 +650,7 @@ enum btree_update_flags { + }; + + #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ++#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) + + #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 5e5a1b5e750e..d9a406a28f47 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -76,8 +76,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, + int bch2_trans_update_extent(struct 
btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); + +-int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, +- struct bkey_i *, enum btree_update_flags); + int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index a21d35f29b46..5cd6bda16b6b 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -23,6 +23,10 @@ + #include + #include + ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, ++ struct bkey_i *, enum btree_update_flags); ++ + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + const struct btree_insert_entry *r) + { +@@ -998,18 +1002,6 @@ int __bch2_trans_commit(struct btree_trans *trans) + goto out_reset; + } + +-#ifdef CONFIG_BCACHEFS_DEBUG +- /* +- * if BTREE_TRIGGER_NORUN is set, it means we're probably being called +- * from the key cache flush code: +- */ +- trans_for_each_update(trans, i) +- if (!i->cached && +- !(i->flags & BTREE_TRIGGER_NORUN)) +- bch2_btree_key_cache_verify_clean(trans, +- i->btree_id, i->k->k.p); +-#endif +- + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out; +@@ -1369,8 +1361,9 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, + return ret; + } + +-int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, +- struct bkey_i *k, enum btree_update_flags flags) ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags) + { + struct btree_insert_entry *i, n; + +@@ -1408,17 +1401,6 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr + !btree_insert_entry_cmp(&n, i)) { + BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + +- /* +- * This is a hack to ensure that inode creates update the btree, +- * not the key cache, which helps with cache coherency issues in +- * other areas: +- */ +- if (n.cached && !i->cached) { +- i->k = n.k; +- i->flags = n.flags; +- return 0; +- } +- + bch2_path_put(trans, i->path, true); + *i = n; + } else +@@ -1432,12 +1414,17 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr + int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) + { ++ struct btree_path *path = iter->update_path ?: iter->path; ++ struct bkey_cached *ck; ++ int ret; ++ + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + if (bkey_deleted(&k->k) && ++ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { +- int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); ++ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + if (unlikely(ret < 0)) + return ret; + +@@ -1445,8 +1432,44 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter + k->k.type = KEY_TYPE_whiteout; + } + +- return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path, +- k, flags); ++ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ !path->cached && ++ !path->level && ++ btree_id_cached(trans->c, path->btree_id)) { ++ if (!iter->key_cache_path || ++ !iter->key_cache_path->should_be_locked || ++ 
bpos_cmp(iter->key_cache_path->pos, k->k.p)) { ++ if (!iter->key_cache_path) ++ iter->key_cache_path = ++ bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_CACHED, _THIS_IP_); ++ ++ iter->key_cache_path = ++ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, ++ BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return ret; ++ ++ ck = (void *) iter->key_cache_path->l[0].b; ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); ++ btree_trans_restart(trans); ++ return -EINTR; ++ } ++ ++ iter->key_cache_path->should_be_locked = true; ++ } ++ ++ path = iter->key_cache_path; ++ } ++ ++ return bch2_trans_update_by_path(trans, path, k, flags); + } + + void bch2_trans_commit_hook(struct btree_trans *trans, +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 8f10d13b27d5..36c4c8841741 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -658,6 +658,12 @@ DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, + TP_ARGS(trans_fn, caller_ip) + ); + ++DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ + DECLARE_EVENT_CLASS(transaction_restart_iter, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, +-- +cgit v1.2.3 + + +From 3e6a81e45f4bd7649208aa9855db04f932c6150f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 20:39:19 -0500 +Subject: bcachefs: btree_gc no longer uses main in-memory bucket array + +This changes the btree_gc code to only use the second bucket array, the +one dedicated to GC. On completion, it compares what's in its in memory +bucket array to the allocation information in the btree and writes it +directly, instead of updating the main in-memory bucket array and +writing that. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 139 ++++++---------------- + fs/bcachefs/alloc_background.h | 42 ++----- + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/btree_gc.c | 254 +++++++++++++++++++++++++++-------------- + fs/bcachefs/recovery.c | 23 +--- + 5 files changed, 216 insertions(+), 243 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 2b82ab7aab86..023db6219ad8 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -39,15 +39,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { + #undef x + }; + +-struct bkey_alloc_buf { +- struct bkey_i k; +- struct bch_alloc_v3 v; +- +-#define x(_name, _bits) + _bits / 8 +- u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; +-#undef x +-} __attribute__((packed, aligned(8))); +- + /* Persistent alloc info: */ + + static inline u64 alloc_field_v1_get(const struct bch_alloc *a, +@@ -254,24 +245,25 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) + return ret; + } + +-static void bch2_alloc_pack(struct bch_fs *c, +- struct bkey_alloc_buf *dst, +- const struct bkey_alloc_unpacked src) ++struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans, ++ const struct bkey_alloc_unpacked src) + { +- bch2_alloc_pack_v3(dst, src); ++ struct bkey_alloc_buf *dst; ++ ++ dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); ++ if (!IS_ERR(dst)) ++ bch2_alloc_pack_v3(dst, src); ++ ++ return dst; + } + + int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_alloc_unpacked *u, unsigned trigger_flags) + { +- struct bkey_alloc_buf *a; +- +- a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); +- if (IS_ERR(a)) +- return PTR_ERR(a); ++ struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u); + +- bch2_alloc_pack(trans->c, a, *u); +- return bch2_trans_update(trans, iter, &a->k, trigger_flags); ++ return PTR_ERR_OR_ZERO(a) ?: ++ bch2_trans_update(trans, iter, &a->k, trigger_flags); + } + + static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) +@@ -341,7 +333,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + #undef x + } + +-int bch2_alloc_read(struct bch_fs *c) ++int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) + { + struct btree_trans trans; + struct btree_iter iter; +@@ -352,108 +344,43 @@ int bch2_alloc_read(struct bch_fs *c) + int ret; + + bch2_trans_init(&trans, c, 0, 0); +- down_read(&c->gc_lock); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { +- if (!bkey_is_alloc(k.k)) +- continue; +- + ca = bch_dev_bkey_exists(c, k.k->p.inode); +- g = bucket(ca, k.k->p.offset); ++ g = __bucket(ca, k.k->p.offset, gc); + u = bch2_alloc_unpack(k); + +- *bucket_gen(ca, k.k->p.offset) = u.gen; ++ if (!gc) ++ *bucket_gen(ca, k.k->p.offset) = u.gen; ++ + g->_mark.gen = u.gen; +- g->_mark.data_type = u.data_type; +- g->_mark.dirty_sectors = u.dirty_sectors; +- g->_mark.cached_sectors = u.cached_sectors; +- g->_mark.stripe = u.stripe != 0; +- g->stripe = u.stripe; +- g->stripe_redundancy = u.stripe_redundancy; + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; +- g->oldest_gen = u.oldest_gen; ++ g->oldest_gen = !gc ? 
u.oldest_gen : u.gen; + g->gen_valid = 1; +- } +- bch2_trans_iter_exit(&trans, &iter); + +- up_read(&c->gc_lock); +- bch2_trans_exit(&trans); ++ if (!gc || ++ (metadata_only && ++ (u.data_type == BCH_DATA_user || ++ u.data_type == BCH_DATA_cached || ++ u.data_type == BCH_DATA_parity))) { ++ g->_mark.data_type = u.data_type; ++ g->_mark.dirty_sectors = u.dirty_sectors; ++ g->_mark.cached_sectors = u.cached_sectors; ++ g->_mark.stripe = u.stripe != 0; ++ g->stripe = u.stripe; ++ g->stripe_redundancy = u.stripe_redundancy; ++ } + +- if (ret) { +- bch_err(c, "error reading alloc info: %i", ret); +- return ret; + } ++ bch2_trans_iter_exit(&trans, &iter); + +- return 0; +-} +- +-static int bch2_alloc_write_key(struct btree_trans *trans, +- struct btree_iter *iter, +- unsigned flags) +-{ +- struct bch_fs *c = trans->c; +- struct bkey_s_c k; +- struct bkey_alloc_unpacked old_u, new_u; +- int ret; +-retry: +- bch2_trans_begin(trans); +- +- ret = bch2_btree_key_cache_flush(trans, +- BTREE_ID_alloc, iter->pos); +- if (ret) +- goto err; ++ bch2_trans_exit(&trans); + +- k = bch2_btree_iter_peek_slot(iter); +- ret = bkey_err(k); + if (ret) +- goto err; +- +- old_u = bch2_alloc_unpack(k); +- new_u = alloc_mem_to_key(c, iter); +- +- if (!bkey_alloc_unpacked_cmp(old_u, new_u)) +- return 0; +- +- ret = bch2_alloc_write(trans, iter, &new_u, +- BTREE_TRIGGER_NORUN) ?: +- bch2_trans_commit(trans, NULL, NULL, +- BTREE_INSERT_NOFAIL|flags); +-err: +- if (ret == -EINTR) +- goto retry; +- return ret; +-} +- +-int bch2_alloc_write_all(struct bch_fs *c, unsigned flags) +-{ +- struct btree_trans trans; +- struct btree_iter iter; +- struct bch_dev *ca; +- unsigned i; +- int ret = 0; +- +- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); +- bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT); +- +- for_each_member_device(ca, c, i) { +- bch2_btree_iter_set_pos(&iter, +- POS(ca->dev_idx, ca->mi.first_bucket)); ++ bch_err(c, "error reading alloc info: %i", ret); + +- while (iter.pos.offset < ca->mi.nbuckets) { +- ret = bch2_alloc_write_key(&trans, &iter, flags); +- if (ret) { +- percpu_ref_put(&ca->ref); +- goto err; +- } +- bch2_btree_iter_advance(&iter); +- } +- } +-err: +- bch2_trans_iter_exit(&trans, &iter); +- bch2_trans_exit(&trans); + return ret; + } + +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 86b64177b3d0..98c7866e20b5 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -38,40 +38,23 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, + ; + } + ++struct bkey_alloc_buf { ++ struct bkey_i k; ++ struct bch_alloc_v3 v; ++ ++#define x(_name, _bits) + _bits / 8 ++ u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; ++#undef x ++} __attribute__((packed, aligned(8))); ++ + struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); ++struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *, ++ const struct bkey_alloc_unpacked); + int bch2_alloc_write(struct btree_trans *, struct btree_iter *, + struct bkey_alloc_unpacked *, unsigned); + + int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + +-static inline struct bkey_alloc_unpacked +-alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter) +-{ +- struct bch_dev *ca; +- struct bucket *g; +- struct bkey_alloc_unpacked ret; +- +- percpu_down_read(&c->mark_lock); +- ca = bch_dev_bkey_exists(c, iter->pos.inode); +- g = bucket(ca, iter->pos.offset); +- ret = (struct bkey_alloc_unpacked) { +- .dev = 
iter->pos.inode, +- .bucket = iter->pos.offset, +- .gen = g->mark.gen, +- .oldest_gen = g->oldest_gen, +- .data_type = g->mark.data_type, +- .dirty_sectors = g->mark.dirty_sectors, +- .cached_sectors = g->mark.cached_sectors, +- .read_time = g->io_time[READ], +- .write_time = g->io_time[WRITE], +- .stripe = g->stripe, +- .stripe_redundancy = g->stripe_redundancy, +- }; +- percpu_up_read(&c->mark_lock); +- +- return ret; +-} +- + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) + + const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); +@@ -101,7 +84,7 @@ static inline bool bkey_is_alloc(const struct bkey *k) + k->type == KEY_TYPE_alloc_v3; + } + +-int bch2_alloc_read(struct bch_fs *); ++int bch2_alloc_read(struct bch_fs *, bool, bool); + + static inline void bch2_wake_allocator(struct bch_dev *ca) + { +@@ -139,7 +122,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); + void bch2_dev_allocator_stop(struct bch_dev *); + int bch2_dev_allocator_start(struct bch_dev *); + +-int bch2_alloc_write_all(struct bch_fs *, unsigned); + void bch2_fs_allocator_background_init(struct bch_fs *); + + #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index b99bce436c20..b2941c12a7fb 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -536,7 +536,6 @@ enum { + /* misc: */ + BCH_FS_NEED_ANOTHER_GC, + BCH_FS_DELETED_NODES, +- BCH_FS_NEED_ALLOC_WRITE, + BCH_FS_REBUILD_REPLICAS, + BCH_FS_HOLD_BTREE_WRITES, + }; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 809c9a762303..56439398b2f8 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -9,6 +9,7 @@ + #include "alloc_foreground.h" + #include "bkey_methods.h" + #include "bkey_buf.h" ++#include "btree_key_cache.h" + #include "btree_locking.h" + #include "btree_update_interior.h" + #include "btree_io.h" +@@ -533,7 +534,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); +- struct bucket *g2 = PTR_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + + if (fsck_err_on(!g->gen_valid, c, +@@ -544,9 +544,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + p.ptr.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + if (!p.ptr.cached) { +- g2->_mark.gen = g->_mark.gen = p.ptr.gen; +- g2->gen_valid = g->gen_valid = true; +- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ g->_mark.gen = p.ptr.gen; ++ g->gen_valid = true; + } else { + do_update = true; + } +@@ -560,13 +559,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + p.ptr.gen, g->mark.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + if (!p.ptr.cached) { +- g2->_mark.gen = g->_mark.gen = p.ptr.gen; +- g2->gen_valid = g->gen_valid = true; +- g2->_mark.data_type = 0; +- g2->_mark.dirty_sectors = 0; +- g2->_mark.cached_sectors = 0; ++ g->_mark.gen = p.ptr.gen; ++ g->gen_valid = true; ++ g->_mark.data_type = 0; ++ g->_mark.dirty_sectors = 0; ++ g->_mark.cached_sectors = 0; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); +- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } +@@ -603,8 +601,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + bch2_data_types[data_type], + 
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + if (data_type == BCH_DATA_btree) { +- g2->_mark.data_type = g->_mark.data_type = data_type; +- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ g->_mark.data_type = data_type; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + do_update = true; +@@ -1169,13 +1166,14 @@ static int bch2_gc_done(struct bch_fs *c, + unsigned i, dev; + int ret = 0; + ++ percpu_down_write(&c->mark_lock); ++ + #define copy_field(_f, _msg, ...) \ + if (dst->_f != src->_f) { \ + if (verify) \ + fsck_err(c, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f); \ + dst->_f = src->_f; \ +- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } + #define copy_stripe_field(_f, _msg, ...) \ + if (dst->_f != src->_f) { \ +@@ -1185,18 +1183,6 @@ static int bch2_gc_done(struct bch_fs *c, + iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f); \ + dst->_f = src->_f; \ +- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ +- } +-#define copy_bucket_field(_f) \ +- if (dst->b[b]._f != src->b[b]._f) { \ +- if (verify) \ +- fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ +- ": got %u, should be %u", dev, b, \ +- dst->b[b].mark.gen, \ +- bch2_data_types[dst->b[b].mark.data_type],\ +- dst->b[b]._f, src->b[b]._f); \ +- dst->b[b]._f = src->b[b]._f; \ +- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } + #define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) +@@ -1207,36 +1193,18 @@ static int bch2_gc_done(struct bch_fs *c, + bch2_fs_usage_acc_to_base(c, i); + + for_each_member_device(ca, c, dev) { +- struct bucket_array *dst = __bucket_array(ca, 0); +- struct bucket_array *src = __bucket_array(ca, 1); +- size_t b; +- +- for (b = 0; b < src->nbuckets; b++) { +- copy_bucket_field(_mark.gen); +- copy_bucket_field(_mark.data_type); +- copy_bucket_field(_mark.stripe); +- copy_bucket_field(_mark.dirty_sectors); +- copy_bucket_field(_mark.cached_sectors); +- copy_bucket_field(stripe_redundancy); +- copy_bucket_field(stripe); +- +- dst->b[b].oldest_gen = src->b[b].oldest_gen; +- } +- +- { +- struct bch_dev_usage *dst = ca->usage_base; +- struct bch_dev_usage *src = (void *) +- bch2_acc_percpu_u64s((void *) ca->usage_gc, +- dev_usage_u64s()); +- +- copy_dev_field(buckets_ec, "buckets_ec"); +- copy_dev_field(buckets_unavailable, "buckets_unavailable"); +- +- for (i = 0; i < BCH_DATA_NR; i++) { +- copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); +- copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); +- copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); +- } ++ struct bch_dev_usage *dst = ca->usage_base; ++ struct bch_dev_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) ca->usage_gc, ++ dev_usage_u64s()); ++ ++ copy_dev_field(buckets_ec, "buckets_ec"); ++ copy_dev_field(buckets_unavailable, "buckets_unavailable"); ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); ++ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); ++ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + } + }; + +@@ -1278,7 +1246,6 @@ static int bch2_gc_done(struct bch_fs *c, + + #undef copy_fs_field + #undef copy_dev_field +-#undef copy_bucket_field + #undef copy_stripe_field + #undef copy_field + fsck_err: +@@ -1286,6 +1253,8 @@ fsck_err: + percpu_ref_put(&ca->ref); + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); ++ ++ percpu_up_write(&c->mark_lock); + return ret; + } + +@@ -1308,15 +1277,6 @@ 
static int bch2_gc_start(struct bch_fs *c, + BUG_ON(ca->buckets[1]); + BUG_ON(ca->usage_gc); + +- ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + +- ca->mi.nbuckets * sizeof(struct bucket), +- GFP_KERNEL|__GFP_ZERO); +- if (!ca->buckets[1]) { +- percpu_ref_put(&ca->ref); +- bch_err(c, "error allocating ca->buckets[gc]"); +- return -ENOMEM; +- } +- + ca->usage_gc = alloc_percpu(struct bch_dev_usage); + if (!ca->usage_gc) { + bch_err(c, "error allocating ca->usage_gc"); +@@ -1325,33 +1285,151 @@ static int bch2_gc_start(struct bch_fs *c, + } + } + +- percpu_down_write(&c->mark_lock); ++ return 0; ++} ++ ++static int bch2_alloc_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ bool initial, bool metadata_only) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); ++ struct bucket *g; ++ struct bkey_s_c k; ++ struct bkey_alloc_unpacked old_u, new_u, gc_u; ++ struct bkey_alloc_buf *a; ++ int ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ old_u = new_u = bch2_alloc_unpack(k); ++ ++ percpu_down_read(&c->mark_lock); ++ g = gc_bucket(ca, iter->pos.offset); ++ gc_u = (struct bkey_alloc_unpacked) { ++ .dev = iter->pos.inode, ++ .bucket = iter->pos.offset, ++ .gen = g->mark.gen, ++ .oldest_gen = g->oldest_gen, ++ .data_type = g->mark.data_type, ++ .dirty_sectors = g->mark.dirty_sectors, ++ .cached_sectors = g->mark.cached_sectors, ++ .read_time = g->io_time[READ], ++ .write_time = g->io_time[WRITE], ++ .stripe = g->stripe, ++ .stripe_redundancy = g->stripe_redundancy, ++ }; ++ percpu_up_read(&c->mark_lock); ++ ++ if (metadata_only && ++ gc_u.data_type != BCH_DATA_sb && ++ gc_u.data_type != BCH_DATA_journal && ++ gc_u.data_type != BCH_DATA_btree) ++ return 0; ++ ++ if (!bkey_alloc_unpacked_cmp(old_u, gc_u) || ++ gen_after(old_u.gen, gc_u.gen)) ++ return 0; ++ ++#define copy_bucket_field(_f) \ ++ if (fsck_err_on(new_u._f != gc_u._f, c, \ ++ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ++ ": got %u, should be %u", \ ++ iter->pos.inode, iter->pos.offset, \ ++ new_u.gen, \ ++ bch2_data_types[new_u.data_type], \ ++ new_u._f, gc_u._f)) \ ++ new_u._f = gc_u._f; \ ++ ++ copy_bucket_field(gen); ++ copy_bucket_field(data_type); ++ copy_bucket_field(stripe); ++ copy_bucket_field(dirty_sectors); ++ copy_bucket_field(cached_sectors); ++ copy_bucket_field(stripe_redundancy); ++ copy_bucket_field(stripe); ++#undef copy_bucket_field ++ ++ new_u.oldest_gen = gc_u.oldest_gen; ++ ++ if (!bkey_alloc_unpacked_cmp(old_u, new_u)) ++ return 0; ++ ++ a = bch2_alloc_pack(trans, new_u); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); ++ ++ ret = initial ++ ? 
bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k) ++ : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); ++fsck_err: ++ return ret; ++} ++ ++static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) { +- struct bucket_array *dst = __bucket_array(ca, 1); +- struct bucket_array *src = __bucket_array(ca, 0); +- size_t b; ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ break; + +- dst->first_bucket = src->first_bucket; +- dst->nbuckets = src->nbuckets; ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW, ++ bch2_alloc_write_key(&trans, &iter, ++ initial, metadata_only)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); + +- for (b = 0; b < src->nbuckets; b++) { +- struct bucket *d = &dst->b[b]; +- struct bucket *s = &src->b[b]; ++ if (ret) { ++ bch_err(c, "error writing alloc info: %i", ret); ++ percpu_ref_put(&ca->ref); ++ break; ++ } ++ } + +- d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; +- d->gen_valid = s->gen_valid; ++ bch2_trans_exit(&trans); ++ return ret; ++} + +- if (metadata_only && +- (s->mark.data_type == BCH_DATA_user || +- s->mark.data_type == BCH_DATA_cached)) +- d->_mark = s->mark; ++static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO); ++ if (!buckets) { ++ percpu_ref_put(&ca->ref); ++ percpu_up_write(&c->mark_lock); ++ bch_err(c, "error allocating ca->buckets[gc]"); ++ return -ENOMEM; + } +- }; + +- percpu_up_write(&c->mark_lock); ++ buckets->first_bucket = ca->mi.first_bucket; ++ buckets->nbuckets = ca->mi.nbuckets; ++ rcu_assign_pointer(ca->buckets[1], buckets); ++ }; + +- return 0; ++ return bch2_alloc_read(c, true, metadata_only); + } + + static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only) +@@ -1598,6 +1676,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) + !bch2_btree_interior_updates_nr_pending(c)); + + ret = bch2_gc_start(c, metadata_only) ?: ++ bch2_gc_alloc_start(c, initial, metadata_only) ?: + bch2_gc_reflink_start(c, initial, metadata_only); + if (ret) + goto out; +@@ -1665,16 +1744,15 @@ out: + if (!ret) { + bch2_journal_block(&c->journal); + +- percpu_down_write(&c->mark_lock); +- ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: +- bch2_gc_stripes_done(c, initial, metadata_only) ?: ++ ret = bch2_gc_stripes_done(c, initial, metadata_only) ?: ++ bch2_gc_reflink_done(c, initial, metadata_only) ?: ++ bch2_gc_alloc_done(c, initial, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only); + + bch2_journal_unblock(&c->journal); +- } else { +- percpu_down_write(&c->mark_lock); + } + ++ percpu_down_write(&c->mark_lock); + /* Indicates that gc is no longer in progress: */ + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index b818093eab39..7e4400cc02a9 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1095,7 +1095,11 
@@ use_clean: + + bch_verbose(c, "starting alloc read"); + err = "error reading allocation information"; +- ret = bch2_alloc_read(c); ++ ++ down_read(&c->gc_lock); ++ ret = bch2_alloc_read(c, false, false); ++ up_read(&c->gc_lock); ++ + if (ret) + goto err; + bch_verbose(c, "alloc read done"); +@@ -1153,23 +1157,6 @@ use_clean: + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); + +- if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && +- !c->opts.nochanges) { +- /* +- * note that even when filesystem was clean there might be work +- * to do here, if we ran gc (because of fsck) which recalculated +- * oldest_gen: +- */ +- bch_verbose(c, "writing allocation info"); +- err = "error writing out alloc info"; +- ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW); +- if (ret) { +- bch_err(c, "error writing alloc info"); +- goto err; +- } +- bch_verbose(c, "alloc write done"); +- } +- + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + bch2_fs_lazy_rw(c); + +-- +cgit v1.2.3 + + +From 2dc205d463f46e913a63cc28fea4953c488510f8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 25 Dec 2021 03:37:52 -0500 +Subject: bcachefs: Copygc no longer uses bucket array + +This converts the copygc code to use the alloc btree directly to find +buckets that need to be evacuated instead of the in-memory bucket array, +which is finally going away soon. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/movinggc.c | 155 +++++++++++++++++++++++++++++++++---------------- + 1 file changed, 105 insertions(+), 50 deletions(-) + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 92f78907bcb6..c82ecff3efe2 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -6,6 +6,7 @@ + */ + + #include "bcachefs.h" ++#include "alloc_background.h" + #include "alloc_foreground.h" + #include "btree_iter.h" + #include "btree_update.h" +@@ -137,18 +138,106 @@ static inline int fragmentation_cmp(copygc_heap *heap, + return cmp_int(l.fragmentation, r.fragmentation); + } + ++static int walk_buckets_to_copygc(struct bch_fs *c) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_alloc_unpacked u; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); ++ struct copygc_heap_entry e; ++ ++ u = bch2_alloc_unpack(k); ++ ++ if (u.data_type != BCH_DATA_user || ++ u.dirty_sectors >= ca->mi.bucket_size || ++ bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) ++ continue; ++ ++ e = (struct copygc_heap_entry) { ++ .dev = iter.pos.inode, ++ .gen = u.gen, ++ .replicas = 1 + u.stripe_redundancy, ++ .fragmentation = u.dirty_sectors * (1U << 15) ++ / ca->mi.bucket_size, ++ .sectors = u.dirty_sectors, ++ .offset = bucket_to_sector(ca, iter.pos.offset), ++ }; ++ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); ++ ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int bucket_inorder_cmp(const void *_l, const void *_r) ++{ ++ const struct copygc_heap_entry *l = _l; ++ const struct copygc_heap_entry *r = _r; ++ ++ return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset); ++} ++ ++static int check_copygc_was_done(struct bch_fs *c, ++ u64 *sectors_not_moved, ++ u64 *buckets_not_moved) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct btree_trans trans; ++ struct btree_iter iter; 
++ struct bkey_s_c k; ++ struct bkey_alloc_unpacked u; ++ struct copygc_heap_entry *i; ++ int ret = 0; ++ ++ sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0); ++ ++ for (i = h->data; i < h->data + h->used; i++) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); ++ ++ bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset))); ++ ++ ret = lockrestart_do(&trans, ++ bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ if (ret) ++ break; ++ ++ u = bch2_alloc_unpack(k); ++ ++ if (u.gen == i->gen && u.dirty_sectors) { ++ *sectors_not_moved += u.dirty_sectors; ++ *buckets_not_moved += 1; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ + static int bch2_copygc(struct bch_fs *c) + { + copygc_heap *h = &c->copygc_heap; + struct copygc_heap_entry e, *i; +- struct bucket_array *buckets; + struct bch_move_stats move_stats; + u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; + u64 sectors_reserved = 0; + u64 buckets_to_move, buckets_not_moved = 0; + struct bch_dev *ca; + unsigned dev_idx; +- size_t b, heap_size = 0; ++ size_t heap_size = 0; + int ret; + + bch_move_stats_init(&move_stats, "copygc"); +@@ -178,34 +267,12 @@ static int bch2_copygc(struct bch_fs *c) + spin_lock(&ca->fs->freelist_lock); + sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; + spin_unlock(&ca->fs->freelist_lock); ++ } + +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); +- +- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { +- struct bucket *g = buckets->b + b; +- struct bucket_mark m = READ_ONCE(g->mark); +- struct copygc_heap_entry e; +- +- if (m.owned_by_allocator || +- m.data_type != BCH_DATA_user || +- m.dirty_sectors >= ca->mi.bucket_size) +- continue; +- +- WARN_ON(m.stripe && !g->stripe_redundancy); +- +- e = (struct copygc_heap_entry) { +- .dev = dev_idx, +- .gen = m.gen, +- .replicas = 1 + g->stripe_redundancy, +- .fragmentation = m.dirty_sectors * (1U << 15) +- / ca->mi.bucket_size, +- .sectors = m.dirty_sectors, +- .offset = bucket_to_sector(ca, b), +- }; +- heap_add_or_replace(h, e, -fragmentation_cmp, NULL); +- } +- up_read(&ca->bucket_lock); ++ ret = walk_buckets_to_copygc(c); ++ if (ret) { ++ bch2_fs_fatal_error(c, "error walking buckets to copygc!"); ++ return ret; + } + + if (!h->used) { +@@ -251,30 +318,18 @@ static int bch2_copygc(struct bch_fs *c) + writepoint_ptr(&c->copygc_write_point), + copygc_pred, NULL, + &move_stats); ++ if (ret) { ++ bch_err(c, "error %i from bch2_move_data() in copygc", ret); ++ return ret; ++ } + +- for_each_rw_member(ca, c, dev_idx) { +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); +- for (i = h->data; i < h->data + h->used; i++) { +- struct bucket_mark m; +- size_t b; +- +- if (i->dev != dev_idx) +- continue; +- +- b = sector_to_bucket(ca, i->offset); +- m = READ_ONCE(buckets->b[b].mark); +- +- if (i->gen == m.gen && +- m.dirty_sectors) { +- sectors_not_moved += m.dirty_sectors; +- buckets_not_moved++; +- } +- } +- up_read(&ca->bucket_lock); ++ ret = check_copygc_was_done(c, §ors_not_moved, &buckets_not_moved); ++ if (ret) { ++ bch_err(c, "error %i from check_copygc_was_done()", ret); ++ return ret; + } + +- if (sectors_not_moved && !ret) ++ if (sectors_not_moved) + bch_warn_ratelimited(c, + "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu 
sectors, raced %llu keys, %llu sectors)", + sectors_not_moved, sectors_to_move, +-- +cgit v1.2.3 + + +From 3884bc2716ca9011e3327554adf237d0bc08d979 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 24 Dec 2021 04:51:10 -0500 +Subject: bcachefs: bch2_gc_gens() no longer uses bucket array + +Like the previous patches, this converts bch2_gc_gens() to use the alloc +btree directly, and private arrays of generation numbers for its own +recalculation of oldest_gen. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 2 + + fs/bcachefs/btree_gc.c | 108 +++++++++++++++++++++++++++++++------------- + fs/bcachefs/buckets.h | 6 --- + fs/bcachefs/buckets_types.h | 1 - + fs/bcachefs/super.c | 1 + + fs/bcachefs/sysfs.c | 19 ++++---- + 6 files changed, 90 insertions(+), 47 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index b2941c12a7fb..0841303e5f2f 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -452,6 +452,7 @@ struct bch_dev { + */ + struct bucket_array __rcu *buckets[2]; + struct bucket_gens __rcu *bucket_gens; ++ u8 *oldest_gen; + unsigned long *buckets_nouse; + struct rw_semaphore bucket_lock; + +@@ -806,6 +807,7 @@ struct bch_fs { + * it's not while a gc is in progress. + */ + struct rw_semaphore gc_lock; ++ struct mutex gc_gens_lock; + + /* IO PATH */ + struct semaphore io_in_flight; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 56439398b2f8..c799ea43ff2b 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1787,9 +1787,8 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr); + +- if (gen_after(g->mark.gen, ptr->gen) > 16) { ++ if (ptr_stale(ca, ptr) > 16) { + percpu_up_read(&c->mark_lock); + return true; + } +@@ -1797,10 +1796,10 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_BUCKET(ca, ptr); ++ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + +- if (gen_after(g->gc_gen, ptr->gen)) +- g->gc_gen = ptr->gen; ++ if (gen_after(*gen, ptr->gen)) ++ *gen = ptr->gen; + } + percpu_up_read(&c->mark_lock); + +@@ -1811,23 +1810,22 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) + * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree + * node pointers currently never have cached pointers that can become stale: + */ +-static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) ++static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) + { +- struct btree_trans trans; ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_buf sk; + int ret = 0, commit_err = 0; + + bch2_bkey_buf_init(&sk); +- bch2_trans_init(&trans, c, 0, 0); + +- bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, ++ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + +- while ((bch2_trans_begin(&trans), ++ while ((bch2_trans_begin(trans), + k = bch2_btree_iter_peek(&iter)).k) { + ret = bkey_err(k); + +@@ -1843,10 +1841,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + + commit_err = +- bch2_trans_update(&trans, &iter, sk.k, 
0) ?: +- bch2_trans_commit(&trans, NULL, NULL, +- BTREE_INSERT_NOWAIT| +- BTREE_INSERT_NOFAIL); ++ bch2_trans_update(trans, &iter, sk.k, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_NOFAIL); + if (commit_err == -EINTR) { + commit_err = 0; + continue; +@@ -1855,20 +1853,42 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) + + bch2_btree_iter_advance(&iter); + } +- bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_iter_exit(trans, &iter); + +- bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; + } + ++static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); ++ struct bkey_s_c k; ++ struct bkey_alloc_unpacked u; ++ int ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ u = bch2_alloc_unpack(k); ++ ++ if (u.oldest_gen == ca->oldest_gen[iter->pos.offset]) ++ return 0; ++ ++ u.oldest_gen = ca->oldest_gen[iter->pos.offset]; ++ ++ return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN); ++} ++ + int bch2_gc_gens(struct bch_fs *c) + { ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; + struct bch_dev *ca; +- struct bucket_array *buckets; +- struct bucket *g; +- u64 start_time = local_clock(); ++ u64 b, start_time = local_clock(); + unsigned i; + int ret; + +@@ -1877,36 +1897,53 @@ int bch2_gc_gens(struct bch_fs *c) + * introduces a deadlock in the RO path - we currently take the state + * lock at the start of going RO, thus the gc thread may get stuck: + */ ++ if (!mutex_trylock(&c->gc_gens_lock)) ++ return 0; ++ + down_read(&c->gc_lock); ++ bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) { +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); ++ struct bucket_gens *gens; ++ ++ BUG_ON(ca->oldest_gen); ++ ++ ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); ++ if (!ca->oldest_gen) { ++ percpu_ref_put(&ca->ref); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ gens = bucket_gens(ca); + +- for_each_bucket(g, buckets) +- g->gc_gen = g->mark.gen; +- up_read(&ca->bucket_lock); ++ for (b = gens->first_bucket; ++ b < gens->nbuckets; b++) ++ ca->oldest_gen[b] = gens->b[b]; + } + + for (i = 0; i < BTREE_ID_NR; i++) + if ((1 << i) & BTREE_ID_HAS_PTRS) { + c->gc_gens_btree = i; + c->gc_gens_pos = POS_MIN; +- ret = bch2_gc_btree_gens(c, i); ++ ret = bch2_gc_btree_gens(&trans, i); + if (ret) { + bch_err(c, "error recalculating oldest_gen: %i", ret); + goto err; + } + } + +- for_each_member_device(ca, c, i) { +- down_read(&ca->bucket_lock); +- buckets = bucket_array(ca); +- +- for_each_bucket(g, buckets) +- g->oldest_gen = g->gc_gen; +- up_read(&ca->bucket_lock); ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ bch2_alloc_write_oldest_gen(&trans, &iter)); ++ if (ret) { ++ bch_err(c, "error writing oldest_gen: %i", ret); ++ break; ++ } + } ++ bch2_trans_iter_exit(&trans, &iter); + + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; +@@ -1915,7 +1952,14 @@ int bch2_gc_gens(struct bch_fs *c) + + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + err: ++ for_each_member_device(ca, c, i) { ++ kvfree(ca->oldest_gen); ++ ca->oldest_gen = NULL; ++ } ++ ++ bch2_trans_exit(&trans); + up_read(&c->gc_lock); ++ mutex_unlock(&c->gc_gens_lock); + return ret; + } + +diff --git 
a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index d35c96bcf3a1..7c6c59c7762c 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -97,12 +97,6 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, + return sector_to_bucket(ca, ptr->offset); + } + +-static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, +- const struct bch_extent_ptr *ptr) +-{ +- return bucket(ca, PTR_BUCKET_NR(ca, ptr)); +-} +- + static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) + { +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 24139831226d..2c73dc60b838 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -30,7 +30,6 @@ struct bucket { + + u64 io_time[2]; + u8 oldest_gen; +- u8 gc_gen; + unsigned gen_valid:1; + u8 stripe_redundancy; + u32 stripe; +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index d8b72d8dd7a8..e6eff26fc0c8 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -674,6 +674,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + + init_rwsem(&c->gc_lock); ++ mutex_init(&c->gc_gens_lock); + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index ed9a095063e8..b727845dd64b 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -499,6 +499,17 @@ STORE(bch2_fs) + + /* Debugging: */ + ++ if (!test_bit(BCH_FS_RW, &c->flags)) ++ return -EROFS; ++ ++ if (attr == &sysfs_prune_cache) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = strtoul_or_return(buf); ++ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); ++ } ++ + if (attr == &sysfs_trigger_gc) { + /* + * Full gc is currently incompatible with btree key cache: +@@ -512,14 +523,6 @@ STORE(bch2_fs) + #endif + } + +- if (attr == &sysfs_prune_cache) { +- struct shrink_control sc; +- +- sc.gfp_mask = GFP_KERNEL; +- sc.nr_to_scan = strtoul_or_return(buf); +- c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); +- } +- + #ifdef CONFIG_BCACHEFS_TESTS + if (attr == &sysfs_perf_test) { + char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; +-- +cgit v1.2.3 + + +From 18c61a5ed04e9f6c0c5b3d2e8fe7f63fb60267bd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Feb 2022 03:40:44 -0500 +Subject: bcachefs: Fix reflink repair code + +The reflink repair code was incorrectly inserting a nonzero deleted key +via journal replay - this is due to bch2_journal_key_insert() being +somewhat hacky, and so this fix is also hacky for now. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index c799ea43ff2b..445234e064b6 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1501,10 +1501,18 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + + bkey_reassemble(new, k); + +- if (!r->refcount) ++ if (!r->refcount) { + new->k.type = KEY_TYPE_deleted; +- else ++ /* ++ * XXX ugly: bch2_journal_key_insert() queues up ++ * the key for the journal replay code, which ++ * doesn't run the extent overwrite pass ++ */ ++ if (initial) ++ new->k.size = 0; ++ } else { + *bkey_refcount(new) = cpu_to_le64(r->refcount); ++ } + + ret = initial + ? 
bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new) +-- +cgit v1.2.3 + + +From ffc540a83ab4a6f220c47e14de4f0557f49819f2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Feb 2022 20:42:12 -0500 +Subject: bcachefs: Small fsck fix + +The check_dirents pass handles transaction restarts at the toplevel - +check_subdir_count() was incorrectly handling transaction restarts +without returning -EINTR, meaning that the iterator pointing to the +dirent being checked was left invalid. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 43b6159be01b..ced4d671eb8d 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1316,8 +1316,9 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) + if (i->inode.bi_nlink == i->count) + continue; + +- count2 = lockrestart_do(trans, +- bch2_count_subdirs(trans, w->cur_inum, i->snapshot)); ++ count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot); ++ if (count2 < 0) ++ return count2; + + if (i->count != count2) { + bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", +-- +cgit v1.2.3 + + +From a568252f8c6f3edd23719a2c6ff2edc65c46f04c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Feb 2022 20:47:05 -0500 +Subject: bcachefs: Print a better message for mark and sweep pass + +Btree gc, aka mark and sweep, checks allocations - so let's just print +that. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/recovery.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 7e4400cc02a9..543db58ff4d6 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1126,12 +1126,12 @@ use_clean: + test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + bool metadata_only = c->opts.norecovery; + +- bch_info(c, "starting mark and sweep"); ++ bch_info(c, "checking allocations"); + err = "error in mark and sweep"; + ret = bch2_gc(c, true, metadata_only); + if (ret) + goto err; +- bch_verbose(c, "mark and sweep done"); ++ bch_verbose(c, "done checking allocations"); + } + + bch2_stripes_heap_start(c); +-- +cgit v1.2.3 + + +From 9d74a616b90345c43c259894cf74fa8b50a7223f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Feb 2022 22:16:45 -0500 +Subject: bcachefs: Kill bch2_bkey_debugcheck + +The old .debugcheck methods are no more and this just calls the .invalid +method, which doesn't add much since we already check that when doing +btree updates and when reading metadata in. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 3 --- + fs/bcachefs/bkey_methods.c | 16 ---------------- + fs/bcachefs/bkey_methods.h | 2 -- + fs/bcachefs/btree_iter.c | 16 +--------------- + 4 files changed, 1 insertion(+), 36 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 0841303e5f2f..0e9689f6878a 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -281,9 +281,6 @@ do { \ + "significantly affect performance") \ + BCH_DEBUG_PARAM(debug_check_iterators, \ + "Enables extra verification for btree iterators") \ +- BCH_DEBUG_PARAM(debug_check_bkeys, \ +- "Run bkey_debugcheck (primarily checking GC/allocation "\ +- "information) when iterating over keys") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(journal_seq_verify, \ +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 5c900cf8a8a2..e83aeb683a09 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -212,22 +212,6 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) + return NULL; + } + +-void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) +-{ +- const char *invalid; +- +- BUG_ON(!k.k->u64s); +- +- invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: +- bch2_bkey_in_btree_node(b, k); +- if (invalid) { +- char buf[160]; +- +- bch2_bkey_val_to_text(&PBUF(buf), c, k); +- bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); +- } +-} +- + void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) + { + if (!bpos_cmp(pos, POS_MIN)) +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +index 3012035db1a3..4fdac545cf88 100644 +--- a/fs/bcachefs/bkey_methods.h ++++ b/fs/bcachefs/bkey_methods.h +@@ -34,8 +34,6 @@ const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type); + const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); + +-void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); +- + void bch2_bpos_to_text(struct printbuf *, struct bpos); + void bch2_bkey_to_text(struct printbuf *, const struct bkey *); + void bch2_val_to_text(struct printbuf *, struct bch_fs *, +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 136d00b49be1..39db0d08063f 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1005,8 +1005,6 @@ static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, + struct bkey *u, + struct bkey_packed *k) + { +- struct bkey_s_c ret; +- + if (unlikely(!k)) { + /* + * signal to bch2_btree_iter_peek_slot() that we're currently at +@@ -1016,19 +1014,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, + return bkey_s_c_null; + } + +- ret = bkey_disassemble(l->b, k, u); +- +- /* +- * XXX: bch2_btree_bset_insert_key() generates invalid keys when we +- * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key +- * being overwritten but doesn't change k->size. 
But this is ok, because +- * those keys are never written out, we just have to avoid a spurious +- * assertion here: +- */ +- if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) +- bch2_bkey_debugcheck(c, l->b, ret); +- +- return ret; ++ return bkey_disassemble(l->b, k, u); + } + + static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, +-- +cgit v1.2.3 + + +From 3339f2eb4f8ac0020125c869ddd2b29ce1989f87 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 29 Dec 2021 13:50:19 -0500 +Subject: bcachefs: Check for btree locks held on transaction init + +Ideally we would disallow multiple btree_trans being initialized within +the same process - and hopefully we will at some point, the stack usage +is excessive - but for now there are a couple places where we do this: + + - transaction commit error path -> journal reclaim - btree key cache + flush + - move data path -> do_pending_writes -> write path -> bucket + allocation (in the near future when bucket allocation switches to + using a freespace btree) + +In order to avoid deadlocking the first btree_trans must have been +unlocked with bch2_trans_unlock() before using the second btree_trans - +this patch adds an assertion to bch2_trans_init() that verifies that +this has been done when lockdep is enabled. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 39db0d08063f..565c811703ca 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -3076,6 +3076,8 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + const char *fn) + __acquires(&c->btree_trans_barrier) + { ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ + memset(trans, 0, sizeof(*trans)); + trans->c = c; + trans->fn = fn; +-- +cgit v1.2.3 + + +From 24b00b3b59688aff9d87b9feb1990c6168d3eaf7 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 15 Feb 2022 23:40:30 -0500 +Subject: bcachefs: Fix locking in data move path + +We need to ensure we don't have any btree locks held when calling +do_pending_writes() - besides issuing IOs, upcoming allocator changes +will have allocations doing btree lookups directly. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 37 ++++++++++++++++++++----------------- + 1 file changed, 20 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 3e3dcec327a0..83536fdc309a 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -487,19 +487,22 @@ static void move_read_endio(struct bio *bio) + closure_put(&ctxt->cl); + } + +-static void do_pending_writes(struct moving_context *ctxt) ++static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) + { + struct moving_io *io; + ++ if (trans) ++ bch2_trans_unlock(trans); ++ + while ((io = next_pending_write(ctxt))) { + list_del(&io->list); + closure_call(&io->cl, move_write, NULL, &ctxt->cl); + } + } + +-#define move_ctxt_wait_event(_ctxt, _cond) \ ++#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ + do { \ +- do_pending_writes(_ctxt); \ ++ do_pending_writes(_ctxt, _trans); \ + \ + if (_cond) \ + break; \ +@@ -507,11 +510,12 @@ do { \ + next_pending_write(_ctxt) || (_cond)); \ + } while (1) + +-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) ++static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, ++ struct btree_trans *trans) + { + unsigned sectors_pending = atomic_read(&ctxt->write_sectors); + +- move_ctxt_wait_event(ctxt, ++ move_ctxt_wait_event(ctxt, trans, + !atomic_read(&ctxt->write_sectors) || + atomic_read(&ctxt->write_sectors) != sectors_pending); + } +@@ -533,14 +537,6 @@ static int bch2_move_extent(struct btree_trans *trans, + unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + +- move_ctxt_wait_event(ctxt, +- atomic_read(&ctxt->write_sectors) < +- SECTORS_IN_FLIGHT_PER_DEVICE); +- +- move_ctxt_wait_event(ctxt, +- atomic_read(&ctxt->read_sectors) < +- SECTORS_IN_FLIGHT_PER_DEVICE); +- + /* write path might have to decompress data: */ + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); +@@ -691,12 +687,19 @@ static int __bch2_move_data(struct bch_fs *c, + schedule_timeout(delay); + + if (unlikely(freezing(current))) { +- bch2_trans_unlock(&trans); +- move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); ++ move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads)); + try_to_freeze(); + } + } while (delay); + ++ move_ctxt_wait_event(ctxt, &trans, ++ atomic_read(&ctxt->write_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ move_ctxt_wait_event(ctxt, &trans, ++ atomic_read(&ctxt->read_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ + bch2_trans_begin(&trans); + + k = bch2_btree_iter_peek(&iter); +@@ -761,7 +764,7 @@ static int __bch2_move_data(struct bch_fs *c, + + if (ret2 == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ +- bch2_move_ctxt_wait_for_io(ctxt); ++ bch2_move_ctxt_wait_for_io(ctxt, &trans); + continue; + } + +@@ -846,7 +849,7 @@ int bch2_move_data(struct bch_fs *c, + } + + +- move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); ++ move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); + closure_sync(&ctxt.cl); + + EBUG_ON(atomic_read(&ctxt.write_sectors)); +-- +cgit v1.2.3 + + +From fdce488a5b7585a0e42d6f31815970f6d1e74b4a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 15 Feb 2022 21:45:04 -0500 +Subject: bcachefs: Delete redundant tracepoint + +We were emitting two trace events on transaction restart in this code +path - delete the redundant one. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index e6363592c417..167d177150c4 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -320,7 +320,6 @@ retry: + if (!trans->restarted) + goto retry; + +- trace_transaction_restart_ip(trans->fn, _THIS_IP_); + ret = -EINTR; + goto err; + } +-- +cgit v1.2.3 + + +From 98d25b58803ff5571339338d502bb1fd31209c2f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 15 Feb 2022 22:28:37 -0500 +Subject: bcachefs: Also show when blocked on write locks + +This consolidates some of the btree node lock path, so that when we're +blocked taking a write lock on a node it shows up in +bch2_btree_trans_to_text(), along with intent and read locks. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 27 +++++----------------- + fs/bcachefs/btree_locking.h | 46 +++++++++++++++++++++++-------------- + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/btree_update_interior.c | 6 ++--- + fs/bcachefs/btree_update_leaf.c | 8 ++++--- + 5 files changed, 44 insertions(+), 44 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 565c811703ca..4b7833511c26 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -162,7 +162,7 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) + else + this_cpu_sub(*b->c.lock.readers, readers); + +- btree_node_lock_type(trans->c, b, SIX_LOCK_write); ++ six_lock_write(&b->c.lock, NULL, NULL); + + if (!b->c.lock.readers) + atomic64_add(__SIX_VAL(read_lock, readers), +@@ -301,9 +301,7 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + unsigned long ip) + { + struct btree_path *linked, *deadlock_path = NULL; +- u64 start_time = local_clock(); + unsigned reason = 9; +- bool ret; + + /* Check if it's safe to block: */ + trans_for_each_path(trans, linked) { +@@ -381,23 +379,8 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + return false; + } + +- if (six_trylock_type(&b->c.lock, type)) +- return true; +- +- trans->locking_path_idx = path->idx; +- trans->locking_pos = pos; +- trans->locking_btree_id = path->btree_id; +- trans->locking_level = level; +- trans->locking = b; +- +- ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; +- +- trans->locking = NULL; +- +- if (ret) +- bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], +- start_time); +- return ret; ++ return btree_node_lock_type(trans, path, b, pos, level, ++ type, should_sleep_fn, p); + } + + /* Btree iterator locking: */ +@@ -3199,6 +3182,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + struct btree_trans *trans; + struct btree_path *path; + struct btree *b; ++ static char lock_types[] = { 'r', 'i', 'w' }; + unsigned l; + + mutex_lock(&c->btree_trans_lock); +@@ -3235,10 +3219,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) + b = READ_ONCE(trans->locking); + if (b) { + path = &trans->paths[trans->locking_path_idx]; +- pr_buf(out, " locking path %u %c l=%u %s:", ++ pr_buf(out, " locking path %u %c l=%u %c %s:", + trans->locking_path_idx, + path->cached ? 
'c' : 'b', + trans->locking_level, ++ lock_types[trans->locking_lock_type], + bch2_btree_ids[trans->locking_btree_id]); + bch2_bpos_to_text(out, trans->locking_pos); + +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index d599008c5fc1..b4434eca0746 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -128,23 +128,35 @@ static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) + } + } + +-/* +- * wrapper around six locks that just traces lock contended time +- */ +-static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, +- enum six_lock_type type) +-{ +- u64 start_time = local_clock(); +- +- six_lock_type(&b->c.lock, type, NULL, NULL); +- bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); +-} +- +-static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, +- enum six_lock_type type) +-{ +- if (!six_trylock_type(&b->c.lock, type)) +- __btree_node_lock_type(c, b, type); ++static inline bool btree_node_lock_type(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ struct bch_fs *c = trans->c; ++ u64 start_time; ++ bool ret; ++ ++ if (six_trylock_type(&b->c.lock, type)) ++ return true; ++ ++ start_time = local_clock(); ++ ++ trans->locking_path_idx = path->idx; ++ trans->locking_pos = pos; ++ trans->locking_btree_id = path->btree_id; ++ trans->locking_level = level; ++ trans->locking_lock_type = type; ++ trans->locking = b; ++ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; ++ trans->locking = NULL; ++ ++ if (ret) ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++ ++ return ret; + } + + /* +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 989129f9f76c..68272f26f017 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -377,6 +377,7 @@ struct btree_trans { + struct bpos locking_pos; + u8 locking_btree_id; + u8 locking_level; ++ u8 locking_lock_type; + pid_t pid; + int srcu_idx; + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index a0f7a9f06b98..088c320493d3 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -620,8 +620,8 @@ err: + * we're in journal error state: + */ + +- btree_node_lock_type(c, b, SIX_LOCK_intent); +- btree_node_lock_type(c, b, SIX_LOCK_write); ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); + mutex_lock(&c->btree_interior_update_lock); + + list_del(&as->write_blocked_list); +@@ -675,7 +675,7 @@ err: + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; + +- btree_node_lock_type(c, b, SIX_LOCK_read); ++ six_lock_read(&b->c.lock, NULL, NULL); + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + } +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5cd6bda16b6b..5530941c772b 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -168,7 +168,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); + +- btree_node_lock_type(c, b, SIX_LOCK_read); ++ six_lock_read(&b->c.lock, NULL, NULL); + bch2_btree_node_write_cond(c, b, + 
(btree_current_write(b) == w && w->journal.seq == seq)); + six_unlock_read(&b->c.lock); +@@ -619,8 +619,10 @@ static inline int trans_lock_write(struct btree_trans *trans) + if (have_conflicting_read_lock(trans, i->path)) + goto fail; + +- __btree_node_lock_type(trans->c, insert_l(i)->b, +- SIX_LOCK_write); ++ btree_node_lock_type(trans, i->path, ++ insert_l(i)->b, ++ i->path->pos, i->level, ++ SIX_LOCK_write, NULL, NULL); + } + + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); +-- +cgit v1.2.3 + + +From f38b93e2789a9b6fd2167b9b67fbf31d19497d84 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 15 Feb 2022 22:01:33 -0500 +Subject: bcachefs: Fix __bch2_btree_node_lock + +__bch2_btree_node_lock() was implementing the wrong lock ordering for +cached vs. non cached paths - this fixes it to match the btree path sort +order as defined by __btree_path_cmp(), and also simplifies the code +some. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 63 ++++++++++++++++++++++++------------------------ + 1 file changed, 32 insertions(+), 31 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 4b7833511c26..f8ce9d3dfe94 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -58,6 +58,9 @@ static inline int __btree_path_cmp(const struct btree_path *l, + struct bpos r_pos, + unsigned r_level) + { ++ /* ++ * Must match lock ordering as defined by __bch2_btree_node_lock: ++ */ + return cmp_int(l->btree_id, r_btree_id) ?: + cmp_int((int) l->cached, (int) r_cached) ?: + bpos_cmp(l->pos, r_pos) ?: +@@ -300,8 +303,8 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) + { +- struct btree_path *linked, *deadlock_path = NULL; +- unsigned reason = 9; ++ struct btree_path *linked; ++ unsigned reason; + + /* Check if it's safe to block: */ + trans_for_each_path(trans, linked) { +@@ -322,28 +325,28 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { +- deadlock_path = linked; + reason = 1; ++ goto deadlock; + } + + if (linked->btree_id != path->btree_id) { +- if (linked->btree_id > path->btree_id) { +- deadlock_path = linked; +- reason = 3; +- } +- continue; ++ if (linked->btree_id < path->btree_id) ++ continue; ++ ++ reason = 3; ++ goto deadlock; + } + + /* +- * Within the same btree, cached paths come before non +- * cached paths: ++ * Within the same btree, non-cached paths come before cached ++ * paths: + */ + if (linked->cached != path->cached) { +- if (path->cached) { +- deadlock_path = linked; +- reason = 4; +- } +- continue; ++ if (!linked->cached) ++ continue; ++ ++ reason = 4; ++ goto deadlock; + } + + /* +@@ -352,35 +355,33 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, + * we're about to lock, it must have the ancestors locked too: + */ + if (level > __fls(linked->nodes_locked)) { +- deadlock_path = linked; + reason = 5; ++ goto deadlock; + } + + /* Must lock btree nodes in key order: */ + if (btree_node_locked(linked, level) && + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, + linked->cached)) <= 0) { +- deadlock_path = linked; +- reason = 7; + BUG_ON(trans->in_traverse_all); ++ reason = 7; ++ goto deadlock; + } + } + +- if (unlikely(deadlock_path)) { +- trace_trans_restart_would_deadlock(trans->fn, ip, +- trans->in_traverse_all, reason, +- deadlock_path->btree_id, +- deadlock_path->cached, +- 
&deadlock_path->pos, +- path->btree_id, +- path->cached, +- &pos); +- btree_trans_restart(trans); +- return false; +- } +- + return btree_node_lock_type(trans, path, b, pos, level, + type, should_sleep_fn, p); ++deadlock: ++ trace_trans_restart_would_deadlock(trans->fn, ip, ++ trans->in_traverse_all, reason, ++ linked->btree_id, ++ linked->cached, ++ &linked->pos, ++ path->btree_id, ++ path->cached, ++ &pos); ++ btree_trans_restart(trans); ++ return false; + } + + /* Btree iterator locking: */ +-- +cgit v1.2.3 + + +From 7fd584cffea93eb8b5a015f23c3b1f3e6049014a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Feb 2022 04:20:39 -0500 +Subject: bcachefs: Kill verify_not_stale() + +This is ancient code that's more effectively checked in other places +now. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 18 ------------------ + 1 file changed, 18 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 0a634125dc90..9b81ed2665c8 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -151,22 +151,6 @@ static void open_bucket_free_unused(struct bch_fs *c, + } + } + +-static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) +-{ +-#ifdef CONFIG_BCACHEFS_DEBUG +- struct open_bucket *ob; +- unsigned i; +- +- rcu_read_lock(); +- open_bucket_for_each(c, obs, ob, i) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); +- +- BUG_ON(*bucket_gen(ca, ob->bucket) != ob->gen); +- } +- rcu_read_unlock(); +-#endif +-} +- + /* _only_ for allocating the journal on a new device: */ + long bch2_bucket_alloc_new_fs(struct bch_dev *ca) + { +@@ -857,8 +841,6 @@ alloc_done: + + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + +- verify_not_stale(c, &wp->ptrs); +- + return wp; + err: + open_bucket_for_each(c, &wp->ptrs, ob, i) +-- +cgit v1.2.3 + + +From 55fe9f354d3cb8e4aeec1883f62578ebfb762630 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 15 Feb 2022 00:06:59 -0500 +Subject: bcachefs: Check for stale dirty pointer before reads + +Since we retry reads when we discover we read from a pointer that went +stale, if a dirty pointer is erroniously stale it would cause us to loop +retrying that read forever - unless we check before issuing the read, +while the btree is still locked, when we know that a dirty pointer +should never be stale. + +This patch adds that check, along with printing some helpful debug info. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 2 -- + fs/bcachefs/io.c | 60 ++++++++++++++++++++++++++++++++++++++++++++--------- + fs/bcachefs/move.c | 6 ++++-- + 3 files changed, 54 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index e257f15f067d..022585d1ac59 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1071,8 +1071,6 @@ retry: + + sectors = min(sectors, k.k->size - offset_into_extent); + +- bch2_trans_unlock(trans); +- + if (readpages_iter) + readpage_bio_extend(readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index f0e93de4680d..038ec01e48e1 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1953,6 +1953,33 @@ err: + return ret; + } + ++static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, ++ struct bch_extent_ptr ptr) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); ++ struct btree_iter iter; ++ char buf[200]; ++ int ret; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), ++ BTREE_ITER_CACHED); ++ ++ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ if (ret) ++ return; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch_err(c, "%s", buf); ++ bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); ++ bch2_trans_iter_exit(trans, &iter); ++} ++ + int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, +@@ -1962,7 +1989,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; +- struct bch_dev *ca; ++ struct bch_dev *ca = NULL; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); +@@ -1979,7 +2006,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } +- ++retry_pick: + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ +@@ -1992,8 +2019,27 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + goto err; + } + +- if (pick_ret > 0) +- ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ /* ++ * Stale dirty pointers are treated as IO errors, but @failed isn't ++ * allocated unless we're in the retry path - so if we're not in the ++ * retry path, don't check here, it'll be caught in bch2_read_endio() ++ * and we'll end up in the retry path: ++ */ ++ if ((flags & BCH_READ_IN_RETRY) && ++ !pick.ptr.cached && ++ unlikely(ptr_stale(ca, &pick.ptr))) { ++ read_from_stale_dirty_pointer(trans, k, pick.ptr); ++ bch2_mark_io_failure(failed, &pick); ++ goto retry_pick; ++ } ++ ++ /* ++ * Unlock the iterator while the btree node's lock is still in ++ * cache, before doing the IO: ++ */ ++ bch2_trans_unlock(trans); + + if (flags & BCH_READ_NODECODE) { + /* +@@ -2281,12 +2327,6 @@ retry: + */ + sectors = min(sectors, k.k->size - offset_into_extent); + +- /* +- * Unlock the iterator while the 
btree node's lock is still in +- * cache, before doing the IO: +- */ +- bch2_trans_unlock(&trans); +- + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 83536fdc309a..7ca7ce394135 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -751,10 +751,12 @@ static int __bch2_move_data(struct bch_fs *c, + BUG(); + } + +- /* unlock before doing IO: */ ++ /* ++ * The iterator gets unlocked by __bch2_read_extent - need to ++ * save a copy of @k elsewhere: ++ */ + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); +- bch2_trans_unlock(&trans); + + ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, + data_cmd, data_opts); +-- +cgit v1.2.3 + + +From 9fc96548b932b93e3e48e88f43babb7c6a9b34e3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 16 Feb 2022 00:42:34 -0500 +Subject: bcachefs: Fix slow tracepoints + +Some of our tracepoints were calling snprintf("pS") - which does symbol +table lookups - in TP_fast_assign(), which turns out to be a really bad +idea. + +This was done because perf trace wasn't correctly printing tracepoints +that use %pS anymore - but it turns out trace-cmd does handle it +correctly. + +Signed-off-by: Kent Overstreet +--- + include/trace/events/bcachefs.h | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 36c4c8841741..a21a39230a09 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -358,7 +358,7 @@ TRACE_EVENT(btree_node_relock_fail, + + TP_STRUCT__entry( + __array(char, trans_fn, 24 ) +- __array(char, caller, 32 ) ++ __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) +@@ -370,7 +370,7 @@ TRACE_EVENT(btree_node_relock_fail, + + TP_fast_assign( + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); +- snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip); ++ __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; +@@ -380,9 +380,9 @@ TRACE_EVENT(btree_node_relock_fail, + __entry->node_lock_seq = node_lock_seq; + ), + +- TP_printk("%s %s btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", + __entry->trans_fn, +- __entry->caller, ++ (void *) __entry->caller_ip, + __entry->btree_id, + __entry->pos_inode, + __entry->pos_offset, +@@ -673,7 +673,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, + + TP_STRUCT__entry( + __array(char, trans_fn, 24 ) +- __array(char, caller, 32 ) ++ __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) +@@ -682,16 +682,16 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, + + TP_fast_assign( + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); +- snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip); ++ __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; + __entry->pos_snapshot = pos->snapshot; + ), + +- TP_printk("%s %s btree %u pos %llu:%llu:%u", ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u", + __entry->trans_fn, +- __entry->caller, ++ (void *) __entry->caller_ip, + __entry->btree_id, + __entry->pos_inode, + 
__entry->pos_offset, +-- +cgit v1.2.3 + + +From 2e4f0a1e6b4dae4eb297cab675fc759846699209 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 16 Feb 2022 03:13:36 -0500 +Subject: bcachefs: Fix __btree_path_traverse_all + +The loop that traverses paths in traverse_all() needs to be a little bit +tricky, because traversing a path can cause other paths to be added (or +perhaps removed) at about the same position. + +The old logic was buggy, replace it with simpler logic. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index f8ce9d3dfe94..1160fbad7748 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1474,17 +1474,17 @@ retry_all: + while (i < trans->nr_sorted) { + path = trans->paths + trans->sorted[i]; + +- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); +- +- ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); +- if (ret) +- goto retry_all; +- +- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); +- +- if (path->nodes_locked || +- !btree_path_node(path, path->level)) ++ /* ++ * Traversing a path can cause another path to be added at about ++ * the same position: ++ */ ++ if (path->uptodate) { ++ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); ++ if (ret) ++ goto retry_all; ++ } else { + i++; ++ } + } + + /* +-- +cgit v1.2.3 + + +From 057e7e0160e1beeb098f4ec2bd38452b972a8900 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 16 Feb 2022 02:50:39 -0500 +Subject: bcachefs: Improve journal_entry_btree_keys_to_text() + +This improves the formatting of journal_entry_btree_keys_to_text() by +putting each key on its own line. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 12 +++++++++--- + fs/bcachefs/util.h | 22 ++++++++++++++++++++++ + 2 files changed, 31 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 651828b8bc97..b5c204e7c569 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -299,11 +299,17 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs + struct jset_entry *entry) + { + struct bkey_i *k; ++ bool first = true; + +- pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); +- +- vstruct_for_each(entry, k) ++ vstruct_for_each(entry, k) { ++ if (!first) { ++ printbuf_newline(out); ++ pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); ++ } ++ pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); ++ first = false; ++ } + } + + static int journal_entry_btree_root_validate(struct bch_fs *c, +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 3196bc303182..e55407dc5324 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -238,6 +238,7 @@ do { \ + struct printbuf { + char *pos; + char *end; ++ unsigned indent; + }; + + static inline size_t printbuf_remaining(struct printbuf *buf) +@@ -259,6 +260,27 @@ do { \ + __VA_ARGS__); \ + } while (0) + ++static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces) ++{ ++ buf->indent += spaces; ++ while (spaces--) ++ pr_buf(buf, " "); ++} ++ ++static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces) ++{ ++ buf->indent -= spaces; ++} ++ ++static inline void printbuf_newline(struct printbuf *buf) ++{ ++ unsigned i; ++ ++ pr_buf(buf, "\n"); ++ for (i = 0; i < buf->indent; i++) ++ pr_buf(buf, " "); ++} ++ + void bch_scnmemcpy(struct printbuf *, const char *, size_t); + + int bch2_strtoint_h(const char *, int *); +-- +cgit v1.2.3 + + +From 25f0d13c80d7d7035b0da9892dac4f3c6e3ce6bf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Feb 2022 01:58:12 -0500 +Subject: bcachefs: Stale ptr cleanup is now done by gc_gens + +Before we had dedicated gc code for bucket->oldest_gen this was +btree_gc's responsibility, but now that we have that we can rip it out, +simplifying the already overcomplicated btree_gc. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 55 +++++++++----------------------------------------- + 1 file changed, 10 insertions(+), 45 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 445234e064b6..648779cc643d 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -726,11 +726,9 @@ fsck_err: + static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k, +- u8 *max_stale, bool initial) ++ bool initial) + { + struct bch_fs *c = trans->c; +- struct bkey_ptrs_c ptrs; +- const struct bch_extent_ptr *ptr; + struct bkey deleted = KEY(0, 0, 0); + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + unsigned flags = +@@ -755,17 +753,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + atomic64_set(&c->key_version, k->k->version.lo); + } + +- ptrs = bch2_bkey_ptrs_c(*k); +- bkey_for_each_ptr(ptrs, ptr) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g = PTR_GC_BUCKET(ca, ptr); +- +- if (gen_after(g->oldest_gen, ptr->gen)) +- g->oldest_gen = ptr->gen; +- +- *max_stale = max(*max_stale, ptr_stale(ca, ptr)); +- } +- + ret = bch2_mark_key(trans, old, *k, flags); + fsck_err: + err: +@@ -774,8 +761,7 @@ err: + return ret; + } + +-static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale, +- bool initial) ++static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) + { + struct bch_fs *c = trans->c; + struct btree_node_iter iter; +@@ -784,8 +770,6 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma + struct bkey_buf prev, cur; + int ret = 0; + +- *max_stale = 0; +- + if (!btree_node_type_needs_gc(btree_node_type(b))) + return 0; + +@@ -796,7 +780,7 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, +- &k, max_stale, initial); ++ &k, initial); + if (ret) + break; + +@@ -827,7 +811,6 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, + : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 
1 + : 0; +- u8 max_stale = 0; + int ret = 0; + + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); +@@ -838,21 +821,9 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, + + gc_pos_set(c, gc_pos_btree_node(b)); + +- ret = btree_gc_mark_node(trans, b, &max_stale, initial); ++ ret = btree_gc_mark_node(trans, b, initial); + if (ret) + break; +- +- if (!initial) { +- if (max_stale > 64) +- bch2_btree_node_rewrite(trans, &iter, b, +- BTREE_INSERT_NOWAIT| +- BTREE_INSERT_GC_LOCK_HELD); +- else if (!bch2_btree_gc_rewrite_disabled && +- (bch2_btree_gc_always_rewrite || max_stale > 16)) +- bch2_btree_node_rewrite(trans, &iter, +- b, BTREE_INSERT_NOWAIT| +- BTREE_INSERT_GC_LOCK_HELD); +- } + } + bch2_trans_iter_exit(trans, &iter); + +@@ -864,8 +835,8 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, + if (!btree_node_fake(b)) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + +- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, +- &k, &max_stale, initial); ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, ++ true, &k, initial); + } + gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); + mutex_unlock(&c->btree_root_lock); +@@ -880,7 +851,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; +- u8 max_stale = 0; + char buf[200]; + int ret = 0; + +@@ -893,8 +863,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + +- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, +- &k, &max_stale, true); ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, ++ false, &k, true); + if (ret) { + bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); + goto fsck_err; +@@ -985,7 +955,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans, + : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 
1 + : 0; +- u8 max_stale = 0; + char buf[100]; + int ret = 0; + +@@ -1018,7 +987,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, +- &k, &max_stale, true); ++ &k, true); + } + fsck_err: + six_unlock_read(&b->c.lock); +@@ -1313,7 +1282,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + .dev = iter->pos.inode, + .bucket = iter->pos.offset, + .gen = g->mark.gen, +- .oldest_gen = g->oldest_gen, + .data_type = g->mark.data_type, + .dirty_sectors = g->mark.dirty_sectors, + .cached_sectors = g->mark.cached_sectors, +@@ -1330,8 +1298,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + gc_u.data_type != BCH_DATA_btree) + return 0; + +- if (!bkey_alloc_unpacked_cmp(old_u, gc_u) || +- gen_after(old_u.gen, gc_u.gen)) ++ if (gen_after(old_u.gen, gc_u.gen)) + return 0; + + #define copy_bucket_field(_f) \ +@@ -1353,8 +1320,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + copy_bucket_field(stripe); + #undef copy_bucket_field + +- new_u.oldest_gen = gc_u.oldest_gen; +- + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + return 0; + +-- +cgit v1.2.3 + + +From d728174ed7dd68c5837a91df878db872be9715d8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Feb 2022 01:42:31 -0500 +Subject: bcachefs: Only allocate buckets_nouse when requested + +It's only needed by the migrate tool - this patch adds an option to +enable allocating it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 12 +++++++----- + fs/bcachefs/opts.h | 5 +++++ + 2 files changed, 12 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 6c5bfdc16648..eb0eaa983dc9 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -2143,9 +2143,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + GFP_KERNEL|__GFP_ZERO)) || + !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO)) || +- !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * ++ (c->opts.buckets_nouse && ++ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), +- GFP_KERNEL|__GFP_ZERO)) || ++ GFP_KERNEL|__GFP_ZERO))) || + !init_fifo(&free[RESERVE_MOVINGGC], + copygc_reserve, GFP_KERNEL) || + !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || +@@ -2178,9 +2179,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + memcpy(bucket_gens->b, + old_bucket_gens->b, + n); +- memcpy(buckets_nouse, +- ca->buckets_nouse, +- BITS_TO_LONGS(n) * sizeof(unsigned long)); ++ if (buckets_nouse) ++ memcpy(buckets_nouse, ++ ca->buckets_nouse, ++ BITS_TO_LONGS(n) * sizeof(unsigned long)); + } + + rcu_assign_pointer(ca->buckets[0], buckets); +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index c325a094ae43..affe9233d708 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -365,6 +365,11 @@ enum opt_type { + NO_SB_OPT, false, \ + NULL, "Set superblock to latest version,\n" \ + "allowing any new features to be used") \ ++ x(buckets_nouse, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Allocate the buckets_nouse bitmap") \ + x(project, u8, \ + OPT_INODE, \ + OPT_BOOL(), \ +-- +cgit v1.2.3 + + +From c4995bdb68c5fd3aadfed398f7f0b5d32005a4d6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 16 Feb 2022 06:23:06 -0500 +Subject: bcachefs: Change bch2_dev_lookup() to not use lookup_bdev() + 
+bch2_dev_lookup() is used from the extended attribute set methods, for +setting the target options, where we're already holding an inode lock - +it turns out pathname lookups also take inode locks, so that was +susceptible to deadlocks. + +Fortunately we already stash the device name in ca->name. This does +change user-visible behaviour though: instead of specifying e.g. +/dev/sda1, user must now specify sda1. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super.c | 10 ++-------- + 1 file changed, 2 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index e6eff26fc0c8..b36e6216a8a1 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1880,20 +1880,14 @@ err: + } + + /* return with ref on ca->ref: */ +-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) ++struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) + { + struct bch_dev *ca; +- dev_t dev; + unsigned i; +- int ret; +- +- ret = lookup_bdev(path, &dev); +- if (ret) +- return ERR_PTR(ret); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) +- if (ca->dev == dev) ++ if (!strcmp(name, ca->name)) + goto found; + ca = ERR_PTR(-ENOENT); + found: +-- +cgit v1.2.3 + + +From 76256b1c595998da770f5d4a11e94743cd432443 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 18 Feb 2022 00:47:45 -0500 +Subject: bcachefs: Fix failure to allocate btree node in cache + +The error code when we fail to allocate a node in the btree node cache +doesn't make it to bch2_btree_path_traverse_all(). Instead, we need to +stash a flag in btree_trans so we know we have to take the cannibalize +lock. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 9 +++++++++ + fs/bcachefs/btree_iter.c | 22 +++++----------------- + fs/bcachefs/btree_types.h | 1 + + include/trace/events/bcachefs.h | 8 ++++++++ + 4 files changed, 23 insertions(+), 17 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 986d08d708cc..6e6a8e5bcdaf 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -673,6 +673,15 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + } + + b = bch2_btree_node_mem_alloc(c); ++ ++ if (trans && b == ERR_PTR(-ENOMEM)) { ++ trans->memory_allocation_failure = true; ++ trace_trans_restart_memory_allocation_failure(trans->fn, ++ _THIS_IP_, btree_id, &path->pos); ++ btree_trans_restart(trans); ++ return ERR_PTR(-EINTR); ++ } ++ + if (IS_ERR(b)) + return b; + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 1160fbad7748..66778bd92066 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1420,12 +1420,12 @@ err: + static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); + +-static int __btree_path_traverse_all(struct btree_trans *trans, int ret, +- unsigned long trace_ip) ++static int bch2_btree_path_traverse_all(struct btree_trans *trans) + { + struct bch_fs *c = trans->c; + struct btree_path *path; +- int i; ++ unsigned long trace_ip = _RET_IP_; ++ int i, ret = 0; + + if (trans->in_traverse_all) + return -EINTR; +@@ -1453,7 +1453,7 @@ retry_all: + bch2_trans_unlock(trans); + cond_resched(); + +- if (unlikely(ret == -ENOMEM)) { ++ if (unlikely(trans->memory_allocation_failure)) { + struct closure cl; + + closure_init_stack(&cl); +@@ -1464,11 +1464,6 @@ retry_all: + } while (ret); + } + +- if (unlikely(ret == -EIO)) +- goto out; +- +- BUG_ON(ret && ret != -EINTR); +- + /* Now, 
redo traversals in correct order: */ + i = 0; + while (i < trans->nr_sorted) { +@@ -1494,7 +1489,7 @@ retry_all: + */ + trans_for_each_path(trans, path) + BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); +-out: ++ + bch2_btree_cache_cannibalize_unlock(c); + + trans->in_traverse_all = false; +@@ -1503,11 +1498,6 @@ out: + return ret; + } + +-static int bch2_btree_path_traverse_all(struct btree_trans *trans) +-{ +- return __btree_path_traverse_all(trans, 0, _RET_IP_); +-} +- + static inline bool btree_path_good_node(struct btree_trans *trans, + struct btree_path *path, + unsigned l, int check_pos) +@@ -1631,8 +1621,6 @@ out: + return ret; + } + +-static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long); +- + int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) + { +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 68272f26f017..9ae5c8d56b2a 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -386,6 +386,7 @@ struct btree_trans { + bool used_mempool:1; + bool in_traverse_all:1; + bool restarted:1; ++ bool memory_allocation_failure:1; + bool journal_transaction_names:1; + /* + * For when bch2_trans_update notices we'll be splitting a compressed +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index a21a39230a09..8cf6669e2830 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -802,6 +802,14 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, + TP_ARGS(trans_fn, caller_ip, btree_id, pos) + ); + ++DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ + TRACE_EVENT(trans_restart_would_deadlock, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, +-- +cgit v1.2.3 + + +From fd005f669d221c0657e4433b39eebffbaf6c6708 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 19 Feb 2022 00:42:12 -0500 +Subject: bcachefs: Check for errors from crypto_skcipher_encrypt() + +Apparently it actually is possible for crypto_skcipher_encrypt() to +return an error - not sure why that would be - but we need to replace +our assertion with actual error handling. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 16 +++++++++++++--- + fs/bcachefs/btree_io.h | 13 ++++++++----- + fs/bcachefs/checksum.c | 47 +++++++++++++++++++++++++++++------------------ + fs/bcachefs/checksum.h | 6 +++--- + fs/bcachefs/io.c | 32 +++++++++++++++++++++++++------- + fs/bcachefs/journal_io.c | 9 +++++++-- + 6 files changed, 85 insertions(+), 38 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index a3651325a022..55c939dc6789 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -922,7 +922,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); + +- bset_encrypt(c, i, b->written << 9); ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting btree node: %i", ret)) ++ goto fsck_err; + + btree_err_on(btree_node_is_extents(b) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), +@@ -949,7 +952,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); + +- bset_encrypt(c, i, b->written << 9); ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting btree node: %i\n", ret)) ++ goto fsck_err; + + sectors = vstruct_sectors(bne, c->block_bits); + } +@@ -1757,6 +1763,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta + unsigned long old, new; + bool validate_before_checksum = false; + void *data; ++ int ret; + + if (already_started) + goto do_write; +@@ -1897,7 +1904,10 @@ do_write: + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; + +- bset_encrypt(c, i, b->written << 9); ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error encrypting btree node: %i\n", ret)) ++ goto err; + + nonce = btree_nonce(i, b->written << 9); + +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 0f20224e2a77..095ad505338d 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -111,22 +111,25 @@ static inline struct nonce btree_nonce(struct bset *i, unsigned offset) + }}; + } + +-static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) ++static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) + { + struct nonce nonce = btree_nonce(i, offset); ++ int ret; + + if (!offset) { + struct btree_node *bn = container_of(i, struct btree_node, keys); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + +- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, +- bytes); ++ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, ++ &bn->flags, bytes); ++ if (ret) ++ return ret; + + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); + } + +- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, +- vstruct_end(i) - (void *) i->_data); ++ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, ++ vstruct_end(i) - (void *) i->_data); + } + + void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +index a1d89923d361..425582f60d7a 100644 +--- a/fs/bcachefs/checksum.c ++++ b/fs/bcachefs/checksum.c +@@ -93,9 +93,9 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * + } + } + +-static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, +- struct nonce nonce, +- struct scatterlist *sg, size_t len) ++static inline int 
do_encrypt_sg(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ struct scatterlist *sg, size_t len) + { + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + int ret; +@@ -104,17 +104,20 @@ static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, + skcipher_request_set_crypt(req, sg, sg, len, nonce.d); + + ret = crypto_skcipher_encrypt(req); +- BUG_ON(ret); ++ if (ret) ++ pr_err("got error %i from crypto_skcipher_encrypt()", ret); ++ ++ return ret; + } + +-static inline void do_encrypt(struct crypto_sync_skcipher *tfm, ++static inline int do_encrypt(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + void *buf, size_t len) + { + struct scatterlist sg; + + sg_init_one(&sg, buf, len); +- do_encrypt_sg(tfm, nonce, &sg, len); ++ return do_encrypt_sg(tfm, nonce, &sg, len); + } + + int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, +@@ -136,25 +139,29 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, + goto err; + } + +- do_encrypt(chacha20, nonce, buf, len); ++ ret = do_encrypt(chacha20, nonce, buf, len); + err: + crypto_free_sync_skcipher(chacha20); + return ret; + } + +-static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, +- struct nonce nonce) ++static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, ++ struct nonce nonce) + { + u8 key[POLY1305_KEY_SIZE]; ++ int ret; + + nonce.d[3] ^= BCH_NONCE_POLY; + + memset(key, 0, sizeof(key)); +- do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ if (ret) ++ return ret; + + desc->tfm = c->poly1305; + crypto_shash_init(desc); + crypto_shash_update(desc, key, sizeof(key)); ++ return 0; + } + + struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, +@@ -196,13 +203,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, + } + } + +-void bch2_encrypt(struct bch_fs *c, unsigned type, ++int bch2_encrypt(struct bch_fs *c, unsigned type, + struct nonce nonce, void *data, size_t len) + { + if (!bch2_csum_type_is_encryption(type)) +- return; ++ return 0; + +- do_encrypt(c->chacha20, nonce, data, len); ++ return do_encrypt(c->chacha20, nonce, data, len); + } + + static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, +@@ -277,23 +284,27 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, + return __bch2_checksum_bio(c, type, nonce, bio, &iter); + } + +-void bch2_encrypt_bio(struct bch_fs *c, unsigned type, +- struct nonce nonce, struct bio *bio) ++int bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) + { + struct bio_vec bv; + struct bvec_iter iter; + struct scatterlist sgl[16], *sg = sgl; + size_t bytes = 0; ++ int ret = 0; + + if (!bch2_csum_type_is_encryption(type)) +- return; ++ return 0; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + + bio_for_each_segment(bv, bio, iter) { + if (sg == sgl + ARRAY_SIZE(sgl)) { + sg_mark_end(sg - 1); +- do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ ++ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ if (ret) ++ return ret; + + nonce = nonce_add(nonce, bytes); + bytes = 0; +@@ -307,7 +318,7 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, + } + + sg_mark_end(sg - 1); +- do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + } + + struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +index f5c1a609c5c4..c86c3c05d620 100644 +--- 
a/fs/bcachefs/checksum.h ++++ b/fs/bcachefs/checksum.h +@@ -49,7 +49,7 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, + int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); + int bch2_request_key(struct bch_sb *, struct bch_key *); + +-void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, ++int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, + void *data, size_t); + + struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, +@@ -61,8 +61,8 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, + struct bch_extent_crc_unpacked *, + unsigned, unsigned, unsigned); + +-void bch2_encrypt_bio(struct bch_fs *, unsigned, +- struct nonce, struct bio *); ++int bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); + + int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, + struct bch_key *); +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 038ec01e48e1..fde10cea0706 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -764,6 +764,7 @@ static int bch2_write_decrypt(struct bch_write_op *op) + struct bch_fs *c = op->c; + struct nonce nonce = extent_nonce(op->version, op->crc); + struct bch_csum csum; ++ int ret; + + if (!bch2_csum_type_is_encryption(op->crc.csum_type)) + return 0; +@@ -778,10 +779,10 @@ static int bch2_write_decrypt(struct bch_write_op *op) + if (bch2_crc_cmp(op->crc.csum, csum)) + return -EIO; + +- bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; +- return 0; ++ return ret; + } + + static enum prep_encoded_ret { +@@ -996,8 +997,11 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + crc.live_size = src_len >> 9; + + swap(dst->bi_iter.bi_size, dst_len); +- bch2_encrypt_bio(c, op->csum_type, +- extent_nonce(version, crc), dst); ++ ret = bch2_encrypt_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ if (ret) ++ goto err; ++ + crc.csum = bch2_checksum_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum_type = op->csum_type; +@@ -1772,6 +1776,7 @@ static void __bch2_read_endio(struct work_struct *work) + struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; + struct bch_csum csum; ++ int ret; + + nofs_flags = memalloc_nofs_save(); + +@@ -1806,7 +1811,10 @@ static void __bch2_read_endio(struct work_struct *work) + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + + if (crc_is_compressed(crc)) { +- bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; ++ + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + goto decompression_err; + } else { +@@ -1817,7 +1825,9 @@ static void __bch2_read_endio(struct work_struct *work) + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + +- bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; +@@ -1830,7 +1840,10 @@ static void __bch2_read_endio(struct work_struct *work) + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ +- bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; ++ + 
promote_start(rbio->promote, rbio); + rbio->promote = NULL; + } +@@ -1865,6 +1878,11 @@ decompression_err: + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; ++decrypt_err: ++ bch_err_inum_ratelimited(c, rbio->read_pos.inode, ++ "decrypt error"); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ goto out; + } + + static void bch2_read_endio(struct bio *bio) +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index b5c204e7c569..2af344de2a3e 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -725,9 +725,11 @@ static int jset_validate(struct bch_fs *c, + sector, le64_to_cpu(jset->seq))) + ret = JOURNAL_ENTRY_BAD; + +- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); ++ bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret); + csum_done: + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && +@@ -1597,9 +1599,12 @@ void bch2_journal_write(struct closure *cl) + jset_validate_for_write(c, jset)) + goto err; + +- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret)) ++ goto err; + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); +-- +cgit v1.2.3 + + +From 222d7cdbd00abccb4eade0a710598eff10766620 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 19 Feb 2022 01:18:18 -0500 +Subject: bcachefs: Store logical location of journal entries + +When viewing what's in the journal, it's more useful to have the logical +location - journal bucket and offset within that bucket - than just the +offset on that device. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 25 +++++++++++++++---------- + fs/bcachefs/journal_io.h | 10 +++++++++- + 2 files changed, 24 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 2af344de2a3e..901e346684fb 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -47,12 +47,12 @@ struct journal_list { + * be replayed: + */ + static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, +- struct bch_extent_ptr entry_ptr, ++ struct journal_ptr entry_ptr, + struct journal_list *jlist, struct jset *j, + bool bad) + { + struct journal_replay *i, *pos, *dup = NULL; +- struct bch_extent_ptr *ptr; ++ struct journal_ptr *ptr; + struct list_head *where; + size_t bytes = vstruct_bytes(j); + u64 last_seq = 0; +@@ -872,9 +872,12 @@ reread: + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + mutex_lock(&jlist->lock); +- ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { +- .dev = ca->dev_idx, +- .offset = offset, ++ ret = journal_entry_add(c, ca, (struct journal_ptr) { ++ .dev = ca->dev_idx, ++ .bucket = bucket, ++ .bucket_offset = offset - ++ bucket_to_sector(ca, ja->buckets[bucket]), ++ .sector = offset, + }, jlist, j, ret != 0); + mutex_unlock(&jlist->lock); + +@@ -965,8 +968,8 @@ err: + goto out; + } + +-static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, +- struct journal_replay *j) ++void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct journal_replay *j) + { + unsigned i; + +@@ -974,13 +977,15 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); + u64 offset; + +- div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); ++ div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); + + if (i) + pr_buf(out, " "); +- pr_buf(out, "%u:%llu (offset %llu)", ++ pr_buf(out, "%u:%u:%u (sector %llu)", + j->ptrs[i].dev, +- (u64) j->ptrs[i].offset, offset); ++ j->ptrs[i].bucket, ++ j->ptrs[i].bucket_offset, ++ j->ptrs[i].sector); + } + } + +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +index d8425fe0d67b..f2001835e43e 100644 +--- a/fs/bcachefs/journal_io.h ++++ b/fs/bcachefs/journal_io.h +@@ -8,7 +8,12 @@ + */ + struct journal_replay { + struct list_head list; +- struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; ++ struct journal_ptr { ++ u8 dev; ++ u32 bucket; ++ u32 bucket_offset; ++ u64 sector; ++ } ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + + /* checksum error, but we may want to try using it anyways: */ +@@ -45,6 +50,9 @@ int bch2_journal_entry_validate(struct bch_fs *, const char *, + void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, + struct jset_entry *); + ++void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct journal_replay *); ++ + int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); + + void bch2_journal_write(struct closure *); +-- +cgit v1.2.3 + + +From cfbabb0eb6a52d6e56a503c7535a1394820075cb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 19 Feb 2022 02:39:56 -0500 +Subject: bcachefs: Delete some flag bits that are no longer used + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 2 -- + fs/bcachefs/recovery.c | 3 --- + fs/bcachefs/super-io.c | 3 --- + 3 files changed, 8 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 0e9689f6878a..b018425394e5 100644 +--- a/fs/bcachefs/bcachefs.h ++++ 
b/fs/bcachefs/bcachefs.h +@@ -507,8 +507,6 @@ struct bch_dev { + + enum { + /* startup: */ +- BCH_FS_INITIALIZED, +- BCH_FS_ALLOC_READ_DONE, + BCH_FS_ALLOC_CLEAN, + BCH_FS_ALLOCATOR_RUNNING, + BCH_FS_ALLOCATOR_STOPPING, +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 543db58ff4d6..d7b96d287ce3 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1111,8 +1111,6 @@ use_clean: + goto err; + bch_verbose(c, "stripes_read done"); + +- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); +- + /* + * If we're not running fsck, this ensures bch2_fsck_err() calls are + * instead interpreted as bch2_inconsistent_err() calls: +@@ -1297,7 +1295,6 @@ int bch2_fs_initialize(struct bch_fs *c) + } + mutex_unlock(&c->sb_lock); + +- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 49dafdad77cd..eae63184ead2 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -455,9 +455,6 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) + + __copy_super(&c->disk_sb, src); + +- if (BCH_SB_INITIALIZED(c->disk_sb.sb)) +- set_bit(BCH_FS_INITIALIZED, &c->flags); +- + ret = bch2_sb_replicas_to_cpu_replicas(c); + if (ret) + return ret; +-- +cgit v1.2.3 + + +From 5e3bf05ad0288e93630ca52f8b14df3e8e0108be Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 19 Feb 2022 02:40:45 -0500 +Subject: bcachefs: Change __bch2_trans_commit() to run triggers then get RW + +This is prep work for the next patch, which is going to change +__bch2_trans_commit() to use bch2_journal_key_insert() when very early +in the recovery process, so that we have a unified interface for doing +btree updates. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 22 +++++++++++----------- + 1 file changed, 11 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 5530941c772b..a08d36c0dc8d 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -987,6 +987,17 @@ int __bch2_trans_commit(struct btree_trans *trans) + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + ++ ret = bch2_trans_commit_run_triggers(trans); ++ if (ret) ++ goto out_reset; ++ ++ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && ++ unlikely(!percpu_ref_tryget(&c->writes))) { ++ ret = bch2_trans_commit_get_rw_cold(trans); ++ if (ret) ++ goto out_reset; ++ } ++ + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + + trans->journal_u64s = trans->extra_journal_entry_u64s; +@@ -997,17 +1008,6 @@ int __bch2_trans_commit(struct btree_trans *trans) + if (trans->journal_transaction_names) + trans->journal_u64s += JSET_ENTRY_LOG_U64s; + +- if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && +- unlikely(!percpu_ref_tryget(&c->writes))) { +- ret = bch2_trans_commit_get_rw_cold(trans); +- if (ret) +- goto out_reset; +- } +- +- ret = bch2_trans_commit_run_triggers(trans); +- if (ret) +- goto out; +- + trans_for_each_update(trans, i) { + BUG_ON(!i->path->should_be_locked); + +-- +cgit v1.2.3 + + +From d0a6369c940dd71fa024c9d290bb1a457738cc7c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 19 Feb 2022 05:15:53 -0500 +Subject: bcachefs: opts.read_journal_only + +Add an option that tells recovery to only read the journal, to be used +by the list_journal command. 
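+
+A minimal sketch of setting this option from code, assuming the opt_set()
+and bch2_opts_empty() helpers that opts.h already provides; the
+journal_dump_opts() wrapper and the pairing with nochanges are made up for
+illustration:
+
+    static struct bch_opts journal_dump_opts(void)
+    {
+            struct bch_opts opts = bch2_opts_empty();
+
+            opt_set(opts, nochanges, true);         /* never issue writes */
+            opt_set(opts, read_journal_only, true); /* stop recovery after the journal read */
+            return opts;
+    }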
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/opts.h | 5 +++++ + fs/bcachefs/recovery.c | 3 +++ + 2 files changed, 8 insertions(+) + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index affe9233d708..bafacf6b46a2 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -329,6 +329,11 @@ enum opt_type { + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Read all journal entries, not just dirty ones")\ ++ x(read_journal_only, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Only read the journal, skip the rest of recovery")\ + x(journal_transaction_names, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index d7b96d287ce3..96f13f2e7a9a 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1054,6 +1054,9 @@ use_clean: + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + ++ if (c->opts.read_journal_only) ++ goto out; ++ + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + drop_alloc_keys(&c->journal_keys); +-- +cgit v1.2.3 + + +From c6dafce0c2a793e2817db243c406ad5563ed908f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 21 Feb 2022 05:05:29 -0500 +Subject: bcachefs: Don't issue discards when in nochanges mode + +When the nochanges option is selected, we're supposed to never issue +writes. Unfortunately, it seems discards were missed when implemnting +this, leading to some painful filesystem corruption. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 3 ++- + fs/bcachefs/journal_reclaim.c | 3 ++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 023db6219ad8..4afb2d457fb0 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -794,7 +794,8 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) + + static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) + { +- if (ca->mi.discard && ++ if (!c->opts.nochanges && ++ ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), + ca->mi.bucket_size, GFP_NOFS, 0); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 52a3935cff53..9467191e182e 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -286,7 +286,8 @@ void bch2_journal_do_discards(struct journal *j) + struct journal_device *ja = &ca->journal; + + while (should_discard_bucket(j, ja)) { +- if (ca->mi.discard && ++ if (!c->opts.nochanges && ++ ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, +-- +cgit v1.2.3 + + +From 14987972645526a15a1c6b6e2dd5f665dcc648ed Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 20 Feb 2022 04:52:44 -0500 +Subject: bcachefs: Kill bch_scnmemcpy() + +bch_scnmemcpy was for printing length-limited strings that might not +have a terminating null - turns out sprintf & pr_buf can do this with +%.*s. 
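+
+A minimal standalone illustration of the %.*s idiom (userspace C, names
+made up): the int precision bounds how many bytes are read, so the source
+buffer does not need a terminating NUL.
+
+    #include <stdio.h>
+
+    int main(void)
+    {
+            char name[4] = { 'r', 'o', 'o', 't' };  /* no trailing NUL */
+
+            /* prints "root": stops after 4 bytes, or at a NUL if one comes first */
+            printf("%.*s\n", (int) sizeof(name), name);
+            return 0;
+    }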
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/dirent.c | 6 +++--- + fs/bcachefs/disk_groups.c | 9 ++++----- + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/util.c | 13 ------------- + fs/bcachefs/util.h | 2 -- + fs/bcachefs/xattr.c | 10 +++++----- + 6 files changed, 13 insertions(+), 29 deletions(-) + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 6f699b736b34..a43a24409d37 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -122,9 +122,9 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + { + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + +- bch_scnmemcpy(out, d.v->d_name, +- bch2_dirent_name_bytes(d)); +- pr_buf(out, " -> %llu type %s", ++ pr_buf(out, "%.*s -> %llu type %s", ++ bch2_dirent_name_bytes(d), ++ d.v->d_name, + d.v->d_type != DT_SUBVOL + ? le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), +diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c +index 6c84297ef265..2fee14783b4b 100644 +--- a/fs/bcachefs/disk_groups.c ++++ b/fs/bcachefs/disk_groups.c +@@ -76,8 +76,9 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, + for (g = sorted; g + 1 < sorted + nr_groups; g++) + if (!BCH_GROUP_DELETED(g) && + !group_cmp(&g[0], &g[1])) { +- pr_buf(err, "duplicate label %llu.", BCH_GROUP_PARENT(g)); +- bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label))); ++ pr_buf(err, "duplicate label %llu.%.*s", ++ BCH_GROUP_PARENT(g), ++ (int) sizeof(g->label), g->label); + goto err; + } + +@@ -376,9 +377,7 @@ void bch2_disk_path_to_text(struct printbuf *out, + v = path[--nr]; + g = groups->entries + v; + +- bch_scnmemcpy(out, g->label, +- strnlen(g->label, sizeof(g->label))); +- ++ pr_buf(out, "%.*s", (int) sizeof(g->label), g->label); + if (nr) + pr_buf(out, "."); + } +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 901e346684fb..c4660a1fee00 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -596,7 +596,7 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); + unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); + +- bch_scnmemcpy(out, l->d, strnlen(l->d, bytes)); ++ pr_buf(out, "%.*s", bytes, l->d); + } + + struct jset_entry_ops { +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index 0bbea332fcaa..0cf09f13504b 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -579,19 +579,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) + } + } + +-void bch_scnmemcpy(struct printbuf *out, +- const char *src, size_t len) +-{ +- size_t n = printbuf_remaining(out); +- +- if (n) { +- n = min(n - 1, len); +- memcpy(out->pos, src, n); +- out->pos += n; +- *out->pos = '\0'; +- } +-} +- + #include "eytzinger.h" + + static int alignment_ok(const void *base, size_t align) +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index e55407dc5324..a80d4ec22d95 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -281,8 +281,6 @@ static inline void printbuf_newline(struct printbuf *buf) + pr_buf(buf, " "); + } + +-void bch_scnmemcpy(struct printbuf *, const char *, size_t); +- + int bch2_strtoint_h(const char *, int *); + int bch2_strtouint_h(const char *, unsigned int *); + int bch2_strtoll_h(const char *, long long *); +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 4d7db64e3ef3..1673654fff3e 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -111,11 
+111,11 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, + else + pr_buf(out, "(unknown type %u)", xattr.v->x_type); + +- bch_scnmemcpy(out, xattr.v->x_name, +- xattr.v->x_name_len); +- pr_buf(out, ":"); +- bch_scnmemcpy(out, xattr_val(xattr.v), +- le16_to_cpu(xattr.v->x_val_len)); ++ pr_buf(out, "%.*s:%.*s", ++ xattr.v->x_name_len, ++ xattr.v->x_name, ++ le16_to_cpu(xattr.v->x_val_len), ++ (char *) xattr_val(xattr.v)); + } + + static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, +-- +cgit v1.2.3 + + +From fe7d503b570a6e9db26b0b1bb7cbd55310856ea1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 20 Feb 2022 05:00:45 -0500 +Subject: bcachefs: Add .to_text() methods for all superblock sections + +This patch improves the superblock .to_text() methods and adds methods +for all types that were missing them. It also improves printbufs by +allowing them to specfiy what units we want to be printing in, and adds +new wrapper methods for unifying our kernel and userspace environments. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/disk_groups.c | 40 +++- + fs/bcachefs/disk_groups.h | 5 +- + fs/bcachefs/extents.c | 18 +- + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/journal_seq_blacklist.c | 1 + + fs/bcachefs/quota.c | 43 ++++- + fs/bcachefs/recovery.c | 2 +- + fs/bcachefs/replicas.c | 85 +++++++-- + fs/bcachefs/replicas.h | 1 + + fs/bcachefs/super-io.c | 364 ++++++++++++++++++++++++++++++++++-- + fs/bcachefs/super-io.h | 4 +- + fs/bcachefs/sysfs.c | 2 +- + fs/bcachefs/util.c | 21 +++ + fs/bcachefs/util.h | 64 +++++-- + 14 files changed, 588 insertions(+), 64 deletions(-) + +diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c +index 2fee14783b4b..19698e504b3c 100644 +--- a/fs/bcachefs/disk_groups.c ++++ b/fs/bcachefs/disk_groups.c +@@ -343,12 +343,10 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) + return v; + } + +-void bch2_disk_path_to_text(struct printbuf *out, +- struct bch_sb_handle *sb, +- unsigned v) ++void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) + { + struct bch_sb_field_disk_groups *groups = +- bch2_sb_get_disk_groups(sb->sb); ++ bch2_sb_get_disk_groups(sb); + struct bch_disk_group *g; + unsigned nr = 0; + u16 path[32]; +@@ -383,7 +381,7 @@ void bch2_disk_path_to_text(struct printbuf *out, + } + return; + inval: +- pr_buf(out, "invalid group %u", v); ++ pr_buf(out, "invalid label %u", v); + } + + int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +@@ -447,6 +445,36 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) + return -EINVAL; + } + ++void bch2_sb_target_to_text(struct printbuf *out, struct bch_sb *sb, u64 v) ++{ ++ struct target t = target_decode(v); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ pr_buf(out, "none"); ++ break; ++ case TARGET_DEV: { ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_member *m = mi->members + t.dev; ++ ++ if (bch2_dev_exists(sb, mi, t.dev)) { ++ pr_buf(out, "Device "); ++ pr_uuid(out, m->uuid.b); ++ pr_buf(out, " (%u)", t.dev); ++ } else { ++ pr_buf(out, "Bad device %u", t.dev); ++ } ++ ++ break; ++ } ++ case TARGET_GROUP: ++ bch2_disk_path_to_text(out, sb, t.group); ++ break; ++ default: ++ BUG(); ++ } ++} ++ + void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) + { + struct target t = target_decode(v); +@@ -480,7 +508,7 @@ void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) + 
} + case TARGET_GROUP: + mutex_lock(&c->sb_lock); +- bch2_disk_path_to_text(out, &c->disk_sb, t.group); ++ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); + mutex_unlock(&c->sb_lock); + break; + default: +diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h +index 3d84f23c34ed..a274aacbdf92 100644 +--- a/fs/bcachefs/disk_groups.h ++++ b/fs/bcachefs/disk_groups.h +@@ -75,8 +75,9 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *); + /* Exported for userspace bcachefs-tools: */ + int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); + +-void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, +- unsigned); ++void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); ++ ++void bch2_sb_target_to_text(struct printbuf *, struct bch_sb *, u64); + + int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); + void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 44c584e9adaa..cc50e4b28882 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -954,15 +954,19 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); +- ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] +- ? bch_dev_bkey_exists(c, ptr->dev) +- : NULL; + +- pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, ++ pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, +- ptr->cached ? " cached" : "", +- ca && ptr_stale(ca, ptr) +- ? " stale" : ""); ++ ptr->cached ? " cached" : ""); ++ ++ if (c) { ++ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ? bch_dev_bkey_exists(c, ptr->dev) ++ : NULL; ++ ++ if (ca && ptr_stale(ca, ptr)) ++ pr_buf(out, " stale"); ++ } + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index c4660a1fee00..05c109262049 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -303,7 +303,7 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs + + vstruct_for_each(entry, k) { + if (!first) { +- printbuf_newline(out); ++ pr_newline(out); + pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + } + pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index 3cc63fc202ab..3140c8731431 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -235,6 +235,7 @@ static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, + le64_to_cpu(i->start), + le64_to_cpu(i->end)); + } ++ pr_newline(out); + } + + const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 6fb8224f565e..b7ef8fa7bbc9 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -6,7 +6,18 @@ + #include "subvolume.h" + #include "super-io.h" + +-static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f, ++static const char * const bch2_quota_types[] = { ++ "user", ++ "group", ++ "project", ++}; ++ ++static const char * const bch2_quota_counters[] = { ++ "space", ++ "inodes", ++}; ++ ++static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) + { + struct bch_sb_field_quota *q = field_to_type(f, quota); 
+@@ -14,13 +25,36 @@ static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f, + if (vstruct_bytes(&q->field) < sizeof(*q)) { + pr_buf(err, "wrong size (got %llu should be %zu)", + vstruct_bytes(&q->field), sizeof(*q)); ++ return -EINVAL; + } + + return 0; + } + ++static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ unsigned qtyp, counter; ++ ++ for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { ++ pr_buf(out, "%s: flags %llx", ++ bch2_quota_types[qtyp], ++ le64_to_cpu(q->q[qtyp].flags)); ++ ++ for (counter = 0; counter < Q_COUNTERS; counter++) ++ pr_buf(out, " %s timelimit %u warnlimit %u", ++ bch2_quota_counters[counter], ++ le32_to_cpu(q->q[qtyp].c[counter].timelimit), ++ le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); ++ ++ pr_newline(out); ++ } ++} ++ + const struct bch_sb_field_ops bch_sb_field_ops_quota = { +- .validate = bch2_sb_validate_quota, ++ .validate = bch2_sb_quota_validate, ++ .to_text = bch2_sb_quota_to_text, + }; + + const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) +@@ -34,11 +68,6 @@ const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) + return NULL; + } + +-static const char * const bch2_quota_counters[] = { +- "space", +- "inodes", +-}; +- + void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) + { +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 96f13f2e7a9a..ed25595275fc 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -803,7 +803,7 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) + return ERR_PTR(-ENOMEM); + } + +- ret = bch2_sb_clean_validate(c, clean, READ); ++ ret = bch2_sb_clean_validate_late(c, clean, READ); + if (ret) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 96994b7a75a5..6c1d42f1c92c 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -36,6 +36,22 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) + eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); + } + ++void bch2_replicas_entry_v0_to_text(struct printbuf *out, ++ struct bch_replicas_entry_v0 *e) ++{ ++ unsigned i; ++ ++ if (e->data_type < BCH_DATA_NR) ++ pr_buf(out, "%s", bch2_data_types[e->data_type]); ++ else ++ pr_buf(out, "(invalid data type %u)", e->data_type); ++ ++ pr_buf(out, ": %u [", e->nr_devs); ++ for (i = 0; i < e->nr_devs; i++) ++ pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); ++ pr_buf(out, "]"); ++} ++ + void bch2_replicas_entry_to_text(struct printbuf *out, + struct bch_replicas_entry *e) + { +@@ -860,7 +876,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, + return 0; + } + +-static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f, ++static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) + { + struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); +@@ -890,14 +906,15 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, + + bch2_replicas_entry_to_text(out, e); + } ++ pr_newline(out); + } + + const struct bch_sb_field_ops bch_sb_field_ops_replicas = { +- .validate = bch2_sb_validate_replicas, ++ .validate = bch2_sb_replicas_validate, + .to_text = bch2_sb_replicas_to_text, + }; + +-static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f, ++static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) + { + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); +@@ -912,8 +929,27 @@ static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field * + return ret; + } + ++static void bch2_sb_replicas_v0_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_replicas_entry_v0 *e; ++ bool first = true; ++ ++ for_each_replicas_entry(sb_r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_v0_to_text(out, e); ++ } ++ pr_newline(out); ++} ++ + const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { +- .validate = bch2_sb_validate_replicas_v0, ++ .validate = bch2_sb_replicas_v0_validate, ++ .to_text = bch2_sb_replicas_v0_to_text, + }; + + /* Query replicas: */ +@@ -970,19 +1006,42 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + return ret; + } + +-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) ++unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) + { +- struct bch_replicas_entry *e; +- unsigned i, ret = 0; ++ struct bch_sb_field_replicas *replicas; ++ struct bch_sb_field_replicas_v0 *replicas_v0; ++ unsigned i, data_has = 0; ++ ++ replicas = bch2_sb_get_replicas(sb); ++ replicas_v0 = bch2_sb_get_replicas_v0(sb); ++ ++ if (replicas) { ++ struct bch_replicas_entry *r; ++ ++ for_each_replicas_entry(replicas, r) ++ for (i = 0; i < r->nr_devs; i++) ++ if (r->devs[i] == dev) ++ data_has |= 1 << r->data_type; ++ } else if (replicas_v0) { ++ struct bch_replicas_entry_v0 *r; ++ ++ for_each_replicas_entry_v0(replicas_v0, r) ++ for (i = 0; i < r->nr_devs; i++) ++ if (r->devs[i] == dev) ++ data_has |= 1 << r->data_type; ++ } + +- percpu_down_read(&c->mark_lock); + +- for_each_cpu_replicas_entry(&c->replicas, e) +- for (i = 0; i < e->nr_devs; i++) +- if (e->devs[i] == ca->dev_idx) +- ret |= 1 << e->data_type; ++ return data_has; ++} + +- percpu_up_read(&c->mark_lock); ++unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned ret; ++ ++ mutex_lock(&c->sb_lock); ++ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); ++ mutex_unlock(&c->sb_lock); + + return ret; + } +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +index d237d7c51ccb..87820b2e1ad3 100644 +--- a/fs/bcachefs/replicas.h ++++ b/fs/bcachefs/replicas.h +@@ -64,6 +64,7 @@ static inline void bch2_replicas_entry_cached(struct 
bch_replicas_entry *e, + bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, + unsigned, bool); + ++unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); + unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); + + int bch2_replicas_gc_end(struct bch_fs *, int); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index eae63184ead2..08613a73cb54 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -918,7 +918,7 @@ static int u64_cmp(const void *_l, const void *_r) + return l < r ? -1 : l > r ? 1 : 0; + } + +-static int bch2_sb_validate_journal(struct bch_sb *sb, ++static int bch2_sb_journal_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) + { +@@ -971,13 +971,26 @@ err: + return ret; + } + ++static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ unsigned i, nr = bch2_nr_journal_buckets(journal); ++ ++ pr_buf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); ++ pr_newline(out); ++} ++ + static const struct bch_sb_field_ops bch_sb_field_ops_journal = { +- .validate = bch2_sb_validate_journal, ++ .validate = bch2_sb_journal_validate, ++ .to_text = bch2_sb_journal_to_text, + }; + + /* BCH_SB_FIELD_members: */ + +-static int bch2_sb_validate_members(struct bch_sb *sb, ++static int bch2_sb_members_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) + { +@@ -1027,13 +1040,105 @@ static int bch2_sb_validate_members(struct bch_sb *sb, + return 0; + } + ++static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); ++ unsigned i; ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ unsigned data_have = bch2_sb_dev_has_data(sb, i); ++ u64 bucket_size = le16_to_cpu(m->bucket_size); ++ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ pr_buf(out, "Device: %u", i); ++ pr_newline(out); ++ ++ printbuf_indent_push(out, 2); ++ ++ pr_buf(out, "UUID: "); ++ pr_uuid(out, m->uuid.b); ++ pr_newline(out); ++ ++ pr_buf(out, "Size: "); ++ pr_units(out, device_size, device_size << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "Bucket size: "); ++ pr_units(out, bucket_size, bucket_size << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "First bucket: %u", ++ le16_to_cpu(m->first_bucket)); ++ pr_newline(out); ++ ++ pr_buf(out, "Buckets: %llu", ++ le64_to_cpu(m->nbuckets)); ++ pr_newline(out); ++ ++ pr_buf(out, "Last mount: "); ++ if (m->last_mount) ++ pr_time(out, le64_to_cpu(m->last_mount)); ++ else ++ pr_buf(out, "(never)"); ++ pr_newline(out); ++ ++ pr_buf(out, "State: %s", ++ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR ++ ? 
bch2_member_states[BCH_MEMBER_STATE(m)] ++ : "unknown"); ++ pr_newline(out); ++ ++ pr_buf(out, "Group: "); ++ if (BCH_MEMBER_GROUP(m)) { ++ unsigned idx = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (idx < disk_groups_nr(gi)) ++ pr_buf(out, "%s (%u)", ++ gi->entries[idx].label, idx); ++ else ++ pr_buf(out, "(bad disk labels section)"); ++ } else { ++ pr_buf(out, "(none)"); ++ } ++ pr_newline(out); ++ ++ pr_buf(out, "Data allowed: "); ++ if (BCH_MEMBER_DATA_ALLOWED(m)) ++ bch2_flags_to_text(out, bch2_data_types, ++ BCH_MEMBER_DATA_ALLOWED(m)); ++ else ++ pr_buf(out, "(none)"); ++ pr_newline(out); ++ ++ pr_buf(out, "Has data: "); ++ if (data_have) ++ bch2_flags_to_text(out, bch2_data_types, data_have); ++ else ++ pr_buf(out, "(none)"); ++ pr_newline(out); ++ ++ pr_buf(out, "Discard: %llu", ++ BCH_MEMBER_DISCARD(m)); ++ pr_newline(out); ++ ++ printbuf_indent_pop(out, 2); ++ } ++} ++ + static const struct bch_sb_field_ops bch_sb_field_ops_members = { +- .validate = bch2_sb_validate_members, ++ .validate = bch2_sb_members_validate, ++ .to_text = bch2_sb_members_to_text, + }; + + /* BCH_SB_FIELD_crypt: */ + +-static int bch2_sb_validate_crypt(struct bch_sb *sb, ++static int bch2_sb_crypt_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) + { +@@ -1053,13 +1158,29 @@ static int bch2_sb_validate_crypt(struct bch_sb *sb, + return 0; + } + ++static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ pr_buf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ pr_newline(out); ++ pr_buf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); ++ pr_newline(out); ++ pr_buf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); ++ pr_newline(out); ++ pr_buf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); ++ pr_newline(out); ++} ++ + static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { +- .validate = bch2_sb_validate_crypt, ++ .validate = bch2_sb_crypt_validate, ++ .to_text = bch2_sb_crypt_to_text, + }; + + /* BCH_SB_FIELD_clean: */ + +-int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) ++int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) + { + struct jset_entry *entry; + int ret; +@@ -1248,7 +1369,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) + * this should be in the write path, and we should be validating every + * superblock section: + */ +- ret = bch2_sb_clean_validate(c, sb_clean, WRITE); ++ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); + goto out; +@@ -1259,7 +1380,7 @@ out: + mutex_unlock(&c->sb_lock); + } + +-static int bch2_sb_validate_clean(struct bch_sb *sb, ++static int bch2_sb_clean_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) + { +@@ -1274,8 +1395,32 @@ static int bch2_sb_validate_clean(struct bch_sb *sb, + return 0; + } + ++static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ struct jset_entry *entry; ++ ++ pr_buf(out, "flags: %x", le32_to_cpu(clean->flags)); ++ pr_newline(out); ++ pr_buf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); ++ pr_newline(out); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ if (entry->type == BCH_JSET_ENTRY_btree_keys && ++ !entry->u64s) 
++ continue; ++ ++ bch2_journal_entry_to_text(out, NULL, entry); ++ pr_newline(out); ++ } ++} ++ + static const struct bch_sb_field_ops bch_sb_field_ops_clean = { +- .validate = bch2_sb_validate_clean, ++ .validate = bch2_sb_clean_validate, ++ .to_text = bch2_sb_clean_to_text, + }; + + static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { +@@ -1299,7 +1444,7 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, + + ret = bch2_sb_field_ops[type]->validate(sb, f, &err); + if (ret) { +- pr_buf(&err, "\n"); ++ pr_newline(&err); + bch2_sb_field_to_text(&err, sb, f); + *orig_err = err; + } +@@ -1320,7 +1465,202 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + pr_buf(out, "(unknown field %u)", type); + + pr_buf(out, " (size %llu):", vstruct_bytes(f)); ++ pr_newline(out); + +- if (ops && ops->to_text) ++ if (ops && ops->to_text) { ++ printbuf_indent_push(out, 2); + bch2_sb_field_ops[type]->to_text(out, sb, f); ++ printbuf_indent_pop(out, 2); ++ } ++} ++ ++void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) ++{ ++ unsigned i; ++ ++ pr_buf(out, "Type: %u", l->layout_type); ++ pr_newline(out); ++ ++ pr_buf(out, "Superblock max size: "); ++ pr_units(out, ++ 1 << l->sb_max_size_bits, ++ 512 << l->sb_max_size_bits); ++ pr_newline(out); ++ ++ pr_buf(out, "Nr superblocks: %u", l->nr_superblocks); ++ pr_newline(out); ++ ++ pr_buf(out, "Offsets: "); ++ for (i = 0; i < l->nr_superblocks; i++) { ++ if (i) ++ pr_buf(out, ", "); ++ pr_buf(out, "%llu", le64_to_cpu(l->sb_offset[i])); ++ } ++ pr_newline(out); ++} ++ ++void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, ++ bool print_layout, unsigned fields) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field *f; ++ u64 fields_have = 0; ++ unsigned nr_devices = 0; ++ ++ mi = bch2_sb_get_members(sb); ++ if (mi) { ++ struct bch_member *m; ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) ++ nr_devices += bch2_member_exists(m); ++ } ++ ++ pr_buf(out, "External UUID: "); ++ pr_uuid(out, sb->user_uuid.b); ++ pr_newline(out); ++ ++ pr_buf(out, "Internal UUID: "); ++ pr_uuid(out, sb->uuid.b); ++ pr_newline(out); ++ ++ pr_buf(out, "Device index: %u", sb->dev_idx); ++ pr_newline(out); ++ ++ pr_buf(out, "Label: "); ++ pr_buf(out, "%.*s", (int) sizeof(sb->label), sb->label); ++ pr_newline(out); ++ ++ pr_buf(out, "Version: %u", le16_to_cpu(sb->version)); ++ pr_newline(out); ++ ++ pr_buf(out, "Oldest version on disk: %u", le16_to_cpu(sb->version_min)); ++ pr_newline(out); ++ ++ pr_buf(out, "Created: "); ++ if (sb->time_base_lo) ++ pr_time(out, le64_to_cpu(sb->time_base_lo) / NSEC_PER_SEC); ++ else ++ pr_buf(out, "(not set)"); ++ pr_newline(out); ++ ++ pr_buf(out, "Squence number: %llu", le64_to_cpu(sb->seq)); ++ pr_newline(out); ++ ++ pr_buf(out, "Block_size: "); ++ pr_units(out, le16_to_cpu(sb->block_size), ++ (u32) le16_to_cpu(sb->block_size) << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "Btree node size: "); ++ pr_units(out, BCH_SB_BTREE_NODE_SIZE(sb), ++ BCH_SB_BTREE_NODE_SIZE(sb) << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "Error action: %s", ++ BCH_SB_ERROR_ACTION(sb) < BCH_ON_ERROR_NR ++ ? 
bch2_error_actions[BCH_SB_ERROR_ACTION(sb)] ++ : "unknown"); ++ pr_newline(out); ++ ++ pr_buf(out, "Clean: %llu", BCH_SB_CLEAN(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Features: "); ++ bch2_flags_to_text(out, bch2_sb_features, ++ le64_to_cpu(sb->features[0])); ++ pr_newline(out); ++ ++ pr_buf(out, "Compat features: "); ++ bch2_flags_to_text(out, bch2_sb_compat, ++ le64_to_cpu(sb->compat[0])); ++ pr_newline(out); ++ ++ pr_buf(out, "Metadata replicas: %llu", BCH_SB_META_REPLICAS_WANT(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Data replicas: %llu", BCH_SB_DATA_REPLICAS_WANT(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Metadata checksum type: %s (%llu)", ++ BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR ++ ? bch2_csum_opts[BCH_SB_META_CSUM_TYPE(sb)] ++ : "unknown", ++ BCH_SB_META_CSUM_TYPE(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Data checksum type: %s (%llu)", ++ BCH_SB_DATA_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR ++ ? bch2_csum_opts[BCH_SB_DATA_CSUM_TYPE(sb)] ++ : "unknown", ++ BCH_SB_DATA_CSUM_TYPE(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Compression type: %s (%llu)", ++ BCH_SB_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_OPT_NR ++ ? bch2_compression_opts[BCH_SB_COMPRESSION_TYPE(sb)] ++ : "unknown", ++ BCH_SB_COMPRESSION_TYPE(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Foreground write target: "); ++ bch2_sb_target_to_text(out, sb, BCH_SB_FOREGROUND_TARGET(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Background write target: "); ++ bch2_sb_target_to_text(out, sb, BCH_SB_BACKGROUND_TARGET(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Promote target: "); ++ bch2_sb_target_to_text(out, sb, BCH_SB_PROMOTE_TARGET(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Metadata target: "); ++ bch2_sb_target_to_text(out, sb, BCH_SB_METADATA_TARGET(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "String hash type: %s (%llu)", ++ BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR ++ ? 
bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)] ++ : "unknown", ++ BCH_SB_STR_HASH_TYPE(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "32 bit inodes: %llu", BCH_SB_INODE_32BIT(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "GC reserve percentage: %llu%%", BCH_SB_GC_RESERVE(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Root reserve percentage: %llu%%", BCH_SB_ROOT_RESERVE(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Devices: %u live, %u total", ++ nr_devices, sb->nr_devices); ++ pr_newline(out); ++ ++ pr_buf(out, "Sections: "); ++ vstruct_for_each(sb, f) ++ fields_have |= 1 << le32_to_cpu(f->type); ++ bch2_flags_to_text(out, bch2_sb_fields, fields_have); ++ pr_newline(out); ++ ++ pr_buf(out, "Superblock size: %llu", vstruct_bytes(sb)); ++ pr_newline(out); ++ ++ if (print_layout) { ++ pr_newline(out); ++ pr_buf(out, "layout:"); ++ pr_newline(out); ++ printbuf_indent_push(out, 2); ++ bch2_sb_layout_to_text(out, &sb->layout); ++ printbuf_indent_pop(out, 2); ++ } ++ ++ vstruct_for_each(sb, f) ++ if (fields & (1 << le32_to_cpu(f->type))) { ++ pr_newline(out); ++ bch2_sb_field_to_text(out, sb, f); ++ } + } +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +index 3b425bed17c4..50f31a3b9b18 100644 +--- a/fs/bcachefs/super-io.h ++++ b/fs/bcachefs/super-io.h +@@ -121,12 +121,14 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) + void bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry **, u64); + +-int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int); ++int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); + + int bch2_fs_mark_dirty(struct bch_fs *); + void bch2_fs_mark_clean(struct bch_fs *); + + void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); ++void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); ++void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); + + #endif /* _BCACHEFS_SUPER_IO_H */ +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index b727845dd64b..1a3068f658a1 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -825,7 +825,7 @@ SHOW(bch2_dev) + if (attr == &sysfs_label) { + if (ca->mi.group) { + mutex_lock(&c->sb_lock); +- bch2_disk_path_to_text(&out, &c->disk_sb, ++ bch2_disk_path_to_text(&out, c->disk_sb.sb, + ca->mi.group - 1); + mutex_unlock(&c->sb_lock); + } +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index 0cf09f13504b..ab808fe4d64d 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -120,6 +120,27 @@ void bch2_hprint(struct printbuf *buf, s64 v) + pr_buf(buf, "%c", si_units[u]); + } + ++void bch2_pr_units(struct printbuf *out, s64 raw, s64 bytes) ++{ ++ if (raw < 0) { ++ pr_buf(out, "-"); ++ raw = -raw; ++ bytes = -bytes; ++ } ++ ++ switch (out->units) { ++ case PRINTBUF_UNITS_RAW: ++ pr_buf(out, "%llu", raw); ++ break; ++ case PRINTBUF_UNITS_BYTES: ++ pr_buf(out, "%llu", bytes); ++ break; ++ case PRINTBUF_UNITS_HUMAN_READABLE: ++ bch2_hprint(out, bytes); ++ break; ++ } ++} ++ + void bch2_string_opt_to_text(struct printbuf *out, + const char * const list[], + size_t selected) +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index a80d4ec22d95..b43c195381f8 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -235,10 +235,17 @@ do { \ + #define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + ++enum printbuf_units { ++ PRINTBUF_UNITS_RAW, ++ PRINTBUF_UNITS_BYTES, ++ PRINTBUF_UNITS_HUMAN_READABLE, ++}; ++ + 
struct printbuf { +- char *pos; +- char *end; +- unsigned indent; ++ char *pos; ++ char *end; ++ unsigned indent; ++ enum printbuf_units units; + }; + + static inline size_t printbuf_remaining(struct printbuf *buf) +@@ -272,7 +279,7 @@ static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces) + buf->indent -= spaces; + } + +-static inline void printbuf_newline(struct printbuf *buf) ++static inline void pr_newline(struct printbuf *buf) + { + unsigned i; + +@@ -281,6 +288,46 @@ static inline void printbuf_newline(struct printbuf *buf) + pr_buf(buf, " "); + } + ++void bch2_pr_units(struct printbuf *, s64, s64); ++#define pr_units(...) bch2_pr_units(__VA_ARGS__) ++ ++#ifdef __KERNEL__ ++static inline void pr_time(struct printbuf *out, u64 time) ++{ ++ pr_buf(out, "%llu", time); ++} ++#else ++#include ++static inline void pr_time(struct printbuf *out, u64 _time) ++{ ++ char time_str[64]; ++ time_t time = _time; ++ struct tm *tm = localtime(&time); ++ size_t err = strftime(time_str, sizeof(time_str), "%c", tm); ++ if (!err) ++ pr_buf(out, "(formatting error)"); ++ else ++ pr_buf(out, "%s", time_str); ++} ++#endif ++ ++#ifdef __KERNEL__ ++static inline void uuid_unparse_lower(u8 *uuid, char *out) ++{ ++ sprintf(out, "%plU", uuid); ++} ++#else ++#include ++#endif ++ ++static inline void pr_uuid(struct printbuf *out, u8 *uuid) ++{ ++ char uuid_str[40]; ++ ++ uuid_unparse_lower(uuid, uuid_str); ++ pr_buf(out, uuid_str); ++} ++ + int bch2_strtoint_h(const char *, int *); + int bch2_strtouint_h(const char *, unsigned int *); + int bch2_strtoll_h(const char *, long long *); +@@ -766,13 +813,4 @@ static inline int u8_cmp(u8 l, u8 r) + return cmp_int(l, r); + } + +-#ifdef __KERNEL__ +-static inline void uuid_unparse_lower(u8 *uuid, char *out) +-{ +- sprintf(out, "%plU", uuid); +-} +-#else +-#include +-#endif +- + #endif /* _BCACHEFS_UTIL_H */ +-- +cgit v1.2.3 + + +From ca304b7397bb63f5173c79a493736b2fbeb39f66 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 21 Feb 2022 13:22:11 -0500 +Subject: bcachefs: Fix a use after free + +In move_read_endio, we were checking if the next pending write has its +read completed - but this can turn after a use after free (and we were +accessing the list without a lock), so instead just better to just +unconditionally do the wakeup. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 7ca7ce394135..16bca1446a2b 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -481,9 +481,7 @@ static void move_read_endio(struct bio *bio) + atomic_sub(io->read_sectors, &ctxt->read_sectors); + io->read_completed = true; + +- if (next_pending_write(ctxt)) +- wake_up(&ctxt->wait); +- ++ wake_up(&ctxt->wait); + closure_put(&ctxt->cl); + } + +-- +cgit v1.2.3 + + +From 86ea38a2814dfa603ce52c9aceb8d376276ce715 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 22 Feb 2022 04:53:48 -0500 +Subject: bcachefs: Add tabstops to printbufs + +Now, when outputting to printbufs, we can set tabstops and left or right +justify text to them - this is to be used by the userspace 'bcachefs fs +usage' command. 
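+
+A usage sketch based on the pr_tab()/pr_tab_rjust() helpers added below; the
+column positions, the field name and the example_to_text() wrapper are
+assumed for illustration:
+
+    static void example_to_text(struct printbuf *out, u64 nbuckets)
+    {
+            out->tabstops[0] = 20;          /* values start at column 20 */
+            out->tabstops[1] = 36;          /* numbers right-justified to column 36 */
+
+            pr_buf(out, "Buckets:");
+            pr_tab(out);                    /* pad with spaces up to column 20 */
+            pr_buf(out, "%llu", nbuckets);
+            pr_tab_rjust(out);              /* shift the number so it ends at column 36 */
+            pr_newline(out);                /* resets the tabstop index for the next line */
+    }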
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 12 ++++---- + fs/bcachefs/util.c | 8 +---- + fs/bcachefs/util.h | 84 +++++++++++++++++++++++++++++++++++++++++++++----- + 3 files changed, 84 insertions(+), 20 deletions(-) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 08613a73cb54..c22e2c03fc06 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -1059,7 +1059,7 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + pr_buf(out, "Device: %u", i); + pr_newline(out); + +- printbuf_indent_push(out, 2); ++ pr_indent_push(out, 2); + + pr_buf(out, "UUID: "); + pr_uuid(out, m->uuid.b); +@@ -1127,7 +1127,7 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + BCH_MEMBER_DISCARD(m)); + pr_newline(out); + +- printbuf_indent_pop(out, 2); ++ pr_indent_pop(out, 2); + } + } + +@@ -1468,9 +1468,9 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + pr_newline(out); + + if (ops && ops->to_text) { +- printbuf_indent_push(out, 2); ++ pr_indent_push(out, 2); + bch2_sb_field_ops[type]->to_text(out, sb, f); +- printbuf_indent_pop(out, 2); ++ pr_indent_pop(out, 2); + } + } + +@@ -1653,9 +1653,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + pr_newline(out); + pr_buf(out, "layout:"); + pr_newline(out); +- printbuf_indent_push(out, 2); ++ pr_indent_push(out, 2); + bch2_sb_layout_to_text(out, &sb->layout); +- printbuf_indent_pop(out, 2); ++ pr_indent_pop(out, 2); + } + + vstruct_for_each(sb, f) +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index ab808fe4d64d..26d0ae304cb2 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -117,17 +117,11 @@ void bch2_hprint(struct printbuf *buf, s64 v) + if (u && t && v < 100 && v > -100) + pr_buf(buf, ".%i", t / 103); + if (u) +- pr_buf(buf, "%c", si_units[u]); ++ pr_char(buf, si_units[u]); + } + + void bch2_pr_units(struct printbuf *out, s64 raw, s64 bytes) + { +- if (raw < 0) { +- pr_buf(out, "-"); +- raw = -raw; +- bytes = -bytes; +- } +- + switch (out->units) { + case PRINTBUF_UNITS_RAW: + pr_buf(out, "%llu", raw); +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index b43c195381f8..9e0a3b46060b 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -244,8 +244,12 @@ enum printbuf_units { + struct printbuf { + char *pos; + char *end; ++ char *last_newline; ++ char *last_field; + unsigned indent; + enum printbuf_units units; ++ unsigned tabstop; ++ unsigned tabstops[4]; + }; + + static inline size_t printbuf_remaining(struct printbuf *buf) +@@ -253,29 +257,49 @@ static inline size_t printbuf_remaining(struct printbuf *buf) + return buf->end - buf->pos; + } + ++static inline size_t printbuf_linelen(struct printbuf *buf) ++{ ++ return buf->pos - buf->last_newline; ++} ++ + #define _PBUF(_buf, _len) \ + ((struct printbuf) { \ +- .pos = _buf, \ +- .end = _buf + _len, \ ++ .pos = _buf, \ ++ .end = _buf + _len, \ ++ .last_newline = _buf, \ ++ .last_field = _buf, \ + }) + + #define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) + ++ + #define pr_buf(_out, ...) 
\ + do { \ + (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ + __VA_ARGS__); \ + } while (0) + +-static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces) ++static inline void pr_char(struct printbuf *out, char c) ++{ ++ if (printbuf_remaining(out) > 1) { ++ *out->pos = c; ++ out->pos++; ++ } ++} ++ ++static inline void pr_indent_push(struct printbuf *buf, unsigned spaces) + { + buf->indent += spaces; + while (spaces--) +- pr_buf(buf, " "); ++ pr_char(buf, ' '); + } + +-static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces) ++static inline void pr_indent_pop(struct printbuf *buf, unsigned spaces) + { ++ if (buf->last_newline + buf->indent == buf->pos) { ++ buf->pos -= spaces; ++ buf->buf[buf->pos] = 0; ++ } + buf->indent -= spaces; + } + +@@ -283,14 +307,60 @@ static inline void pr_newline(struct printbuf *buf) + { + unsigned i; + +- pr_buf(buf, "\n"); ++ pr_char(buf, '\n'); ++ ++ buf->last_newline = buf->pos; ++ + for (i = 0; i < buf->indent; i++) +- pr_buf(buf, " "); ++ pr_char(buf, ' '); ++ ++ buf->last_field = buf->pos; ++ buf->tabstop = 0; ++} ++ ++static inline void pr_tab(struct printbuf *buf) ++{ ++ BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); ++ ++ while (printbuf_remaining(buf) > 1 && ++ printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) ++ pr_char(buf, ' '); ++ ++ buf->last_field = buf->pos; ++ buf->tabstop++; ++} ++ ++static inline void pr_tab_rjust(struct printbuf *buf) ++{ ++ ssize_t shift = min_t(ssize_t, buf->tabstops[buf->tabstop] - ++ printbuf_linelen(buf), ++ printbuf_remaining(buf)); ++ ssize_t move = min_t(ssize_t, buf->pos - buf->last_field, ++ printbuf_remaining(buf) - shift); ++ ++ BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); ++ ++ if (shift > 0) { ++ memmove(buf->last_field + shift, ++ buf->last_field, ++ move); ++ memset(buf->last_field, ' ', shift); ++ buf->pos += shift; ++ *buf->pos = 0; ++ } ++ ++ buf->last_field = buf->pos; ++ buf->tabstop++; + } + + void bch2_pr_units(struct printbuf *, s64, s64); + #define pr_units(...) bch2_pr_units(__VA_ARGS__) + ++static inline void pr_sectors(struct printbuf *out, u64 v) ++{ ++ bch2_pr_units(out, v, v << 9); ++} ++ + #ifdef __KERNEL__ + static inline void pr_time(struct printbuf *out, u64 time) + { +-- +cgit v1.2.3 + + +From 3b6ae87cba4628d081ff0a2c8964663a05dfd934 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 23 Feb 2022 07:00:34 -0500 +Subject: bcachefs: Drop journal_write_compact() + +Long ago it was possible to get a journal reservation and not use it, +but that's no longer allowed, which means journal_write_compact() has +very little work to do, and isn't really worth the code anymore. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 45 --------------------------------------------- + 1 file changed, 45 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 05c109262049..6b59fd02e4c8 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1299,49 +1299,6 @@ done: + return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; + } + +-static void journal_write_compact(struct jset *jset) +-{ +- struct jset_entry *i, *next, *prev = NULL; +- +- /* +- * Simple compaction, dropping empty jset_entries (from journal +- * reservations that weren't fully used) and merging jset_entries that +- * can be. 
+- * +- * If we wanted to be really fancy here, we could sort all the keys in +- * the jset and drop keys that were overwritten - probably not worth it: +- */ +- vstruct_for_each_safe(jset, i, next) { +- unsigned u64s = le16_to_cpu(i->u64s); +- +- /* Empty entry: */ +- if (!u64s) +- continue; +- +- /* Can we merge with previous entry? */ +- if (prev && +- i->btree_id == prev->btree_id && +- i->level == prev->level && +- i->type == prev->type && +- i->type == BCH_JSET_ENTRY_btree_keys && +- le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { +- memmove_u64s_down(vstruct_next(prev), +- i->_data, +- u64s); +- le16_add_cpu(&prev->u64s, u64s); +- continue; +- } +- +- /* Couldn't merge, move i into new position (after prev): */ +- prev = prev ? vstruct_next(prev) : jset->start; +- if (i != prev) +- memmove_u64s_down(prev, i, jset_u64s(u64s)); +- } +- +- prev = prev ? vstruct_next(prev) : jset->start; +- jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); +-} +- + static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) + { + /* we aren't holding j->lock: */ +@@ -1581,8 +1538,6 @@ void bch2_journal_write(struct closure *cl) + le32_add_cpu(&jset->u64s, u64s); + BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); + +- journal_write_compact(jset); +- + jset->magic = cpu_to_le64(jset_magic(c)); + jset->version = c->sb.version < bcachefs_metadata_version_new_versioning + ? cpu_to_le32(BCH_JSET_VERSION_OLD) +-- +cgit v1.2.3 + + +From c6489448194df9fce6abc148de3ec411c6d0500f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 22 Feb 2022 17:16:45 -0500 +Subject: bcachefs: Set BTREE_NODE_SEQ() correctly in merge path + +BTREE_NODE_SEQ() is supposed to give us a time ordering of btree nodes +on disk, so that we can tell which btree node is newer if we ever have +to scan the entire device to find btree nodes. + +The btree node merge path wasn't setting it correctly on the new node - +oops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 088c320493d3..779f8b8d8ca5 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1689,6 +1689,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + n = bch2_btree_node_alloc(as, b->c.level); + bch2_btree_update_add_new_node(as, n); + ++ SET_BTREE_NODE_SEQ(n->data, ++ max(BTREE_NODE_SEQ(b->data), ++ BTREE_NODE_SEQ(m->data)) + 1); ++ + btree_set_min(n, prev->data->min_key); + btree_set_max(n, next->data->max_key); + n->data->format = new_f; +-- +cgit v1.2.3 + + +From ffd427f237bd3964d18e10f5e2c19da7dae93e93 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 23 Feb 2022 10:26:10 -0500 +Subject: bcachefs: Fix for journal getting stuck + +The journal can get stuck if we need to get a journal reservation for +something we have a pre-reservation for, but aren't able to reclaim +space, or if the pin fifo is full - it's impractical to resize the pin +fifo at runtime. + +Previously, we reserved 8 entries in the pin fifo for pre-reservations, +but that seems small - we're seeing the journal occasionally get stuck. +Let's reserve a quarter of it. 
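+
+Worked numbers, assumed for illustration: with a pin fifo of 512 entries,
+the old rule allowed unreserved journal reservations whenever more than 8
+entries were free; the new rule requires more than 512 / 4 = 128 free
+entries, leaving far more headroom for pre-reserved (reclaim) writes.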
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 2 +- + fs/bcachefs/journal.h | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 158df42e5e10..a85ab36d2d17 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -417,7 +417,7 @@ unlock: + (flags & JOURNAL_RES_GET_RESERVED)) { + char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); + +- bch_err(c, "Journal stuck!"); ++ bch_err(c, "Journal stuck! Hava a pre-reservation but journal full"); + if (journal_debug_buf) { + bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); + bch_err(c, "%s", journal_debug_buf); +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index b298873212d2..d3202f53d5b3 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -374,7 +374,7 @@ static inline bool journal_check_may_get_unreserved(struct journal *j) + { + union journal_preres_state s = READ_ONCE(j->prereserved); + bool ret = s.reserved < s.remaining && +- fifo_free(&j->pin) > 8; ++ fifo_free(&j->pin) > j->pin.size / 4; + + lockdep_assert_held(&j->lock); + +-- +cgit v1.2.3 + + +From ecf86016c7d52d338bd31ce7f98b0bd630b36408 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 23 Feb 2022 06:56:35 -0500 +Subject: bcachefs: Revert "Ensure journal doesn't get stuck in nochanges mode" + +This patch was originally to work around the journal geting stuck in +nochanges mode - but that was just a hack, we needed to fix the actual +bug. It should be fixed now, so revert it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.h | 1 - + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/journal_reclaim.c | 6 ++---- + fs/bcachefs/journal_types.h | 1 - + fs/bcachefs/super.c | 3 --- + 5 files changed, 3 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index d3202f53d5b3..296981740cc3 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -433,7 +433,6 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, + ret = 0; + + if ((flags & JOURNAL_RES_GET_RESERVED) || +- test_bit(JOURNAL_NOCHANGES, &j->flags) || + new.reserved + d < new.remaining) { + new.reserved += d; + ret = 1; +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 6b59fd02e4c8..49f2dd3effc7 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1619,7 +1619,7 @@ retry_alloc: + + w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + +- if (test_bit(JOURNAL_NOCHANGES, &j->flags)) ++ if (c->opts.nochanges) + goto no_io; + + for_each_rw_member(ca, c, i) +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 9467191e182e..90fa2be54e20 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -34,10 +34,8 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, + struct journal_device *ja, + enum journal_space_from from) + { +- unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags) +- ? 
((journal_space_from(ja, from) - +- ja->cur_idx - 1 + ja->nr) % ja->nr) +- : ja->nr; ++ unsigned available = (journal_space_from(ja, from) - ++ ja->cur_idx - 1 + ja->nr) % ja->nr; + + /* + * Don't use the last bucket unless writing the new last_seq +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index d6d751214116..cd66b7388664 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -151,7 +151,6 @@ enum { + JOURNAL_NEED_WRITE, + JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, +- JOURNAL_NOCHANGES, + }; + + /* Embedded in struct bch_fs */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index b36e6216a8a1..88737d846172 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -830,9 +830,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + if (ret) + goto err; + +- if (c->opts.nochanges) +- set_bit(JOURNAL_NOCHANGES, &c->journal.flags); +- + mi = bch2_sb_get_members(c->disk_sb.sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && +-- +cgit v1.2.3 + + +From d7f11a7e480c4a251c480a9771116f4a8d7602af Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 19 Feb 2022 02:48:27 -0500 +Subject: bcachefs: Normal update/commit path now works before going RW + +This improves __bch2_trans_commit - early in the recovery process, when +we're running btree_gc and before we want to go RW, it now uses +bch2_journal_key_insert() to add the update to the list of updates for +journal replay to do, instead of btree_gc having to use separate +interfaces depending on whether we're running at bringup or, later, +runtime. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/btree_gc.c | 71 ++++++++++++++++------------------------- + fs/bcachefs/btree_update_leaf.c | 26 +++++++++++++++ + fs/bcachefs/buckets.c | 8 +++-- + fs/bcachefs/recovery.c | 2 ++ + 5 files changed, 61 insertions(+), 47 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index b018425394e5..45a43f716c44 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -515,6 +515,7 @@ enum { + BCH_FS_TOPOLOGY_REPAIR_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, ++ BCH_FS_MAY_GO_RW, + BCH_FS_RW, + BCH_FS_WAS_RW, + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 648779cc643d..68e09c5eaf23 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -753,7 +753,8 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + atomic64_set(&c->key_version, k->k->version.lo); + } + +- ret = bch2_mark_key(trans, old, *k, flags); ++ ret = __bch2_trans_do(trans, NULL, NULL, 0, ++ bch2_mark_key(trans, old, *k, flags)); + fsck_err: + err: + if (ret) +@@ -1259,7 +1260,7 @@ static int bch2_gc_start(struct bch_fs *c, + + static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, +- bool initial, bool metadata_only) ++ bool metadata_only) + { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); +@@ -1327,14 +1328,12 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + if (IS_ERR(a)) + return PTR_ERR(a); + +- ret = initial +- ? 
bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k) +- : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); ++ ret = bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); + fsck_err: + return ret; + } + +-static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only) ++static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) + { + struct btree_trans trans; + struct btree_iter iter; +@@ -1356,7 +1355,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_alloc_write_key(&trans, &iter, +- initial, metadata_only)); ++ metadata_only)); + if (ret) + break; + } +@@ -1373,7 +1372,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only + return ret; + } + +-static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only) ++static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) + { + struct bch_dev *ca; + unsigned i; +@@ -1397,7 +1396,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_onl + return bch2_alloc_read(c, true, metadata_only); + } + +-static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only) ++static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) + { + struct bch_dev *ca; + unsigned i; +@@ -1418,8 +1417,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_on + }; + } + +-static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, +- bool metadata_only) ++static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) + { + struct btree_trans trans; + struct btree_iter iter; +@@ -1466,23 +1464,13 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, + + bkey_reassemble(new, k); + +- if (!r->refcount) { ++ if (!r->refcount) + new->k.type = KEY_TYPE_deleted; +- /* +- * XXX ugly: bch2_journal_key_insert() queues up +- * the key for the journal replay code, which +- * doesn't run the extent overwrite pass +- */ +- if (initial) +- new->k.size = 0; +- } else { ++ else + *bkey_refcount(new) = cpu_to_le64(r->refcount); +- } + +- ret = initial +- ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new) +- : __bch2_trans_do(&trans, NULL, NULL, 0, +- __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); + kfree(new); + + if (ret) +@@ -1496,7 +1484,7 @@ fsck_err: + return ret; + } + +-static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, ++static int bch2_gc_reflink_start(struct bch_fs *c, + bool metadata_only) + { + struct btree_trans trans; +@@ -1535,8 +1523,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + return ret; + } + +-static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial, +- bool metadata_only) ++static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) + { + struct genradix_iter iter; + struct reflink_gc *r; +@@ -1545,8 +1532,7 @@ static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial, + r->refcount = 0; + } + +-static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, +- bool metadata_only) ++static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) + { + struct btree_trans trans; + struct btree_iter iter; +@@ -1594,10 +1580,8 @@ inconsistent: + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); + +- ret = initial +- ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i) +- : __bch2_trans_do(&trans, NULL, NULL, 0, +- __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); + kfree(new); + } + } +@@ -1608,8 +1592,7 @@ fsck_err: + return ret; + } + +-static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial, +- bool metadata_only) ++static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) + { + genradix_free(&c->gc_stripes); + } +@@ -1649,8 +1632,8 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) + !bch2_btree_interior_updates_nr_pending(c)); + + ret = bch2_gc_start(c, metadata_only) ?: +- bch2_gc_alloc_start(c, initial, metadata_only) ?: +- bch2_gc_reflink_start(c, initial, metadata_only); ++ bch2_gc_alloc_start(c, metadata_only) ?: ++ bch2_gc_reflink_start(c, metadata_only); + if (ret) + goto out; + again: +@@ -1705,9 +1688,9 @@ again: + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + +- bch2_gc_stripes_reset(c, initial, metadata_only); +- bch2_gc_alloc_reset(c, initial, metadata_only); +- bch2_gc_reflink_reset(c, initial, metadata_only); ++ bch2_gc_stripes_reset(c, metadata_only); ++ bch2_gc_alloc_reset(c, metadata_only); ++ bch2_gc_reflink_reset(c, metadata_only); + + /* flush fsck errors, reset counters */ + bch2_flush_fsck_errs(c); +@@ -1717,9 +1700,9 @@ out: + if (!ret) { + bch2_journal_block(&c->journal); + +- ret = bch2_gc_stripes_done(c, initial, metadata_only) ?: +- bch2_gc_reflink_done(c, initial, metadata_only) ?: +- bch2_gc_alloc_done(c, initial, metadata_only) ?: ++ ret = bch2_gc_stripes_done(c, metadata_only) ?: ++ bch2_gc_reflink_done(c, metadata_only) ?: ++ bch2_gc_alloc_done(c, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only); + + bch2_journal_unblock(&c->journal); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index a08d36c0dc8d..f3f9c412167c 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -973,6 +973,27 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) + return 0; + } + ++/* ++ * This is for updates done in the early part of fsck - btree_gc - before we've ++ * gone RW. we only add the new key to the list of keys for journal replay to ++ * do. 
++ */ ++static noinline int ++do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0; ++ ++ trans_for_each_update(trans, i) { ++ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ + int __bch2_trans_commit(struct btree_trans *trans) + { + struct bch_fs *c = trans->c; +@@ -991,6 +1012,11 @@ int __bch2_trans_commit(struct btree_trans *trans) + if (ret) + goto out_reset; + ++ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ++ ret = do_bch2_trans_commit_to_journal_replay(trans); ++ goto out_reset; ++ } ++ + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!percpu_ref_tryget(&c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index eb0eaa983dc9..bd5cb897bdba 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1160,9 +1160,11 @@ static int bch2_mark_reservation(struct btree_trans *trans, + return 0; + } + +-static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, ++static s64 __bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags, size_t r_idx) + { ++ struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + s64 ret = 0; +@@ -1195,7 +1197,7 @@ not_found: + new.k.type = KEY_TYPE_error; + new.k.p = p.k->p; + new.k.size = p.k->size; +- ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i); ++ ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); + } + fsck_err: + return ret; +@@ -1234,7 +1236,7 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, + } + + while (idx < end && !ret) +- ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); ++ ret = __bch2_mark_reflink_p(trans, p, &idx, flags, l++); + + return ret; + } +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index ed25595275fc..87007bfa79e5 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1139,6 +1139,7 @@ use_clean: + + clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); + + /* + * Skip past versions that might have possibly been used (as nonces), +@@ -1299,6 +1300,7 @@ int bch2_fs_initialize(struct bch_fs *c) + mutex_unlock(&c->sb_lock); + + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + + for (i = 0; i < BTREE_ID_NR; i++) +-- +cgit v1.2.3 + + +From 5d0e310f0e838c14b925cc475993901500237af5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 19 Feb 2022 03:06:28 -0500 +Subject: bcachefs: Improve reflink repair code + +When a reflink pointer points to a missing indirect extent, we replace +it with an error key. Instead of replacing the entire reflink pointer +with an error key, this patch replaces only the missing range with an +error key. 
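+
+As a worked example (illustrative numbers only, not taken from a real
+filesystem): for a 16 sector reflink pointer referencing indirect extent
+indices 200..216, if only indices 204..208 turn out to be missing, repair
+now inserts a 4 sector error key covering just that part of the pointer's
+range, rather than a 16 sector error key replacing the whole pointer.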
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 32 ++++++++++++++++++-------------- + 1 file changed, 18 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index bd5cb897bdba..c51c86b1dcd8 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1162,18 +1162,22 @@ static int bch2_mark_reservation(struct btree_trans *trans, + + static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, ++ u64 start, u64 end, + u64 *idx, unsigned flags, size_t r_idx) + { + struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; ++ u64 next_idx = end; + s64 ret = 0; ++ char buf[200]; + + if (r_idx >= c->reflink_gc_nr) + goto not_found; + + r = genradix_ptr(&c->reflink_gc_table, r_idx); +- if (*idx < r->offset - r->size) ++ next_idx = min(next_idx, r->offset - r->size); ++ if (*idx < next_idx) + goto not_found; + + BUG_ON((s64) r->refcount + add < 0); +@@ -1182,23 +1186,22 @@ static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + *idx = r->offset; + return 0; + not_found: +- *idx = U64_MAX; +- ret = -EIO; +- +- /* +- * XXX: we're replacing the entire reflink pointer with an error +- * key, we should just be replacing the part that was missing: +- */ +- if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", +- p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { ++ if (fsck_err(c, "pointer to missing indirect extent\n" ++ " %s\n" ++ " missing range %llu-%llu", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c), buf), ++ *idx, next_idx)) { + struct bkey_i_error new; + + bkey_init(&new.k); + new.k.type = KEY_TYPE_error; +- new.k.p = p.k->p; +- new.k.size = p.k->size; ++ new.k.p = bkey_start_pos(p.k); ++ new.k.p.offset += *idx - start; ++ bch2_key_resize(&new.k, next_idx - *idx); + ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); + } ++ ++ *idx = next_idx; + fsck_err: + return ret; + } +@@ -1212,7 +1215,7 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; +- u64 idx = le64_to_cpu(p.v->idx); ++ u64 idx = le64_to_cpu(p.v->idx), start = idx; + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + int ret = 0; + +@@ -1236,7 +1239,8 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, + } + + while (idx < end && !ret) +- ret = __bch2_mark_reflink_p(trans, p, &idx, flags, l++); ++ ret = __bch2_mark_reflink_p(trans, p, start, end, ++ &idx, flags, l++); + + return ret; + } +-- +cgit v1.2.3 + + +From ab7a78039e6e5174c12b765efb535bfb67f2f2a5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 19 Feb 2022 03:56:44 -0500 +Subject: bcachefs: Use unlikely() in err_on() macros + +Should be obviously a good thing. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/error.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index 986938298adc..4ab3cfe1292c 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -39,7 +39,7 @@ void bch2_topology_error(struct bch_fs *); + + #define bch2_fs_inconsistent_on(cond, c, ...) \ + ({ \ +- int _ret = !!(cond); \ ++ bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_fs_inconsistent(c, __VA_ARGS__); \ +@@ -59,7 +59,7 @@ do { \ + + #define bch2_dev_inconsistent_on(cond, ca, ...) 
\ + ({ \ +- int _ret = !!(cond); \ ++ bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_dev_inconsistent(ca, __VA_ARGS__); \ +@@ -129,7 +129,7 @@ void bch2_flush_fsck_errs(struct bch_fs *); + /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ + + #define __fsck_err_on(cond, c, _flags, ...) \ +- ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) ++ (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) + + #define need_fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) +@@ -164,7 +164,7 @@ do { \ + + #define bch2_fs_fatal_err_on(cond, c, ...) \ + ({ \ +- int _ret = !!(cond); \ ++ bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_fs_fatal_error(c, __VA_ARGS__); \ +-- +cgit v1.2.3 + + +From 7cdbb160032a1cd33e313760eb3493dae7a826f4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 23 Feb 2022 10:32:43 -0500 +Subject: bcachefs: Improve some btree node read error messages + +On btree node read error, it's helpful to see what we were trying to +read - was it all zeroes? + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 55c939dc6789..c0b95e663946 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -885,11 +885,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, +- "bad magic"); ++ "bad magic: want %llx, got %llx", ++ bset_magic(c), le64_to_cpu(b->data->magic)); + + btree_err_on(!b->data->keys.seq, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, +- "bad btree header"); ++ "bad btree header: seq 0"); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = +-- +cgit v1.2.3 + + +From 3b22724d24a9111cde067d3e000857cbd4ebfbbb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 23 Feb 2022 11:46:34 -0500 +Subject: bcachefs: Fix 32 bit build + +vstruct_bytes() was returning a u64 - it should be a size_t, the corect +type for the size of anything that fits in memory. + +Also replace a 64 bit divide with div_u64(). 
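+
+A minimal sketch of the 32 bit breakage being fixed (kernel context
+assumed; the variable names here are illustrative - only sb->time_base_lo,
+div_u64() and NSEC_PER_SEC come from the code below):
+
+	u64 ns = le64_to_cpu(sb->time_base_lo);
+
+	/*
+	 * builds on 64 bit, but on 32 bit gcc emits a call to a libgcc
+	 * helper (__udivdi3) that the kernel doesn't link against:
+	 */
+	time64_t created_bad = ns / NSEC_PER_SEC;
+
+	/* portable: div_u64() takes a u64 dividend and a u32 divisor */
+	time64_t created = div_u64(ns, NSEC_PER_SEC);
+
+Likewise, once vstruct_bytes() returns size_t instead of u64, the printf
+format specifiers have to change from %llu to %zu.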
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/quota.c | 2 +- + fs/bcachefs/super-io.c | 10 +++++----- + fs/bcachefs/vstructs.h | 2 +- + 4 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 49f2dd3effc7..794719d46ebd 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1106,7 +1106,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + struct journal_replay *p = list_prev_entry(i, list); + + bch2_journal_ptrs_to_text(&out, c, p); +- pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); ++ pr_buf(&out, " size %zu", vstruct_sectors(&p->j, c->block_bits)); + } else + sprintf(buf1, "(none)"); + bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index b7ef8fa7bbc9..ca029a00e7b8 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -23,7 +23,7 @@ static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct bch_sb_field_quota *q = field_to_type(f, quota); + + if (vstruct_bytes(&q->field) < sizeof(*q)) { +- pr_buf(err, "wrong size (got %llu should be %zu)", ++ pr_buf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&q->field), sizeof(*q)); + return -EINVAL; + } +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index c22e2c03fc06..21109881e9f6 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -1145,7 +1145,7 @@ static int bch2_sb_crypt_validate(struct bch_sb *sb, + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { +- pr_buf(err, "wrong size (got %llu should be %zu)", ++ pr_buf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&crypt->field), sizeof(*crypt)); + return -EINVAL; + } +@@ -1387,7 +1387,7 @@ static int bch2_sb_clean_validate(struct bch_sb *sb, + struct bch_sb_field_clean *clean = field_to_type(f, clean); + + if (vstruct_bytes(&clean->field) < sizeof(*clean)) { +- pr_buf(err, "wrong size (got %llu should be %zu)", ++ pr_buf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&clean->field), sizeof(*clean)); + return -EINVAL; + } +@@ -1464,7 +1464,7 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + else + pr_buf(out, "(unknown field %u)", type); + +- pr_buf(out, " (size %llu):", vstruct_bytes(f)); ++ pr_buf(out, " (size %zu):", vstruct_bytes(f)); + pr_newline(out); + + if (ops && ops->to_text) { +@@ -1540,7 +1540,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + + pr_buf(out, "Created: "); + if (sb->time_base_lo) +- pr_time(out, le64_to_cpu(sb->time_base_lo) / NSEC_PER_SEC); ++ pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); + else + pr_buf(out, "(not set)"); + pr_newline(out); +@@ -1646,7 +1646,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + bch2_flags_to_text(out, bch2_sb_fields, fields_have); + pr_newline(out); + +- pr_buf(out, "Superblock size: %llu", vstruct_bytes(sb)); ++ pr_buf(out, "Superblock size: %zu", vstruct_bytes(sb)); + pr_newline(out); + + if (print_layout) { +diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h +index c099cdc0605f..53a694d71967 100644 +--- a/fs/bcachefs/vstructs.h ++++ b/fs/bcachefs/vstructs.h +@@ -20,7 +20,7 @@ + ({ \ + BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ + \ +- (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ ++ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ + 
}) + + #define vstruct_bytes(_s) \ +-- +cgit v1.2.3 + + +From b46eb1d5f3d9be71c7199082013152a25df6a285 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 10 Dec 2021 17:04:26 -0500 +Subject: bcachefs: bch2_trans_mark_key() now takes a bkey_i * + +We're now coming up with triggers that modify the update being done. A +bkey_s_c is const - bkey_i is the correct type to be using here. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 20 +++-------- + fs/bcachefs/btree_update_leaf.c | 11 ++---- + fs/bcachefs/buckets.c | 72 +++++++++++++++++++------------------ + fs/bcachefs/buckets.h | 27 +++++++++++++- + 4 files changed, 71 insertions(+), 59 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 779f8b8d8ca5..b6758a7c4056 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -523,19 +523,13 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, + trans->journal_pin = &as->journal; + + for_each_keylist_key(&as->new_keys, k) { +- ret = bch2_trans_mark_key(trans, +- bkey_s_c_null, +- bkey_i_to_s_c(k), +- BTREE_TRIGGER_INSERT); ++ ret = bch2_trans_mark_new(trans, k, 0); + if (ret) + return ret; + } + + for_each_keylist_key(&as->old_keys, k) { +- ret = bch2_trans_mark_key(trans, +- bkey_i_to_s_c(k), +- bkey_s_c_null, +- BTREE_TRIGGER_OVERWRITE); ++ ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(k), 0); + if (ret) + return ret; + } +@@ -1883,17 +1877,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + int ret; + + if (!skip_triggers) { +- ret = bch2_trans_mark_key(trans, +- bkey_s_c_null, +- bkey_i_to_s_c(new_key), +- BTREE_TRIGGER_INSERT); ++ ret = bch2_trans_mark_new(trans, new_key, 0); + if (ret) + return ret; + +- ret = bch2_trans_mark_key(trans, +- bkey_i_to_s_c(&b->key), +- bkey_s_c_null, +- BTREE_TRIGGER_OVERWRITE); ++ ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(&b->key), 0); + if (ret) + return ret; + } +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f3f9c412167c..3fcf5e0790e7 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -865,8 +865,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i, + bool overwrite) + { +- struct bkey _deleted = KEY(0, 0, 0); +- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + struct bkey_s_c old; + struct bkey unpacked; + int ret = 0; +@@ -890,19 +888,16 @@ static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry + } + + old = bch2_btree_path_peek_slot(i->path, &unpacked); +- _deleted.p = i->path->pos; + + if (overwrite) { +- ret = bch2_trans_mark_key(trans, old, deleted, +- BTREE_TRIGGER_OVERWRITE|i->flags); ++ ret = bch2_trans_mark_old(trans, old, i->flags); + } else if (old.k->type == i->k->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; +- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), ++ ret = bch2_trans_mark_key(trans, old, i->k, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); + } else { +- ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), +- BTREE_TRIGGER_INSERT|i->flags); ++ ret = bch2_trans_mark_new(trans, i->k, i->flags); + } + + if (ret == -EINTR) +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index c51c86b1dcd8..4b64e14dd3da 100644 +--- a/fs/bcachefs/buckets.c 
++++ b/fs/bcachefs/buckets.c +@@ -1669,65 +1669,67 @@ err: + } + + static int bch2_trans_mark_stripe(struct btree_trans *trans, +- struct bkey_s_c old, struct bkey_s_c new, ++ struct bkey_s_c old, struct bkey_i *new, + unsigned flags) + { +- struct bkey_s_c_stripe old_s = { .k = NULL }; +- struct bkey_s_c_stripe new_s = { .k = NULL }; ++ const struct bch_stripe *old_s = NULL; ++ struct bch_stripe *new_s = NULL; + struct bch_replicas_padded r; + unsigned i, nr_blocks; + int ret = 0; + + if (old.k->type == KEY_TYPE_stripe) +- old_s = bkey_s_c_to_stripe(old); +- if (new.k->type == KEY_TYPE_stripe) +- new_s = bkey_s_c_to_stripe(new); ++ old_s = bkey_s_c_to_stripe(old).v; ++ if (new->k.type == KEY_TYPE_stripe) ++ new_s = &bkey_i_to_stripe(new)->v; + + /* + * If the pointers aren't changing, we don't need to do anything: + */ +- if (new_s.k && old_s.k && +- new_s.v->nr_blocks == old_s.v->nr_blocks && +- new_s.v->nr_redundant == old_s.v->nr_redundant && +- !memcmp(old_s.v->ptrs, new_s.v->ptrs, +- new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) ++ if (new_s && old_s && ++ new_s->nr_blocks == old_s->nr_blocks && ++ new_s->nr_redundant == old_s->nr_redundant && ++ !memcmp(old_s->ptrs, new_s->ptrs, ++ new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; + +- BUG_ON(new_s.k && old_s.k && +- (new_s.v->nr_blocks != old_s.v->nr_blocks || +- new_s.v->nr_redundant != old_s.v->nr_redundant)); ++ BUG_ON(new_s && old_s && ++ (new_s->nr_blocks != old_s->nr_blocks || ++ new_s->nr_redundant != old_s->nr_redundant)); + +- nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks; ++ nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; + +- if (new_s.k) { +- s64 sectors = le16_to_cpu(new_s.v->sectors); ++ if (new_s) { ++ s64 sectors = le16_to_cpu(new_s->sectors); + +- bch2_bkey_to_replicas(&r.e, new); +- update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); ++ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); ++ update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + } + +- if (old_s.k) { +- s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); ++ if (old_s) { ++ s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + + bch2_bkey_to_replicas(&r.e, old); +- update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); ++ update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + } + + for (i = 0; i < nr_blocks; i++) { +- if (new_s.k && old_s.k && +- !memcmp(&new_s.v->ptrs[i], +- &old_s.v->ptrs[i], +- sizeof(new_s.v->ptrs[i]))) ++ if (new_s && old_s && ++ !memcmp(&new_s->ptrs[i], ++ &old_s->ptrs[i], ++ sizeof(new_s->ptrs[i]))) + continue; + +- if (new_s.k) { +- ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false); ++ if (new_s) { ++ ret = bch2_trans_mark_stripe_bucket(trans, ++ bkey_i_to_s_c_stripe(new), i, false); + if (ret) + break; + } + +- if (old_s.k) { +- ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true); ++ if (old_s) { ++ ret = bch2_trans_mark_stripe_bucket(trans, ++ bkey_s_c_to_stripe(old), i, true); + if (ret) + break; + } +@@ -1738,10 +1740,10 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + + static int bch2_trans_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, +- struct bkey_s_c new, ++ struct bkey_i *new, + unsigned flags) + { +- int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); ++ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); + + if (nr) { + struct replicas_delta_list *d = +@@ -1876,9 +1878,11 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + } + + int 
bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, +- struct bkey_s_c new, unsigned flags) ++ struct bkey_i *new, unsigned flags) + { +- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 7c6c59c7762c..2a400bd7acc8 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -235,7 +235,32 @@ int bch2_mark_update(struct btree_trans *, struct btree_path *, + struct bkey_i *, unsigned); + + int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, +- struct bkey_s_c, unsigned); ++ struct bkey_i *, unsigned); ++ ++static inline int bch2_trans_mark_old(struct btree_trans *trans, ++ struct bkey_s_c old, unsigned flags) ++{ ++ struct bkey_i deleted; ++ ++ bkey_init(&deleted.k); ++ deleted.k.p = old.k->p; ++ ++ return bch2_trans_mark_key(trans, old, &deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++} ++ ++static inline int bch2_trans_mark_new(struct btree_trans *trans, ++ struct bkey_i *new, unsigned flags) ++{ ++ struct bkey_i deleted; ++ ++ bkey_init(&deleted.k); ++ deleted.k.p = new->k.p; ++ ++ return bch2_trans_mark_key(trans, bkey_i_to_s_c(&deleted), new, ++ BTREE_TRIGGER_INSERT|flags); ++} ++ + int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + + int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, +-- +cgit v1.2.3 + + +From 5fd00eace06975db464c1c6deffc9c17fdab2115 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 24 Feb 2022 11:30:17 -0500 +Subject: bcachefs: Consolidate trigger code a bit + +Upcoming patches are doing more work on the triggers code, this patch +just moves code around. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 259 +++++++++++++++++++++++----------------- + fs/bcachefs/buckets.c | 33 ----- + fs/bcachefs/buckets.h | 3 - + 3 files changed, 148 insertions(+), 147 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 3fcf5e0790e7..11063ffa7afc 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -399,7 +399,151 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + } + } + +-static noinline int bch2_trans_mark_gc(struct btree_trans *trans) ++/* Triggers: */ ++ ++static int run_one_mem_trigger(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ unsigned flags) ++{ ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; ++ struct bkey_s_c old; ++ struct bkey unpacked; ++ struct bkey_i *new = i->k; ++ int ret; ++ ++ _deleted.p = i->path->pos; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(i->path->btree_id)) ++ return 0; ++ ++ old = bch2_btree_path_peek_slot(i->path, &unpacked); ++ ++ if (old.k->type == new->k.type && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ } else { ++ ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|flags) ?: ++ bch2_mark_key(trans, old, deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ ++ return ret; ++} ++ ++static int run_one_trans_trigger(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ bool overwrite) ++{ ++ struct bkey_s_c old; ++ struct bkey unpacked; ++ int ret = 0; ++ ++ if ((i->flags & BTREE_TRIGGER_NORUN) || ++ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) ++ return 0; ++ ++ if (!overwrite) { ++ if (i->insert_trigger_run) ++ return 0; ++ ++ BUG_ON(i->overwrite_trigger_run); ++ i->insert_trigger_run = true; ++ } else { ++ if (i->overwrite_trigger_run) ++ return 0; ++ ++ BUG_ON(!i->insert_trigger_run); ++ i->overwrite_trigger_run = true; ++ } ++ ++ old = bch2_btree_path_peek_slot(i->path, &unpacked); ++ ++ if (overwrite) { ++ ret = bch2_trans_mark_old(trans, old, i->flags); ++ } else if (old.k->type == i->k->k.type && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ i->overwrite_trigger_run = true; ++ ret = bch2_trans_mark_key(trans, old, i->k, ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); ++ } else { ++ ret = bch2_trans_mark_new(trans, i->k, i->flags); ++ } ++ ++ if (ret == -EINTR) ++ trace_trans_restart_mark(trans->fn, _RET_IP_, ++ i->btree_id, &i->path->pos); ++ return ret ?: 1; ++} ++ ++static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, ++ struct btree_insert_entry *btree_id_start) ++{ ++ struct btree_insert_entry *i; ++ bool trans_trigger_run; ++ int ret, overwrite; ++ ++ for (overwrite = 0; overwrite < 2; overwrite++) { ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ for (i = btree_id_start; ++ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; ++ i++) { ++ ret = run_one_trans_trigger(trans, i, overwrite); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ trans_trigger_run = true; ++ } ++ } while (trans_trigger_run); ++ } ++ ++ return 0; ++} ++ ++static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++{ 
++ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; ++ unsigned btree_id = 0; ++ int ret = 0; ++ ++ /* ++ * ++ * For a given btree, this algorithm runs insert triggers before ++ * overwrite triggers: this is so that when extents are being moved ++ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before ++ * they are re-added. ++ */ ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ while (btree_id_start < trans->updates + trans->nr_updates && ++ btree_id_start->btree_id < btree_id) ++ btree_id_start++; ++ ++ ret = run_btree_triggers(trans, btree_id, btree_id_start); ++ if (ret) ++ return ret; ++ } ++ ++ trans_for_each_update(trans, i) ++ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && ++ (!i->insert_trigger_run || !i->overwrite_trigger_run)); ++ ++ return 0; ++} ++ ++static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; +@@ -413,8 +557,7 @@ static noinline int bch2_trans_mark_gc(struct btree_trans *trans) + BUG_ON(i->cached || i->level); + + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { +- ret = bch2_mark_update(trans, i->path, i->k, +- i->flags|BTREE_TRIGGER_GC); ++ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (ret) + break; + } +@@ -520,13 +663,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { +- ret = bch2_mark_update(trans, i->path, i->k, i->flags); ++ ret = run_one_mem_trigger(trans, i, i->flags); + if (ret) + return ret; + } + + if (unlikely(c->gc_pos.phase)) { +- ret = bch2_trans_mark_gc(trans); ++ ret = bch2_trans_commit_run_gc_triggers(trans); + if (ret) + return ret; + } +@@ -862,112 +1005,6 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) + return 0; + } + +-static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i, +- bool overwrite) +-{ +- struct bkey_s_c old; +- struct bkey unpacked; +- int ret = 0; +- +- if ((i->flags & BTREE_TRIGGER_NORUN) || +- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) +- return 0; +- +- if (!overwrite) { +- if (i->insert_trigger_run) +- return 0; +- +- BUG_ON(i->overwrite_trigger_run); +- i->insert_trigger_run = true; +- } else { +- if (i->overwrite_trigger_run) +- return 0; +- +- BUG_ON(!i->insert_trigger_run); +- i->overwrite_trigger_run = true; +- } +- +- old = bch2_btree_path_peek_slot(i->path, &unpacked); +- +- if (overwrite) { +- ret = bch2_trans_mark_old(trans, old, i->flags); +- } else if (old.k->type == i->k->k.type && +- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { +- i->overwrite_trigger_run = true; +- ret = bch2_trans_mark_key(trans, old, i->k, +- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); +- } else { +- ret = bch2_trans_mark_new(trans, i->k, i->flags); +- } +- +- if (ret == -EINTR) +- trace_trans_restart_mark(trans->fn, _RET_IP_, +- i->btree_id, &i->path->pos); +- return ret ?: 1; +-} +- +-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, +- struct btree_insert_entry *btree_id_start) +-{ +- struct btree_insert_entry *i; +- bool trans_trigger_run; +- int ret, overwrite; +- +- for (overwrite = 0; overwrite < 2; overwrite++) { +- +- /* +- * Running triggers will append more updates to the list of updates as +- * we're walking it: +- */ +- do { +- trans_trigger_run = false; +- +- for (i 
= btree_id_start; +- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; +- i++) { +- ret = run_one_trigger(trans, i, overwrite); +- if (ret < 0) +- return ret; +- if (ret) +- trans_trigger_run = true; +- } +- } while (trans_trigger_run); +- } +- +- return 0; +-} +- +-static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +-{ +- struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; +- unsigned btree_id = 0; +- int ret = 0; +- +- /* +- * +- * For a given btree, this algorithm runs insert triggers before +- * overwrite triggers: this is so that when extents are being moved +- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before +- * they are re-added. +- */ +- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { +- while (btree_id_start < trans->updates + trans->nr_updates && +- btree_id_start->btree_id < btree_id) +- btree_id_start++; +- +- ret = run_btree_triggers(trans, btree_id, btree_id_start); +- if (ret) +- return ret; +- } +- +- trans_for_each_update(trans, i) +- BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && +- (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && +- (!i->insert_trigger_run || !i->overwrite_trigger_run)); +- +- return 0; +-} +- + /* + * This is for updates done in the early part of fsck - btree_gc - before we've + * gone RW. we only add the new key to the list of keys for journal replay to +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 4b64e14dd3da..2bb3b3b3743b 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1278,39 +1278,6 @@ int bch2_mark_key(struct btree_trans *trans, + } + } + +-int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, +- struct bkey_i *new, unsigned flags) +-{ +- struct bkey _deleted = KEY(0, 0, 0); +- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; +- struct bkey_s_c old; +- struct bkey unpacked; +- int ret; +- +- _deleted.p = path->pos; +- +- if (unlikely(flags & BTREE_TRIGGER_NORUN)) +- return 0; +- +- if (!btree_node_type_needs_gc(path->btree_id)) +- return 0; +- +- old = bch2_btree_path_peek_slot(path, &unpacked); +- +- if (old.k->type == new->k.type && +- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { +- ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), +- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); +- } else { +- ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), +- BTREE_TRIGGER_INSERT|flags) ?: +- bch2_mark_key(trans, old, deleted, +- BTREE_TRIGGER_OVERWRITE|flags); +- } +- +- return ret; +-} +- + static noinline __cold + void fs_usage_apply_warn(struct btree_trans *trans, + unsigned disk_res_sectors, +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 2a400bd7acc8..daf79a4f9128 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -231,9 +231,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + + int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); + +-int bch2_mark_update(struct btree_trans *, struct btree_path *, +- struct bkey_i *, unsigned); +- + int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); + +-- +cgit v1.2.3 + + +From 45b13fdd7cf7af5dc1ec432dc50ff1310678f698 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 24 Feb 2022 11:02:58 -0500 +Subject: bcachefs: Stash a copy of key being overwritten in btree_insert_entry + +We currently need to call bch2_btree_path_peek_slot() multiple times in +the transaction commit path - 
and some of those need to be updated to +also check the keys from journal replay, too. Let's consolidate this and +stash the key being overwritten in btree_insert_entry. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 1 + + fs/bcachefs/btree_types.h | 10 ++++++- + fs/bcachefs/btree_update_leaf.c | 60 +++++++++++++++++++---------------------- + fs/bcachefs/buckets.c | 21 ++++----------- + 4 files changed, 43 insertions(+), 49 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 66778bd92066..44588468045e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1967,6 +1967,7 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct + + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + ++ *u = ck->k->k; + k = bkey_i_to_s_c(ck->k); + } + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 9ae5c8d56b2a..d87d39dedb61 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -343,12 +343,20 @@ struct btree_insert_entry { + unsigned flags; + u8 bkey_type; + enum btree_id btree_id:8; +- u8 level; ++ u8 level:4; + bool cached:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; ++ /* ++ * @old_k may be a key from the journal; @old_btree_u64s always refers ++ * to the size of the key being overwritten in the btree: ++ */ ++ u8 old_btree_u64s; + struct bkey_i *k; + struct btree_path *path; ++ /* key being overwritten: */ ++ struct bkey old_k; ++ const struct bch_val *old_v; + unsigned long ip_allocated; + }; + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 11063ffa7afc..4e2f3b8d24b6 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -405,28 +405,26 @@ static int run_one_mem_trigger(struct btree_trans *trans, + struct btree_insert_entry *i, + unsigned flags) + { +- struct bkey _deleted = KEY(0, 0, 0); +- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; +- struct bkey_s_c old; +- struct bkey unpacked; ++ struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; + int ret; + +- _deleted.p = i->path->pos; +- + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + +- if (!btree_node_type_needs_gc(i->path->btree_id)) ++ if (!btree_node_type_needs_gc(i->btree_id)) + return 0; + +- old = bch2_btree_path_peek_slot(i->path, &unpacked); +- + if (old.k->type == new->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; ++ ++ _deleted.p = i->path->pos; ++ + ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key(trans, old, deleted, +@@ -436,12 +434,16 @@ static int run_one_mem_trigger(struct btree_trans *trans, + return ret; + } + +-static int run_one_trans_trigger(struct btree_trans *trans, +- struct btree_insert_entry *i, +- bool overwrite) ++static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, ++ bool overwrite) + { +- struct bkey_s_c old; +- struct bkey unpacked; ++ /* ++ * Transactional triggers create new btree_insert_entries, so we can't ++ * pass them a pointer to a btree_insert_entry, that memory is going to ++ * move: ++ */ ++ struct bkey old_k = i->old_k; ++ struct bkey_s_c old = { &old_k, i->old_v }; + int ret = 
0; + + if ((i->flags & BTREE_TRIGGER_NORUN) || +@@ -462,8 +464,6 @@ static int run_one_trans_trigger(struct btree_trans *trans, + i->overwrite_trigger_run = true; + } + +- old = bch2_btree_path_peek_slot(i->path, &unpacked); +- + if (overwrite) { + ret = bch2_trans_mark_old(trans, old, i->flags); + } else if (old.k->type == i->k->k.type && +@@ -801,7 +801,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; +- struct bkey_s_c old; + int ret, u64s_delta = 0; + + trans_for_each_update(trans, i) { +@@ -819,22 +818,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + } + + trans_for_each_update(trans, i) { +- struct bkey u; +- +- /* +- * peek_slot() doesn't yet work on iterators that point to +- * interior nodes: +- */ +- if (i->cached || i->level) ++ if (i->cached) + continue; + +- old = bch2_btree_path_peek_slot(i->path, &u); +- ret = bkey_err(old); +- if (unlikely(ret)) +- return ret; +- + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; +- u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; ++ u64s_delta -= i->old_btree_u64s; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { +@@ -1462,11 +1450,19 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + + bch2_path_put(trans, i->path, true); +- *i = n; +- } else ++ i->flags = n.flags; ++ i->cached = n.cached; ++ i->k = n.k; ++ i->path = n.path; ++ i->ip_allocated = n.ip_allocated; ++ } else { + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + ++ i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; ++ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; ++ } ++ + __btree_path_get(n.path, true); + return 0; + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 2bb3b3b3743b..5bd89fc43add 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1291,25 +1291,14 @@ void fs_usage_apply_warn(struct btree_trans *trans, + should_not_have_added, disk_res_sectors); + + trans_for_each_update(trans, i) { ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ + pr_err("while inserting"); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); +- pr_err("%s", buf); ++ pr_err(" %s", buf); + pr_err("overlapping with"); +- +- if (!i->cached) { +- struct bkey u; +- struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); +- +- bch2_bkey_val_to_text(&PBUF(buf), c, k); +- pr_err("%s", buf); +- } else { +- struct bkey_cached *ck = (void *) i->path->l[0].b; +- +- if (ck->valid) { +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); +- pr_err("%s", buf); +- } +- } ++ bch2_bkey_val_to_text(&PBUF(buf), c, old); ++ pr_err(" %s", buf); + } + __WARN(); + } +-- +cgit v1.2.3 + + +From eefc278ad8cc0ce45d806c44c1f3967886bd06e0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 24 Feb 2022 08:08:53 -0500 +Subject: bcachefs: run_one_trigger() now checks journal keys + +Previously, when doing updates and running triggers before journal +replay completes, triggers would see the incorrect key for the old key +being overwritten - this patch updates the trigger code to check the +journal keys when necessary, needed for the upcoming allocator rewrite. 
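+
+The core of the change, paraphrased from the bch2_trans_update_by_path()
+hunk below (simplified sketch, not the exact code; i is the new
+btree_insert_entry being added to the transaction):
+
+	if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) {
+		struct bkey_i *j_k =
+			bch2_journal_keys_peek(c, i->btree_id, i->level, i->k->k.p);
+
+		if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) {
+			/* a not-yet-replayed journal key is the real old key: */
+			i->old_k = j_k->k;
+			i->old_v = &j_k->v;
+		}
+	}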
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 24 ++++-------------------- + fs/bcachefs/btree_update_leaf.c | 11 +++++++++++ + fs/bcachefs/recovery.c | 18 ++++++++++++++++++ + fs/bcachefs/recovery.h | 2 ++ + 4 files changed, 35 insertions(+), 20 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 44588468045e..05dc48611c64 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2173,31 +2173,14 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + return NULL; + } + +-static noinline +-struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, +- struct btree_path *path) +-{ +- struct journal_keys *keys = &trans->c->journal_keys; +- size_t idx = bch2_journal_key_search(keys, path->btree_id, +- path->level, path->pos); +- +- while (idx < keys->nr && keys->d[idx].overwritten) +- idx++; +- +- return (idx < keys->nr && +- keys->d[idx].btree_id == path->btree_id && +- keys->d[idx].level == path->level) +- ? keys->d[idx].k +- : NULL; +-} +- + static noinline + struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) + { + struct bkey_i *next_journal = +- __btree_trans_peek_journal(trans, iter->path); ++ bch2_journal_keys_peek(trans->c, iter->btree_id, 0, ++ iter->path->pos); + + if (next_journal && + bpos_cmp(next_journal->k.p, +@@ -2636,7 +2619,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + } + + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && +- (next_update = __btree_trans_peek_journal(trans, iter->path)) && ++ (next_update = bch2_journal_keys_peek(trans->c, iter->btree_id, ++ 0, iter->pos)) && + !bpos_cmp(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 4e2f3b8d24b6..628171058e74 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1413,6 +1413,7 @@ static int __must_check + bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags) + { ++ struct bch_fs *c = trans->c; + struct btree_insert_entry *i, n; + + BUG_ON(!path->should_be_locked); +@@ -1461,6 +1462,16 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + + i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; + i->old_btree_u64s = !bkey_deleted(&i->old_k) ? 
i->old_k.u64s : 0; ++ ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek(c, n.btree_id, n.level, k->k.p); ++ ++ if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } + } + + __btree_path_get(n.path, true); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 87007bfa79e5..ae9ae1c7138c 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -94,6 +94,24 @@ size_t bch2_journal_key_search(struct journal_keys *journal_keys, + return l; + } + ++struct bkey_i *bch2_journal_keys_peek(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_key *end = keys->d + keys->nr; ++ struct journal_key *k = keys->d + ++ bch2_journal_key_search(keys, btree_id, level, pos); ++ ++ while (k < end && k->overwritten) ++ k++; ++ ++ if (k < end && ++ k->btree_id == btree_id && ++ k->level == level) ++ return k->k; ++ return NULL; ++} ++ + static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) + { + struct bkey_i *n = iter->keys->d[idx].k; +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index 21bdad9db249..e6927a918df3 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -33,6 +33,8 @@ struct btree_and_journal_iter { + + size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, + unsigned, struct bpos); ++struct bkey_i *bch2_journal_keys_peek(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos pos); + + int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +-- +cgit v1.2.3 + + +From 34a557030b5153641548e82f9274285aeeb8ed65 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 6 Jan 2022 01:20:41 -0500 +Subject: bcachefs: Run alloc triggers last + +Triggers can generate additional btree updates - we need to run alloc +triggers after all other triggers have run, because they generate +updates for the alloc btree. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 628171058e74..2681ff64eb39 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -500,6 +500,9 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { ++ if (i->btree_id != btree_id) ++ continue; ++ + ret = run_one_trans_trigger(trans, i, overwrite); + if (ret < 0) + return ret; +@@ -526,6 +529,9 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) + * they are re-added. 
+ */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ if (btree_id == BTREE_ID_alloc) ++ continue; ++ + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; +@@ -535,6 +541,17 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) + return ret; + } + ++ trans_for_each_update(trans, i) { ++ if (i->btree_id > BTREE_ID_alloc) ++ break; ++ if (i->btree_id == BTREE_ID_alloc) { ++ ret = run_btree_triggers(trans, BTREE_ID_alloc, i); ++ if (ret) ++ return ret; ++ break; ++ } ++ } ++ + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && +-- +cgit v1.2.3 + + +From 7da924e15908f00fd2e59321ef1dc47251d0b5ac Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 24 Feb 2022 13:27:31 -0500 +Subject: bcachefs: Always clear should_be_locked in bch2_trans_begin() + +bch2_trans_begin() invalidates all iterators, until they're revalidated +by calling peek() or traverse(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 05dc48611c64..0f136cb51e4e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2990,6 +2990,8 @@ void bch2_trans_begin(struct btree_trans *trans) + } + + trans_for_each_path(trans, path) { ++ path->should_be_locked = false; ++ + /* + * XXX: we probably shouldn't be doing this if the transaction + * was restarted, but currently we still overflow transaction +@@ -2998,7 +3000,7 @@ void bch2_trans_begin(struct btree_trans *trans) + if (!path->ref && !path->preserve) + __bch2_path_free(trans, path); + else +- path->preserve = path->should_be_locked = false; ++ path->preserve = false; + } + + bch2_trans_cond_resched(trans); +-- +cgit v1.2.3 + + +From 51e3813a45bd3b1110f0e3c51ddb925cf217bc52 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 24 Feb 2022 18:19:32 -0500 +Subject: bcachefs: Fix bch2_journal_pins_to_text() + +When key cache pins were put onto their own list, we neglected to update +bch2_journal_pins_to_text() to print them. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index a85ab36d2d17..d582af7eec6d 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1288,6 +1288,10 @@ void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) + pr_buf(out, "%llu: count %u\n", + i, atomic_read(&pin_list->count)); + ++ list_for_each_entry(pin, &pin_list->key_cache_list, list) ++ pr_buf(out, "\t%px %ps\n", ++ pin, pin->flush); ++ + list_for_each_entry(pin, &pin_list->list, list) + pr_buf(out, "\t%px %ps\n", + pin, pin->flush); +-- +cgit v1.2.3 + + +From 4aacd34a93dc3a6fa5ebd15f408d22b054a152fe Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 24 Feb 2022 19:04:11 -0500 +Subject: bcachefs: Improve debug assertion + +We're hitting a strange bug with transaction paths not being sorted +correctly - this dumps transaction paths in the order we thought was +sorted, which will hopefully shed some light as to what's going on. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 0f136cb51e4e..4c1c3ffe82a5 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1806,8 +1806,6 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + unsigned idx; + char buf1[300], buf2[300]; + +- btree_trans_verify_sorted(trans); +- + trans_for_each_path_inorder(trans, path, idx) + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", + path->idx, path->ref, path->intent_ref, +@@ -1873,6 +1871,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, + int i; + + BUG_ON(trans->restarted); ++ btree_trans_verify_sorted(trans); + + trans_for_each_path_inorder(trans, path, i) { + if (__btree_path_cmp(path, +@@ -2733,7 +2732,10 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) + unsigned i; + + trans_for_each_path_inorder(trans, path, i) { +- BUG_ON(prev && btree_path_cmp(prev, path) > 0); ++ if (prev && btree_path_cmp(prev, path) > 0) { ++ bch2_dump_trans_paths_updates(trans); ++ panic("trans paths out of order!\n"); ++ } + prev = path; + } + #endif +-- +cgit v1.2.3 + + +From 9f38b0ca5d6efdc01ce2c0dcb3c4ce47c48de880 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 25 Feb 2022 13:17:48 -0500 +Subject: bcachefs: Convert bch2_pd_controller_print_debug() to a printbuf + +Fewer random on-stack char arrays. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/util.c | 68 ++++++++++++++++++++++++++++++------------------------ + fs/bcachefs/util.h | 8 ++++--- + 2 files changed, 43 insertions(+), 33 deletions(-) + +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index 26d0ae304cb2..e1b55fe844d6 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -482,36 +482,44 @@ void bch2_pd_controller_init(struct bch_pd_controller *pd) + pd->backpressure = 1; + } + +-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) +-{ +- /* 2^64 - 1 is 20 digits, plus null byte */ +- char rate[21]; +- char actual[21]; +- char target[21]; +- char proportional[21]; +- char derivative[21]; +- char change[21]; +- s64 next_io; +- +- bch2_hprint(&PBUF(rate), pd->rate.rate); +- bch2_hprint(&PBUF(actual), pd->last_actual); +- bch2_hprint(&PBUF(target), pd->last_target); +- bch2_hprint(&PBUF(proportional), pd->last_proportional); +- bch2_hprint(&PBUF(derivative), pd->last_derivative); +- bch2_hprint(&PBUF(change), pd->last_change); +- +- next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); +- +- return sprintf(buf, +- "rate:\t\t%s/sec\n" +- "target:\t\t%s\n" +- "actual:\t\t%s\n" +- "proportional:\t%s\n" +- "derivative:\t%s\n" +- "change:\t\t%s/sec\n" +- "next io:\t%llims\n", +- rate, target, actual, proportional, +- derivative, change, next_io); ++void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) ++{ ++ out->tabstops[0] = 20; ++ ++ pr_buf(out, "rate:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->rate.rate); ++ pr_newline(out); ++ ++ pr_buf(out, "target:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_target); ++ pr_newline(out); ++ ++ pr_buf(out, "actual:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_actual); ++ pr_newline(out); ++ ++ pr_buf(out, "proportional:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_proportional); ++ pr_newline(out); ++ ++ pr_buf(out, "derivative:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_derivative); ++ 
pr_newline(out); ++ ++ pr_buf(out, "change:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_change); ++ pr_newline(out); ++ ++ pr_buf(out, "next io:"); ++ pr_tab(out); ++ pr_buf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); ++ pr_newline(out); + } + + /* misc: */ +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 9e0a3b46060b..895dc3aa1968 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -578,7 +578,7 @@ struct bch_pd_controller { + + void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); + void bch2_pd_controller_init(struct bch_pd_controller *); +-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); ++void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); + + #define sysfs_pd_controller_attribute(name) \ + rw_attribute(name##_rate); \ +@@ -601,8 +601,10 @@ do { \ + sysfs_print(name##_rate_d_term, (var)->d_term); \ + sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ + \ +- if (attr == &sysfs_##name##_rate_debug) \ +- return bch2_pd_controller_print_debug(var, buf); \ ++ if (attr == &sysfs_##name##_rate_debug) { \ ++ bch2_pd_controller_debug_to_text(&out, var); \ ++ return out.pos - buf; \ ++ } \ + } while (0) + + #define sysfs_pd_controller_store(name, var) \ +-- +cgit v1.2.3 + + +From 6925f68a944627086df7ddcaa353d7fb9d2295d6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 25 Feb 2022 13:18:19 -0500 +Subject: bcachefs: Heap allocate printbufs + +This patch changes printbufs dynamically allocate and reallocate a +buffer as needed. Stack usage has become a bit of a problem, and a major +cause of that has been static size string buffers on the stack. + +The most involved part of this refactoring is that printbufs must now be +exited with printbuf_exit(). 
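+
+The converted call sites in this patch all follow the same pattern; as a
+rough sketch of the new usage (every name here is taken from the hunks
+below, nothing new is introduced):
+
+    struct printbuf buf = PRINTBUF;     /* empty; buffer is allocated on demand */
+
+    bch2_bkey_val_to_text(&buf, c, k);  /* *_to_text() helpers take a printbuf */
+    pr_buf(&buf, " extra context");
+    bch_err(c, "bad key: %s", buf.buf); /* formatted text lives in buf.buf */
+
+    printbuf_reset(&buf);               /* reuse the same heap buffer */
+    bch2_bpos_to_text(&buf, k.k->p);
+
+    printbuf_exit(&buf);                /* mandatory: frees the allocation */
+
+Callers that must report allocation failure can check buf.allocation_failure
+before exiting the printbuf, as bch2_show_options() does below.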
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bkey.c | 9 +- + fs/bcachefs/bset.c | 51 +++++----- + fs/bcachefs/btree_cache.c | 18 ++-- + fs/bcachefs/btree_gc.c | 181 +++++++++++++++++++++------------ + fs/bcachefs/btree_io.c | 103 ++++++++++--------- + fs/bcachefs/btree_iter.c | 76 ++++++++------ + fs/bcachefs/btree_update_interior.c | 31 +++--- + fs/bcachefs/btree_update_leaf.c | 7 +- + fs/bcachefs/buckets.c | 119 +++++++++++++--------- + fs/bcachefs/debug.c | 42 ++++---- + fs/bcachefs/ec.c | 7 +- + fs/bcachefs/fs.c | 13 ++- + fs/bcachefs/fsck.c | 178 ++++++++++++++++++++------------- + fs/bcachefs/io.c | 14 +-- + fs/bcachefs/journal.c | 24 +++-- + fs/bcachefs/journal_io.c | 53 +++++----- + fs/bcachefs/journal_reclaim.c | 11 +- + fs/bcachefs/rebalance.c | 42 +++++--- + fs/bcachefs/recovery.c | 22 +++- + fs/bcachefs/replicas.c | 7 +- + fs/bcachefs/super-io.c | 33 ++---- + fs/bcachefs/super.c | 43 +++----- + fs/bcachefs/sysfs.c | 193 +++++++++++++++--------------------- + fs/bcachefs/tests.c | 14 ++- + fs/bcachefs/util.c | 35 ++++++- + fs/bcachefs/util.h | 76 +++++++------- + fs/bcachefs/xattr.c | 22 ++-- + 27 files changed, 805 insertions(+), 619 deletions(-) + +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +index 946dd27f09fc..4b01ab3029a2 100644 +--- a/fs/bcachefs/bkey.c ++++ b/fs/bcachefs/bkey.c +@@ -57,11 +57,12 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, + tmp = __bch2_bkey_unpack_key(format, packed); + + if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { +- char buf1[160], buf2[160]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + char buf3[160], buf4[160]; + +- bch2_bkey_to_text(&PBUF(buf1), unpacked); +- bch2_bkey_to_text(&PBUF(buf2), &tmp); ++ bch2_bkey_to_text(&buf1, unpacked); ++ bch2_bkey_to_text(&buf2, &tmp); + bch2_to_binary(buf3, (void *) unpacked, 80); + bch2_to_binary(buf4, high_word(format, packed), 80); + +@@ -72,7 +73,7 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, + format->bits_per_field[2], + format->bits_per_field[3], + format->bits_per_field[4], +- buf1, buf2, buf3, buf4); ++ buf1.buf, buf2.buf, buf3, buf4); + } + } + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 6000a8796bc5..c7a41d0dc781 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -70,7 +70,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, + struct bkey_packed *_k, *_n; + struct bkey uk, n; + struct bkey_s_c k; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + + if (!i->u64s) + return; +@@ -81,12 +81,14 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, + _n = bkey_next(_k); + + k = bkey_disassemble(b, _k, &uk); ++ ++ printbuf_reset(&buf); + if (c) +- bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_bkey_val_to_text(&buf, c, k); + else +- bch2_bkey_to_text(&PBUF(buf), k.k); ++ bch2_bkey_to_text(&buf, k.k); + printk(KERN_ERR "block %u key %5zu: %s\n", set, +- _k->_data - i->_data, buf); ++ _k->_data - i->_data, buf.buf); + + if (_n == vstruct_last(i)) + continue; +@@ -102,6 +104,8 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, + !bpos_cmp(n.p, k.k->p)) + printk(KERN_ERR "Duplicate keys\n"); + } ++ ++ printbuf_exit(&buf); + } + + void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) +@@ -118,6 +122,7 @@ void bch2_dump_btree_node_iter(struct btree *b, + struct btree_node_iter *iter) + { + struct btree_node_iter_set *set; ++ struct printbuf buf = PRINTBUF; + + printk(KERN_ERR "btree node iter with %u/%u sets:\n", + __btree_node_iter_used(iter), 
b->nsets); +@@ -126,12 +131,14 @@ void bch2_dump_btree_node_iter(struct btree *b, + struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey uk = bkey_unpack_key(b, k); +- char buf[100]; + +- bch2_bkey_to_text(&PBUF(buf), &uk); ++ printbuf_reset(&buf); ++ bch2_bkey_to_text(&buf, &uk); + printk(KERN_ERR "set %zu key %u: %s\n", +- t - b->set, set->k, buf); ++ t - b->set, set->k, buf.buf); + } ++ ++ printbuf_exit(&buf); + } + + #ifdef CONFIG_BCACHEFS_DEBUG +@@ -167,13 +174,14 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, + struct btree_node_iter_set *set; + struct bkey ku = bkey_unpack_key(b, k); + struct bkey nu = bkey_unpack_key(b, n); +- char buf1[80], buf2[80]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + + bch2_dump_btree_node(NULL, b); +- bch2_bkey_to_text(&PBUF(buf1), &ku); +- bch2_bkey_to_text(&PBUF(buf2), &nu); ++ bch2_bkey_to_text(&buf1, &ku); ++ bch2_bkey_to_text(&buf2, &nu); + printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", +- buf1, buf2); ++ buf1.buf, buf2.buf); + printk(KERN_ERR "iter was:"); + + btree_node_iter_for_each(_iter, set) { +@@ -238,6 +246,8 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); + struct bkey_packed *next = (void *) (where->_data + clobber_u64s); ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + #if 0 + BUG_ON(prev && + bkey_iter_cmp(b, prev, insert) > 0); +@@ -246,17 +256,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + bkey_iter_cmp(b, prev, insert) > 0) { + struct bkey k1 = bkey_unpack_key(b, prev); + struct bkey k2 = bkey_unpack_key(b, insert); +- char buf1[100]; +- char buf2[100]; + + bch2_dump_btree_node(NULL, b); +- bch2_bkey_to_text(&PBUF(buf1), &k1); +- bch2_bkey_to_text(&PBUF(buf2), &k2); ++ bch2_bkey_to_text(&buf1, &k1); ++ bch2_bkey_to_text(&buf2, &k2); + + panic("prev > insert:\n" + "prev key %s\n" + "insert key %s\n", +- buf1, buf2); ++ buf1.buf, buf2.buf); + } + #endif + #if 0 +@@ -267,17 +275,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + bkey_iter_cmp(b, insert, next) > 0) { + struct bkey k1 = bkey_unpack_key(b, insert); + struct bkey k2 = bkey_unpack_key(b, next); +- char buf1[100]; +- char buf2[100]; + + bch2_dump_btree_node(NULL, b); +- bch2_bkey_to_text(&PBUF(buf1), &k1); +- bch2_bkey_to_text(&PBUF(buf2), &k2); ++ bch2_bkey_to_text(&buf1, &k1); ++ bch2_bkey_to_text(&buf2, &k2); + + panic("insert > next:\n" + "insert key %s\n" + "next key %s\n", +- buf1, buf2); ++ buf1.buf, buf2.buf); + } + #endif + } +@@ -1567,9 +1573,6 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + struct bkey uk; + unsigned j, inorder; + +- if (out->pos != out->end) +- *out->pos = '\0'; +- + if (!bset_has_ro_aux_tree(t)) + return; + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 6e6a8e5bcdaf..00d4b18292ae 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -743,14 +743,16 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) + + static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) + { +- char buf1[200], buf2[100], buf3[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + +- 
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key)); +- bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); +- bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&b->key)); ++ bch2_bpos_to_text(&buf2, b->data->min_key); ++ bch2_bpos_to_text(&buf3, b->data->max_key); + + bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" + "btree %s level %u\n" +@@ -758,10 +760,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) + "header: btree %s level %llu\n" + "min %s max %s\n", + bch2_btree_ids[b->c.btree_id], b->c.level, +- buf1, ++ buf1.buf, + bch2_btree_ids[BTREE_NODE_ID(b->data)], + BTREE_NODE_LEVEL(b->data), +- buf2, buf3); ++ buf2.buf, buf3.buf); ++ ++ printbuf_exit(&buf3); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + } + + static inline void btree_check_header(struct bch_fs *c, struct btree *b) +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 68e09c5eaf23..88b234f58ef5 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -70,23 +70,23 @@ static int bch2_gc_check_topology(struct bch_fs *c, + struct bpos expected_start = bkey_deleted(&prev->k->k) + ? node_start + : bpos_successor(prev->k->k.p); +- char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); + +- if (bkey_deleted(&prev->k->k)) { +- struct printbuf out = PBUF(buf1); +- pr_buf(&out, "start of node: "); +- bch2_bpos_to_text(&out, node_start); +- } else { +- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); +- } +- + if (bpos_cmp(expected_start, bp->v.min_key)) { + bch2_topology_error(c); + ++ if (bkey_deleted(&prev->k->k)) { ++ pr_buf(&buf1, "start of node: "); ++ bch2_bpos_to_text(&buf1, node_start); ++ } else { ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); ++ } ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); ++ + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| +@@ -95,11 +95,11 @@ static int bch2_gc_check_topology(struct bch_fs *c, + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, +- buf1, +- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) && ++ buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); +- return FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } +@@ -109,6 +109,12 @@ static int bch2_gc_check_topology(struct bch_fs *c, + if (is_last && bpos_cmp(cur.k->k.p, node_end)) { + bch2_topology_error(c); + ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); ++ bch2_bpos_to_text(&buf2, node_end); ++ + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| +@@ -117,18 +123,21 @@ static int bch2_gc_check_topology(struct bch_fs *c, + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, +- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), +- (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) && ++ buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); +- return FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ goto err; + } else { + 
set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } + } + + bch2_bkey_buf_copy(prev, c, cur.k); ++err: + fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; + } + +@@ -251,18 +260,17 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, + struct bpos expected_start = !prev + ? b->data->min_key + : bpos_successor(prev->key.k.p); +- char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + if (!prev) { +- struct printbuf out = PBUF(buf1); +- pr_buf(&out, "start of node: "); +- bch2_bpos_to_text(&out, b->data->min_key); ++ pr_buf(&buf1, "start of node: "); ++ bch2_bpos_to_text(&buf1, b->data->min_key); + } else { +- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key)); ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); + } + +- bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)); ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); + + if (prev && + bpos_cmp(expected_start, cur->data->min_key) > 0 && +@@ -275,8 +283,10 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, + " node %s\n" + " next %s", + bch2_btree_ids[b->c.btree_id], b->c.level, +- buf1, buf2)) +- return DROP_PREV_NODE; ++ buf1.buf, buf2.buf)) { ++ ret = DROP_PREV_NODE; ++ goto out; ++ } + + if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, + bpos_predecessor(cur->data->min_key)), c, +@@ -284,7 +294,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, + " node %s\n" + " next %s", + bch2_btree_ids[b->c.btree_id], b->c.level, +- buf1, buf2)) ++ buf1.buf, buf2.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + } else { +@@ -296,39 +306,49 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, + " prev %s\n" + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, +- buf1, buf2)) +- return DROP_THIS_NODE; ++ buf1.buf, buf2.buf)) { ++ ret = DROP_THIS_NODE; ++ goto out; ++ } + + if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, +- buf1, buf2)) ++ buf1.buf, buf2.buf)) + ret = set_node_min(c, cur, expected_start); + } ++out: + fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; + } + + static int btree_repair_node_end(struct bch_fs *c, struct btree *b, + struct btree *child) + { +- char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); ++ bch2_bpos_to_text(&buf2, b->key.k.p); ++ + if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, +- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1), +- (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) { ++ buf1.buf, buf2.buf)) { + ret = set_node_max(c, child, b->key.k.p); + if (ret) +- return ret; ++ goto err; + } ++err: + fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; + } + +@@ -339,7 +359,7 @@ static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) + struct bkey_buf prev_k, cur_k; + struct btree *prev = NULL, *cur = NULL; + bool have_child, dropped_children = false; +- char buf[200]; ++ struct printbuf buf; + int ret = 0; + + if (!b->c.level) +@@ -363,12 +383,15 @@ 
again: + false); + ret = PTR_ERR_OR_ZERO(cur); + ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); ++ + if (mustfix_fsck_err_on(ret == -EIO, c, + "Unreadable btree node at btree %s level %u:\n" + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, +- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) { ++ buf.buf)) { + bch2_btree_node_evict(c, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); +@@ -468,12 +491,14 @@ again: + have_child = true; + } + ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ + if (mustfix_fsck_err_on(!have_child, c, + "empty interior btree node at btree %s level %u\n" + " %s", + bch2_btree_ids[b->c.btree_id], +- b->c.level, +- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf))) ++ b->c.level, buf.buf)) + ret = DROP_THIS_NODE; + err: + fsck_err: +@@ -489,6 +514,7 @@ fsck_err: + if (!ret && dropped_children) + goto again; + ++ printbuf_exit(&buf); + return ret; + } + +@@ -524,7 +550,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + /* +@@ -542,7 +568,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!p.ptr.cached) { + g->_mark.gen = p.ptr.gen; + g->gen_valid = true; +@@ -557,7 +584,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->mark.gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!p.ptr.cached) { + g->_mark.gen = p.ptr.gen; + g->gen_valid = true; +@@ -576,7 +604,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + + if (fsck_err_on(!p.ptr.cached && +@@ -586,7 +615,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->mark.gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + + if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen) +@@ -599,7 +629,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[g->mark.data_type], + bch2_data_types[data_type], +- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (data_type == BCH_DATA_btree) { + g->_mark.data_type = data_type; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); +@@ -615,14 +646,16 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + "pointer to nonexistent stripe %llu\n" + 
"while marking %s", + (u64) p.ec.idx, +- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + + if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, +- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + } + } +@@ -635,13 +668,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + + if (is_root) { + bch_err(c, "cannot update btree roots yet"); +- return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + + new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); +- return -ENOMEM; ++ ret = -ENOMEM; ++ goto err; + } + + bkey_reassemble(new, *k); +@@ -705,19 +740,25 @@ found: + ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) { + kfree(new); +- return ret; ++ goto err; + } + + if (level) + bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); + +- bch2_bkey_val_to_text(&PBUF(buf), c, *k); +- bch_info(c, "updated %s", buf); +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); +- bch_info(c, "new key %s", buf); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, *k); ++ bch_info(c, "updated %s", buf.buf); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); ++ bch_info(c, "new key %s", buf.buf); ++ + *k = bkey_i_to_s_c(new); + } ++err: + fsck_err: ++ printbuf_exit(&buf); + return ret; + } + +@@ -852,7 +893,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); +@@ -913,7 +954,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, +- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) && ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + bch_info(c, "Halting mark and sweep to start topology repair pass"); +@@ -943,6 +985,7 @@ fsck_err: + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + bch2_btree_and_journal_iter_exit(&iter); ++ printbuf_exit(&buf); + return ret; + } + +@@ -956,7 +999,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, + : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 
1 + : 0; +- char buf[100]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + b = c->btree_roots[btree_id].b; +@@ -965,17 +1008,19 @@ static int bch2_gc_btree_init(struct btree_trans *trans, + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); ++ printbuf_reset(&buf); ++ bch2_bpos_to_text(&buf, b->data->min_key); + if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, +- "btree root with incorrect min_key: %s", +- (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { ++ "btree root with incorrect min_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = FSCK_ERR_EXIT; + goto fsck_err; + } + ++ printbuf_reset(&buf); ++ bch2_bpos_to_text(&buf, b->data->max_key); + if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, +- "btree root with incorrect max_key: %s", +- (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { ++ "btree root with incorrect max_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = FSCK_ERR_EXIT; + goto fsck_err; +@@ -995,6 +1040,7 @@ fsck_err: + + if (ret < 0) + bch_err(c, "%s: ret %i", __func__, ret); ++ printbuf_exit(&buf); + return ret; + } + +@@ -1131,6 +1177,7 @@ static int bch2_gc_done(struct bch_fs *c, + bool initial, bool metadata_only) + { + struct bch_dev *ca = NULL; ++ struct printbuf buf = PRINTBUF; + bool verify = !metadata_only && (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; +@@ -1201,16 +1248,16 @@ static int bch2_gc_done(struct bch_fs *c, + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); +- char buf[80]; + + if (metadata_only && + (e->data_type == BCH_DATA_user || + e->data_type == BCH_DATA_cached)) + continue; + +- bch2_replicas_entry_to_text(&PBUF(buf), e); ++ printbuf_reset(&buf); ++ bch2_replicas_entry_to_text(&buf, e); + +- copy_fs_field(replicas[i], "%s", buf); ++ copy_fs_field(replicas[i], "%s", buf.buf); + } + } + +@@ -1225,6 +1272,7 @@ fsck_err: + bch_err(c, "%s: ret %i", __func__, ret); + + percpu_up_write(&c->mark_lock); ++ printbuf_exit(&buf); + return ret; + } + +@@ -1424,7 +1472,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) + struct bkey_s_c k; + struct reflink_gc *r; + size_t idx = 0; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (metadata_only) +@@ -1452,7 +1500,8 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new; + +@@ -1481,6 +1530,7 @@ fsck_err: + bch2_trans_iter_exit(&trans, &iter); + c->reflink_gc_nr = 0; + bch2_trans_exit(&trans); ++ printbuf_exit(&buf); + return ret; + } + +@@ -1539,7 +1589,7 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) + struct bkey_s_c k; + struct gc_stripe *m; + const struct bch_stripe *s; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + unsigned i; + int ret = 0; + +@@ -1565,7 +1615,8 @@ inconsistent: + "stripe has wrong block sector count %u:\n" + " %s\n" + " should be %u", i, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf), + m ? 
m->block_sectors[i] : 0)) { + struct bkey_i_stripe *new; + +@@ -1589,6 +1640,8 @@ fsck_err: + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); ++ ++ printbuf_exit(&buf); + return ret; + } + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c0b95e663946..2b16b656c9be 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -540,13 +540,7 @@ enum btree_validate_ret { + #define btree_err(type, c, ca, b, i, msg, ...) \ + ({ \ + __label__ out; \ +- char _buf[300]; \ +- char *_buf2 = _buf; \ +- struct printbuf out = PBUF(_buf); \ +- \ +- _buf2 = kmalloc(4096, GFP_ATOMIC); \ +- if (_buf2) \ +- out = _PBUF(_buf2, 4986); \ ++ struct printbuf out = PRINTBUF; \ + \ + btree_err_msg(&out, c, ca, b, i, b->written, write); \ + pr_buf(&out, ": " msg, ##__VA_ARGS__); \ +@@ -554,14 +548,13 @@ enum btree_validate_ret { + if (type == BTREE_ERR_FIXABLE && \ + write == READ && \ + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ +- mustfix_fsck_err(c, "%s", _buf2); \ ++ mustfix_fsck_err(c, "%s", out.buf); \ + goto out; \ + } \ + \ + switch (write) { \ + case READ: \ +- if (_buf2) \ +- bch_err(c, "%s", _buf2); \ ++ bch_err(c, "%s", out.buf); \ + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ +@@ -582,7 +575,7 @@ enum btree_validate_ret { + } \ + break; \ + case WRITE: \ +- bch_err(c, "corrupt metadata before write: %s", _buf2); \ ++ bch_err(c, "corrupt metadata before write: %s", out.buf);\ + \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ +@@ -591,8 +584,7 @@ enum btree_validate_ret { + break; \ + } \ + out: \ +- if (_buf2 != _buf) \ +- kfree(_buf2); \ ++ printbuf_exit(&out); \ + true; \ + }) + +@@ -653,8 +645,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + { + unsigned version = le16_to_cpu(i->version); + const char *err; +- char buf1[100]; +- char buf2[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + int ret = 0; + + btree_err_on((version != BCH_BSET_VERSION_OLD && +@@ -691,7 +683,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + BTREE_ERR_FIXABLE, c, ca, b, i, + "bset past end of btree node")) { + i->u64s = 0; +- return 0; ++ ret = 0; ++ goto out; + } + + btree_err_on(offset && !i->u64s, +@@ -742,14 +735,17 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect min_key: got %s should be %s", +- (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), +- (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); ++ (printbuf_reset(&buf1), ++ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), ++ (printbuf_reset(&buf2), ++ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); + } + + btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect max key %s", +- (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); ++ (printbuf_reset(&buf1), ++ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); + + if (write) + compat_btree_node(b->c.level, b->c.btree_id, version, +@@ -764,7 +760,10 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + BSET_BIG_ENDIAN(i), write, + &bn->format); + } ++out: + fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; + } + +@@ -774,6 +773,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + { + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = 
PRINTBUF; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + int ret = 0; +@@ -812,11 +813,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: + (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); + if (invalid) { +- char buf[160]; +- +- bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ printbuf_reset(&buf1); ++ bch2_bkey_val_to_text(&buf1, c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, +- "invalid bkey: %s\n%s", invalid, buf); ++ "invalid bkey: %s\n%s", invalid, buf1.buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), +@@ -830,18 +830,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + &b->format, k); + + if (prev && bkey_iter_cmp(b, prev, k) > 0) { +- char buf1[80]; +- char buf2[80]; + struct bkey up = bkey_unpack_key(b, prev); + +- bch2_bkey_to_text(&PBUF(buf1), &up); +- bch2_bkey_to_text(&PBUF(buf2), u.k); ++ printbuf_reset(&buf1); ++ bch2_bkey_to_text(&buf1, &up); ++ printbuf_reset(&buf2); ++ bch2_bkey_to_text(&buf2, u.k); + + bch2_dump_bset(c, b, i, 0); + + if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "keys out of order: %s > %s", +- buf1, buf2)) { ++ buf1.buf, buf2.buf)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); +@@ -853,6 +853,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + k = bkey_next(k); + } + fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; + } + +@@ -1068,11 +1070,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + if (invalid || + (bch2_inject_invalid_keys && + !bversion_cmp(u.k->version, MAX_VERSION))) { +- char buf[160]; ++ struct printbuf buf = PRINTBUF; + +- bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ bch2_bkey_val_to_text(&buf, c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "invalid bkey %s: %s", buf, invalid); ++ printbuf_exit(&buf); + + btree_keys_account_key_drop(&b->nr, 0, k); + +@@ -1129,8 +1132,7 @@ static void btree_node_read_work(struct work_struct *work) + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bio *bio = &rb->bio; + struct bch_io_failures failed = { .nr = 0 }; +- char buf[200]; +- struct printbuf out; ++ struct printbuf buf = PRINTBUF; + bool saw_error = false; + bool can_retry; + +@@ -1151,10 +1153,10 @@ static void btree_node_read_work(struct work_struct *work) + bio->bi_status = BLK_STS_REMOVED; + } + start: +- out = PBUF(buf); +- btree_pos_to_text(&out, c, b); ++ printbuf_reset(&buf); ++ btree_pos_to_text(&buf, c, b); + bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", +- bch2_blk_status_to_str(bio->bi_status), buf); ++ bch2_blk_status_to_str(bio->bi_status), buf.buf); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; +@@ -1180,6 +1182,7 @@ start: + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); ++ printbuf_exit(&buf); + + if (saw_error && !btree_node_read_error(b)) + bch2_btree_node_rewrite_async(c, b); +@@ -1260,6 +1263,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) + container_of(cl, struct btree_node_read_all, cl); + struct bch_fs *c = ra->c; + struct btree *b = ra->b; ++ struct printbuf buf = PRINTBUF; + bool dump_bset_maps = false; + bool have_retry = false; + int ret = 0, 
best = -1, write = READ; +@@ -1303,8 +1307,6 @@ static void btree_node_read_all_replicas_done(struct closure *cl) + fsck_err: + if (dump_bset_maps) { + for (i = 0; i < ra->nr; i++) { +- char buf[200]; +- struct printbuf out = PBUF(buf); + struct btree_node *bn = ra->buf[i]; + struct btree_node_entry *bne = NULL; + unsigned offset = 0, sectors; +@@ -1313,6 +1315,8 @@ fsck_err: + if (ra->err[i]) + continue; + ++ printbuf_reset(&buf); ++ + while (offset < btree_sectors(c)) { + if (!offset) { + sectors = vstruct_sectors(bn, c->block_bits); +@@ -1323,10 +1327,10 @@ fsck_err: + sectors = vstruct_sectors(bne, c->block_bits); + } + +- pr_buf(&out, " %u-%u", offset, offset + sectors); ++ pr_buf(&buf, " %u-%u", offset, offset + sectors); + if (bne && bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) +- pr_buf(&out, "*"); ++ pr_buf(&buf, "*"); + offset += sectors; + } + +@@ -1334,19 +1338,19 @@ fsck_err: + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq == bn->keys.seq) { + if (!gap) +- pr_buf(&out, " GAP"); ++ pr_buf(&buf, " GAP"); + gap = true; + + sectors = vstruct_sectors(bne, c->block_bits); +- pr_buf(&out, " %u-%u", offset, offset + sectors); ++ pr_buf(&buf, " %u-%u", offset, offset + sectors); + if (bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) +- pr_buf(&out, "*"); ++ pr_buf(&buf, "*"); + } + offset++; + } + +- bch_err(c, "replica %u:%s", i, buf); ++ bch_err(c, "replica %u:%s", i, buf.buf); + } + } + +@@ -1367,6 +1371,7 @@ fsck_err: + + closure_debug_destroy(&ra->cl); + kfree(ra); ++ printbuf_exit(&buf); + + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); +@@ -1466,23 +1471,23 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + struct btree_read_bio *rb; + struct bch_dev *ca; + struct bio *bio; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret; + +- btree_pos_to_text(&PBUF(buf), c, b); ++ btree_pos_to_text(&buf, c, b); + trace_btree_read(c, b); + + if (bch2_verify_all_btree_replicas && + !btree_node_read_all_replicas(c, b, sync)) +- return; ++ goto out; + + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick); + if (bch2_fs_fatal_err_on(ret <= 0, c, + "btree node read error: no device to read from\n" +- " at %s", buf)) { ++ " at %s", buf.buf)) { + set_btree_node_read_error(b); +- return; ++ goto out; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); +@@ -1523,6 +1528,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + else + queue_work(c->io_complete_wq, &rb->work); + } ++out: ++ printbuf_exit(&buf); + } + + int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 4c1c3ffe82a5..7244528ab6a6 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -589,7 +589,9 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_node_iter tmp; + bool locked; + struct bkey_packed *p, *k; +- char buf1[100], buf2[100], buf3[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; + const char *msg; + + if (!bch2_debug_check_iterators) +@@ -637,26 +639,27 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, + btree_node_unlock(path, level); + return; + err: +- strcpy(buf2, "(none)"); +- strcpy(buf3, "(none)"); +- +- bch2_bpos_to_text(&PBUF(buf1), path->pos); ++ bch2_bpos_to_text(&buf1, path->pos); + + if (p) { + struct bkey uk = 
bkey_unpack_key(l->b, p); +- bch2_bkey_to_text(&PBUF(buf2), &uk); ++ bch2_bkey_to_text(&buf2, &uk); ++ } else { ++ pr_buf(&buf2, "(none)"); + } + + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); +- bch2_bkey_to_text(&PBUF(buf3), &uk); ++ bch2_bkey_to_text(&buf3, &uk); ++ } else { ++ pr_buf(&buf3, "(none)"); + } + + panic("path should be %s key at level %u:\n" + "path pos %s\n" + "prev key %s\n" + "cur key %s\n", +- msg, level, buf1, buf2, buf3); ++ msg, level, buf1.buf, buf2.buf, buf3.buf); + } + + static void bch2_btree_path_verify(struct btree_trans *trans, +@@ -754,16 +757,16 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k + if (!bkey_cmp(prev.k->p, k.k->p) && + bch2_snapshot_is_ancestor(trans->c, iter->snapshot, + prev.k->p.snapshot) > 0) { +- char buf1[100], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + +- bch2_bkey_to_text(&PBUF(buf1), k.k); +- bch2_bkey_to_text(&PBUF(buf2), prev.k); ++ bch2_bkey_to_text(&buf1, k.k); ++ bch2_bkey_to_text(&buf2, prev.k); + + panic("iter snap %u\n" + "k %s\n" + "prev %s\n", + iter->snapshot, +- buf1, buf2); ++ buf1.buf, buf2.buf); + } + out: + bch2_trans_iter_exit(trans, ©); +@@ -775,7 +778,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + { + struct btree_path *path; + unsigned idx; +- char buf[100]; ++ struct printbuf buf = PRINTBUF; + + trans_for_each_path_inorder(trans, path, idx) { + int cmp = cmp_int(path->btree_id, id) ?: +@@ -801,9 +804,10 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + } + + bch2_dump_trans_paths_updates(trans); ++ bch2_bpos_to_text(&buf, pos); ++ + panic("not locked: %s %s%s\n", +- bch2_btree_ids[id], +- (bch2_bpos_to_text(&PBUF(buf), pos), buf), ++ bch2_btree_ids[id], buf.buf, + key_cache ? " cached" : ""); + } + +@@ -1084,23 +1088,23 @@ static void btree_path_verify_new_node(struct btree_trans *trans, + if (!k || + bkey_deleted(k) || + bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { +- char buf1[100]; +- char buf2[100]; +- char buf3[100]; +- char buf4[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; ++ struct printbuf buf4 = PRINTBUF; + struct bkey uk = bkey_unpack_key(b, k); + + bch2_dump_btree_node(c, l->b); +- bch2_bpos_to_text(&PBUF(buf1), path->pos); +- bch2_bkey_to_text(&PBUF(buf2), &uk); +- bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); +- bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); ++ bch2_bpos_to_text(&buf1, path->pos); ++ bch2_bkey_to_text(&buf2, &uk); ++ bch2_bpos_to_text(&buf3, b->data->min_key); ++ bch2_bpos_to_text(&buf3, b->data->max_key); + panic("parent iter doesn't point to new node:\n" + "iter pos %s %s\n" + "iter key %s\n" + "new node %s-%s\n", +- bch2_btree_ids[path->btree_id], buf1, +- buf2, buf3, buf4); ++ bch2_btree_ids[path->btree_id], ++ buf1.buf, buf2.buf, buf3.buf, buf4.buf); + } + + if (!parent_locked) +@@ -1803,16 +1807,20 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + { + struct btree_path *path; + struct btree_insert_entry *i; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + unsigned idx; +- char buf1[300], buf2[300]; + +- trans_for_each_path_inorder(trans, path, idx) ++ trans_for_each_path_inorder(trans, path, idx) { ++ printbuf_reset(&buf1); ++ ++ bch2_bpos_to_text(&buf1, path->pos); ++ + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", + path->idx, path->ref, path->intent_ref, + path->should_be_locked ? " S" : "", + path->preserve ? 
" P" : "", + bch2_btree_ids[path->btree_id], +- (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), ++ buf1.buf, + path->nodes_locked, + #ifdef CONFIG_BCACHEFS_DEBUG + (void *) path->ip_allocated +@@ -1820,17 +1828,25 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + NULL + #endif + ); ++ } + + trans_for_each_update(trans, i) { + struct bkey u; + struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); + ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ bch2_bkey_val_to_text(&buf1, trans->c, old); ++ bch2_bkey_val_to_text(&buf2, trans->c, bkey_i_to_s_c(i->k)); ++ + printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", + bch2_btree_ids[i->btree_id], + (void *) i->ip_allocated, +- (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), +- (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); ++ buf1.buf, buf2.buf); + } ++ ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + } + + static struct btree_path *btree_path_alloc(struct btree_trans *trans, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index b6758a7c4056..31ec5076fa12 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -41,7 +41,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + struct bkey_s_c k; + struct bkey_s_c_btree_ptr_v2 bp; + struct bkey unpacked; +- char buf1[100], buf2[100]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + BUG_ON(!b->c.level); + +@@ -58,9 +58,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + + if (bpos_cmp(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); +- panic("expected next min_key %s got %s\n", +- (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), +- (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2)); ++ bch2_bpos_to_text(&buf1, next_node); ++ bch2_bpos_to_text(&buf2, bp.v->min_key); ++ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); + } + + bch2_btree_node_iter_advance(&iter, b); +@@ -68,9 +68,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) + if (bch2_btree_node_iter_end(&iter)) { + if (bpos_cmp(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); +- panic("expected end %s got %s\n", +- (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), +- (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2)); ++ bch2_bpos_to_text(&buf1, b->key.k.p); ++ bch2_bpos_to_text(&buf2, k.k->p); ++ panic("expected end %s got %s\n", buf1.buf, buf2.buf); + } + break; + } +@@ -1143,10 +1143,11 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); + if (invalid) { +- char buf[160]; ++ struct printbuf buf = PRINTBUF; + +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); +- bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); ++ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf.buf, invalid); ++ printbuf_exit(&buf); + dump_stack(); + } + +@@ -1628,15 +1629,17 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + } + + if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { +- char buf1[100], buf2[100]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + +- bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); +- bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); ++ bch2_bpos_to_text(&buf1, 
prev->data->max_key); ++ bch2_bpos_to_text(&buf2, next->data->min_key); + bch_err(c, + "btree topology error in btree merge:\n" + " prev ends at %s\n" + " next starts at %s", +- buf1, buf2); ++ buf1.buf, buf2.buf); ++ printbuf_exit(&buf1); ++ printbuf_exit(&buf2); + bch2_topology_error(c); + ret = -EIO; + goto err; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 2681ff64eb39..db4aa667487d 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -824,11 +824,12 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", +- buf, trans->fn, (void *) i->ip_allocated, invalid); ++ buf.buf, trans->fn, (void *) i->ip_allocated, invalid); ++ printbuf_exit(&buf); + return -EINVAL; + } + btree_insert_entry_checks(trans, i); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 5bd89fc43add..2c3b71b2f04e 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -373,22 +373,23 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, + { + struct bch_fs_usage __percpu *fs_usage; + int idx, ret = 0; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + + percpu_down_read(&c->mark_lock); ++ buf.atomic++; + + idx = bch2_replicas_entry_idx(c, r); + if (idx < 0 && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err(c, "no replicas entry\n" + " while marking %s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) { ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + percpu_up_read(&c->mark_lock); + ret = bch2_mark_replicas(c, r); +- if (ret) +- return ret; +- + percpu_down_read(&c->mark_lock); ++ ++ if (ret) ++ goto err; + idx = bch2_replicas_entry_idx(c, r); + } + if (idx < 0) { +@@ -404,6 +405,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, + err: + fsck_err: + percpu_up_read(&c->mark_lock); ++ printbuf_exit(&buf); + return ret; + } + +@@ -674,7 +676,8 @@ static int check_bucket_ref(struct bch_fs *c, + u16 bucket_sectors = !ptr->cached + ? 
dirty_sectors + : cached_sectors; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + + if (gen_after(ptr->gen, b_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +@@ -683,8 +686,9 @@ static int check_bucket_ref(struct bch_fs *c, + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- return -EIO; ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + + if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { +@@ -694,8 +698,10 @@ static int check_bucket_ref(struct bch_fs *c, + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + + if (b_gen != ptr->gen && !ptr->cached) { +@@ -706,12 +712,16 @@ static int check_bucket_ref(struct bch_fs *c, + *bucket_gen(ca, bucket_nr), + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + +- if (b_gen != ptr->gen) +- return 1; ++ if (b_gen != ptr->gen) { ++ ret = 1; ++ goto err; ++ } + + if (bucket_data_type && ptr_data_type && + bucket_data_type != ptr_data_type) { +@@ -721,8 +731,10 @@ static int check_bucket_ref(struct bch_fs *c, + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type], + bch2_data_types[ptr_data_type], +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + + if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { +@@ -732,11 +744,14 @@ static int check_bucket_ref(struct bch_fs *c, + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + bucket_sectors, sectors, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); +- return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } +- +- return 0; ++err: ++ printbuf_exit(&buf); ++ return ret; + } + + static int mark_stripe_bucket(struct btree_trans *trans, +@@ -755,7 +770,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g; + struct bucket_mark new, old; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); +@@ -763,6 +778,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, + /* * XXX doesn't handle deletion */ + + percpu_down_read(&c->mark_lock); ++ buf.atomic++; + g = PTR_GC_BUCKET(ca, ptr); + + if (g->mark.dirty_sectors || +@@ -770,7 +786,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EINVAL; + goto err; + } +@@ -795,8 +811,8 @@ static int mark_stripe_bucket(struct btree_trans *trans, + bch2_dev_usage_update(c, ca, old, new, journal_seq, true); + err: + percpu_up_read(&c->mark_lock); +- +- return 0; ++ printbuf_exit(&buf); ++ return ret; + } + + static int __mark_pointer(struct btree_trans *trans, +@@ -983,10 +999,11 @@ 
static int bch2_mark_extent(struct btree_trans *trans, + if (r.e.nr_devs) { + ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); + if (ret) { +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + +- bch2_bkey_val_to_text(&PBUF(buf), c, k); +- bch2_fs_fatal_error(c, "no replicas entry for %s", buf); ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ printbuf_exit(&buf); + return ret; + } + } +@@ -1015,13 +1032,16 @@ static int bch2_mark_stripe(struct btree_trans *trans, + struct stripe *m = genradix_ptr(&c->stripes, idx); + + if (!m || (old_s && !m->alive)) { +- char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + +- bch2_bkey_val_to_text(&PBUF(buf1), c, old); +- bch2_bkey_val_to_text(&PBUF(buf2), c, new); ++ bch2_bkey_val_to_text(&buf1, c, old); ++ bch2_bkey_val_to_text(&buf2, c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" + "old %s\n" +- "new %s", idx, buf1, buf2); ++ "new %s", idx, buf1.buf, buf2.buf); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + bch2_inconsistent_error(c); + return -1; + } +@@ -1086,10 +1106,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, + ((s64) m->sectors * m->nr_redundant), + journal_seq, gc); + if (ret) { +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + +- bch2_bkey_val_to_text(&PBUF(buf), c, new); +- bch2_fs_fatal_error(c, "no replicas entry for %s", buf); ++ bch2_bkey_val_to_text(&buf, c, new); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ printbuf_exit(&buf); + return ret; + } + } +@@ -1170,7 +1191,7 @@ static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + u64 next_idx = end; + s64 ret = 0; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + + if (r_idx >= c->reflink_gc_nr) + goto not_found; +@@ -1189,7 +1210,7 @@ not_found: + if (fsck_err(c, "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", +- (bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c), buf), ++ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + *idx, next_idx)) { + struct bkey_i_error new; + +@@ -1203,6 +1224,7 @@ not_found: + + *idx = next_idx; + fsck_err: ++ printbuf_exit(&buf); + return ret; + } + +@@ -1285,7 +1307,7 @@ void fs_usage_apply_warn(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + + bch_err(c, "disk usage increased %lli more than %u sectors reserved", + should_not_have_added, disk_res_sectors); +@@ -1294,13 +1316,17 @@ void fs_usage_apply_warn(struct btree_trans *trans, + struct bkey_s_c old = { &i->old_k, i->old_v }; + + pr_err("while inserting"); +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); +- pr_err(" %s", buf); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); ++ pr_err(" %s", buf.buf); + pr_err("overlapping with"); +- bch2_bkey_val_to_text(&PBUF(buf), c, old); +- pr_err(" %s", buf); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, old); ++ pr_err(" %s", buf.buf); + } ++ + __WARN(); ++ printbuf_exit(&buf); + } + + int bch2_trans_fs_usage_apply(struct btree_trans *trans, +@@ -1740,7 +1766,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_i *n; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), +@@ -1760,19 +1786,19 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + + refcount = bkey_refcount(n); + if (!refcount) { +- bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); ++ bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_fs_inconsistent(c, + "nonexistent indirect extent at %llu while marking\n %s", +- *idx, buf); ++ *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { +- bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); ++ bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_fs_inconsistent(c, + "indirect extent refcount underflow at %llu while marking\n %s", +- *idx, buf); ++ *idx, buf.buf); + ret = -EIO; + goto err; + } +@@ -1807,6 +1833,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + *idx = k.k->p.offset; + err: + bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); + return ret; + } + +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index ee5b7f696796..ee22ed31ce37 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -169,10 +169,11 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + failed |= bch2_btree_verify_replica(c, b, p); + + if (failed) { +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); +- bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); ++ printbuf_exit(&buf); + } + out: + mutex_unlock(&c->verify_lock); +@@ -188,8 +189,7 @@ struct dump_iter { + struct bch_fs *c; + enum btree_id id; + +- char buf[1 << 12]; +- size_t bytes; /* what's currently in buf */ ++ struct printbuf buf; + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ +@@ -198,9 +198,9 @@ struct dump_iter { + + static int flush_buf(struct dump_iter *i) + { +- if (i->bytes) { +- size_t bytes = min(i->bytes, i->size); +- int err = copy_to_user(i->ubuf, i->buf, bytes); ++ if (i->buf.pos) { ++ size_t bytes = min_t(size_t, i->buf.pos, i->size); ++ int err = copy_to_user(i->ubuf, i->buf.buf, bytes); + + if (err) + return err; +@@ -208,8 +208,8 @@ static int flush_buf(struct dump_iter *i) + i->ret += bytes; + i->ubuf += bytes; + i->size -= bytes; +- i->bytes -= bytes; +- memmove(i->buf, i->buf + bytes, i->bytes); ++ i->buf.pos -= bytes; ++ memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); + } + + return 0; +@@ -228,13 +228,17 @@ static int bch2_dump_open(struct inode *inode, struct file *file) + i->from = POS_MIN; + i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); + i->id = bd->id; ++ i->buf = PRINTBUF; + + return 0; + } + + static int bch2_dump_release(struct inode *inode, struct file *file) + { +- kfree(file->private_data); ++ struct dump_iter *i = file->private_data; ++ ++ printbuf_exit(&i->buf); ++ kfree(i); + return 0; + } + +@@ -266,11 +270,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, + k = bch2_btree_iter_peek(&iter); + + while (k.k && !(err = bkey_err(k))) { +- bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); +- i->bytes = strlen(i->buf); +- BUG_ON(i->bytes >= sizeof(i->buf)); +- i->buf[i->bytes] = '\n'; +- i->bytes++; ++ bch2_bkey_val_to_text(&i->buf, i->c, k); ++ pr_char(&i->buf, '\n'); + + k = bch2_btree_iter_next(&iter); + i->from = iter.pos; +@@ 
-319,8 +320,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + bch2_trans_init(&trans, i->c, 0, 0); + + for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { +- bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); +- i->bytes = strlen(i->buf); ++ bch2_btree_node_to_text(&i->buf, i->c, b); + err = flush_buf(i); + if (err) + break; +@@ -384,16 +384,14 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + bch2_btree_node_iter_peek(&l->iter, l->b); + + if (l->b != prev_node) { +- bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); +- i->bytes = strlen(i->buf); ++ bch2_btree_node_to_text(&i->buf, i->c, l->b); + err = flush_buf(i); + if (err) + break; + } + prev_node = l->b; + +- bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); +- i->bytes = strlen(i->buf); ++ bch2_bfloat_to_text(&i->buf, l->b, _k); + err = flush_buf(i); + if (err) + break; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 9b45640e75dc..6027a7d42981 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -286,14 +286,15 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) + struct bch_csum got = ec_block_checksum(buf, i, offset); + + if (bch2_crc_cmp(want, got)) { +- char buf2[200]; ++ struct printbuf buf2 = PRINTBUF; + +- bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); + + bch_err_ratelimited(c, + "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", + (void *) _RET_IP_, i, j, v->csum_type, +- want.lo, got.lo, buf2); ++ want.lo, got.lo, buf2.buf); ++ printbuf_exit(&buf2); + clear_bit(i, buf->valid); + break; + } +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 0c3c271ca143..36d966f8ba77 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1675,7 +1675,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) + { + struct bch_fs *c = root->d_sb->s_fs_info; + enum bch_opt_id i; +- char buf[512]; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; +@@ -1687,13 +1688,17 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + +- bch2_opt_to_text(&PBUF(buf), c, opt, v, ++ printbuf_reset(&buf); ++ bch2_opt_to_text(&buf, c, opt, v, + OPT_SHOW_MOUNT_STYLE); + seq_putc(seq, ','); +- seq_puts(seq, buf); ++ seq_puts(seq, buf.buf); + } + +- return 0; ++ if (buf.allocation_failure) ++ ret = -ENOMEM; ++ printbuf_exit(&buf); ++ return ret; + } + + static void bch2_put_super(struct super_block *sb) +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index ced4d671eb8d..8783b950055e 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -698,15 +698,16 @@ static int check_key_has_snapshot(struct btree_trans *trans, + struct bkey_s_c k) + { + struct bch_fs *c = trans->c; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + "key in missing snapshot: %s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) +- return bch2_btree_delete_at(trans, iter, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; + fsck_err: ++ printbuf_exit(&buf); + return ret; + } + +@@ -746,7 +747,7 @@ static int hash_check_key(struct btree_trans *trans, + { + struct 
bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + u64 hash; + int ret = 0; +@@ -770,8 +771,9 @@ static int hash_check_key(struct btree_trans *trans, + if (fsck_err_on(k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k), c, + "duplicate hash table keys:\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, +- hash_k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), ++ buf.buf))) { + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; + break; + } +@@ -782,13 +784,16 @@ static int hash_check_key(struct btree_trans *trans, + } + + } ++out: + bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); + return ret; + bad_hash: + if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " + "hashed to %llu\n%s", + desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash, +- (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE) + return 0; + + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); +@@ -796,9 +801,9 @@ bad_hash: + bch_err(c, "hash_redo_key err %i", ret); + return ret; + } +- return -EINTR; ++ ret = -EINTR; + fsck_err: +- return ret; ++ goto out; + } + + static int check_inode(struct btree_trans *trans, +@@ -1166,32 +1171,34 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct inode_walker_entry *i; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + k = bch2_btree_iter_peek(iter); + if (!k.k) +- return 0; ++ goto out; + + ret = bkey_err(k); + if (ret) +- return ret; ++ goto err; + + ret = check_key_has_snapshot(trans, iter, k); +- if (ret) +- return ret < 0 ? ret : 0; ++ if (ret) { ++ ret = ret < 0 ? 
ret : 0; ++ goto out; ++ } + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) +- return ret; ++ goto err; + + if (k.k->type == KEY_TYPE_whiteout) +- return 0; ++ goto out; + + if (inode->cur_inum != k.k->p.inode) { + ret = check_i_sectors(trans, inode); + if (ret) +- return ret; ++ goto err; + } + #if 0 + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { +@@ -1201,22 +1208,29 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); + bch2_bkey_val_to_text(&PBUF(buf2), c, k); + +- if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) +- return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ++ ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; ++ goto out; ++ } + } + #endif + ret = __walk_inode(trans, inode, k.k->p); + if (ret < 0) +- return ret; ++ goto err; + + if (fsck_err_on(ret == INT_MAX, c, + "extent in missing inode:\n %s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) +- return bch2_btree_delete_at(trans, iter, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } + +- if (ret == INT_MAX) +- return 0; ++ if (ret == INT_MAX) { ++ ret = 0; ++ goto out; ++ } + + i = inode->d + ret; + ret = 0; +@@ -1225,9 +1239,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + !S_ISLNK(i->inode.bi_mode), c, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) +- return bch2_btree_delete_at(trans, iter, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } + + if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { +@@ -1237,11 +1254,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { + bch2_fs_lazy_rw(c); +- return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, + k.k->p.snapshot), + POS(k.k->p.inode, U64_MAX), + 0, NULL) ?: -EINTR; ++ goto out; + } + } + } +@@ -1253,7 +1271,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + bch2_bkey_buf_reassemble(&prev, c, k); + #endif + ++out: ++err: + fsck_err: ++ printbuf_exit(&buf); + return ret; + } + +@@ -1351,7 +1372,7 @@ static int check_dirent_target(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + bool backpointer_exists = true; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!target->bi_dir && +@@ -1377,9 +1398,7 @@ static int check_dirent_target(struct btree_trans *trans, + "directory %llu with multiple links", + target->bi_inum)) { + ret = __remove_dirent(trans, d.k->p); +- if (ret) +- goto err; +- return 0; ++ goto out; + } + + if (fsck_err_on(backpointer_exists && +@@ -1416,18 +1435,19 @@ static int check_dirent_target(struct btree_trans *trans, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), 
+- (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) +- return ret; ++ goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) +- return ret; ++ goto err; + + d = dirent_i_to_s_c(n); + } +@@ -1441,19 +1461,21 @@ static int check_dirent_target(struct btree_trans *trans, + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) +- return ret; ++ goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) +- return ret; ++ goto err; + + d = dirent_i_to_s_c(n); + } ++out: + err: + fsck_err: ++ printbuf_exit(&buf); + return ret; + } + +@@ -1467,46 +1489,53 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct inode_walker_entry *i; +- char buf[200]; +- int ret; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + + k = bch2_btree_iter_peek(iter); + if (!k.k) +- return 0; ++ goto out; + + ret = bkey_err(k); + if (ret) +- return ret; ++ goto err; + + ret = check_key_has_snapshot(trans, iter, k); +- if (ret) +- return ret < 0 ? ret : 0; ++ if (ret) { ++ ret = ret < 0 ? ret : 0; ++ goto out; ++ } + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) +- return ret; ++ goto err; + + if (k.k->type == KEY_TYPE_whiteout) +- return 0; ++ goto out; + + if (dir->cur_inum != k.k->p.inode) { + ret = check_subdir_count(trans, dir); + if (ret) +- return ret; ++ goto err; + } + + ret = __walk_inode(trans, dir, k.k->p); + if (ret < 0) +- return ret; ++ goto err; + + if (fsck_err_on(ret == INT_MAX, c, + "dirent in nonexisting directory:\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) +- return bch2_btree_delete_at(trans, iter, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } + +- if (ret == INT_MAX) +- return 0; ++ if (ret == INT_MAX) { ++ ret = 0; ++ goto out; ++ } + + i = dir->d + ret; + ret = 0; +@@ -1514,8 +1543,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, + "dirent in non directory inode type %s:\n%s", + bch2_d_type_str(inode_d_type(&i->inode)), +- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) +- return bch2_btree_delete_at(trans, iter, 0); ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, 0); ++ goto out; ++ } + + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); +@@ -1523,12 +1555,15 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + ret = hash_check_key(trans, bch2_dirent_hash_desc, + hash_info, iter, k); + if (ret < 0) +- return ret; +- if (ret) /* dirent has been deleted */ +- return 0; ++ goto err; ++ if (ret) { ++ /* dirent has been deleted */ ++ ret = 0; ++ goto out; ++ } + + if (k.k->type != KEY_TYPE_dirent) +- return 0; ++ goto out; + + d = bkey_s_c_to_dirent(k); + +@@ -1541,24 +1576,27 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + ret = __subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); + if 
(ret && ret != -ENOENT) +- return ret; ++ goto err; + + if (fsck_err_on(ret, c, + "dirent points to missing subvolume %llu", +- le64_to_cpu(d.v->d_child_subvol))) +- return __remove_dirent(trans, d.k->p); ++ le64_to_cpu(d.v->d_child_subvol))) { ++ ret = __remove_dirent(trans, d.k->p); ++ goto err; ++ } + + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && ret != -ENOENT) +- return ret; ++ goto err; + + if (fsck_err_on(ret, c, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); +- return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, +@@ -1568,32 +1606,33 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + subvol_root.bi_subvol = target_subvol; + ret = __write_inode(trans, &subvol_root, target_snapshot); + if (ret) +- return ret; ++ goto err; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) +- return ret; ++ goto err; + } else { + ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + if (ret) +- return ret; ++ goto err; + + if (fsck_err_on(!target->nr, c, + "dirent points to missing inode:\n%s", +- (bch2_bkey_val_to_text(&PBUF(buf), c, +- k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), ++ buf.buf))) { + ret = __remove_dirent(trans, d.k->p); + if (ret) +- return ret; ++ goto err; + } + + for (i = target->d; i < target->d + target->nr; i++) { + ret = check_dirent_target(trans, iter, d, + &i->inode, i->snapshot); + if (ret) +- return ret; ++ goto err; + } + } + +@@ -1601,7 +1640,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + i->count++; + ++out: ++err: + fsck_err: ++ printbuf_exit(&buf); + return ret; + } + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index fde10cea0706..cf97594b7c6f 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1978,11 +1978,11 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); + struct btree_iter iter; +- char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret; + +- bch2_bkey_val_to_text(&PBUF(buf), c, k); +- bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf); ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf.buf); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), +@@ -1990,12 +1990,14 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (ret) +- return; ++ goto out; + +- bch2_bkey_val_to_text(&PBUF(buf), c, k); +- bch_err(c, "%s", buf); ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch_err(c, "%s", buf.buf); + bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + bch2_trans_iter_exit(trans, &iter); ++out: ++ printbuf_exit(&buf); + } + + int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index d582af7eec6d..ffaf58956450 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -415,18 +415,18 @@ unlock: + !can_discard && + j->reservations.idx == j->reservations.unwritten_idx && + (flags & 
JOURNAL_RES_GET_RESERVED)) { +- char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); ++ struct printbuf buf = PRINTBUF; + + bch_err(c, "Journal stuck! Hava a pre-reservation but journal full"); +- if (journal_debug_buf) { +- bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); +- bch_err(c, "%s", journal_debug_buf); + +- bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j); +- bch_err(c, "Journal pins:\n%s", journal_debug_buf); +- kfree(journal_debug_buf); +- } ++ bch2_journal_debug_to_text(&buf, j); ++ bch_err(c, "%s", buf.buf); ++ ++ printbuf_reset(&buf); ++ bch2_journal_pins_to_text(&buf, j); ++ bch_err(c, "Journal pins:\n%s", buf.buf); + ++ printbuf_exit(&buf); + bch2_fatal_error(c); + dump_stack(); + } +@@ -1184,6 +1184,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + unsigned long now = jiffies; + unsigned i; + ++ out->atomic++; ++ + rcu_read_lock(); + s = READ_ONCE(j->reservations); + +@@ -1268,6 +1270,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + } + + rcu_read_unlock(); ++ ++ --out->atomic; + } + + void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +@@ -1284,6 +1288,8 @@ void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) + u64 i; + + spin_lock(&j->lock); ++ out->atomic++; ++ + fifo_for_each_entry_ptr(pin_list, &j->pin, i) { + pr_buf(out, "%llu: count %u\n", + i, atomic_read(&pin_list->count)); +@@ -1303,5 +1309,7 @@ void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) + pr_buf(out, "\t%px %ps\n", + pin, pin->flush); + } ++ ++ --out->atomic; + spin_unlock(&j->lock); + } +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 794719d46ebd..4380ebf5e252 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -252,14 +252,15 @@ static int journal_validate_key(struct bch_fs *c, const char *where, + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id)); + if (invalid) { +- char buf[160]; ++ struct printbuf buf = PRINTBUF; + +- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", + type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), +- invalid, buf); ++ invalid, buf.buf); ++ printbuf_exit(&buf); + + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); +@@ -996,6 +997,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + struct journal_replay *i, *t; + struct bch_dev *ca; + unsigned iter; ++ struct printbuf buf = PRINTBUF; + size_t keys = 0, entries = 0; + bool degraded = false; + u64 seq, last_seq = 0; +@@ -1054,7 +1056,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + + if (!last_seq) { + fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); +- return -1; ++ ret = -1; ++ goto err; + } + + /* Drop blacklisted entries and entries older than last_seq: */ +@@ -1086,7 +1089,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + + while (seq < le64_to_cpu(i->j.seq)) { + u64 missing_start, missing_end; +- char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) +@@ -1102,14 +1105,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + seq++; + + if (i->list.prev != list) { +- 
struct printbuf out = PBUF(buf1); + struct journal_replay *p = list_prev_entry(i, list); + +- bch2_journal_ptrs_to_text(&out, c, p); +- pr_buf(&out, " size %zu", vstruct_sectors(&p->j, c->block_bits)); ++ bch2_journal_ptrs_to_text(&buf1, c, p); ++ pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits)); + } else +- sprintf(buf1, "(none)"); +- bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); ++ pr_buf(&buf1, "(none)"); ++ bch2_journal_ptrs_to_text(&buf2, c, i); + + missing_end = seq - 1; + fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" +@@ -1117,7 +1119,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + " next at %s", + missing_start, missing_end, + last_seq, *blacklist_seq - 1, +- buf1, buf2); ++ buf1.buf, buf2.buf); ++ ++ printbuf_exit(&buf1); ++ printbuf_exit(&buf2); + } + + seq++; +@@ -1131,14 +1136,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + .e.nr_required = 1, + }; + unsigned ptr; +- char buf[80]; + + if (i->ignore) + continue; + + ret = jset_validate_entries(c, &i->j, READ); + if (ret) +- goto fsck_err; ++ goto err; + + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; +@@ -1150,15 +1154,17 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + * the devices - this is wrong: + */ + ++ printbuf_reset(&buf); ++ bch2_replicas_entry_to_text(&buf, &replicas.e); ++ + if (!degraded && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, + "superblock not marked as containing replicas %s", +- (bch2_replicas_entry_to_text(&PBUF(buf), +- &replicas.e), buf)))) { ++ buf.buf))) { + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) +- return ret; ++ goto err; + } + + for_each_jset_key(k, _n, entry, &i->j) +@@ -1172,7 +1178,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); ++err: + fsck_err: ++ printbuf_exit(&buf); + return ret; + } + +@@ -1484,7 +1492,7 @@ void bch2_journal_write(struct closure *cl) + struct jset_entry *start, *end; + struct jset *jset; + struct bio *bio; +- char *journal_debug_buf = NULL; ++ struct printbuf journal_debug_buf = PRINTBUF; + bool validate_before_checksum = false; + unsigned i, sectors, bytes, u64s, nr_rw_members = 0; + int ret; +@@ -1589,11 +1597,8 @@ retry_alloc: + goto retry_alloc; + } + +- if (ret) { +- journal_debug_buf = kmalloc(4096, GFP_ATOMIC); +- if (journal_debug_buf) +- __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); +- } ++ if (ret) ++ __bch2_journal_debug_to_text(&journal_debug_buf, j); + + /* + * write is allocated, no longer need to account for it in +@@ -1610,8 +1615,8 @@ retry_alloc: + + if (ret) { + bch_err(c, "Unable to allocate journal write:\n%s", +- journal_debug_buf); +- kfree(journal_debug_buf); ++ journal_debug_buf.buf); ++ printbuf_exit(&journal_debug_buf); + bch2_fatal_error(c); + continue_at(cl, journal_write_done, c->io_complete_wq); + return; +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 90fa2be54e20..ef1cb6acfb3e 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -216,14 +216,11 @@ void bch2_journal_space_available(struct journal *j) + if (!clean_ondisk && + j->reservations.idx == + j->reservations.unwritten_idx) { +- char *buf = kmalloc(4096, GFP_ATOMIC); ++ struct printbuf buf = PRINTBUF; + +- bch_err(c, 
"journal stuck"); +- if (buf) { +- __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); +- pr_err("\n%s", buf); +- kfree(buf); +- } ++ __bch2_journal_debug_to_text(&buf, j); ++ bch_err(c, "journal stuck\n%s", buf.buf); ++ printbuf_exit(&buf); + + bch2_fatal_error(c); + ret = cur_entry_journal_stuck; +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index a573fede05b1..d914892f5339 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -257,35 +257,47 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) + { + struct bch_fs_rebalance *r = &c->rebalance; + struct rebalance_work w = rebalance_work(c); +- char h1[21], h2[21]; + +- bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); +- bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); +- pr_buf(out, "fullest_dev (%i):\t%s/%s\n", +- w.dev_most_full_idx, h1, h2); ++ out->tabstops[0] = 20; + +- bch2_hprint(&PBUF(h1), w.total_work << 9); +- bch2_hprint(&PBUF(h2), c->capacity << 9); +- pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); ++ pr_buf(out, "fullest_dev (%i):", w.dev_most_full_idx); ++ pr_tab(out); + +- pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); ++ bch2_hprint(out, w.dev_most_full_work << 9); ++ pr_buf(out, "/"); ++ bch2_hprint(out, w.dev_most_full_capacity << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "total work:"); ++ pr_tab(out); ++ ++ bch2_hprint(out, w.total_work << 9); ++ pr_buf(out, "/"); ++ bch2_hprint(out, c->capacity << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "rate:"); ++ pr_tab(out); ++ pr_buf(out, "%u", r->pd.rate.rate); ++ pr_newline(out); + + switch (r->state) { + case REBALANCE_WAITING: +- pr_buf(out, "waiting\n"); ++ pr_buf(out, "waiting"); + break; + case REBALANCE_THROTTLED: +- bch2_hprint(&PBUF(h1), ++ pr_buf(out, "throttled for %lu sec or ", ++ (r->throttled_until_cputime - jiffies) / HZ); ++ bch2_hprint(out, + (r->throttled_until_iotime - + atomic64_read(&c->io_clock[WRITE].now)) << 9); +- pr_buf(out, "throttled for %lu sec or %s io\n", +- (r->throttled_until_cputime - jiffies) / HZ, +- h1); ++ pr_buf(out, " io"); + break; + case REBALANCE_RUNNING: +- pr_buf(out, "running\n"); ++ pr_buf(out, "running"); + break; + } ++ pr_newline(out); + } + + void bch2_rebalance_stop(struct bch_fs *c) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index ae9ae1c7138c..6c4ffc5abdc5 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -760,6 +760,8 @@ static int verify_superblock_clean(struct bch_fs *c, + { + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, +@@ -772,7 +774,6 @@ static int verify_superblock_clean(struct bch_fs *c, + } + + for (i = 0; i < BTREE_ID_NR; i++) { +- char buf1[200], buf2[200]; + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + +@@ -782,6 +783,19 @@ static int verify_superblock_clean(struct bch_fs *c, + if (!k1 && !k2) + continue; + ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ if (k1) ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); ++ else ++ pr_buf(&buf1, "(none)"); ++ ++ if (k2) ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); ++ else ++ pr_buf(&buf2, "(none)"); ++ + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || +@@ -791,10 +805,12 @@ static int verify_superblock_clean(struct bch_fs *c, + "superblock btree root %u doesn't match journal after clean shutdown\n" + "sb: l=%u %s\n" + "journal: l=%u %s\n", i, +- l1, 
(bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), +- l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); ++ l1, buf1.buf, ++ l2, buf2.buf); + } + fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; + } + +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 6c1d42f1c92c..c2771112d573 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -990,11 +990,12 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + + if (dflags & ~flags) { + if (print) { +- char buf[100]; ++ struct printbuf buf = PRINTBUF; + +- bch2_replicas_entry_to_text(&PBUF(buf), e); ++ bch2_replicas_entry_to_text(&buf, e); + bch_err(c, "insufficient devices online (%u) for replicas entry %s", +- nr_online, buf); ++ nr_online, buf.buf); ++ printbuf_exit(&buf); + } + ret = false; + break; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 21109881e9f6..08966f4004fb 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -567,16 +567,10 @@ int bch2_read_super(const char *path, struct bch_opts *opts, + { + u64 offset = opt_get(*opts, sb); + struct bch_sb_layout layout; +- char *_err; +- struct printbuf err; ++ struct printbuf err = PRINTBUF; + __le64 *i; + int ret; + +- _err = kmalloc(4096, GFP_KERNEL); +- if (!_err) +- return -ENOMEM; +- err = _PBUF(_err, 4096); +- + pr_verbose_init(*opts, ""); + + memset(sb, 0, sizeof(*sb)); +@@ -625,8 +619,8 @@ int bch2_read_super(const char *path, struct bch_opts *opts, + goto err; + + printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", +- path, _err); +- err = _PBUF(_err, 4096); ++ path, err.buf); ++ printbuf_reset(&err); + + /* + * Error reading primary superblock - read location of backup +@@ -683,16 +677,16 @@ got_super: + ret = bch2_sb_validate(sb, &err); + if (ret) { + printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", +- path, _err); ++ path, err.buf); + goto err_no_print; + } + out: + pr_verbose_init(*opts, "ret %i", ret); +- kfree(_err); ++ printbuf_exit(&err); + return ret; + err: + printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", +- path, _err); ++ path, err.buf); + err_no_print: + bch2_free_super(sb); + goto out; +@@ -766,6 +760,7 @@ int bch2_write_super(struct bch_fs *c) + { + struct closure *cl = &c->sb_write; + struct bch_dev *ca; ++ struct printbuf err = PRINTBUF; + unsigned i, sb = 0, nr_wrote; + struct bch_devs_mask sb_written; + bool wrote, can_mount_without_written, can_mount_with_written; +@@ -793,18 +788,11 @@ int bch2_write_super(struct bch_fs *c) + bch2_sb_from_fs(c, ca); + + for_each_online_member(ca, c, i) { +- struct printbuf buf = { NULL, NULL }; ++ printbuf_reset(&err); + +- ret = bch2_sb_validate(&ca->disk_sb, &buf); ++ ret = bch2_sb_validate(&ca->disk_sb, &err); + if (ret) { +- char *_buf = kmalloc(4096, GFP_NOFS); +- if (_buf) { +- buf = _PBUF(_buf, 4096); +- bch2_sb_validate(&ca->disk_sb, &buf); +- } +- +- bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf); +- kfree(_buf); ++ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); + percpu_ref_put(&ca->io_ref); + goto out; + } +@@ -895,6 +883,7 @@ int bch2_write_super(struct bch_fs *c) + out: + /* Make new options visible after they're persistent: */ + bch2_sb_update(c); ++ printbuf_exit(&err); + return ret; + } + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 88737d846172..9af8eb35b177 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -865,12 +865,9 @@ noinline_for_stack + 
static void print_mount_opts(struct bch_fs *c) + { + enum bch_opt_id i; +- char buf[512]; +- struct printbuf p = PBUF(buf); ++ struct printbuf p = PRINTBUF; + bool first = true; + +- strcpy(buf, "(null)"); +- + if (c->opts.read_only) { + pr_buf(&p, "ro"); + first = false; +@@ -892,7 +889,11 @@ static void print_mount_opts(struct bch_fs *c) + bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); + } + +- bch_info(c, "mounted with opts: %s", buf); ++ if (!p.pos) ++ pr_buf(&p, "(null)"); ++ ++ bch_info(c, "mounted with opts: %s", p.buf); ++ printbuf_exit(&p); + } + + int bch2_fs_start(struct bch_fs *c) +@@ -1558,11 +1559,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + + data = bch2_dev_has_data(c, ca); + if (data) { +- char data_has_str[100]; ++ struct printbuf data_has = PRINTBUF; + +- bch2_flags_to_text(&PBUF(data_has_str), +- bch2_data_types, data); +- bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ++ bch2_flags_to_text(&data_has, bch2_data_types, data); ++ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); ++ printbuf_exit(&data_has); + ret = -EBUSY; + goto err; + } +@@ -1611,16 +1612,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + struct bch_sb_field_members *mi; + struct bch_member dev_mi; + unsigned dev_idx, nr_devices, u64s; +- char *_errbuf; +- struct printbuf errbuf; ++ struct printbuf errbuf = PRINTBUF; + int ret; + +- _errbuf = kmalloc(4096, GFP_KERNEL); +- if (!_errbuf) +- return -ENOMEM; +- +- errbuf = _PBUF(_errbuf, 4096); +- + ret = bch2_read_super(path, &opts, &sb); + if (ret) { + bch_err(c, "device add error: error reading super: %i", ret); +@@ -1738,7 +1732,7 @@ err: + if (ca) + bch2_dev_free(ca); + bch2_free_super(&sb); +- kfree(_errbuf); ++ printbuf_exit(&errbuf); + return ret; + err_late: + up_write(&c->state_lock); +@@ -1903,8 +1897,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + struct bch_sb_field_members *mi; + unsigned i, best_sb = 0; + const char *err; +- char *_errbuf = NULL; +- struct printbuf errbuf; ++ struct printbuf errbuf = PRINTBUF; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) +@@ -1917,14 +1910,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + goto err; + } + +- _errbuf = kmalloc(4096, GFP_KERNEL); +- if (!_errbuf) { +- ret = -ENOMEM; +- goto err; +- } +- +- errbuf = _PBUF(_errbuf, 4096); +- + sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); + if (!sb) { + ret = -ENOMEM; +@@ -1990,7 +1975,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + } + out: + kfree(sb); +- kfree(_errbuf); ++ printbuf_exit(&errbuf); + module_put(THIS_MODULE); + pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); + return c; +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 1a3068f658a1..ce32b9068518 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -46,8 +46,28 @@ struct sysfs_ops type ## _sysfs_ops = { \ + } + + #define SHOW(fn) \ ++static ssize_t fn ## _to_text(struct printbuf *, \ ++ struct kobject *, struct attribute *);\ ++ \ + static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ ++{ \ ++ struct printbuf out = PRINTBUF; \ ++ ssize_t ret = fn ## _to_text(&out, kobj, attr); \ ++ \ ++ if (!ret && out.allocation_failure) \ ++ ret = -ENOMEM; \ ++ \ ++ if (!ret) { \ ++ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ ++ memcpy(buf, out.buf, ret); \ ++ } \ ++ printbuf_exit(&out); \ ++ return ret; \ ++} \ ++ \ ++static ssize_t fn ## _to_text(struct 
printbuf *out, struct kobject *kobj,\ ++ struct attribute *attr) + + #define STORE(fn) \ + static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ +@@ -64,22 +84,19 @@ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ + #define sysfs_printf(file, fmt, ...) \ + do { \ + if (attr == &sysfs_ ## file) \ +- return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ ++ pr_buf(out, fmt "\n", __VA_ARGS__); \ + } while (0) + + #define sysfs_print(file, var) \ + do { \ + if (attr == &sysfs_ ## file) \ +- return snprint(buf, PAGE_SIZE, var); \ ++ snprint(out, var); \ + } while (0) + + #define sysfs_hprint(file, val) \ + do { \ +- if (attr == &sysfs_ ## file) { \ +- bch2_hprint(&out, val); \ +- pr_buf(&out, "\n"); \ +- return out.pos - buf; \ +- } \ ++ if (attr == &sysfs_ ## file) \ ++ bch2_hprint(out, val); \ + } while (0) + + #define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) +@@ -348,7 +365,6 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) + SHOW(bch2_fs) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + + sysfs_print(minor, c->minor); + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); +@@ -365,10 +381,8 @@ SHOW(bch2_fs) + + sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + +- if (attr == &sysfs_gc_gens_pos) { +- bch2_gc_gens_pos_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_gc_gens_pos) ++ bch2_gc_gens_pos_to_text(out, c); + + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + +@@ -378,83 +392,54 @@ SHOW(bch2_fs) + max(0LL, c->copygc_wait - + atomic64_read(&c->io_clock[WRITE].now)) << 9); + +- if (attr == &sysfs_rebalance_work) { +- bch2_rebalance_work_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_rebalance_work) ++ bch2_rebalance_work_to_text(out, c); + + sysfs_print(promote_whole_extents, c->promote_whole_extents); + + /* Debugging: */ + +- if (attr == &sysfs_journal_debug) { +- bch2_journal_debug_to_text(&out, &c->journal); +- return out.pos - buf; +- } ++ if (attr == &sysfs_journal_debug) ++ bch2_journal_debug_to_text(out, &c->journal); + +- if (attr == &sysfs_journal_pins) { +- bch2_journal_pins_to_text(&out, &c->journal); +- return out.pos - buf; +- } ++ if (attr == &sysfs_journal_pins) ++ bch2_journal_pins_to_text(out, &c->journal); + +- if (attr == &sysfs_btree_updates) { +- bch2_btree_updates_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_btree_updates) ++ bch2_btree_updates_to_text(out, c); + +- if (attr == &sysfs_dirty_btree_nodes) { +- bch2_dirty_btree_nodes_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_dirty_btree_nodes) ++ bch2_dirty_btree_nodes_to_text(out, c); + +- if (attr == &sysfs_btree_cache) { +- bch2_btree_cache_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_btree_cache) ++ bch2_btree_cache_to_text(out, c); + +- if (attr == &sysfs_btree_key_cache) { +- bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); +- return out.pos - buf; +- } ++ if (attr == &sysfs_btree_key_cache) ++ bch2_btree_key_cache_to_text(out, &c->btree_key_cache); + +- if (attr == &sysfs_btree_transactions) { +- bch2_btree_trans_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_btree_transactions) ++ bch2_btree_trans_to_text(out, c); + +- if (attr == &sysfs_stripes_heap) { +- bch2_stripes_heap_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_stripes_heap) ++ 
bch2_stripes_heap_to_text(out, c); + +- if (attr == &sysfs_open_buckets) { +- bch2_open_buckets_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_open_buckets) ++ bch2_open_buckets_to_text(out, c); + +- if (attr == &sysfs_compression_stats) { +- bch2_compression_stats_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_compression_stats) ++ bch2_compression_stats_to_text(out, c); + +- if (attr == &sysfs_new_stripes) { +- bch2_new_stripes_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_new_stripes) ++ bch2_new_stripes_to_text(out, c); + +- if (attr == &sysfs_io_timers_read) { +- bch2_io_timers_to_text(&out, &c->io_clock[READ]); +- return out.pos - buf; +- } +- if (attr == &sysfs_io_timers_write) { +- bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); +- return out.pos - buf; +- } ++ if (attr == &sysfs_io_timers_read) ++ bch2_io_timers_to_text(out, &c->io_clock[READ]); + +- if (attr == &sysfs_data_jobs) { +- data_progress_to_text(&out, c); +- return out.pos - buf; +- } ++ if (attr == &sysfs_io_timers_write) ++ bch2_io_timers_to_text(out, &c->io_clock[WRITE]); ++ ++ if (attr == &sysfs_data_jobs) ++ data_progress_to_text(out, c); + + return 0; + } +@@ -567,7 +552,7 @@ struct attribute *bch2_fs_files[] = { + SHOW(bch2_fs_internal) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); +- return bch2_fs_show(&c->kobj, attr, buf); ++ return bch2_fs_to_text(out, &c->kobj, attr); + } + + STORE(bch2_fs_internal) +@@ -617,16 +602,15 @@ struct attribute *bch2_fs_internal_files[] = { + + SHOW(bch2_fs_opts_dir) + { +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); + int id = opt - bch2_opt_table; + u64 v = bch2_opt_get_by_id(&c->opts, id); + +- bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); +- pr_buf(&out, "\n"); ++ bch2_opt_to_text(out, c, opt, v, OPT_SHOW_FULL_LIST); ++ pr_char(out, '\n'); + +- return out.pos - buf; ++ return 0; + } + + STORE(bch2_fs_opts_dir) +@@ -690,13 +674,10 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) + SHOW(bch2_fs_time_stats) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + + #define x(name) \ +- if (attr == &sysfs_time_stat_##name) { \ +- bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ +- return out.pos - buf; \ +- } ++ if (attr == &sysfs_time_stat_##name) \ ++ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); + BCH_TIME_STATS() + #undef x + +@@ -812,7 +793,6 @@ SHOW(bch2_dev) + { + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; +- struct printbuf out = _PBUF(buf, PAGE_SIZE); + + sysfs_printf(uuid, "%pU\n", ca->uuid.b); + +@@ -825,58 +805,47 @@ SHOW(bch2_dev) + if (attr == &sysfs_label) { + if (ca->mi.group) { + mutex_lock(&c->sb_lock); +- bch2_disk_path_to_text(&out, c->disk_sb.sb, ++ bch2_disk_path_to_text(out, c->disk_sb.sb, + ca->mi.group - 1); + mutex_unlock(&c->sb_lock); + } + +- pr_buf(&out, "\n"); +- return out.pos - buf; ++ pr_char(out, '\n'); + } + + if (attr == &sysfs_has_data) { +- bch2_flags_to_text(&out, bch2_data_types, ++ bch2_flags_to_text(out, bch2_data_types, + bch2_dev_has_data(c, ca)); +- pr_buf(&out, "\n"); +- return out.pos - buf; ++ pr_char(out, '\n'); + } + + if (attr == &sysfs_state_rw) { +- bch2_string_opt_to_text(&out, bch2_member_states, ++ bch2_string_opt_to_text(out, 
bch2_member_states, + ca->mi.state); +- pr_buf(&out, "\n"); +- return out.pos - buf; ++ pr_char(out, '\n'); + } + +- if (attr == &sysfs_iodone) { +- dev_iodone_to_text(&out, ca); +- return out.pos - buf; +- } ++ if (attr == &sysfs_iodone) ++ dev_iodone_to_text(out, ca); + + sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); + sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); + +- if (attr == &sysfs_io_latency_stats_read) { +- bch2_time_stats_to_text(&out, &ca->io_latency[READ]); +- return out.pos - buf; +- } +- if (attr == &sysfs_io_latency_stats_write) { +- bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); +- return out.pos - buf; +- } ++ if (attr == &sysfs_io_latency_stats_read) ++ bch2_time_stats_to_text(out, &ca->io_latency[READ]); ++ ++ if (attr == &sysfs_io_latency_stats_write) ++ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + + sysfs_printf(congested, "%u%%", + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + +- if (attr == &sysfs_reserve_stats) { +- reserve_stats_to_text(&out, ca); +- return out.pos - buf; +- } +- if (attr == &sysfs_alloc_debug) { +- dev_alloc_debug_to_text(&out, ca); +- return out.pos - buf; +- } ++ if (attr == &sysfs_reserve_stats) ++ reserve_stats_to_text(out, ca); ++ ++ if (attr == &sysfs_alloc_debug) ++ dev_alloc_debug_to_text(out, ca); + + return 0; + } +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index de84ce834975..3addf400e177 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -871,7 +871,9 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + u64 nr, unsigned nr_threads) + { + struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; +- char name_buf[20], nr_buf[20], per_sec_buf[20]; ++ char name_buf[20]; ++ struct printbuf nr_buf = PRINTBUF; ++ struct printbuf per_sec_buf = PRINTBUF; + unsigned i; + u64 time; + +@@ -932,13 +934,15 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + time = j.finish - j.start; + + scnprintf(name_buf, sizeof(name_buf), "%s:", testname); +- bch2_hprint(&PBUF(nr_buf), nr); +- bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time)); ++ bch2_hprint(&nr_buf, nr); ++ bch2_hprint(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); + printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", +- name_buf, nr_buf, nr_threads, ++ name_buf, nr_buf.buf, nr_threads, + div_u64(time, NSEC_PER_SEC), + div_u64(time * nr_threads, nr), +- per_sec_buf); ++ per_sec_buf.buf); ++ printbuf_exit(&per_sec_buf); ++ printbuf_exit(&nr_buf); + return j.ret; + } + +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index e1b55fe844d6..766d08aede71 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -99,6 +99,38 @@ STRTO_H(strtoll, long long) + STRTO_H(strtoull, unsigned long long) + STRTO_H(strtou64, u64) + ++static int bch2_printbuf_realloc(struct printbuf *out, unsigned extra) ++{ ++ unsigned new_size = roundup_pow_of_two(out->size + extra); ++ char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC); ++ ++ if (!buf) { ++ out->allocation_failure = true; ++ return -ENOMEM; ++ } ++ ++ out->buf = buf; ++ out->size = new_size; ++ return 0; ++} ++ ++void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) 
++{ ++ va_list args; ++ int len; ++ ++ do { ++ va_start(args, fmt); ++ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); ++ va_end(args); ++ } while (len + 1 >= printbuf_remaining(out) && ++ !bch2_printbuf_realloc(out, len + 1)); ++ ++ len = min_t(size_t, len, ++ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); ++ out->pos += len; ++} ++ + void bch2_hprint(struct printbuf *buf, s64 v) + { + int u, t = 0; +@@ -151,9 +183,6 @@ void bch2_flags_to_text(struct printbuf *out, + unsigned bit, nr = 0; + bool first = true; + +- if (out->pos != out->end) +- *out->pos = '\0'; +- + while (list[nr]) + nr++; + +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 895dc3aa1968..4095df2fcded 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -242,19 +242,39 @@ enum printbuf_units { + }; + + struct printbuf { +- char *pos; +- char *end; +- char *last_newline; +- char *last_field; ++ char *buf; ++ unsigned size; ++ unsigned pos; ++ unsigned last_newline; ++ unsigned last_field; + unsigned indent; +- enum printbuf_units units; +- unsigned tabstop; +- unsigned tabstops[4]; ++ enum printbuf_units units:8; ++ u8 atomic; ++ bool allocation_failure:1; ++ u8 tabstop; ++ u8 tabstops[4]; + }; + ++#define PRINTBUF ((struct printbuf) { NULL }) ++ ++static inline void printbuf_exit(struct printbuf *buf) ++{ ++ kfree(buf->buf); ++ buf->buf = ERR_PTR(-EINTR); /* poison value */ ++} ++ ++static inline void printbuf_reset(struct printbuf *buf) ++{ ++ buf->pos = 0; ++ buf->last_newline = 0; ++ buf->last_field = 0; ++ buf->indent = 0; ++ buf->tabstop = 0; ++} ++ + static inline size_t printbuf_remaining(struct printbuf *buf) + { +- return buf->end - buf->pos; ++ return buf->size - buf->pos; + } + + static inline size_t printbuf_linelen(struct printbuf *buf) +@@ -262,29 +282,13 @@ static inline size_t printbuf_linelen(struct printbuf *buf) + return buf->pos - buf->last_newline; + } + +-#define _PBUF(_buf, _len) \ +- ((struct printbuf) { \ +- .pos = _buf, \ +- .end = _buf + _len, \ +- .last_newline = _buf, \ +- .last_field = _buf, \ +- }) ++void bch2_pr_buf(struct printbuf *out, const char *fmt, ...); + +-#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) +- +- +-#define pr_buf(_out, ...) \ +-do { \ +- (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ +- __VA_ARGS__); \ +-} while (0) ++#define pr_buf(_out, ...) bch2_pr_buf(_out, __VA_ARGS__) + + static inline void pr_char(struct printbuf *out, char c) + { +- if (printbuf_remaining(out) > 1) { +- *out->pos = c; +- out->pos++; +- } ++ bch2_pr_buf(out, "%c", c); + } + + static inline void pr_indent_push(struct printbuf *buf, unsigned spaces) +@@ -341,12 +345,12 @@ static inline void pr_tab_rjust(struct printbuf *buf) + BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); + + if (shift > 0) { +- memmove(buf->last_field + shift, +- buf->last_field, ++ memmove(buf->buf + buf->last_field + shift, ++ buf->buf + buf->last_field, + move); +- memset(buf->last_field, ' ', shift); ++ memset(buf->buf + buf->last_field, ' ', shift); + buf->pos += shift; +- *buf->pos = 0; ++ buf->buf[buf->pos] = 0; + } + + buf->last_field = buf->pos; +@@ -460,8 +464,8 @@ static inline int bch2_strtoul_h(const char *cp, long *res) + _r; \ + }) + +-#define snprint(buf, size, var) \ +- snprintf(buf, size, \ ++#define snprint(out, var) \ ++ pr_buf(out, \ + type_is(var, int) ? "%i\n" \ + : type_is(var, unsigned) ? "%u\n" \ + : type_is(var, long) ? 
"%li\n" \ +@@ -601,10 +605,8 @@ do { \ + sysfs_print(name##_rate_d_term, (var)->d_term); \ + sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ + \ +- if (attr == &sysfs_##name##_rate_debug) { \ +- bch2_pd_controller_debug_to_text(&out, var); \ +- return out.pos - buf; \ +- } \ ++ if (attr == &sysfs_##name##_rate_debug) \ ++ bch2_pd_controller_debug_to_text(out, var); \ + } while (0) + + #define sysfs_pd_controller_store(name, var) \ +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 1673654fff3e..48e625ab15ff 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -426,9 +426,8 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); + const struct bch_option *opt; + int id, inode_opt_id; +- char buf[512]; +- struct printbuf out = PBUF(buf); +- unsigned val_len; ++ struct printbuf out = PRINTBUF; ++ int ret; + u64 v; + + id = bch2_opt_lookup(name); +@@ -451,14 +450,19 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + v = bch2_opt_get_by_id(&opts, id); + bch2_opt_to_text(&out, c, opt, v, 0); + +- val_len = out.pos - buf; ++ ret = out.pos; + +- if (buffer && val_len > size) +- return -ERANGE; ++ if (out.allocation_failure) { ++ ret = -ENOMEM; ++ } else if (buffer) { ++ if (out.pos > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, out.buf, out.pos); ++ } + +- if (buffer) +- memcpy(buffer, buf, val_len); +- return val_len; ++ printbuf_exit(&out); ++ return ret; + } + + static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, +-- +cgit v1.2.3 + + +From 314a44d540f6cde8d3b719302247c8e394885bc9 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 25 Feb 2022 22:14:35 -0500 +Subject: bcachefs: Fix journal_flush_done() + +journal_flush_done() was overwriting did_work, thus occasionally +returning false when it did do work and occasional assertions in the +shutdown sequence because we didn't completely flush the key cache. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index ef1cb6acfb3e..de37e2d99fe6 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -762,7 +762,8 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, + + mutex_lock(&j->reclaim_lock); + +- *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0; ++ if (journal_flush_pins(j, seq_to_flush, 0, 0)) ++ *did_work = true; + + spin_lock(&j->lock); + /* +-- +cgit v1.2.3 + + +From ec5869a687e1f14e47725342e50c170833dab422 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 25 Feb 2022 22:33:01 -0500 +Subject: bcachefs: Fix btree path sorting + +In btree_update_interior.c, we were changing a path's level directly - +which affects path sort order - without re-sorting paths, leading to +assertions when bch2_path_get() verified paths were sorted correctly. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 15 ++++++++------- + fs/bcachefs/btree_iter.h | 2 ++ + fs/bcachefs/btree_update_interior.c | 2 ++ + 3 files changed, 12 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 7244528ab6a6..8ff6a8d03dc4 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -20,7 +20,7 @@ + #include + + static void btree_trans_verify_sorted(struct btree_trans *); +-static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int); ++inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); + + static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); + static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, +@@ -1648,7 +1648,7 @@ static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, + six_lock_increment(&dst->l[i].b->c.lock, + __btree_lock_want(dst, i)); + +- btree_path_check_sort(trans, dst, 0); ++ bch2_btree_path_check_sort(trans, dst, 0); + } + + static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, +@@ -1698,7 +1698,7 @@ bch2_btree_path_set_pos(struct btree_trans *trans, + path->pos = new_pos; + path->should_be_locked = false; + +- btree_path_check_sort(trans, path, cmp); ++ bch2_btree_path_check_sort(trans, path, cmp); + + if (unlikely(path->cached)) { + btree_node_unlock(path, 0); +@@ -1815,11 +1815,12 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + + bch2_bpos_to_text(&buf1, path->pos); + +- printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", ++ printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", + path->idx, path->ref, path->intent_ref, + path->should_be_locked ? " S" : "", + path->preserve ? 
" P" : "", + bch2_btree_ids[path->btree_id], ++ path->level, + buf1.buf, + path->nodes_locked, + #ifdef CONFIG_BCACHEFS_DEBUG +@@ -2502,7 +2503,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + k = btree_path_level_prev(trans->c, iter->path, + &iter->path->l[0], &iter->k); + +- btree_path_check_sort(trans, iter->path, 0); ++ bch2_btree_path_check_sort(trans, iter->path, 0); + + if (likely(k.k)) { + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { +@@ -2768,8 +2769,8 @@ static inline void btree_path_swap(struct btree_trans *trans, + btree_path_verify_sorted_ref(trans, r); + } + +-static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, +- int cmp) ++inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, ++ int cmp) + { + struct btree_path *n; + +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 759c7b52f4a2..d612aec91587 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -70,6 +70,8 @@ __trans_next_path(struct btree_trans *trans, unsigned idx) + return &trans->paths[idx]; + } + ++void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); ++ + #define trans_for_each_path(_trans, _path) \ + for (_path = __trans_next_path((_trans), 0); \ + (_path); \ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 31ec5076fa12..ba76a86ac10d 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1911,6 +1911,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; + iter2.path->level++; + ++ bch2_btree_path_check_sort(trans, iter2.path, 0); ++ + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); + if (ret) +-- +cgit v1.2.3 + + +From a88c13f70e7358eae8a56d333529350ef37afd0c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 25 Feb 2022 22:45:58 -0500 +Subject: bcachefs: Don't spin in journal reclaim + +If we're not able to flush anything, we shouldn't keep looping. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index de37e2d99fe6..3dca50f76ac4 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -664,7 +664,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) + + if (nr_flushed) + wake_up(&j->reclaim_wait); +- } while ((min_nr || min_key_cache) && !direct); ++ } while ((min_nr || min_key_cache) && nr_flushed && !direct); + + memalloc_noreclaim_restore(flags); + +-- +cgit v1.2.3 + + +From 3913b5700fd56707b16d810e6c1b4a4b00bc038e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 26 Feb 2022 19:48:37 -0500 +Subject: bcachefs: Fix an assert in the initial GC path + +With lockdep enabled, we (rarely) hit an assertion in +bch2_gc_init_recurse() -> bch2_gc_mark_key() -> bch2_trans_unlock(): +this is because initial GC doesn't use btree iterators for btree +walking, it does its own thing for various (complicated) reasons - but +we don't have to worry about deadlocks because we're only taking read +locks. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 3 +++ + fs/bcachefs/btree_iter.c | 7 ++++++- + fs/bcachefs/btree_types.h | 1 + + 3 files changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 88b234f58ef5..cd9016541d9c 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1059,6 +1059,9 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) + + bch2_trans_init(&trans, c, 0, 0); + ++ if (initial) ++ trans.is_initial_gc = true; ++ + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8ff6a8d03dc4..c0357ee9cfb7 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -558,7 +558,12 @@ void bch2_trans_unlock(struct btree_trans *trans) + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(path); + +- BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ /* ++ * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking ++ * btree nodes, it implements its own walking: ++ */ ++ BUG_ON(!trans->is_initial_gc && ++ lock_class_is_held(&bch2_btree_node_lock_key)); + } + + /* Btree iterator: */ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index d87d39dedb61..cf9ebe7411cc 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -396,6 +396,7 @@ struct btree_trans { + bool restarted:1; + bool memory_allocation_failure:1; + bool journal_transaction_names:1; ++ bool is_initial_gc:1; + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: +-- +cgit v1.2.3 + + +From e56d3640c5193c68fdb055750b9f97c31a754868 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 26 Feb 2022 20:25:15 -0500 +Subject: bcachefs: Kill BCH_FS_HOLD_BTREE_WRITES + +This was just dead code. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 1 - + fs/bcachefs/btree_cache.c | 3 +-- + fs/bcachefs/btree_io.c | 3 --- + fs/bcachefs/btree_update_interior.c | 3 +-- + 4 files changed, 2 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 45a43f716c44..fb2d4d9b06e2 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -534,7 +534,6 @@ enum { + BCH_FS_NEED_ANOTHER_GC, + BCH_FS_DELETED_NODES, + BCH_FS_REBUILD_REPLICAS, +- BCH_FS_HOLD_BTREE_WRITES, + }; + + struct btree_debug { +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 00d4b18292ae..6d4617af795a 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -224,8 +224,7 @@ wait_on_io: + goto out_unlock; + + if (btree_node_dirty(b)) { +- if (!flush || +- test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ if (!flush) + goto out_unlock; + /* + * Using the underscore version because we don't want to compact +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 2b16b656c9be..cf86ca632085 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1776,9 +1776,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta + if (already_started) + goto do_write; + +- if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) +- return; +- + /* + * We may only have a read lock on the btree node - the dirty bit is our + * "lock" against racing with other threads that may be trying to start +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index ba76a86ac10d..bb5edecb432a 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1096,8 +1096,7 @@ static void bch2_btree_set_root(struct btree_update *as, + struct btree *old; + + trace_btree_set_root(c, b); +- BUG_ON(!b->written && +- !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); ++ BUG_ON(!b->written); + + old = btree_node_root(c, b); + +-- +cgit v1.2.3 + + +From 6347c8fc79609de05c3ac6a35214f2a1451b2308 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 26 Feb 2022 11:10:20 -0500 +Subject: bcachefs: Use x-macros for btree node flags + +This is for adding an array of strings for btree node flag names. 
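+
+For readers new to the pattern, here is a generic sketch of an x-macro
+(hypothetical flag names, not the actual BTREE_FLAGS() list): one list is
+expanded twice, once into an enum and once into a parallel array of strings,
+so the enum and the name table cannot drift apart.
+
+    #include <stdio.h>
+
+    #define FLAGS()        \
+            x(dirty)       \
+            x(accessed)    \
+            x(need_write)
+
+    enum flag {
+    #define x(f) FLAG_##f,
+            FLAGS()
+    #undef x
+            FLAG_NR
+    };
+
+    static const char * const flag_names[] = {
+    #define x(f) #f,
+            FLAGS()
+    #undef x
+            NULL
+    };
+
+    int main(void)
+    {
+            for (int i = 0; i < FLAG_NR; i++)
+                    printf("%d: %s\n", i, flag_names[i]);
+            return 0;
+    }
+
+The diff below applies the same trick to the btree node flags so that the
+bch2_btree_node_flags[] name array is generated from the same list as the
+enum and its accessor helpers.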
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 9 +++++- + fs/bcachefs/btree_cache.h | 2 ++ + fs/bcachefs/btree_io.h | 9 ++---- + fs/bcachefs/btree_types.h | 56 +++++++++++++++++-------------------- + fs/bcachefs/btree_update_interior.c | 6 ++-- + fs/bcachefs/btree_update_leaf.c | 2 +- + 6 files changed, 41 insertions(+), 43 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 6d4617af795a..1aed196edd60 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -15,6 +15,13 @@ + + struct lock_class_key bch2_btree_node_lock_key; + ++const char * const bch2_btree_node_flags[] = { ++#define x(f) #f, ++ BTREE_FLAGS() ++#undef x ++ NULL ++}; ++ + void bch2_recalc_btree_reserve(struct bch_fs *c) + { + unsigned i, reserve = 16; +@@ -414,7 +421,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) + + if (btree_node_dirty(b)) + bch2_btree_complete_write(c, b, btree_current_write(b)); +- clear_btree_node_dirty(c, b); ++ clear_btree_node_dirty_acct(c, b); + + btree_node_data_free(c, b); + } +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index f7e10986f317..2901f0dc925b 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -7,6 +7,8 @@ + + extern struct lock_class_key bch2_btree_node_lock_key; + ++extern const char * const bch2_btree_node_flags[]; ++ + struct btree_iter; + + void bch2_recalc_btree_reserve(struct bch_fs *); +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 095ad505338d..a1dea8e85e4d 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -15,18 +15,13 @@ struct btree; + struct btree_iter; + struct btree_node_read_all; + +-static inline bool btree_node_dirty(struct btree *b) +-{ +- return test_bit(BTREE_NODE_dirty, &b->flags); +-} +- +-static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) ++static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) + { + if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) + atomic_inc(&c->btree_cache.dirty); + } + +-static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) ++static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) + { + if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) + atomic_dec(&c->btree_cache.dirty); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index cf9ebe7411cc..1e38f6670238 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -429,7 +429,29 @@ struct btree_trans { + struct replicas_delta_list *fs_usage_deltas; + }; + +-#define BTREE_FLAG(flag) \ ++#define BTREE_FLAGS() \ ++ x(read_in_flight) \ ++ x(read_error) \ ++ x(dirty) \ ++ x(need_write) \ ++ x(noevict) \ ++ x(write_idx) \ ++ x(accessed) \ ++ x(write_in_flight) \ ++ x(write_in_flight_inner) \ ++ x(just_written) \ ++ x(dying) \ ++ x(fake) \ ++ x(need_rewrite) \ ++ x(never_write) ++ ++enum btree_flags { ++#define x(flag) BTREE_NODE_##flag, ++ BTREE_FLAGS() ++#undef x ++}; ++ ++#define x(flag) \ + static inline bool btree_node_ ## flag(struct btree *b) \ + { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +@@ -439,36 +461,8 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ + static inline void clear_btree_node_ ## flag(struct btree *b) \ + { clear_bit(BTREE_NODE_ ## flag, &b->flags); } + +-enum btree_flags { +- BTREE_NODE_read_in_flight, +- BTREE_NODE_read_error, +- BTREE_NODE_dirty, +- BTREE_NODE_need_write, +- BTREE_NODE_noevict, +- BTREE_NODE_write_idx, +- 
BTREE_NODE_accessed, +- BTREE_NODE_write_in_flight, +- BTREE_NODE_write_in_flight_inner, +- BTREE_NODE_just_written, +- BTREE_NODE_dying, +- BTREE_NODE_fake, +- BTREE_NODE_need_rewrite, +- BTREE_NODE_never_write, +-}; +- +-BTREE_FLAG(read_in_flight); +-BTREE_FLAG(read_error); +-BTREE_FLAG(need_write); +-BTREE_FLAG(noevict); +-BTREE_FLAG(write_idx); +-BTREE_FLAG(accessed); +-BTREE_FLAG(write_in_flight); +-BTREE_FLAG(write_in_flight_inner); +-BTREE_FLAG(just_written); +-BTREE_FLAG(dying); +-BTREE_FLAG(fake); +-BTREE_FLAG(need_rewrite); +-BTREE_FLAG(never_write); ++BTREE_FLAGS() ++#undef x + + static inline struct btree_write *btree_current_write(struct btree *b) + { +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index bb5edecb432a..97786abae1f3 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -271,7 +271,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + six_lock_write(&b->c.lock, NULL, NULL); + + set_btree_node_accessed(b); +- set_btree_node_dirty(c, b); ++ set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); + + bch2_bset_init_first(b, &b->data->keys); +@@ -881,7 +881,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, + closure_wake_up(&c->btree_interior_update_wait); + } + +- clear_btree_node_dirty(c, b); ++ clear_btree_node_dirty_acct(c, b); + clear_btree_node_need_write(b); + + /* +@@ -1164,7 +1164,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + bch2_btree_node_iter_advance(node_iter, b); + + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); +- set_btree_node_dirty(c, b); ++ set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); + } + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index db4aa667487d..05ad8ed15d0b 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -220,7 +220,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); + + if (unlikely(!btree_node_dirty(b))) +- set_btree_node_dirty(c, b); ++ set_btree_node_dirty_acct(c, b); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; +-- +cgit v1.2.3 + + +From 6629427e306e5f858df589ddccfd6e09c99e88b4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 26 Feb 2022 11:48:34 -0500 +Subject: bcachefs: Start moving debug info from sysfs to debugfs + +In sysfs, files can only output at most PAGE_SIZE. This is a problem for +debug info that needs to list an arbitrary number of times, and because +of this limit some of our debug info has been terser and harder to read +than we'd like. + +This patch moves info about journal pins and cached btree nodes to +debugfs, and greatly expands and improves the output we return. 
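+
+For illustration only (generic code with made-up names, not part of this
+patch): a debugfs ->read() is handed the user buffer size and a file
+position, so it can be called repeatedly and stream output of arbitrary
+length, whereas a sysfs ->show() has to fit everything into a single
+PAGE_SIZE buffer:
+
+    #include <linux/debugfs.h>
+    #include <linux/kernel.h>
+    #include <linux/module.h>
+    #include <linux/uaccess.h>
+
+    static ssize_t foo_records_read(struct file *file, char __user *ubuf,
+                                    size_t size, loff_t *ppos)
+    {
+            char line[64];
+            int len;
+
+            if (*ppos >= 4096)      /* as many records as we like */
+                    return 0;       /* EOF */
+
+            len = scnprintf(line, sizeof(line), "record %lld\n", *ppos);
+            if (len > size)
+                    return -EINVAL;
+            if (copy_to_user(ubuf, line, len))
+                    return -EFAULT;
+
+            (*ppos)++;              /* file position doubles as iterator */
+            return len;
+    }
+
+    static const struct file_operations foo_records_ops = {
+            .owner  = THIS_MODULE,
+            .read   = foo_records_read,
+    };
+
+    /* registered with:
+     * debugfs_create_file("records", 0400, parent, NULL, &foo_records_ops);
+     */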
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 6 +- + fs/bcachefs/btree_io.c | 27 -------- + fs/bcachefs/btree_io.h | 1 - + fs/bcachefs/debug.c | 176 +++++++++++++++++++++++++++++++++++++++++++++---- + fs/bcachefs/journal.c | 56 +++++++++++----- + fs/bcachefs/journal.h | 1 + + fs/bcachefs/sysfs.c | 10 --- + 7 files changed, 206 insertions(+), 71 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index fb2d4d9b06e2..211fd5adf9e3 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -538,9 +538,6 @@ enum { + + struct btree_debug { + unsigned id; +- struct dentry *btree; +- struct dentry *btree_format; +- struct dentry *failed; + }; + + struct bch_fs_pcpu { +@@ -885,7 +882,8 @@ struct bch_fs { + struct bch_memquota_type quotas[QTYP_NR]; + + /* DEBUG JUNK */ +- struct dentry *debug; ++ struct dentry *fs_debug_dir; ++ struct dentry *btree_debug_dir; + struct btree_debug btree_debug[BTREE_ID_NR]; + struct btree *verify_data; + struct btree_node *verify_ondisk; +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index cf86ca632085..ef1e301f201c 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -2109,30 +2109,3 @@ void bch2_btree_flush_all_writes(struct bch_fs *c) + { + __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); + } +- +-void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) +-{ +- struct bucket_table *tbl; +- struct rhash_head *pos; +- struct btree *b; +- unsigned i; +- +- rcu_read_lock(); +- for_each_cached_btree(b, c, tbl, i, pos) { +- unsigned long flags = READ_ONCE(b->flags); +- +- if (!(flags & (1 << BTREE_NODE_dirty))) +- continue; +- +- pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", +- b, +- (flags & (1 << BTREE_NODE_dirty)) != 0, +- (flags & (1 << BTREE_NODE_need_write)) != 0, +- b->c.level, +- b->written, +- !list_empty_careful(&b->write_blocked), +- b->will_make_reachable != 0, +- b->will_make_reachable & 1); +- } +- rcu_read_unlock(); +-} +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index a1dea8e85e4d..638a9b30f0cb 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -177,7 +177,6 @@ do { \ + + void bch2_btree_flush_all_reads(struct bch_fs *); + void bch2_btree_flush_all_writes(struct bch_fs *); +-void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); + + static inline void compat_bformat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index ee22ed31ce37..2d65ae370931 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -185,9 +185,10 @@ out: + /* XXX: bch_fs refcounting */ + + struct dump_iter { +- struct bpos from; +- struct bch_fs *c; ++ struct bch_fs *c; + enum btree_id id; ++ struct bpos from; ++ u64 iter; + + struct printbuf buf; + +@@ -226,6 +227,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file) + + file->private_data = i; + i->from = POS_MIN; ++ i->iter = 0; + i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); + i->id = bd->id; + i->buf = PRINTBUF; +@@ -420,10 +422,148 @@ static const struct file_operations bfloat_failed_debug_ops = { + .read = bch2_read_bfloat_failed, + }; + ++static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ out->tabstops[0] = 32; ++ ++ pr_buf(out, "%px btree=%s l=%u ", ++ b, ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level); ++ pr_newline(out); ++ ++ pr_indent_push(out, 2); ++ ++ 
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ pr_newline(out); ++ ++ pr_buf(out, "flags: "); ++ pr_tab(out); ++ bch2_flags_to_text(out, bch2_btree_node_flags, b->flags); ++ pr_newline(out); ++ ++ pr_buf(out, "written:"); ++ pr_tab(out); ++ pr_buf(out, "%u", b->written); ++ pr_newline(out); ++ ++ pr_buf(out, "writes blocked:"); ++ pr_tab(out); ++ pr_buf(out, "%u", !list_empty_careful(&b->write_blocked)); ++ pr_newline(out); ++ ++ pr_buf(out, "will make reachable:"); ++ pr_tab(out); ++ pr_buf(out, "%lx", b->will_make_reachable); ++ pr_newline(out); ++ ++ pr_buf(out, "journal pin %px:", &b->writes[0].journal); ++ pr_tab(out); ++ pr_buf(out, "%llu", b->writes[0].journal.seq); ++ pr_newline(out); ++ ++ pr_buf(out, "journal pin %px:", &b->writes[1].journal); ++ pr_tab(out); ++ pr_buf(out, "%llu", b->writes[1].journal.seq); ++ pr_newline(out); ++ ++ pr_indent_pop(out, 2); ++} ++ ++static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ bool done = false; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ do { ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ rcu_read_lock(); ++ i->buf.atomic++; ++ tbl = rht_dereference_rcu(c->btree_cache.table.tbl, ++ &c->btree_cache.table); ++ if (i->iter < tbl->size) { ++ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) ++ bch2_cached_btree_node_to_text(&i->buf, c, b); ++ i->iter++;; ++ } else { ++ done = true; ++ } ++ --i->buf.atomic; ++ rcu_read_unlock(); ++ } while (!done); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations cached_btree_nodes_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_cached_btree_nodes_read, ++}; ++ ++static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ bool done = false; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ do { ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); ++ i->iter++; ++ } while (!done); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations journal_pins_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_journal_pins_read, ++}; ++ + void bch2_fs_debug_exit(struct bch_fs *c) + { +- if (!IS_ERR_OR_NULL(c->debug)) +- debugfs_remove_recursive(c->debug); ++ if (!IS_ERR_OR_NULL(c->fs_debug_dir)) ++ debugfs_remove_recursive(c->fs_debug_dir); + } + + void bch2_fs_debug_init(struct bch_fs *c) +@@ -435,29 +575,39 @@ void bch2_fs_debug_init(struct bch_fs *c) + return; + + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); +- c->debug = debugfs_create_dir(name, bch_debug); +- if (IS_ERR_OR_NULL(c->debug)) ++ c->fs_debug_dir = debugfs_create_dir(name, bch_debug); ++ if (IS_ERR_OR_NULL(c->fs_debug_dir)) ++ return; ++ ++ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, ++ c->btree_debug, &cached_btree_nodes_ops); ++ ++ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, ++ c->btree_debug, &journal_pins_ops); ++ ++ 
c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); ++ if (IS_ERR_OR_NULL(c->btree_debug_dir)) + return; + + for (bd = c->btree_debug; + bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); + bd++) { + bd->id = bd - c->btree_debug; +- bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], +- 0400, c->debug, bd, +- &btree_debug_ops); ++ debugfs_create_file(bch2_btree_ids[bd->id], ++ 0400, c->btree_debug_dir, bd, ++ &btree_debug_ops); + + snprintf(name, sizeof(name), "%s-formats", + bch2_btree_ids[bd->id]); + +- bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, +- &btree_format_debug_ops); ++ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, ++ &btree_format_debug_ops); + + snprintf(name, sizeof(name), "%s-bfloat-failed", + bch2_btree_ids[bd->id]); + +- bd->failed = debugfs_create_file(name, 0400, c->debug, bd, +- &bfloat_failed_debug_ops); ++ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, ++ &bfloat_failed_debug_ops); + } + } + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index ffaf58956450..9cd1e11ad1b5 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1281,35 +1281,59 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + spin_unlock(&j->lock); + } + +-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) + { + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; +- u64 i; + + spin_lock(&j->lock); ++ *seq = max(*seq, j->pin.front); ++ ++ if (*seq >= j->pin.back) { ++ spin_unlock(&j->lock); ++ return true; ++ } ++ + out->atomic++; + +- fifo_for_each_entry_ptr(pin_list, &j->pin, i) { +- pr_buf(out, "%llu: count %u\n", +- i, atomic_read(&pin_list->count)); ++ pin_list = journal_seq_pin(j, *seq); + +- list_for_each_entry(pin, &pin_list->key_cache_list, list) +- pr_buf(out, "\t%px %ps\n", +- pin, pin->flush); ++ pr_buf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); ++ pr_newline(out); ++ pr_indent_push(out, 2); + +- list_for_each_entry(pin, &pin_list->list, list) +- pr_buf(out, "\t%px %ps\n", +- pin, pin->flush); ++ list_for_each_entry(pin, &pin_list->list, list) { ++ pr_buf(out, "\t%px %ps", pin, pin->flush); ++ pr_newline(out); ++ } ++ ++ list_for_each_entry(pin, &pin_list->key_cache_list, list) { ++ pr_buf(out, "\t%px %ps", pin, pin->flush); ++ pr_newline(out); ++ } + +- if (!list_empty(&pin_list->flushed)) +- pr_buf(out, "flushed:\n"); ++ if (!list_empty(&pin_list->flushed)) { ++ pr_buf(out, "flushed:"); ++ pr_newline(out); ++ } + +- list_for_each_entry(pin, &pin_list->flushed, list) +- pr_buf(out, "\t%px %ps\n", +- pin, pin->flush); ++ list_for_each_entry(pin, &pin_list->flushed, list) { ++ pr_buf(out, "\t%px %ps", pin, pin->flush); ++ pr_newline(out); + } + ++ pr_indent_pop(out, 2); ++ + --out->atomic; + spin_unlock(&j->lock); ++ ++ return false; ++} ++ ++void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++{ ++ u64 seq = 0; ++ ++ while (!bch2_journal_seq_pins_to_text(out, j, &seq)) ++ seq++; + } +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 296981740cc3..0a3fb8a061c2 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -501,6 +501,7 @@ void bch2_journal_block(struct journal *); + void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); + void bch2_journal_debug_to_text(struct printbuf *, struct journal *); + void bch2_journal_pins_to_text(struct printbuf *, struct journal *); 
++bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + + int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index ce32b9068518..3018250d421b 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -174,9 +174,7 @@ read_attribute(reserve_stats); + read_attribute(btree_cache_size); + read_attribute(compression_stats); + read_attribute(journal_debug); +-read_attribute(journal_pins); + read_attribute(btree_updates); +-read_attribute(dirty_btree_nodes); + read_attribute(btree_cache); + read_attribute(btree_key_cache); + read_attribute(btree_transactions); +@@ -402,15 +400,9 @@ SHOW(bch2_fs) + if (attr == &sysfs_journal_debug) + bch2_journal_debug_to_text(out, &c->journal); + +- if (attr == &sysfs_journal_pins) +- bch2_journal_pins_to_text(out, &c->journal); +- + if (attr == &sysfs_btree_updates) + bch2_btree_updates_to_text(out, c); + +- if (attr == &sysfs_dirty_btree_nodes) +- bch2_dirty_btree_nodes_to_text(out, c); +- + if (attr == &sysfs_btree_cache) + bch2_btree_cache_to_text(out, c); + +@@ -564,9 +556,7 @@ SYSFS_OPS(bch2_fs_internal); + + struct attribute *bch2_fs_internal_files[] = { + &sysfs_journal_debug, +- &sysfs_journal_pins, + &sysfs_btree_updates, +- &sysfs_dirty_btree_nodes, + &sysfs_btree_cache, + &sysfs_btree_key_cache, + &sysfs_btree_transactions, +-- +cgit v1.2.3 + + +From b2763a3b4e1247ec7740a352fd7a6714508abf6e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 26 Feb 2022 21:35:16 -0500 +Subject: bcachefs: Fix locking in btree_node_write_done() + +There was a rare recursive locking bug, in __bch2_btree_node_write() +nowrite path -> btree_node_write_done(), in the path that kicks off +another write. + +This splits out an inner __btree_node_write_done() that expects to be +run with the btree node lock held. 
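+
+In sketch form (the real change is in the diff just below), this is the
+usual inner/outer split: the double-underscore helper assumes the lock is
+already held, and the plain wrapper is the only caller that takes it:
+
+    /* caller must hold b->c.lock */
+    static void __btree_node_write_done(struct bch_fs *c, struct btree *b);
+
+    static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+    {
+            six_lock_read(&b->c.lock, NULL, NULL);
+            __btree_node_write_done(c, b);
+            six_unlock_read(&b->c.lock);
+    }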
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 25 +++++++------------------ + 1 file changed, 7 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index ef1e301f201c..c0b59e727eaf 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1596,29 +1596,13 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + bch2_journal_pin_drop(&c->journal, &w->journal); + } + +-static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++static void __btree_node_write_done(struct bch_fs *c, struct btree *b) + { + struct btree_write *w = btree_prev_write(b); + unsigned long old, new, v; + + bch2_btree_complete_write(c, b, w); + +- v = READ_ONCE(b->flags); +- do { +- old = new = v; +- +- if (old & (1U << BTREE_NODE_need_write)) +- goto do_write; +- +- new &= ~(1U << BTREE_NODE_write_in_flight); +- new &= ~(1U << BTREE_NODE_write_in_flight_inner); +- } while ((v = cmpxchg(&b->flags, old, new)) != old); +- +- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +- return; +- +-do_write: +- six_lock_read(&b->c.lock, NULL, NULL); + v = READ_ONCE(b->flags); + do { + old = new = v; +@@ -1641,7 +1625,12 @@ do_write: + + if (new & (1U << BTREE_NODE_write_in_flight)) + __bch2_btree_node_write(c, b, true); ++} + ++static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ __btree_node_write_done(c, b); + six_unlock_read(&b->c.lock); + } + +@@ -1995,7 +1984,7 @@ err: + b->written += sectors_to_write; + nowrite: + btree_bounce_free(c, bytes, used_mempool, data); +- btree_node_write_done(c, b); ++ __btree_node_write_done(c, b); + } + + /* +-- +cgit v1.2.3 + + +From 2597c9af3beb6d070474e1907cb7128c5f29d9a1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 26 Feb 2022 21:46:41 -0500 +Subject: bcachefs: Improve btree_node_write_if_need() + +btree_node_write_if_need() kicks off a btree node write only if +need_write is set; this makes the locking easier to reason about by +moving the check into the cmpxchg loop in __bch2_btree_node_write(). 
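+
+Roughly, condensed from the __bch2_btree_node_write() hunk below (not the
+complete set of flag transitions), the "only if needed" check now happens
+against the same snapshot of b->flags that the cmpxchg commits:
+
+    unsigned long old, new, v = READ_ONCE(b->flags);
+
+    do {
+            old = new = v;
+
+            if (!(old & (1 << BTREE_NODE_dirty)))
+                    return;
+
+            if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
+                !(old & (1 << BTREE_NODE_need_write)))
+                    return;
+
+            if (old & (1 << BTREE_NODE_write_in_flight))
+                    return;
+
+            new &= ~(1 << BTREE_NODE_dirty);
+            new &= ~(1 << BTREE_NODE_need_write);
+    } while ((v = cmpxchg(&b->flags, old, new)) != old);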
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 6 +++--- + fs/bcachefs/btree_io.c | 22 ++++++++++++++-------- + fs/bcachefs/btree_io.h | 13 ++++++------- + fs/bcachefs/btree_update_interior.c | 12 ++++++------ + 4 files changed, 29 insertions(+), 24 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 1aed196edd60..9f7f7c37f19b 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -240,9 +240,9 @@ wait_on_io: + * the post write cleanup: + */ + if (bch2_verify_btree_ondisk) +- bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); + else +- __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, 0); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +@@ -1065,7 +1065,7 @@ wait_on_io: + six_lock_write(&b->c.lock, NULL, NULL); + + if (btree_node_dirty(b)) { +- __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, 0); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index c0b59e727eaf..e15558177735 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -477,7 +477,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) + }; + + if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { +- bch2_btree_node_write(c, b, SIX_LOCK_write); ++ bch2_btree_node_write(c, b, SIX_LOCK_write, 0); + reinit_iter = true; + } + } +@@ -1624,7 +1624,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + if (new & (1U << BTREE_NODE_write_in_flight)) +- __bch2_btree_node_write(c, b, true); ++ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); + } + + static void btree_node_write_done(struct bch_fs *c, struct btree *b) +@@ -1745,7 +1745,7 @@ static void btree_write_submit(struct work_struct *work) + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); + } + +-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started) ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) + { + struct btree_write_bio *wbio; + struct bset_tree *t; +@@ -1762,7 +1762,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta + void *data; + int ret; + +- if (already_started) ++ if (flags & BTREE_WRITE_ALREADY_STARTED) + goto do_write; + + /* +@@ -1778,13 +1778,18 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta + if (!(old & (1 << BTREE_NODE_dirty))) + return; + ++ if ((flags & BTREE_WRITE_ONLY_IF_NEED) && ++ !(old & (1 << BTREE_NODE_need_write))) ++ return; ++ + if (!btree_node_may_write(b)) + return; + + if (old & (1 << BTREE_NODE_never_write)) + return; + +- BUG_ON(old & (1 << BTREE_NODE_write_in_flight)); ++ if (old & (1 << BTREE_NODE_write_in_flight)) ++ return; + + new &= ~(1 << BTREE_NODE_dirty); + new &= ~(1 << BTREE_NODE_need_write); +@@ -2047,12 +2052,13 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) + * Use this one if the node is intent locked: + */ + void bch2_btree_node_write(struct bch_fs *c, struct btree *b, +- enum six_lock_type lock_type_held) ++ enum six_lock_type lock_type_held, ++ unsigned flags) + { + if (lock_type_held == SIX_LOCK_intent || + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { +- __bch2_btree_node_write(c, b, false); ++ 
__bch2_btree_node_write(c, b, flags); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && +@@ -2064,7 +2070,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + } else { +- __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, flags); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 638a9b30f0cb..3dbb518c4da4 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -143,20 +143,19 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id, + void bch2_btree_complete_write(struct bch_fs *, struct btree *, + struct btree_write *); + +-void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); + bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + ++#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) ++#define BTREE_WRITE_ALREADY_STARTED (1U << 1) ++ ++void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); + void bch2_btree_node_write(struct bch_fs *, struct btree *, +- enum six_lock_type); ++ enum six_lock_type, unsigned); + + static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_held) + { +- if (b->written && +- btree_node_need_write(b) && +- btree_node_may_write(b) && +- !btree_node_write_in_flight(b)) +- bch2_btree_node_write(c, b, lock_held); ++ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); + } + + #define bch2_btree_node_write_cond(_c, _b, cond) \ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 97786abae1f3..10886db72085 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1385,8 +1385,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, + six_unlock_write(&n2->c.lock); + six_unlock_write(&n1->c.lock); + +- bch2_btree_node_write(c, n1, SIX_LOCK_intent); +- bch2_btree_node_write(c, n2, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + + /* + * Note that on recursive parent_keys == keys, so we +@@ -1405,7 +1405,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, + + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); + +- bch2_btree_node_write(c, n3, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + } + } else { + trace_btree_compact(c, b); +@@ -1413,7 +1413,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, + bch2_btree_build_aux_trees(n1); + six_unlock_write(&n1->c.lock); + +- bch2_btree_node_write(c, n1, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); +@@ -1701,7 +1701,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + bch2_btree_build_aux_trees(n); + six_unlock_write(&n->c.lock); + +- bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; +@@ -1775,7 +1775,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, + + trace_btree_gc_rewrite_node(c, b); + +- bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); +-- 
+cgit v1.2.3 + + +From eda21d338da801ecfd843b5d690c6e1e2812fbe5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 27 Feb 2022 09:42:46 -0500 +Subject: bcachefs: Kill bch2_btree_node_write_cond() + +bch2_btree_node_write_cond() was only used in one place - this inlines +it into __btree_node_flush() and makes the cmpxchg loop actually +correct. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.h | 16 ---------------- + fs/bcachefs/btree_update_leaf.c | 18 ++++++++++++++++-- + 2 files changed, 16 insertions(+), 18 deletions(-) + +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 3dbb518c4da4..7ed88089f6f9 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -158,22 +158,6 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); + } + +-#define bch2_btree_node_write_cond(_c, _b, cond) \ +-do { \ +- unsigned long old, new, v = READ_ONCE((_b)->flags); \ +- \ +- do { \ +- old = new = v; \ +- \ +- if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ +- break; \ +- \ +- new |= (1 << BTREE_NODE_need_write); \ +- } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ +- \ +- btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ +-} while (0) +- + void bch2_btree_flush_all_reads(struct bch_fs *); + void bch2_btree_flush_all_writes(struct bch_fs *); + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 05ad8ed15d0b..bac94327ceca 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -167,10 +167,24 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); ++ unsigned long old, new, v; ++ unsigned idx = w - b->writes; + + six_lock_read(&b->c.lock, NULL, NULL); +- bch2_btree_node_write_cond(c, b, +- (btree_current_write(b) == w && w->journal.seq == seq)); ++ v = READ_ONCE(b->flags); ++ ++ do { ++ old = new = v; ++ ++ if (!(old & (1 << BTREE_NODE_dirty)) || ++ !!(old & (1 << BTREE_NODE_write_idx)) != idx || ++ w->journal.seq != seq) ++ break; ++ ++ new |= 1 << BTREE_NODE_need_write; ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + return 0; + } +-- +cgit v1.2.3 + + +From b0fb899fbd6aaa848186cfffc353118d87415e7e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 27 Feb 2022 09:56:33 -0500 +Subject: bcachefs: Fix race leading to btree node write getting stuck + +Checking btree_node_may_write() isn't atomic with the other btree flags, +dirty and need_write in particular. There was a rare race where we'd +unblock a node from writing while __btree_node_flush() was setting +need_write, and no thread would notice that the node was now both able +to write and needed to be written. + +Fix this by adding btree node flags for will_make_reachable and +write_blocked that can be checked in the cmpxchg loop in +__bch2_btree_node_write. 
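+
+Condensed before/after from the hunks below: the old predicate read state
+that lives outside b->flags, so it couldn't be part of the atomic flags
+transition; the replacement is just extra bits tested against the same
+snapshot inside the cmpxchg loop:
+
+    /* old, racy - removed below: */
+    static inline bool btree_node_may_write(struct btree *b)
+    {
+            return list_empty_careful(&b->write_blocked) &&
+                    (!b->written || !b->will_make_reachable);
+    }
+
+    /* new checks in __bch2_btree_node_write(): */
+    if (old & ((1 << BTREE_NODE_never_write)|
+               (1 << BTREE_NODE_write_blocked)))
+            return;
+
+    if (b->written &&
+        (old & (1 << BTREE_NODE_will_make_reachable)))
+            return;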
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 7 +++---- + fs/bcachefs/btree_io.c | 10 +++++++--- + fs/bcachefs/btree_io.h | 6 ------ + fs/bcachefs/btree_types.h | 2 ++ + fs/bcachefs/btree_update_interior.c | 7 +++++++ + 5 files changed, 19 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 9f7f7c37f19b..1347b1fc1166 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -224,10 +224,9 @@ wait_on_io: + goto wait_on_io; + } + +- if (btree_node_noevict(b)) +- goto out_unlock; +- +- if (!btree_node_may_write(b)) ++ if (btree_node_noevict(b) || ++ btree_node_write_blocked(b) || ++ btree_node_will_make_reachable(b)) + goto out_unlock; + + if (btree_node_dirty(b)) { +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index e15558177735..08f5f6b865c6 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1610,7 +1610,8 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) + if ((old & (1U << BTREE_NODE_dirty)) && + (old & (1U << BTREE_NODE_need_write)) && + !(old & (1U << BTREE_NODE_never_write)) && +- btree_node_may_write(b)) { ++ !(old & (1U << BTREE_NODE_write_blocked)) && ++ !(old & (1U << BTREE_NODE_will_make_reachable))) { + new &= ~(1U << BTREE_NODE_dirty); + new &= ~(1U << BTREE_NODE_need_write); + new |= (1U << BTREE_NODE_write_in_flight); +@@ -1782,10 +1783,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) + !(old & (1 << BTREE_NODE_need_write))) + return; + +- if (!btree_node_may_write(b)) ++ if (old & ++ ((1 << BTREE_NODE_never_write)| ++ (1 << BTREE_NODE_write_blocked))) + return; + +- if (old & (1 << BTREE_NODE_never_write)) ++ if (b->written && ++ (old & (1 << BTREE_NODE_will_make_reachable))) + return; + + if (old & (1 << BTREE_NODE_write_in_flight)) +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 7ed88089f6f9..d818d87661e8 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -62,12 +62,6 @@ void __bch2_btree_node_wait_on_write(struct btree *); + void bch2_btree_node_wait_on_read(struct btree *); + void bch2_btree_node_wait_on_write(struct btree *); + +-static inline bool btree_node_may_write(struct btree *b) +-{ +- return list_empty_careful(&b->write_blocked) && +- (!b->written || !b->will_make_reachable); +-} +- + enum compact_mode { + COMPACT_LAZY, + COMPACT_ALL, +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 1e38f6670238..09b6db1d93f2 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -434,6 +434,8 @@ struct btree_trans { + x(read_error) \ + x(dirty) \ + x(need_write) \ ++ x(write_blocked) \ ++ x(will_make_reachable) \ + x(noevict) \ + x(write_idx) \ + x(accessed) \ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 10886db72085..63832fb9a407 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -619,6 +619,8 @@ err: + mutex_lock(&c->btree_interior_update_lock); + + list_del(&as->write_blocked_list); ++ if (list_empty(&b->write_blocked)) ++ clear_btree_node_write_blocked(b); + + /* + * Node might have been freed, recheck under +@@ -663,6 +665,7 @@ err: + + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; ++ clear_btree_node_will_make_reachable(b); + } + mutex_unlock(&c->btree_interior_update_lock); + +@@ -729,6 +732,8 @@ static void btree_update_updated_node(struct btree_update *as, 
struct btree *b) + + as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->b = b; ++ ++ set_btree_node_write_blocked(b); + list_add(&as->write_blocked_list, &b->write_blocked); + + mutex_unlock(&c->btree_interior_update_lock); +@@ -794,6 +799,7 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree + + as->new_nodes[as->nr_new_nodes++] = b; + b->will_make_reachable = 1UL|(unsigned long) as; ++ set_btree_node_will_make_reachable(b); + + mutex_unlock(&c->btree_interior_update_lock); + +@@ -816,6 +822,7 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) + * xchg() is for synchronization with bch2_btree_complete_write: + */ + v = xchg(&b->will_make_reachable, 0); ++ clear_btree_node_will_make_reachable(b); + as = (struct btree_update *) (v & ~1UL); + + if (!as) { +-- +cgit v1.2.3 + + +From 61ea724ad67bd65a4d9701cf02e9041b018cef78 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 27 Feb 2022 11:57:42 -0500 +Subject: bcachefs: Fix a memory leak + +This fixes a regression from "bcachefs: Heap allocate printbufs" - +bch2_sb_field_validate() was leaking an error string. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 08966f4004fb..8580b6fd580a 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -1420,24 +1420,25 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { + }; + + static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, +- struct printbuf *orig_err) ++ struct printbuf *err) + { + unsigned type = le32_to_cpu(f->type); +- struct printbuf err = *orig_err; ++ struct printbuf field_err = PRINTBUF; + int ret; + + if (type >= BCH_SB_FIELD_NR) + return 0; + +- pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]); +- +- ret = bch2_sb_field_ops[type]->validate(sb, f, &err); ++ ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); + if (ret) { +- pr_newline(&err); +- bch2_sb_field_to_text(&err, sb, f); +- *orig_err = err; ++ pr_buf(err, "Invalid superblock section %s: %s", ++ bch2_sb_fields[type], ++ field_err.buf); ++ pr_newline(err); ++ bch2_sb_field_to_text(err, sb, f); + } + ++ printbuf_exit(&field_err); + return ret; + } + +-- +cgit v1.2.3 + + +From e8f0dd8b2abdf69bfce162c05d09f117aab34e10 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 27 Feb 2022 11:34:21 -0500 +Subject: bcachefs: Fix a use after free + +This fixes a regression from "bcachefs: Stash a copy of key being +overwritten in btree_insert_entry". In btree_key_can_insert_cached(), we +may reallocate the key cache key, invalidating pointers previously +returned by peek() - fix it by issuing a transaction restart. 
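+
+Condensed from the hunk below, the fix is the standard transaction restart,
+flagged by hand because btree_trans_restart() would unlock, which isn't
+allowed while write locks are held at this point:
+
+    ck->u64s = new_u64s;
+    ck->k    = new_k;       /* bkey pointers from earlier peek() are now stale */
+
+    trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_,
+                                                path->btree_id, &path->pos);
+    trans->restarted = true;
+    return -EINTR;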
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 13 ++++++++++++- + include/trace/events/bcachefs.h | 8 ++++++++ + 2 files changed, 20 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index bac94327ceca..ad5f516da1aa 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -381,7 +381,18 @@ btree_key_can_insert_cached(struct btree_trans *trans, + + ck->u64s = new_u64s; + ck->k = new_k; +- return BTREE_INSERT_OK; ++ /* ++ * Keys returned by peek() are no longer valid pointers, so we need a ++ * transaction restart: ++ */ ++ trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos); ++ /* ++ * Not using btree_trans_restart() because we can't unlock here, we have ++ * write locks held: ++ */ ++ trans->restarted = true; ++ return -EINTR; + } + + static inline void do_btree_insert_one(struct btree_trans *trans, +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 8cf6669e2830..0596887959d3 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -918,6 +918,14 @@ TRACE_EVENT(trans_restart_mem_realloced, + __entry->bytes) + ); + ++DEFINE_EVENT(transaction_restart_iter, trans_restart_key_cache_key_realloced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ + #endif /* _TRACE_BCACHE_H */ + + /* This part must be outside protection */ +-- +cgit v1.2.3 + + +From abaab424cec5f5d0c7af2c7c871639fa96859489 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 28 Feb 2022 15:51:24 -0500 +Subject: bcachefs: Delete some dead journal code + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 9cd1e11ad1b5..7ae47f00760f 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -273,6 +273,12 @@ static int journal_entry_open(struct journal *j) + + BUG_ON(!j->cur_entry_sectors); + ++ /* We used to add things to the first journal entry before opening it, ++ * as a way to deal with a chicken-and-the-egg problem, but we shouldn't ++ * be anymore: ++ */ ++ BUG_ON(buf->data->u64s); ++ + buf->u64s_reserved = j->entry_u64s_reserved; + buf->disk_sectors = j->cur_entry_sectors; + buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); +@@ -281,7 +287,7 @@ static int journal_entry_open(struct journal *j) + journal_entry_overhead(j); + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + +- if (u64s <= le32_to_cpu(buf->data->u64s)) ++ if (u64s <= 0) + return cur_entry_journal_full; + + /* +@@ -296,11 +302,9 @@ static int journal_entry_open(struct journal *j) + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return cur_entry_insufficient_devices; + +- /* Handle any already added entries */ +- new.cur_entry_offset = le32_to_cpu(buf->data->u64s); +- + EBUG_ON(journal_state_count(new, new.idx)); + journal_state_inc(&new); ++ new.cur_entry_offset = 0; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + +-- +cgit v1.2.3 + + +From e3204ed2ecfbfccd3309a5b5312f9e28459ef16c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 25 Feb 2022 10:28:20 -0500 +Subject: bcachefs: Kill JOURNAL_NEED_WRITE + +This replaces the journal flag JOURNAL_NEED_WRITE with per-journal buf +state 
- more explicit, and solving a race in the old code that would +lead to entries being opened and written unnecessarily. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 63 ++++++++++++++++++++++++++++++++------------- + fs/bcachefs/journal_io.c | 12 +++++---- + fs/bcachefs/journal_types.h | 10 ++----- + 3 files changed, 54 insertions(+), 31 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 7ae47f00760f..b95c18f1a083 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -87,6 +87,7 @@ static void bch2_journal_buf_init(struct journal *j) + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; ++ buf->flush_time = 0; + + memset(buf->data, 0, sizeof(*buf->data)); + buf->data->seq = cpu_to_le64(journal_cur_seq(j)); +@@ -153,11 +154,6 @@ static bool __journal_entry_close(struct journal *j) + return true; + } + +- if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { +- set_bit(JOURNAL_NEED_WRITE, &j->flags); +- j->need_write_time = local_clock(); +- } +- + new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; + new.idx++; + +@@ -206,7 +202,6 @@ static bool __journal_entry_close(struct journal *j) + bch2_journal_buf_init(j); + + cancel_delayed_work(&j->write_work); +- clear_bit(JOURNAL_NEED_WRITE, &j->flags); + + bch2_journal_space_available(j); + +@@ -217,15 +212,16 @@ static bool __journal_entry_close(struct journal *j) + static bool journal_entry_want_write(struct journal *j) + { + union journal_res_state s = READ_ONCE(j->reservations); ++ struct journal_buf *buf = journal_cur_buf(j); + bool ret = false; + +- /* +- * Don't close it yet if we already have a write in flight, but do set +- * NEED_WRITE: +- */ +- if (s.idx != s.unwritten_idx) +- set_bit(JOURNAL_NEED_WRITE, &j->flags); +- else ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ ++ /* Don't close it yet if we already have a write in flight: */ ++ if (s.idx == s.unwritten_idx) + ret = __journal_entry_close(j); + + return ret; +@@ -279,6 +275,8 @@ static int journal_entry_open(struct journal *j) + */ + BUG_ON(buf->data->u64s); + ++ buf->expires = jiffies + ++ msecs_to_jiffies(c->opts.journal_flush_delay); + buf->u64s_reserved = j->entry_u64s_reserved; + buf->disk_sectors = j->cur_entry_sectors; + buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); +@@ -338,8 +336,19 @@ static void journal_quiesce(struct journal *j) + static void journal_write_work(struct work_struct *work) + { + struct journal *j = container_of(work, struct journal, write_work.work); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf; ++ long delta; + +- journal_entry_close(j); ++ spin_lock(&j->lock); ++ buf = journal_cur_buf(j); ++ delta = buf->expires - jiffies; ++ ++ if (delta > 0) ++ mod_delayed_work(c->io_complete_wq, &j->write_work, delta); ++ else ++ __journal_entry_close(j); ++ spin_unlock(&j->lock); + } + + static int __journal_res_get(struct journal *j, struct journal_res *res, +@@ -592,7 +601,11 @@ recheck_need_open: + seq = res.seq; + buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf->must_flush = true; +- set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); +@@ -658,7 +671,11 @@ int bch2_journal_meta(struct journal *j) + + buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + buf->must_flush = true; +- set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ 
++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } + + bch2_journal_res_put(j, &res); + +@@ -1231,12 +1248,22 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + + pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", + i, journal_state_count(s, i), j->buf[i].sectors); ++ pr_indent_push(out, 2); ++ ++ pr_buf(out, "refcount %u", journal_state_count(s, i)); ++ pr_newline(out); ++ ++ pr_buf(out, "sectors %u", j->buf[i].sectors); ++ pr_newline(out); ++ ++ pr_buf(out, "expires %li ms", jiffies_to_msecs(j->buf[i].expires - jiffies)); ++ pr_newline(out); ++ ++ pr_indent_pop(out, 2); + } + + pr_buf(out, +- "need write:\t\t%i\n" + "replay done:\t\t%i\n", +- test_bit(JOURNAL_NEED_WRITE, &j->flags), + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + + pr_buf(out, "space:\n"); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 4380ebf5e252..2dbdf40cefd0 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1404,13 +1404,15 @@ static void journal_write_done(struct closure *cl) + closure_wake_up(&w->wait); + journal_wake(j); + +- if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) +- mod_delayed_work(c->io_complete_wq, &j->write_work, 0); +- spin_unlock(&j->lock); ++ if (new.unwritten_idx == new.idx) { ++ struct journal_buf *buf = journal_cur_buf(j); ++ long delta = buf->expires - jiffies; + +- if (new.unwritten_idx != new.idx && +- !journal_state_count(new, new.unwritten_idx)) ++ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); ++ } else if (!journal_state_count(new, new.unwritten_idx)) + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); ++ ++ spin_unlock(&j->lock); + } + + static void journal_write_endio(struct bio *bio) +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index cd66b7388664..77d7192535df 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -25,6 +25,8 @@ struct journal_buf { + + struct closure_waitlist wait; + u64 last_seq; /* copy of data->last_seq */ ++ unsigned long expires; ++ u64 flush_time; + + unsigned buf_size; /* size in bytes of @data */ + unsigned sectors; /* maximum size for current entry */ +@@ -139,16 +141,9 @@ enum journal_space_from { + journal_space_nr, + }; + +-/* +- * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, +- * either because something's waiting on the write to complete or because it's +- * been dirty too long and the timer's expired. +- */ +- + enum { + JOURNAL_REPLAY_DONE, + JOURNAL_STARTED, +- JOURNAL_NEED_WRITE, + JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, + }; +@@ -263,7 +258,6 @@ struct journal { + unsigned long last_flush_write; + + u64 res_get_blocked_start; +- u64 need_write_time; + u64 write_start_time; + + u64 nr_flush_writes; +-- +cgit v1.2.3 + + +From ca1e5857e29549dab68fc3432275693b54b6bc8b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 28 Feb 2022 16:21:07 -0500 +Subject: bcachefs: bch2_journal_halt() now takes journal lock + +This change is prep work for moving some work from +__journal_entry_close() to journal_entry_open(): without this change, +journal_entry_open() doesn't know if it's going to be able to open a new +journal entry until the cmpxchg loop, meaning it can't create the new +journal pin entry and update other global state because those have to be +done prior to the cmpxchg opening the new journal entry. 
+ +Fortunately, we don't call bch2_journal_halt() from interrupt context. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index b95c18f1a083..552cb29c0fe4 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -97,12 +97,15 @@ static void bch2_journal_buf_init(struct journal *j) + void bch2_journal_halt(struct journal *j) + { + union journal_res_state old, new; +- u64 v = atomic64_read(&j->reservations.counter); ++ u64 v; ++ ++ spin_lock(&j->lock); + ++ v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) +- return; ++ goto out; + + new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, +@@ -116,6 +119,8 @@ void bch2_journal_halt(struct journal *j) + j->err_seq = journal_cur_seq(j); + journal_wake(j); + closure_wake_up(&journal_cur_buf(j)->wait); ++out: ++ spin_unlock(&j->lock); + } + + /* journal entry close/open: */ +@@ -267,6 +272,9 @@ static int journal_entry_open(struct journal *j) + if (j->cur_entry_error) + return j->cur_entry_error; + ++ if (bch2_journal_error(j)) ++ return cur_entry_insufficient_devices; /* -EROFS */ ++ + BUG_ON(!j->cur_entry_sectors); + + /* We used to add things to the first journal entry before opening it, +@@ -297,8 +305,7 @@ static int journal_entry_open(struct journal *j) + do { + old.v = new.v = v; + +- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) +- return cur_entry_insufficient_devices; ++ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); + + EBUG_ON(journal_state_count(new, new.idx)); + journal_state_inc(&new); +-- +cgit v1.2.3 + + +From 953a62f0e8580d04dad685dcd5e8a74b67c67e02 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 1 Mar 2022 15:31:20 -0500 +Subject: bcachefs: Drop unneeded journal pin in bch2_btree_update_start() + +When we do an interior btree update, we create new btree nodes and link +them into the btree in memory, but they don't become reachable on disk +until later, when btree_update_nodes_written_trans() runs. + +Updates to the new nodes can thus happen before they're reachable on +disk, and if the updates to those new nodes are written before the nodes +become reachable, we would then drop the journal pin for those updates +before the btree has them. + +This is what the journal pin in bch2_btree_update_start() was protecting +against. However, it's not actually needed because we don't allow +subsequent append writes to btree nodes until the node is reachable on +disk. + +Dropping this unneeded pin also fixes a bug introduced by "bcachefs: +Journal seq now incremented at entry open, not close" - in the new code, +if the journal is completely empty a journal pin list for +journal_cur_seq() won't exist. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 6 ------ + 1 file changed, 6 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 63832fb9a407..e2cf0f58bf2f 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -557,8 +557,6 @@ static void btree_update_nodes_written(struct btree_update *as) + if (ret) + goto err; + +- BUG_ON(!journal_pin_active(&as->journal)); +- + /* + * Wait for any in flight writes to finish before we free the old nodes + * on disk: +@@ -1047,10 +1045,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + goto err; + } + +- bch2_journal_pin_add(&c->journal, +- atomic64_read(&c->journal.seq), +- &as->journal, NULL); +- + return as; + err: + bch2_btree_update_free(as); +-- +cgit v1.2.3 + + +From dd108151f5aa4fe44dfab262e817264383853a67 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 28 Feb 2022 16:35:42 -0500 +Subject: bcachefs: Journal seq now incremented at entry open, not close + +This patch changes journal_entry_open() to initialize the new journal +entry, not __journal_entry_close(). + +This also means that journal_cur_seq() refers to the sequence number of +the last journal entry when we don't have an open journal entry, not the +next one. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_gc.c | 2 +- + fs/bcachefs/journal.c | 126 +++++++++++++----------------------------- + fs/bcachefs/journal_io.c | 4 +- + fs/bcachefs/journal_reclaim.c | 8 +-- + fs/bcachefs/super-io.c | 2 +- + 5 files changed, 44 insertions(+), 98 deletions(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index cd9016541d9c..73b947a493a2 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -781,7 +781,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + + if (initial) { + BUG_ON(bch2_journal_seq_verify && +- k->k->version.lo > journal_cur_seq(&c->journal)); ++ k->k->version.lo > atomic64_read(&c->journal.seq)); + + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); + if (ret) +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 552cb29c0fe4..97165b704809 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -22,16 +22,12 @@ + + static u64 last_unwritten_seq(struct journal *j) + { +- union journal_res_state s = READ_ONCE(j->reservations); +- +- lockdep_assert_held(&j->lock); +- +- return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK); ++ return j->seq_ondisk + 1; + } + + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) + { +- return seq >= last_unwritten_seq(j); ++ return seq > j->seq_ondisk; + } + + static bool __journal_entry_is_open(union journal_res_state state) +@@ -50,8 +46,6 @@ journal_seq_to_buf(struct journal *j, u64 seq) + struct journal_buf *buf = NULL; + + EBUG_ON(seq > journal_cur_seq(j)); +- EBUG_ON(seq == journal_cur_seq(j) && +- j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); + + if (journal_seq_unwritten(j, seq)) { + buf = j->buf + (seq & JOURNAL_BUF_MASK); +@@ -69,31 +63,6 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) + p->devs.nr = 0; + } + +-static void journal_pin_new_entry(struct journal *j) +-{ +- /* +- * The fifo_push() needs to happen at the same time as j->seq is +- * incremented for journal_last_seq() to be calculated correctly +- */ +- atomic64_inc(&j->seq); +- 
journal_pin_list_init(fifo_push_ref(&j->pin), 1); +-} +- +-static void bch2_journal_buf_init(struct journal *j) +-{ +- struct journal_buf *buf = journal_cur_buf(j); +- +- bkey_extent_init(&buf->key); +- buf->noflush = false; +- buf->must_flush = false; +- buf->separate_flush = false; +- buf->flush_time = 0; +- +- memset(buf->data, 0, sizeof(*buf->data)); +- buf->data->seq = cpu_to_le64(journal_cur_seq(j)); +- buf->data->u64s = 0; +-} +- + void bch2_journal_halt(struct journal *j) + { + union journal_res_state old, new; +@@ -201,11 +170,6 @@ static bool __journal_entry_close(struct journal *j) + + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + +- /* Initialize new buffer: */ +- journal_pin_new_entry(j); +- +- bch2_journal_buf_init(j); +- + cancel_delayed_work(&j->write_work); + + bch2_journal_space_available(j); +@@ -275,27 +239,47 @@ static int journal_entry_open(struct journal *j) + if (bch2_journal_error(j)) + return cur_entry_insufficient_devices; /* -EROFS */ + +- BUG_ON(!j->cur_entry_sectors); ++ if (!fifo_free(&j->pin)) ++ return cur_entry_journal_pin_full; + +- /* We used to add things to the first journal entry before opening it, +- * as a way to deal with a chicken-and-the-egg problem, but we shouldn't +- * be anymore: +- */ +- BUG_ON(buf->data->u64s); ++ BUG_ON(!j->cur_entry_sectors); + +- buf->expires = jiffies + ++ buf->expires = ++ (journal_cur_seq(j) == j->flushed_seq_ondisk ++ ? jiffies ++ : j->last_flush_write) + + msecs_to_jiffies(c->opts.journal_flush_delay); ++ + buf->u64s_reserved = j->entry_u64s_reserved; + buf->disk_sectors = j->cur_entry_sectors; + buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); + + u64s = (int) (buf->sectors << 9) / sizeof(u64) - + journal_entry_overhead(j); +- u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); ++ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + + if (u64s <= 0) + return cur_entry_journal_full; + ++ /* ++ * The fifo_push() needs to happen at the same time as j->seq is ++ * incremented for journal_last_seq() to be calculated correctly ++ */ ++ atomic64_inc(&j->seq); ++ journal_pin_list_init(fifo_push_ref(&j->pin), 1); ++ ++ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); ++ ++ bkey_extent_init(&buf->key); ++ buf->noflush = false; ++ buf->must_flush = false; ++ buf->separate_flush = false; ++ buf->flush_time = 0; ++ ++ memset(buf->data, 0, sizeof(*buf->data)); ++ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ++ buf->data->u64s = 0; ++ + /* + * Must be set before marking the journal entry as open: + */ +@@ -306,8 +290,8 @@ static int journal_entry_open(struct journal *j) + old.v = new.v = v; + + BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); ++ BUG_ON(journal_state_count(new, new.idx)); + +- EBUG_ON(journal_state_count(new, new.idx)); + journal_state_inc(&new); + new.cur_entry_offset = 0; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, +@@ -596,9 +580,12 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + seq = max(seq, last_unwritten_seq(j)); + + recheck_need_open: +- if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { ++ if (seq > journal_cur_seq(j)) { + struct journal_res res = { 0 }; + ++ if (journal_entry_is_open(j)) ++ __journal_entry_close(j); ++ + spin_unlock(&j->lock); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); +@@ -695,42 +682,12 @@ int bch2_journal_meta(struct journal *j) + */ + void bch2_journal_flush_async(struct journal *j, struct closure *parent) + { +- u64 seq, journal_seq; +- +- 
spin_lock(&j->lock); +- journal_seq = journal_cur_seq(j); +- +- if (journal_entry_is_open(j)) { +- seq = journal_seq; +- } else if (journal_seq) { +- seq = journal_seq - 1; +- } else { +- spin_unlock(&j->lock); +- return; +- } +- spin_unlock(&j->lock); +- +- bch2_journal_flush_seq_async(j, seq, parent); ++ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); + } + + int bch2_journal_flush(struct journal *j) + { +- u64 seq, journal_seq; +- +- spin_lock(&j->lock); +- journal_seq = journal_cur_seq(j); +- +- if (journal_entry_is_open(j)) { +- seq = journal_seq; +- } else if (journal_seq) { +- seq = journal_seq - 1; +- } else { +- spin_unlock(&j->lock); +- return 0; +- } +- spin_unlock(&j->lock); +- +- return bch2_journal_flush_seq(j, seq); ++ return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); + } + + /* +@@ -1023,8 +980,7 @@ void bch2_fs_journal_stop(struct journal *j) + + BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_REPLAY_DONE, &j->flags) && +- (journal_entry_is_open(j) || +- j->last_empty_seq + 1 != journal_cur_seq(j))); ++ j->last_empty_seq != journal_cur_seq(j)); + + cancel_delayed_work_sync(&j->write_work); + bch2_journal_reclaim_stop(j); +@@ -1094,11 +1050,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; + +- journal_pin_new_entry(j); +- +- j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); +- +- bch2_journal_buf_init(j); ++ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j) + 1; + + c->last_bucket_seq_cleanup = journal_cur_seq(j); + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 2dbdf40cefd0..19a8ce816062 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1369,8 +1369,6 @@ static void journal_write_done(struct closure *cl) + journal_seq_pin(j, seq)->devs = w->devs_written; + + if (!err) { +- j->seq_ondisk = seq; +- + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; +@@ -1378,6 +1376,8 @@ static void journal_write_done(struct closure *cl) + } else if (!j->err_seq || seq < j->err_seq) + j->err_seq = seq; + ++ j->seq_ondisk = seq; ++ + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 3dca50f76ac4..7bd35e58bd3c 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -226,8 +226,6 @@ void bch2_journal_space_available(struct journal *j) + ret = cur_entry_journal_stuck; + } else if (!j->space[journal_space_discarded].next_entry) + ret = cur_entry_journal_full; +- else if (!fifo_free(&j->pin)) +- ret = cur_entry_journal_pin_full; + + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && +@@ -369,9 +367,6 @@ static inline void __journal_pin_drop(struct journal *j, + if (atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin)) + bch2_journal_reclaim_fast(j); +- else if (fifo_used(&j->pin) == 1 && +- atomic_read(&pin_list->count) == 1) +- journal_wake(j); + } + + void bch2_journal_pin_drop(struct journal *j, +@@ -772,8 +767,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, + */ + ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + journal_last_seq(j) > seq_to_flush || +- (fifo_used(&j->pin) == 1 && +- atomic_read(&fifo_peek_front(&j->pin).count) == 1); ++ !fifo_used(&j->pin); + + 
spin_unlock(&j->lock); + mutex_unlock(&j->reclaim_lock); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 8580b6fd580a..1a70adae2463 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -1341,7 +1341,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) + } + + sb_clean->flags = 0; +- sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); ++ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); + + /* Trying to catch outstanding bug: */ + BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); +-- +cgit v1.2.3 + + +From 579ffd051c2c63b5e90d7ccd14461441d50b09df Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 28 Feb 2022 19:17:27 -0500 +Subject: bcachefs: Refactor journal code to not use unwritten_idx + +It makes the code more readable if we work off of sequence numbers, +instead of direct indexes into the array of journal buffers. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 96 +++++++++++++++++++++++++------------------ + fs/bcachefs/journal.h | 5 +++ + fs/bcachefs/journal_io.c | 7 ++-- + fs/bcachefs/journal_reclaim.c | 28 +++++-------- + fs/bcachefs/journal_types.h | 2 +- + 5 files changed, 76 insertions(+), 62 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 97165b704809..a73c78d3e56c 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -20,11 +20,6 @@ + + #include + +-static u64 last_unwritten_seq(struct journal *j) +-{ +- return j->seq_ondisk + 1; +-} +- + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) + { + return seq > j->seq_ondisk; +@@ -35,6 +30,11 @@ static bool __journal_entry_is_open(union journal_res_state state) + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; + } + ++static inline unsigned nr_unwritten_journal_entries(struct journal *j) ++{ ++ return atomic64_read(&j->seq) - j->seq_ondisk; ++} ++ + static bool journal_entry_is_open(struct journal *j) + { + return __journal_entry_is_open(j->reservations); +@@ -167,6 +167,7 @@ static bool __journal_entry_close(struct journal *j) + */ + buf->last_seq = journal_last_seq(j); + buf->data->last_seq = cpu_to_le64(buf->last_seq); ++ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); + + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + +@@ -180,18 +181,19 @@ static bool __journal_entry_close(struct journal *j) + + static bool journal_entry_want_write(struct journal *j) + { +- union journal_res_state s = READ_ONCE(j->reservations); +- struct journal_buf *buf = journal_cur_buf(j); +- bool ret = false; +- +- if (!buf->flush_time) { +- buf->flush_time = local_clock() ?: 1; +- buf->expires = jiffies; +- } ++ bool ret = !journal_entry_is_open(j) || ++ (journal_cur_seq(j) == journal_last_unwritten_seq(j) && ++ __journal_entry_close(j)); + + /* Don't close it yet if we already have a write in flight: */ +- if (s.idx == s.unwritten_idx) +- ret = __journal_entry_close(j); ++ if (!ret && nr_unwritten_journal_entries(j)) { ++ struct journal_buf *buf = journal_cur_buf(j); ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ } + + return ret; + } +@@ -311,8 +313,8 @@ static int journal_entry_open(struct journal *j) + + static bool journal_quiesced(struct journal *j) + { +- union journal_res_state s = READ_ONCE(j->reservations); +- bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s); ++ bool ret = atomic64_read(&j->seq) == j->seq_ondisk || ++ bch2_journal_error(j); + + if (!ret) + 
journal_entry_close(j); +@@ -417,7 +419,7 @@ unlock: + if ((ret == cur_entry_journal_full || + ret == cur_entry_journal_pin_full) && + !can_discard && +- j->reservations.idx == j->reservations.unwritten_idx && ++ !nr_unwritten_journal_entries(j) && + (flags & JOURNAL_RES_GET_RESERVED)) { + struct printbuf buf = PRINTBUF; + +@@ -577,7 +579,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + } + + /* if seq was written, but not flushed - flush a newer one instead */ +- seq = max(seq, last_unwritten_seq(j)); ++ seq = max(seq, journal_last_unwritten_seq(j)); + + recheck_need_open: + if (seq > journal_cur_seq(j)) { +@@ -710,13 +712,13 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) + if (seq <= c->journal.flushed_seq_ondisk) + goto out; + +- for (unwritten_seq = last_unwritten_seq(j); ++ for (unwritten_seq = journal_last_unwritten_seq(j); + unwritten_seq < seq; + unwritten_seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); + + /* journal write is already in flight, and was a flush write: */ +- if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush) ++ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) + goto out; + + buf->noflush = true; +@@ -941,17 +943,16 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) + + static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) + { +- union journal_res_state state; + bool ret = false; +- unsigned i; ++ u64 seq; + + spin_lock(&j->lock); +- state = READ_ONCE(j->reservations); +- i = state.idx; ++ for (seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j) && !ret; ++ seq++) { ++ struct journal_buf *buf = journal_seq_to_buf(j, seq); + +- while (i != state.unwritten_idx) { +- i = (i - 1) & JOURNAL_BUF_MASK; +- if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx)) ++ if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) + ret = true; + } + spin_unlock(&j->lock); +@@ -1013,6 +1014,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + j->replay_journal_seq_end = cur_seq; + j->last_seq_ondisk = last_seq; + j->flushed_seq_ondisk = cur_seq - 1; ++ j->seq_ondisk = cur_seq - 1; + j->pin.front = last_seq; + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); +@@ -1162,15 +1164,18 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + union journal_res_state s; + struct bch_dev *ca; + unsigned long now = jiffies; ++ u64 seq; + unsigned i; + + out->atomic++; ++ out->tabstops[0] = 24; + + rcu_read_lock(); + s = READ_ONCE(j->reservations); + +- pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin)); ++ pr_buf(out, "dirty journal entries:\t%llu\n", fifo_used(&j->pin)); + pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); ++ pr_buf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); + pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); + pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); +@@ -1189,33 +1194,42 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: +- pr_buf(out, "error\n"); ++ pr_buf(out, "error"); + break; + case JOURNAL_ENTRY_CLOSED_VAL: +- pr_buf(out, "closed\n"); ++ pr_buf(out, "closed"); + break; + default: +- pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); ++ pr_buf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); + break; + } + +- pr_buf(out, "current entry:\t\tidx %u refcount %u\n", 
s.idx, journal_state_count(s, s.idx)); ++ pr_newline(out); + +- i = s.idx; +- while (i != s.unwritten_idx) { +- i = (i - 1) & JOURNAL_BUF_MASK; ++ for (seq = journal_cur_seq(j); ++ seq >= journal_last_unwritten_seq(j); ++ --seq) { ++ i = seq & JOURNAL_BUF_MASK; + +- pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", +- i, journal_state_count(s, i), j->buf[i].sectors); ++ pr_buf(out, "unwritten entry:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", seq); ++ pr_newline(out); + pr_indent_push(out, 2); + +- pr_buf(out, "refcount %u", journal_state_count(s, i)); ++ pr_buf(out, "refcount:"); ++ pr_tab(out); ++ pr_buf(out, "%u", journal_state_count(s, i)); + pr_newline(out); + +- pr_buf(out, "sectors %u", j->buf[i].sectors); ++ pr_buf(out, "sectors:"); ++ pr_tab(out); ++ pr_buf(out, "%u", j->buf[i].sectors); + pr_newline(out); + +- pr_buf(out, "expires %li ms", jiffies_to_msecs(j->buf[i].expires - jiffies)); ++ pr_buf(out, "expires"); ++ pr_tab(out); ++ pr_buf(out, "%li jiffies", j->buf[i].expires - jiffies); + pr_newline(out); + + pr_indent_pop(out, 2); +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 0a3fb8a061c2..364820804ed6 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -141,6 +141,11 @@ static inline u64 journal_cur_seq(struct journal *j) + return j->pin.back - 1; + } + ++static inline u64 journal_last_unwritten_seq(struct journal *j) ++{ ++ return j->seq_ondisk + 1; ++} ++ + void bch2_journal_set_has_inum(struct journal *, u64, u64); + + static inline int journal_state_count(union journal_res_state s, int idx) +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 19a8ce816062..5d9ce6e48a94 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1332,7 +1332,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) + + static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) + { +- return j->buf + j->reservations.unwritten_idx; ++ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); + } + + static void journal_write_done(struct closure *cl) +@@ -1404,12 +1404,13 @@ static void journal_write_done(struct closure *cl) + closure_wake_up(&w->wait); + journal_wake(j); + +- if (new.unwritten_idx == new.idx) { ++ if (journal_last_unwritten_seq(j) == journal_cur_seq(j)) { + struct journal_buf *buf = journal_cur_buf(j); + long delta = buf->expires - jiffies; + + mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); +- } else if (!journal_state_count(new, new.unwritten_idx)) ++ } else if (journal_last_unwritten_seq(j) < journal_cur_seq(j) && ++ !journal_state_count(new, new.unwritten_idx)) + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + + spin_unlock(&j->lock); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 7bd35e58bd3c..ec565edbbfc5 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -59,25 +59,13 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) + old.v, new.v)) != old.v); + } + +-static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) +-{ +- unsigned sectors = 0; +- +- while (!sectors && *idx != j->reservations.idx) { +- sectors = j->buf[*idx].sectors; +- +- *idx = (*idx + 1) & JOURNAL_BUF_MASK; +- } +- +- return sectors; +-} +- + static struct journal_space + journal_dev_space_available(struct journal *j, struct bch_dev *ca, + enum journal_space_from from) + { + struct journal_device *ja = 
&ca->journal; +- unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx; ++ unsigned sectors, buckets, unwritten; ++ u64 seq; + + if (from == journal_space_total) + return (struct journal_space) { +@@ -92,7 +80,14 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ +- while ((unwritten = get_unwritten_sectors(j, &idx))) { ++ for (seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j); ++ seq++) { ++ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; ++ ++ if (!unwritten) ++ continue; ++ + /* entry won't fit on this device, skip: */ + if (unwritten > ca->mi.bucket_size) + continue; +@@ -214,8 +209,7 @@ void bch2_journal_space_available(struct journal *j) + total = j->space[journal_space_total].total; + + if (!clean_ondisk && +- j->reservations.idx == +- j->reservations.unwritten_idx) { ++ journal_cur_seq(j) == j->seq_ondisk) { + struct printbuf buf = PRINTBUF; + + __bch2_journal_debug_to_text(&buf, j); +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 77d7192535df..86842370b5e0 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -25,7 +25,7 @@ struct journal_buf { + + struct closure_waitlist wait; + u64 last_seq; /* copy of data->last_seq */ +- unsigned long expires; ++ long expires; + u64 flush_time; + + unsigned buf_size; /* size in bytes of @data */ +-- +cgit v1.2.3 + + +From b3abb2730f333e05b9a67b0e71095dae36c084e3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 28 Feb 2022 19:29:19 -0500 +Subject: bcachefs: __journal_entry_close() never fails + +Previous patch just moved responsibility for incrementing the journal +sequence number and initializing the new journal entry from +__journal_entry_close() to journal_entry_open(); this patch makes the +analagous change for journal reservation state, incrementing the index +into array of journal_bufs at open time. + +This means that __journal_entry_close() never fails to close an open +journal entry, which is important for the next patch that will change +our emergency shutdown behaviour. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 50 ++++++++++++++++++++------------------------- + fs/bcachefs/journal.h | 3 --- + fs/bcachefs/journal_io.c | 18 +++++++++++----- + fs/bcachefs/journal_types.h | 1 + + 4 files changed, 36 insertions(+), 36 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index a73c78d3e56c..e7fbcaee43a3 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -107,7 +107,7 @@ void __bch2_journal_buf_put(struct journal *j) + * We don't close a journal_buf until the next journal_buf is finished writing, + * and can be opened again - this also initializes the next journal_buf: + */ +-static bool __journal_entry_close(struct journal *j) ++static void __journal_entry_close(struct journal *j) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf = journal_cur_buf(j); +@@ -120,21 +120,15 @@ static bool __journal_entry_close(struct journal *j) + do { + old.v = new.v = v; + if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) +- return true; ++ return; + + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { + /* this entry will never be written: */ + closure_wake_up(&buf->wait); +- return true; ++ return; + } + + new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; +- new.idx++; +- +- if (new.idx == new.unwritten_idx) +- return false; +- +- BUG_ON(journal_state_count(new, new.idx)); + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + +@@ -176,17 +170,17 @@ static bool __journal_entry_close(struct journal *j) + bch2_journal_space_available(j); + + bch2_journal_buf_put(j, old.idx); +- return true; + } + + static bool journal_entry_want_write(struct journal *j) + { + bool ret = !journal_entry_is_open(j) || +- (journal_cur_seq(j) == journal_last_unwritten_seq(j) && +- __journal_entry_close(j)); ++ journal_cur_seq(j) == journal_last_unwritten_seq(j); + + /* Don't close it yet if we already have a write in flight: */ +- if (!ret && nr_unwritten_journal_entries(j)) { ++ if (ret) ++ __journal_entry_close(j); ++ else if (nr_unwritten_journal_entries(j)) { + struct journal_buf *buf = journal_cur_buf(j); + + if (!buf->flush_time) { +@@ -222,15 +216,15 @@ static bool journal_entry_close(struct journal *j) + static int journal_entry_open(struct journal *j) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); +- struct journal_buf *buf = journal_cur_buf(j); ++ struct journal_buf *buf = j->buf + ++ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); + union journal_res_state old, new; + int u64s; + u64 v; + +- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); +- + lockdep_assert_held(&j->lock); + BUG_ON(journal_entry_is_open(j)); ++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + + if (j->blocked) + return cur_entry_blocked; +@@ -244,6 +238,9 @@ static int journal_entry_open(struct journal *j) + if (!fifo_free(&j->pin)) + return cur_entry_journal_pin_full; + ++ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1) ++ return cur_entry_max_in_flight; ++ + BUG_ON(!j->cur_entry_sectors); + + buf->expires = +@@ -292,7 +289,10 @@ static int journal_entry_open(struct journal *j) + old.v = new.v = v; + + BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); ++ ++ new.idx++; + BUG_ON(journal_state_count(new, new.idx)); ++ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); + + journal_state_inc(&new); + new.cur_entry_offset = 0; +@@ -391,18 +391,11 @@ retry: + buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->buf_size 
<< 1); + +- if (journal_entry_is_open(j) && +- !__journal_entry_close(j)) { +- /* +- * We failed to get a reservation on the current open journal +- * entry because it's full, and we can't close it because +- * there's still a previous one in flight: +- */ ++ __journal_entry_close(j); ++ ret = journal_entry_open(j); ++ ++ if (ret == cur_entry_max_in_flight) + trace_journal_entry_full(c); +- ret = cur_entry_blocked; +- } else { +- ret = journal_entry_open(j); +- } + unlock: + if ((ret && ret != cur_entry_insufficient_devices) && + !j->res_get_blocked_start) { +@@ -1052,7 +1045,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; + +- j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j) + 1; ++ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); ++ j->reservations.unwritten_idx++; + + c->last_bucket_seq_cleanup = journal_cur_seq(j); + +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 364820804ed6..1bb0e00df44c 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -266,9 +266,6 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) + .buf3_count = idx == 3, + }).v, &j->reservations.counter); + +- EBUG_ON(((s.idx - idx) & 3) > +- ((s.idx - s.unwritten_idx) & 3)); +- + if (!journal_state_count(s, idx) && idx == s.unwritten_idx) + __bch2_journal_buf_put(j); + } +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 5d9ce6e48a94..a3889f6ea0a1 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1393,7 +1393,7 @@ static void journal_write_done(struct closure *cl) + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; +- BUG_ON(new.idx == new.unwritten_idx); ++ BUG_ON(journal_state_count(new, new.unwritten_idx)); + + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, +@@ -1404,14 +1404,22 @@ static void journal_write_done(struct closure *cl) + closure_wake_up(&w->wait); + journal_wake(j); + +- if (journal_last_unwritten_seq(j) == journal_cur_seq(j)) { ++ if (!journal_state_count(new, new.unwritten_idx) && ++ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { ++ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); ++ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && ++ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + struct journal_buf *buf = journal_cur_buf(j); + long delta = buf->expires - jiffies; + ++ /* ++ * We don't close a journal entry to write it while there's ++ * previous entries still in flight - the current journal entry ++ * might want to be written now: ++ */ ++ + mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); +- } else if (journal_last_unwritten_seq(j) < journal_cur_seq(j) && +- !journal_state_count(new, new.unwritten_idx)) +- closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); ++ } + + spin_unlock(&j->lock); + } +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 86842370b5e0..6fd458191e41 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -167,6 +167,7 @@ struct journal { + enum { + cur_entry_ok, + cur_entry_blocked, ++ cur_entry_max_in_flight, + cur_entry_journal_full, + cur_entry_journal_pin_full, + cur_entry_journal_stuck, +-- +cgit v1.2.3 + + +From 5428510480e34f53a8b2a6b5496a84c7c089fe00 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 28 Feb 2022 18:48:33 -0500 +Subject: 
bcachefs: Finish writing journal after journal error + +After emergency shutdown, all journal entries will be written as noflush +entries, meaning they will never be used - but they'll still exist for +debugging tools to examine. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 70 +++++++++++++++++------------------------------- + fs/bcachefs/journal_io.c | 10 +++---- + 2 files changed, 30 insertions(+), 50 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index e7fbcaee43a3..fb533ecc78f8 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -63,35 +63,6 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) + p->devs.nr = 0; + } + +-void bch2_journal_halt(struct journal *j) +-{ +- union journal_res_state old, new; +- u64 v; +- +- spin_lock(&j->lock); +- +- v = atomic64_read(&j->reservations.counter); +- do { +- old.v = new.v = v; +- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) +- goto out; +- +- new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; +- } while ((v = atomic64_cmpxchg(&j->reservations.counter, +- old.v, new.v)) != old.v); +- +- /* +- * XXX: we're not using j->lock here because this can be called from +- * interrupt context, this can race with journal_write_done() +- */ +- if (!j->err_seq) +- j->err_seq = journal_cur_seq(j); +- journal_wake(j); +- closure_wake_up(&journal_cur_buf(j)->wait); +-out: +- spin_unlock(&j->lock); +-} +- + /* journal entry close/open: */ + + void __bch2_journal_buf_put(struct journal *j) +@@ -107,7 +78,7 @@ void __bch2_journal_buf_put(struct journal *j) + * We don't close a journal_buf until the next journal_buf is finished writing, + * and can be opened again - this also initializes the next journal_buf: + */ +-static void __journal_entry_close(struct journal *j) ++static void __journal_entry_close(struct journal *j, unsigned closed_val) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf = journal_cur_buf(j); +@@ -115,23 +86,24 @@ static void __journal_entry_close(struct journal *j) + u64 v = atomic64_read(&j->reservations.counter); + unsigned sectors; + ++ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && ++ closed_val != JOURNAL_ENTRY_ERROR_VAL); ++ + lockdep_assert_held(&j->lock); + + do { + old.v = new.v = v; +- if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) +- return; ++ new.cur_entry_offset = closed_val; + +- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { +- /* this entry will never be written: */ +- closure_wake_up(&buf->wait); ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || ++ old.cur_entry_offset == new.cur_entry_offset) + return; +- } +- +- new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + ++ if (!__journal_entry_is_open(old)) ++ return; ++ + /* Close out old buffer: */ + buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + +@@ -172,6 +144,15 @@ static void __journal_entry_close(struct journal *j) + bch2_journal_buf_put(j, old.idx); + } + ++void bch2_journal_halt(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); ++ if (!j->err_seq) ++ j->err_seq = journal_cur_seq(j); ++ spin_unlock(&j->lock); ++} ++ + static bool journal_entry_want_write(struct journal *j) + { + bool ret = !journal_entry_is_open(j) || +@@ -179,7 +160,7 @@ static bool journal_entry_want_write(struct journal *j) + + /* Don't close it yet if we already have a write in flight: */ + if (ret) 
+- __journal_entry_close(j); ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + else if (nr_unwritten_journal_entries(j)) { + struct journal_buf *buf = journal_cur_buf(j); + +@@ -313,8 +294,7 @@ static int journal_entry_open(struct journal *j) + + static bool journal_quiesced(struct journal *j) + { +- bool ret = atomic64_read(&j->seq) == j->seq_ondisk || +- bch2_journal_error(j); ++ bool ret = atomic64_read(&j->seq) == j->seq_ondisk; + + if (!ret) + journal_entry_close(j); +@@ -340,7 +320,7 @@ static void journal_write_work(struct work_struct *work) + if (delta > 0) + mod_delayed_work(c->io_complete_wq, &j->write_work, delta); + else +- __journal_entry_close(j); ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + spin_unlock(&j->lock); + } + +@@ -391,7 +371,7 @@ retry: + buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); + +- __journal_entry_close(j); ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + ret = journal_entry_open(j); + + if (ret == cur_entry_max_in_flight) +@@ -527,7 +507,7 @@ void bch2_journal_entry_res_resize(struct journal *j, + /* + * Not enough room in current journal entry, have to flush it: + */ +- __journal_entry_close(j); ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + } else { + journal_cur_buf(j)->u64s_reserved += d; + } +@@ -579,7 +559,7 @@ recheck_need_open: + struct journal_res res = { 0 }; + + if (journal_entry_is_open(j)) +- __journal_entry_close(j); ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + + spin_unlock(&j->lock); + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index a3889f6ea0a1..fb24ca212b09 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1516,11 +1516,11 @@ void bch2_journal_write(struct closure *cl) + j->write_start_time = local_clock(); + + spin_lock(&j->lock); +- if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && +- (w->noflush || +- (!w->must_flush && +- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && +- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { ++ if (bch2_journal_error(j) || ++ w->noflush || ++ (!w->must_flush && ++ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && ++ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); + jset->last_seq = 0; +-- +cgit v1.2.3 + + +From 7c63c05526551aebb028e305745c2bde4d25354a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 3 Mar 2022 11:04:01 -0500 +Subject: bcachefs: Make bch2_btree_cache_scan() try harder + +Previously, when bch2_btree_cache_scan() attempted to reclaim a node but +failed (because trylock failed, because it was dirty, etc.), it would +count that against the number of nodes it was scanning and attempting to +free. This patch changes that behaviour, so that now we only count nodes +that we then don't free if they have the accessed bit (which we also +clear). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 28 +++++++++++++++++----------- + 1 file changed, 17 insertions(+), 11 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 1347b1fc1166..8b4d240611f8 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -328,17 +328,13 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + } + restart: + list_for_each_entry_safe(b, t, &bc->live, list) { +- touched++; +- +- if (touched >= nr) { +- /* Save position */ +- if (&t->list != &bc->live) +- list_move_tail(&bc->live, &t->list); +- break; ++ /* tweak this */ ++ if (btree_node_accessed(b)) { ++ clear_btree_node_accessed(b); ++ goto touched; + } + +- if (!btree_node_accessed(b) && +- !btree_node_reclaim(c, b)) { ++ if (!btree_node_reclaim(c, b)) { + /* can't call bch2_btree_node_hash_remove under lock */ + freed++; + if (&t->list != &bc->live) +@@ -359,8 +355,18 @@ restart: + else if (!mutex_trylock(&bc->lock)) + goto out; + goto restart; +- } else +- clear_btree_node_accessed(b); ++ } else { ++ continue; ++ } ++touched: ++ touched++; ++ ++ if (touched >= nr) { ++ /* Save position */ ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ break; ++ } + } + + mutex_unlock(&bc->lock); +-- +cgit v1.2.3 + + +From 7e57868d706a43b4165bcabcfe41f7c106fb480a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 4 Mar 2022 19:15:46 -0500 +Subject: bcachefs: Simplify parameters to bch2_btree_update_start() + +We don't need to pass the number of nodes required to +bch2_btree_update_start, just whether we're doing a split at @level. +This is prep work for a fix to our usage of six lock's percpu mode, +which is going to require us to count up and allocate interior nodes and +leaf nodes seperately. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_interior.c | 41 +++++++++++++++++++++++-------------- + 1 file changed, 26 insertions(+), 15 deletions(-) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index e2cf0f58bf2f..8ce40963d07f 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -947,13 +947,15 @@ static void bch2_btree_update_done(struct btree_update *as) + + static struct btree_update * + bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, +- unsigned level, unsigned nr_nodes, unsigned flags) ++ unsigned level, bool split, unsigned flags) + { + struct bch_fs *c = trans->c; + struct btree_update *as; + u64 start_time = local_clock(); + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? 
BCH_DISK_RESERVATION_NOFAIL : 0; ++ unsigned nr_nodes = 0; ++ unsigned update_level = level; + int journal_flags = 0; + int ret = 0; + +@@ -964,10 +966,24 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) + journal_flags |= JOURNAL_RES_GET_NONBLOCK; + +- /* +- * XXX: figure out how far we might need to split, +- * instead of locking/reserving all the way to the root: +- */ ++ while (1) { ++ nr_nodes += 1 + split; ++ update_level++; ++ ++ if (!btree_path_node(path, update_level)) ++ break; ++ ++ /* ++ * XXX: figure out how far we might need to split, ++ * instead of locking/reserving all the way to the root: ++ */ ++ split = update_level + 1 < BTREE_MAX_DEPTH; ++ } ++ ++ /* Might have to allocate a new root: */ ++ if (update_level < BTREE_MAX_DEPTH) ++ nr_nodes += 1; ++ + if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, + path->btree_id, &path->pos); +@@ -1551,14 +1567,13 @@ int bch2_btree_split_leaf(struct btree_trans *trans, + struct btree_path *path, + unsigned flags) + { +- struct bch_fs *c = trans->c; + struct btree *b = path_l(path)->b; + struct btree_update *as; + unsigned l; + int ret = 0; + + as = bch2_btree_update_start(trans, path, path->level, +- btree_update_reserve_required(c, b), flags); ++ true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + +@@ -1669,11 +1684,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + goto out; + + parent = btree_node_parent(path, b); +- as = bch2_btree_update_start(trans, path, level, +- btree_update_reserve_required(c, parent) + 1, +- flags| ++ as = bch2_btree_update_start(trans, path, level, false, + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_USE_RESERVE); ++ BTREE_INSERT_USE_RESERVE| ++ flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto err; +@@ -1756,10 +1770,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, + + parent = btree_node_parent(iter->path, b); + as = bch2_btree_update_start(trans, iter->path, b->c.level, +- (parent +- ? btree_update_reserve_required(c, parent) +- : 0) + 1, +- flags); ++ false, flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) { + trace_btree_gc_rewrite_node_fail(c, b); +-- +cgit v1.2.3 + + +From e7710d31f16756daf7c69720d41d22a9b9ee657e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 4 Mar 2022 19:50:28 -0500 +Subject: bcachefs: Refactor bch2_btree_node_mem_alloc() + +This is prep work for the next patch, which is going to fix our usage of +the percpu mode of six locks by never switching struct btree between the +two modes - which means we need separate freed lists. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 84 +++++++++++++++++++++++++---------------------- + 1 file changed, 45 insertions(+), 39 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 8b4d240611f8..91b6a7c5e4ba 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -566,52 +566,54 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) + struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) + { + struct btree_cache *bc = &c->btree_cache; +- struct btree *b; ++ struct btree *b, *b2; + u64 start_time = local_clock(); + unsigned flags; + + flags = memalloc_nofs_save(); + mutex_lock(&bc->lock); + +- /* +- * btree_free() doesn't free memory; it sticks the node on the end of +- * the list. 
Check if there's any freed nodes there: +- */ +- list_for_each_entry(b, &bc->freeable, list) +- if (!btree_node_reclaim(c, b)) +- goto got_node; +- + /* + * We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ + list_for_each_entry(b, &bc->freed, list) +- if (!btree_node_reclaim(c, b)) ++ if (!btree_node_reclaim(c, b)) { ++ list_del_init(&b->list); + goto got_node; ++ } ++ ++ b = __btree_node_mem_alloc(c); ++ if (!b) ++ goto err_locked; + +- b = NULL; ++ BUG_ON(!six_trylock_intent(&b->c.lock)); ++ BUG_ON(!six_trylock_write(&b->c.lock)); + got_node: +- if (b) +- list_del_init(&b->list); +- mutex_unlock(&bc->lock); + +- if (!b) { +- b = __btree_node_mem_alloc(c); +- if (!b) +- goto err; ++ /* ++ * btree_free() doesn't free memory; it sticks the node on the end of ++ * the list. Check if there's any freed nodes there: ++ */ ++ list_for_each_entry(b2, &bc->freeable, list) ++ if (!btree_node_reclaim(c, b2)) { ++ swap(b->data, b2->data); ++ swap(b->aux_data, b2->aux_data); ++ list_move(&b2->list, &bc->freed); ++ six_unlock_write(&b2->c.lock); ++ six_unlock_intent(&b2->c.lock); ++ goto got_mem; ++ } + +- BUG_ON(!six_trylock_intent(&b->c.lock)); +- BUG_ON(!six_trylock_write(&b->c.lock)); +- } ++ mutex_unlock(&bc->lock); + +- if (!b->data) { +- if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) +- goto err; ++ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) ++ goto err; + +- mutex_lock(&bc->lock); +- bc->used++; +- mutex_unlock(&bc->lock); +- } ++ mutex_lock(&bc->lock); ++ bc->used++; ++got_mem: ++ mutex_unlock(&bc->lock); + + BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_dirty(b)); +@@ -633,20 +635,24 @@ out: + return b; + err: + mutex_lock(&bc->lock); +- +- if (b) { +- list_add(&b->list, &bc->freed); +- six_unlock_write(&b->c.lock); +- six_unlock_intent(&b->c.lock); +- } +- ++err_locked: + /* Try to cannibalize another cached btree node: */ + if (bc->alloc_lock == current) { +- b = btree_node_cannibalize(c); +- list_del_init(&b->list); +- mutex_unlock(&bc->lock); ++ b2 = btree_node_cannibalize(c); ++ bch2_btree_node_hash_remove(bc, b2); ++ ++ if (b) { ++ swap(b->data, b2->data); ++ swap(b->aux_data, b2->aux_data); ++ list_move(&b2->list, &bc->freed); ++ six_unlock_write(&b2->c.lock); ++ six_unlock_intent(&b2->c.lock); ++ } else { ++ b = b2; ++ list_del_init(&b->list); ++ } + +- bch2_btree_node_hash_remove(bc, b); ++ mutex_unlock(&bc->lock); + + trace_btree_node_cannibalize(c); + goto out; +-- +cgit v1.2.3 + + +From 31c3de54525487ed96a1ca9d597de55aa6971baf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 4 Mar 2022 19:16:04 -0500 +Subject: bcachefs: Fix usage of six lock's percpu mode + +Six locks have a percpu mode, which we use for interior btree nodes, as +well as btree key cache keys for the subvolumes btree. We've been +switching locks back and forth between percpu and non percpu mode as +needed, but it turns out this is racy - when we're reusing an existing +node, other threads could be attempting to lock it while we're switching +it between modes. + +This patch fixes this by never switching 'struct btree' between the two +modes, and instead segragating them between two different freed lists. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 41 +++++++++++------ + fs/bcachefs/btree_cache.h | 2 +- + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/btree_key_cache.c | 10 ++--- + fs/bcachefs/btree_types.h | 3 +- + fs/bcachefs/btree_update_interior.c | 90 +++++++++++++++++++++---------------- + fs/bcachefs/btree_update_interior.h | 6 ++- + 7 files changed, 92 insertions(+), 62 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 91b6a7c5e4ba..0dcdc30c6888 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -42,6 +42,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc) + return max_t(int, 0, bc->used - bc->reserve); + } + ++static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) ++{ ++ if (b->c.lock.readers) ++ list_move(&b->list, &bc->freed_pcpu); ++ else ++ list_move(&b->list, &bc->freed_nonpcpu); ++} ++ + static void btree_node_data_free(struct bch_fs *c, struct btree *b) + { + struct btree_cache *bc = &c->btree_cache; +@@ -58,7 +66,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) + b->aux_data = NULL; + + bc->used--; +- list_move(&b->list, &bc->freed); ++ ++ btree_node_to_freedlist(bc, b); + } + + static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, +@@ -163,11 +172,6 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, + b->c.level = level; + b->c.btree_id = id; + +- if (level) +- six_lock_pcpu_alloc(&b->c.lock); +- else +- six_lock_pcpu_free_rcu(&b->c.lock); +- + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); + if (!ret) +@@ -433,8 +437,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) + + BUG_ON(atomic_read(&c->btree_cache.dirty)); + +- while (!list_empty(&bc->freed)) { +- b = list_first_entry(&bc->freed, struct btree, list); ++ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); ++ ++ while (!list_empty(&bc->freed_nonpcpu)) { ++ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); + list_del(&b->list); + six_lock_pcpu_free(&b->c.lock); + kfree(b); +@@ -488,7 +494,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc) + mutex_init(&bc->lock); + INIT_LIST_HEAD(&bc->live); + INIT_LIST_HEAD(&bc->freeable); +- INIT_LIST_HEAD(&bc->freed); ++ INIT_LIST_HEAD(&bc->freed_pcpu); ++ INIT_LIST_HEAD(&bc->freed_nonpcpu); + } + + /* +@@ -563,9 +570,12 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) + } + } + +-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) + { + struct btree_cache *bc = &c->btree_cache; ++ struct list_head *freed = pcpu_read_locks ++ ? &bc->freed_pcpu ++ : &bc->freed_nonpcpu; + struct btree *b, *b2; + u64 start_time = local_clock(); + unsigned flags; +@@ -577,7 +587,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) + * We never free struct btree itself, just the memory that holds the on + * disk node. 
Check the freed list before allocating a new one: + */ +- list_for_each_entry(b, &bc->freed, list) ++ list_for_each_entry(b, freed, list) + if (!btree_node_reclaim(c, b)) { + list_del_init(&b->list); + goto got_node; +@@ -587,6 +597,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) + if (!b) + goto err_locked; + ++ if (pcpu_read_locks) ++ six_lock_pcpu_alloc(&b->c.lock); ++ + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); + got_node: +@@ -599,7 +612,7 @@ got_node: + if (!btree_node_reclaim(c, b2)) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); +- list_move(&b2->list, &bc->freed); ++ btree_node_to_freedlist(bc, b2); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + goto got_mem; +@@ -644,7 +657,7 @@ err_locked: + if (b) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); +- list_move(&b2->list, &bc->freed); ++ btree_node_to_freedlist(bc, b2); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + } else { +@@ -689,7 +702,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + return ERR_PTR(-EINTR); + } + +- b = bch2_btree_node_mem_alloc(c); ++ b = bch2_btree_node_mem_alloc(c, level != 0); + + if (trans && b == ERR_PTR(-ENOMEM)) { + trans->memory_allocation_failure = true; +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 2901f0dc925b..25906127c023 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -22,7 +22,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); + int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); + + struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); +-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool); + + struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, + const struct bkey_i *, unsigned, +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 08f5f6b865c6..4f0ad06a615a 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1546,7 +1546,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + closure_sync(&cl); + } while (ret); + +- b = bch2_btree_node_mem_alloc(c); ++ b = bch2_btree_node_mem_alloc(c, level != 0); + bch2_btree_cache_cannibalize_unlock(c); + + BUG_ON(IS_ERR(b)); +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 167d177150c4..ee89b650f6a4 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -165,13 +165,13 @@ btree_key_cache_create(struct bch_fs *c, + } + + was_new = false; ++ } else { ++ if (btree_id == BTREE_ID_subvolumes) ++ six_lock_pcpu_alloc(&ck->c.lock); ++ else ++ six_lock_pcpu_free(&ck->c.lock); + } + +- if (btree_id == BTREE_ID_subvolumes) +- six_lock_pcpu_alloc(&ck->c.lock); +- else +- six_lock_pcpu_free(&ck->c.lock); +- + ck->c.level = 0; + ck->c.btree_id = btree_id; + ck->key.btree_id = btree_id; +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 09b6db1d93f2..e6deb3a4494b 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -152,7 +152,8 @@ struct btree_cache { + struct mutex lock; + struct list_head live; + struct list_head freeable; +- struct list_head freed; ++ struct list_head freed_pcpu; ++ struct list_head freed_nonpcpu; + + /* Number of elements in live + freeable lists */ + unsigned used; +diff --git a/fs/bcachefs/btree_update_interior.c 
b/fs/bcachefs/btree_update_interior.c +index 8ce40963d07f..5834190da6a9 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -181,6 +181,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, + static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + struct disk_reservation *res, + struct closure *cl, ++ bool interior_node, + unsigned flags) + { + struct write_point *wp; +@@ -242,7 +243,7 @@ retry: + bch2_open_bucket_get(c, wp, &ob); + bch2_alloc_sectors_done(c, wp); + mem_alloc: +- b = bch2_btree_node_mem_alloc(c); ++ b = bch2_btree_node_mem_alloc(c, interior_node); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + +@@ -260,12 +261,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev + { + struct bch_fs *c = as->c; + struct btree *b; ++ struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; + int ret; + + BUG_ON(level >= BTREE_MAX_DEPTH); +- BUG_ON(!as->nr_prealloc_nodes); ++ BUG_ON(!p->nr); + +- b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ b = p->b[--p->nr]; + + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); +@@ -377,47 +379,54 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) + static void bch2_btree_reserve_put(struct btree_update *as) + { + struct bch_fs *c = as->c; ++ struct prealloc_nodes *p; + + mutex_lock(&c->btree_reserve_cache_lock); + +- while (as->nr_prealloc_nodes) { +- struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ for (p = as->prealloc_nodes; ++ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); ++ p++) { ++ while (p->nr) { ++ struct btree *b = p->b[--p->nr]; + +- six_lock_intent(&b->c.lock, NULL, NULL); +- six_lock_write(&b->c.lock, NULL, NULL); ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); + +- if (c->btree_reserve_cache_nr < +- ARRAY_SIZE(c->btree_reserve_cache)) { +- struct btree_alloc *a = +- &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; ++ if (c->btree_reserve_cache_nr < ++ ARRAY_SIZE(c->btree_reserve_cache)) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; + +- a->ob = b->ob; +- b->ob.nr = 0; +- bkey_copy(&a->k, &b->key); +- } else { +- bch2_open_buckets_put(c, &b->ob); +- } ++ a->ob = b->ob; ++ b->ob.nr = 0; ++ bkey_copy(&a->k, &b->key); ++ } else { ++ bch2_open_buckets_put(c, &b->ob); ++ } + +- __btree_node_free(c, b); +- six_unlock_write(&b->c.lock); +- six_unlock_intent(&b->c.lock); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ } + } + + mutex_unlock(&c->btree_reserve_cache_lock); + } + +-static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, ++static int bch2_btree_reserve_get(struct btree_update *as, ++ unsigned nr_nodes[2], + unsigned flags) + { + struct bch_fs *c = as->c; + struct closure cl; + struct btree *b; ++ unsigned interior; + int ret; + + closure_init_stack(&cl); + retry: + +- BUG_ON(nr_nodes > BTREE_RESERVE_MAX); ++ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); + + /* + * Protects reaping from the btree node cache and using the btree node +@@ -430,16 +439,21 @@ retry: + if (ret) + goto err; + +- while (as->nr_prealloc_nodes < nr_nodes) { +- b = __bch2_btree_node_alloc(c, &as->disk_res, +- flags & BTREE_INSERT_NOWAIT +- ? 
NULL : &cl, flags); +- if (IS_ERR(b)) { +- ret = PTR_ERR(b); +- goto err; +- } ++ for (interior = 0; interior < 2; interior++) { ++ struct prealloc_nodes *p = as->prealloc_nodes + interior; ++ ++ while (p->nr < nr_nodes[interior]) { ++ b = __bch2_btree_node_alloc(c, &as->disk_res, ++ flags & BTREE_INSERT_NOWAIT ++ ? NULL : &cl, ++ interior, flags); ++ if (IS_ERR(b)) { ++ ret = PTR_ERR(b); ++ goto err; ++ } + +- as->prealloc_nodes[as->nr_prealloc_nodes++] = b; ++ p->b[p->nr++] = b; ++ } + } + + bch2_btree_cache_cannibalize_unlock(c); +@@ -452,7 +466,7 @@ err: + if (ret == -EAGAIN) + goto retry; + +- trace_btree_reserve_get_fail(c, nr_nodes, &cl); ++ trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], &cl); + return ret; + } + +@@ -954,7 +968,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + u64 start_time = local_clock(); + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0; +- unsigned nr_nodes = 0; ++ unsigned nr_nodes[2] = { 0, 0 }; + unsigned update_level = level; + int journal_flags = 0; + int ret = 0; +@@ -967,7 +981,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + journal_flags |= JOURNAL_RES_GET_NONBLOCK; + + while (1) { +- nr_nodes += 1 + split; ++ nr_nodes[!!update_level] += 1 + split; + update_level++; + + if (!btree_path_node(path, update_level)) +@@ -982,7 +996,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + + /* Might have to allocate a new root: */ + if (update_level < BTREE_MAX_DEPTH) +- nr_nodes += 1; ++ nr_nodes[1] += 1; + + if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, +@@ -1046,7 +1060,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + } + + ret = bch2_disk_reservation_get(c, &as->disk_res, +- nr_nodes * btree_sectors(c), ++ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), + c->opts.metadata_replicas, + disk_res_flags); + if (ret) +@@ -2007,7 +2021,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite + return -EINTR; + } + +- new_hash = bch2_btree_node_mem_alloc(c); ++ new_hash = bch2_btree_node_mem_alloc(c, false); + } + + path->intent_ref++; +@@ -2083,7 +2097,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) + closure_sync(&cl); + } while (ret); + +- b = bch2_btree_node_mem_alloc(c); ++ b = bch2_btree_node_mem_alloc(c, false); + bch2_btree_cache_cannibalize_unlock(c); + + set_btree_node_fake(b); +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 8dc86fa636d6..e72eb8795616 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -76,8 +76,10 @@ struct btree_update { + struct journal_entry_pin journal; + + /* Preallocated nodes we reserve when we start the update: */ +- struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; +- unsigned nr_prealloc_nodes; ++ struct prealloc_nodes { ++ struct btree *b[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr; ++ } prealloc_nodes[2]; + + /* Nodes being freed: */ + struct keylist old_keys; +-- +cgit v1.2.3 + + +From 01f89208eff1cd01ef6c5fc667413a8977935077 Mon Sep 17 00:00:00 2001 +From: Daniel Hill +Date: Sat, 5 Mar 2022 17:45:27 +1300 +Subject: bcachefs: respect superblock discard flag. + +We were accidentally using default mount options and overwriting the +discard flag. 
+ +Signed-off-by: Daniel Hill +--- + fs/bcachefs/super.c | 3 --- + 1 file changed, 3 deletions(-) + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 9af8eb35b177..fb7f8d6d4d1c 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1162,9 +1162,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + ca->mi = bch2_mi_to_cpu(member); + ca->uuid = member->uuid; + +- if (opt_defined(c->opts, discard)) +- ca->mi.discard = opt_get(c->opts, discard); +- + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, + 0, GFP_KERNEL) || + percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, +-- +cgit v1.2.3 + + +From 82dd5685eaf3e028f1f49d984445a3e07b918977 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 4 Mar 2022 21:57:11 -0500 +Subject: bcachefs: Fix transaction path overflow in fiemap + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 36d966f8ba77..48545569e004 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -934,7 +934,8 @@ retry: + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(ei->v.i_ino, start, snapshot), 0); + +- while ((k = bch2_btree_iter_peek(&iter)).k && ++ while (!(ret = btree_trans_too_many_iters(&trans)) && ++ (k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(iter.pos, end) < 0) { + enum btree_id data_btree = BTREE_ID_extents; +-- +cgit v1.2.3 + + +From 105546d29e05d454ca7e7ecc18f15224e4fb1413 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 5 Mar 2022 12:01:16 -0500 +Subject: bcachefs: Convert bch2_sb_to_text to master option list + +Options no longer have to be manually added to bch2_sb_to_text() - it +now uses the master list of options in opts.h. Also, improve some of the +formatting by converting it to tabstops. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/disk_groups.c | 93 ++++++++++------------ + fs/bcachefs/disk_groups.h | 4 +- + fs/bcachefs/fs.c | 2 +- + fs/bcachefs/opts.c | 45 ++++++++--- + fs/bcachefs/opts.h | 60 ++++++++------- + fs/bcachefs/super-io.c | 191 ++++++++++++++++++++++------------------------ + fs/bcachefs/super.c | 2 +- + fs/bcachefs/sysfs.c | 2 +- + fs/bcachefs/xattr.c | 2 +- + 9 files changed, 200 insertions(+), 201 deletions(-) + +diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c +index 19698e504b3c..81b41b07c24b 100644 +--- a/fs/bcachefs/disk_groups.c ++++ b/fs/bcachefs/disk_groups.c +@@ -445,7 +445,10 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) + return -EINVAL; + } + +-void bch2_sb_target_to_text(struct printbuf *out, struct bch_sb *sb, u64 v) ++void bch2_opt_target_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_sb *sb, ++ u64 v) + { + struct target t = target_decode(v); + +@@ -453,63 +456,49 @@ void bch2_sb_target_to_text(struct printbuf *out, struct bch_sb *sb, u64 v) + case TARGET_NULL: + pr_buf(out, "none"); + break; +- case TARGET_DEV: { +- struct bch_sb_field_members *mi = bch2_sb_get_members(sb); +- struct bch_member *m = mi->members + t.dev; +- +- if (bch2_dev_exists(sb, mi, t.dev)) { +- pr_buf(out, "Device "); +- pr_uuid(out, m->uuid.b); +- pr_buf(out, " (%u)", t.dev); ++ case TARGET_DEV: ++ if (c) { ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ ca = t.dev < c->sb.nr_devices ++ ? 
rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ ++ if (ca && percpu_ref_tryget(&ca->io_ref)) { ++ char b[BDEVNAME_SIZE]; ++ ++ pr_buf(out, "/dev/%s", ++ bdevname(ca->disk_sb.bdev, b)); ++ percpu_ref_put(&ca->io_ref); ++ } else if (ca) { ++ pr_buf(out, "offline device %u", t.dev); ++ } else { ++ pr_buf(out, "invalid device %u", t.dev); ++ } ++ ++ rcu_read_unlock(); + } else { +- pr_buf(out, "Bad device %u", t.dev); ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_member *m = mi->members + t.dev; ++ ++ if (bch2_dev_exists(sb, mi, t.dev)) { ++ pr_buf(out, "Device "); ++ pr_uuid(out, m->uuid.b); ++ pr_buf(out, " (%u)", t.dev); ++ } else { ++ pr_buf(out, "Bad device %u", t.dev); ++ } + } +- + break; +- } + case TARGET_GROUP: +- bch2_disk_path_to_text(out, sb, t.group); +- break; +- default: +- BUG(); +- } +-} +- +-void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) +-{ +- struct target t = target_decode(v); +- +- switch (t.type) { +- case TARGET_NULL: +- pr_buf(out, "none"); +- break; +- case TARGET_DEV: { +- struct bch_dev *ca; +- +- rcu_read_lock(); +- ca = t.dev < c->sb.nr_devices +- ? rcu_dereference(c->devs[t.dev]) +- : NULL; +- +- if (ca && percpu_ref_tryget(&ca->io_ref)) { +- char b[BDEVNAME_SIZE]; +- +- pr_buf(out, "/dev/%s", +- bdevname(ca->disk_sb.bdev, b)); +- percpu_ref_put(&ca->io_ref); +- } else if (ca) { +- pr_buf(out, "offline device %u", t.dev); ++ if (c) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); ++ mutex_unlock(&c->sb_lock); + } else { +- pr_buf(out, "invalid device %u", t.dev); ++ bch2_disk_path_to_text(out, sb, t.group); + } +- +- rcu_read_unlock(); +- break; +- } +- case TARGET_GROUP: +- mutex_lock(&c->sb_lock); +- bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); +- mutex_unlock(&c->sb_lock); + break; + default: + BUG(); +diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h +index a274aacbdf92..de915480514b 100644 +--- a/fs/bcachefs/disk_groups.h ++++ b/fs/bcachefs/disk_groups.h +@@ -77,10 +77,8 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); + + void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); + +-void bch2_sb_target_to_text(struct printbuf *, struct bch_sb *, u64); +- + int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); +-void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); ++void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + + int bch2_sb_disk_groups_to_cpu(struct bch_fs *); + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 48545569e004..73c4177cb4f2 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1690,7 +1690,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) + continue; + + printbuf_reset(&buf); +- bch2_opt_to_text(&buf, c, opt, v, ++ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, + OPT_SHOW_MOUNT_STYLE); + seq_putc(seq, ','); + seq_puts(seq, buf.buf); +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index 71bf26eb13d5..e78d3b75f6fb 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -96,6 +96,16 @@ const char * const bch2_d_types[BCH_DT_MAX] = { + [DT_SUBVOL] = "subvol", + }; + ++u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) ++{ ++ BUG(); ++} ++ ++void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) ++{ ++ BUG(); ++} ++ + void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) + { + #define x(_name, ...) 
\ +@@ -280,7 +290,8 @@ int bch2_opt_parse(struct bch_fs *c, const char *msg, + return bch2_opt_validate(opt, msg, *res); + } + +-void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, ++void bch2_opt_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bch_sb *sb, + const struct bch_option *opt, u64 v, + unsigned flags) + { +@@ -310,7 +321,7 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, + pr_buf(out, opt->choices[v]); + break; + case BCH_OPT_FN: +- opt->to_text(out, c, v); ++ opt->to_text(out, c, sb, v); + break; + default: + BUG(); +@@ -431,6 +442,22 @@ out: + return ret; + } + ++u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) ++{ ++ const struct bch_option *opt = bch2_opt_table + id; ++ u64 v; ++ ++ v = opt->get_sb(sb); ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = 1ULL << v; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v <<= 9; ++ ++ return v; ++} ++ + /* + * Initial options from superblock - here we don't want any options undefined, + * any options the superblock doesn't specify are set to 0: +@@ -444,16 +471,10 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + +- if (opt->get_sb == NO_SB_OPT) ++ if (opt->get_sb == BCH2_NO_SB_OPT) + continue; + +- v = opt->get_sb(sb); +- +- if (opt->flags & OPT_SB_FIELD_ILOG2) +- v = 1ULL << v; +- +- if (opt->flags & OPT_SB_FIELD_SECTORS) +- v <<= 9; ++ v = bch2_opt_from_sb(sb, id); + + ret = bch2_opt_validate(opt, "superblock option ", v); + if (ret) +@@ -467,7 +488,7 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) + + void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) + { +- if (opt->set_sb == SET_NO_SB_OPT) ++ if (opt->set_sb == SET_BCH2_NO_SB_OPT) + return; + + if (opt->flags & OPT_SB_FIELD_SECTORS) +@@ -481,7 +502,7 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) + + void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) + { +- if (opt->set_sb == SET_NO_SB_OPT) ++ if (opt->set_sb == SET_BCH2_NO_SB_OPT) + return; + + mutex_lock(&c->sb_lock); +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index bafacf6b46a2..033115f7a6f4 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -42,7 +42,8 @@ static inline const char *bch2_d_type_str(unsigned d_type) + */ + + /* dummy option, for options that aren't stored in the superblock */ +-LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); ++u64 BCH2_NO_SB_OPT(const struct bch_sb *); ++void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); + + /* When can be set: */ + enum opt_flags { +@@ -202,7 +203,7 @@ enum opt_type { + x(btree_node_mem_ptr_optimization, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ +- NO_SB_OPT, true, \ ++ BCH2_NO_SB_OPT, true, \ + NULL, "Stash pointer to in memory btree node in btree ptr")\ + x(gc_reserve_percent, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +@@ -229,7 +230,7 @@ enum opt_type { + x(inline_data, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ +- NO_SB_OPT, true, \ ++ BCH2_NO_SB_OPT, true, \ + NULL, "Enable inline data extents") \ + x(acl, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ +@@ -254,22 +255,22 @@ enum opt_type { + x(degraded, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Allow mounting in degraded mode") \ + x(very_degraded, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Allow mounting in when 
data will be missing") \ + x(discard, u8, \ + OPT_FS|OPT_MOUNT|OPT_DEVICE, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Enable discard/TRIM support") \ + x(verbose, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Extra debugging information during mount/recovery")\ + x(journal_flush_delay, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ +@@ -291,48 +292,48 @@ enum opt_type { + x(fsck, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Run fsck on mount") \ + x(fix_errors, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Fix errors during fsck without asking") \ + x(ratelimit_errors, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ ++ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ + NULL, "Ratelimit error messages during fsck") \ + x(nochanges, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Super read only mode - no writes at all will be issued,\n"\ + "even if we have to replay the journal") \ + x(norecovery, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don't replay the journal") \ + x(rebuild_replicas, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Rebuild the superblock replicas section") \ + x(keep_journal, u8, \ + 0, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don't free journal entries/keys after startup")\ + x(read_entire_journal, u8, \ + 0, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Read all journal entries, not just dirty ones")\ + x(read_journal_only, u8, \ + 0, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Only read the journal, skip the rest of recovery")\ + x(journal_transaction_names, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +@@ -342,58 +343,58 @@ enum opt_type { + x(noexcl, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don't open device in exclusive mode") \ + x(sb, u64, \ + OPT_MOUNT, \ + OPT_UINT(0, S64_MAX), \ +- NO_SB_OPT, BCH_SB_SECTOR, \ ++ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ + "offset", "Sector offset of superblock") \ + x(read_only, u8, \ + OPT_FS, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, NULL) \ + x(nostart, u8, \ + 0, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don\'t start filesystem, only open devices") \ + x(reconstruct_alloc, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Reconstruct alloc btree") \ + x(version_upgrade, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Set superblock to latest version,\n" \ + "allowing any new features to be used") \ + x(buckets_nouse, u8, \ + 0, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Allocate the buckets_nouse bitmap") \ + x(project, u8, \ + OPT_INODE, \ + OPT_BOOL(), \ +- NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, NULL) \ + x(fs_size, u64, \ + OPT_DEVICE, \ + OPT_UINT(0, S64_MAX), \ +- NO_SB_OPT, 0, \ ++ BCH2_NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(bucket, u32, \ + OPT_DEVICE, \ + OPT_UINT(0, S64_MAX), \ +- NO_SB_OPT, 0, \ 
++ BCH2_NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(durability, u8, \ + OPT_DEVICE, \ + OPT_UINT(0, BCH_REPLICAS_MAX), \ +- NO_SB_OPT, 1, \ ++ BCH2_NO_SB_OPT, 1, \ + "n", "Data written to this device will be considered\n"\ + "to have already been replicated n times") + +@@ -460,7 +461,7 @@ struct bch_option { + }; + struct { + int (*parse)(struct bch_fs *, const char *, u64 *); +- void (*to_text)(struct printbuf *, struct bch_fs *, u64); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + }; + }; + +@@ -475,6 +476,7 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); + u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); + void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); + ++u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); + int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); + void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); + void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); +@@ -486,7 +488,7 @@ int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *, + #define OPT_SHOW_FULL_LIST (1 << 0) + #define OPT_SHOW_MOUNT_STYLE (1 << 1) + +-void bch2_opt_to_text(struct printbuf *, struct bch_fs *, ++void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, + const struct bch_option *, u64, unsigned); + + int bch2_opt_check_may_set(struct bch_fs *, int, u64); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 1a70adae2463..e17ce91c8486 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -1045,45 +1045,56 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + if (!bch2_member_exists(m)) + continue; + +- pr_buf(out, "Device: %u", i); ++ pr_buf(out, "Device:"); ++ pr_tab(out); ++ pr_buf(out, "%u", i); + pr_newline(out); + + pr_indent_push(out, 2); + +- pr_buf(out, "UUID: "); ++ pr_buf(out, "UUID:"); ++ pr_tab(out); + pr_uuid(out, m->uuid.b); + pr_newline(out); + +- pr_buf(out, "Size: "); ++ pr_buf(out, "Size:"); ++ pr_tab(out); + pr_units(out, device_size, device_size << 9); + pr_newline(out); + +- pr_buf(out, "Bucket size: "); ++ pr_buf(out, "Bucket size:"); ++ pr_tab(out); + pr_units(out, bucket_size, bucket_size << 9); + pr_newline(out); + +- pr_buf(out, "First bucket: %u", +- le16_to_cpu(m->first_bucket)); ++ pr_buf(out, "First bucket:"); ++ pr_tab(out); ++ pr_buf(out, "%u", le16_to_cpu(m->first_bucket)); + pr_newline(out); + +- pr_buf(out, "Buckets: %llu", +- le64_to_cpu(m->nbuckets)); ++ pr_buf(out, "Buckets:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", le64_to_cpu(m->nbuckets)); + pr_newline(out); + +- pr_buf(out, "Last mount: "); ++ pr_buf(out, "Last mount:"); ++ pr_tab(out); + if (m->last_mount) + pr_time(out, le64_to_cpu(m->last_mount)); + else + pr_buf(out, "(never)"); + pr_newline(out); + +- pr_buf(out, "State: %s", ++ pr_buf(out, "State:"); ++ pr_tab(out); ++ pr_buf(out, "%s", + BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR + ? 
bch2_member_states[BCH_MEMBER_STATE(m)] + : "unknown"); + pr_newline(out); + +- pr_buf(out, "Group: "); ++ pr_buf(out, "Group:"); ++ pr_tab(out); + if (BCH_MEMBER_GROUP(m)) { + unsigned idx = BCH_MEMBER_GROUP(m) - 1; + +@@ -1097,7 +1108,8 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + } + pr_newline(out); + +- pr_buf(out, "Data allowed: "); ++ pr_buf(out, "Data allowed:"); ++ pr_tab(out); + if (BCH_MEMBER_DATA_ALLOWED(m)) + bch2_flags_to_text(out, bch2_data_types, + BCH_MEMBER_DATA_ALLOWED(m)); +@@ -1105,15 +1117,17 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + pr_buf(out, "(none)"); + pr_newline(out); + +- pr_buf(out, "Has data: "); ++ pr_buf(out, "Has data:"); ++ pr_tab(out); + if (data_have) + bch2_flags_to_text(out, bch2_data_types, data_have); + else + pr_buf(out, "(none)"); + pr_newline(out); + +- pr_buf(out, "Discard: %llu", +- BCH_MEMBER_DISCARD(m)); ++ pr_buf(out, "Discard:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m)); + pr_newline(out); + + pr_indent_pop(out, 2); +@@ -1449,6 +1463,9 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR + ? bch2_sb_field_ops[type] : NULL; + ++ if (!out->tabstops[0]) ++ out->tabstops[0] = 32; ++ + if (ops) + pr_buf(out, "%s", bch2_sb_fields[type]); + else +@@ -1497,6 +1514,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + u64 fields_have = 0; + unsigned nr_devices = 0; + ++ if (!out->tabstops[0]) ++ out->tabstops[0] = 32; ++ + mi = bch2_sb_get_members(sb); + if (mi) { + struct bch_member *m; +@@ -1507,137 +1527,106 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + nr_devices += bch2_member_exists(m); + } + +- pr_buf(out, "External UUID: "); ++ pr_buf(out, "External UUID:"); ++ pr_tab(out); + pr_uuid(out, sb->user_uuid.b); + pr_newline(out); + +- pr_buf(out, "Internal UUID: "); ++ pr_buf(out, "Internal UUID:"); ++ pr_tab(out); + pr_uuid(out, sb->uuid.b); + pr_newline(out); + +- pr_buf(out, "Device index: %u", sb->dev_idx); ++ pr_buf(out, "Device index:"); ++ pr_tab(out); ++ pr_buf(out, "%u", sb->dev_idx); + pr_newline(out); + +- pr_buf(out, "Label: "); ++ pr_buf(out, "Label:"); ++ pr_tab(out); + pr_buf(out, "%.*s", (int) sizeof(sb->label), sb->label); + pr_newline(out); + +- pr_buf(out, "Version: %u", le16_to_cpu(sb->version)); ++ pr_buf(out, "Version:"); ++ pr_tab(out); ++ pr_buf(out, "%u", le16_to_cpu(sb->version)); + pr_newline(out); + +- pr_buf(out, "Oldest version on disk: %u", le16_to_cpu(sb->version_min)); ++ pr_buf(out, "Oldest version on disk:"); ++ pr_tab(out); ++ pr_buf(out, "%u", le16_to_cpu(sb->version_min)); + pr_newline(out); + +- pr_buf(out, "Created: "); ++ pr_buf(out, "Created:"); ++ pr_tab(out); + if (sb->time_base_lo) + pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); + else + pr_buf(out, "(not set)"); + pr_newline(out); + +- pr_buf(out, "Squence number: %llu", le64_to_cpu(sb->seq)); ++ pr_buf(out, "Sequence number:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", le64_to_cpu(sb->seq)); + pr_newline(out); + +- pr_buf(out, "Block_size: "); +- pr_units(out, le16_to_cpu(sb->block_size), +- (u32) le16_to_cpu(sb->block_size) << 9); ++ pr_buf(out, "Superblock size:"); ++ pr_tab(out); ++ pr_buf(out, "%zu", vstruct_bytes(sb)); + pr_newline(out); + +- pr_buf(out, "Btree node size: "); +- pr_units(out, BCH_SB_BTREE_NODE_SIZE(sb), +- BCH_SB_BTREE_NODE_SIZE(sb) << 9); ++ pr_buf(out, "Clean:"); ++ pr_tab(out); ++ pr_buf(out, 
"%llu", BCH_SB_CLEAN(sb)); + pr_newline(out); + +- pr_buf(out, "Error action: %s", +- BCH_SB_ERROR_ACTION(sb) < BCH_ON_ERROR_NR +- ? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)] +- : "unknown"); ++ pr_buf(out, "Devices:"); ++ pr_tab(out); ++ pr_buf(out, "%u", nr_devices); + pr_newline(out); + +- pr_buf(out, "Clean: %llu", BCH_SB_CLEAN(sb)); ++ pr_buf(out, "Sections:"); ++ vstruct_for_each(sb, f) ++ fields_have |= 1 << le32_to_cpu(f->type); ++ pr_tab(out); ++ bch2_flags_to_text(out, bch2_sb_fields, fields_have); + pr_newline(out); + +- pr_buf(out, "Features: "); ++ pr_buf(out, "Features:"); ++ pr_tab(out); + bch2_flags_to_text(out, bch2_sb_features, + le64_to_cpu(sb->features[0])); + pr_newline(out); + +- pr_buf(out, "Compat features: "); ++ pr_buf(out, "Compat features:"); ++ pr_tab(out); + bch2_flags_to_text(out, bch2_sb_compat, + le64_to_cpu(sb->compat[0])); + pr_newline(out); + +- pr_buf(out, "Metadata replicas: %llu", BCH_SB_META_REPLICAS_WANT(sb)); + pr_newline(out); +- +- pr_buf(out, "Data replicas: %llu", BCH_SB_DATA_REPLICAS_WANT(sb)); +- pr_newline(out); +- +- pr_buf(out, "Metadata checksum type: %s (%llu)", +- BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR +- ? bch2_csum_opts[BCH_SB_META_CSUM_TYPE(sb)] +- : "unknown", +- BCH_SB_META_CSUM_TYPE(sb)); +- pr_newline(out); +- +- pr_buf(out, "Data checksum type: %s (%llu)", +- BCH_SB_DATA_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR +- ? bch2_csum_opts[BCH_SB_DATA_CSUM_TYPE(sb)] +- : "unknown", +- BCH_SB_DATA_CSUM_TYPE(sb)); +- pr_newline(out); +- +- pr_buf(out, "Compression type: %s (%llu)", +- BCH_SB_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_OPT_NR +- ? bch2_compression_opts[BCH_SB_COMPRESSION_TYPE(sb)] +- : "unknown", +- BCH_SB_COMPRESSION_TYPE(sb)); +- pr_newline(out); +- +- pr_buf(out, "Foreground write target: "); +- bch2_sb_target_to_text(out, sb, BCH_SB_FOREGROUND_TARGET(sb)); +- pr_newline(out); +- +- pr_buf(out, "Background write target: "); +- bch2_sb_target_to_text(out, sb, BCH_SB_BACKGROUND_TARGET(sb)); +- pr_newline(out); +- +- pr_buf(out, "Promote target: "); +- bch2_sb_target_to_text(out, sb, BCH_SB_PROMOTE_TARGET(sb)); +- pr_newline(out); +- +- pr_buf(out, "Metadata target: "); +- bch2_sb_target_to_text(out, sb, BCH_SB_METADATA_TARGET(sb)); +- pr_newline(out); +- +- pr_buf(out, "String hash type: %s (%llu)", +- BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR +- ? 
bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)] +- : "unknown", +- BCH_SB_STR_HASH_TYPE(sb)); +- pr_newline(out); +- +- pr_buf(out, "32 bit inodes: %llu", BCH_SB_INODE_32BIT(sb)); +- pr_newline(out); +- +- pr_buf(out, "GC reserve percentage: %llu%%", BCH_SB_GC_RESERVE(sb)); ++ pr_buf(out, "Options:"); + pr_newline(out); ++ pr_indent_push(out, 2); ++ { ++ enum bch_opt_id id; + +- pr_buf(out, "Root reserve percentage: %llu%%", BCH_SB_ROOT_RESERVE(sb)); +- pr_newline(out); ++ for (id = 0; id < bch2_opts_nr; id++) { ++ const struct bch_option *opt = bch2_opt_table + id; + +- pr_buf(out, "Devices: %u live, %u total", +- nr_devices, sb->nr_devices); +- pr_newline(out); ++ if (opt->get_sb != BCH2_NO_SB_OPT) { ++ u64 v = bch2_opt_from_sb(sb, id); + +- pr_buf(out, "Sections: "); +- vstruct_for_each(sb, f) +- fields_have |= 1 << le32_to_cpu(f->type); +- bch2_flags_to_text(out, bch2_sb_fields, fields_have); +- pr_newline(out); ++ pr_buf(out, "%s:", opt->attr.name); ++ pr_tab(out); ++ bch2_opt_to_text(out, NULL, sb, opt, v, ++ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); ++ pr_newline(out); ++ } ++ } ++ } + +- pr_buf(out, "Superblock size: %zu", vstruct_bytes(sb)); +- pr_newline(out); ++ pr_indent_pop(out, 2); + + if (print_layout) { + pr_newline(out); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index fb7f8d6d4d1c..46947163a8dc 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -886,7 +886,7 @@ static void print_mount_opts(struct bch_fs *c) + if (!first) + pr_buf(&p, ","); + first = false; +- bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); ++ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); + } + + if (!p.pos) +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 3018250d421b..49e38859bff8 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -597,7 +597,7 @@ SHOW(bch2_fs_opts_dir) + int id = opt - bch2_opt_table; + u64 v = bch2_opt_get_by_id(&c->opts, id); + +- bch2_opt_to_text(out, c, opt, v, OPT_SHOW_FULL_LIST); ++ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); + pr_char(out, '\n'); + + return 0; +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 48e625ab15ff..c2e9520a0457 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -448,7 +448,7 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + return -ENODATA; + + v = bch2_opt_get_by_id(&opts, id); +- bch2_opt_to_text(&out, c, opt, v, 0); ++ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); + + ret = out.pos; + +-- +cgit v1.2.3 + + +From 2fd9bfaa97c85f430bb8ed844e81d5d01569f112 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 5 Mar 2022 13:38:54 -0500 +Subject: bcachefs: Don't arm journal->write_work when journal entry !open + +This fixes a shutdown race where we were rearming journal->write_work +after the journal has already shut down. 
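+
+The general shape of the fix, as an illustrative sketch with a hypothetical
+struct foo standing in for the journal (not the actual bcachefs code): the
+check that gates re-arming the delayed work has to happen under the same lock
+the shutdown path takes, so the work item cannot be queued again after
+teardown.
+
+  struct foo {
+          spinlock_t              lock;
+          bool                    open;
+          unsigned long           delay;
+          struct workqueue_struct *wq;
+          struct delayed_work     work;
+  };
+
+  static void foo_work_fn(struct work_struct *work)
+  {
+          struct foo *f = container_of(work, struct foo, work.work);
+
+          spin_lock(&f->lock);
+          if (!f->open) {                 /* already shut down: don't re-arm */
+                  spin_unlock(&f->lock);
+                  return;
+          }
+          mod_delayed_work(f->wq, &f->work, f->delay);
+          spin_unlock(&f->lock);
+  }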
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index fb533ecc78f8..ded4b6800d4c 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -310,17 +310,19 @@ static void journal_write_work(struct work_struct *work) + { + struct journal *j = container_of(work, struct journal, write_work.work); + struct bch_fs *c = container_of(j, struct bch_fs, journal); +- struct journal_buf *buf; + long delta; + + spin_lock(&j->lock); +- buf = journal_cur_buf(j); +- delta = buf->expires - jiffies; ++ if (!__journal_entry_is_open(j->reservations)) ++ goto unlock; ++ ++ delta = journal_cur_buf(j)->expires - jiffies; + + if (delta > 0) + mod_delayed_work(c->io_complete_wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++unlock: + spin_unlock(&j->lock); + } + +@@ -940,6 +942,7 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) + + void bch2_fs_journal_stop(struct journal *j) + { ++ bch2_journal_reclaim_stop(j); + bch2_journal_flush_all_pins(j); + + wait_event(j->wait, journal_entry_close(j)); +@@ -957,7 +960,6 @@ void bch2_fs_journal_stop(struct journal *j) + j->last_empty_seq != journal_cur_seq(j)); + + cancel_delayed_work_sync(&j->write_work); +- bch2_journal_reclaim_stop(j); + } + + int bch2_fs_journal_start(struct journal *j, u64 cur_seq, +-- +cgit v1.2.3 + + +From 7658bb6db9f361735c473fc290ab01d4a2d8845f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 5 Mar 2022 15:21:07 -0500 +Subject: bcachefs: Don't keep around btree_paths unnecessarily + +When bch2_trans_begin() is called and there hasn't been a transaction +restart, we presume that we're now doing something new - iterating over +different keys, and we now shouldn't keep aruond paths related to the +previous transaction, excepting the subvolumes btree. + +This should fix some of our "transaction path overflow" bugs. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index c0357ee9cfb7..8186ee7e23ff 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -3016,6 +3016,14 @@ void bch2_trans_begin(struct btree_trans *trans) + trans_for_each_path(trans, path) { + path->should_be_locked = false; + ++ /* ++ * If the transaction wasn't restarted, we're presuming to be ++ * doing something new: dont keep iterators excpt the ones that ++ * are in use - except for the subvolumes btree: ++ */ ++ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) ++ path->preserve = false; ++ + /* + * XXX: we probably shouldn't be doing this if the transaction + * was restarted, but currently we still overflow transaction +-- +cgit v1.2.3 + + +From 585ca40b999f7fe0af63a15e38f99124003937c5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Mar 2022 14:04:34 -0500 +Subject: bcachefs: Fix pr_tab_rjust() + +pr_tab_rjust() was broken and leaving a null somewhere in the output +string - this patch fixes it and simplifies it a bit. 
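+
+The core of the operation, sketched as a plain userspace helper with
+hypothetical names (the real code also grows the buffer and tracks tab stops
+per printbuf): shift the current field right with memmove(), pad the gap with
+spaces, and only then update the length and terminator, so no NUL is left in
+the middle of the string.
+
+  #include <string.h>
+
+  /*
+   * Right-justify buf[field_start..len) so it ends at column "tabstop".
+   * Assumes field_start <= len, a single-line buffer, and at least
+   * tabstop + 1 bytes of capacity.  Returns the new length.
+   */
+  static size_t rjust_to_tabstop(char *buf, size_t len,
+                                 size_t field_start, size_t tabstop)
+  {
+          if (len >= tabstop)
+                  return len;             /* already past the tab stop */
+
+          size_t field = len - field_start;
+          size_t shift = tabstop - len;
+
+          memmove(buf + field_start + shift, buf + field_start, field);
+          memset(buf + field_start, ' ', shift);
+          len += shift;
+          buf[len] = '\0';
+          return len;
+  }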
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/util.c | 37 +++++++++++++++++++++++++++++++++++-- + fs/bcachefs/util.h | 22 +++------------------- + 2 files changed, 38 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index 766d08aede71..37fc20413764 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -101,8 +101,14 @@ STRTO_H(strtou64, u64) + + static int bch2_printbuf_realloc(struct printbuf *out, unsigned extra) + { +- unsigned new_size = roundup_pow_of_two(out->size + extra); +- char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC); ++ unsigned new_size; ++ char *buf; ++ ++ if (out->pos + extra + 1 < out->size) ++ return 0; ++ ++ new_size = roundup_pow_of_two(out->size + extra); ++ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC); + + if (!buf) { + out->allocation_failure = true; +@@ -131,6 +137,33 @@ void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) + out->pos += len; + } + ++void bch2_pr_tab_rjust(struct printbuf *buf) ++{ ++ BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); ++ ++ if (printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) { ++ unsigned move = buf->pos - buf->last_field; ++ unsigned shift = buf->tabstops[buf->tabstop] - ++ printbuf_linelen(buf); ++ ++ bch2_printbuf_realloc(buf, shift); ++ ++ if (buf->last_field + shift + 1 < buf->size) { ++ move = min(move, buf->size - 1 - buf->last_field - shift); ++ ++ memmove(buf->buf + buf->last_field + shift, ++ buf->buf + buf->last_field, ++ move); ++ memset(buf->buf + buf->last_field, ' ', shift); ++ buf->pos += shift; ++ buf->buf[buf->pos] = 0; ++ } ++ } ++ ++ buf->last_field = buf->pos; ++ buf->tabstop++; ++} ++ + void bch2_hprint(struct printbuf *buf, s64 v) + { + int u, t = 0; +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 4095df2fcded..2c9e91023bb9 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -334,27 +334,11 @@ static inline void pr_tab(struct printbuf *buf) + buf->tabstop++; + } + ++void bch2_pr_tab_rjust(struct printbuf *); ++ + static inline void pr_tab_rjust(struct printbuf *buf) + { +- ssize_t shift = min_t(ssize_t, buf->tabstops[buf->tabstop] - +- printbuf_linelen(buf), +- printbuf_remaining(buf)); +- ssize_t move = min_t(ssize_t, buf->pos - buf->last_field, +- printbuf_remaining(buf) - shift); +- +- BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); +- +- if (shift > 0) { +- memmove(buf->buf + buf->last_field + shift, +- buf->buf + buf->last_field, +- move); +- memset(buf->buf + buf->last_field, ' ', shift); +- buf->pos += shift; +- buf->buf[buf->pos] = 0; +- } +- +- buf->last_field = buf->pos; +- buf->tabstop++; ++ bch2_pr_tab_rjust(buf); + } + + void bch2_pr_units(struct printbuf *, s64, s64); +-- +cgit v1.2.3 + + +From 23b70213f63bbce273c71dba02ff1910e5a0a72a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 7 Mar 2022 01:35:55 -0500 +Subject: bcachefs: Fix bch2_journal_flush_device_pins() + +It's now legal for the pin fifo to be empty, which means this code needs +to be updated in order to not hit an assert. 
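+
+The shape of the fix, as a sketch with hypothetical names: clamp the sequence
+number to the oldest live entry first, then test the upper bound, so an empty
+range (front == back) drops out of the loop before anything tries to look up
+a nonexistent pin.
+
+  #include <stdint.h>
+
+  static int flush_range(uint64_t front, uint64_t back,
+                         int (*process)(uint64_t))
+  {
+          int ret = 0;
+          uint64_t seq = 0;
+
+          while (!ret) {
+                  if (seq < front)        /* clamp to oldest live entry */
+                          seq = front;
+                  if (seq >= back)        /* empty or exhausted: done */
+                          break;
+                  ret = process(seq++);
+          }
+          return ret;
+  }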
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index ec565edbbfc5..df2c5d648f6a 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -809,10 +809,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) + seq = 0; + + spin_lock(&j->lock); +- while (!ret && seq < j->pin.back) { ++ while (!ret) { + struct bch_replicas_padded replicas; + + seq = max(seq, journal_last_seq(j)); ++ if (seq >= j->pin.back) ++ break; + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + journal_seq_pin(j, seq)->devs); + seq++; +-- +cgit v1.2.3 + + +From 096ad56a1141f5307bbca19bbab1bebd809f14dd Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Mar 2022 15:15:41 -0500 +Subject: bcachefs: Check for rw before setting opts via sysfs + +This isn't a correctness issue, it just eliminates errors in the dmesg +log when we're RO. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/sysfs.c | 24 +++++++++++++++++------- + 1 file changed, 17 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 49e38859bff8..3d6ece515a88 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -607,23 +607,32 @@ STORE(bch2_fs_opts_dir) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); +- int ret, id = opt - bch2_opt_table; ++ int ret = size, id = opt - bch2_opt_table; + char *tmp; + u64 v; + ++ /* ++ * We don't need to take c->writes for correctness, but it eliminates an ++ * unsightly error message in the dmesg log when we're RO: ++ */ ++ if (unlikely(!percpu_ref_tryget(&c->writes))) ++ return -EROFS; ++ + tmp = kstrdup(buf, GFP_KERNEL); +- if (!tmp) +- return -ENOMEM; ++ if (!tmp) { ++ ret = -ENOMEM; ++ goto err; ++ } + + ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v); + kfree(tmp); + + if (ret < 0) +- return ret; ++ goto err; + + ret = bch2_opt_check_may_set(c, id, v); + if (ret < 0) +- return ret; ++ goto err; + + bch2_opt_set_sb(c, opt, v); + bch2_opt_set_by_id(&c->opts, id, v); +@@ -633,8 +642,9 @@ STORE(bch2_fs_opts_dir) + bch2_rebalance_add_work(c, S64_MAX); + rebalance_wakeup(c); + } +- +- return size; ++err: ++ percpu_ref_put(&c->writes); ++ return ret; + } + SYSFS_OPS(bch2_fs_opts_dir); + +-- +cgit v1.2.3 + + +From 9d7908ac02f28428bd42196c4faa1d1fa9b0ec99 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Mar 2022 17:20:39 -0500 +Subject: bcachefs: Skip periodic wakeup of journal reclaim when journal empty + +Less system noise. 
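+
+Roughly, the worker pattern being moved to (hypothetical names, not the actual
+reclaim thread): when there is nothing pinned, sleep with no timeout and rely
+on the producer to set the flag and wake_up_process() the thread; otherwise
+sleep only until the next deadline.
+
+  struct worker {
+          bool            have_work;
+          unsigned long   deadline;       /* in jiffies */
+  };
+
+  static int worker_fn(void *arg)
+  {
+          struct worker *w = arg;         /* hypothetical context */
+
+          while (!kthread_should_stop()) {
+                  /* ... do the actual work here ... */
+
+                  set_current_state(TASK_INTERRUPTIBLE);
+                  if (kthread_should_stop()) {
+                          __set_current_state(TASK_RUNNING);
+                          break;
+                  }
+                  /* checked after set_current_state() so a wakeup isn't lost */
+                  if (!w->have_work)
+                          schedule();
+                  else if (time_after(w->deadline, jiffies))
+                          schedule_timeout(w->deadline - jiffies);
+                  __set_current_state(TASK_RUNNING);
+          }
+          return 0;
+  }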
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 3 +++ + fs/bcachefs/journal_reclaim.c | 14 +++++++++++--- + fs/bcachefs/journal_types.h | 4 ++++ + 3 files changed, 18 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index ded4b6800d4c..8a2fc25bf6f6 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -241,6 +241,9 @@ static int journal_entry_open(struct journal *j) + if (u64s <= 0) + return cur_entry_journal_full; + ++ if (fifo_empty(&j->pin) && j->reclaim_thread) ++ wake_up_process(j->reclaim_thread); ++ + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for journal_last_seq() to be calculated correctly +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index df2c5d648f6a..a920a111dad7 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -670,6 +670,7 @@ static int bch2_journal_reclaim_thread(void *arg) + struct journal *j = arg; + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned long delay, now; ++ bool journal_empty; + int ret = 0; + + set_freezable(); +@@ -696,10 +697,17 @@ static int bch2_journal_reclaim_thread(void *arg) + break; + if (j->reclaim_kicked) + break; +- if (time_after_eq(jiffies, j->next_reclaim)) +- break; +- freezable_schedule_timeout(j->next_reclaim - jiffies); + ++ spin_lock(&j->lock); ++ journal_empty = fifo_empty(&j->pin); ++ spin_unlock(&j->lock); ++ ++ if (journal_empty) ++ freezable_schedule(); ++ else if (time_after(j->next_reclaim, jiffies)) ++ freezable_schedule_timeout(j->next_reclaim - jiffies); ++ else ++ break; + } + __set_current_state(TASK_RUNNING); + } +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 6fd458191e41..071fcb4a8422 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -240,6 +240,10 @@ struct journal { + spinlock_t err_lock; + + struct mutex reclaim_lock; ++ /* ++ * Used for waiting until journal reclaim has freed up space in the ++ * journal: ++ */ + wait_queue_head_t reclaim_wait; + struct task_struct *reclaim_thread; + bool reclaim_kicked; +-- +cgit v1.2.3 + + +From 1b8e48b7a0fbe7f0603f6b85e7ed1c4c6bf7b0fe Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 7 Mar 2022 14:13:22 -0500 +Subject: bcachefs: Revert UUID format-specifier change + +"bcachefs: Log & error message improvements" accidentally changed the +format specifier we use for converting UUIDs to strings, which broke +mounting of encrypted filesystems - this patch reverts that change. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/util.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 2c9e91023bb9..d6d7f1bc16b8 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -372,7 +372,7 @@ static inline void pr_time(struct printbuf *out, u64 _time) + #ifdef __KERNEL__ + static inline void uuid_unparse_lower(u8 *uuid, char *out) + { +- sprintf(out, "%plU", uuid); ++ sprintf(out, "%pUb", uuid); + } + #else + #include +-- +cgit v1.2.3 + + +From 1f7f4ddd219d024525f8f39c20111daaa56ec245 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 8 Mar 2022 13:52:58 -0500 +Subject: bcachefs: Use bio_iov_vecs_to_alloc() + +This fixes a bug in the DIO read path where, when using a loopback +device in DIO mode, we'd allocate a biovec that would get overwritten +and leaked in bio_iov_iter_get_pages() -> bio_iov_bvec_set(). 
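+
+For reference (paraphrased from the block layer of this kernel era, so treat
+the exact definition as an assumption): the helper returns 0 for bvec-backed
+iterators, because bio_iov_iter_get_pages() will point the bio at the
+iterator's existing bvec array instead of filling a freshly allocated one,
+which is exactly the allocation the old code leaked.
+
+  static inline unsigned int bio_iov_vecs_to_alloc(struct iov_iter *iter,
+                                                   int max_segs)
+  {
+          return iov_iter_is_bvec(iter)
+                  ? 0 : iov_iter_npages(iter, max_segs);
+  }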
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 022585d1ac59..ab983f2e339e 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1928,7 +1928,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + iter->count -= shorten; + + bio = bio_alloc_bioset(GFP_KERNEL, +- iov_iter_npages(iter, BIO_MAX_VECS), ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + &c->dio_read_bioset); + + bio->bi_end_io = bch2_direct_IO_read_endio; +@@ -1963,7 +1963,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + goto start; + while (iter->count) { + bio = bio_alloc_bioset(GFP_KERNEL, +- iov_iter_npages(iter, BIO_MAX_VECS), ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + &c->bio_read); + bio->bi_end_io = bch2_direct_IO_read_split_endio; + start: +@@ -2313,9 +2313,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) + } + + bio = bio_alloc_bioset(GFP_KERNEL, +- iov_iter_is_bvec(iter) +- ? 0 +- : iov_iter_npages(iter, BIO_MAX_VECS), ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); + init_completion(&dio->done); +-- +cgit v1.2.3 + + +From cf7d151cd7c7af81809b8e68b7e714341abfdd26 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 9 Mar 2022 15:37:42 -0500 +Subject: bcachefs: Fix dio write path with loopback dio mode + +When the iov_iter is a bvec iter, it's possible the IO was submitted +from a kthread that didn't have an mm to switch to. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index ab983f2e339e..b05d6e896f03 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -2110,7 +2110,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) + while (1) { + iter_count = dio->iter.count; + +- if (kthread) ++ if (kthread && dio->mm) + kthread_use_mm(dio->mm); + BUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; +@@ -2120,7 +2120,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) + dropped_locks = fdm_dropped_locks(); + + current->faults_disabled_mapping = NULL; +- if (kthread) ++ if (kthread && dio->mm) + kthread_unuse_mm(dio->mm); + + /* +-- +cgit v1.2.3 + + +From 00318db30eb6f105d08e84da51e0e3f5692f9961 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 7 Mar 2022 22:05:49 -0500 +Subject: bcachefs: Fix error handling in traverse_all() + +In btree_path_traverse_all() we were failing to check for -EIO in the +retry loop, and after btree node read error we'd go into an infinite +loop. 
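+
+The underlying rule, sketched with a hypothetical callback: a retry loop has
+to whitelist the transient errors it restarts on (-EINTR for transaction
+restarts, -ENOMEM here) and propagate everything else, or a persistent
+failure such as -EIO turns into a livelock.
+
+  static int run_with_retry(void *ctx, int (*step)(void *))
+  {
+          int ret;
+
+          do {
+                  ret = step(ctx);
+                  /* transient: restart the whole traversal */
+          } while (ret == -EINTR || ret == -ENOMEM);
+
+          return ret;     /* hard errors (-EIO, ...) are returned */
+  }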
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8186ee7e23ff..847ea7244121 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1484,8 +1484,10 @@ retry_all: + */ + if (path->uptodate) { + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); +- if (ret) ++ if (ret == -EINTR || ret == -ENOMEM) + goto retry_all; ++ if (ret) ++ goto err; + } else { + i++; + } +@@ -1498,7 +1500,7 @@ retry_all: + */ + trans_for_each_path(trans, path) + BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); +- ++err: + bch2_btree_cache_cannibalize_unlock(c); + + trans->in_traverse_all = false; +-- +cgit v1.2.3 + + +From f0dd5c89c635c8d6ea186eac9b68e8376b059d83 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Mar 2022 21:17:43 -0500 +Subject: bcachefs: Fix lock ordering under traverse_all() + +traverse_all() traverses btree paths in sorted order, so it should never +see transaction restarts due to lock ordering violations. But some code +in __bch2_btree_path_upgrade(), while necessary when not running under +traverse_all(), was causing some confusing lock ordering violations - +disabling this code under traverse_all() will let us put in some more +assertions. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 847ea7244121..23923503dc6e 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -486,14 +486,15 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, + * before interior nodes - now that's handled by + * bch2_btree_path_traverse_all(). + */ +- trans_for_each_path(trans, linked) +- if (linked != path && +- linked->cached == path->cached && +- linked->btree_id == path->btree_id && +- linked->locks_want < new_locks_want) { +- linked->locks_want = new_locks_want; +- btree_path_get_locks(trans, linked, true); +- } ++ if (!path->cached && !trans->in_traverse_all) ++ trans_for_each_path(trans, linked) ++ if (linked != path && ++ linked->cached == path->cached && ++ linked->btree_id == path->btree_id && ++ linked->locks_want < new_locks_want) { ++ linked->locks_want = new_locks_want; ++ btree_path_get_locks(trans, linked, true); ++ } + + return false; + } +-- +cgit v1.2.3 + + +From 3e7a423e265bff45ccdea17a86393535d662201b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 6 Mar 2022 20:56:08 -0500 +Subject: bcachefs: Lock ordering asserts for traverse_all() + +This adds some new assertions that we always take locks in the correct +order while running under traverse_all() - we've been seeing some +livelocks and a bit of strange behaviour, this helps ensure that +everything is working the way we expect. 
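+
+The two pieces of bookkeeping, reduced to a sketch with hypothetical names:
+while the ordered walk is running, a lock may only be recorded against the
+entry currently being walked or an earlier one, and any entry inserted at or
+before the cursor has to shift the cursor right so the walk neither skips nor
+revisits entries.
+
+  #define IDX_INVALID     UINT_MAX
+
+  struct walk {
+          bool            in_progress;
+          unsigned        cur_idx;
+  };
+
+  static void record_lock(struct walk *w, unsigned entry_idx)
+  {
+          BUG_ON(w->in_progress &&
+                 w->cur_idx != IDX_INVALID &&
+                 entry_idx > w->cur_idx);
+  }
+
+  static void note_insertion(struct walk *w, unsigned at)
+  {
+          if (w->in_progress &&
+              w->cur_idx != IDX_INVALID &&
+              at <= w->cur_idx)
+                  w->cur_idx++;
+  }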
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 25 ++++++++++++++++--------- + fs/bcachefs/btree_key_cache.c | 4 ++-- + fs/bcachefs/btree_locking.h | 10 +++++++--- + fs/bcachefs/btree_types.h | 1 + + 4 files changed, 26 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 23923503dc6e..780b77e72bb5 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -189,7 +189,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, want))) { +- mark_btree_node_locked(path, level, want); ++ mark_btree_node_locked(trans, path, level, want); + return true; + } + fail: +@@ -240,7 +240,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, + + return false; + success: +- mark_btree_node_intent_locked(path, level); ++ mark_btree_node_intent_locked(trans, path, level); + return true; + } + +@@ -1168,7 +1168,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) + t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(path, b->c.level); + six_lock_increment(&b->c.lock, t); +- mark_btree_node_locked(path, b->c.level, t); ++ mark_btree_node_locked(trans, path, b->c.level, t); + } + + btree_path_level_init(trans, path, b); +@@ -1245,7 +1245,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, + for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + +- mark_btree_node_locked(path, path->level, lock_type); ++ mark_btree_node_locked(trans, path, path->level, lock_type); + btree_path_level_init(trans, path, b); + return 0; + } +@@ -1410,7 +1410,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, + if (unlikely(ret)) + goto err; + +- mark_btree_node_locked(path, level, lock_type); ++ mark_btree_node_locked(trans, path, level, lock_type); + btree_path_level_init(trans, path, b); + + if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && +@@ -1443,6 +1443,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) + trans->in_traverse_all = true; + retry_all: + trans->restarted = false; ++ trans->traverse_all_idx = U8_MAX; + + trans_for_each_path(trans, path) + path->should_be_locked = false; +@@ -1475,9 +1476,9 @@ retry_all: + } + + /* Now, redo traversals in correct order: */ +- i = 0; +- while (i < trans->nr_sorted) { +- path = trans->paths + trans->sorted[i]; ++ trans->traverse_all_idx = 0; ++ while (trans->traverse_all_idx < trans->nr_sorted) { ++ path = trans->paths + trans->sorted[trans->traverse_all_idx]; + + /* + * Traversing a path can cause another path to be added at about +@@ -1489,8 +1490,9 @@ retry_all: + goto retry_all; + if (ret) + goto err; ++ BUG_ON(path->uptodate); + } else { +- i++; ++ trans->traverse_all_idx++; + } + } + +@@ -2834,6 +2836,11 @@ static inline void btree_path_list_add(struct btree_trans *trans, + + path->sorted_idx = pos ? 
pos->sorted_idx + 1 : 0; + ++ if (trans->in_traverse_all && ++ trans->traverse_all_idx != U8_MAX && ++ trans->traverse_all_idx >= path->sorted_idx) ++ trans->traverse_all_idx++; ++ + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); + + for (i = path->sorted_idx; i < trans->nr_sorted; i++) +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index ee89b650f6a4..b1b7a30417bc 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -309,7 +309,7 @@ retry: + if (!ck) + goto retry; + +- mark_btree_node_locked(path, 0, SIX_LOCK_intent); ++ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + path->locks_want = 1; + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); +@@ -330,7 +330,7 @@ retry: + goto retry; + } + +- mark_btree_node_locked(path, 0, lock_want); ++ mark_btree_node_locked(trans, path, 0, lock_want); + } + + path->l[0].lock_seq = ck->c.lock.state.seq; +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index b4434eca0746..67c970d727ac 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -58,7 +58,8 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, + path->nodes_intent_locked &= ~(1 << level); + } + +-static inline void mark_btree_node_locked(struct btree_path *path, ++static inline void mark_btree_node_locked(struct btree_trans *trans, ++ struct btree_path *path, + unsigned level, + enum six_lock_type type) + { +@@ -66,14 +67,17 @@ static inline void mark_btree_node_locked(struct btree_path *path, + BUILD_BUG_ON(SIX_LOCK_read != 0); + BUILD_BUG_ON(SIX_LOCK_intent != 1); + ++ BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx); ++ + path->nodes_locked |= 1 << level; + path->nodes_intent_locked |= type << level; + } + +-static inline void mark_btree_node_intent_locked(struct btree_path *path, ++static inline void mark_btree_node_intent_locked(struct btree_trans *trans, ++ struct btree_path *path, + unsigned level) + { +- mark_btree_node_locked(path, level, SIX_LOCK_intent); ++ mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); + } + + static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index e6deb3a4494b..575635b5fa10 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -392,6 +392,7 @@ struct btree_trans { + + u8 nr_sorted; + u8 nr_updates; ++ u8 traverse_all_idx; + bool used_mempool:1; + bool in_traverse_all:1; + bool restarted:1; +-- +cgit v1.2.3 + + +From b132c81d1fa6477562f4c19fd7a1b47d8da316ee Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 5 Mar 2022 18:23:47 -0500 +Subject: bcachefs: Change flags param to bch2_btree_delete_range to + update_flags + +It wasn't used as iter_flags (excepting the unit tests, which this patch +fixes), and the next patch is going to need to pass in +BTREE_TRIGGER_NORUN. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 11 ++++++----- + fs/bcachefs/tests.c | 14 ++++++-------- + 2 files changed, 12 insertions(+), 13 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index ad5f516da1aa..ab05b2b74171 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1635,14 +1635,14 @@ int bch2_btree_delete_at(struct btree_trans *trans, + + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, +- unsigned iter_flags, ++ unsigned update_flags, + u64 *journal_seq) + { + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); ++ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + retry: + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(&iter)).k) && +@@ -1685,7 +1685,8 @@ retry: + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, &disk_res, journal_seq, +- BTREE_INSERT_NOFAIL); ++ BTREE_INSERT_NOFAIL| ++ update_flags); + bch2_disk_reservation_put(trans->c, &disk_res); + if (ret) + break; +@@ -1707,10 +1708,10 @@ retry: + */ + int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, +- unsigned iter_flags, ++ unsigned update_flags, + u64 *journal_seq) + { + return bch2_trans_do(c, NULL, journal_seq, 0, + bch2_btree_delete_range_trans(&trans, id, start, end, +- iter_flags, journal_seq)); ++ update_flags, journal_seq)); + } +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 3addf400e177..4369bfc55a94 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -15,15 +15,14 @@ static void delete_test_keys(struct bch_fs *c) + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_extents, +- POS_MIN, SPOS_MAX, +- BTREE_ITER_ALL_SNAPSHOTS, ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, + NULL); + BUG_ON(ret); + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, +- POS_MIN, SPOS_MAX, +- BTREE_ITER_ALL_SNAPSHOTS, +- NULL); ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, NULL); + BUG_ON(ret); + } + +@@ -814,9 +813,8 @@ static int seq_delete(struct bch_fs *c, u64 nr) + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, +- POS_MIN, SPOS_MAX, +- BTREE_ITER_ALL_SNAPSHOTS, +- NULL); ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, NULL); + if (ret) + bch_err(c, "error in seq_delete: %i", ret); + return ret; +-- +cgit v1.2.3 + + +From aedb731ccf7db356f4dfac21583111d857fb6834 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Mar 2022 14:25:16 -0500 +Subject: bcachefs: bch2_journal_log_msg() + +This adds bch2_journal_log_msg(), which just logs a message to the +journal, and uses it to mark startup and when journal replay finishes. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 83 +++++++++++++++++++++++++++++++++++--------------- + fs/bcachefs/journal.h | 1 + + fs/bcachefs/recovery.c | 3 ++ + 3 files changed, 62 insertions(+), 25 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 8a2fc25bf6f6..4151be494888 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -631,31 +631,6 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) + return ret ?: ret2 < 0 ? 
ret2 : 0; + } + +-int bch2_journal_meta(struct journal *j) +-{ +- struct journal_buf *buf; +- struct journal_res res; +- int ret; +- +- memset(&res, 0, sizeof(res)); +- +- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); +- if (ret) +- return ret; +- +- buf = j->buf + (res.seq & JOURNAL_BUF_MASK); +- buf->must_flush = true; +- +- if (!buf->flush_time) { +- buf->flush_time = local_clock() ?: 1; +- buf->expires = jiffies; +- } +- +- bch2_journal_res_put(j, &res); +- +- return bch2_journal_flush_seq(j, res.seq); +-} +- + /* + * bch2_journal_flush_async - if there is an open journal entry, or a journal + * still being written, write it and wait for the write to complete +@@ -708,6 +683,64 @@ out: + return ret; + } + ++int bch2_journal_meta(struct journal *j) ++{ ++ struct journal_buf *buf; ++ struct journal_res res; ++ int ret; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ if (ret) ++ return ret; ++ ++ buf = j->buf + (res.seq & JOURNAL_BUF_MASK); ++ buf->must_flush = true; ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ ++ bch2_journal_res_put(j, &res); ++ ++ return bch2_journal_flush_seq(j, res.seq); ++} ++ ++int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) ++{ ++ struct jset_entry_log *entry; ++ struct journal_res res = { 0 }; ++ unsigned msglen, u64s; ++ va_list args; ++ int ret; ++ ++ va_start(args, fmt); ++ msglen = vsnprintf(NULL, 0, fmt, args) + 1; ++ va_end(args); ++ ++ u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64))); ++ ++ ret = bch2_journal_res_get(j, &res, u64s, 0); ++ if (ret) ++ return ret; ++ ++ entry = container_of(journal_res_entry(j, &res), ++ struct jset_entry_log, entry);; ++ memset(entry, 0, u64s * sizeof(u64)); ++ entry->entry.type = BCH_JSET_ENTRY_log; ++ entry->entry.u64s = u64s - 1; ++ ++ va_start(args, fmt); ++ vsnprintf(entry->d, INT_MAX, fmt, args); ++ va_end(args); ++ ++ bch2_journal_res_put(j, &res); ++ ++ return bch2_journal_flush_seq(j, res.seq); ++} ++ + /* block/unlock the journal: */ + + void bch2_journal_unblock(struct journal *j) +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 1bb0e00df44c..989c33157cd2 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -480,6 +480,7 @@ int bch2_journal_flush_seq(struct journal *, u64); + int bch2_journal_flush(struct journal *); + bool bch2_journal_noflush_seq(struct journal *, u64); + int bch2_journal_meta(struct journal *); ++int bch2_journal_log_msg(struct journal *, const char *, ...); + + void bch2_journal_halt(struct journal *); + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 6c4ffc5abdc5..887971559214 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -578,6 +578,9 @@ static int bch2_journal_replay(struct bch_fs *c) + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); + ret = bch2_journal_error(j); ++ ++ if (keys->nr && !ret) ++ bch2_journal_log_msg(&c->journal, "journal replay finished"); + err: + kvfree(keys_sorted); + return ret; +-- +cgit v1.2.3 + + +From 38d6f4627e2366640d024ec33c9dbd11e8cd6414 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Mar 2022 15:49:03 -0500 +Subject: bcachefs: Allocate journal buckets sequentially + +This tweaks __bch2_set_nr_journal_buckets() so that we aren't reversing +their order in the jorunal anymore - nice for rotating disks. 
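+
+Two details worth noting, sketched with hypothetical names: "a ?: b" is the
+GNU C shorthand for "a ? a : b", so the insertion point is the discard cursor
+when it is nonzero and the end of the array otherwise, and the insert itself
+is the usual memmove-style splice:
+
+  #include <stdint.h>
+  #include <string.h>
+
+  /* splice one element into a[0..*nr) at position "pos" */
+  static void array_insert_u64(uint64_t *a, unsigned *nr,
+                               unsigned pos, uint64_t v)
+  {
+          memmove(&a[pos + 1], &a[pos], (*nr - pos) * sizeof(a[0]));
+          a[pos] = v;
+          (*nr)++;
+  }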
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 4151be494888..eb556ecc511f 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -838,7 +838,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + * superblock before inserting into the journal array + */ + +- pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; ++ pos = ja->discard_idx ?: ja->nr; + __array_insert_item(ja->buckets, ja->nr, pos); + __array_insert_item(ja->bucket_seq, ja->nr, pos); + __array_insert_item(journal_buckets->buckets, ja->nr, pos); +-- +cgit v1.2.3 + + +From 383669899b0f0156a995cf390106a13496fc13cf Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Mar 2022 17:35:06 -0500 +Subject: bcachefs: Add a missing wakeup + +This fixes a rare bug with bch2_btree_flush_all_writes() getting stuck. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 4f0ad06a615a..e6cea4c687e1 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1626,6 +1626,8 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) + + if (new & (1U << BTREE_NODE_write_in_flight)) + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); ++ else ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); + } + + static void btree_node_write_done(struct bch_fs *c, struct btree *b) +@@ -2094,7 +2096,6 @@ restart: + rcu_read_unlock(); + wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + goto restart; +- + } + rcu_read_unlock(); + } +-- +cgit v1.2.3 + + +From 499d328fc019faa072538af5c0295eb02d99f0ce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Mar 2022 23:22:49 -0500 +Subject: bcachefs: Delay setting path->should_be_locked + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 780b77e72bb5..ba0372c1e3ed 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2392,10 +2392,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + iter->update_path = bch2_btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_INTENT, +- btree_iter_ip_allocated(iter)); +- +- BUG_ON(!(iter->update_path->nodes_locked & 1)); +- iter->update_path->should_be_locked = true; ++ _THIS_IP_); + } + + /* +@@ -2434,8 +2431,13 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + BUG_ON(!iter->path->nodes_locked); + out: + if (iter->update_path) { +- BUG_ON(!(iter->update_path->nodes_locked & 1)); +- iter->update_path->should_be_locked = true; ++ if (iter->update_path->uptodate && ++ !bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)) { ++ k = bkey_s_c_err(-EINTR); ++ } else { ++ BUG_ON(!(iter->update_path->nodes_locked & 1)); ++ iter->update_path->should_be_locked = true; ++ } + } + iter->path->should_be_locked = true; + +-- +cgit v1.2.3 + + +From 53d144ba09d9ceec63c8a3c8c43d2dad492fe321 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 11 Mar 2022 12:31:52 -0500 +Subject: bcachefs: bch2_btree_iter_peek_upto() + +In BTREE_ITER_FILTER_SNAPHOTS mode, we skip over keys in unrelated +snapshots. 
When we hit the end of an inode, if the next inode(s) are in +a different subvolume, we could potentially have to skip past many keys +before finding a key we can return to the caller, so they can terminate +the iteration. + +This adds a peek_upto() variant to solve this problem, to be used when +we know the range we're searching within. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 36 ++++++++++++++++++++++++++---------- + fs/bcachefs/btree_iter.h | 30 ++++++++++++++++++++++++++++-- + fs/bcachefs/btree_update_leaf.c | 5 +++-- + fs/bcachefs/dirent.c | 17 ++++++----------- + fs/bcachefs/fs.c | 5 ++--- + fs/bcachefs/inode.c | 4 ++-- + fs/bcachefs/str_hash.h | 21 +++++++-------------- + fs/bcachefs/xattr.c | 10 +++------- + 8 files changed, 77 insertions(+), 51 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index ba0372c1e3ed..0fcd056abeda 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -2342,11 +2342,12 @@ out: + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ +-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) + { + struct btree_trans *trans = iter->trans; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_s_c k; ++ struct bpos iter_pos; + int ret; + + if (iter->update_path) { +@@ -2362,6 +2363,24 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + if (!k.k || bkey_err(k)) + goto out; + ++ /* ++ * iter->pos should be mononotically increasing, and always be ++ * equal to the key we just returned - except extents can ++ * straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ iter_pos = k.k->p; ++ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter_pos = bkey_start_pos(k.k); ++ else ++ iter_pos = iter->pos; ++ ++ if (bkey_cmp(iter_pos, end) > 0) { ++ bch2_btree_iter_set_pos(iter, end); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ + if (iter->update_path && + bkey_cmp(iter->update_path->pos, k.k->p)) { + bch2_path_put(trans, iter->update_path, +@@ -2416,14 +2435,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + break; + } + +- /* +- * iter->pos should be mononotically increasing, and always be equal to +- * the key we just returned - except extents can straddle iter->pos: +- */ +- if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) +- iter->pos = k.k->p; +- else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) +- iter->pos = bkey_start_pos(k.k); ++ iter->pos = iter_pos; + + iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, + iter->flags & BTREE_ITER_INTENT, +@@ -2668,9 +2680,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + + if (iter->flags & BTREE_ITER_INTENT) { + struct btree_iter iter2; ++ struct bpos end = iter->pos; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ end.offset = U64_MAX; + + bch2_trans_copy_iter(&iter2, iter); +- k = bch2_btree_iter_peek(&iter2); ++ k = bch2_btree_iter_peek_upto(&iter2, end); + + if (k.k && !bkey_err(k)) { + iter->k = iter2.k; +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index d612aec91587..c6bb3c6d54a4 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -209,9 +209,14 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *); + struct btree *bch2_btree_iter_peek_node(struct btree_iter *); + struct btree *bch2_btree_iter_next_node(struct btree_iter *); + 
+-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); + struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + ++static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ return bch2_btree_iter_peek_upto(iter, SPOS_MAX); ++} ++ + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + +@@ -306,13 +311,26 @@ static inline int bkey_err(struct bkey_s_c k) + } + + static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, +- unsigned flags) ++ unsigned flags) + { + return flags & BTREE_ITER_SLOTS + ? bch2_btree_iter_peek_slot(iter) + : bch2_btree_iter_peek(iter); + } + ++static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, ++ struct bpos end, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_ITER_SLOTS)) ++ return bch2_btree_iter_peek_upto(iter, end); ++ ++ if (bkey_cmp(iter->pos, end) > 0) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ + static inline int btree_trans_too_many_iters(struct btree_trans *trans) + { + return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 +@@ -349,6 +367,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + ++#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ ++ _start, _end, _flags, _k, _ret) \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ + #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ + for (; \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index ab05b2b74171..e9eb79b51ece 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1223,7 +1223,7 @@ int bch2_trans_update_extent(struct btree_trans *trans, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); +- k = bch2_btree_iter_peek(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) +@@ -1375,7 +1375,8 @@ nomerge1: + goto out; + } + next: +- k = bch2_btree_iter_next(&iter); ++ bch2_btree_iter_advance(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index a43a24409d37..760e4f74715f 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -470,16 +470,13 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) + if (ret) + return ret; + +- for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents, +- SPOS(dir.inum, 0, snapshot), 0, k, ret) { +- if (k.k->p.inode > dir.inum) +- break; +- ++ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, ++ SPOS(dir.inum, 0, snapshot), ++ POS(dir.inum, U64_MAX), 0, k, ret) + if (k.k->type == KEY_TYPE_dirent) { + ret = -ENOTEMPTY; + break; + } +- } + bch2_trans_iter_exit(trans, &iter); + + return ret; +@@ -503,11 +500,9 @@ retry: + if (ret) + goto err; + +- for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents, +- SPOS(inum.inum, ctx->pos, snapshot), 0, k, 
ret) { +- if (k.k->p.inode > inum.inum) +- break; +- ++ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, ++ SPOS(inum.inum, ctx->pos, snapshot), ++ POS(inum.inum, U64_MAX), 0, k, ret) { + if (k.k->type != KEY_TYPE_dirent) + continue; + +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 73c4177cb4f2..9fc6c39eacdb 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -935,9 +935,8 @@ retry: + SPOS(ei->v.i_ino, start, snapshot), 0); + + while (!(ret = btree_trans_too_many_iters(&trans)) && +- (k = bch2_btree_iter_peek(&iter)).k && +- !(ret = bkey_err(k)) && +- bkey_cmp(iter.pos, end) < 0) { ++ (k = bch2_btree_iter_peek_upto(&iter, end)).k && ++ !(ret = bkey_err(k))) { + enum btree_id data_btree = BTREE_ID_extents; + + if (!bkey_extent_is_data(k.k) && +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 78e2db6c938b..14b0b595202d 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -606,12 +606,12 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, + + bch2_btree_iter_set_snapshot(&iter, snapshot); + +- k = bch2_btree_iter_peek(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + ret = bkey_err(k); + if (ret) + goto err; + +- if (!k.k || iter.pos.inode != inum.inum) ++ if (!k.k) + break; + + bkey_init(&delete.k); +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 57d636740d2f..591bbb9f8beb 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -163,12 +163,10 @@ bch2_hash_lookup(struct btree_trans *trans, + if (ret) + return ret; + +- for_each_btree_key_norestart(trans, *iter, desc.btree_id, ++ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|flags, k, ret) { +- if (iter->pos.inode != inum.inum) +- break; +- + if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_key(k, key)) + return 0; +@@ -199,15 +197,12 @@ bch2_hash_hole(struct btree_trans *trans, + if (ret) + return ret; + +- for_each_btree_key_norestart(trans, *iter, desc.btree_id, ++ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), +- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { +- if (iter->pos.inode != inum.inum) +- break; +- ++ POS(inum.inum, U64_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) + if (!is_visible_key(desc, inum, k)) + return 0; +- } + bch2_trans_iter_exit(trans, iter); + + return ret ?: -ENOSPC; +@@ -260,14 +255,12 @@ int bch2_hash_set(struct btree_trans *trans, + if (ret) + return ret; + +- for_each_btree_key_norestart(trans, iter, desc.btree_id, ++ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, + SPOS(inum.inum, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), ++ POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { +- if (iter.pos.inode != inum.inum) +- break; +- + if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) + goto found; +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index c2e9520a0457..1c680b16b924 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -311,13 +311,9 @@ retry: + if (ret) + goto err; + +- for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs, +- SPOS(inum, offset, snapshot), 0, k, ret) { +- BUG_ON(k.k->p.inode < inum); +- +- if (k.k->p.inode > inum) +- break; +- ++ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, ++ SPOS(inum, offset, snapshot), ++ POS(inum, U64_MAX), 0, 
k, ret) { + if (k.k->type != KEY_TYPE_xattr) + continue; + +-- +cgit v1.2.3 + + +From 95ab743ace8bd665b5af54b2c65f22d31ddda027 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 11 Mar 2022 18:16:42 -0500 +Subject: bcachefs: Drop !did_work path from do_btree_insert_one() + +As we've already reserved space in the journal this optimization doesn't +actually buy us anything, and when doing list_journal debugging it +deletes information we want. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 16 ++++++---------- + 1 file changed, 6 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e9eb79b51ece..f24c0f65e1fa 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -213,7 +213,7 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, + /** + * btree_insert_key - insert a key one key into a leaf node + */ +-static bool btree_insert_key_leaf(struct btree_trans *trans, ++static void btree_insert_key_leaf(struct btree_trans *trans, + struct btree_insert_entry *insert) + { + struct bch_fs *c = trans->c; +@@ -226,7 +226,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + + if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, + &insert_l(insert)->iter, insert->k))) +- return false; ++ return; + + i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, + le64_to_cpu(i->journal_seq))); +@@ -247,8 +247,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); +- +- return true; + } + + /* Cached btree updates: */ +@@ -400,18 +398,16 @@ static inline void do_btree_insert_one(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; +- bool did_work; + + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); + + i->k->k.needs_whiteout = false; + +- did_work = !i->cached +- ? btree_insert_key_leaf(trans, i) +- : bch2_btree_insert_key_cached(trans, i->path, i->k); +- if (!did_work) +- return; ++ if (!i->cached) ++ btree_insert_key_leaf(trans, i); ++ else ++ bch2_btree_insert_key_cached(trans, i->path, i->k); + + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + bch2_journal_add_keys(j, &trans->journal_res, +-- +cgit v1.2.3 + + +From 876c30c8a6a9830002a674fa84ffa28d7bd490df Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 2 Mar 2022 22:18:56 -0500 +Subject: bcachefs: bch2_trans_inconsistent() + +Add a new error macro that also dumps transaction updates in addition to +doing an emergency shutdown - when a transaction update discovers or is +causing a fs inconsistency, it's helpful to see what updates it was +doing. 
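
Illustrative sketch, not part of the patch: the intended call pattern, mirroring the
conversion made in bch2_mark_alloc() in the hunk below; new_u, ca and trans come from
the enclosing trigger.

    /*
     * In a trigger: report the inconsistency, do the emergency shutdown,
     * and dump the transaction's pending updates in a single call:
     */
    if (bch2_trans_inconsistent_on(new_u.bucket < ca->mi.first_bucket ||
                                   new_u.bucket >= ca->mi.nbuckets, trans,
                                   "alloc key outside range of device's buckets"))
            return -EIO;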
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 53 +++++++++++++++++++++++++++++------------------- + fs/bcachefs/btree_iter.h | 1 + + fs/bcachefs/buckets.c | 32 ++++++++++++++--------------- + fs/bcachefs/error.h | 20 ++++++++++++++++++ + 4 files changed, 68 insertions(+), 38 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 0fcd056abeda..6d61d7cacfdc 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1813,17 +1813,42 @@ free: + } + + noinline __cold +-void bch2_dump_trans_paths_updates(struct btree_trans *trans) ++void bch2_dump_trans_updates(struct btree_trans *trans) + { +- struct btree_path *path; + struct btree_insert_entry *i; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ ++ bch_err(trans->c, "transaction updates:"); ++ ++ trans_for_each_update(trans, i) { ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ bch2_bkey_val_to_text(&buf1, trans->c, old); ++ bch2_bkey_val_to_text(&buf2, trans->c, bkey_i_to_s_c(i->k)); ++ ++ printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", ++ bch2_btree_ids[i->btree_id], ++ (void *) i->ip_allocated, ++ buf1.buf, buf2.buf); ++ } ++ ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++} ++ ++noinline __cold ++void bch2_dump_trans_paths_updates(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ struct printbuf buf = PRINTBUF; + unsigned idx; + + trans_for_each_path_inorder(trans, path, idx) { +- printbuf_reset(&buf1); ++ printbuf_reset(&buf); + +- bch2_bpos_to_text(&buf1, path->pos); ++ bch2_bpos_to_text(&buf, path->pos); + + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", + path->idx, path->ref, path->intent_ref, +@@ -1831,7 +1856,7 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + path->preserve ? 
" P" : "", + bch2_btree_ids[path->btree_id], + path->level, +- buf1.buf, ++ buf.buf, + path->nodes_locked, + #ifdef CONFIG_BCACHEFS_DEBUG + (void *) path->ip_allocated +@@ -1841,23 +1866,9 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) + ); + } + +- trans_for_each_update(trans, i) { +- struct bkey u; +- struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); ++ printbuf_exit(&buf); + +- printbuf_reset(&buf1); +- printbuf_reset(&buf2); +- bch2_bkey_val_to_text(&buf1, trans->c, old); +- bch2_bkey_val_to_text(&buf2, trans->c, bkey_i_to_s_c(i->k)); +- +- printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", +- bch2_btree_ids[i->btree_id], +- (void *) i->ip_allocated, +- buf1.buf, buf2.buf); +- } +- +- printbuf_exit(&buf2); +- printbuf_exit(&buf1); ++ bch2_dump_trans_updates(trans); + } + + static struct btree_path *btree_path_alloc(struct btree_trans *trans, +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index c6bb3c6d54a4..f039cbe4ee51 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -389,6 +389,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + + /* new multiple iterator interface: */ + ++void bch2_dump_trans_updates(struct btree_trans *); + void bch2_dump_trans_paths_updates(struct btree_trans *); + void __bch2_trans_init(struct btree_trans *, struct bch_fs *, + unsigned, size_t, const char *); +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 2c3b71b2f04e..0a796f41f674 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -506,11 +506,16 @@ static int bch2_mark_alloc(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); + struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); +- struct bch_dev *ca; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, new_u.dev); + struct bucket *g; + struct bucket_mark old_m, m; + int ret = 0; + ++ if (bch2_trans_inconsistent_on(new_u.bucket < ca->mi.first_bucket || ++ new_u.bucket >= ca->mi.nbuckets, trans, ++ "alloc key outside range of device's buckets")) ++ return -EIO; ++ + /* + * alloc btree is read in by bch2_alloc_read, not gc: + */ +@@ -550,11 +555,6 @@ static int bch2_mark_alloc(struct btree_trans *trans, + } + } + +- ca = bch_dev_bkey_exists(c, new_u.dev); +- +- if (new_u.bucket >= ca->mi.nbuckets) +- return 0; +- + percpu_down_read(&c->mark_lock); + if (!gc && new_u.gen != old_u.gen) + *bucket_gen(ca, new_u.bucket) = new_u.gen; +@@ -1462,7 +1462,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) + { +- struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_stripe *s; +@@ -1478,16 +1477,15 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + goto err; + + if (k.k->type != KEY_TYPE_stripe) { +- bch2_fs_inconsistent(c, ++ bch2_trans_inconsistent(trans, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx); +- bch2_inconsistent_error(c); + ret = -EIO; + goto err; + } + + if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { +- bch2_fs_inconsistent(c, ++ bch2_trans_inconsistent(trans, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); + ret = -EIO; +@@ -1601,8 +1599,8 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + goto err; + + if (!deleting) { +- if (bch2_fs_inconsistent_on(u.stripe || +- u.stripe_redundancy, c, ++ if (bch2_trans_inconsistent_on(u.stripe || ++ 
u.stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", + iter.pos.inode, iter.pos.offset, u.gen, + bch2_data_types[u.data_type], +@@ -1612,7 +1610,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + goto err; + } + +- if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c, ++ if (bch2_trans_inconsistent_on(data_type && u.dirty_sectors, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", + iter.pos.inode, iter.pos.offset, u.gen, + bch2_data_types[u.data_type], +@@ -1625,8 +1623,8 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + u.stripe = s.k->p.offset; + u.stripe_redundancy = s.v->nr_redundant; + } else { +- if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset || +- u.stripe_redundancy != s.v->nr_redundant, c, ++ if (bch2_trans_inconsistent_on(u.stripe != s.k->p.offset || ++ u.stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", + iter.pos.inode, iter.pos.offset, u.gen, + s.k->p.offset, u.stripe)) { +@@ -1787,7 +1785,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + refcount = bkey_refcount(n); + if (!refcount) { + bch2_bkey_val_to_text(&buf, c, p.s_c); +- bch2_fs_inconsistent(c, ++ bch2_trans_inconsistent(trans, + "nonexistent indirect extent at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; +@@ -1796,7 +1794,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_bkey_val_to_text(&buf, c, p.s_c); +- bch2_fs_inconsistent(c, ++ bch2_trans_inconsistent(trans, + "indirect extent refcount underflow at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index 4ab3cfe1292c..6e63c38186f3 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -66,6 +66,26 @@ do { \ + _ret; \ + }) + ++/* ++ * When a transaction update discovers or is causing a fs inconsistency, it's ++ * helpful to also dump the pending updates: ++ */ ++#define bch2_trans_inconsistent(trans, ...) \ ++({ \ ++ bch_err(trans->c, __VA_ARGS__); \ ++ bch2_inconsistent_error(trans->c); \ ++ bch2_dump_trans_updates(trans); \ ++}) ++ ++#define bch2_trans_inconsistent_on(cond, trans, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_trans_inconsistent(trans, __VA_ARGS__); \ ++ _ret; \ ++}) ++ + /* + * Fsck errors: inconsistency errors we detect at mount time, and should ideally + * be able to repair: +-- +cgit v1.2.3 + + +From 00cffb2e6686a872d1f504b4f1c46ab882bfb9b4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 11 Mar 2022 18:38:24 -0500 +Subject: bcachefs: bch2_trans_updates_to_text() + +This turns bch2_dump_trans_updates() into a to_text() method - this way +it can be used by debug tracing. 
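
Illustrative sketch, not part of the patch: any debug or tracing consumer that owns a
printbuf can now reuse the rendering. The hook name below is hypothetical; its body
mirrors the rewritten bch2_dump_trans_updates() in the hunk that follows.

    static void debug_hook(struct btree_trans *trans)
    {
            struct printbuf buf = PRINTBUF;

            bch2_trans_updates_to_text(&buf, trans);
            /* hand buf.buf to a tracepoint, a debugfs read, a log line, ... */
            printbuf_exit(&buf);
    }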
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 41 +++++++++++++++++++++++++++-------------- + fs/bcachefs/btree_iter.h | 1 + + 2 files changed, 28 insertions(+), 14 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 6d61d7cacfdc..56c493c95d3a 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1812,30 +1812,43 @@ free: + __bch2_path_free(trans, path); + } + +-noinline __cold +-void bch2_dump_trans_updates(struct btree_trans *trans) ++void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) + { + struct btree_insert_entry *i; +- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + +- bch_err(trans->c, "transaction updates:"); ++ pr_buf(buf, "transaction updates for %s journal seq %llu", ++ trans->fn, trans->journal_res.seq); ++ pr_newline(buf); ++ pr_indent_push(buf, 2); + + trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + +- printbuf_reset(&buf1); +- printbuf_reset(&buf2); +- bch2_bkey_val_to_text(&buf1, trans->c, old); +- bch2_bkey_val_to_text(&buf2, trans->c, bkey_i_to_s_c(i->k)); +- +- printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", ++ pr_buf(buf, "update: btree %s %pS", + bch2_btree_ids[i->btree_id], +- (void *) i->ip_allocated, +- buf1.buf, buf2.buf); ++ (void *) i->ip_allocated); ++ pr_newline(buf); ++ ++ pr_buf(buf, " old "); ++ bch2_bkey_val_to_text(buf, trans->c, old); ++ pr_newline(buf); ++ ++ pr_buf(buf, " new "); ++ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); ++ pr_newline(buf); + } + +- printbuf_exit(&buf2); +- printbuf_exit(&buf1); ++ pr_indent_pop(buf, 2); ++} ++ ++noinline __cold ++void bch2_dump_trans_updates(struct btree_trans *trans) ++{ ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_trans_updates_to_text(&buf, trans); ++ bch_err(trans->c, "%s", buf.buf); ++ printbuf_exit(&buf); + } + + noinline __cold +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index f039cbe4ee51..f6700295e1a7 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -389,6 +389,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + + /* new multiple iterator interface: */ + ++void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); + void bch2_dump_trans_updates(struct btree_trans *); + void bch2_dump_trans_paths_updates(struct btree_trans *); + void __bch2_trans_init(struct btree_trans *, struct bch_fs *, +-- +cgit v1.2.3 + + +From 257611aeef5ca26db1f9bad94a5226e32f3871d6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 12 Mar 2022 16:14:55 -0500 +Subject: bcachefs: Revalidate pointer to old bkey val before calling mem + triggers + +We recently started stashing a copy of the key being overwritten in +btree_insert_entry: this is helpful for avoiding multiple calls to +bch2_btree_path_peek_slot() and bch2_journal_keys_peek() in the +transaction commit path. + +But it turns out this has a problem - when we run mem/atomic triggers, +we've done a couple things that can invalidate the pointer to the old +key's value. This makes the optimization of stashing a pointer to the +old value questionable, but for now this patch revalidates that pointer +before running mem triggers. 
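
Condensed from the hunk below, with the reasoning spelled out as comments, the fix is
to re-derive the stashed pointer right before the mem triggers run:

    /*
     * The stashed old_v may have been invalidated by a btree node split,
     * a resort in btree_node_prep_for_write(), or a key cache reallocation,
     * so read it again through the btree path:
     */
    i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;

    /*
     * Before journal replay has finished, the authoritative old value may
     * still live in the journal keys:
     */
    if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) {
            struct bkey_i *j_k =
                    bch2_journal_keys_peek(c, i->btree_id, i->level, i->k->k.p);

            if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) {
                    i->old_k = j_k->k;
                    i->old_v = &j_k->v;
            }
    }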
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 26 ++++++++++++++++++++++++++ + 1 file changed, 26 insertions(+) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index f24c0f65e1fa..2d49127f946d 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -654,6 +654,32 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + + if (btree_node_type_needs_gc(i->bkey_type)) + marking = true; ++ ++ /* ++ * Revalidate before calling mem triggers - XXX, ugly: ++ * ++ * - successful btree node splits don't cause transaction ++ * restarts and will have invalidated the pointer to the bkey ++ * value ++ * - btree_node_lock_for_insert() -> btree_node_prep_for_write() ++ * when it has to resort ++ * - btree_key_can_insert_cached() when it has to reallocate ++ * ++ * Ugly because we currently have no way to tell if the ++ * pointer's been invalidated, which means it's debatabale ++ * whether we should be stashing the old key at all. ++ */ ++ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; ++ ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek(c, i->btree_id, i->level, i->k->k.p); ++ ++ if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } + } + + /* +-- +cgit v1.2.3 + + +From fe0c2c9c525b4b1ea92be8eee4e5b4c4f4f4cc44 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Mar 2022 00:26:52 -0500 +Subject: bcachefs: Move trigger fns to bkey_ops + +This replaces the switch statements in bch2_mark_key(), +bch2_trans_mark_key() with new bkey methods - prep work for the next +patch, which fixes BTREE_TRIGGER_WANTS_OLD_AND_NEW. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.h | 3 + + fs/bcachefs/bkey_methods.h | 27 ++++++++ + fs/bcachefs/buckets.c | 136 ++++++++++++++--------------------------- + fs/bcachefs/buckets.h | 13 ++++ + fs/bcachefs/ec.h | 2 + + fs/bcachefs/extents.h | 8 +++ + fs/bcachefs/inode.h | 4 ++ + fs/bcachefs/reflink.h | 6 +- + 8 files changed, 107 insertions(+), 92 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 98c7866e20b5..3eaa6d204286 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -65,16 +65,19 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + #define bch2_bkey_ops_alloc (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ .atomic_trigger = bch2_mark_alloc, \ + } + + #define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ .atomic_trigger = bch2_mark_alloc, \ + } + + #define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ .atomic_trigger = bch2_mark_alloc, \ + } + + static inline bool bkey_is_alloc(const struct bkey *k) +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +index 4fdac545cf88..2289a09d98fc 100644 +--- a/fs/bcachefs/bkey_methods.h ++++ b/fs/bcachefs/bkey_methods.h +@@ -6,6 +6,7 @@ + + struct bch_fs; + struct btree; ++struct btree_trans; + struct bkey; + enum btree_node_type; + +@@ -20,6 +21,10 @@ struct bkey_ops { + void (*swab)(struct bkey_s); + bool (*key_normalize)(struct bch_fs *, struct bkey_s); + bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ int 
(*trans_trigger)(struct btree_trans *, struct bkey_s_c, ++ struct bkey_i *, unsigned); ++ int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, ++ struct bkey_s_c, unsigned); + void (*compat)(enum btree_id id, unsigned version, + unsigned big_endian, int write, + struct bkey_s); +@@ -57,6 +62,28 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b + + bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + ++static inline int bch2_mark_key(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_s_c new, ++ unsigned flags) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; ++ ++ return ops->atomic_trigger ++ ? ops->atomic_trigger(trans, old, new, flags) ++ : 0; ++} ++ ++static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, ++ struct bkey_i *new, unsigned flags) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; ++ ++ return ops->trans_trigger ++ ? ops->trans_trigger(trans, old, new, flags) ++ : 0; ++} ++ + void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); + + void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 0a796f41f674..d52263759ee5 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -497,9 +497,9 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + BUG_ON(owned_by_allocator == old.owned_by_allocator); + } + +-static int bch2_mark_alloc(struct btree_trans *trans, +- struct bkey_s_c old, struct bkey_s_c new, +- unsigned flags) ++int bch2_mark_alloc(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; +@@ -929,9 +929,9 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, + return 0; + } + +-static int bch2_mark_extent(struct btree_trans *trans, +- struct bkey_s_c old, struct bkey_s_c new, +- unsigned flags) ++int bch2_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) + { + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; +@@ -1011,9 +1011,9 @@ static int bch2_mark_extent(struct btree_trans *trans, + return 0; + } + +-static int bch2_mark_stripe(struct btree_trans *trans, +- struct bkey_s_c old, struct bkey_s_c new, +- unsigned flags) ++int bch2_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) + { + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; +@@ -1118,9 +1118,9 @@ static int bch2_mark_stripe(struct btree_trans *trans, + return 0; + } + +-static int bch2_mark_inode(struct btree_trans *trans, +- struct bkey_s_c old, struct bkey_s_c new, +- unsigned flags) ++int bch2_mark_inode(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) + { + struct bch_fs *c = trans->c; + struct bch_fs_usage __percpu *fs_usage; +@@ -1149,9 +1149,9 @@ static int bch2_mark_inode(struct btree_trans *trans, + return 0; + } + +-static int bch2_mark_reservation(struct btree_trans *trans, +- struct bkey_s_c old, struct bkey_s_c new, +- unsigned flags) ++int bch2_mark_reservation(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) + { + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; +@@ -1228,9 +1228,9 @@ fsck_err: + return ret; + } + +-static int bch2_mark_reflink_p(struct btree_trans *trans, +- struct bkey_s_c old, struct bkey_s_c new, +- unsigned flags) ++int bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) + { + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; +@@ -1267,39 +1267,6 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, + return ret; + } + +-int bch2_mark_key(struct btree_trans *trans, +- struct bkey_s_c old, +- struct bkey_s_c new, +- unsigned flags) +-{ +- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; +- +- switch (k.k->type) { +- case KEY_TYPE_alloc: +- case KEY_TYPE_alloc_v2: +- case KEY_TYPE_alloc_v3: +- return bch2_mark_alloc(trans, old, new, flags); +- case KEY_TYPE_btree_ptr: +- case KEY_TYPE_btree_ptr_v2: +- case KEY_TYPE_extent: +- case KEY_TYPE_reflink_v: +- return bch2_mark_extent(trans, old, new, flags); +- case KEY_TYPE_stripe: +- return bch2_mark_stripe(trans, old, new, flags); +- case KEY_TYPE_inode: +- case KEY_TYPE_inode_v2: +- return bch2_mark_inode(trans, old, new, flags); +- case KEY_TYPE_reservation: +- return bch2_mark_reservation(trans, old, new, flags); +- case KEY_TYPE_reflink_p: +- return bch2_mark_reflink_p(trans, old, new, flags); +- case KEY_TYPE_snapshot: +- return bch2_mark_snapshot(trans, old, new, flags); +- default: +- return 0; +- } +-} +- + static noinline __cold + void fs_usage_apply_warn(struct btree_trans *trans, + unsigned disk_res_sectors, +@@ -1514,10 +1481,14 @@ err: + return ret; + } + +-static int bch2_trans_mark_extent(struct btree_trans *trans, +- struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) + { + struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; +@@ -1648,9 +1619,9 @@ err: + return ret; + } + +-static int bch2_trans_mark_stripe(struct btree_trans *trans, +- struct bkey_s_c old, struct bkey_i *new, +- unsigned flags) ++int bch2_trans_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) + { + const struct bch_stripe *old_s = NULL; + struct bch_stripe *new_s = NULL; +@@ -1718,10 +1689,10 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, + return ret; + } + +-static int bch2_trans_mark_inode(struct btree_trans *trans, +- struct bkey_s_c old, +- struct bkey_i *new, +- unsigned flags) ++int bch2_trans_mark_inode(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) + { + int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); + +@@ -1734,9 +1705,14 @@ static int bch2_trans_mark_inode(struct btree_trans *trans, + return 0; + } + +-static int bch2_trans_mark_reservation(struct btree_trans *trans, +- struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_reservation(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) + { ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? 
old ++ : bkey_i_to_s_c(new); + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + struct replicas_delta_list *d; +@@ -1835,9 +1811,14 @@ err: + return ret; + } + +-static int bch2_trans_mark_reflink_p(struct btree_trans *trans, +- struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) + { ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx, end_idx; + int ret = 0; +@@ -1858,33 +1839,6 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + return ret; + } + +-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, +- struct bkey_i *new, unsigned flags) +-{ +- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE +- ? old +- : bkey_i_to_s_c(new); +- +- switch (k.k->type) { +- case KEY_TYPE_btree_ptr: +- case KEY_TYPE_btree_ptr_v2: +- case KEY_TYPE_extent: +- case KEY_TYPE_reflink_v: +- return bch2_trans_mark_extent(trans, k, flags); +- case KEY_TYPE_stripe: +- return bch2_trans_mark_stripe(trans, old, new, flags); +- case KEY_TYPE_inode: +- case KEY_TYPE_inode_v2: +- return bch2_trans_mark_inode(trans, old, new, flags); +- case KEY_TYPE_reservation: +- return bch2_trans_mark_reservation(trans, k, flags); +- case KEY_TYPE_reflink_p: +- return bch2_trans_mark_reflink_p(trans, k, flags); +- default: +- return 0; +- } +-} +- + static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index daf79a4f9128..392e03d4c319 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -229,6 +229,19 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + ++int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++ ++int bch2_trans_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++ + int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); + + int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 78d468c7680a..9d508a2f3bbc 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -14,6 +14,8 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + .key_invalid = bch2_stripe_invalid, \ + .val_to_text = bch2_stripe_to_text, \ + .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_stripe, \ ++ 
.atomic_trigger = bch2_mark_stripe, \ + } + + static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 9c2567274a2b..ae650849d98a 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -381,6 +381,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + .key_invalid = bch2_btree_ptr_invalid, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ + } + + #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ +@@ -388,6 +390,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ + } + + /* KEY_TYPE_extent: */ +@@ -402,6 +406,8 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_extent_normalize, \ + .key_merge = bch2_extent_merge, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ + } + + /* KEY_TYPE_reservation: */ +@@ -414,6 +420,8 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + .key_invalid = bch2_reservation_invalid, \ + .val_to_text = bch2_reservation_to_text, \ + .key_merge = bch2_reservation_merge, \ ++ .trans_trigger = bch2_trans_mark_reservation, \ ++ .atomic_trigger = bch2_mark_reservation, \ + } + + /* Extent checksum entries: */ +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 77957cc7f9dd..2337ecfc600e 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -13,11 +13,15 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + #define bch2_bkey_ops_inode (struct bkey_ops) { \ + .key_invalid = bch2_inode_invalid, \ + .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ + } + + #define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ + .key_invalid = bch2_inode_v2_invalid, \ + .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ + } + + static inline bool bkey_is_inode(const struct bkey *k) +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index 3745873fd88d..4da4330014a8 100644 +--- a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -10,7 +10,9 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + #define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ +- .key_merge = bch2_reflink_p_merge, \ ++ .key_merge = bch2_reflink_p_merge, \ ++ .trans_trigger = bch2_trans_mark_reflink_p, \ ++ .atomic_trigger = bch2_mark_reflink_p, \ + } + + const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); +@@ -21,6 +23,8 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ + } + + const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, +-- +cgit v1.2.3 + + +From 1027c57127ac3a84955f66557dfff56d0701e5cc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Mar 2022 00:30:16 
-0500 +Subject: bcachefs: Fix BTREE_TRIGGER_WANTS_OLD_AND_NEW + +BTREE_TRIGGER_WANTS_OLD_AND_NEW didn't work correctly when the old and +new key were both alloc keys, but different versions - it required old +and new key type to be identical, and this bug is a problem for the new +allocator rewrite. + +This patch fixes it by checking if the old and new key have the same +trigger functions - the different versions of alloc (and inode) keys +have the same trigger functions. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 2d49127f946d..c5d2436d540f 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -436,7 +436,8 @@ static int run_one_mem_trigger(struct btree_trans *trans, + if (!btree_node_type_needs_gc(i->btree_id)) + return 0; + +- if (old.k->type == new->k.type && ++ if (bch2_bkey_ops[old.k->type].atomic_trigger == ++ bch2_bkey_ops[i->k->k.type].atomic_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); +@@ -487,7 +488,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ + + if (overwrite) { + ret = bch2_trans_mark_old(trans, old, i->flags); +- } else if (old.k->type == i->k->k.type && ++ } else if (bch2_bkey_ops[old.k->type].trans_trigger == ++ bch2_bkey_ops[i->k->k.type].trans_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + ret = bch2_trans_mark_key(trans, old, i->k, +-- +cgit v1.2.3 + + +From 756895e0caa2f1f2497660e1429a7eb83265a366 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Mar 2022 16:43:52 -0500 +Subject: bcachefs: bch_sb_field_journal_v2 + +Add a new superblock field which represents journal buckets as ranges: +also move code for the superblock journal fields to journal_sb.c. + +This also reworks the code for resizing the journal to write the new +superblock before using the new journal buckets, and thus be a bit +safer. 
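
Illustrative example, not part of the patch: the new field is a run-length style
encoding of the bucket list, so a device whose journal occupies buckets 20-22 and
40-41 needs only two (start, nr) entries instead of five u64s. The numbers are made up:

    /* hypothetical device: journal in buckets 20, 21, 22, 40, 41 */
    struct bch_sb_field_journal_v2_entry example[] = {
            { .start = cpu_to_le64(20), .nr = cpu_to_le64(3) },  /* buckets 20..22 */
            { .start = cpu_to_le64(40), .nr = cpu_to_le64(2) },  /* buckets 40..41 */
    };

bch2_journal_buckets_to_sb() below builds these entries by merging runs of contiguous
buckets, and bch2_dev_journal_init() expands them back into the flat ja->buckets array.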
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/Makefile | 1 + + fs/bcachefs/bcachefs_format.h | 38 ++++++-- + fs/bcachefs/journal.c | 176 ++++++++++++++++++++------------- + fs/bcachefs/journal_sb.c | 222 ++++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/journal_sb.h | 24 +++++ + fs/bcachefs/super-io.c | 82 +--------------- + fs/bcachefs/super-io.h | 9 -- + 7 files changed, 387 insertions(+), 165 deletions(-) + create mode 100644 fs/bcachefs/journal_sb.c + create mode 100644 fs/bcachefs/journal_sb.h + +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index cf29fdaadc5b..5047c919374e 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -38,6 +38,7 @@ bcachefs-y := \ + journal.o \ + journal_io.o \ + journal_reclaim.o \ ++ journal_sb.o \ + journal_seq_blacklist.o \ + keylist.o \ + migrate.o \ +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 5153f0e42054..e74100bf53b0 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1023,16 +1023,17 @@ struct bch_sb_field { + __le32 type; + }; + +-#define BCH_SB_FIELDS() \ +- x(journal, 0) \ +- x(members, 1) \ +- x(crypt, 2) \ +- x(replicas_v0, 3) \ +- x(quota, 4) \ +- x(disk_groups, 5) \ +- x(clean, 6) \ +- x(replicas, 7) \ +- x(journal_seq_blacklist, 8) ++#define BCH_SB_FIELDS() \ ++ x(journal, 0) \ ++ x(members, 1) \ ++ x(crypt, 2) \ ++ x(replicas_v0, 3) \ ++ x(quota, 4) \ ++ x(disk_groups, 5) \ ++ x(clean, 6) \ ++ x(replicas, 7) \ ++ x(journal_seq_blacklist, 8) \ ++ x(journal_v2, 9) + + enum bch_sb_field_type { + #define x(f, nr) BCH_SB_FIELD_##f = nr, +@@ -1041,6 +1042,14 @@ enum bch_sb_field_type { + BCH_SB_FIELD_NR + }; + ++/* ++ * Most superblock fields are replicated in all device's superblocks - a few are ++ * not: ++ */ ++#define BCH_SINGLE_DEVICE_SB_FIELDS \ ++ ((1U << BCH_SB_FIELD_journal)| \ ++ (1U << BCH_SB_FIELD_journal_v2)) ++ + /* BCH_SB_FIELD_journal: */ + + struct bch_sb_field_journal { +@@ -1048,6 +1057,15 @@ struct bch_sb_field_journal { + __le64 buckets[0]; + }; + ++struct bch_sb_field_journal_v2 { ++ struct bch_sb_field field; ++ ++ struct bch_sb_field_journal_v2_entry { ++ __le64 start; ++ __le64 nr; ++ } d[0]; ++}; ++ + /* BCH_SB_FIELD_members: */ + + #define BCH_MIN_NR_NBUCKETS (1 << 6) +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index eb556ecc511f..de503dbb0f96 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -15,8 +15,8 @@ + #include "journal.h" + #include "journal_io.h" + #include "journal_reclaim.h" ++#include "journal_sb.h" + #include "journal_seq_blacklist.h" +-#include "super-io.h" + + #include + +@@ -768,28 +768,55 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + { + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; +- struct bch_sb_field_journal *journal_buckets; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; ++ struct open_bucket **ob = NULL; ++ long *bu = NULL; ++ unsigned i, nr_got = 0, nr_want = nr - ja->nr; ++ unsigned old_nr = ja->nr; ++ unsigned old_discard_idx = ja->discard_idx; ++ unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; ++ unsigned old_dirty_idx = ja->dirty_idx; ++ unsigned old_cur_idx = ja->cur_idx; + int ret = 0; + +- /* don't handle reducing nr of buckets yet: */ +- if (nr <= ja->nr) +- return 0; ++ if (c) { ++ bch2_journal_block(&c->journal); ++ bch2_journal_flush_all_pins(&c->journal); ++ } + ++ bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); ++ ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); + 
new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); +- if (!new_buckets || !new_bucket_seq) { ++ if (!bu || !ob || !new_buckets || !new_bucket_seq) { + ret = -ENOMEM; +- goto err; ++ goto err_unblock; + } + +- journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, +- nr + sizeof(*journal_buckets) / sizeof(u64)); +- if (!journal_buckets) { +- ret = -ENOSPC; +- goto err; ++ for (nr_got = 0; nr_got < nr_want; nr_got++) { ++ if (new_fs) { ++ bu[nr_got] = bch2_bucket_alloc_new_fs(ca); ++ if (bu[nr_got] < 0) { ++ ret = -ENOSPC; ++ break; ++ } ++ } else { ++ rcu_read_lock(); ++ ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_NONE, ++ false, cl); ++ rcu_read_unlock(); ++ if (IS_ERR(ob[nr_got])) { ++ ret = cl ? -EAGAIN : -ENOSPC; ++ break; ++ } ++ ++ bu[nr_got] = ob[nr_got]->bucket; ++ } + } + ++ if (!nr_got) ++ goto err_unblock; ++ + /* + * We may be called from the device add path, before the new device has + * actually been added to the running filesystem: +@@ -802,51 +829,16 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + +- if (!new_fs) +- spin_unlock(&c->journal.lock); +- +- while (ja->nr < nr) { +- struct open_bucket *ob = NULL; +- unsigned pos; +- long b; +- +- if (new_fs) { +- b = bch2_bucket_alloc_new_fs(ca); +- if (b < 0) { +- ret = -ENOSPC; +- goto err; +- } +- } else { +- rcu_read_lock(); +- ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, +- false, cl); +- rcu_read_unlock(); +- if (IS_ERR(ob)) { +- ret = cl ? -EAGAIN : -ENOSPC; +- goto err; +- } +- +- b = ob->bucket; +- } +- +- if (c) +- spin_lock(&c->journal.lock); ++ for (i = 0; i < nr_got; i++) { ++ unsigned pos = ja->discard_idx ?: ja->nr; ++ long b = bu[i]; + +- /* +- * XXX +- * For resize at runtime, we should be writing the new +- * superblock before inserting into the journal array +- */ +- +- pos = ja->discard_idx ?: ja->nr; + __array_insert_item(ja->buckets, ja->nr, pos); + __array_insert_item(ja->bucket_seq, ja->nr, pos); +- __array_insert_item(journal_buckets->buckets, ja->nr, pos); + ja->nr++; + + ja->buckets[pos] = b; + ja->bucket_seq[pos] = 0; +- journal_buckets->buckets[pos] = cpu_to_le64(b); + + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; +@@ -856,29 +848,56 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ } + +- if (c) +- spin_unlock(&c->journal.lock); ++ ret = bch2_journal_buckets_to_sb(c, ca); ++ if (ret) { ++ /* Revert: */ ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ja->nr = old_nr; ++ ja->discard_idx = old_discard_idx; ++ ja->dirty_idx_ondisk = old_dirty_idx_ondisk; ++ ja->dirty_idx = old_dirty_idx; ++ ja->cur_idx = old_cur_idx; ++ } ++ ++ if (!new_fs) ++ spin_unlock(&c->journal.lock); ++ ++ if (c) ++ bch2_journal_unblock(&c->journal); ++ ++ if (ret) ++ goto err; + +- if (!new_fs) { ++ if (!new_fs) { ++ for (i = 0; i < nr_got; i++) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_mark_metadata_bucket(&trans, ca, +- b, BCH_DATA_journal, ++ bu[i], BCH_DATA_journal, + ca->mi.bucket_size)); +- +- bch2_open_bucket_put(c, ob); +- +- if (ret) ++ if (ret) { ++ bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); + goto err; ++ } + } + } + err: +- bch2_sb_resize_journal(&ca->disk_sb, +- ja->nr + 
sizeof(*journal_buckets) / sizeof(u64)); ++ if (ob && !new_fs) ++ for (i = 0; i < nr_got; i++) ++ bch2_open_bucket_put(c, ob[i]); ++ + kfree(new_bucket_seq); + kfree(new_buckets); ++ kfree(ob); ++ kfree(bu); + + return ret; ++err_unblock: ++ if (c) ++ bch2_journal_unblock(&c->journal); ++ goto err; + } + + /* +@@ -891,11 +910,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + struct journal_device *ja = &ca->journal; + struct closure cl; + unsigned current_nr; +- int ret; ++ int ret = 0; ++ ++ /* don't handle reducing nr of buckets yet: */ ++ if (nr < ja->nr) ++ return 0; + + closure_init_stack(&cl); + +- do { ++ while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { + struct disk_reservation disk_res = { 0, 0 }; + + closure_sync(&cl); +@@ -923,7 +946,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + if (ja->nr != current_nr) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +- } while (ret == -EAGAIN); ++ } + + return ret; + } +@@ -1092,9 +1115,20 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets = + bch2_sb_get_journal(sb); ++ struct bch_sb_field_journal_v2 *journal_buckets_v2 = ++ bch2_sb_get_journal_v2(sb); + unsigned i; + +- ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ ja->nr = 0; ++ ++ if (journal_buckets_v2) { ++ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); ++ ++ for (i = 0; i < nr; i++) ++ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); ++ } else if (journal_buckets) { ++ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ } + + ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->bucket_seq) +@@ -1109,8 +1143,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) + if (!ja->buckets) + return -ENOMEM; + +- for (i = 0; i < ja->nr; i++) +- ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ if (journal_buckets_v2) { ++ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); ++ unsigned j, dst = 0; ++ ++ for (i = 0; i < nr; i++) ++ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ++ ja->buckets[dst++] = ++ le64_to_cpu(journal_buckets_v2->d[i].start) + j; ++ } else if (journal_buckets) { ++ for (i = 0; i < ja->nr; i++) ++ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ } + + return 0; + } +diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c +new file mode 100644 +index 000000000000..0a8a0077b6f1 +--- /dev/null ++++ b/fs/bcachefs/journal_sb.c +@@ -0,0 +1,222 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "journal_sb.h" ++ ++#include ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static int u64_cmp(const void *_l, const void *_r) ++{ ++ const u64 *l = _l; ++ const u64 *r = _r; ++ ++ return cmp_int(*l, *r); ++} ++ ++static int bch2_sb_journal_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ int ret = -EINVAL; ++ unsigned nr; ++ unsigned i; ++ u64 *b; ++ ++ nr = bch2_nr_journal_buckets(journal); ++ if (!nr) ++ return 0; ++ ++ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ if (!b) ++ return -ENOMEM; ++ ++ for (i = 0; i < nr; i++) ++ b[i] = le64_to_cpu(journal->buckets[i]); ++ ++ sort(b, nr, sizeof(u64), u64_cmp, NULL); ++ ++ if (!b[0]) { ++ pr_buf(err, "journal bucket at sector 0"); ++ 
goto err; ++ } ++ ++ if (b[0] < le16_to_cpu(m->first_bucket)) { ++ pr_buf(err, "journal bucket %llu before first bucket %u", ++ b[0], le16_to_cpu(m->first_bucket)); ++ goto err; ++ } ++ ++ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { ++ pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1], le64_to_cpu(m->nbuckets)); ++ goto err; ++ } ++ ++ for (i = 0; i + 1 < nr; i++) ++ if (b[i] == b[i + 1]) { ++ pr_buf(err, "duplicate journal buckets %llu", b[i]); ++ goto err; ++ } ++ ++ ret = 0; ++err: ++ kfree(b); ++ return ret; ++} ++ ++static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ unsigned i, nr = bch2_nr_journal_buckets(journal); ++ ++ pr_buf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); ++ pr_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal = { ++ .validate = bch2_sb_journal_validate, ++ .to_text = bch2_sb_journal_to_text, ++}; ++ ++struct u64_range { ++ u64 start; ++ u64 end; ++}; ++ ++static int u64_range_cmp(const void *_l, const void *_r) ++{ ++ const struct u64_range *l = _l; ++ const struct u64_range *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++static int bch2_sb_journal_v2_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ int ret = -EINVAL; ++ unsigned nr; ++ unsigned i; ++ struct u64_range *b; ++ ++ nr = bch2_sb_field_journal_v2_nr_entries(journal); ++ if (!nr) ++ return 0; ++ ++ b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL); ++ if (!b) ++ return -ENOMEM; ++ ++ for (i = 0; i < nr; i++) { ++ b[i].start = le64_to_cpu(journal->d[i].start); ++ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); ++ } ++ ++ sort(b, nr, sizeof(*b), u64_range_cmp, NULL); ++ ++ if (!b[0].start) { ++ pr_buf(err, "journal bucket at sector 0"); ++ goto err; ++ } ++ ++ if (b[0].start < le16_to_cpu(m->first_bucket)) { ++ pr_buf(err, "journal bucket %llu before first bucket %u", ++ b[0], le16_to_cpu(m->first_bucket)); ++ goto err; ++ } ++ ++ if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { ++ pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1], le64_to_cpu(m->nbuckets)); ++ goto err; ++ } ++ ++ for (i = 0; i + 1 < nr; i++) { ++ if (b[i].end == b[i + 1].start) { ++ pr_buf(err, "contiguous journal buckets ranges %llu-%llu, %llu-%llu", ++ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); ++ goto err; ++ } ++ ++ if (b[i].end > b[i + 1].start) { ++ pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", ++ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); ++ goto err; ++ } ++ } ++ ++ ret = 0; ++err: ++ kfree(b); ++ return ret; ++} ++ ++static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); ++ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); ++ ++ pr_buf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ pr_buf(out, " %llu-%llu", ++ le64_to_cpu(journal->d[i].start), ++ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); ++ pr_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { ++ .validate = bch2_sb_journal_v2_validate, ++ .to_text = 
bch2_sb_journal_v2_to_text, ++}; ++ ++int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal_v2 *j; ++ unsigned i, dst = 0, nr = 1; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ if (!ja->nr) { ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); ++ return 0; ++ } ++ ++ for (i = 0; i + 1 < ja->nr; i++) ++ if (ja->buckets[i] + 1 != ja->buckets[i + 1]) ++ nr++; ++ ++ j = bch2_sb_resize_journal_v2(&ca->disk_sb, ++ (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); ++ if (!j) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); ++ ++ j->d[dst].start = le64_to_cpu(ja->buckets[0]); ++ j->d[dst].nr = le64_to_cpu(1); ++ ++ for (i = 1; i < ja->nr; i++) { ++ if (ja->buckets[i] == ja->buckets[i - 1] + 1) { ++ le64_add_cpu(&j->d[dst].nr, 1); ++ } else { ++ dst++; ++ j->d[dst].start = le64_to_cpu(ja->buckets[i]); ++ j->d[dst].nr = le64_to_cpu(1); ++ } ++ } ++ ++ return 0; ++} +diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h +new file mode 100644 +index 000000000000..a39192e9f6f4 +--- /dev/null ++++ b/fs/bcachefs/journal_sb.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#include "super-io.h" ++#include "vstructs.h" ++ ++static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) ++{ ++ return j ++ ? (__le64 *) vstruct_end(&j->field) - j->buckets ++ : 0; ++} ++ ++static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) ++{ ++ if (!j) ++ return 0; ++ ++ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; ++} ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal; ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; ++ ++int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index e17ce91c8486..aa10be8edfe7 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -10,6 +10,7 @@ + #include "io.h" + #include "journal.h" + #include "journal_io.h" ++#include "journal_sb.h" + #include "journal_seq_blacklist.h" + #include "replicas.h" + #include "quota.h" +@@ -424,7 +425,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) + memcpy(dst->compat, src->compat, sizeof(dst->compat)); + + for (i = 0; i < BCH_SB_FIELD_NR; i++) { +- if (i == BCH_SB_FIELD_journal) ++ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) + continue; + + src_f = bch2_sb_field_get(src, i); +@@ -898,85 +899,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) + mutex_unlock(&c->sb_lock); + } + +-/* BCH_SB_FIELD_journal: */ +- +-static int u64_cmp(const void *_l, const void *_r) +-{ +- u64 l = *((const u64 *) _l), r = *((const u64 *) _r); +- +- return l < r ? -1 : l > r ? 
1 : 0; +-} +- +-static int bch2_sb_journal_validate(struct bch_sb *sb, +- struct bch_sb_field *f, +- struct printbuf *err) +-{ +- struct bch_sb_field_journal *journal = field_to_type(f, journal); +- struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; +- int ret = -EINVAL; +- unsigned nr; +- unsigned i; +- u64 *b; +- +- nr = bch2_nr_journal_buckets(journal); +- if (!nr) +- return 0; +- +- b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); +- if (!b) +- return -ENOMEM; +- +- for (i = 0; i < nr; i++) +- b[i] = le64_to_cpu(journal->buckets[i]); +- +- sort(b, nr, sizeof(u64), u64_cmp, NULL); +- +- if (!b[0]) { +- pr_buf(err, "journal bucket at sector 0"); +- goto err; +- } +- +- if (b[0] < le16_to_cpu(m->first_bucket)) { +- pr_buf(err, "journal bucket %llu before first bucket %u", +- b[0], le16_to_cpu(m->first_bucket)); +- goto err; +- } +- +- if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { +- pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", +- b[nr - 1], le64_to_cpu(m->nbuckets)); +- goto err; +- } +- +- for (i = 0; i + 1 < nr; i++) +- if (b[i] == b[i + 1]) { +- pr_buf(err, "duplicate journal buckets %llu", b[i]); +- goto err; +- } +- +- ret = 0; +-err: +- kfree(b); +- return ret; +-} +- +-static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, +- struct bch_sb_field *f) +-{ +- struct bch_sb_field_journal *journal = field_to_type(f, journal); +- unsigned i, nr = bch2_nr_journal_buckets(journal); +- +- pr_buf(out, "Buckets: "); +- for (i = 0; i < nr; i++) +- pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); +- pr_newline(out); +-} +- +-static const struct bch_sb_field_ops bch_sb_field_ops_journal = { +- .validate = bch2_sb_journal_validate, +- .to_text = bch2_sb_journal_to_text, +-}; +- + /* BCH_SB_FIELD_members: */ + + static int bch2_sb_members_validate(struct bch_sb *sb, +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +index 50f31a3b9b18..7fc56321922f 100644 +--- a/fs/bcachefs/super-io.h ++++ b/fs/bcachefs/super-io.h +@@ -75,15 +75,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) + __bch2_check_set_feature(c, feat); + } + +-/* BCH_SB_FIELD_journal: */ +- +-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) +-{ +- return j +- ? (__le64 *) vstruct_end(&j->field) - j->buckets +- : 0; +-} +- + /* BCH_SB_FIELD_members: */ + + static inline bool bch2_member_exists(struct bch_member *m) +-- +cgit v1.2.3 + + +From 554f4361e91ac0b733b100e916280f4af9b97e29 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 5 Jan 2022 22:13:13 -0500 +Subject: bcachefs: KEY_TYPE_set + +A new empty key type, to be used when using a btree as a set. 
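
Illustrative sketch, not part of the patch: a set key carries no value, so membership
is expressed entirely by the key's position. Construction might look roughly like this
(the position fields are hypothetical, and the insert would go through the usual
bch2_trans_update() path):

    struct bkey_i k;

    bkey_init(&k.k);
    k.k.type = KEY_TYPE_set;
    k.k.p    = POS(id, member);     /* all of the information is in the position */

The key_merge op added here resizes adjacent keys, which suggests set keys are also
meant to be usable extent-style, so that runs of them can be coalesced.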
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 7 ++++++- + fs/bcachefs/bkey_methods.c | 18 ++++++++++++++++++ + 2 files changed, 24 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index e74100bf53b0..57e6780f47d4 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -347,7 +347,8 @@ static inline void bkey_init(struct bkey *k) + x(subvolume, 21) \ + x(snapshot, 22) \ + x(inode_v2, 23) \ +- x(alloc_v3, 24) ++ x(alloc_v3, 24) \ ++ x(set, 25) + + enum bch_bkey_type { + #define x(name, nr) KEY_TYPE_##name = nr, +@@ -377,6 +378,10 @@ struct bch_hash_whiteout { + struct bch_val v; + }; + ++struct bch_set { ++ struct bch_val v; ++}; ++ + /* Extents */ + + /* +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index e83aeb683a09..49aa8a478e5f 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -85,6 +85,24 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + .val_to_text = key_type_inline_data_to_text, \ + } + ++static const char *key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k)) ++ return "nonempty value"; ++ return NULL; ++} ++ ++static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) ++{ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++#define bch2_bkey_ops_set (struct bkey_ops) { \ ++ .key_invalid = key_type_set_invalid, \ ++ .key_merge = key_type_set_merge, \ ++} ++ + const struct bkey_ops bch2_bkey_ops[] = { + #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, + BCH_BKEY_TYPES() +-- +cgit v1.2.3 + + +From 1be7fd7d42ac96ee950e7d0b29b33ad46eab2b00 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 5 Dec 2021 00:31:54 -0500 +Subject: bcachefs: LRU btree + +This implements new persistent LRUs, to be used for buckets containing +cached data, as well as stripes ordered by time when a block became +empty. 
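+Roughly, each LRU is a range of the new BTREE_ID_lru btree: keys live at
+POS(lru id, time), the value is a backpointer (the bucket or stripe index),
+and callers reposition an entry with bch2_lru_change().  A sketch of a
+caller, with the variable names assumed for illustration:
+
+  static int bucket_lru_update(struct btree_trans *trans, u64 lru_id,
+                               u64 bucket, u64 old_read_time, u64 *read_time)
+  {
+          /*
+           * Deletes the entry at POS(lru_id, old_read_time) after checking
+           * that its backpointer is 'bucket', then inserts a new entry at
+           * the first free slot at or after *read_time, writing back the
+           * slot that was actually used.
+           */
+          return bch2_lru_change(trans, lru_id, bucket, old_read_time, read_time);
+  }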
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/Makefile | 1 + + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/bcachefs_format.h | 15 +++++- + fs/bcachefs/bkey_methods.c | 4 ++ + fs/bcachefs/lru.c | 119 ++++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/lru.h | 15 ++++++ + 6 files changed, 153 insertions(+), 2 deletions(-) + create mode 100644 fs/bcachefs/lru.c + create mode 100644 fs/bcachefs/lru.h + +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index 5047c919374e..7ddae26116a0 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -41,6 +41,7 @@ bcachefs-y := \ + journal_sb.o \ + journal_seq_blacklist.o \ + keylist.o \ ++ lru.o \ + migrate.o \ + move.o \ + movinggc.o \ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 211fd5adf9e3..42ff5e48910d 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -391,6 +391,7 @@ enum gc_phase { + GC_PHASE_BTREE_reflink, + GC_PHASE_BTREE_subvolumes, + GC_PHASE_BTREE_snapshots, ++ GC_PHASE_BTREE_lru, + + GC_PHASE_PENDING_DELETE, + }; +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 57e6780f47d4..9785c8275570 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -348,7 +348,8 @@ static inline void bkey_init(struct bkey *k) + x(snapshot, 22) \ + x(inode_v2, 23) \ + x(alloc_v3, 24) \ +- x(set, 25) ++ x(set, 25) \ ++ x(lru, 26) + + enum bch_bkey_type { + #define x(name, nr) KEY_TYPE_##name = nr, +@@ -1020,6 +1021,15 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + /* True if a subvolume points to this snapshot node: */ + LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + ++/* LRU btree: */ ++ ++struct bch_lru { ++ struct bch_val v; ++ __le64 idx; ++} __attribute__((packed, aligned(8))); ++ ++#define LRU_ID_STRIPES (1U << 16) ++ + /* Optional/variable size superblock sections: */ + + struct bch_sb_field { +@@ -1827,7 +1837,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); + x(stripes, 6) \ + x(reflink, 7) \ + x(subvolumes, 8) \ +- x(snapshots, 9) ++ x(snapshots, 9) \ ++ x(lru, 10) + + enum btree_id { + #define x(kwd, val) BTREE_ID_##kwd = val, +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 49aa8a478e5f..9e3fbb673559 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -9,6 +9,7 @@ + #include "error.h" + #include "extents.h" + #include "inode.h" ++#include "lru.h" + #include "quota.h" + #include "reflink.h" + #include "subvolume.h" +@@ -165,6 +166,9 @@ static unsigned bch2_key_types_allowed[] = { + [BKEY_TYPE_snapshots] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_snapshot), ++ [BKEY_TYPE_lru] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_lru), + [BKEY_TYPE_btree] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_btree_ptr)| +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +new file mode 100644 +index 000000000000..2ababca5efe5 +--- /dev/null ++++ b/fs/bcachefs/lru.c +@@ -0,0 +1,119 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "error.h" ++#include "lru.h" ++ ++const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct bch_lru *lru = bkey_s_c_to_lru(k).v; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*lru)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_lru *lru = 
bkey_s_c_to_lru(k).v; ++ ++ pr_buf(out, "idx %llu", le64_to_cpu(lru->idx)); ++} ++ ++static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 existing_idx; ++ int ret = 0; ++ ++ if (!time) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, ++ POS(id, time), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_lru) { ++ bch2_fs_inconsistent(c, ++ "pointer to nonexistent lru %llu:%llu", ++ id, time); ++ ret = -EIO; ++ goto err; ++ } ++ ++ existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); ++ if (existing_idx != idx) { ++ bch2_fs_inconsistent(c, ++ "lru %llu:%llu with wrong backpointer: got %llu, should be %llu", ++ id, time, existing_idx, idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_lru *lru; ++ int ret = 0; ++ ++ if (!*time) ++ return 0; ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, ++ POS(lru_id, *time), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES, k, ret) ++ if (bkey_deleted(k.k)) ++ break; ++ ++ if (ret) ++ goto err; ++ ++ BUG_ON(iter.pos.inode != lru_id); ++ *time = iter.pos.offset; ++ ++ lru = bch2_trans_kmalloc(trans, sizeof(*lru)); ++ ret = PTR_ERR_OR_ZERO(lru); ++ if (ret) ++ goto err; ++ ++ bkey_lru_init(&lru->k_i); ++ lru->k.p = iter.pos; ++ lru->v.idx = cpu_to_le64(idx); ++ ++ ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, ++ u64 old_time, u64 *new_time) ++{ ++ if (old_time == *new_time) ++ return 0; ++ ++ return lru_delete(trans, id, idx, old_time) ?: ++ lru_set(trans, id, idx, new_time); ++} +diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h +new file mode 100644 +index 000000000000..c3121cfee285 +--- /dev/null ++++ b/fs/bcachefs/lru.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_LRU_H ++#define _BCACHEFS_LRU_H ++ ++const char *bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_lru (struct bkey_ops) { \ ++ .key_invalid = bch2_lru_invalid, \ ++ .val_to_text = bch2_lru_to_text, \ ++} ++ ++int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *); ++ ++#endif /* _BCACHEFS_LRU_H */ +-- +cgit v1.2.3 + + +From 5ad070fde4897dc3ed19eb2cda2fd79db75c9756 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 11 Dec 2021 17:13:09 -0500 +Subject: bcachefs: Freespace, need_discard btrees + +This adds two new btrees for the upcoming allocator rewrite: an extents +btree of free buckets, and a btree for buckets awaiting discards. + +We also add a new trigger for alloc keys to keep the new btrees up to +date, and a compatibility path to initialize them on existing +filesystems. 
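+One detail worth sketching (illustrative helpers only, mirroring
+alloc_freespace_pos() and try_alloc_bucket() below): the freespace btree is
+keyed by device and a 64-bit offset that packs the bucket number into the
+low 56 bits and a few generation bits (gc gen = gen - oldest_gen, shifted
+down by 4) into the high 8 bits, so entries left behind by an old bucket
+generation can be detected and reported as inconsistencies:
+
+  static inline u64 freespace_entry(u64 bucket, u8 gc_gen)
+  {
+          /* high 8 bits: gc gen >> 4; low 56 bits: bucket number */
+          return bucket | (((u64) gc_gen >> 4) << 56);
+  }
+
+  static inline u64 freespace_entry_bucket(u64 entry)
+  {
+          return entry & ~(~0ULL << 56);
+  }
+
+  static inline unsigned freespace_entry_genbits(u64 entry)
+  {
+          return (unsigned) (entry >> 56);
+  }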
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 240 ++++++++++++++++++++++++++++++++++++++++- + fs/bcachefs/alloc_background.h | 53 +++++++++ + fs/bcachefs/alloc_foreground.h | 11 ++ + fs/bcachefs/bcachefs.h | 2 + + fs/bcachefs/bcachefs_format.h | 13 ++- + fs/bcachefs/bkey_methods.c | 6 ++ + fs/bcachefs/btree_gc.c | 4 +- + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/btree_types.h | 27 ++--- + fs/bcachefs/buckets.c | 5 + + fs/bcachefs/buckets.h | 10 -- + fs/bcachefs/extent_update.c | 13 ++- + fs/bcachefs/recovery.c | 14 ++- + fs/bcachefs/super-io.c | 5 + + fs/bcachefs/super-io.h | 1 + + fs/bcachefs/super.c | 36 +++---- + fs/bcachefs/super_types.h | 1 + + 17 files changed, 385 insertions(+), 58 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 4afb2d457fb0..bf90feefa5cd 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -14,6 +14,7 @@ + #include "debug.h" + #include "ec.h" + #include "error.h" ++#include "lru.h" + #include "recovery.h" + #include "varint.h" + +@@ -39,6 +40,15 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { + #undef x + }; + ++const char * const bch2_bucket_states[] = { ++ "free", ++ "need gc gens", ++ "need discard", ++ "cached", ++ "dirty", ++ NULL ++}; ++ + /* Persistent alloc info: */ + + static inline u64 alloc_field_v1_get(const struct bch_alloc *a, +@@ -161,6 +171,8 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; ++ out->need_discard = BCH_ALLOC_NEED_DISCARD(a.v); ++ out->need_inc_gen = BCH_ALLOC_NEED_INC_GEN(a.v); + out->journal_seq = le64_to_cpu(a.v->journal_seq); + + #define x(_name, _bits) \ +@@ -197,6 +209,8 @@ static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, + a->v.oldest_gen = src.oldest_gen; + a->v.data_type = src.data_type; + a->v.journal_seq = cpu_to_le64(src.journal_seq); ++ SET_BCH_ALLOC_NEED_DISCARD(&a->v, src.need_discard); ++ SET_BCH_ALLOC_NEED_INC_GEN(&a->v, src.need_inc_gen); + + #define x(_name, _bits) \ + nr_fields++; \ +@@ -325,9 +339,9 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + { + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + +- pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", ++ pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %u", + u.gen, u.oldest_gen, bch2_data_types[u.data_type], +- u.journal_seq); ++ u.journal_seq, u.need_discard); + #define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); + BCH_ALLOC_FIELDS_V2() + #undef x +@@ -384,6 +398,218 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) + return ret; + } + ++/* Free space/discard btree: */ ++ ++static int bch2_bucket_do_index(struct btree_trans *trans, ++ struct bkey_s_c alloc_k, ++ struct bkey_alloc_unpacked a, ++ bool set) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, a.dev); ++ struct btree_iter iter; ++ struct bkey_s_c old; ++ struct bkey_i *k; ++ enum bucket_state state = bucket_state(a); ++ enum btree_id btree; ++ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ enum bch_bkey_type new_type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ if (state != BUCKET_free && ++ state != BUCKET_need_discard) ++ return 0; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.type = new_type; ++ ++ switch (state) { ++ case BUCKET_free: ++ btree = BTREE_ID_freespace; ++ k->k.p = alloc_freespace_pos(a); ++ bch2_key_resize(&k->k, 1); ++ break; ++ case BUCKET_need_discard: ++ btree = BTREE_ID_need_discard; ++ k->k.p = POS(a.dev, a.bucket); ++ break; ++ default: ++ return 0; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, btree, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ old = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(old); ++ if (ret) ++ goto err; ++ ++ if (ca->mi.freespace_initialized && ++ bch2_fs_inconsistent_on(old.k->type != old_type, c, ++ "incorrect key when %s %s btree (got %s should be %s)\n" ++ " for %s", ++ set ? "setting" : "clearing", ++ bch2_btree_ids[btree], ++ bch2_bkey_types[old.k->type], ++ bch2_bkey_types[old_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = bch2_trans_update(trans, &iter, k, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_trans_mark_alloc(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); ++ struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(bkey_i_to_s_c(new)); ++ u64 old_lru, new_lru; ++ bool need_repack = false; ++ int ret = 0; ++ ++ if (new_u.dirty_sectors > old_u.dirty_sectors || ++ new_u.cached_sectors > old_u.cached_sectors) { ++ new_u.read_time = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); ++ new_u.write_time = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); ++ new_u.need_inc_gen = true; ++ new_u.need_discard = true; ++ need_repack = true; ++ } ++ ++ if (old_u.data_type && !new_u.data_type && ++ old_u.gen == new_u.gen && ++ !bch2_bucket_is_open(c, new->k.p.inode, new->k.p.offset) && ++ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { ++ new_u.gen++; ++ new_u.need_inc_gen = false; ++ need_repack = true; ++ } ++ ++ if (bucket_state(old_u) != bucket_state(new_u) || ++ (bucket_state(new_u) == BUCKET_free && ++ alloc_freespace_genbits(old_u) != alloc_freespace_genbits(new_u))) { ++ ret = bch2_bucket_do_index(trans, old, old_u, false) ?: ++ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_u, true); ++ if (ret) ++ return ret; ++ } ++ ++ old_lru = alloc_lru_idx(old_u); ++ new_lru = alloc_lru_idx(new_u); ++ ++ if (old_lru != new_lru) { ++ ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, ++ old_lru, &new_lru); ++ if (ret) ++ return ret; ++ ++ if (new_lru && new_u.read_time != new_lru) { ++ new_u.read_time = new_lru; ++ need_repack = true; ++ } ++ } ++ ++ if (need_repack && !bkey_deleted(&new->k)) ++ bch2_alloc_pack_v3((void *) new, new_u); ++ ++ return 0; ++} ++ ++static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_alloc_unpacked a; ++ struct bch_member *m; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (iter.pos.offset >= ca->mi.nbuckets) ++ break; ++ ++ a = 
bch2_alloc_unpack(k); ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW, ++ bch2_bucket_do_index(&trans, k, a, true)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) { ++ bch_err(ca, "error initializing free space: %i", ret); ++ return ret; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; ++ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_fs_freespace_init(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ bool doing_init = false; ++ ++ /* ++ * We can crash during the device add path, so we need to check this on ++ * every mount: ++ */ ++ ++ for_each_member_device(ca, c, i) { ++ if (ca->mi.freespace_initialized) ++ continue; ++ ++ if (!doing_init) { ++ bch_info(c, "initializing freespace"); ++ doing_init = true; ++ } ++ ++ ret = bch2_dev_freespace_init(c, ca); ++ if (ret) { ++ percpu_ref_put(&ca->ref); ++ return ret; ++ } ++ } ++ ++ if (doing_init) { ++ mutex_lock(&c->sb_lock); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch_verbose(c, "done initializing freespace"); ++ } ++ ++ return ret; ++} ++ + /* Bucket IO clocks: */ + + int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, +@@ -428,6 +654,16 @@ out: + * commands to the newly free buckets, then puts them on the various freelists. + */ + ++/* ++ * bucket_gc_gen() returns the difference between the bucket's current gen and ++ * the oldest gen of any pointer into that bucket in the btree. ++ */ ++ ++static inline u8 bucket_gc_gen(struct bucket *g) ++{ ++ return g->mark.gen - g->oldest_gen; ++} ++ + static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + struct bucket_mark m) + { +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 3eaa6d204286..cf0c71313e1c 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -17,6 +17,8 @@ struct bkey_alloc_unpacked { + u8 gen; + u8 oldest_gen; + u8 data_type; ++ bool need_discard:1; ++ bool need_inc_gen:1; + #define x(_name, _bits) u##_bits _name; + BCH_ALLOC_FIELDS_V2() + #undef x +@@ -25,6 +27,50 @@ struct bkey_alloc_unpacked { + /* How out of date a pointer gen is allowed to be: */ + #define BUCKET_GC_GEN_MAX 96U + ++static inline u8 alloc_gc_gen(struct bkey_alloc_unpacked a) ++{ ++ return a.gen - a.oldest_gen; ++} ++ ++enum bucket_state { ++ BUCKET_free, ++ BUCKET_need_gc_gens, ++ BUCKET_need_discard, ++ BUCKET_cached, ++ BUCKET_dirty, ++}; ++ ++extern const char * const bch2_bucket_states[]; ++ ++static inline enum bucket_state bucket_state(struct bkey_alloc_unpacked a) ++{ ++ if (a.dirty_sectors || a.stripe) ++ return BUCKET_dirty; ++ if (a.cached_sectors) ++ return BUCKET_cached; ++ BUG_ON(a.data_type); ++ if (a.need_discard) ++ return BUCKET_need_discard; ++ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) ++ return BUCKET_need_gc_gens; ++ return BUCKET_free; ++} ++ ++static inline u64 alloc_lru_idx(struct bkey_alloc_unpacked a) ++{ ++ return bucket_state(a) == BUCKET_cached ? 
a.read_time : 0; ++} ++ ++static inline u64 alloc_freespace_genbits(struct bkey_alloc_unpacked a) ++{ ++ return ((u64) alloc_gc_gen(a) >> 4) << 56; ++} ++ ++static inline struct bpos alloc_freespace_pos(struct bkey_alloc_unpacked a) ++{ ++ return POS(a.dev, a.bucket | alloc_freespace_genbits(a)); ++} ++ + /* returns true if not equal */ + static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, + struct bkey_alloc_unpacked r) +@@ -65,18 +111,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + #define bch2_bkey_ops_alloc (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ + } + + #define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ + } + + #define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ + } + +@@ -89,6 +138,10 @@ static inline bool bkey_is_alloc(const struct bkey *k) + + int bch2_alloc_read(struct bch_fs *, bool, bool); + ++int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, ++ struct bkey_i *, unsigned); ++int bch2_fs_freespace_init(struct bch_fs *); ++ + static inline void bch2_wake_allocator(struct bch_dev *ca) + { + struct task_struct *p; +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index d466bda9afc8..aa35801605dc 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -115,6 +115,17 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke + return false; + } + ++static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) ++{ ++ bool ret; ++ ++ spin_lock(&c->freelist_lock); ++ ret = bch2_bucket_is_open(c, dev, bucket); ++ spin_unlock(&c->freelist_lock); ++ ++ return ret; ++} ++ + int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, + struct dev_stripe_state *, struct bch_devs_mask *, + unsigned, unsigned *, bool *, enum alloc_reserve, +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 42ff5e48910d..7350fb6a8355 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -392,6 +392,8 @@ enum gc_phase { + GC_PHASE_BTREE_subvolumes, + GC_PHASE_BTREE_snapshots, + GC_PHASE_BTREE_lru, ++ GC_PHASE_BTREE_freespace, ++ GC_PHASE_BTREE_need_discard, + + GC_PHASE_PENDING_DELETE, + }; +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 9785c8275570..5ece1492d76a 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -899,11 +899,13 @@ struct bch_alloc_v3 { + __u8 data[]; + } __attribute__((packed, aligned(8))); + ++LE32_BITMASK(BCH_ALLOC_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) ++LE32_BITMASK(BCH_ALLOC_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) ++ + enum { + #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() + #undef x +- BCH_ALLOC_FIELD_NR + }; + + /* Quotas: */ +@@ -1102,6 +1104,8 @@ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) + LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) + LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) + 
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) ++LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, ++ struct bch_member, flags[0], 30, 31) + + #if 0 + LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +@@ -1320,7 +1324,8 @@ enum bcachefs_metadata_version { + bcachefs_metadata_version_reflink_p_fix = 16, + bcachefs_metadata_version_subvol_dirent = 17, + bcachefs_metadata_version_inode_v2 = 18, +- bcachefs_metadata_version_max = 19, ++ bcachefs_metadata_version_freespace = 19, ++ bcachefs_metadata_version_max = 20, + }; + + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) +@@ -1838,7 +1843,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); + x(reflink, 7) \ + x(subvolumes, 8) \ + x(snapshots, 9) \ +- x(lru, 10) ++ x(lru, 10) \ ++ x(freespace, 11) \ ++ x(need_discard, 12) + + enum btree_id { + #define x(kwd, val) BTREE_ID_##kwd = val, +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 9e3fbb673559..3c1bf3310d99 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -169,6 +169,12 @@ static unsigned bch2_key_types_allowed[] = { + [BKEY_TYPE_lru] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_lru), ++ [BKEY_TYPE_freespace] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_set), ++ [BKEY_TYPE_need_discard] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_set), + [BKEY_TYPE_btree] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_btree_ptr)| +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 73b947a493a2..a8c566fd12bb 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1379,7 +1379,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + if (IS_ERR(a)) + return PTR_ERR(a); + +- ret = bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); ++ ret = bch2_trans_update(trans, iter, &a->k, 0); + fsck_err: + return ret; + } +@@ -1891,7 +1891,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i + + u.oldest_gen = ca->oldest_gen[iter->pos.offset]; + +- return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN); ++ return bch2_alloc_write(trans, iter, &u, 0); + } + + int bch2_gc_gens(struct bch_fs *c) +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index e6cea4c687e1..1df454f24b54 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -930,7 +930,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + "error decrypting btree node: %i", ret)) + goto fsck_err; + +- btree_err_on(btree_node_is_extents(b) && ++ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), + BTREE_ERR_FATAL, c, NULL, b, NULL, + "btree node does not have NEW_EXTENT_OVERWRITE set"); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 575635b5fa10..788b9811148f 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -596,24 +596,9 @@ static inline enum btree_node_type btree_node_type(struct btree *b) + return __btree_node_type(b->c.level, b->c.btree_id); + } + +-static inline bool btree_node_type_is_extents(enum btree_node_type type) +-{ +- switch (type) { +- case BKEY_TYPE_extents: +- case BKEY_TYPE_reflink: +- return true; +- default: +- return false; +- } +-} +- +-static inline bool btree_node_is_extents(struct btree *b) +-{ +- return btree_node_type_is_extents(btree_node_type(b)); +-} +- + #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_extents)| \ ++ 
(1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_reflink)| \ +@@ -629,6 +614,16 @@ static inline bool btree_node_is_extents(struct btree *b) + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + ++#define BTREE_ID_IS_EXTENTS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_reflink)| \ ++ (1U << BTREE_ID_freespace)) ++ ++static inline bool btree_node_type_is_extents(enum btree_node_type type) ++{ ++ return (1U << type) & BTREE_ID_IS_EXTENTS; ++} ++ + #define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index d52263759ee5..b5178d3067a9 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -555,6 +555,11 @@ int bch2_mark_alloc(struct btree_trans *trans, + } + } + ++ if (bucket_state(new_u) == BUCKET_need_gc_gens) { ++ atomic_inc(&c->kick_gc); ++ wake_up_process(c->gc_thread); ++ } ++ + percpu_down_read(&c->mark_lock); + if (!gc && new_u.gen != old_u.gen) + *bucket_gen(ca, new_u.bucket) = new_u.gen; +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 392e03d4c319..07fe5cddbb41 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -81,16 +81,6 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) + return gens->b + b; + } + +-/* +- * bucket_gc_gen() returns the difference between the bucket's current gen and +- * the oldest gen of any pointer into that bucket in the btree. +- */ +- +-static inline u8 bucket_gc_gen(struct bucket *g) +-{ +- return g->mark.gen - g->oldest_gen; +-} +- + static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) + { +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 58b2c96f450c..2fd5d9672a44 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -15,17 +15,26 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; +- unsigned ret = 0; ++ unsigned ret = 0, lru = 0; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: ++ /* Might also be updating LRU btree */ ++ if (entry->ptr.cached) ++ lru++; ++ ++ fallthrough; + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } + +- return ret; ++ /* ++ * Updating keys in the alloc btree may also update keys in the ++ * freespace or discard btrees: ++ */ ++ return lru + ret * 2; + } + + static int count_iters_for_insert(struct btree_trans *trans, +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 887971559214..4d01a01ea5c5 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1027,8 +1027,8 @@ int bch2_fs_recovery(struct bch_fs *c) + bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); + c->opts.version_upgrade = true; + c->opts.fsck = true; +- } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { +- bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); ++ } else if (c->sb.version < bcachefs_metadata_version_freespace) { ++ bch_info(c, "filesystem version is prior to freespace - upgrading"); + c->opts.version_upgrade = true; + } + } +@@ -1196,6 +1196,11 @@ use_clean: + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); + ++ err = "error initializing freespace"; ++ ret = bch2_fs_freespace_init(c); ++ if (ret) ++ goto err; ++ + if (c->sb.version 
< bcachefs_metadata_version_snapshot_2) { + bch2_fs_lazy_rw(c); + +@@ -1379,6 +1384,11 @@ int bch2_fs_initialize(struct bch_fs *c) + ca->new_fs_bucket_idx = 0; + } + ++ err = "error initializing freespace"; ++ ret = bch2_fs_freespace_init(c); ++ if (ret) ++ goto err; ++ + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index aa10be8edfe7..95af515a01cd 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -1052,6 +1052,11 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m)); + pr_newline(out); + ++ pr_buf(out, "Freespace initialized:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); ++ pr_newline(out); ++ + pr_indent_pop(out, 2); + } + } +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +index 7fc56321922f..14a25f6fe29a 100644 +--- a/fs/bcachefs/super-io.h ++++ b/fs/bcachefs/super-io.h +@@ -103,6 +103,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) + .durability = BCH_MEMBER_DURABILITY(mi) + ? BCH_MEMBER_DURABILITY(mi) - 1 + : 1, ++ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), + }; + } +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 46947163a8dc..019cbf32d40e 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1468,30 +1468,20 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + + static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) + { +- struct btree_trans trans; +- size_t i; ++ struct bpos start = POS(ca->dev_idx, 0); ++ struct bpos end = POS(ca->dev_idx, U64_MAX); + int ret; + +- bch2_trans_init(&trans, c, 0, 0); +- +- for (i = 0; i < ca->mi.nbuckets; i++) { +- ret = lockrestart_do(&trans, +- bch2_btree_key_cache_flush(&trans, +- BTREE_ID_alloc, POS(ca->dev_idx, i))); +- if (ret) +- break; +- } +- bch2_trans_exit(&trans); +- +- if (ret) { ++ ret = bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, ++ BTREE_TRIGGER_NORUN, NULL); ++ if (ret) + bch_err(c, "error %i removing dev alloc info", ret); +- return ret; +- } + +- return bch2_btree_delete_range(c, BTREE_ID_alloc, +- POS(ca->dev_idx, 0), +- POS(ca->dev_idx + 1, 0), +- 0, NULL); ++ return ret; + } + + int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) +@@ -1709,6 +1699,12 @@ have_slot: + goto err_late; + } + ++ ret = bch2_fs_freespace_init(c); ++ if (ret) { ++ bch_err(c, "device add error: error initializing free space: %i", ret); ++ goto err_late; ++ } ++ + ca->new_fs_bucket_idx = 0; + + if (ca->mi.state == BCH_MEMBER_STATE_rw) { +diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h +index d8b159a5b7f7..89419fc7930d 100644 +--- a/fs/bcachefs/super_types.h ++++ b/fs/bcachefs/super_types.h +@@ -32,6 +32,7 @@ struct bch_member_cpu { + u8 discard; + u8 data_allowed; + u8 durability; ++ u8 freespace_initialized; + u8 valid; + }; + +-- +cgit v1.2.3 + + +From 4c5bba2e984975a221df6accf35b12ed72ac38c5 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 9 Jan 2022 20:48:31 -0500 +Subject: bcachefs: Kill allocator threads & freelists + +Now that we have new persistent data structures for the allocator, this +patch 
converts the allocator to use them. + +Now, foreground bucket allocation uses the freespace btree to find +buckets to allocate, instead of popping buckets off the freelist. + +The background allocator threads are no longer needed and are deleted, +as well as the allocator freelists. Now we only need background tasks +for invalidating buckets containing cached data (when we are low on +empty buckets), and for issuing discards. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 557 +---------------------------------------- + fs/bcachefs/alloc_background.h | 33 --- + fs/bcachefs/alloc_foreground.c | 350 ++++++++++++++++++++------ + fs/bcachefs/alloc_types.h | 22 -- + fs/bcachefs/bcachefs.h | 21 +- + fs/bcachefs/btree_gc.c | 10 +- + fs/bcachefs/buckets.c | 69 +---- + fs/bcachefs/buckets.h | 62 +++-- + fs/bcachefs/buckets_types.h | 2 - + fs/bcachefs/ec.c | 13 +- + fs/bcachefs/journal.c | 2 - + fs/bcachefs/journal_io.c | 2 + + fs/bcachefs/movinggc.c | 23 +- + fs/bcachefs/recovery.c | 2 + + fs/bcachefs/super.c | 82 +----- + fs/bcachefs/sysfs.c | 43 +--- + 16 files changed, 344 insertions(+), 949 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index bf90feefa5cd..cdce67755d21 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -27,13 +27,6 @@ + #include + #include + +-const char * const bch2_allocator_states[] = { +-#define x(n) #n, +- ALLOC_THREAD_STATES() +-#undef x +- NULL +-}; +- + static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { + #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, + BCH_ALLOC_FIELDS_V1() +@@ -371,7 +364,6 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) + g->_mark.gen = u.gen; + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; +- g->oldest_gen = !gc ? u.oldest_gen : u.gen; + g->gen_valid = 1; + + if (!gc || +@@ -646,491 +638,6 @@ out: + return ret; + } + +-/* Background allocator thread: */ +- +-/* +- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens +- * (marking them as invalidated on disk), then optionally issues discard +- * commands to the newly free buckets, then puts them on the various freelists. +- */ +- +-/* +- * bucket_gc_gen() returns the difference between the bucket's current gen and +- * the oldest gen of any pointer into that bucket in the btree. +- */ +- +-static inline u8 bucket_gc_gen(struct bucket *g) +-{ +- return g->mark.gen - g->oldest_gen; +-} +- +-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, +- struct bucket_mark m) +-{ +- u8 gc_gen; +- +- if (!is_available_bucket(m)) +- return false; +- +- if (m.owned_by_allocator) +- return false; +- +- if (ca->buckets_nouse && +- test_bit(b, ca->buckets_nouse)) +- return false; +- +- if (ca->new_fs_bucket_idx) { +- /* +- * Device or filesystem is still being initialized, and we +- * haven't fully marked superblocks & journal: +- */ +- if (is_superblock_bucket(ca, b)) +- return false; +- +- if (b < ca->new_fs_bucket_idx) +- return false; +- } +- +- gc_gen = bucket_gc_gen(bucket(ca, b)); +- +- ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; +- ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; +- +- return gc_gen < BUCKET_GC_GEN_MAX; +-} +- +-/* +- * Determines what order we're going to reuse buckets, smallest bucket_key() +- * first. 
+- */ +- +-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, +- u64 now, u64 last_seq_ondisk) +-{ +- unsigned used = m.cached_sectors; +- +- if (used) { +- /* +- * Prefer to keep buckets that have been read more recently, and +- * buckets that have more data in them: +- */ +- u64 last_read = max_t(s64, 0, now - g->io_time[READ]); +- u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); +- +- return -last_read_scaled; +- } else { +- /* +- * Prefer to use buckets with smaller gc_gen so that we don't +- * have to walk the btree and recalculate oldest_gen - but shift +- * off the low bits so that buckets will still have equal sort +- * keys when there's only a small difference, so that we can +- * keep sequential buckets together: +- */ +- return bucket_gc_gen(g) >> 4; +- } +-} +- +-static inline int bucket_alloc_cmp(alloc_heap *h, +- struct alloc_heap_entry l, +- struct alloc_heap_entry r) +-{ +- return cmp_int(l.key, r.key) ?: +- cmp_int(r.nr, l.nr) ?: +- cmp_int(l.bucket, r.bucket); +-} +- +-static inline int bucket_idx_cmp(const void *_l, const void *_r) +-{ +- const struct alloc_heap_entry *l = _l, *r = _r; +- +- return cmp_int(l->bucket, r->bucket); +-} +- +-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) +-{ +- struct bucket_array *buckets; +- struct alloc_heap_entry e = { 0 }; +- u64 now, last_seq_ondisk; +- size_t b, i, nr = 0; +- +- down_read(&ca->bucket_lock); +- +- buckets = bucket_array(ca); +- ca->alloc_heap.used = 0; +- now = atomic64_read(&c->io_clock[READ].now); +- last_seq_ondisk = c->journal.flushed_seq_ondisk; +- +- /* +- * Find buckets with lowest read priority, by building a maxheap sorted +- * by read priority and repeatedly replacing the maximum element until +- * all buckets have been visited. 
+- */ +- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { +- struct bucket *g = &buckets->b[b]; +- struct bucket_mark m = READ_ONCE(g->mark); +- unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); +- +- cond_resched(); +- +- if (!bch2_can_invalidate_bucket(ca, b, m)) +- continue; +- +- if (!m.data_type && +- bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, +- last_seq_ondisk, +- ca->dev_idx, b)) { +- ca->buckets_waiting_on_journal++; +- continue; +- } +- +- if (e.nr && e.bucket + e.nr == b && e.key == key) { +- e.nr++; +- } else { +- if (e.nr) +- heap_add_or_replace(&ca->alloc_heap, e, +- -bucket_alloc_cmp, NULL); +- +- e = (struct alloc_heap_entry) { +- .bucket = b, +- .nr = 1, +- .key = key, +- }; +- } +- } +- +- if (e.nr) +- heap_add_or_replace(&ca->alloc_heap, e, +- -bucket_alloc_cmp, NULL); +- +- for (i = 0; i < ca->alloc_heap.used; i++) +- nr += ca->alloc_heap.data[i].nr; +- +- while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { +- nr -= ca->alloc_heap.data[0].nr; +- heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); +- } +- +- up_read(&ca->bucket_lock); +-} +- +-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) +-{ +- size_t i, nr = 0; +- +- ca->inc_gen_needs_gc = 0; +- ca->inc_gen_really_needs_gc = 0; +- ca->buckets_waiting_on_journal = 0; +- +- find_reclaimable_buckets_lru(c, ca); +- +- heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); +- +- for (i = 0; i < ca->alloc_heap.used; i++) +- nr += ca->alloc_heap.data[i].nr; +- +- return nr; +-} +- +-static int bucket_invalidate_btree(struct btree_trans *trans, +- struct bch_dev *ca, u64 b, +- struct bkey_alloc_unpacked *u) +-{ +- struct bch_fs *c = trans->c; +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, +- POS(ca->dev_idx, b), +- BTREE_ITER_CACHED| +- BTREE_ITER_INTENT); +- +- k = bch2_btree_iter_peek_slot(&iter); +- ret = bkey_err(k); +- if (ret) +- goto err; +- +- *u = bch2_alloc_unpack(k); +- u->gen++; +- u->data_type = 0; +- u->dirty_sectors = 0; +- u->cached_sectors = 0; +- u->read_time = atomic64_read(&c->io_clock[READ].now); +- u->write_time = atomic64_read(&c->io_clock[WRITE].now); +- +- ret = bch2_alloc_write(trans, &iter, u, +- BTREE_TRIGGER_BUCKET_INVALIDATE); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; +-} +- +-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, +- u64 *journal_seq, unsigned flags) +-{ +- struct bkey_alloc_unpacked u; +- size_t b; +- u64 commit_seq = 0; +- int ret = 0; +- +- /* +- * If the read-only path is trying to shut down, we can't be generating +- * new btree updates: +- */ +- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) +- return 1; +- +- BUG_ON(!ca->alloc_heap.used || +- !ca->alloc_heap.data[0].nr); +- b = ca->alloc_heap.data[0].bucket; +- +- /* first, put on free_inc and mark as owned by allocator: */ +- percpu_down_read(&c->mark_lock); +- +- bch2_mark_alloc_bucket(c, ca, b, true); +- +- spin_lock(&c->freelist_lock); +- verify_not_on_freelist(c, ca, b); +- BUG_ON(!fifo_push(&ca->free_inc, b)); +- spin_unlock(&c->freelist_lock); +- +- percpu_up_read(&c->mark_lock); +- +- ret = bch2_trans_do(c, NULL, &commit_seq, +- BTREE_INSERT_NOCHECK_RW| +- BTREE_INSERT_NOFAIL| +- BTREE_INSERT_JOURNAL_RESERVED| +- flags, +- bucket_invalidate_btree(&trans, ca, b, &u)); +- +- if (!ret) { +- /* remove from alloc_heap: */ +- struct alloc_heap_entry e, *top = ca->alloc_heap.data; +- +- top->bucket++; +- top->nr--; +- +- 
if (!top->nr) +- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); +- +- /* +- * If we invalidating cached data then we need to wait on the +- * journal commit: +- */ +- if (u.data_type) +- *journal_seq = max(*journal_seq, commit_seq); +- +- /* +- * We already waiting on u.alloc_seq when we filtered out +- * buckets that need journal commit: +- */ +- BUG_ON(*journal_seq > u.journal_seq); +- } else { +- size_t b2; +- +- /* remove from free_inc: */ +- percpu_down_read(&c->mark_lock); +- spin_lock(&c->freelist_lock); +- +- bch2_mark_alloc_bucket(c, ca, b, false); +- +- BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); +- BUG_ON(b != b2); +- +- spin_unlock(&c->freelist_lock); +- percpu_up_read(&c->mark_lock); +- } +- +- return ret < 0 ? ret : 0; +-} +- +-/* +- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: +- */ +-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) +-{ +- u64 journal_seq = 0; +- int ret = 0; +- +- /* Only use nowait if we've already invalidated at least one bucket: */ +- while (!ret && +- !fifo_full(&ca->free_inc) && +- ca->alloc_heap.used) { +- if (kthread_should_stop()) { +- ret = 1; +- break; +- } +- +- ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, +- (!fifo_empty(&ca->free_inc) +- ? BTREE_INSERT_NOWAIT : 0)); +- /* +- * We only want to batch up invalidates when they're going to +- * require flushing the journal: +- */ +- if (!journal_seq) +- break; +- } +- +- /* If we used NOWAIT, don't return the error: */ +- if (!fifo_empty(&ca->free_inc)) +- ret = 0; +- if (ret < 0) +- bch_err(ca, "error invalidating buckets: %i", ret); +- if (ret) +- return ret; +- +- if (journal_seq) +- ret = bch2_journal_flush_seq(&c->journal, journal_seq); +- if (ret) { +- bch_err(ca, "journal error: %i", ret); +- return ret; +- } +- +- return 0; +-} +- +-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) +-{ +- if (ca->allocator_state != new_state) { +- ca->allocator_state = new_state; +- closure_wake_up(&ca->fs->freelist_wait); +- } +-} +- +-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +-{ +- unsigned i; +- int ret = 0; +- +- spin_lock(&c->freelist_lock); +- for (i = 0; i < RESERVE_NR; i++) { +- /* +- * Don't strand buckets on the copygc freelist until +- * after recovery is finished: +- */ +- if (i == RESERVE_MOVINGGC && +- !test_bit(BCH_FS_STARTED, &c->flags)) +- continue; +- +- if (fifo_push(&ca->free[i], b)) { +- fifo_pop(&ca->free_inc, b); +- ret = 1; +- break; +- } +- } +- spin_unlock(&c->freelist_lock); +- +- ca->allocator_state = ret +- ? ALLOCATOR_running +- : ALLOCATOR_blocked_full; +- closure_wake_up(&c->freelist_wait); +- return ret; +-} +- +-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +-{ +- if (!c->opts.nochanges && +- ca->mi.discard && +- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) +- blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), +- ca->mi.bucket_size, GFP_NOFS, 0); +-} +- +-static bool allocator_thread_running(struct bch_dev *ca) +-{ +- unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && +- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) +- ? ALLOCATOR_running +- : ALLOCATOR_stopped; +- alloc_thread_set_state(ca, state); +- return state == ALLOCATOR_running; +-} +- +-static int buckets_available(struct bch_dev *ca, unsigned long gc_count) +-{ +- s64 available = dev_buckets_reclaimable(ca) - +- (gc_count == ca->fs->gc_count ? 
ca->inc_gen_really_needs_gc : 0); +- bool ret = available > 0; +- +- alloc_thread_set_state(ca, ret +- ? ALLOCATOR_running +- : ALLOCATOR_blocked); +- return ret; +-} +- +-/** +- * bch_allocator_thread - move buckets from free_inc to reserves +- * +- * The free_inc FIFO is populated by find_reclaimable_buckets(), and +- * the reserves are depleted by bucket allocation. When we run out +- * of free_inc, try to invalidate some buckets and write out +- * prios and gens. +- */ +-static int bch2_allocator_thread(void *arg) +-{ +- struct bch_dev *ca = arg; +- struct bch_fs *c = ca->fs; +- unsigned long gc_count = c->gc_count; +- size_t nr; +- int ret; +- +- set_freezable(); +- +- while (1) { +- ret = kthread_wait_freezable(allocator_thread_running(ca)); +- if (ret) +- goto stop; +- +- while (!ca->alloc_heap.used) { +- cond_resched(); +- +- ret = kthread_wait_freezable(buckets_available(ca, gc_count)); +- if (ret) +- goto stop; +- +- gc_count = c->gc_count; +- nr = find_reclaimable_buckets(c, ca); +- +- if (!nr && ca->buckets_waiting_on_journal) { +- ret = bch2_journal_flush(&c->journal); +- if (ret) +- goto stop; +- } else if (nr < (ca->mi.nbuckets >> 6) && +- ca->buckets_waiting_on_journal >= nr / 2) { +- bch2_journal_flush_async(&c->journal, NULL); +- } +- +- if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || +- ca->inc_gen_really_needs_gc) && +- c->gc_thread) { +- atomic_inc(&c->kick_gc); +- wake_up_process(c->gc_thread); +- } +- +- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, +- ca->inc_gen_really_needs_gc); +- } +- +- ret = bch2_invalidate_buckets(c, ca); +- if (ret) +- goto stop; +- +- while (!fifo_empty(&ca->free_inc)) { +- u64 b = fifo_peek(&ca->free_inc); +- +- discard_one_bucket(c, ca, b); +- +- ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); +- if (ret) +- goto stop; +- } +- } +-stop: +- alloc_thread_set_state(ca, ALLOCATOR_stopped); +- return 0; +-} +- + /* Startup/shutdown (ro/rw): */ + + void bch2_recalc_capacity(struct bch_fs *c) +@@ -1139,7 +646,7 @@ void bch2_recalc_capacity(struct bch_fs *c) + u64 capacity = 0, reserved_sectors = 0, gc_reserve; + unsigned bucket_size_max = 0; + unsigned long ra_pages = 0; +- unsigned i, j; ++ unsigned i; + + lockdep_assert_held(&c->state_lock); + +@@ -1170,8 +677,9 @@ void bch2_recalc_capacity(struct bch_fs *c) + * allocations for foreground writes must wait - + * not -ENOSPC calculations. 
+ */ +- for (j = 0; j < RESERVE_NONE; j++) +- dev_reserve += ca->free[j].size; ++ ++ dev_reserve += ca->nr_btree_reserve * 2; ++ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ + + dev_reserve += 1; /* btree write point */ + dev_reserve += 1; /* copygc write point */ +@@ -1227,8 +735,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) + { + unsigned i; + +- BUG_ON(ca->alloc_thread); +- + /* First, remove device from allocation groups: */ + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) +@@ -1302,61 +808,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) + set_bit(ca->dev_idx, c->rw_devs[i].d); + } + +-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) +-{ +- if (ca->alloc_thread) +- closure_wait_event(&c->freelist_wait, +- ca->allocator_state != ALLOCATOR_running); +-} +- +-/* stop allocator thread: */ +-void bch2_dev_allocator_stop(struct bch_dev *ca) +-{ +- struct task_struct *p; +- +- p = rcu_dereference_protected(ca->alloc_thread, 1); +- ca->alloc_thread = NULL; +- +- /* +- * We need an rcu barrier between setting ca->alloc_thread = NULL and +- * the thread shutting down to avoid bch2_wake_allocator() racing: +- * +- * XXX: it would be better to have the rcu barrier be asynchronous +- * instead of blocking us here +- */ +- synchronize_rcu(); +- +- if (p) { +- kthread_stop(p); +- put_task_struct(p); +- } +-} +- +-/* start allocator thread: */ +-int bch2_dev_allocator_start(struct bch_dev *ca) +-{ +- struct task_struct *p; +- +- /* +- * allocator thread already started? +- */ +- if (ca->alloc_thread) +- return 0; +- +- p = kthread_create(bch2_allocator_thread, ca, +- "bch-alloc/%s", ca->name); +- if (IS_ERR(p)) { +- bch_err(ca->fs, "error creating allocator thread: %li", +- PTR_ERR(p)); +- return PTR_ERR(p); +- } +- +- get_task_struct(p); +- rcu_assign_pointer(ca->alloc_thread, p); +- wake_up_process(p); +- return 0; +-} +- + void bch2_fs_allocator_background_init(struct bch_fs *c) + { + spin_lock_init(&c->freelist_lock); +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index cf0c71313e1c..3c72cc37174b 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -8,8 +8,6 @@ + #include "debug.h" + #include "super.h" + +-extern const char * const bch2_allocator_states[]; +- + struct bkey_alloc_unpacked { + u64 journal_seq; + u64 bucket; +@@ -142,42 +140,11 @@ int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); + int bch2_fs_freespace_init(struct bch_fs *); + +-static inline void bch2_wake_allocator(struct bch_dev *ca) +-{ +- struct task_struct *p; +- +- rcu_read_lock(); +- p = rcu_dereference(ca->alloc_thread); +- if (p) +- wake_up_process(p); +- rcu_read_unlock(); +-} +- +-static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, +- size_t bucket) +-{ +- if (bch2_expensive_debug_checks) { +- size_t iter; +- long i; +- unsigned j; +- +- for (j = 0; j < RESERVE_NR; j++) +- fifo_for_each_entry(i, &ca->free[j], iter) +- BUG_ON(i == bucket); +- fifo_for_each_entry(i, &ca->free_inc, iter) +- BUG_ON(i == bucket); +- } +-} +- + void bch2_recalc_capacity(struct bch_fs *); + + void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); + void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); + +-void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); +-void bch2_dev_allocator_stop(struct bch_dev *); +-int bch2_dev_allocator_start(struct bch_dev *); +- + void 
bch2_fs_allocator_background_init(struct bch_fs *); + + #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 9b81ed2665c8..e0dc585b50da 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -14,13 +14,18 @@ + #include "bcachefs.h" + #include "alloc_background.h" + #include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" + #include "btree_gc.h" + #include "buckets.h" ++#include "buckets_waiting_for_journal.h" + #include "clock.h" + #include "debug.h" + #include "disk_groups.h" + #include "ec.h" ++#include "error.h" + #include "io.h" ++#include "journal.h" + + #include + #include +@@ -78,7 +83,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) + percpu_down_read(&c->mark_lock); + spin_lock(&ob->lock); + +- bch2_mark_alloc_bucket(c, ca, ob->bucket, false); + ob->valid = false; + ob->data_type = 0; + +@@ -178,39 +182,28 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) + } + } + +-/** +- * bch_bucket_alloc - allocate a single bucket from a specific device +- * +- * Returns index of bucket on success, 0 on failure +- * */ +-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, +- enum alloc_reserve reserve, +- bool may_alloc_partial, +- struct closure *cl) ++static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ struct bkey_alloc_unpacked a, ++ size_t *need_journal_commit, ++ struct closure *cl) + { + struct open_bucket *ob; +- long b = 0; + +- spin_lock(&c->freelist_lock); ++ if (unlikely(ca->buckets_nouse && test_bit(a.bucket, ca->buckets_nouse))) ++ return NULL; + +- if (may_alloc_partial) { +- int i; +- +- for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { +- ob = c->open_buckets + ca->open_buckets_partial[i]; +- +- if (reserve <= ob->alloc_reserve) { +- array_remove_item(ca->open_buckets_partial, +- ca->open_buckets_partial_nr, +- i); +- ob->on_partial_list = false; +- ob->alloc_reserve = reserve; +- spin_unlock(&c->freelist_lock); +- return ob; +- } +- } ++ if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) ++ return NULL; ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ca->dev_idx, a.bucket)) { ++ (*need_journal_commit)++; ++ return NULL; + } + ++ spin_lock(&c->freelist_lock); ++ + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); +@@ -219,36 +212,17 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + c->blocked_allocate_open_bucket = local_clock(); + + spin_unlock(&c->freelist_lock); ++ + trace_open_bucket_alloc_fail(ca, reserve); + return ERR_PTR(-OPEN_BUCKETS_EMPTY); + } + +- if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) +- goto out; +- +- switch (reserve) { +- case RESERVE_BTREE_MOVINGGC: +- case RESERVE_MOVINGGC: +- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) +- goto out; +- break; +- default: +- break; ++ /* Recheck under lock: */ ++ if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) { ++ spin_unlock(&c->freelist_lock); ++ return NULL; + } + +- if (cl) +- closure_wait(&c->freelist_wait, cl); +- +- if (!c->blocked_allocate) +- c->blocked_allocate = local_clock(); +- +- spin_unlock(&c->freelist_lock); +- +- trace_bucket_alloc_fail(ca, reserve); +- return ERR_PTR(-FREELIST_EMPTY); +-out: +- verify_not_on_freelist(c, ca, b); +- + ob = 
bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); +@@ -257,8 +231,8 @@ out: + ob->sectors_free = ca->mi.bucket_size; + ob->alloc_reserve = reserve; + ob->dev = ca->dev_idx; +- ob->gen = *bucket_gen(ca, b); +- ob->bucket = b; ++ ob->gen = a.gen; ++ ob->bucket = a.bucket; + spin_unlock(&ob->lock); + + ca->nr_open_buckets++; +@@ -280,12 +254,238 @@ out: + + spin_unlock(&c->freelist_lock); + +- bch2_wake_allocator(ca); +- + trace_bucket_alloc(ca, reserve); + return ob; + } + ++static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, ++ enum alloc_reserve reserve, u64 free_entry, ++ size_t *need_journal_commit, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob; ++ struct bkey_alloc_unpacked a; ++ u64 b = free_entry & ~(~0ULL << 56); ++ unsigned genbits = free_entry >> 56; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) { ++ ob = ERR_PTR(ret); ++ goto err; ++ } ++ ++ a = bch2_alloc_unpack(k); ++ ++ if (bch2_fs_inconsistent_on(bucket_state(a) != BUCKET_free, c, ++ "non free bucket in freespace btree (state %s)\n" ++ " %s\n" ++ " at %llu (genbits %u)", ++ bch2_bucket_states[bucket_state(a)], ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ free_entry, genbits)) { ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c, ++ "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" ++ " %s", ++ genbits, alloc_freespace_genbits(a) >> 56, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c, ++ "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)", ++ b, ca->mi.first_bucket, ca->mi.nbuckets)) { ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ ob = __try_alloc_bucket(c, ca, reserve, a, need_journal_commit, cl); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ob; ++} ++ ++static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve) ++{ ++ struct open_bucket *ob; ++ int i; ++ ++ spin_lock(&c->freelist_lock); ++ ++ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { ++ ob = c->open_buckets + ca->open_buckets_partial[i]; ++ ++ if (reserve <= ob->alloc_reserve) { ++ array_remove_item(ca->open_buckets_partial, ++ ca->open_buckets_partial_nr, ++ i); ++ ob->on_partial_list = false; ++ ob->alloc_reserve = reserve; ++ spin_unlock(&c->freelist_lock); ++ return ob; ++ } ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ return NULL; ++} ++ ++/* ++ * This path is for before the freespace btree is initialized: ++ * ++ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & ++ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx ++ */ ++static noinline struct open_bucket * ++bch2_bucket_alloc_trans_early(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *b, ++ size_t *need_journal_commit, ++ struct closure *cl) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob = NULL; ++ int ret; ++ ++ *b = max_t(u64, *b, ca->mi.first_bucket); ++ *b = max_t(u64, *b, ca->new_fs_bucket_idx); ++ ++ for_each_btree_key(trans, iter, 
BTREE_ID_alloc, POS(ca->dev_idx, *b), ++ BTREE_ITER_SLOTS, k, ret) { ++ struct bkey_alloc_unpacked a; ++ ++ if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ break; ++ ++ if (ca->new_fs_bucket_idx && ++ is_superblock_bucket(ca, k.k->p.offset)) ++ continue; ++ ++ a = bch2_alloc_unpack(k); ++ ++ if (bucket_state(a) != BUCKET_free) ++ continue; ++ ++ ob = __try_alloc_bucket(trans->c, ca, reserve, a, ++ need_journal_commit, cl); ++ if (ob) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ *b = iter.pos.offset; ++ ++ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); ++} ++ ++static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *b, ++ size_t *need_journal_commit, ++ struct closure *cl) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob = NULL; ++ int ret; ++ ++ if (unlikely(!ca->mi.freespace_initialized)) ++ return bch2_bucket_alloc_trans_early(trans, ca, reserve, b, ++ need_journal_commit, cl); ++ ++ BUG_ON(ca->new_fs_bucket_idx); ++ ++ for_each_btree_key(trans, iter, BTREE_ID_freespace, ++ POS(ca->dev_idx, *b), 0, k, ret) { ++ if (k.k->p.inode != ca->dev_idx) ++ break; ++ ++ for (*b = max(*b, bkey_start_offset(k.k)); ++ *b != k.k->p.offset && !ob; ++ (*b)++) { ++ if (btree_trans_too_many_iters(trans)) { ++ ob = ERR_PTR(-EINTR); ++ break; ++ } ++ ++ ob = try_alloc_bucket(trans, ca, reserve, *b, ++ need_journal_commit, cl); ++ } ++ if (ob) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); ++} ++ ++/** ++ * bch_bucket_alloc - allocate a single bucket from a specific device ++ * ++ * Returns index of bucket on success, 0 on failure ++ * */ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct open_bucket *ob = NULL; ++ size_t need_journal_commit = 0; ++ u64 avail = dev_buckets_available(ca, reserve); ++ u64 b = 0; ++ int ret; ++ ++ if (may_alloc_partial) { ++ ob = try_alloc_partial_bucket(c, ca, reserve); ++ if (ob) ++ return ob; ++ } ++again: ++ if (!avail) { ++ if (cl) { ++ closure_wait(&c->freelist_wait, cl); ++ /* recheck after putting ourself on waitlist */ ++ avail = dev_buckets_available(ca, reserve); ++ if (avail) { ++ closure_wake_up(&c->freelist_wait); ++ goto again; ++ } ++ } ++ ++ if (!c->blocked_allocate) ++ c->blocked_allocate = local_clock(); ++ ++ trace_bucket_alloc_fail(ca, reserve); ++ return ERR_PTR(-FREELIST_EMPTY); ++ } ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ++ ca, reserve, &b, ++ &need_journal_commit, cl))); ++ ++ if (need_journal_commit * 2 > avail) ++ bch2_journal_flush_async(&c->journal, NULL); ++ ++ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); ++} ++ + static int __dev_stripe_cmp(struct dev_stripe_state *stripe, + unsigned l, unsigned r) + { +@@ -313,7 +513,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, + struct dev_stripe_state *stripe) + { + u64 *v = stripe->next_alloc + ca->dev_idx; +- u64 free_space = dev_buckets_available(ca); ++ u64 free_space = dev_buckets_available(ca, RESERVE_NONE); + u64 free_space_inv = free_space + ? 
div64_u64(1ULL << 48, free_space) + : 1ULL << 48; +@@ -364,6 +564,7 @@ int bch2_bucket_alloc_set(struct bch_fs *c, + { + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); ++ unsigned dev; + struct bch_dev *ca; + int ret = -INSUFFICIENT_DEVICES; + unsigned i; +@@ -373,30 +574,43 @@ int bch2_bucket_alloc_set(struct bch_fs *c, + for (i = 0; i < devs_sorted.nr; i++) { + struct open_bucket *ob; + +- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ dev = devs_sorted.devs[i]; ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ + if (!ca) + continue; + +- if (!ca->mi.durability && *have_cache) ++ if (!ca->mi.durability && *have_cache) { ++ percpu_ref_put(&ca->ref); + continue; ++ } + + ob = bch2_bucket_alloc(c, ca, reserve, + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ if (!IS_ERR(ob)) ++ bch2_dev_stripe_increment(ca, stripe); ++ percpu_ref_put(&ca->ref); ++ + if (IS_ERR(ob)) { + ret = PTR_ERR(ob); + + if (cl) +- return ret; ++ break; + continue; + } + + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, flags, ob); + +- bch2_dev_stripe_increment(ca, stripe); +- +- if (*nr_effective >= nr_replicas) +- return 0; ++ if (*nr_effective >= nr_replicas) { ++ ret = 0; ++ break; ++ } + } + + return ret; +@@ -564,9 +778,6 @@ static int open_bucket_add_buckets(struct bch_fs *c, + if (*nr_effective >= nr_replicas) + return 0; + +- percpu_down_read(&c->mark_lock); +- rcu_read_lock(); +- + retry_blocking: + /* + * Try nonblocking first, so that if one device is full we'll try from +@@ -580,9 +791,6 @@ retry_blocking: + goto retry_blocking; + } + +- rcu_read_unlock(); +- percpu_up_read(&c->mark_lock); +- + return ret; + } + +@@ -863,7 +1071,7 @@ err: + case -INSUFFICIENT_DEVICES: + return ERR_PTR(-EROFS); + default: +- BUG(); ++ return ERR_PTR(ret); + } + } + +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index 409232e3d998..22e1fbda9046 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -10,18 +10,6 @@ + + struct ec_bucket_buf; + +-#define ALLOC_THREAD_STATES() \ +- x(stopped) \ +- x(running) \ +- x(blocked) \ +- x(blocked_full) +- +-enum allocator_states { +-#define x(n) ALLOCATOR_##n, +- ALLOC_THREAD_STATES() +-#undef x +-}; +- + enum alloc_reserve { + RESERVE_BTREE_MOVINGGC = -2, + RESERVE_BTREE = -1, +@@ -30,8 +18,6 @@ enum alloc_reserve { + RESERVE_NR = 2, + }; + +-typedef FIFO(long) alloc_fifo; +- + #define OPEN_BUCKETS_COUNT 1024 + + #define WRITE_POINT_HASH_NR 32 +@@ -94,12 +80,4 @@ struct write_point_specifier { + unsigned long v; + }; + +-struct alloc_heap_entry { +- size_t bucket; +- size_t nr; +- unsigned long key; +-}; +- +-typedef HEAP(struct alloc_heap_entry) alloc_heap; +- + #endif /* _BCACHEFS_ALLOC_TYPES_H */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 7350fb6a8355..c82a9e1aab8d 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -462,34 +462,17 @@ struct bch_dev { + + /* Allocator: */ + u64 new_fs_bucket_idx; +- struct task_struct __rcu *alloc_thread; + +- /* +- * free: Buckets that are ready to be used +- * +- * free_inc: Incoming buckets - these are buckets that currently have +- * cached data in them, and we can't reuse them until after we write +- * their new gen to disk. 
After prio_write() finishes writing the new +- * gens/prios, they'll be moved to the free list (and possibly discarded +- * in the process) +- */ +- alloc_fifo free[RESERVE_NR]; +- alloc_fifo free_inc; + unsigned nr_open_buckets; ++ unsigned nr_btree_reserve; + + open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial_nr; + +- size_t fifo_last_bucket; +- + size_t inc_gen_needs_gc; + size_t inc_gen_really_needs_gc; + size_t buckets_waiting_on_journal; + +- enum allocator_states allocator_state; +- +- alloc_heap alloc_heap; +- + atomic64_t rebalance_work; + + struct journal_device journal; +@@ -511,8 +494,6 @@ struct bch_dev { + enum { + /* startup: */ + BCH_FS_ALLOC_CLEAN, +- BCH_FS_ALLOCATOR_RUNNING, +- BCH_FS_ALLOCATOR_STOPPING, + BCH_FS_INITIAL_GC_DONE, + BCH_FS_INITIAL_GC_UNFIXED, + BCH_FS_TOPOLOGY_REPAIR_DONE, +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index a8c566fd12bb..0bab695bcb41 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1673,9 +1673,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) + */ + int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) + { +- struct bch_dev *ca; + u64 start_time = local_clock(); +- unsigned i, iter = 0; ++ unsigned iter = 0; + int ret; + + lockdep_assert_held(&c->state_lock); +@@ -1776,13 +1775,6 @@ out: + trace_gc_end(c); + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + +- /* +- * Wake up allocator in case it was waiting for buckets +- * because of not being able to inc gens +- */ +- for_each_member_device(ca, c, i) +- bch2_wake_allocator(ca); +- + /* + * At startup, allocations can happen directly instead of via the + * allocator thread - issue wakeup in case they blocked on gc_lock: +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index b5178d3067a9..22d8d185a414 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -292,11 +292,6 @@ static inline int bucket_sectors_fragmented(struct bch_dev *ca, + : 0; + } + +-static inline int is_stripe_data_bucket(struct bucket_mark m) +-{ +- return m.stripe && m.data_type != BCH_DATA_parity; +-} +- + static inline enum bch_data_type bucket_type(struct bucket_mark m) + { + return m.cached_sectors && !m.dirty_sectors +@@ -347,9 +342,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + + preempt_enable(); +- +- if (!is_available_bucket(old) && is_available_bucket(new)) +- bch2_wake_allocator(ca); + } + + static inline int __update_replicas(struct bch_fs *c, +@@ -484,19 +476,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, + update_replicas_list(trans, &r.e, sectors); + } + +-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, +- size_t b, bool owned_by_allocator) +-{ +- struct bucket *g = bucket(ca, b); +- struct bucket_mark old, new; +- +- old = bucket_cmpxchg(g, new, ({ +- new.owned_by_allocator = owned_by_allocator; +- })); +- +- BUG_ON(owned_by_allocator == old.owned_by_allocator); +-} +- + int bch2_mark_alloc(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +@@ -555,6 +534,10 @@ int bch2_mark_alloc(struct btree_trans *trans, + } + } + ++ if (!new_u.data_type && ++ (!new_u.journal_seq || new_u.journal_seq < c->journal.flushed_seq_ondisk)) ++ closure_wake_up(&c->freelist_wait); ++ + if (bucket_state(new_u) == BUCKET_need_gc_gens) { + atomic_inc(&c->kick_gc); + 
wake_up_process(c->gc_thread); +@@ -578,7 +561,6 @@ int bch2_mark_alloc(struct btree_trans *trans, + + g->io_time[READ] = new_u.read_time; + g->io_time[WRITE] = new_u.write_time; +- g->oldest_gen = new_u.oldest_gen; + g->gen_valid = 1; + g->stripe = new_u.stripe; + g->stripe_redundancy = new_u.stripe_redundancy; +@@ -2069,24 +2051,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + struct bucket_array *buckets = NULL, *old_buckets = NULL; + struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; + unsigned long *buckets_nouse = NULL; +- alloc_fifo free[RESERVE_NR]; +- alloc_fifo free_inc; +- alloc_heap alloc_heap; +- +- size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, +- ca->mi.bucket_size / btree_sectors(c)); +- /* XXX: these should be tunable */ +- size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); +- size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); +- size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), +- btree_reserve * 2); + bool resize = ca->buckets[0] != NULL; + int ret = -ENOMEM; +- unsigned i; +- +- memset(&free, 0, sizeof(free)); +- memset(&free_inc, 0, sizeof(free_inc)); +- memset(&alloc_heap, 0, sizeof(alloc_heap)); + + if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + + nbuckets * sizeof(struct bucket), +@@ -2096,12 +2062,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + (c->opts.buckets_nouse && + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), +- GFP_KERNEL|__GFP_ZERO))) || +- !init_fifo(&free[RESERVE_MOVINGGC], +- copygc_reserve, GFP_KERNEL) || +- !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || +- !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || +- !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) ++ GFP_KERNEL|__GFP_ZERO)))) + goto err; + + buckets->first_bucket = ca->mi.first_bucket; +@@ -2147,18 +2108,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + up_write(&c->gc_lock); + } + +- spin_lock(&c->freelist_lock); +- for (i = 0; i < RESERVE_NR; i++) { +- fifo_move(&free[i], &ca->free[i]); +- swap(ca->free[i], free[i]); +- } +- fifo_move(&free_inc, &ca->free_inc); +- swap(ca->free_inc, free_inc); +- spin_unlock(&c->freelist_lock); +- +- /* with gc lock held, alloc_heap can't be in use: */ +- swap(ca->alloc_heap, alloc_heap); +- + nbuckets = ca->mi.nbuckets; + + if (resize) +@@ -2166,10 +2115,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + + ret = 0; + err: +- free_heap(&alloc_heap); +- free_fifo(&free_inc); +- for (i = 0; i < RESERVE_NR; i++) +- free_fifo(&free[i]); + kvpfree(buckets_nouse, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + if (bucket_gens) +@@ -2184,10 +2129,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) + { + unsigned i; + +- free_heap(&ca->alloc_heap); +- free_fifo(&ca->free_inc); +- for (i = 0; i < RESERVE_NR; i++) +- free_fifo(&ca->free[i]); + kvpfree(ca->buckets_nouse, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 07fe5cddbb41..a05d8adc8372 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -58,11 +58,6 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) + return __bucket(ca, b, true); + } + +-static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +-{ +- return __bucket(ca, b, false); +-} +- + static inline struct 
bucket_gens *bucket_gens(struct bch_dev *ca) + { + return rcu_dereference_check(ca->bucket_gens, +@@ -143,50 +138,50 @@ static inline bool is_available_bucket(struct bucket_mark mark) + struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); + + static inline u64 __dev_buckets_available(struct bch_dev *ca, +- struct bch_dev_usage stats) ++ struct bch_dev_usage stats, ++ enum alloc_reserve reserve) + { +- u64 total = ca->mi.nbuckets - ca->mi.first_bucket; ++ s64 total = ca->mi.nbuckets - ca->mi.first_bucket; ++ s64 reserved = 0; ++ ++ switch (reserve) { ++ case RESERVE_NONE: ++ reserved += ca->mi.nbuckets >> 6; ++ fallthrough; ++ case RESERVE_MOVINGGC: ++ reserved += ca->nr_btree_reserve; ++ fallthrough; ++ case RESERVE_BTREE: ++ reserved += ca->nr_btree_reserve; ++ fallthrough; ++ case RESERVE_BTREE_MOVINGGC: ++ break; ++ default: ++ BUG(); ++ } + + if (WARN_ONCE(stats.buckets_unavailable > total, + "buckets_unavailable overflow (%llu > %llu)\n", + stats.buckets_unavailable, total)) + return 0; + +- return total - stats.buckets_unavailable; +-} +- +-static inline u64 dev_buckets_available(struct bch_dev *ca) +-{ +- return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); ++ return max_t(s64, 0, ++ total - ++ stats.buckets_unavailable - ++ ca->nr_open_buckets - ++ reserved); + } + +-static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca, +- struct bch_dev_usage stats) ++static inline u64 dev_buckets_available(struct bch_dev *ca, ++ enum alloc_reserve reserve) + { +- struct bch_fs *c = ca->fs; +- s64 available = __dev_buckets_available(ca, stats); +- unsigned i; +- +- spin_lock(&c->freelist_lock); +- for (i = 0; i < RESERVE_NR; i++) +- available -= fifo_used(&ca->free[i]); +- available -= fifo_used(&ca->free_inc); +- available -= ca->nr_open_buckets; +- spin_unlock(&c->freelist_lock); +- +- return max(available, 0LL); +-} +- +-static inline u64 dev_buckets_reclaimable(struct bch_dev *ca) +-{ +- return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca)); ++ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); + } + + /* Filesystem usage: */ + + static inline unsigned fs_usage_u64s(struct bch_fs *c) + { +- + return sizeof(struct bch_fs_usage) / sizeof(u64) + + READ_ONCE(c->replicas.nr); + } +@@ -214,7 +209,6 @@ bch2_fs_usage_read_short(struct bch_fs *); + + void bch2_fs_usage_initialize(struct bch_fs *); + +-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); + void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 2c73dc60b838..2280aee59964 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -14,7 +14,6 @@ struct bucket_mark { + struct { + u8 gen; + u8 data_type:3, +- owned_by_allocator:1, + stripe:1; + u16 dirty_sectors; + u16 cached_sectors; +@@ -29,7 +28,6 @@ struct bucket { + }; + + u64 io_time[2]; +- u8 oldest_gen; + unsigned gen_valid:1; + u8 stripe_redundancy; + u32 stripe; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 6027a7d42981..7629c34b7cd0 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1295,9 +1295,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + BUG_ON(nr_have_data > h->s->nr_data); + BUG_ON(nr_have_parity > h->s->nr_parity); + +- percpu_down_read(&c->mark_lock); +- rcu_read_lock(); +- + buckets.nr = 0; + if (nr_have_parity < h->s->nr_parity) { + ret = 
bch2_bucket_alloc_set(c, &buckets, +@@ -1324,7 +1321,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + } + + if (ret) +- goto err; ++ return ret; + } + + buckets.nr = 0; +@@ -1352,12 +1349,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + } + + if (ret) +- goto err; ++ return ret; + } +-err: +- rcu_read_unlock(); +- percpu_up_read(&c->mark_lock); +- return ret; ++ ++ return 0; + } + + /* XXX: doesn't obey target: */ +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index de503dbb0f96..e33085fe978f 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -801,10 +801,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + break; + } + } else { +- rcu_read_lock(); + ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_NONE, + false, cl); +- rcu_read_unlock(); + if (IS_ERR(ob[nr_got])) { + ret = cl ? -EAGAIN : -ENOSPC; + break; +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index fb24ca212b09..2099044c7083 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1372,6 +1372,8 @@ static void journal_write_done(struct closure *cl) + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; ++ ++ closure_wake_up(&c->freelist_wait); + } + } else if (!j->err_seq || seq < j->err_seq) + j->err_seq = seq; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index c82ecff3efe2..0fb60d8581a7 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -119,18 +119,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + return DATA_SKIP; + } + +-static bool have_copygc_reserve(struct bch_dev *ca) +-{ +- bool ret; +- +- spin_lock(&ca->fs->freelist_lock); +- ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || +- ca->allocator_state != ALLOCATOR_running; +- spin_unlock(&ca->fs->freelist_lock); +- +- return ret; +-} +- + static inline int fragmentation_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) +@@ -262,11 +250,10 @@ static int bch2_copygc(struct bch_fs *c) + } + + for_each_rw_member(ca, c, dev_idx) { +- closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); ++ s64 avail = min(dev_buckets_available(ca, RESERVE_MOVINGGC), ++ ca->mi.nbuckets >> 6); + +- spin_lock(&ca->fs->freelist_lock); +- sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; +- spin_unlock(&ca->fs->freelist_lock); ++ sectors_reserved += avail * ca->mi.bucket_size; + } + + ret = walk_buckets_to_copygc(c); +@@ -367,8 +354,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + +- fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * +- ca->mi.bucket_size) >> 1); ++ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_NONE) * ++ ca->mi.bucket_size) >> 1); + fragmented = usage.d[BCH_DATA_user].fragmented; + + wait = min(wait, max(0LL, fragmented_allowed - fragmented)); +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 4d01a01ea5c5..b7e735d7774f 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1373,6 +1373,7 @@ int bch2_fs_initialize(struct bch_fs *c) + * Write out the superblock and journal buckets, now that we can do + * btree updates + */ ++ bch_verbose(c, "marking superblocks"); + err = "error marking superblock and journal"; + for_each_member_device(ca, c, i) { + ret = 
bch2_trans_mark_dev_sb(c, ca); +@@ -1384,6 +1385,7 @@ int bch2_fs_initialize(struct bch_fs *c) + ca->new_fs_bucket_idx = 0; + } + ++ bch_verbose(c, "initializing freespace"); + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); + if (ret) +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 019cbf32d40e..5857f057497b 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -199,17 +199,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) + */ + bch2_journal_flush_all_pins(&c->journal); + +- /* +- * If the allocator threads didn't all start up, the btree updates to +- * write out alloc info aren't going to work: +- */ +- if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) +- goto nowrote_alloc; +- + bch_verbose(c, "flushing journal and stopping allocators"); + + bch2_journal_flush_all_pins(&c->journal); +- set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); + + do { + clean_passes++; +@@ -234,17 +226,11 @@ static void __bch2_fs_read_only(struct bch_fs *c) + bch_verbose(c, "flushing journal and stopping allocators complete"); + + set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); +-nowrote_alloc: ++ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + flush_work(&c->btree_interior_update_work); + +- for_each_member_device(ca, c, i) +- bch2_dev_allocator_stop(ca); +- +- clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); +- clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); +- + bch2_fs_journal_stop(&c->journal); + + /* +@@ -280,10 +266,6 @@ void bch2_fs_read_only(struct bch_fs *c) + /* + * Block new foreground-end write operations from starting - any new + * writes will return -EROFS: +- * +- * (This is really blocking new _allocations_, writes to previously +- * allocated space can still happen until stopping the allocator in +- * bch2_dev_allocator_stop()). 
+ */ + percpu_ref_kill(&c->writes); + +@@ -412,20 +394,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + +- for_each_rw_member(ca, c, i) { +- ret = bch2_dev_allocator_start(ca); +- if (ret) { +- bch_err(c, "error starting allocator threads"); +- percpu_ref_put(&ca->io_ref); +- goto err; +- } +- } +- +- set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); +- +- for_each_rw_member(ca, c, i) +- bch2_wake_allocator(ca); +- + if (!early) { + ret = bch2_fs_read_write_late(c); + if (ret) +@@ -941,20 +909,6 @@ int bch2_fs_start(struct bch_fs *c) + + set_bit(BCH_FS_STARTED, &c->flags); + +- /* +- * Allocator threads don't start filling copygc reserve until after we +- * set BCH_FS_STARTED - wake them now: +- * +- * XXX ugly hack: +- * Need to set ca->allocator_state here instead of relying on the +- * allocator threads to do it to avoid racing with the copygc threads +- * checking it and thinking they have no alloc reserve: +- */ +- for_each_online_member(ca, c, i) { +- ca->allocator_state = ALLOCATOR_running; +- bch2_wake_allocator(ca); +- } +- + if (c->opts.read_only || c->opts.nochanges) { + bch2_fs_read_only(c); + } else { +@@ -1046,8 +1000,6 @@ static void bch2_dev_release(struct kobject *kobj) + + static void bch2_dev_free(struct bch_dev *ca) + { +- bch2_dev_allocator_stop(ca); +- + cancel_work_sync(&ca->io_error_work); + + if (ca->kobj.state_in_sysfs && +@@ -1162,6 +1114,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + ca->mi = bch2_mi_to_cpu(member); + ca->uuid = member->uuid; + ++ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ++ ca->mi.bucket_size / btree_sectors(c)); ++ + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, + 0, GFP_KERNEL) || + percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, +@@ -1211,12 +1166,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) + + ca->fs = c; + +- if (ca->mi.state == BCH_MEMBER_STATE_rw && +- bch2_dev_allocator_start(ca)) { +- bch2_dev_free(ca); +- goto err; +- } +- + bch2_dev_attach(c, ca, dev_idx); + out: + pr_verbose_init(c->opts, "ret %i", ret); +@@ -1402,14 +1351,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) + /* + * The allocator thread itself allocates btree nodes, so stop it first: + */ +- bch2_dev_allocator_stop(ca); + bch2_dev_allocator_remove(c, ca); + bch2_dev_journal_stop(&c->journal, ca); + + bch2_copygc_start(c); + } + +-static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ++static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) + { + lockdep_assert_held(&c->state_lock); + +@@ -1417,8 +1365,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) + + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); +- +- return bch2_dev_allocator_start(ca); + } + + int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, +@@ -1445,7 +1391,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + mutex_unlock(&c->sb_lock); + + if (new_state == BCH_MEMBER_STATE_rw) +- ret = __bch2_dev_read_write(c, ca); ++ __bch2_dev_read_write(c, ca); + + rebalance_wakeup(c); + +@@ -1707,13 +1653,8 @@ have_slot: + + ca->new_fs_bucket_idx = 0; + +- if (ca->mi.state == BCH_MEMBER_STATE_rw) { +- ret = __bch2_dev_read_write(c, ca); +- if (ret) { +- bch_err(c, "device add error: error going RW on new device: %i", ret); +- goto err_late; +- } +- } ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); + + 
up_write(&c->state_lock); + return 0; +@@ -1773,11 +1714,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path) + goto err; + } + +- if (ca->mi.state == BCH_MEMBER_STATE_rw) { +- ret = __bch2_dev_read_write(c, ca); +- if (ret) +- goto err; +- } ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 3d6ece515a88..1b5ed7adc261 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -170,7 +170,6 @@ read_attribute(congested); + + read_attribute(btree_avg_write_size); + +-read_attribute(reserve_stats); + read_attribute(btree_cache_size); + read_attribute(compression_stats); + read_attribute(journal_debug); +@@ -185,7 +184,6 @@ read_attribute(internal_uuid); + + read_attribute(has_data); + read_attribute(alloc_debug); +-write_attribute(wake_allocator); + + read_attribute(read_realloc_races); + read_attribute(extent_migrate_done); +@@ -698,24 +696,6 @@ struct attribute *bch2_fs_time_stats_files[] = { + NULL + }; + +-static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) +-{ +- enum alloc_reserve i; +- +- spin_lock(&ca->fs->freelist_lock); +- +- pr_buf(out, "free_inc:\t%zu\t%zu\n", +- fifo_used(&ca->free_inc), +- ca->free_inc.size); +- +- for (i = 0; i < RESERVE_NR; i++) +- pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, +- fifo_used(&ca->free[i]), +- ca->free[i].size); +- +- spin_unlock(&ca->fs->freelist_lock); +-} +- + static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + { + struct bch_fs *c = ca->fs; +@@ -741,9 +721,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + "ec\t%16llu\n" + "available%15llu\n" + "\n" +- "free_inc\t\t%zu/%zu\n" +- "free[RESERVE_MOVINGGC]\t%zu/%zu\n" +- "free[RESERVE_NONE]\t%zu/%zu\n" + "freelist_wait\t\t%s\n" + "open buckets allocated\t%u\n" + "open buckets this dev\t%u\n" +@@ -751,13 +728,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + "open_buckets_wait\t%s\n" + "open_buckets_btree\t%u\n" + "open_buckets_user\t%u\n" +- "btree reserve cache\t%u\n" +- "thread state:\t\t%s\n", ++ "btree reserve cache\t%u\n", + stats.buckets_ec, +- __dev_buckets_available(ca, stats), +- fifo_used(&ca->free_inc), ca->free_inc.size, +- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, +- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ++ __dev_buckets_available(ca, stats, RESERVE_NONE), + c->freelist_wait.list.first ? "waiting" : "empty", + OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, + ca->nr_open_buckets, +@@ -765,8 +738,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + c->open_buckets_wait.list.first ? 
"waiting" : "empty", + nr[BCH_DATA_btree], + nr[BCH_DATA_user], +- c->btree_reserve_cache_nr, +- bch2_allocator_states[ca->allocator_state]); ++ c->btree_reserve_cache_nr); + } + + static const char * const bch2_rw[] = { +@@ -841,9 +813,6 @@ SHOW(bch2_dev) + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + +- if (attr == &sysfs_reserve_stats) +- reserve_stats_to_text(out, ca); +- + if (attr == &sysfs_alloc_debug) + dev_alloc_debug_to_text(out, ca); + +@@ -883,9 +852,6 @@ STORE(bch2_dev) + return ret; + } + +- if (attr == &sysfs_wake_allocator) +- bch2_wake_allocator(ca); +- + return size; + } + SYSFS_OPS(bch2_dev); +@@ -911,11 +877,8 @@ struct attribute *bch2_dev_files[] = { + &sysfs_io_latency_stats_write, + &sysfs_congested, + +- &sysfs_reserve_stats, +- + /* debug: */ + &sysfs_alloc_debug, +- &sysfs_wake_allocator, + NULL + }; + +-- +cgit v1.2.3 + + +From 781bc07e31f8b299ad139d2ce6b32148f4260cdc Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Feb 2022 04:32:19 -0500 +Subject: bcachefs: New discard implementation + +In the old allocator code, buckets would be discarded just prior to +being used - this made sense in bcache where we were discarding buckets +just after invalidating the cached data they contain, but in a +filesystem where we typically have more free space we want to be +discarding buckets when they become empty. + +This patch implements the new behaviour - it checks the need_discard +btree for buckets awaiting discards, and then clears the appropriate +bit in the alloc btree, which moves the buckets to the freespace btree. + +Additionally, discards are now enabled by default. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 118 ++++++++++++++++++++++++++++++++++++++++- + fs/bcachefs/alloc_background.h | 2 + + fs/bcachefs/alloc_foreground.h | 3 ++ + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/buckets.c | 5 ++ + fs/bcachefs/journal_io.c | 2 + + fs/bcachefs/opts.h | 2 +- + fs/bcachefs/super.c | 2 + + 8 files changed, 133 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index cdce67755d21..c1ae326bfb4c 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -484,7 +484,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, + + if (old_u.data_type && !new_u.data_type && + old_u.gen == new_u.gen && +- !bch2_bucket_is_open(c, new->k.p.inode, new->k.p.offset) && + !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { + new_u.gen++; + new_u.need_inc_gen = false; +@@ -521,6 +520,122 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, + return 0; + } + ++static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, ++ struct bch_dev *ca, bool *discard_done) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_alloc_unpacked a; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos, ++ BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ a = bch2_alloc_unpack(k); ++ ++ if (a.need_inc_gen) { ++ a.gen++; ++ a.need_inc_gen = false; ++ goto write; ++ } ++ ++ BUG_ON(a.journal_seq > c->journal.flushed_seq_ondisk); ++ ++ if (bch2_fs_inconsistent_on(!a.need_discard, c, ++ "%s\n incorrectly set in need_discard btree", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (!*discard_done && 
ca->mi.discard && !c->opts.nochanges) { ++ /* ++ * This works without any other locks because this is the only ++ * thread that removes items from the need_discard tree ++ */ ++ bch2_trans_unlock(trans); ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ k.k->p.offset * ca->mi.bucket_size, ++ ca->mi.bucket_size, ++ GFP_KERNEL, 0); ++ *discard_done = true; ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ goto out; ++ } ++ ++ a.need_discard = false; ++write: ++ ret = bch2_alloc_write(trans, &iter, &a, 0); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_do_discards_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, discard_work); ++ struct bch_dev *ca = NULL; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_need_discard, ++ POS_MIN, 0, k, ret) { ++ bool discard_done = false; ++ ++ if (ca && k.k->p.inode != ca->dev_idx) { ++ percpu_ref_put(&ca->io_ref); ++ ca = NULL; ++ } ++ ++ if (!ca) { ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ ca = NULL; ++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); ++ continue; ++ } ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ k.k->p.inode, k.k->p.offset) || ++ bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) ++ continue; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ca) ++ percpu_ref_put(&ca->io_ref); ++ ++ bch2_trans_exit(&trans); ++ percpu_ref_put(&c->writes); ++} ++ ++void bch2_do_discards(struct bch_fs *c) ++{ ++ if (percpu_ref_tryget(&c->writes) && ++ !queue_work(system_long_wq, &c->discard_work)) ++ percpu_ref_put(&c->writes); ++} ++ + static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) + { + struct btree_trans trans; +@@ -811,4 +926,5 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) + void bch2_fs_allocator_background_init(struct bch_fs *c) + { + spin_lock_init(&c->freelist_lock); ++ INIT_WORK(&c->discard_work, bch2_do_discards_work); + } +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 3c72cc37174b..bd22c67f9510 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -138,6 +138,8 @@ int bch2_alloc_read(struct bch_fs *, bool, bool); + + int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); ++void bch2_do_discards(struct bch_fs *); ++ + int bch2_fs_freespace_init(struct bch_fs *); + + void bch2_recalc_capacity(struct bch_fs *); +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index aa35801605dc..f51cec5e7cc1 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -119,6 +119,9 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 + { + bool ret; + ++ if (bch2_bucket_is_open(c, dev, bucket)) ++ return true; ++ + spin_lock(&c->freelist_lock); + ret = bch2_bucket_is_open(c, dev, bucket); + spin_unlock(&c->freelist_lock); +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index c82a9e1aab8d..ef937d637cb3 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -757,6 +757,7 @@ struct bch_fs { + 
unsigned write_points_nr; + + struct buckets_waiting_for_journal buckets_waiting_for_journal; ++ struct work_struct discard_work; + + /* GARBAGE COLLECTION */ + struct task_struct *gc_thread; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 22d8d185a414..5c97bea12854 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -538,6 +538,11 @@ int bch2_mark_alloc(struct btree_trans *trans, + (!new_u.journal_seq || new_u.journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + ++ if ((flags & BTREE_TRIGGER_INSERT) && ++ new_u.need_discard && ++ !new_u.journal_seq) ++ bch2_do_discards(c); ++ + if (bucket_state(new_u) == BUCKET_need_gc_gens) { + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 2099044c7083..bacb8058f60a 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + #include "bcachefs.h" ++#include "alloc_background.h" + #include "alloc_foreground.h" + #include "btree_io.h" + #include "btree_update_interior.h" +@@ -1373,6 +1374,7 @@ static void journal_write_done(struct closure *cl) + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; + ++ bch2_do_discards(c); + closure_wake_up(&c->freelist_wait); + } + } else if (!j->err_seq || seq < j->err_seq) +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 033115f7a6f4..70b507fb0de2 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -265,7 +265,7 @@ enum opt_type { + x(discard, u8, \ + OPT_FS|OPT_MOUNT|OPT_DEVICE, \ + OPT_BOOL(), \ +- BCH2_NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, true, \ + NULL, "Enable discard/TRIM support") \ + x(verbose, u8, \ + OPT_FS|OPT_MOUNT, \ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 5857f057497b..6464e8c08ebf 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -394,6 +394,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + ++ bch2_do_discards(c); ++ + if (!early) { + ret = bch2_fs_read_write_late(c); + if (ret) +-- +cgit v1.2.3 + + +From 24b23f6ecac7a4668eaa2f67df4ee5550249e42f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Feb 2022 18:18:41 -0500 +Subject: bcachefs: New bucket invalidate path + +In the old allocator code, preparing an existing empty bucket was part +of the same code path that invalidated buckets containing cached data. +In the new allocator code this is no longer the case: the main allocator +path finds empty buckets (via the new freespace btree), and can't +allocate buckets that contain cached data. + +We now need a separate code path to invalidate buckets containing cached +data when we're low on empty buckets, which this patch implements. When +the number of free buckets decreases that triggers the new invalidate +path to run, which uses the LRU btree to pick cached data buckets to +invalidate until we're above our watermark. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 85 ++++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/alloc_background.h | 11 ++++++ + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/buckets.c | 5 +++ + 4 files changed, 102 insertions(+) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index c1ae326bfb4c..7ed1773f5111 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -636,6 +636,90 @@ void bch2_do_discards(struct bch_fs *c) + percpu_ref_put(&c->writes); + } + ++static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter lru_iter, alloc_iter = { NULL }; ++ struct bkey_s_c k; ++ struct bkey_alloc_unpacked a; ++ u64 bucket, idx; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, ++ POS(ca->dev_idx, 0), 0); ++ k = bch2_btree_iter_peek(&lru_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ if (!k.k || k.k->p.inode != ca->dev_idx) ++ goto out; ++ ++ if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c, ++ "non lru key in lru btree")) ++ goto out; ++ ++ idx = k.k->p.offset; ++ bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, bucket), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ a = bch2_alloc_unpack(k); ++ ++ if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a), c, ++ "invalidating bucket with wrong lru idx (got %llu should be %llu", ++ idx, alloc_lru_idx(a))) ++ goto out; ++ ++ a.gen++; ++ a.need_inc_gen = false; ++ a.data_type = 0; ++ a.dirty_sectors = 0; ++ a.cached_sectors = 0; ++ a.read_time = atomic64_read(&c->io_clock[READ].now); ++ a.write_time = atomic64_read(&c->io_clock[WRITE].now); ++ ++ ret = bch2_alloc_write(trans, &alloc_iter, &a, ++ BTREE_TRIGGER_BUCKET_INVALIDATE); ++out: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ bch2_trans_iter_exit(trans, &lru_iter); ++ return ret; ++} ++ ++static void bch2_do_invalidates_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); ++ struct bch_dev *ca; ++ struct btree_trans trans; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_member_device(ca, c, i) ++ while (!ret && should_invalidate_buckets(ca)) ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ invalidate_one_bucket(&trans, ca)); ++ ++ bch2_trans_exit(&trans); ++ percpu_ref_put(&c->writes); ++} ++ ++void bch2_do_invalidates(struct bch_fs *c) ++{ ++ if (percpu_ref_tryget(&c->writes)) ++ queue_work(system_long_wq, &c->invalidate_work); ++} ++ + static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) + { + struct btree_trans trans; +@@ -927,4 +1011,5 @@ void bch2_fs_allocator_background_init(struct bch_fs *c) + { + spin_lock_init(&c->freelist_lock); + INIT_WORK(&c->discard_work, bch2_do_discards_work); ++ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); + } +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index bd22c67f9510..2a3f4835d2b0 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -140,6 +140,17 @@ int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); + void bch2_do_discards(struct bch_fs *); + ++static inline bool should_invalidate_buckets(struct bch_dev *ca) 
++{ ++ struct bch_dev_usage u = bch2_dev_usage_read(ca); ++ ++ return u.d[BCH_DATA_cached].buckets && ++ u.buckets_unavailable + u.d[BCH_DATA_cached].buckets < ++ ca->mi.nbuckets >> 7; ++} ++ ++void bch2_do_invalidates(struct bch_fs *); ++ + int bch2_fs_freespace_init(struct bch_fs *); + + void bch2_recalc_capacity(struct bch_fs *); +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index ef937d637cb3..f86a1251a82b 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -758,6 +758,7 @@ struct bch_fs { + + struct buckets_waiting_for_journal buckets_waiting_for_journal; + struct work_struct discard_work; ++ struct work_struct invalidate_work; + + /* GARBAGE COLLECTION */ + struct task_struct *gc_thread; +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 5c97bea12854..3baf1dbb9f5f 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -543,6 +543,11 @@ int bch2_mark_alloc(struct btree_trans *trans, + !new_u.journal_seq) + bch2_do_discards(c); + ++ if (!old_u.data_type && ++ new_u.data_type && ++ should_invalidate_buckets(ca)) ++ bch2_do_invalidates(c); ++ + if (bucket_state(new_u) == BUCKET_need_gc_gens) { + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); +-- +cgit v1.2.3 + + +From a91fddbd7e0caeab4f30c841435d1b8f2ca4a5f3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 17 Feb 2022 03:11:39 -0500 +Subject: bcachefs: Fsck for need_discard & freespace btrees + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 260 +++++++++++++++++++++++++++++++++++++++++ + fs/bcachefs/alloc_background.h | 1 + + fs/bcachefs/lru.c | 84 +++++++++++++ + fs/bcachefs/lru.h | 2 + + fs/bcachefs/recovery.c | 17 ++- + 5 files changed, 363 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 7ed1773f5111..b344705572cc 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -520,6 +520,266 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, + return 0; + } + ++static int bch2_check_alloc_key(struct btree_trans *trans, ++ struct btree_iter *alloc_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter discard_iter, freespace_iter, lru_iter; ++ struct bkey_alloc_unpacked a; ++ unsigned discard_key_type, freespace_key_type; ++ struct bkey_s_c alloc_k, k; ++ struct printbuf buf = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ int ret; ++ ++ alloc_k = bch2_btree_iter_peek(alloc_iter); ++ if (!alloc_k.k) ++ return 0; ++ ++ ret = bkey_err(alloc_k); ++ if (ret) ++ return ret; ++ ++ a = bch2_alloc_unpack(alloc_k); ++ discard_key_type = bucket_state(a) == BUCKET_need_discard ++ ? KEY_TYPE_set : 0; ++ freespace_key_type = bucket_state(a) == BUCKET_free ++ ? 
KEY_TYPE_set : 0; ++ ++ bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, ++ alloc_k.k->p, 0); ++ bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, ++ alloc_freespace_pos(a), 0); ++ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, ++ POS(a.dev, a.read_time), 0); ++ ++ k = bch2_btree_iter_peek_slot(&discard_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(k.k->type != discard_key_type, c, ++ "incorrect key in need_discard btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[discard_key_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.type = discard_key_type; ++ update->k.p = discard_iter.pos; ++ ++ ret = bch2_trans_update(trans, &discard_iter, update, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto err; ++ } ++ ++ k = bch2_btree_iter_peek_slot(&freespace_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(k.k->type != freespace_key_type, c, ++ "incorrect key in freespace btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[freespace_key_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.type = freespace_key_type; ++ update->k.p = freespace_iter.pos; ++ bch2_key_resize(&update->k, 1); ++ ++ ret = bch2_trans_update(trans, &freespace_iter, update, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto err; ++ } ++ ++ if (bucket_state(a) == BUCKET_cached) { ++ if (fsck_err_on(!a.read_time, c, ++ "cached bucket with read_time 0\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ ++ a.read_time = atomic64_read(&c->io_clock[READ].now); ++ ++ ret = bch2_lru_change(trans, a.dev, a.bucket, ++ 0, &a.read_time) ?: ++ bch2_alloc_write(trans, alloc_iter, &a, BTREE_TRIGGER_NORUN); ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto err; ++ } ++ ++ k = bch2_btree_iter_peek_slot(&lru_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(k.k->type != KEY_TYPE_lru || ++ le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != a.bucket, c, ++ "incorrect/missing lru entry\n" ++ " %s\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ u64 read_time = a.read_time; ++ ++ ret = bch2_lru_change(trans, a.dev, a.bucket, ++ 0, &a.read_time) ?: ++ (a.read_time != read_time ++ ? 
bch2_alloc_write(trans, alloc_iter, &a, 0) ++ : 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto err; ++ } ++ } ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &lru_iter); ++ bch2_trans_iter_exit(trans, &freespace_iter); ++ bch2_trans_iter_exit(trans, &discard_iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) ++{ ++ struct bch_dev *ca; ++ ++ if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) ++ return false; ++ ++ ca = bch_dev_bkey_exists(c, pos.inode); ++ return pos.offset >= ca->mi.first_bucket && ++ pos.offset < ca->mi.nbuckets; ++} ++ ++static int bch2_check_freespace_key(struct btree_trans *trans, ++ struct btree_iter *freespace_iter, ++ bool initial) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter; ++ struct bkey_s_c k, freespace_k; ++ struct bkey_alloc_unpacked a; ++ u64 genbits; ++ struct bpos pos; ++ struct bkey_i *update; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ freespace_k = bch2_btree_iter_peek(freespace_iter); ++ if (!freespace_k.k) ++ return 1; ++ ++ ret = bkey_err(freespace_k); ++ if (ret) ++ return ret; ++ ++ pos = freespace_iter->pos; ++ pos.offset &= ~(~0ULL << 56); ++ genbits = freespace_iter->pos.offset & (~0ULL << 56); ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); ++ ++ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, ++ "%llu:%llu set in freespace btree but device or bucket does not exist", ++ pos.inode, pos.offset)) ++ goto delete; ++ ++ k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ a = bch2_alloc_unpack(k); ++ ++ if (fsck_err_on(bucket_state(a) != BUCKET_free || ++ genbits != alloc_freespace_genbits(a), c, ++ "%s\n incorrectly set in freespace index (free %u, genbits %llu should be %llu)", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ bucket_state(a) == BUCKET_free, ++ genbits >> 56, alloc_freespace_genbits(a) >> 56)) ++ goto delete; ++out: ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++delete: ++ update = bch2_trans_kmalloc(trans, sizeof(*update)); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = freespace_iter->pos; ++ bch2_key_resize(&update->k, 1); ++ ++ ret = bch2_trans_update(trans, freespace_iter, update, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ goto out; ++} ++ ++int bch2_check_alloc_info(struct bch_fs *c, bool initial) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_check_alloc_key(&trans, &iter)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ while (1) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_check_freespace_key(&trans, &iter, initial)); ++ if (ret) ++ break; ++ ++ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ + static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + struct bch_dev *ca, bool *discard_done) + { +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 2a3f4835d2b0..b2e7847c99fb 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -138,6 +138,7 @@ int bch2_alloc_read(struct bch_fs *, bool, bool); + + int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); ++int bch2_check_alloc_info(struct bch_fs *, bool); + void bch2_do_discards(struct bch_fs *); + + static inline bool should_invalidate_buckets(struct bch_dev *ca) +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +index 2ababca5efe5..1772ccb2b560 100644 +--- a/fs/bcachefs/lru.c ++++ b/fs/bcachefs/lru.c +@@ -1,10 +1,12 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" ++#include "alloc_background.h" + #include "btree_iter.h" + #include "btree_update.h" + #include "error.h" + #include "lru.h" ++#include "recovery.h" + + const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k) + { +@@ -117,3 +119,85 @@ int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, + return lru_delete(trans, id, idx, old_time) ?: + lru_set(trans, id, idx, new_time); + } ++ ++static int bch2_check_lru_key(struct btree_trans *trans, ++ struct btree_iter *lru_iter, bool initial) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c lru_k, k; ++ struct bkey_alloc_unpacked a; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ u64 idx; ++ int ret; ++ ++ lru_k = bch2_btree_iter_peek(lru_iter); ++ if (!lru_k.k) ++ return 0; ++ ++ ret = bkey_err(lru_k); ++ if (ret) ++ return ret; ++ ++ idx = le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ POS(lru_k.k->p.inode, idx), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ a = bch2_alloc_unpack(k); ++ ++ if (fsck_err_on(bucket_state(a) != BUCKET_cached || ++ a.read_time != lru_k.k->p.offset, c, ++ "incorrect lru entry %s\n" ++ " for %s", ++ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = lru_iter->pos; ++ ++ ret = bch2_trans_update(trans, lru_iter, update, 0); ++ if (ret) ++ goto err; ++ } ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++int bch2_check_lrus(struct bch_fs *c, bool initial) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_check_lru_key(&trans, &iter, initial)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++ ++} +diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h +index c3121cfee285..4db6a8399332 100644 +--- a/fs/bcachefs/lru.h ++++ b/fs/bcachefs/lru.h +@@ -12,4 +12,6 @@ void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *); + ++int bch2_check_lrus(struct bch_fs *, 
bool); ++ + #endif /* _BCACHEFS_LRU_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index b7e735d7774f..e66e9506565a 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -16,6 +16,7 @@ + #include "journal_io.h" + #include "journal_reclaim.h" + #include "journal_seq_blacklist.h" ++#include "lru.h" + #include "move.h" + #include "quota.h" + #include "recovery.h" +@@ -1165,13 +1166,27 @@ use_clean: + bool metadata_only = c->opts.norecovery; + + bch_info(c, "checking allocations"); +- err = "error in mark and sweep"; ++ err = "error checking allocations"; + ret = bch2_gc(c, true, metadata_only); + if (ret) + goto err; + bch_verbose(c, "done checking allocations"); + } + ++ if (c->opts.fsck && ++ c->sb.version >= bcachefs_metadata_version_freespace) { ++ bch_info(c, "checking need_discard and freespace btrees"); ++ err = "error checking need_discard and freespace btrees"; ++ ret = bch2_check_alloc_info(c, true); ++ if (ret) ++ goto err; ++ ++ ret = bch2_check_lrus(c, true); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking need_discard and freespace btrees"); ++ } ++ + bch2_stripes_heap_start(c); + + clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); +-- +cgit v1.2.3 + + +From 8281aed5ec5beb9eca095d839fb49c8f72d9499d Mon Sep 17 00:00:00 2001 +From: Daniel Hill +Date: Wed, 23 Feb 2022 01:25:03 +1300 +Subject: bcachefs: Fix LRU repair code + +Don't run triggers when repairing incorrect/missing lru entries Triggers +create a conflicting call to lru_change() with the incorrect lru ptr, +lru_change attempts to delete this incorrect lru entry, and fails +because the back ptr doesn't match the original bucket causing fsck to +error. + +Signed-off-by: Daniel Hill +--- + fs/bcachefs/alloc_background.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index b344705572cc..da7c665773d2 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -646,7 +646,7 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + ret = bch2_lru_change(trans, a.dev, a.bucket, + 0, &a.read_time) ?: + (a.read_time != read_time +- ? bch2_alloc_write(trans, alloc_iter, &a, 0) ++ ? bch2_alloc_write(trans, alloc_iter, &a, BTREE_TRIGGER_NORUN) + : 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) +-- +cgit v1.2.3 + + +From 42fb3525431f572cc8f0fe2a782ddf3a74f0daab Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Feb 2022 19:09:40 -0500 +Subject: bcachefs: bch2_dev_usage_update() no longer depends on bucket_mark + +This is one of the last steps in getting rid of the main in-memory +bucket array. + +This changes bch2_dev_usage_update() to take bkey_alloc_unpacked instead +of bucket_mark, and for the places where we are in fact working with +bucket_mark and don't have bkey_alloc_unpacked, we add a wrapper that +takes bucket_mark and converts to bkey_alloc_unpacked. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 51 +++++++++++++++++++++++++++++++++++++-------------- + fs/bcachefs/buckets.h | 7 ------- + 2 files changed, 37 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 3baf1dbb9f5f..f779f366e6a2 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -279,24 +279,24 @@ bch2_fs_usage_read_short(struct bch_fs *c) + return ret; + } + +-static inline int is_unavailable_bucket(struct bucket_mark m) ++static inline int is_unavailable_bucket(struct bkey_alloc_unpacked a) + { +- return !is_available_bucket(m); ++ return a.dirty_sectors || a.stripe; + } + + static inline int bucket_sectors_fragmented(struct bch_dev *ca, +- struct bucket_mark m) ++ struct bkey_alloc_unpacked a) + { +- return m.dirty_sectors +- ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors) ++ return a.dirty_sectors ++ ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) + : 0; + } + +-static inline enum bch_data_type bucket_type(struct bucket_mark m) ++static inline enum bch_data_type bucket_type(struct bkey_alloc_unpacked a) + { +- return m.cached_sectors && !m.dirty_sectors ++ return a.cached_sectors && !a.dirty_sectors + ? BCH_DATA_cached +- : m.data_type; ++ : a.data_type; + } + + static inline void account_bucket(struct bch_fs_usage *fs_usage, +@@ -311,7 +311,8 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, + } + + static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, +- struct bucket_mark old, struct bucket_mark new, ++ struct bkey_alloc_unpacked old, ++ struct bkey_alloc_unpacked new, + u64 journal_seq, bool gc) + { + struct bch_fs_usage *fs_usage; +@@ -344,6 +345,28 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + preempt_enable(); + } + ++static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, ++ struct bucket_mark old, struct bucket_mark new, ++ u64 journal_seq, bool gc) ++{ ++ struct bkey_alloc_unpacked old_a = { ++ .gen = old.gen, ++ .data_type = old.data_type, ++ .dirty_sectors = old.dirty_sectors, ++ .cached_sectors = old.cached_sectors, ++ .stripe = old.stripe, ++ }; ++ struct bkey_alloc_unpacked new_a = { ++ .gen = new.gen, ++ .data_type = new.data_type, ++ .dirty_sectors = new.dirty_sectors, ++ .cached_sectors = new.cached_sectors, ++ .stripe = new.stripe, ++ }; ++ ++ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); ++} ++ + static inline int __update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, +@@ -557,6 +580,8 @@ int bch2_mark_alloc(struct btree_trans *trans, + if (!gc && new_u.gen != old_u.gen) + *bucket_gen(ca, new_u.bucket) = new_u.gen; + ++ bch2_dev_usage_update(c, ca, old_u, new_u, journal_seq, gc); ++ + g = __bucket(ca, new_u.bucket, gc); + + old_m = bucket_cmpxchg(g, m, ({ +@@ -567,8 +592,6 @@ int bch2_mark_alloc(struct btree_trans *trans, + m.stripe = new_u.stripe != 0; + })); + +- bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); +- + g->io_time[READ] = new_u.read_time; + g->io_time[WRITE] = new_u.write_time; + g->gen_valid = 1; +@@ -646,7 +669,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + bch2_data_types[old.data_type ?: data_type], + old.dirty_sectors, sectors); + +- bch2_dev_usage_update(c, ca, old, new, 0, true); ++ bch2_dev_usage_update_m(c, ca, old, new, 0, true); + percpu_up_read(&c->mark_lock); + } + +@@ -805,7 +828,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, + 
g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + +- bch2_dev_usage_update(c, ca, old, new, journal_seq, true); ++ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + err: + percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); +@@ -878,7 +901,7 @@ static int bch2_mark_pointer(struct btree_trans *trans, + old.v.counter, + new.v.counter)) != old.v.counter); + +- bch2_dev_usage_update(c, ca, old, new, journal_seq, true); ++ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + err: + percpu_up_read(&c->mark_lock); + +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index a05d8adc8372..233fbdf803db 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -126,13 +126,6 @@ static inline u8 ptr_stale(struct bch_dev *ca, + return ret; + } + +-/* bucket gc marks */ +- +-static inline bool is_available_bucket(struct bucket_mark mark) +-{ +- return !mark.dirty_sectors && !mark.stripe; +-} +- + /* Device usage: */ + + struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); +-- +cgit v1.2.3 + + +From 426f33cb342b1dba162e39db928634eb1ba390ba Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 10 Feb 2022 19:26:55 -0500 +Subject: bcachefs: Kill main in-memory bucket array + +All code using the in-memory bucket array, excluding GC, has now been +converted to use the alloc btree directly - so we can finally delete it. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 28 ++------------- + fs/bcachefs/alloc_background.h | 2 +- + fs/bcachefs/bcachefs.h | 2 +- + fs/bcachefs/btree_gc.c | 51 ++++++++++++++++++++++----- + fs/bcachefs/buckets.c | 80 +++++++++++++----------------------------- + fs/bcachefs/buckets.h | 20 +++-------- + fs/bcachefs/buckets_types.h | 1 - + fs/bcachefs/recovery.c | 2 +- + 8 files changed, 76 insertions(+), 110 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index da7c665773d2..0c33424393be 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -340,14 +340,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + #undef x + } + +-int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) ++int bch2_alloc_read(struct bch_fs *c) + { + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; +- struct bucket *g; +- struct bkey_alloc_unpacked u; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +@@ -355,30 +353,8 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); +- g = __bucket(ca, k.k->p.offset, gc); +- u = bch2_alloc_unpack(k); +- +- if (!gc) +- *bucket_gen(ca, k.k->p.offset) = u.gen; +- +- g->_mark.gen = u.gen; +- g->io_time[READ] = u.read_time; +- g->io_time[WRITE] = u.write_time; +- g->gen_valid = 1; +- +- if (!gc || +- (metadata_only && +- (u.data_type == BCH_DATA_user || +- u.data_type == BCH_DATA_cached || +- u.data_type == BCH_DATA_parity))) { +- g->_mark.data_type = u.data_type; +- g->_mark.dirty_sectors = u.dirty_sectors; +- g->_mark.cached_sectors = u.cached_sectors; +- g->_mark.stripe = u.stripe != 0; +- g->stripe = u.stripe; +- g->stripe_redundancy = u.stripe_redundancy; +- } + ++ *bucket_gen(ca, k.k->p.offset) = bch2_alloc_unpack(k).gen; + } + bch2_trans_iter_exit(&trans, &iter); + +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 
b2e7847c99fb..06539e036f13 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -134,7 +134,7 @@ static inline bool bkey_is_alloc(const struct bkey *k) + k->type == KEY_TYPE_alloc_v3; + } + +-int bch2_alloc_read(struct bch_fs *, bool, bool); ++int bch2_alloc_read(struct bch_fs *); + + int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index f86a1251a82b..2d185d22d78e 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -450,7 +450,7 @@ struct bch_dev { + * gc_lock, for device resize - holding any is sufficient for access: + * Or rcu_read_lock(), but only for ptr_stale(): + */ +- struct bucket_array __rcu *buckets[2]; ++ struct bucket_array __rcu *buckets_gc; + struct bucket_gens __rcu *bucket_gens; + u8 *oldest_gen; + unsigned long *buckets_nouse; +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 0bab695bcb41..952051b07e21 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1163,10 +1163,10 @@ static void bch2_gc_free(struct bch_fs *c) + genradix_free(&c->gc_stripes); + + for_each_member_device(ca, c, i) { +- kvpfree(rcu_dereference_protected(ca->buckets[1], 1), ++ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); +- ca->buckets[1] = NULL; ++ ca->buckets_gc = NULL; + + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; +@@ -1295,7 +1295,7 @@ static int bch2_gc_start(struct bch_fs *c, + } + + for_each_member_device(ca, c, i) { +- BUG_ON(ca->buckets[1]); ++ BUG_ON(ca->buckets_gc); + BUG_ON(ca->usage_gc); + + ca->usage_gc = alloc_percpu(struct bch_dev_usage); +@@ -1337,8 +1337,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + .data_type = g->mark.data_type, + .dirty_sectors = g->mark.dirty_sectors, + .cached_sectors = g->mark.cached_sectors, +- .read_time = g->io_time[READ], +- .write_time = g->io_time[WRITE], + .stripe = g->stripe, + .stripe_redundancy = g->stripe_redundancy, + }; +@@ -1426,7 +1424,13 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) + static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) + { + struct bch_dev *ca; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bucket *g; ++ struct bkey_alloc_unpacked u; + unsigned i; ++ int ret; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + +@@ -1434,17 +1438,46 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) + GFP_KERNEL|__GFP_ZERO); + if (!buckets) { + percpu_ref_put(&ca->ref); +- percpu_up_write(&c->mark_lock); + bch_err(c, "error allocating ca->buckets[gc]"); + return -ENOMEM; + } + + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = ca->mi.nbuckets; +- rcu_assign_pointer(ca->buckets[1], buckets); ++ rcu_assign_pointer(ca->buckets_gc, buckets); + }; + +- return bch2_alloc_read(c, true, metadata_only); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ g = gc_bucket(ca, k.k->p.offset); ++ u = bch2_alloc_unpack(k); ++ ++ g->_mark.gen = u.gen; ++ g->gen_valid = 1; ++ ++ if (metadata_only && ++ (u.data_type == BCH_DATA_user || ++ u.data_type == BCH_DATA_cached || ++ u.data_type == BCH_DATA_parity)) { ++ g->_mark.data_type = u.data_type; ++ 
g->_mark.dirty_sectors = u.dirty_sectors; ++ g->_mark.cached_sectors = u.cached_sectors; ++ g->_mark.stripe = u.stripe != 0; ++ g->stripe = u.stripe; ++ g->stripe_redundancy = u.stripe_redundancy; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error reading alloc info at gc start: %i", ret); ++ ++ return ret; + } + + static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) +@@ -1453,7 +1486,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) + unsigned i; + + for_each_member_device(ca, c, i) { +- struct bucket_array *buckets = __bucket_array(ca, true); ++ struct bucket_array *buckets = gc_bucket_array(ca); + struct bucket *g; + + for_each_bucket(g, buckets) { +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index f779f366e6a2..8ef732656f97 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -509,8 +509,6 @@ int bch2_mark_alloc(struct btree_trans *trans, + struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); + struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); + struct bch_dev *ca = bch_dev_bkey_exists(c, new_u.dev); +- struct bucket *g; +- struct bucket_mark old_m, m; + int ret = 0; + + if (bch2_trans_inconsistent_on(new_u.bucket < ca->mi.first_bucket || +@@ -582,21 +580,22 @@ int bch2_mark_alloc(struct btree_trans *trans, + + bch2_dev_usage_update(c, ca, old_u, new_u, journal_seq, gc); + +- g = __bucket(ca, new_u.bucket, gc); +- +- old_m = bucket_cmpxchg(g, m, ({ +- m.gen = new_u.gen; +- m.data_type = new_u.data_type; +- m.dirty_sectors = new_u.dirty_sectors; +- m.cached_sectors = new_u.cached_sectors; +- m.stripe = new_u.stripe != 0; +- })); +- +- g->io_time[READ] = new_u.read_time; +- g->io_time[WRITE] = new_u.write_time; +- g->gen_valid = 1; +- g->stripe = new_u.stripe; +- g->stripe_redundancy = new_u.stripe_redundancy; ++ if (gc) { ++ struct bucket_mark old_m, m; ++ struct bucket *g = gc_bucket(ca, new_u.bucket); ++ ++ old_m = bucket_cmpxchg(g, m, ({ ++ m.gen = new_u.gen; ++ m.data_type = new_u.data_type; ++ m.dirty_sectors = new_u.dirty_sectors; ++ m.cached_sectors = new_u.cached_sectors; ++ m.stripe = new_u.stripe != 0; ++ })); ++ ++ g->gen_valid = 1; ++ g->stripe = new_u.stripe; ++ g->stripe_redundancy = new_u.stripe_redundancy; ++ } + percpu_up_read(&c->mark_lock); + + /* +@@ -605,9 +604,9 @@ int bch2_mark_alloc(struct btree_trans *trans, + */ + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && +- old_m.cached_sectors) { ++ old_u.cached_sectors) { + ret = update_cached_sectors(c, new, ca->dev_idx, +- -old_m.cached_sectors, ++ -old_u.cached_sectors, + journal_seq, gc); + if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); +@@ -615,7 +614,7 @@ int bch2_mark_alloc(struct btree_trans *trans, + } + + trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket), +- old_m.cached_sectors); ++ old_u.cached_sectors); + } + + return 0; +@@ -2061,16 +2060,6 @@ recalculate: + + /* Startup/shutdown: */ + +-static void buckets_free_rcu(struct rcu_head *rcu) +-{ +- struct bucket_array *buckets = +- container_of(rcu, struct bucket_array, rcu); +- +- kvpfree(buckets, +- sizeof(*buckets) + +- buckets->nbuckets * sizeof(struct bucket)); +-} +- + static void bucket_gens_free_rcu(struct rcu_head *rcu) + { + struct bucket_gens *buckets = +@@ -2081,16 +2070,12 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) + + int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + { +- struct 
bucket_array *buckets = NULL, *old_buckets = NULL; + struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; + unsigned long *buckets_nouse = NULL; +- bool resize = ca->buckets[0] != NULL; ++ bool resize = ca->bucket_gens != NULL; + int ret = -ENOMEM; + +- if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + +- nbuckets * sizeof(struct bucket), +- GFP_KERNEL|__GFP_ZERO)) || +- !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, ++ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO)) || + (c->opts.buckets_nouse && + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * +@@ -2098,8 +2083,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + GFP_KERNEL|__GFP_ZERO)))) + goto err; + +- buckets->first_bucket = ca->mi.first_bucket; +- buckets->nbuckets = nbuckets; + bucket_gens->first_bucket = ca->mi.first_bucket; + bucket_gens->nbuckets = nbuckets; + +@@ -2111,15 +2094,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + percpu_down_write(&c->mark_lock); + } + +- old_buckets = bucket_array(ca); + old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); + + if (resize) { +- size_t n = min(buckets->nbuckets, old_buckets->nbuckets); ++ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); + +- memcpy(buckets->b, +- old_buckets->b, +- n * sizeof(struct bucket)); + memcpy(bucket_gens->b, + old_bucket_gens->b, + n); +@@ -2129,31 +2108,25 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + BITS_TO_LONGS(n) * sizeof(unsigned long)); + } + +- rcu_assign_pointer(ca->buckets[0], buckets); + rcu_assign_pointer(ca->bucket_gens, bucket_gens); +- buckets = old_buckets; + bucket_gens = old_bucket_gens; + + swap(ca->buckets_nouse, buckets_nouse); + ++ nbuckets = ca->mi.nbuckets; ++ + if (resize) { + percpu_up_write(&c->mark_lock); ++ up_write(&ca->bucket_lock); + up_write(&c->gc_lock); + } + +- nbuckets = ca->mi.nbuckets; +- +- if (resize) +- up_write(&ca->bucket_lock); +- + ret = 0; + err: + kvpfree(buckets_nouse, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + if (bucket_gens) + call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); +- if (buckets) +- call_rcu(&buckets->rcu, buckets_free_rcu); + + return ret; + } +@@ -2166,9 +2139,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), + sizeof(struct bucket_gens) + ca->mi.nbuckets); +- kvpfree(rcu_dereference_protected(ca->buckets[0], 1), +- sizeof(struct bucket_array) + +- ca->mi.nbuckets * sizeof(struct bucket)); + + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + free_percpu(ca->usage[i]); +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 233fbdf803db..7f7fdd024868 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -30,34 +30,23 @@ + _old; \ + }) + +-static inline struct bucket_array *__bucket_array(struct bch_dev *ca, +- bool gc) ++static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) + { +- return rcu_dereference_check(ca->buckets[gc], ++ return rcu_dereference_check(ca->buckets_gc, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); + } + +-static inline struct bucket_array *bucket_array(struct bch_dev *ca) +-{ +- return __bucket_array(ca, false); +-} +- +-static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, 
bool gc) ++static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) + { +- struct bucket_array *buckets = __bucket_array(ca, gc); ++ struct bucket_array *buckets = gc_bucket_array(ca); + + BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + return buckets->b + b; + } + +-static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) +-{ +- return __bucket(ca, b, true); +-} +- + static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) + { + return rcu_dereference_check(ca->bucket_gens, +@@ -65,7 +54,6 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); +- + } + + static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 2280aee59964..f90b130fbb73 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -27,7 +27,6 @@ struct bucket { + const struct bucket_mark mark; + }; + +- u64 io_time[2]; + unsigned gen_valid:1; + u8 stripe_redundancy; + u32 stripe; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index e66e9506565a..fe2c5cb6d430 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1138,7 +1138,7 @@ use_clean: + err = "error reading allocation information"; + + down_read(&c->gc_lock); +- ret = bch2_alloc_read(c, false, false); ++ ret = bch2_alloc_read(c); + up_read(&c->gc_lock); + + if (ret) +-- +cgit v1.2.3 + + +From ada96472a3e99e3da084b01877c4aaa554d5f744 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Feb 2022 00:07:38 -0500 +Subject: bcachefs: Kill struct bucket_mark + +This switches struct bucket to using a lock, instead of cmpxchg. And now +that the protected members no longer need to fit into a u64, we can +expand the sector counts to 32 bits. 
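The new per-bucket lock is just a one-byte test-and-set spinlock; a minimal standalone sketch of the idea using C11 atomics (the patch itself uses the kernel's xchg() and smp_store_release(); names here are illustrative):

#include <stdatomic.h>
#include <stdint.h>

struct example_bucket {
	atomic_uchar lock;          /* protects the fields below */
	uint8_t      gen;
	uint32_t     dirty_sectors; /* no longer limited to 16 bits */
	uint32_t     cached_sectors;
};

static void example_bucket_lock(struct example_bucket *b)
{
	/* spin until we are the thread that flips 0 -> 1 */
	while (atomic_exchange_explicit(&b->lock, 1, memory_order_acquire))
		;
}

static void example_bucket_unlock(struct example_bucket *b)
{
	/* release store, analogous to smp_store_release() in the patch */
	atomic_store_explicit(&b->lock, 0, memory_order_release);
}

static void example_add_dirty(struct example_bucket *b, uint32_t sectors)
{
	example_bucket_lock(b);
	b->dirty_sectors += sectors;   /* plain update, no cmpxchg loop */
	example_bucket_unlock(b);
}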
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 4 +- + fs/bcachefs/btree_gc.c | 99 +++++++++++++++------------------ + fs/bcachefs/buckets.c | 126 ++++++++++++++++++++---------------------- + fs/bcachefs/buckets.h | 24 ++++---- + fs/bcachefs/buckets_types.h | 32 +++-------- + fs/bcachefs/movinggc.c | 2 +- + 6 files changed, 125 insertions(+), 162 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 5ece1492d76a..bb54ac175b69 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -883,8 +883,8 @@ struct bch_alloc_v2 { + #define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ +- x(dirty_sectors, 16) \ +- x(cached_sectors, 16) \ ++ x(dirty_sectors, 32) \ ++ x(cached_sectors, 32) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 952051b07e21..5c54a0ca681c 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -571,37 +571,37 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!p.ptr.cached) { +- g->_mark.gen = p.ptr.gen; + g->gen_valid = true; ++ g->gen = p.ptr.gen; + } else { + do_update = true; + } + } + +- if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, ++ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], +- p.ptr.gen, g->mark.gen, ++ p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!p.ptr.cached) { +- g->_mark.gen = p.ptr.gen; + g->gen_valid = true; +- g->_mark.data_type = 0; +- g->_mark.dirty_sectors = 0; +- g->_mark.cached_sectors = 0; ++ g->gen = p.ptr.gen; ++ g->data_type = 0; ++ g->dirty_sectors = 0; ++ g->cached_sectors = 0; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + do_update = true; + } + } + +- if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, ++ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", +- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), +@@ -609,30 +609,30 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + do_update = true; + + if (fsck_err_on(!p.ptr.cached && +- gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, ++ gen_cmp(p.ptr.gen, g->gen) < 0, c, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], +- p.ptr.gen, g->mark.gen, ++ p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + +- if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen) ++ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + continue; + +- if (fsck_err_on(g->mark.data_type && +- g->mark.data_type != data_type, c, ++ if (fsck_err_on(g->data_type && ++ g->data_type != data_type, c, + "bucket %u:%zu different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), +- bch2_data_types[g->mark.data_type], ++ bch2_data_types[g->data_type], + 
bch2_data_types[data_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (data_type == BCH_DATA_btree) { +- g->_mark.data_type = data_type; ++ g->data_type = data_type; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + do_update = true; +@@ -692,7 +692,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); + +- ptr->gen = g->mark.gen; ++ ptr->gen = g->gen; + } + } else { + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ +@@ -701,12 +701,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); + + (ptr->cached && +- (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || ++ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || + (!ptr->cached && +- gen_cmp(ptr->gen, g->mark.gen) < 0) || +- gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX || +- (g->mark.data_type && +- g->mark.data_type != data_type); ++ gen_cmp(ptr->gen, g->gen) < 0) || ++ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || ++ (g->data_type && ++ g->data_type != data_type); + })); + again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); +@@ -1315,9 +1315,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); +- struct bucket *g; ++ struct bucket gc; + struct bkey_s_c k; +- struct bkey_alloc_unpacked old_u, new_u, gc_u; ++ struct bkey_alloc_unpacked old_u, new_u; + struct bkey_alloc_buf *a; + int ret; + +@@ -1329,37 +1329,27 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + old_u = new_u = bch2_alloc_unpack(k); + + percpu_down_read(&c->mark_lock); +- g = gc_bucket(ca, iter->pos.offset); +- gc_u = (struct bkey_alloc_unpacked) { +- .dev = iter->pos.inode, +- .bucket = iter->pos.offset, +- .gen = g->mark.gen, +- .data_type = g->mark.data_type, +- .dirty_sectors = g->mark.dirty_sectors, +- .cached_sectors = g->mark.cached_sectors, +- .stripe = g->stripe, +- .stripe_redundancy = g->stripe_redundancy, +- }; ++ gc = *gc_bucket(ca, iter->pos.offset); + percpu_up_read(&c->mark_lock); + + if (metadata_only && +- gc_u.data_type != BCH_DATA_sb && +- gc_u.data_type != BCH_DATA_journal && +- gc_u.data_type != BCH_DATA_btree) ++ gc.data_type != BCH_DATA_sb && ++ gc.data_type != BCH_DATA_journal && ++ gc.data_type != BCH_DATA_btree) + return 0; + +- if (gen_after(old_u.gen, gc_u.gen)) ++ if (gen_after(old_u.gen, gc.gen)) + return 0; + + #define copy_bucket_field(_f) \ +- if (fsck_err_on(new_u._f != gc_u._f, c, \ ++ if (fsck_err_on(new_u._f != gc._f, c, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ +- new_u.gen, \ +- bch2_data_types[new_u.data_type], \ +- new_u._f, gc_u._f)) \ +- new_u._f = gc_u._f; \ ++ gc.gen, \ ++ bch2_data_types[gc.data_type], \ ++ new_u._f, gc._f)) \ ++ new_u._f = gc._f; \ + + copy_bucket_field(gen); + copy_bucket_field(data_type); +@@ -1455,17 +1445,16 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) + g = gc_bucket(ca, k.k->p.offset); + u = bch2_alloc_unpack(k); + +- g->_mark.gen = u.gen; +- g->gen_valid = 1; ++ g->gen_valid = 1; ++ g->gen = u.gen; + + if (metadata_only && + (u.data_type == BCH_DATA_user || + u.data_type == BCH_DATA_cached || + u.data_type == BCH_DATA_parity)) { +- g->_mark.data_type = u.data_type; +- g->_mark.dirty_sectors = 
u.dirty_sectors; +- g->_mark.cached_sectors = u.cached_sectors; +- g->_mark.stripe = u.stripe != 0; ++ g->data_type = u.data_type; ++ g->dirty_sectors = u.dirty_sectors; ++ g->cached_sectors = u.cached_sectors; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; + } +@@ -1491,12 +1480,12 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) + + for_each_bucket(g, buckets) { + if (metadata_only && +- (g->mark.data_type == BCH_DATA_user || +- g->mark.data_type == BCH_DATA_cached || +- g->mark.data_type == BCH_DATA_parity)) ++ (g->data_type == BCH_DATA_user || ++ g->data_type == BCH_DATA_cached || ++ g->data_type == BCH_DATA_parity)) + continue; +- g->_mark.dirty_sectors = 0; +- g->_mark.cached_sectors = 0; ++ g->dirty_sectors = 0; ++ g->cached_sectors = 0; + } + }; + } +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 8ef732656f97..fbce6cdf4cf8 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -346,7 +346,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + } + + static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, +- struct bucket_mark old, struct bucket_mark new, ++ struct bucket old, struct bucket new, + u64 journal_seq, bool gc) + { + struct bkey_alloc_unpacked old_a = { +@@ -581,20 +581,19 @@ int bch2_mark_alloc(struct btree_trans *trans, + bch2_dev_usage_update(c, ca, old_u, new_u, journal_seq, gc); + + if (gc) { +- struct bucket_mark old_m, m; + struct bucket *g = gc_bucket(ca, new_u.bucket); + +- old_m = bucket_cmpxchg(g, m, ({ +- m.gen = new_u.gen; +- m.data_type = new_u.data_type; +- m.dirty_sectors = new_u.dirty_sectors; +- m.cached_sectors = new_u.cached_sectors; +- m.stripe = new_u.stripe != 0; +- })); ++ bucket_lock(g); + + g->gen_valid = 1; ++ g->gen = new_u.gen; ++ g->data_type = new_u.data_type; + g->stripe = new_u.stripe; + g->stripe_redundancy = new_u.stripe_redundancy; ++ g->dirty_sectors = new_u.dirty_sectors; ++ g->cached_sectors = new_u.cached_sectors; ++ ++ bucket_unlock(g); + } + percpu_up_read(&c->mark_lock); + +@@ -620,23 +619,12 @@ int bch2_mark_alloc(struct btree_trans *trans, + return 0; + } + +-#define checked_add(a, b) \ +-({ \ +- unsigned _res = (unsigned) (a) + (b); \ +- bool overflow = _res > U16_MAX; \ +- if (overflow) \ +- _res = U16_MAX; \ +- (a) = _res; \ +- overflow; \ +-}) +- + void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type data_type, + unsigned sectors, struct gc_pos pos, + unsigned flags) + { +- struct bucket *g; +- struct bucket_mark old, new; ++ struct bucket old, new, *g; + bool overflow; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); +@@ -651,10 +639,16 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + + percpu_down_read(&c->mark_lock); + g = gc_bucket(ca, b); +- old = bucket_cmpxchg(g, new, ({ +- new.data_type = data_type; +- overflow = checked_add(new.dirty_sectors, sectors); +- })); ++ ++ bucket_lock(g); ++ old = *g; ++ ++ g->data_type = data_type; ++ g->dirty_sectors += sectors; ++ overflow = g->dirty_sectors < sectors; ++ ++ new = *g; ++ bucket_unlock(g); + + bch2_fs_inconsistent_on(old.data_type && + old.data_type != data_type, c, +@@ -688,7 +682,7 @@ static int check_bucket_ref(struct bch_fs *c, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 b_gen, u8 bucket_data_type, +- u16 dirty_sectors, u16 cached_sectors) ++ u32 dirty_sectors, u32 cached_sectors) + { + struct bch_dev *ca = bch_dev_bkey_exists(c, 
ptr->dev); + size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); +@@ -756,7 +750,7 @@ static int check_bucket_ref(struct bch_fs *c, + goto err; + } + +- if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { ++ if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" + "while marking %s", +@@ -787,8 +781,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, + s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bucket *g; +- struct bucket_mark new, old; ++ struct bucket old, new, *g; + struct printbuf buf = PRINTBUF; + int ret = 0; + +@@ -800,33 +793,37 @@ static int mark_stripe_bucket(struct btree_trans *trans, + buf.atomic++; + g = PTR_GC_BUCKET(ca, ptr); + +- if (g->mark.dirty_sectors || ++ if (g->dirty_sectors || + (g->stripe && g->stripe != k.k->p.offset)) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", +- ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EINVAL; + goto err; + } + +- old = bucket_cmpxchg(g, new, ({ +- ret = check_bucket_ref(c, k, ptr, sectors, data_type, +- new.gen, new.data_type, +- new.dirty_sectors, new.cached_sectors); +- if (ret) +- goto err; ++ bucket_lock(g); ++ old = *g; + +- new.dirty_sectors += sectors; +- if (data_type) +- new.data_type = data_type; ++ ret = check_bucket_ref(c, k, ptr, sectors, data_type, ++ new.gen, new.data_type, ++ new.dirty_sectors, new.cached_sectors); ++ if (ret) { ++ bucket_unlock(g); ++ goto err; ++ } + +- new.stripe = true; +- })); ++ new.dirty_sectors += sectors; ++ if (data_type) ++ new.data_type = data_type; + + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + ++ new = *g; ++ bucket_unlock(g); ++ + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + err: + percpu_up_read(&c->mark_lock); +@@ -839,9 +836,9 @@ static int __mark_pointer(struct btree_trans *trans, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, +- u16 *dirty_sectors, u16 *cached_sectors) ++ u32 *dirty_sectors, u32 *cached_sectors) + { +- u16 *dst_sectors = !ptr->cached ++ u32 *dst_sectors = !ptr->cached + ? 
dirty_sectors + : cached_sectors; + int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, +@@ -865,11 +862,9 @@ static int bch2_mark_pointer(struct btree_trans *trans, + { + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; +- struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); +- struct bucket *g; ++ struct bucket old, new, *g; + u8 bucket_data_type; +- u64 v; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); +@@ -877,28 +872,25 @@ static int bch2_mark_pointer(struct btree_trans *trans, + percpu_down_read(&c->mark_lock); + g = PTR_GC_BUCKET(ca, &p.ptr); + +- v = atomic64_read(&g->_mark.v); +- do { +- new.v.counter = old.v.counter = v; +- bucket_data_type = new.data_type; +- +- ret = __mark_pointer(trans, k, &p.ptr, sectors, +- data_type, new.gen, +- &bucket_data_type, +- &new.dirty_sectors, +- &new.cached_sectors); +- if (ret) +- goto err; ++ bucket_lock(g); ++ old = *g; + +- new.data_type = bucket_data_type; ++ bucket_data_type = g->data_type; + +- if (flags & BTREE_TRIGGER_NOATOMIC) { +- g->_mark = new; +- break; +- } +- } while ((v = atomic64_cmpxchg(&g->_mark.v, +- old.v.counter, +- new.v.counter)) != old.v.counter); ++ ret = __mark_pointer(trans, k, &p.ptr, sectors, ++ data_type, g->gen, ++ &bucket_data_type, ++ &g->dirty_sectors, ++ &g->cached_sectors); ++ if (ret) { ++ bucket_unlock(g); ++ goto err; ++ } ++ ++ g->data_type = bucket_data_type; ++ ++ new = *g; ++ bucket_unlock(g); + + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + err: +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 7f7fdd024868..4a3d6bf1e3ef 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -15,20 +15,16 @@ + for (_b = (_buckets)->b + (_buckets)->first_bucket; \ + _b < (_buckets)->b + (_buckets)->nbuckets; _b++) + +-#define bucket_cmpxchg(g, new, expr) \ +-({ \ +- struct bucket *_g = g; \ +- u64 _v = atomic64_read(&(g)->_mark.v); \ +- struct bucket_mark _old; \ +- \ +- do { \ +- (new).v.counter = _old.v.counter = _v; \ +- expr; \ +- } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ +- _old.v.counter, \ +- (new).v.counter)) != _old.v.counter);\ +- _old; \ +-}) ++static inline void bucket_unlock(struct bucket *b) ++{ ++ smp_store_release(&b->lock, 0); ++} ++ ++static inline void bucket_lock(struct bucket *b) ++{ ++ while (xchg(&b->lock, 1)) ++ cpu_relax(); ++} + + static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) + { +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index f90b130fbb73..e79a33795bf9 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -7,29 +7,15 @@ + + #define BUCKET_JOURNAL_SEQ_BITS 16 + +-struct bucket_mark { +- union { +- atomic64_t v; +- +- struct { +- u8 gen; +- u8 data_type:3, +- stripe:1; +- u16 dirty_sectors; +- u16 cached_sectors; +- }; +- }; +-}; +- + struct bucket { +- union { +- struct bucket_mark _mark; +- const struct bucket_mark mark; +- }; +- +- unsigned gen_valid:1; +- u8 stripe_redundancy; +- u32 stripe; ++ u8 lock; ++ u8 gen_valid:1; ++ u8 data_type:7; ++ u8 gen; ++ u8 stripe_redundancy; ++ u32 stripe; ++ u32 dirty_sectors; ++ u32 cached_sectors; + }; + + struct bucket_array { +@@ -108,7 +94,7 @@ struct copygc_heap_entry { + u8 dev; + u8 gen; + u8 replicas; +- u16 fragmentation; ++ u32 fragmentation; + u32 sectors; + u64 offset; + }; +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 0fb60d8581a7..466975a3151f 100644 +--- a/fs/bcachefs/movinggc.c ++++ 
b/fs/bcachefs/movinggc.c +@@ -153,7 +153,7 @@ static int walk_buckets_to_copygc(struct bch_fs *c) + .dev = iter.pos.inode, + .gen = u.gen, + .replicas = 1 + u.stripe_redundancy, +- .fragmentation = u.dirty_sectors * (1U << 15) ++ .fragmentation = (u64) u.dirty_sectors * (1ULL << 31) + / ca->mi.bucket_size, + .sectors = u.dirty_sectors, + .offset = bucket_to_sector(ca, iter.pos.offset), +-- +cgit v1.2.3 + + +From c2070e4a85422938a291cfb43f7d5c4c85c1fdb3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 2 Mar 2022 23:34:26 -0500 +Subject: bcachefs: Improve bucket_alloc_fail tracepoint + +Also include the number of buckets available, and the number of buckets +awaiting journal commit - and add a sysfs counter, too. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 16 ++++++++++++---- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/sysfs.c | 4 ++++ + include/trace/events/bcachefs.h | 27 ++++++++++++++++++++++++--- + 4 files changed, 41 insertions(+), 7 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index e0dc585b50da..178d7c058597 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -432,7 +432,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + } + bch2_trans_iter_exit(trans, &iter); + +- return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); ++ return ob ?: ERR_PTR(ret); + } + + /** +@@ -471,8 +471,8 @@ again: + if (!c->blocked_allocate) + c->blocked_allocate = local_clock(); + +- trace_bucket_alloc_fail(ca, reserve); +- return ERR_PTR(-FREELIST_EMPTY); ++ ob = ERR_PTR(-FREELIST_EMPTY); ++ goto err; + } + + ret = bch2_trans_do(c, NULL, NULL, 0, +@@ -482,8 +482,16 @@ again: + + if (need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); ++err: ++ if (!ob) ++ ob = ERR_PTR(ret ?: -FREELIST_EMPTY); + +- return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); ++ if (ob == ERR_PTR(-FREELIST_EMPTY)) { ++ trace_bucket_alloc_fail(ca, reserve, avail, need_journal_commit); ++ atomic_long_inc(&c->bucket_alloc_fail); ++ } ++ ++ return ob; + } + + static int __dev_stripe_cmp(struct dev_stripe_state *stripe, +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 2d185d22d78e..a4ef9aabf274 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -897,6 +897,7 @@ struct bch_fs { + atomic_long_t read_realloc_races; + atomic_long_t extent_migrate_done; + atomic_long_t extent_migrate_raced; ++ atomic_long_t bucket_alloc_fail; + + unsigned btree_gc_periodic:1; + unsigned copy_gc_enabled:1; +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 1b5ed7adc261..bed48afb4ac9 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -188,6 +188,7 @@ read_attribute(alloc_debug); + read_attribute(read_realloc_races); + read_attribute(extent_migrate_done); + read_attribute(extent_migrate_raced); ++read_attribute(bucket_alloc_fail); + + rw_attribute(discard); + rw_attribute(label); +@@ -374,6 +375,8 @@ SHOW(bch2_fs) + atomic_long_read(&c->extent_migrate_done)); + sysfs_print(extent_migrate_raced, + atomic_long_read(&c->extent_migrate_raced)); ++ sysfs_print(bucket_alloc_fail, ++ atomic_long_read(&c->bucket_alloc_fail)); + + sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + +@@ -570,6 +573,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_read_realloc_races, + &sysfs_extent_migrate_done, + &sysfs_extent_migrate_raced, ++ &sysfs_bucket_alloc_fail, + + &sysfs_gc_gens_pos, + +diff --git 
a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 0596887959d3..832e9f191409 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -491,9 +491,30 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc, + TP_ARGS(ca, reserve) + ); + +-DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, +- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), +- TP_ARGS(ca, reserve) ++TRACE_EVENT(bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve, ++ u64 avail, u64 need_journal_commit), ++ TP_ARGS(ca, reserve, avail, need_journal_commit), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(enum alloc_reserve, reserve ) ++ __field(u64, avail ) ++ __field(u64, need_journal_commit ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->dev; ++ __entry->reserve = reserve; ++ __entry->avail = avail; ++ __entry->need_journal_commit = need_journal_commit; ++ ), ++ ++ TP_printk("%d,%d reserve %d avail %llu need_journal_commit %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->reserve, ++ __entry->avail, ++ __entry->need_journal_commit) + ); + + DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, +-- +cgit v1.2.3 + + +From e3f47c13740cecba83e47f01e989f3b25138e5ec Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 13 Mar 2022 19:27:55 -0400 +Subject: bcachefs: Improve bucket_alloc tracepoints + + - bucket_alloc_fail now indicates whether allocation was nonblocking + - we now return strings, not integers, for alloc reserve. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 22 ++++++++++++------- + fs/bcachefs/alloc_foreground.h | 2 ++ + fs/bcachefs/alloc_types.h | 14 ++++++++----- + fs/bcachefs/btree_update_interior.c | 4 ++-- + fs/bcachefs/buckets.h | 8 +++---- + fs/bcachefs/ec.c | 8 +++---- + fs/bcachefs/io.h | 4 ++-- + fs/bcachefs/journal.c | 2 +- + fs/bcachefs/move.c | 2 +- + fs/bcachefs/movinggc.c | 21 +++---------------- + fs/bcachefs/sysfs.c | 2 +- + include/trace/events/bcachefs.h | 42 ++++++++++++++++++++----------------- + 12 files changed, 67 insertions(+), 64 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 178d7c058597..5b1149365389 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -32,6 +32,13 @@ + #include + #include + ++const char * const bch2_alloc_reserves[] = { ++#define x(t) #t, ++ BCH_ALLOC_RESERVES() ++#undef x ++ NULL ++}; ++ + /* + * Open buckets represent a bucket that's currently being allocated from. 
They + * serve two purposes: +@@ -172,10 +179,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) + static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) + { + switch (reserve) { +- case RESERVE_BTREE: +- case RESERVE_BTREE_MOVINGGC: ++ case RESERVE_btree: ++ case RESERVE_btree_movinggc: + return 0; +- case RESERVE_MOVINGGC: ++ case RESERVE_movinggc: + return OPEN_BUCKETS_COUNT / 4; + default: + return OPEN_BUCKETS_COUNT / 2; +@@ -213,7 +220,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * + + spin_unlock(&c->freelist_lock); + +- trace_open_bucket_alloc_fail(ca, reserve); ++ trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]); + return ERR_PTR(-OPEN_BUCKETS_EMPTY); + } + +@@ -254,7 +261,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * + + spin_unlock(&c->freelist_lock); + +- trace_bucket_alloc(ca, reserve); ++ trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]); + return ob; + } + +@@ -487,7 +494,8 @@ err: + ob = ERR_PTR(ret ?: -FREELIST_EMPTY); + + if (ob == ERR_PTR(-FREELIST_EMPTY)) { +- trace_bucket_alloc_fail(ca, reserve, avail, need_journal_commit); ++ trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail, ++ need_journal_commit, cl == NULL); + atomic_long_inc(&c->bucket_alloc_fail); + } + +@@ -521,7 +529,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, + struct dev_stripe_state *stripe) + { + u64 *v = stripe->next_alloc + ca->dev_idx; +- u64 free_space = dev_buckets_available(ca, RESERVE_NONE); ++ u64 free_space = dev_buckets_available(ca, RESERVE_none); + u64 free_space_inv = free_space + ? div64_u64(1ULL << 48, free_space) + : 1ULL << 48; +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index f51cec5e7cc1..8bc78877f0fc 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -12,6 +12,8 @@ struct bch_dev; + struct bch_fs; + struct bch_devs_List; + ++extern const char * const bch2_alloc_reserves[]; ++ + struct dev_alloc_list { + unsigned nr; + u8 devs[BCH_SB_MEMBERS_MAX]; +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index 22e1fbda9046..21b56451bc18 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -10,12 +10,16 @@ + + struct ec_bucket_buf; + ++#define BCH_ALLOC_RESERVES() \ ++ x(btree_movinggc) \ ++ x(btree) \ ++ x(movinggc) \ ++ x(none) ++ + enum alloc_reserve { +- RESERVE_BTREE_MOVINGGC = -2, +- RESERVE_BTREE = -1, +- RESERVE_MOVINGGC = 0, +- RESERVE_NONE = 1, +- RESERVE_NR = 2, ++#define x(name) RESERVE_##name, ++ BCH_ALLOC_RESERVES() ++#undef x + }; + + #define OPEN_BUCKETS_COUNT 1024 +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 5834190da6a9..4ba229bfb0ee 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -194,10 +194,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + + if (flags & BTREE_INSERT_USE_RESERVE) { + nr_reserve = 0; +- alloc_reserve = RESERVE_BTREE_MOVINGGC; ++ alloc_reserve = RESERVE_btree_movinggc; + } else { + nr_reserve = BTREE_NODE_RESERVE; +- alloc_reserve = RESERVE_BTREE; ++ alloc_reserve = RESERVE_btree; + } + + mutex_lock(&c->btree_reserve_cache_lock); +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 4a3d6bf1e3ef..25baca33e885 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -122,16 +122,16 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, + s64 reserved = 0; + 
+ switch (reserve) { +- case RESERVE_NONE: ++ case RESERVE_none: + reserved += ca->mi.nbuckets >> 6; + fallthrough; +- case RESERVE_MOVINGGC: ++ case RESERVE_movinggc: + reserved += ca->nr_btree_reserve; + fallthrough; +- case RESERVE_BTREE: ++ case RESERVE_btree: + reserved += ca->nr_btree_reserve; + fallthrough; +- case RESERVE_BTREE_MOVINGGC: ++ case RESERVE_btree_movinggc: + break; + default: + BUG(); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 7629c34b7cd0..616a551265e0 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1304,8 +1304,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + &nr_have_parity, + &have_cache, + h->copygc +- ? RESERVE_MOVINGGC +- : RESERVE_NONE, ++ ? RESERVE_movinggc ++ : RESERVE_none, + 0, + cl); + +@@ -1333,8 +1333,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + &nr_have_data, + &have_cache, + h->copygc +- ? RESERVE_MOVINGGC +- : RESERVE_NONE, ++ ? RESERVE_movinggc ++ : RESERVE_none, + 0, + cl); + +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +index 1aa422dccef7..fb5114518666 100644 +--- a/fs/bcachefs/io.h ++++ b/fs/bcachefs/io.h +@@ -50,7 +50,7 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) + + static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) + { +- return op->alloc_reserve == RESERVE_MOVINGGC ++ return op->alloc_reserve == RESERVE_movinggc + ? op->c->copygc_wq + : op->c->btree_update_wq; + } +@@ -79,7 +79,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + op->compression_type = bch2_compression_opt_to_type[opts.compression]; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; +- op->alloc_reserve = RESERVE_NONE; ++ op->alloc_reserve = RESERVE_none; + op->incompressible = 0; + op->open_buckets.nr = 0; + op->devs_have.nr = 0; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index e33085fe978f..6ea6810337db 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -801,7 +801,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + break; + } + } else { +- ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_NONE, ++ ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, + false, cl); + if (IS_ERR(ob[nr_got])) { + ret = cl ? -EAGAIN : -ENOSPC; +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 16bca1446a2b..b4588a919dd4 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -351,7 +351,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + } + + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { +- m->op.alloc_reserve = RESERVE_MOVINGGC; ++ m->op.alloc_reserve = RESERVE_movinggc; + m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; + } else { + /* XXX: this should probably be passed in */ +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 466975a3151f..1c92d5365958 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -30,21 +30,6 @@ + #include + #include + +-/* +- * We can't use the entire copygc reserve in one iteration of copygc: we may +- * need the buckets we're freeing up to go back into the copygc reserve to make +- * forward progress, but if the copygc reserve is full they'll be available for +- * any allocation - and it's possible that in a given iteration, we free up most +- * of the buckets we're going to free before we allocate most of the buckets +- * we're going to allocate. 
+- * +- * If we only use half of the reserve per iteration, then in steady state we'll +- * always have room in the reserve for the buckets we're going to need in the +- * next iteration: +- */ +-#define COPYGC_BUCKETS_PER_ITER(ca) \ +- ((ca)->free[RESERVE_MOVINGGC].size / 2) +- + static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) + { + const struct copygc_heap_entry *l = _l; +@@ -250,7 +235,7 @@ static int bch2_copygc(struct bch_fs *c) + } + + for_each_rw_member(ca, c, dev_idx) { +- s64 avail = min(dev_buckets_available(ca, RESERVE_MOVINGGC), ++ s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc), + ca->mi.nbuckets >> 6); + + sectors_reserved += avail * ca->mi.bucket_size; +@@ -268,7 +253,7 @@ static int bch2_copygc(struct bch_fs *c) + } + + /* +- * Our btree node allocations also come out of RESERVE_MOVINGGC: ++ * Our btree node allocations also come out of RESERVE_movingc: + */ + sectors_reserved = (sectors_reserved * 3) / 4; + if (!sectors_reserved) { +@@ -354,7 +339,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + +- fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_NONE) * ++ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * + ca->mi.bucket_size) >> 1); + fragmented = usage.d[BCH_DATA_user].fragmented; + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index bed48afb4ac9..d018e8bc2677 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -734,7 +734,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + "open_buckets_user\t%u\n" + "btree reserve cache\t%u\n", + stats.buckets_ec, +- __dev_buckets_available(ca, stats, RESERVE_NONE), ++ __dev_buckets_available(ca, stats, RESERVE_none), + c->freelist_wait.list.first ? 
"waiting" : "empty", + OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, + ca->nr_open_buckets, +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 832e9f191409..0fd2fc11b86b 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -468,58 +468,62 @@ TRACE_EVENT(invalidate, + ); + + DECLARE_EVENT_CLASS(bucket_alloc, +- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), +- TP_ARGS(ca, reserve), ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve), + + TP_STRUCT__entry( + __field(dev_t, dev ) +- __field(enum alloc_reserve, reserve ) ++ __array(char, reserve, 16 ) + ), + + TP_fast_assign( + __entry->dev = ca->dev; +- __entry->reserve = reserve; ++ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + ), + +- TP_printk("%d,%d reserve %d", ++ TP_printk("%d,%d reserve %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve) + ); + + DEFINE_EVENT(bucket_alloc, bucket_alloc, +- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), +- TP_ARGS(ca, reserve) ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve) + ); + + TRACE_EVENT(bucket_alloc_fail, +- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve, +- u64 avail, u64 need_journal_commit), +- TP_ARGS(ca, reserve, avail, need_journal_commit), ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, ++ u64 avail, u64 need_journal_commit, ++ bool nonblocking), ++ TP_ARGS(ca, alloc_reserve, avail, need_journal_commit, nonblocking), + + TP_STRUCT__entry( +- __field(dev_t, dev ) +- __field(enum alloc_reserve, reserve ) +- __field(u64, avail ) +- __field(u64, need_journal_commit ) ++ __field(dev_t, dev ) ++ __array(char, reserve, 16 ) ++ __field(u64, avail ) ++ __field(u64, need_journal_commit ) ++ __field(bool, nonblocking ) + ), + + TP_fast_assign( + __entry->dev = ca->dev; +- __entry->reserve = reserve; ++ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->avail = avail; + __entry->need_journal_commit = need_journal_commit; ++ __entry->nonblocking = nonblocking; + ), + +- TP_printk("%d,%d reserve %d avail %llu need_journal_commit %llu", ++ TP_printk("%d,%d reserve %s avail %llu need_journal_commit %llu nonblocking %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve, + __entry->avail, +- __entry->need_journal_commit) ++ __entry->need_journal_commit, ++ __entry->nonblocking) + ); + + DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, +- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), +- TP_ARGS(ca, reserve) ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve) + ); + + /* Moving IO */ +-- +cgit v1.2.3 + + +From 5b8c2f04911450cb99cdc8ca70501ceb29028c7c Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 15 Mar 2022 02:41:21 -0400 +Subject: bcachefs: Restore journal write point at startup + +This patch tweaks the journal recovery path so that we start writing +right after where we left off, instead of the next empty bucket. This is +partly prep work for supporting zoned devices, but it's also good to do +in general to avoid the journal completely filling up and getting stuck. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 23 +++++++++++++++++++++-- + 1 file changed, 21 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index bacb8058f60a..27996f39c4c3 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -909,6 +909,7 @@ static void bch2_journal_read_device(struct closure *cl) + struct bch_fs *c = ca->fs; + struct journal_list *jlist = + container_of(cl->parent, struct journal_list, cl); ++ struct journal_replay *r; + struct journal_read_buf buf = { NULL, 0 }; + u64 min_seq = U64_MAX; + unsigned i; +@@ -944,11 +945,29 @@ static void bch2_journal_read_device(struct closure *cl) + * allocate + */ + while (ja->bucket_seq[ja->cur_idx] > min_seq && +- ja->bucket_seq[ja->cur_idx] > ++ ja->bucket_seq[ja->cur_idx] == + ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + +- ja->sectors_free = 0; ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ mutex_lock(&jlist->lock); ++ list_for_each_entry(r, jlist->head, list) { ++ for (i = 0; i < r->nr_ptrs; i++) { ++ if (r->ptrs[i].dev == ca->dev_idx && ++ sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { ++ unsigned wrote = (r->ptrs[i].sector % ca->mi.bucket_size) + ++ vstruct_sectors(&r->j, c->block_bits); ++ ++ ja->sectors_free = min(ja->sectors_free, ++ ca->mi.bucket_size - wrote); ++ } ++ } ++ } ++ mutex_unlock(&jlist->lock); ++ ++ BUG_ON(ja->bucket_seq[ja->cur_idx] && ++ ja->sectors_free == ca->mi.bucket_size); + + /* + * Set dirty_idx to indicate the entire journal is full and needs to be +-- +cgit v1.2.3 + + +From 1b7d9418e31f2366ed9bac14b740dd19f6b551c6 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 15 Mar 2022 16:40:55 -0400 +Subject: bcachefs: Copygc allocations shouldn't be nowait + +We don't actually want copygc allocations to be nowait - an allocation +for copygc might fail and then later succeed due to a bucket needing to +wait on journal commit, or to be discarded. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/move.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index b4588a919dd4..8eb49381b030 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -352,7 +352,6 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { + m->op.alloc_reserve = RESERVE_movinggc; +- m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; + } else { + /* XXX: this should probably be passed in */ + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; +-- +cgit v1.2.3 + + +From 3bb160d70af2d9c101fd6f7a63e32203431f5a44 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 14 Mar 2022 21:48:42 -0400 +Subject: bcachefs: Introduce a separate journal watermark for copygc + +Since journal reclaim -> btree key cache flushing may require the +allocation of new btree nodes, it has an implicit dependency on copygc +in order to make forward progress - so we should avoid blocking copygc +unless the journal is really close to full. + +This introduces watermarks to replace our single MAY_GET_UNRESERVED bit +in the journal, and adds a watermark for copygc and plumbs it through. 
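The watermark replaces the single "reserved" bit with a small ordered enum carried in the low bits of the existing flags word; a minimal sketch of that encoding with illustrative names (not the patch's exact definitions):

enum example_watermark {
	WATERMARK_reserved,   /* may consume the last of the journal space */
	WATERMARK_copygc,     /* blocked only when the journal is nearly full */
	WATERMARK_normal,     /* blocked first as the journal fills up */
};

#define WATERMARK_MASK 3u     /* low two bits of the flags word */

static inline unsigned flags_with_watermark(unsigned flags,
					    enum example_watermark w)
{
	return (flags & ~WATERMARK_MASK) | (unsigned) w;
}

static inline enum example_watermark flags_watermark(unsigned flags)
{
	return (enum example_watermark) (flags & WATERMARK_MASK);
}

Copygc can then submit its transactions with flags_with_watermark(flags, WATERMARK_copygc), and the journal admits a reservation as long as its fill level has not crossed that caller's watermark, so copygc keeps making progress after normal writers are already blocked.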
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_key_cache.c | 2 +- + fs/bcachefs/btree_update.h | 7 ++--- + fs/bcachefs/btree_update_interior.c | 8 +++--- + fs/bcachefs/btree_update_leaf.c | 12 ++++----- + fs/bcachefs/journal.c | 51 ++++++++++++++++++++++------------- + fs/bcachefs/journal.h | 53 ++++++++++++++++++------------------- + fs/bcachefs/journal_reclaim.c | 8 +++--- + fs/bcachefs/journal_types.h | 41 +++++++++++++++++++++------- + fs/bcachefs/movinggc.c | 2 +- + fs/bcachefs/recovery.c | 5 ++-- + 10 files changed, 108 insertions(+), 81 deletions(-) + +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index b1b7a30417bc..f5a942b6bbf7 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -421,7 +421,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + (ck->journal.seq == journal_last_seq(j) +- ? BTREE_INSERT_JOURNAL_RESERVED ++ ? JOURNAL_WATERMARK_reserved + : 0)| + commit_flags); + if (ret) { +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index d9a406a28f47..ca142f955193 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -16,12 +16,12 @@ bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, + void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); + + enum btree_insert_flags { +- __BTREE_INSERT_NOFAIL, ++ /* First two bits for journal watermark: */ ++ __BTREE_INSERT_NOFAIL = 2, + __BTREE_INSERT_NOCHECK_RW, + __BTREE_INSERT_LAZY_RW, + __BTREE_INSERT_USE_RESERVE, + __BTREE_INSERT_JOURNAL_REPLAY, +- __BTREE_INSERT_JOURNAL_RESERVED, + __BTREE_INSERT_JOURNAL_RECLAIM, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, +@@ -41,9 +41,6 @@ enum btree_insert_flags { + /* Insert is for journal replay - don't get journal reservations: */ + #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) + +-/* Indicates that we have pre-reserved space in the journal: */ +-#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) +- + /* Insert is being called from journal reclaim path: */ + #define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) + +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 4ba229bfb0ee..c2232f8185c5 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -606,7 +606,7 @@ static void btree_update_nodes_written(struct btree_update *as) + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| +- BTREE_INSERT_JOURNAL_RESERVED, ++ JOURNAL_WATERMARK_reserved, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_exit(&trans); + +@@ -970,13 +970,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + ? 
BCH_DISK_RESERVATION_NOFAIL : 0; + unsigned nr_nodes[2] = { 0, 0 }; + unsigned update_level = level; +- int journal_flags = 0; ++ int journal_flags = flags & JOURNAL_WATERMARK_MASK; + int ret = 0; + + BUG_ON(!path->should_be_locked); + +- if (flags & BTREE_INSERT_JOURNAL_RESERVED) +- journal_flags |= JOURNAL_RES_GET_RESERVED; + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) + journal_flags |= JOURNAL_RES_GET_NONBLOCK; + +@@ -1958,7 +1956,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_JOURNAL_RECLAIM| +- BTREE_INSERT_JOURNAL_RESERVED); ++ JOURNAL_WATERMARK_reserved); + if (ret) + goto err; + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index c5d2436d540f..fec09f00c4e3 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -295,11 +295,10 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, + struct bch_fs *c = trans->c; + int ret; + +- if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) +- flags |= JOURNAL_RES_GET_RESERVED; +- + ret = bch2_journal_res_get(&c->journal, &trans->journal_res, +- trans->journal_u64s, flags); ++ trans->journal_u64s, ++ flags| ++ (trans->flags & JOURNAL_WATERMARK_MASK)); + + return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret; + } +@@ -906,8 +905,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, trans->journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK| +- ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) +- ? JOURNAL_RES_GET_RESERVED : 0)); ++ (trans->flags & JOURNAL_WATERMARK_MASK)); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, + trans->journal_preres_u64s, trace_ip); +@@ -992,7 +990,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, + bch2_trans_unlock(trans); + + if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && +- !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) { ++ !(trans->flags & JOURNAL_WATERMARK_reserved)) { + trans->restarted = true; + ret = -EAGAIN; + break; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 6ea6810337db..6d91a2c8f6b5 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -20,6 +20,18 @@ + + #include + ++#define x(n) #n, ++static const char * const bch2_journal_watermarks[] = { ++ JOURNAL_WATERMARKS() ++ NULL ++}; ++ ++static const char * const bch2_journal_errors[] = { ++ JOURNAL_ERRORS() ++ NULL ++}; ++#undef x ++ + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) + { + return seq > j->seq_ondisk; +@@ -208,19 +220,19 @@ static int journal_entry_open(struct journal *j) + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + + if (j->blocked) +- return cur_entry_blocked; ++ return JOURNAL_ERR_blocked; + + if (j->cur_entry_error) + return j->cur_entry_error; + + if (bch2_journal_error(j)) +- return cur_entry_insufficient_devices; /* -EROFS */ ++ return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + + if (!fifo_free(&j->pin)) +- return cur_entry_journal_pin_full; ++ return JOURNAL_ERR_journal_pin_full; + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1) +- return cur_entry_max_in_flight; ++ return JOURNAL_ERR_max_in_flight; + + BUG_ON(!j->cur_entry_sectors); + +@@ -239,7 +251,7 @@ static int journal_entry_open(struct journal *j) + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + + if (u64s <= 0) +- return cur_entry_journal_full; ++ return 
JOURNAL_ERR_journal_full; + + if (fifo_empty(&j->pin) && j->reclaim_thread) + wake_up_process(j->reclaim_thread); +@@ -355,13 +367,12 @@ retry: + return 0; + } + +- if (!(flags & JOURNAL_RES_GET_RESERVED) && +- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { + /* + * Don't want to close current journal entry, just need to + * invoke reclaim: + */ +- ret = cur_entry_journal_full; ++ ret = JOURNAL_ERR_journal_full; + goto unlock; + } + +@@ -379,10 +390,10 @@ retry: + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + ret = journal_entry_open(j); + +- if (ret == cur_entry_max_in_flight) ++ if (ret == JOURNAL_ERR_max_in_flight) + trace_journal_entry_full(c); + unlock: +- if ((ret && ret != cur_entry_insufficient_devices) && ++ if ((ret && ret != JOURNAL_ERR_insufficient_devices) && + !j->res_get_blocked_start) { + j->res_get_blocked_start = local_clock() ?: 1; + trace_journal_full(c); +@@ -394,14 +405,15 @@ unlock: + if (!ret) + goto retry; + +- if ((ret == cur_entry_journal_full || +- ret == cur_entry_journal_pin_full) && ++ if ((ret == JOURNAL_ERR_journal_full || ++ ret == JOURNAL_ERR_journal_pin_full) && + !can_discard && + !nr_unwritten_journal_entries(j) && +- (flags & JOURNAL_RES_GET_RESERVED)) { ++ (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) { + struct printbuf buf = PRINTBUF; + +- bch_err(c, "Journal stuck! Hava a pre-reservation but journal full"); ++ bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (ret %s)", ++ bch2_journal_errors[ret]); + + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "%s", buf.buf); +@@ -419,8 +431,8 @@ unlock: + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ +- if ((ret == cur_entry_journal_full || +- ret == cur_entry_journal_pin_full) && ++ if ((ret == JOURNAL_ERR_journal_full || ++ ret == JOURNAL_ERR_journal_pin_full) && + !(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); +@@ -433,7 +445,7 @@ unlock: + } + } + +- return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN; ++ return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN; + } + + /* +@@ -1227,13 +1239,14 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + rcu_read_lock(); + s = READ_ONCE(j->reservations); + +- pr_buf(out, "dirty journal entries:\t%llu\n", fifo_used(&j->pin)); ++ pr_buf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); + pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + pr_buf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); + pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); + pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); ++ pr_buf(out, "watermark:\t\t%u\n", bch2_journal_watermarks[j->watermark]); + pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); + pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); +@@ -1243,7 +1256,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); + pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); +- pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error); ++ pr_buf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + pr_buf(out, "current entry:\t\t"); + + switch (s.cur_entry_offset) { +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 989c33157cd2..e7321c327d9d 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -295,9 +295,9 @@ static inline void bch2_journal_res_put(struct journal *j, + int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, + unsigned); + +-#define JOURNAL_RES_GET_NONBLOCK (1 << 0) +-#define JOURNAL_RES_GET_CHECK (1 << 1) +-#define JOURNAL_RES_GET_RESERVED (1 << 2) ++/* First two bits for JOURNAL_WATERMARK: */ ++#define JOURNAL_RES_GET_NONBLOCK (1 << 2) ++#define JOURNAL_RES_GET_CHECK (1 << 3) + + static inline int journal_res_get_fast(struct journal *j, + struct journal_res *res, +@@ -318,8 +318,7 @@ static inline int journal_res_get_fast(struct journal *j, + + EBUG_ON(!journal_state_count(new, new.idx)); + +- if (!(flags & JOURNAL_RES_GET_RESERVED) && +- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) ++ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) + return 0; + + new.cur_entry_offset += res->u64s; +@@ -372,23 +371,27 @@ out: + + /* journal_preres: */ + +-static inline bool journal_check_may_get_unreserved(struct journal *j) ++static inline void journal_set_watermark(struct journal *j) + { + union journal_preres_state s = READ_ONCE(j->prereserved); +- bool ret = s.reserved < s.remaining && +- fifo_free(&j->pin) > j->pin.size / 4; +- +- lockdep_assert_held(&j->lock); +- +- if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { +- if (ret) { +- set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); +- journal_wake(j); +- } else { +- clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); +- } +- } +- return ret; ++ unsigned watermark = JOURNAL_WATERMARK_any; ++ ++ if (fifo_free(&j->pin) < j->pin.size / 4) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); ++ if (fifo_free(&j->pin) < j->pin.size / 8) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); ++ ++ if (s.reserved > s.remaining) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); ++ if (!s.remaining) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); ++ ++ if (watermark == j->watermark) ++ return; ++ ++ swap(watermark, j->watermark); ++ if (watermark > j->watermark) ++ journal_wake(j); + } + + static inline void bch2_journal_preres_put(struct journal *j, +@@ -408,12 +411,8 @@ static inline void bch2_journal_preres_put(struct journal *j, + closure_wake_up(&j->preres_wait); + } + +- if (s.reserved <= s.remaining && +- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { +- spin_lock(&j->lock); +- journal_check_may_get_unreserved(j); +- spin_unlock(&j->lock); +- } ++ if (s.reserved <= s.remaining && j->watermark) ++ journal_set_watermark(j); + } + + int __bch2_journal_preres_get(struct journal *, +@@ -434,7 +433,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, + old.v = new.v = v; + ret = 0; + +- if ((flags & JOURNAL_RES_GET_RESERVED) || ++ if ((flags & JOURNAL_WATERMARK_reserved) || + new.reserved + d < new.remaining) { + new.reserved += d; + ret = 1; +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index a920a111dad7..6f1bad522949 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ 
b/fs/bcachefs/journal_reclaim.c +@@ -195,7 +195,7 @@ void bch2_journal_space_available(struct journal *j) + j->can_discard = can_discard; + + if (nr_online < c->opts.metadata_replicas_required) { +- ret = cur_entry_insufficient_devices; ++ ret = JOURNAL_ERR_insufficient_devices; + goto out; + } + +@@ -217,9 +217,9 @@ void bch2_journal_space_available(struct journal *j) + printbuf_exit(&buf); + + bch2_fatal_error(c); +- ret = cur_entry_journal_stuck; ++ ret = JOURNAL_ERR_journal_stuck; + } else if (!j->space[journal_space_discarded].next_entry) +- ret = cur_entry_journal_full; ++ ret = JOURNAL_ERR_journal_full; + + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && +@@ -238,7 +238,7 @@ out: + j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_error = ret; + journal_set_remaining(j, u64s_remaining); +- journal_check_may_get_unreserved(j); ++ journal_set_watermark(j); + + if (!ret) + journal_wake(j); +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 071fcb4a8422..a6cdb885ad41 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -144,10 +144,38 @@ enum journal_space_from { + enum { + JOURNAL_REPLAY_DONE, + JOURNAL_STARTED, +- JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, + }; + ++#define JOURNAL_WATERMARKS() \ ++ x(any) \ ++ x(copygc) \ ++ x(reserved) ++ ++enum journal_watermark { ++#define x(n) JOURNAL_WATERMARK_##n, ++ JOURNAL_WATERMARKS() ++#undef x ++}; ++ ++#define JOURNAL_WATERMARK_MASK 3 ++ ++/* Reasons we may fail to get a journal reservation: */ ++#define JOURNAL_ERRORS() \ ++ x(ok) \ ++ x(blocked) \ ++ x(max_in_flight) \ ++ x(journal_full) \ ++ x(journal_pin_full) \ ++ x(journal_stuck) \ ++ x(insufficient_devices) ++ ++enum journal_errors { ++#define x(n) JOURNAL_ERR_##n, ++ JOURNAL_ERRORS() ++#undef x ++}; ++ + /* Embedded in struct bch_fs */ + struct journal { + /* Fastpath stuff up front: */ +@@ -155,6 +183,7 @@ struct journal { + unsigned long flags; + + union journal_res_state reservations; ++ enum journal_watermark watermark; + + /* Max size of current journal entry */ + unsigned cur_entry_u64s; +@@ -164,15 +193,7 @@ struct journal { + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ +- enum { +- cur_entry_ok, +- cur_entry_blocked, +- cur_entry_max_in_flight, +- cur_entry_journal_full, +- cur_entry_journal_pin_full, +- cur_entry_journal_stuck, +- cur_entry_insufficient_devices, +- } cur_entry_error; ++ enum journal_errors cur_entry_error; + + union journal_preres_state prereserved; + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 1c92d5365958..4f32d38649c8 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -91,7 +91,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + data_opts->target = io_opts->background_target; + data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| +- BTREE_INSERT_JOURNAL_RESERVED; ++ JOURNAL_WATERMARK_copygc; + data_opts->rewrite_dev = p.ptr.dev; + + if (p.has_ec) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index fe2c5cb6d430..66492dde7930 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -563,8 +563,9 @@ static int bch2_journal_replay(struct bch_fs *c) + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL| +- BTREE_INSERT_JOURNAL_RESERVED| +- (!k->allocated ? 
BTREE_INSERT_JOURNAL_REPLAY : 0), ++ (!k->allocated ++ ? BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved ++ : 0), + bch2_journal_replay_key(&trans, k)); + if (ret) { + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", +-- +cgit v1.2.3 + + +From 1232b9804763979cdb9451a8b23a67cf4bc97a59 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 16 Mar 2022 20:31:15 -0400 +Subject: bcachefs: Convert some WARN_ONs to WARN_ON_ONCE + +These warnings are symptomatic of something else going wrong, we don't +want them spamming up the logs as that'll make it harder to find the +real issue. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fs-io.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index b05d6e896f03..051372b88347 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -1287,7 +1287,7 @@ static void bch2_writepage_io_done(struct closure *cl) + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + */ +- WARN_ON(io->op.i_sectors_delta > 0); ++ WARN_ON_ONCE(io->op.i_sectors_delta > 0); + + /* + * (error (due to going RO) halfway through a page can screw that up +@@ -1473,8 +1473,8 @@ do_io: + sectors << 9, offset << 9)); + + /* Check for writing past i_size: */ +- WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > +- round_up(i_size, block_bytes(c))); ++ WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > ++ round_up(i_size, block_bytes(c))); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; +-- +cgit v1.2.3 + + +From 8dd3fe282927d03eb712f14ed562f934e67b8550 Mon Sep 17 00:00:00 2001 +From: Daniel Hill +Date: Wed, 16 Mar 2022 10:59:55 +1300 +Subject: bcachefs: Fix check allocations repair code. + +We don't want to run triggers when repairing inside bch2_gc() these +triggers cause ERO due to errors that will be later fixed by +bch2_check_alloc_info() + +Signed-off-by: Daniel Hill +--- + fs/bcachefs/btree_gc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 5c54a0ca681c..ba81043fff51 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1367,7 +1367,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + if (IS_ERR(a)) + return PTR_ERR(a); + +- ret = bch2_trans_update(trans, iter, &a->k, 0); ++ ret = bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); + fsck_err: + return ret; + } +-- +cgit v1.2.3 + + +From b2bed1bcea83f60cfdc8331dd0059c5d37a0407b Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 17 Mar 2022 21:35:51 -0400 +Subject: bcachefs: Fix large key cache keys + +Previously, we'd go into an infinite loop when attempting to cache a +bkey in the key cache larger than 128 u64s - since we were only using a +u8 for the size field, it'd get rounded up to 256 then truncated to 0. +Oops. 
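As an illustrative aside (not part of the patch itself): the failure is plain integer truncation - rounding a key of more than 128 u64s up to the next power of two gives 256, which does not fit in a u8, so the stored size becomes 0 and the "is the buffer big enough yet?" check can never succeed. A standalone demonstration:

```
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned key_u64s  = 130;	/* key larger than 128 u64s */
	unsigned rounded   = 256;	/* roundup_pow_of_two(130) */
	uint8_t  old_field = rounded;	/* u8: truncates to 0 */
	uint16_t new_field = rounded;	/* u16: stores 256 as expected */

	printf("u8 stores %u, u16 stores %u\n", old_field, new_field);
	/* With the u8 field, the cached size stays 0 < key_u64s forever. */
	return 0;
}
```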
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_types.h | 2 +- + fs/bcachefs/btree_update_leaf.c | 5 +++-- + include/trace/events/bcachefs.h | 40 +++++++++++++++++++++++++++++++++++++--- + 3 files changed, 41 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 788b9811148f..993f04f52149 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -326,7 +326,7 @@ struct bkey_cached { + struct btree_bkey_cached_common c; + + unsigned long flags; +- u8 u64s; ++ u16 u64s; + bool valid; + u32 btree_trans_barrier_seq; + struct bkey_cached_key key; +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index fec09f00c4e3..8d185c7c10ef 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -349,7 +349,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; +- unsigned new_u64s; ++ unsigned old_u64s = ck->u64s, new_u64s; + struct bkey_i *new_k; + + EBUG_ON(path->level); +@@ -383,7 +383,8 @@ btree_key_can_insert_cached(struct btree_trans *trans, + * transaction restart: + */ + trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, +- path->btree_id, &path->pos); ++ path->btree_id, &path->pos, ++ old_u64s, new_u64s); + /* + * Not using btree_trans_restart() because we can't unlock here, we have + * write locks held: +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 0fd2fc11b86b..08de7e617247 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -943,12 +943,46 @@ TRACE_EVENT(trans_restart_mem_realloced, + __entry->bytes) + ); + +-DEFINE_EVENT(transaction_restart_iter, trans_restart_key_cache_key_realloced, ++TRACE_EVENT(trans_restart_key_cache_key_realloced, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, +- struct bpos *pos), +- TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++ struct bpos *pos, ++ unsigned old_u64s, ++ unsigned new_u64s), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos, old_u64s, new_u64s), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(enum btree_id, btree_id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, snapshot ) ++ __field(u32, old_u64s ) ++ __field(u32, new_u64s ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->inode = pos->inode; ++ __entry->offset = pos->offset; ++ __entry->snapshot = pos->snapshot; ++ __entry->old_u64s = old_u64s; ++ __entry->new_u64s = new_u64s; ++ ), ++ ++ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ bch2_btree_ids[__entry->btree_id], ++ __entry->inode, ++ __entry->offset, ++ __entry->snapshot, ++ __entry->old_u64s, ++ __entry->new_u64s) + ); + + #endif /* _TRACE_BCACHE_H */ +-- +cgit v1.2.3 + + +From b7658e98a9c354b49c487b332920e9464aa34aac Mon Sep 17 00:00:00 2001 +From: Daniel Hill +Date: Sun, 20 Mar 2022 16:46:17 +1300 +Subject: Update issue templates + +--- + .github/ISSUE_TEMPLATE/bug_report.md | 61 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 61 insertions(+) + create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md + +diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md +new 
file mode 100644 +index 000000000000..8af34357dd98 +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/bug_report.md +@@ -0,0 +1,61 @@ ++--- ++name: Bug report ++about: Create a report to help us improve ++title: " [short commit id]" ++labels: bug ++assignees: YellowOnion ++ ++--- ++ ++**Please search for duplicates** ++ ++**Version** ++ ++Make sure you're using a reasonably new version. ++ ++Provide the commit hash from the kernel version (preferable) or tools, don't say "I'm using the latest master" as that will very quickly become out of date. ++ ++**Generic info** ++Provide the output of: ++``` ++bcachefs fs usage ++bcachefs show-super ++``` ++**Tools bugs** ++ ++* pull the latest version, compile it, do not strip the binary. ++* provide the exact commands you used to run. ++* run with gdb: `gdb -ex run --args ./bcacehfs ` ++ ++If you get an assert/segfault etc: ++* type `bt` in to and provide the output here. ++ ++If the tools lockup: ++* run `perf top -p $(pidof bcachefs)` and provide a screenshot. ++* press ctrl+c to interrupt the process and provide the output of `bt`. ++ ++**Kernel bugs** ++Compile the kernel with these flags: ++ ++``` ++CONFIG_PREEMPT=y ++CONFIG_BCACHEFS_DEBUG=y ++CONFIG_KALLSYMS=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_DEBUG_FS=y ++CONFIG_DYNAMIC_FTRACE=y ++CONFIG_FTRACE=y ++``` ++Provide the output of `dmesg` either in a paste-bin or as attachment, if less than 30~ lines just provide inline here. ++ ++ ++**Optional Advanced** ++ ++If lockup or performance issues: ++* run `perf record` and `perf record -e 'bcachefs:*' -o events.data` both during the window of issue and then ctrl+c. ++* run `perf archive` to dump symbols. ++* archive, compress and upload the files: `perf.data`, `events.data` and `perf.data.tar.bz2`. ++ ++Upload large files to a file storage provider: ++* provide the output of `bcachefs list_journal -a | zstd -f -T0 -o ../journal.log.zst` ++*compress & upload all the `metdata.dump.*` files from: bcachefs dump -o metadata.dump +-- +cgit v1.2.3 + + +From d31805b24de68cabec2910289af6cec4db327321 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 20 Mar 2022 23:34:11 -0400 +Subject: bcachefs: x-macro metadata version enum + +Now we've got strings for metadata versions - this changes +bch2_sb_to_text() and our mount log message to use it. 
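As an illustrative aside (not part of the patch itself): the x-macro pattern keeps the enum values and their printable names in a single list, so the two can never drift apart. In miniature:

```
#include <stddef.h>

/* Illustrative sketch of the x-macro pattern used below. */
#define MY_VERSIONS()			\
	x(bkey_renumber,	10)	\
	x(snapshot,		12)

enum my_version {
#define x(t, n) my_version_##t = n,
	MY_VERSIONS()
#undef x
};

#define x(t, n) [n] = #t,
static const char * const my_version_strs[] = {
	MY_VERSIONS()
	NULL
};
#undef x

/* my_version_strs[my_version_snapshot] == "snapshot" */
```

Because the string array uses designated initializers indexed by version number, any gaps in the numbering are simply NULL entries.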
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs_format.h | 30 +++++++++++++++++------------- + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/journal_io.c | 2 +- + fs/bcachefs/opts.c | 7 ++++++- + fs/bcachefs/opts.h | 1 + + fs/bcachefs/super-io.c | 8 ++++---- + fs/bcachefs/super.c | 2 +- + 7 files changed, 31 insertions(+), 21 deletions(-) + +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index bb54ac175b69..33823551d63f 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -1312,20 +1312,24 @@ struct bch_sb_field_journal_seq_blacklist { + #define BCH_JSET_VERSION_OLD 2 + #define BCH_BSET_VERSION_OLD 3 + ++#define BCH_METADATA_VERSIONS() \ ++ x(bkey_renumber, 10) \ ++ x(inode_btree_change, 11) \ ++ x(snapshot, 12) \ ++ x(inode_backpointers, 13) \ ++ x(btree_ptr_sectors_written, 14) \ ++ x(snapshot_2, 15) \ ++ x(reflink_p_fix, 16) \ ++ x(subvol_dirent, 17) \ ++ x(inode_v2, 18) \ ++ x(freespace, 19) ++ + enum bcachefs_metadata_version { +- bcachefs_metadata_version_min = 9, +- bcachefs_metadata_version_new_versioning = 10, +- bcachefs_metadata_version_bkey_renumber = 10, +- bcachefs_metadata_version_inode_btree_change = 11, +- bcachefs_metadata_version_snapshot = 12, +- bcachefs_metadata_version_inode_backpointers = 13, +- bcachefs_metadata_version_btree_ptr_sectors_written = 14, +- bcachefs_metadata_version_snapshot_2 = 15, +- bcachefs_metadata_version_reflink_p_fix = 16, +- bcachefs_metadata_version_subvol_dirent = 17, +- bcachefs_metadata_version_inode_v2 = 18, +- bcachefs_metadata_version_freespace = 19, +- bcachefs_metadata_version_max = 20, ++ bcachefs_metadata_version_min = 9, ++#define x(t, n) bcachefs_metadata_version_##t = n, ++ BCH_METADATA_VERSIONS() ++#undef x ++ bcachefs_metadata_version_max + }; + + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 1df454f24b54..a8014003c2b0 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1891,7 +1891,7 @@ do_write: + BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); + BUG_ON(i->seq != b->data->keys.seq); + +- i->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber + ? cpu_to_le16(BCH_BSET_VERSION_OLD) + : cpu_to_le16(c->sb.version); + SET_BSET_OFFSET(i, b->written); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 27996f39c4c3..fca9bc47b889 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1581,7 +1581,7 @@ void bch2_journal_write(struct closure *cl) + BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); + + jset->magic = cpu_to_le64(jset_magic(c)); +- jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber + ? 
cpu_to_le32(BCH_JSET_VERSION_OLD) + : cpu_to_le32(c->sb.version); + +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index e78d3b75f6fb..ce5cb7edcbd3 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -9,7 +9,12 @@ + #include "super-io.h" + #include "util.h" + +-#define x(t, n) #t, ++#define x(t, n) [n] = #t, ++ ++const char * const bch2_metadata_versions[] = { ++ BCH_METADATA_VERSIONS() ++ NULL ++}; + + const char * const bch2_error_actions[] = { + BCH_ERROR_ACTIONS() +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 70b507fb0de2..323730eb7c81 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -8,6 +8,7 @@ + #include + #include "bcachefs_format.h" + ++extern const char * const bch2_metadata_versions[]; + extern const char * const bch2_error_actions[]; + extern const char * const bch2_sb_features[]; + extern const char * const bch2_sb_compat[]; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 95af515a01cd..73cd035661b0 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -258,7 +258,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) + int ret; + + version = le16_to_cpu(sb->version); +- version_min = version >= bcachefs_metadata_version_new_versioning ++ version_min = version >= bcachefs_metadata_version_bkey_renumber + ? le16_to_cpu(sb->version_min) + : version; + +@@ -514,7 +514,7 @@ reread: + } + + version = le16_to_cpu(sb->sb->version); +- version_min = version >= bcachefs_metadata_version_new_versioning ++ version_min = version >= bcachefs_metadata_version_bkey_renumber + ? le16_to_cpu(sb->sb->version_min) + : version; + +@@ -1476,12 +1476,12 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + + pr_buf(out, "Version:"); + pr_tab(out); +- pr_buf(out, "%u", le16_to_cpu(sb->version)); ++ pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); + pr_newline(out); + + pr_buf(out, "Oldest version on disk:"); + pr_tab(out); +- pr_buf(out, "%u", le16_to_cpu(sb->version_min)); ++ pr_buf(out, "%u", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); + pr_newline(out); + + pr_buf(out, "Created:"); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 6464e8c08ebf..4a071711d363 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -862,7 +862,7 @@ static void print_mount_opts(struct bch_fs *c) + if (!p.pos) + pr_buf(&p, "(null)"); + +- bch_info(c, "mounted with opts: %s", p.buf); ++ bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); + printbuf_exit(&p); + } + +-- +cgit v1.2.3 + + +From 676a0305e3839c7fde6449db1212d98ad4ead1f0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 21 Mar 2022 00:15:38 -0400 +Subject: bcachefs: Better superblock opt validation + +This moves validation of superblock options to bch2_sb_validate(), so +they'll be checked in the write path as well. 
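As an illustrative aside (not part of the patch itself): the design change is to have option validation report into a caller-supplied buffer instead of printing directly, so the same check can run both when the superblock is read at mount time and again before it is written back. A minimal sketch of that shape, using hypothetical names:

```
#include <stdio.h>

/* Illustrative sketch only - a range check that reports into a caller buffer. */
struct opt_desc {
	const char		*name;
	unsigned long long	min, max;	/* max == 0 means "no upper bound" */
};

static int opt_validate(const struct opt_desc *o, unsigned long long v,
			char *err, size_t errlen)
{
	if (v < o->min) {
		if (err)
			snprintf(err, errlen, "%s: too small (min %llu)", o->name, o->min);
		return -1;
	}

	if (o->max && v >= o->max) {
		if (err)
			snprintf(err, errlen, "%s: too big (max %llu)", o->name, o->max);
		return -1;
	}

	return 0;
}
```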
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/opts.c | 50 ++++++++++++++++++++++---------------------------- + fs/bcachefs/opts.h | 5 +++-- + fs/bcachefs/super-io.c | 16 ++++++++++++++++ + fs/bcachefs/sysfs.c | 2 +- + fs/bcachefs/xattr.c | 2 +- + 5 files changed, 43 insertions(+), 32 deletions(-) + +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index ce5cb7edcbd3..77fbb7d2194e 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -224,42 +224,43 @@ static int bch2_mount_opt_lookup(const char *name) + return bch2_opt_lookup(name); + } + +-static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v) ++int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) + { + if (v < opt->min) { +- if (msg) +- pr_err("invalid %s%s: too small (min %llu)", +- msg, opt->attr.name, opt->min); ++ if (err) ++ pr_buf(err, "%s: too small (min %llu)", ++ opt->attr.name, opt->min); + return -ERANGE; + } + + if (opt->max && v >= opt->max) { +- if (msg) +- pr_err("invalid %s%s: too big (max %llu)", +- msg, opt->attr.name, opt->max); ++ if (err) ++ pr_buf(err, "%s: too big (max %llu)", ++ opt->attr.name, opt->max); + return -ERANGE; + } + + if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { +- if (msg) +- pr_err("invalid %s %s: not a multiple of 512", +- msg, opt->attr.name); ++ if (err) ++ pr_buf(err, "%s: not a multiple of 512", ++ opt->attr.name); + return -EINVAL; + } + + if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { +- if (msg) +- pr_err("invalid %s%s: must be a power of two", +- msg, opt->attr.name); ++ if (err) ++ pr_buf(err, "%s: must be a power of two", ++ opt->attr.name); + return -EINVAL; + } + + return 0; + } + +-int bch2_opt_parse(struct bch_fs *c, const char *msg, ++int bch2_opt_parse(struct bch_fs *c, + const struct bch_option *opt, +- const char *val, u64 *res) ++ const char *val, u64 *res, ++ struct printbuf *err) + { + ssize_t ret; + +@@ -292,7 +293,7 @@ int bch2_opt_parse(struct bch_fs *c, const char *msg, + return ret; + } + +- return bch2_opt_validate(opt, msg, *res); ++ return bch2_opt_validate(opt, *res, err); + } + + void bch2_opt_to_text(struct printbuf *out, +@@ -372,6 +373,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, + char *copied_opts, *copied_opts_start; + char *opt, *name, *val; + int ret, id; ++ struct printbuf err = PRINTBUF; + u64 v; + + if (!options) +@@ -391,8 +393,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, + if (id < 0) + goto bad_opt; + +- ret = bch2_opt_parse(c, "mount option ", +- &bch2_opt_table[id], val, &v); ++ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret < 0) + goto bad_val; + } else { +@@ -435,7 +436,7 @@ bad_opt: + ret = -1; + goto out; + bad_val: +- pr_err("Invalid value %s for mount option %s", val, name); ++ pr_err("Invalid mount option %s", err.buf); + ret = -1; + goto out; + no_val: +@@ -444,6 +445,7 @@ no_val: + goto out; + out: + kfree(copied_opts_start); ++ printbuf_exit(&err); + return ret; + } + +@@ -470,22 +472,14 @@ u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) + int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) + { + unsigned id; +- int ret; + + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; +- u64 v; + + if (opt->get_sb == BCH2_NO_SB_OPT) + continue; + +- v = bch2_opt_from_sb(sb, id); +- +- ret = bch2_opt_validate(opt, "superblock option ", v); +- if (ret) +- return ret; +- +- bch2_opt_set_by_id(opts, id, v); ++ 
bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); + } + + return 0; +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 323730eb7c81..9e6855ee48e4 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -483,8 +483,9 @@ void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); + void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); + + int bch2_opt_lookup(const char *); +-int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *, +- const char *, u64 *); ++int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); ++int bch2_opt_parse(struct bch_fs *, const struct bch_option *, ++ const char *, u64 *, struct printbuf *); + + #define OPT_SHOW_FULL_LIST (1 << 0) + #define OPT_SHOW_MOUNT_STYLE (1 << 1) +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 73cd035661b0..c917bdda5145 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -253,6 +253,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) + struct bch_sb *sb = disk_sb->sb; + struct bch_sb_field *f; + struct bch_sb_field_members *mi; ++ enum bch_opt_id opt_id; + u32 version, version_min; + u16 block_size; + int ret; +@@ -324,6 +325,21 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) + return -EINVAL; + } + ++ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { ++ const struct bch_option *opt = bch2_opt_table + opt_id; ++ ++ if (opt->get_sb != BCH2_NO_SB_OPT) { ++ u64 v = bch2_opt_from_sb(sb, opt_id); ++ ++ pr_buf(out, "Invalid option "); ++ ret = bch2_opt_validate(opt, v, out); ++ if (ret) ++ return ret; ++ ++ printbuf_reset(out); ++ } ++ } ++ + /* validate layout */ + ret = validate_sb_layout(&sb->layout, out); + if (ret) +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index d018e8bc2677..2594fec4b821 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -626,7 +626,7 @@ STORE(bch2_fs_opts_dir) + goto err; + } + +- ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v); ++ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); + kfree(tmp); + + if (ret < 0) +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 1c680b16b924..8d23b4c2449e 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -525,7 +525,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, + memcpy(buf, value, size); + buf[size] = '\0'; + +- ret = bch2_opt_parse(c, NULL, opt, buf, &v); ++ ret = bch2_opt_parse(c, opt, buf, &v, NULL); + kfree(buf); + + if (ret < 0) +-- +cgit v1.2.3 + + +From 2cfae96e5dc798995e0847d0641c0ebf2cc5382a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 21 Mar 2022 00:27:10 -0400 +Subject: bcachefs: Make minimum journal_flush_delay nonzero + +We're seeing a very strange bug where journal_flush_delay sometimes gets +set to 0 in the superblock. Together with the preceding patch, this +should help us track it down. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/opts.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 9e6855ee48e4..8bc67d07afb9 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -275,7 +275,7 @@ enum opt_type { + NULL, "Extra debugging information during mount/recovery")\ + x(journal_flush_delay, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ +- OPT_UINT(0, U32_MAX), \ ++ OPT_UINT(1, U32_MAX), \ + BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ + NULL, "Delay in milliseconds before automatic journal commits")\ + x(journal_flush_disabled, u8, \ +-- +cgit v1.2.3 + + +From 5a52c520c5a793b49aa3901136f12d062c5536ce Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 20 Mar 2022 20:12:53 -0400 +Subject: bcachefs: Change journal_io.c assertion to error message + +Something funny is going on with the new code for restoring the journal +write point, and it's hard to reproduce. + +We do want to debug this because resuming writing to the journal in the +wrong spot could be something serious. For now, replace the assertion +with an error message and revert to old behaviour when it happens. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_io.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index fca9bc47b889..e61b88930a7f 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -966,8 +966,16 @@ static void bch2_journal_read_device(struct closure *cl) + } + mutex_unlock(&jlist->lock); + +- BUG_ON(ja->bucket_seq[ja->cur_idx] && +- ja->sectors_free == ca->mi.bucket_size); ++ if (ja->bucket_seq[ja->cur_idx] && ++ ja->sectors_free == ca->mi.bucket_size) { ++ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); ++ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); ++ for (i = 0; i < 3; i++) { ++ unsigned idx = ja->cur_idx - 1 + i; ++ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); ++ } ++ ja->sectors_free = 0; ++ } + + /* + * Set dirty_idx to indicate the entire journal is full and needs to be +-- +cgit v1.2.3 + + +From 8d8f1e60159a0c9d99e3fa0494bb761392a428f3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 21 Mar 2022 03:03:03 -0400 +Subject: bcachefs: Reset journal flush delay to default value if zeroed + +We've been seeing a very strange bug where journal flush & reclaim delay +end up getting inexplicably zeroed, in the superblock. We're now +validating all the options in bch2_validate_super(), and 0 is no longer +a valid value for those options, but we need to be careful not to +prevent people's filesystems from mounting because of the new +validation. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 19 ++++++++++++++++--- + 1 file changed, 16 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index c917bdda5145..be61c20b06f3 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -248,7 +248,8 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out + return 0; + } + +-static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) ++static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, ++ int rw) + { + struct bch_sb *sb = disk_sb->sb; + struct bch_sb_field *f; +@@ -325,6 +326,18 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) + return -EINVAL; + } + ++ if (rw == READ) { ++ /* ++ * Been seeing a bug where these are getting inexplicably ++ * zeroed, so we'r now validating them, but we have to be ++ * careful not to preven people's filesystems from mounting: ++ */ ++ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); ++ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); ++ } ++ + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { + const struct bch_option *opt = bch2_opt_table + opt_id; + +@@ -691,7 +704,7 @@ got_super: + ret = 0; + sb->have_layout = true; + +- ret = bch2_sb_validate(sb, &err); ++ ret = bch2_sb_validate(sb, &err, READ); + if (ret) { + printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", + path, err.buf); +@@ -807,7 +820,7 @@ int bch2_write_super(struct bch_fs *c) + for_each_online_member(ca, c, i) { + printbuf_reset(&err); + +- ret = bch2_sb_validate(&ca->disk_sb, &err); ++ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); + if (ret) { + bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); + percpu_ref_put(&ca->io_ref); +-- +cgit v1.2.3 + + +From 25efe76414018d877ce62987fec4d6dddc0ba0d8 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 21 Mar 2022 18:05:39 -0400 +Subject: bcachefs: Add printf format attribute to bch2_pr_buf() + +This tells the compiler to check printf format strings, and catches a +few bugs. 
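As an illustrative aside (not part of the patch itself): the attribute tells the compiler which argument is the format string and where the variadic arguments begin, so type mismatches between the format and its arguments become -Wformat warnings at compile time:

```
#include <stdarg.h>
#include <stdio.h>

/* Format string is argument 2, variadic arguments start at argument 3. */
void my_pr_buf(char *out, const char *fmt, ...)
	__attribute__ ((format (printf, 2, 3)));

void my_pr_buf(char *out, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vsprintf(out, fmt, args);	/* simplified: no bounds checking */
	va_end(args);
}

/*
 * my_pr_buf(buf, "%s", some_struct);  -- now warns: %s expects a string
 * my_pr_buf(buf, "%u", "a string");   -- now warns: %u expects an unsigned int
 */
```

That is how the mismatches fixed in this commit (a printbuf passed where %s expected a string, a string printed with %u) were caught.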
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_io.c | 2 +- + fs/bcachefs/journal.c | 2 +- + fs/bcachefs/journal_sb.c | 4 ++-- + fs/bcachefs/super-io.c | 2 +- + fs/bcachefs/util.h | 3 ++- + 5 files changed, 7 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index a8014003c2b0..4b880ea59cad 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1074,7 +1074,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + + bch2_bkey_val_to_text(&buf, c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, +- "invalid bkey %s: %s", buf, invalid); ++ "invalid bkey %s: %s", buf.buf, invalid); + printbuf_exit(&buf); + + btree_keys_account_key_drop(&b->nr, 0, k); +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 6d91a2c8f6b5..505e8367b5f2 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -1246,7 +1246,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); + pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); +- pr_buf(out, "watermark:\t\t%u\n", bch2_journal_watermarks[j->watermark]); ++ pr_buf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); + pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); + pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); +diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c +index 0a8a0077b6f1..8efe7b7e3dcb 100644 +--- a/fs/bcachefs/journal_sb.c ++++ b/fs/bcachefs/journal_sb.c +@@ -131,13 +131,13 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, + + if (b[0].start < le16_to_cpu(m->first_bucket)) { + pr_buf(err, "journal bucket %llu before first bucket %u", +- b[0], le16_to_cpu(m->first_bucket)); ++ b[0].start, le16_to_cpu(m->first_bucket)); + goto err; + } + + if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { + pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", +- b[nr - 1], le64_to_cpu(m->nbuckets)); ++ b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); + goto err; + } + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index be61c20b06f3..71abf87114df 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -1510,7 +1510,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + + pr_buf(out, "Oldest version on disk:"); + pr_tab(out); +- pr_buf(out, "%u", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); ++ pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); + pr_newline(out); + + pr_buf(out, "Created:"); +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index d6d7f1bc16b8..aa56c94d6bd1 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -282,7 +282,8 @@ static inline size_t printbuf_linelen(struct printbuf *buf) + return buf->pos - buf->last_newline; + } + +-void bch2_pr_buf(struct printbuf *out, const char *fmt, ...); ++void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) ++ __attribute__ ((format (printf, 2, 3))); + + #define pr_buf(_out, ...) 
bch2_pr_buf(_out, __VA_ARGS__) + +-- +cgit v1.2.3 + + +From e9083b26cc0705ac6dd8bde2d532011d886b1e31 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 21 Mar 2022 19:34:48 -0400 +Subject: bcachefs: Fix an unitialized var warning in userspace + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index cf97594b7c6f..36929451af2c 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1059,7 +1059,7 @@ static void __bch2_write(struct closure *cl) + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + struct write_point *wp; +- struct bio *bio; ++ struct bio *bio = NULL; + bool skip_put = true; + unsigned nofs_flags; + int ret; +-- +cgit v1.2.3 + + +From f62f16c9be960e274b4ebc965f7e2d00f9d6aea2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 28 Mar 2022 12:31:22 -0400 +Subject: bcachefs: Heap code fix + +When deleting an entry from a heap that was at entry h->used - 1, we'd +end up calling heap_sift() on an entry outside the heap - the entry we +just removed - which would end up re-adding it to the heap and deleting +something we didn't want to delete. Oops... + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/util.h | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index aa56c94d6bd1..888693703c75 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -210,9 +210,11 @@ do { \ + \ + BUG_ON(_i >= (h)->used); \ + (h)->used--; \ +- heap_swap(h, _i, (h)->used, set_backpointer); \ +- heap_sift_up(h, _i, cmp, set_backpointer); \ +- heap_sift_down(h, _i, cmp, set_backpointer); \ ++ if ((_i) < (h)->used) { \ ++ heap_swap(h, _i, (h)->used, set_backpointer); \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ heap_sift_down(h, _i, cmp, set_backpointer); \ ++ } \ + } while (0) + + #define heap_pop(h, d, cmp, set_backpointer) \ +-- +cgit v1.2.3 + + +From 9fc8394b784abd79f9011d2f24b26f973f111965 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 28 Mar 2022 16:21:26 -0400 +Subject: bcachefs: Work around a journal self-deadlock + +bch2_journal_space_available -> bch2_journal_halt() self deadlocks on +journal lock; work around this by dropping/retaking journal lock before +we call bch2_fatal_error(). + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/journal_reclaim.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index 6f1bad522949..a9f7d5a7feb2 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -216,7 +216,14 @@ void bch2_journal_space_available(struct journal *j) + bch_err(c, "journal stuck\n%s", buf.buf); + printbuf_exit(&buf); + ++ /* ++ * Hack: bch2_fatal_error() calls bch2_journal_halt() which ++ * takes journal lock: ++ */ ++ spin_unlock(&j->lock); + bch2_fatal_error(c); ++ spin_lock(&j->lock); ++ + ret = JOURNAL_ERR_journal_stuck; + } else if (!j->space[journal_space_discarded].next_entry) + ret = JOURNAL_ERR_journal_full; +-- +cgit v1.2.3 + + +From e4bc57ca328cc6ce407e1cb9437a6cbc7f7e8068 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 28 Mar 2022 16:31:26 -0400 +Subject: bcachefs: Fix error path in bch2_snapshot_set_equiv() + +We weren't properly catching errors from snapshot_live() - oops. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/subvolume.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index 69603327d93d..666f1c88a3b6 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -139,7 +139,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans) + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 id = k.k->p.offset, child[2]; +- unsigned nr_live = 0, live_idx; ++ unsigned nr_live = 0, live_idx = 0; + + if (k.k->type != KEY_TYPE_snapshot) + continue; +@@ -151,7 +151,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans) + for (i = 0; i < 2; i++) { + ret = snapshot_live(trans, child[i]); + if (ret < 0) +- break; ++ goto err; + + if (ret) + live_idx = i; +@@ -162,6 +162,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans) + ? snapshot_t(c, child[live_idx])->equiv + : id; + } ++err: + bch2_trans_iter_exit(trans, &iter); + + if (ret) +-- +cgit v1.2.3 + + +From 234965279868a6b5e625f7e9b361bd6e780cbc85 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 28 Mar 2022 18:29:23 -0400 +Subject: bcachefs: Fix for freespace version upgrade path + +It's currently possible to end up in a half-upgraded state where we +haven't set the superblock to the new version, but we have run the +freespace initialization path. Previously, this meant when running fsck +on such a filesystem we wouldn't check the freespace btrees - which is a +problem, if they have been initialized and there's something fsck needs +to check and fix. + +Fix this by making bch2_check_alloc_info() check if freespace has been +initialized on each device, not by making it run conditionally on the +superblock version. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 13 ++++++++++++- + fs/bcachefs/recovery.c | 3 +-- + 2 files changed, 13 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 0c33424393be..81cbfeb58cd1 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -724,12 +724,23 @@ int bch2_check_alloc_info(struct bch_fs *c, bool initial) + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; +- int ret = 0; ++ int ret = 0, last_dev = -1; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->p.inode != last_dev) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ if (!ca->mi.freespace_initialized) { ++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); ++ continue; ++ } ++ ++ last_dev = k.k->p.inode; ++ } ++ + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_alloc_key(&trans, &iter)); + if (ret) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 66492dde7930..68612d52aa83 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1174,8 +1174,7 @@ use_clean: + bch_verbose(c, "done checking allocations"); + } + +- if (c->opts.fsck && +- c->sb.version >= bcachefs_metadata_version_freespace) { ++ if (c->opts.fsck) { + bch_info(c, "checking need_discard and freespace btrees"); + err = "error checking need_discard and freespace btrees"; + ret = bch2_check_alloc_info(c, true); +-- +cgit v1.2.3 + + +From 26c000fd681ba5ed2e20513f12c69743fe21b772 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 29 Mar 2022 15:08:22 -0400 +Subject: bcachefs: Improve bucket_alloc_fail() tracepoint + +This adds counters for each of the reasons we may skip allocating a +bucket - we're seeing a bug where we loop endlessly trying to allocate +when we should have plenty of buckets available, so hopefully this will +help us track down why. 
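As an illustrative aside (not part of the patch itself): the approach is to give every reason a bucket can be rejected its own counter for the duration of one allocation attempt, so the failure tracepoint can report which filter actually consumed the candidates. A minimal sketch of that pattern, with hypothetical names:

```
/* Illustrative sketch only - per-attempt skip counters, one per rejection reason. */
struct alloc_counters {
	unsigned long long	buckets_seen;
	unsigned long long	skipped_open;
	unsigned long long	skipped_need_journal_commit;
	unsigned long long	skipped_nouse;
};

/* Each rejection bumps exactly one counter; the caller traces them on failure. */
static int bucket_usable(int is_open, int needs_journal_commit, int nouse,
			 struct alloc_counters *c)
{
	c->buckets_seen++;

	if (nouse) {
		c->skipped_nouse++;
		return 0;
	}
	if (is_open) {
		c->skipped_open++;
		return 0;
	}
	if (needs_journal_commit) {
		c->skipped_need_journal_commit++;
		return 0;
	}

	return 1;
}
```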
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_foreground.c | 110 ++++++++++++++++++++++++++++------------ + include/trace/events/bcachefs.h | 27 ++++++++-- + 2 files changed, 100 insertions(+), 37 deletions(-) + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 5b1149365389..538b597d845c 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -192,20 +192,26 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) + static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + struct bkey_alloc_unpacked a, +- size_t *need_journal_commit, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, + struct closure *cl) + { + struct open_bucket *ob; + +- if (unlikely(ca->buckets_nouse && test_bit(a.bucket, ca->buckets_nouse))) ++ if (unlikely(ca->buckets_nouse && test_bit(a.bucket, ca->buckets_nouse))) { ++ (*skipped_nouse)++; + return NULL; ++ } + +- if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) ++ if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) { ++ (*skipped_open)++; + return NULL; ++ } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, ca->dev_idx, a.bucket)) { +- (*need_journal_commit)++; ++ (*skipped_need_journal_commit)++; + return NULL; + } + +@@ -227,6 +233,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * + /* Recheck under lock: */ + if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) { + spin_unlock(&c->freelist_lock); ++ (*skipped_open)++; + return NULL; + } + +@@ -267,7 +274,9 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * + + static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, + enum alloc_reserve reserve, u64 free_entry, +- size_t *need_journal_commit, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, + struct closure *cl) + { + struct bch_fs *c = trans->c; +@@ -317,7 +326,11 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc + goto err; + } + +- ob = __try_alloc_bucket(c, ca, reserve, a, need_journal_commit, cl); ++ ob = __try_alloc_bucket(c, ca, reserve, a, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); + err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); +@@ -360,8 +373,11 @@ static noinline struct open_bucket * + bch2_bucket_alloc_trans_early(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, +- u64 *b, +- size_t *need_journal_commit, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, + struct closure *cl) + { + struct btree_iter iter; +@@ -369,10 +385,10 @@ bch2_bucket_alloc_trans_early(struct btree_trans *trans, + struct open_bucket *ob = NULL; + int ret; + +- *b = max_t(u64, *b, ca->mi.first_bucket); +- *b = max_t(u64, *b, ca->new_fs_bucket_idx); ++ *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket); ++ *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx); + +- for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *b), ++ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), + BTREE_ITER_SLOTS, k, ret) { + struct bkey_alloc_unpacked a; + +@@ -388,14 +404,19 @@ bch2_bucket_alloc_trans_early(struct btree_trans *trans, + if (bucket_state(a) != BUCKET_free) + continue; + ++ 
(*buckets_seen)++; ++ + ob = __try_alloc_bucket(trans->c, ca, reserve, a, +- need_journal_commit, cl); ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); + if (ob) + break; + } + bch2_trans_iter_exit(trans, &iter); + +- *b = iter.pos.offset; ++ *cur_bucket = iter.pos.offset; + + return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); + } +@@ -403,8 +424,11 @@ bch2_bucket_alloc_trans_early(struct btree_trans *trans, + static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, +- u64 *b, +- size_t *need_journal_commit, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, + struct closure *cl) + { + struct btree_iter iter; +@@ -413,26 +437,37 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + int ret; + + if (unlikely(!ca->mi.freespace_initialized)) +- return bch2_bucket_alloc_trans_early(trans, ca, reserve, b, +- need_journal_commit, cl); ++ return bch2_bucket_alloc_trans_early(trans, ca, reserve, ++ cur_bucket, ++ buckets_seen, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); + + BUG_ON(ca->new_fs_bucket_idx); + + for_each_btree_key(trans, iter, BTREE_ID_freespace, +- POS(ca->dev_idx, *b), 0, k, ret) { ++ POS(ca->dev_idx, *cur_bucket), 0, k, ret) { + if (k.k->p.inode != ca->dev_idx) + break; + +- for (*b = max(*b, bkey_start_offset(k.k)); +- *b != k.k->p.offset && !ob; +- (*b)++) { ++ for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); ++ *cur_bucket != k.k->p.offset && !ob; ++ (*cur_bucket)++) { + if (btree_trans_too_many_iters(trans)) { + ob = ERR_PTR(-EINTR); + break; + } + +- ob = try_alloc_bucket(trans, ca, reserve, *b, +- need_journal_commit, cl); ++ (*buckets_seen)++; ++ ++ ob = try_alloc_bucket(trans, ca, reserve, ++ *cur_bucket, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); + } + if (ob) + break; +@@ -453,9 +488,12 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + struct closure *cl) + { + struct open_bucket *ob = NULL; +- size_t need_journal_commit = 0; + u64 avail = dev_buckets_available(ca, reserve); +- u64 b = 0; ++ u64 cur_bucket = 0; ++ u64 buckets_seen = 0; ++ u64 skipped_open = 0; ++ u64 skipped_need_journal_commit = 0; ++ u64 skipped_nouse = 0; + int ret; + + if (may_alloc_partial) { +@@ -483,19 +521,27 @@ again: + } + + ret = bch2_trans_do(c, NULL, NULL, 0, +- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, +- ca, reserve, &b, +- &need_journal_commit, cl))); +- +- if (need_journal_commit * 2 > avail) ++ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl))); ++ ++ if (skipped_need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); + err: + if (!ob) + ob = ERR_PTR(ret ?: -FREELIST_EMPTY); + +- if (ob == ERR_PTR(-FREELIST_EMPTY)) { ++ if (IS_ERR(ob)) { + trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail, +- need_journal_commit, cl == NULL); ++ buckets_seen, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl == NULL, PTR_ERR(ob)); + atomic_long_inc(&c->bucket_alloc_fail); + } + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index 08de7e617247..ac2aecd47c5e 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -493,32 +493,49 @@ 
DEFINE_EVENT(bucket_alloc, bucket_alloc, + + TRACE_EVENT(bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, +- u64 avail, u64 need_journal_commit, +- bool nonblocking), +- TP_ARGS(ca, alloc_reserve, avail, need_journal_commit, nonblocking), ++ u64 avail, ++ u64 seen, ++ u64 open, ++ u64 need_journal_commit, ++ u64 nouse, ++ bool nonblocking, ++ int ret), ++ TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, reserve, 16 ) + __field(u64, avail ) ++ __field(u64, seen ) ++ __field(u64, open ) + __field(u64, need_journal_commit ) ++ __field(u64, nouse ) + __field(bool, nonblocking ) ++ __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = ca->dev; + strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->avail = avail; ++ __entry->seen = seen; ++ __entry->open = open; + __entry->need_journal_commit = need_journal_commit; ++ __entry->nouse = nouse; + __entry->nonblocking = nonblocking; ++ __entry->ret = ret; + ), + +- TP_printk("%d,%d reserve %s avail %llu need_journal_commit %llu nonblocking %u", ++ TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve, + __entry->avail, ++ __entry->seen, ++ __entry->open, + __entry->need_journal_commit, +- __entry->nonblocking) ++ __entry->nouse, ++ __entry->nonblocking, ++ __entry->ret) + ); + + DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, +-- +cgit v1.2.3 + + +From f0c681af7c027452bf2218998be6c1c1e489d534 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 30 Mar 2022 00:06:03 -0400 +Subject: bcachefs: Fix commit flags in discard & invalidate paths + +Since the bucket invalidate and discard paths are required for other +allocations to make forward progress, they at a minimum need +BTREE_INSERT_USE_RESERVE. Watermarks may need further work. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 81cbfeb58cd1..07a76993318f 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -862,7 +862,9 @@ static void bch2_do_discards_work(struct work_struct *work) + bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) + continue; + +- ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_NOFAIL, + bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); + if (ret) + break; +@@ -954,6 +956,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) + for_each_member_device(ca, c, i) + while (!ret && should_invalidate_buckets(ca)) + ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOFAIL, + invalidate_one_bucket(&trans, ca)); + +-- +cgit v1.2.3 + + +From aa06b77da80be8fe96c0ff9386a0fdfb8bc32e0f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 30 Mar 2022 13:10:03 -0400 +Subject: bcachefs: Add a missing btree_path_set_dirty() calls + +bch2_btree_iter_next_node() was mucking with other btree_path state +without setting path->update to be consistent with the fact that the +path is very much no longer uptodate - oops. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 ++++ + fs/bcachefs/btree_update_interior.c | 1 + + 2 files changed, 5 insertions(+) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 56c493c95d3a..9abf73b6b5b7 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1720,6 +1720,7 @@ bch2_btree_path_set_pos(struct btree_trans *trans, + l = btree_path_up_until_good_node(trans, path, cmp); + + if (btree_path_node(path, l)) { ++ BUG_ON(!btree_node_locked(path, l)); + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too +@@ -1923,6 +1924,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, + + BUG_ON(trans->restarted); + btree_trans_verify_sorted(trans); ++ bch2_trans_verify_locks(trans); + + trans_for_each_path_inorder(trans, path, i) { + if (__btree_path_cmp(path, +@@ -2114,6 +2116,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + return NULL; + } + +@@ -2121,6 +2124,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + __bch2_btree_path_unlock(path); + path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, + path->btree_id, &path->pos); + btree_trans_restart(trans); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index c2232f8185c5..74272bb3fc69 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1933,6 +1933,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + btree_node_unlock(iter2.path, iter2.path->level); + path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; + iter2.path->level++; ++ btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); + + bch2_btree_path_check_sort(trans, iter2.path, 0); + +-- +cgit v1.2.3 + + +From 2e5abacbe76d8db68155eaa2347b7cd390fdbe1d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 30 Mar 2022 13:47:07 -0400 +Subject: bcachefs: btree_path_make_mut() clears should_be_locked + +This fixes a bug where __bch2_btree_node_update_key() wasn't clearing +should_be_locked, leading to bch2_btree_path_traverse() always failing - +all callers of btree_path_make_mut() want should_be_locked cleared, so +do it there. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 9abf73b6b5b7..8c38b58050a5 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -1686,6 +1686,7 @@ bch2_btree_path_make_mut(struct btree_trans *trans, + btree_trans_verify_sorted(trans); + } + ++ path->should_be_locked = false; + return path; + } + +@@ -1705,8 +1706,7 @@ bch2_btree_path_set_pos(struct btree_trans *trans, + + path = bch2_btree_path_make_mut(trans, path, intent, ip); + +- path->pos = new_pos; +- path->should_be_locked = false; ++ path->pos = new_pos; + + bch2_btree_path_check_sort(trans, path, cmp); + +-- +cgit v1.2.3 + + +From 938d1d65fb8e52f39c68cb646cb9b755a5cfbca0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 29 Mar 2022 15:48:45 -0400 +Subject: bcachefs: darrays + +Inspired by CCAN darray - simple, stupid resizable (dynamic) arrays. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 2 +- + fs/bcachefs/darray.h | 76 +++++++++++++++++++++ + fs/bcachefs/fs.c | 2 +- + fs/bcachefs/fs.h | 4 +- + fs/bcachefs/fsck.c | 153 ++++++++++++++++-------------------------- + fs/bcachefs/move.c | 8 +-- + fs/bcachefs/subvolume.c | 41 ++++------- + fs/bcachefs/subvolume.h | 38 ++++------- + fs/bcachefs/subvolume_types.h | 8 +-- + 9 files changed, 169 insertions(+), 163 deletions(-) + create mode 100644 fs/bcachefs/darray.h + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index a4ef9aabf274..9877037fc195 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -649,7 +649,7 @@ struct bch_fs { + struct mutex snapshot_table_lock; + struct work_struct snapshot_delete_work; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; +- struct snapshot_id_list snapshots_unlinked; ++ snapshot_id_list snapshots_unlinked; + struct mutex snapshots_unlinked_lock; + + /* BTREE CACHE */ +diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h +new file mode 100644 +index 000000000000..745b1cdb0d17 +--- /dev/null ++++ b/fs/bcachefs/darray.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DARRAY_H ++#define _BCACHEFS_DARRAY_H ++ ++/* ++ * Dynamic arrays: ++ * ++ * Inspired by CCAN's darray ++ */ ++ ++#include "util.h" ++#include ++ ++#define DARRAY(type) \ ++struct { \ ++ size_t nr, size; \ ++ type *data; \ ++} ++ ++typedef DARRAY(void) darray_void; ++ ++static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) ++{ ++ if (d->nr + more > d->size) { ++ size_t new_size = roundup_pow_of_two(d->nr + more); ++ void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); ++ ++ if (!data) ++ return -ENOMEM; ++ ++ d->data = data; ++ d->size = new_size; ++ } ++ ++ return 0; ++} ++ ++#define darray_make_room(_d, _more) \ ++ __darray_make_room((darray_void *) &(_d), sizeof((_d).data[0]), (_more)) ++ ++#define darray_top(_d) ((_d).data[(_d).nr]) ++ ++#define darray_push(_d, _item) \ ++({ \ ++ int _ret = darray_make_room((_d), 1); \ ++ \ ++ if (!_ret) \ ++ (_d).data[(_d).nr++] = (_item); \ ++ _ret; \ ++}) ++ ++#define darray_insert_item(_d, _pos, _item) \ ++({ \ ++ int _ret = darray_make_room((_d), 1); \ ++ \ ++ if (!_ret) \ ++ array_insert_item((_d).data, (_d).nr, (_pos), (_item)); \ ++ _ret; \ ++}) ++ ++#define darray_for_each(_d, _i) \ ++ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) ++ ++#define darray_init(_d) \ ++do { \ ++ (_d).data = NULL; \ ++ (_d).nr = 
(_d).size = 0; \ ++} while (0) ++ ++#define darray_exit(_d) \ ++do { \ ++ kfree((_d).data); \ ++ darray_init(_d); \ ++} while (0) ++ ++#endif /* _BCACHEFS_DARRAY_H */ +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 9fc6c39eacdb..d462c06899d6 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -1477,7 +1477,7 @@ static void bch2_evict_inode(struct inode *vinode) + } + + void bch2_evict_subvolume_inodes(struct bch_fs *c, +- struct snapshot_id_list *s) ++ snapshot_id_list *s) + { + struct super_block *sb = c->vfs_sb; + struct inode *inode; +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index b2211ec7f302..9f4b57e30e2a 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -191,7 +191,7 @@ int bch2_setattr_nonsize(struct user_namespace *, + struct iattr *); + int __bch2_unlink(struct inode *, struct dentry *, bool); + +-void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *); ++void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); + + void bch2_vfs_exit(void); + int bch2_vfs_init(void); +@@ -199,7 +199,7 @@ int bch2_vfs_init(void); + #else + + static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, +- struct snapshot_id_list *s) {} ++ snapshot_id_list *s) {} + static inline void bch2_vfs_exit(void) {} + static inline int bch2_vfs_init(void) { return 0; } + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 8783b950055e..2582ddf14803 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -3,6 +3,7 @@ + #include "bcachefs.h" + #include "bkey_buf.h" + #include "btree_update.h" ++#include "darray.h" + #include "dirent.h" + #include "error.h" + #include "fs-common.h" +@@ -471,11 +472,11 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, str + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + + if (bkey_cmp(s->pos, pos)) +- s->nr = 0; ++ s->ids.nr = 0; + s->pos = pos; + + /* Might get called multiple times due to lock restarts */ +- if (s->nr && s->d[s->nr - 1] == pos.snapshot) ++ if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot) + return 0; + + return snapshots_seen_add(c, s, pos.snapshot); +@@ -498,7 +499,7 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see + ancestor = snapshot_t(c, ancestor)->equiv; + + /* @ancestor should be the snapshot most recently added to @seen */ +- BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); ++ BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor); + BUG_ON(seen->pos.snapshot != ancestor); + + if (id == ancestor) +@@ -507,11 +508,11 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see + if (!bch2_snapshot_is_ancestor(c, id, ancestor)) + return false; + +- for (i = seen->nr - 2; +- i >= 0 && seen->d[i] >= id; ++ for (i = seen->ids.nr - 2; ++ i >= 0 && seen->ids.data[i] >= id; + --i) +- if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && +- bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) ++ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) && ++ bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor)) + return false; + + return true; +@@ -537,26 +538,25 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, + } + + #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ +- for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ ++ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, 
_snapshot)) + ++struct inode_walker_entry { ++ struct bch_inode_unpacked inode; ++ u32 snapshot; ++ u64 count; ++}; ++ + struct inode_walker { + bool first_this_inode; + u64 cur_inum; + +- size_t nr; +- size_t size; +- struct inode_walker_entry { +- struct bch_inode_unpacked inode; +- u32 snapshot; +- u64 count; +- } *d; ++ DARRAY(struct inode_walker_entry) inodes; + }; + + static void inode_walker_exit(struct inode_walker *w) + { +- kfree(w->d); +- w->d = NULL; ++ darray_exit(w->inodes); + } + + static struct inode_walker inode_walker_init(void) +@@ -564,43 +564,17 @@ static struct inode_walker inode_walker_init(void) + return (struct inode_walker) { 0, }; + } + +-static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w) +-{ +- if (w->nr == w->size) { +- size_t new_size = max_t(size_t, 8UL, w->size * 2); +- void *d = krealloc(w->d, new_size * sizeof(w->d[0]), +- GFP_KERNEL); +- if (!d) { +- bch_err(c, "fsck: error allocating memory for inode_walker, size %zu", +- new_size); +- return -ENOMEM; +- } +- +- w->d = d; +- w->size = new_size; +- } +- +- return 0; +-} +- + static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bkey_s_c inode) + { + struct bch_inode_unpacked u; +- int ret; +- +- ret = inode_walker_realloc(c, w); +- if (ret) +- return ret; + + BUG_ON(bch2_inode_unpack(inode, &u)); + +- w->d[w->nr++] = (struct inode_walker_entry) { ++ return darray_push(w->inodes, ((struct inode_walker_entry) { + .inode = u, + .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, +- }; +- +- return 0; ++ })); + } + + static int __walk_inode(struct btree_trans *trans, +@@ -619,7 +593,7 @@ static int __walk_inode(struct btree_trans *trans, + goto lookup_snapshot; + } + +- w->nr = 0; ++ w->inodes.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { +@@ -637,26 +611,25 @@ static int __walk_inode(struct btree_trans *trans, + w->cur_inum = pos.inode; + w->first_this_inode = true; + lookup_snapshot: +- for (i = 0; i < w->nr; i++) +- if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) ++ for (i = 0; i < w->inodes.nr; i++) ++ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) + goto found; + return INT_MAX; + found: +- BUG_ON(pos.snapshot > w->d[i].snapshot); ++ BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); + +- if (pos.snapshot != w->d[i].snapshot) { ++ if (pos.snapshot != w->inodes.data[i].snapshot) { + ancestor_pos = i; + +- while (i && w->d[i - 1].snapshot > pos.snapshot) ++ while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) + --i; + +- ret = inode_walker_realloc(c, w); ++ ret = darray_insert_item(w->inodes, i, w->inodes.data[ancestor_pos]); + if (ret) + return ret; + +- array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); +- w->d[i].snapshot = pos.snapshot; +- w->d[i].count = 0; ++ w->inodes.data[i].snapshot = pos.snapshot; ++ w->inodes.data[i].count = 0; + } + + return i; +@@ -672,7 +645,7 @@ static int __get_visible_inodes(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- w->nr = 0; ++ w->inodes.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { +@@ -1133,7 +1106,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) + int ret = 0, ret2 = 0; + s64 count2; + +- for (i = w->d; i < w->d + w->nr; i++) { ++ darray_for_each(w->inodes, i) { + if (i->inode.bi_sectors == i->count) + continue; + +@@ -1232,7 +1205,7 @@ static int check_extent(struct 
btree_trans *trans, struct btree_iter *iter, + goto out; + } + +- i = inode->d + ret; ++ i = inode->inodes.data + ret; + ret = 0; + + if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && +@@ -1333,7 +1306,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) + int ret = 0, ret2 = 0; + s64 count2; + +- for (i = w->d; i < w->d + w->nr; i++) { ++ darray_for_each(w->inodes, i) { + if (i->inode.bi_nlink == i->count) + continue; + +@@ -1537,7 +1510,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + goto out; + } + +- i = dir->d + ret; ++ i = dir->inodes.data + ret; + ret = 0; + + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, +@@ -1550,7 +1523,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + } + + if (dir->first_this_inode) +- *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); ++ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); + + ret = hash_check_key(trans, bch2_dirent_hash_desc, + hash_info, iter, k); +@@ -1618,7 +1591,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + if (ret) + goto err; + +- if (fsck_err_on(!target->nr, c, ++ if (fsck_err_on(!target->inodes.nr, c, + "dirent points to missing inode:\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), +@@ -1628,7 +1601,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + goto err; + } + +- for (i = target->d; i < target->d + target->nr; i++) { ++ darray_for_each(target->inodes, i) { + ret = check_dirent_target(trans, iter, d, + &i->inode, i->snapshot); + if (ret) +@@ -1726,7 +1699,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + ret = 0; + + if (inode->first_this_inode) +- *hash_info = bch2_hash_info_init(c, &inode->d[0].inode); ++ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); + + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); + fsck_err: +@@ -1836,21 +1809,18 @@ static int check_root(struct bch_fs *c) + check_root_trans(&trans)); + } + +-struct pathbuf { +- size_t nr; +- size_t size; +- +- struct pathbuf_entry { +- u64 inum; +- u32 snapshot; +- } *entries; ++struct pathbuf_entry { ++ u64 inum; ++ u32 snapshot; + }; + +-static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) ++typedef DARRAY(struct pathbuf_entry) pathbuf; ++ ++static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) + { + struct pathbuf_entry *i; + +- for (i = p->entries; i < p->entries + p->nr; i++) ++ darray_for_each(*p, i) + if (i->inum == inum && + i->snapshot == snapshot) + return true; +@@ -1858,29 +1828,18 @@ static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) + return false; + } + +-static int path_down(struct bch_fs *c, struct pathbuf *p, ++static int path_down(struct bch_fs *c, pathbuf *p, + u64 inum, u32 snapshot) + { +- if (p->nr == p->size) { +- size_t new_size = max_t(size_t, 256UL, p->size * 2); +- void *n = krealloc(p->entries, +- new_size * sizeof(p->entries[0]), +- GFP_KERNEL); +- if (!n) { +- bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", +- new_size); +- return -ENOMEM; +- } +- +- p->entries = n; +- p->size = new_size; +- }; +- +- p->entries[p->nr++] = (struct pathbuf_entry) { ++ int ret = darray_push(*p, ((struct pathbuf_entry) { + .inum = inum, + .snapshot = snapshot, +- }; +- return 0; ++ })); ++ ++ if (ret) ++ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", ++ p->size); ++ return ret; + } + + /* +@@ -1889,7 +1848,7 @@ static 
int path_down(struct bch_fs *c, struct pathbuf *p, + * XXX: we should also be verifying that inodes are in the right subvolumes + */ + static int check_path(struct btree_trans *trans, +- struct pathbuf *p, ++ pathbuf *p, + struct bch_inode_unpacked *inode, + u32 snapshot) + { +@@ -1963,7 +1922,7 @@ static int check_path(struct btree_trans *trans, + /* XXX print path */ + bch_err(c, "directory structure loop"); + +- for (i = p->entries; i < p->entries + p->nr; i++) ++ darray_for_each(*p, i) + pr_err("%llu:%u", i->inum, i->snapshot); + pr_err("%llu:%u", inode->bi_inum, snapshot); + +@@ -2000,7 +1959,7 @@ static int check_directory_structure(struct bch_fs *c) + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked u; +- struct pathbuf path = { 0, 0, NULL }; ++ pathbuf path = { 0, }; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); +@@ -2030,7 +1989,7 @@ static int check_directory_structure(struct bch_fs *c) + + BUG_ON(ret == -EINTR); + +- kfree(path.entries); ++ darray_exit(path); + + bch2_trans_exit(&trans); + return ret; +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 8eb49381b030..1de213506adf 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -92,10 +92,10 @@ next: + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i *update; +- size_t i; ++ u32 *i; + +- for (i = 0; i < s.nr; i++) +- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) ++ darray_for_each(s.ids, i) ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) + goto next; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); +@@ -125,7 +125,7 @@ next: + } + } + bch2_trans_iter_exit(trans, &iter); +- kfree(s.d); ++ darray_exit(s.ids); + + return ret; + } +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index 666f1c88a3b6..cdb89ba216cc 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -545,36 +545,21 @@ err: + return ret; + } + +-static int snapshot_id_add(struct snapshot_id_list *s, u32 id) ++static int snapshot_id_add(snapshot_id_list *s, u32 id) + { + BUG_ON(snapshot_list_has_id(s, id)); + +- if (s->nr == s->size) { +- size_t new_size = max(8U, s->size * 2); +- void *n = krealloc(s->d, +- new_size * sizeof(s->d[0]), +- GFP_KERNEL); +- if (!n) { +- pr_err("error allocating snapshot ID list"); +- return -ENOMEM; +- } +- +- s->d = n; +- s->size = new_size; +- }; +- +- s->d[s->nr++] = id; +- return 0; ++ return darray_push(*s, id); + } + + static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, +- struct snapshot_id_list *deleted, ++ snapshot_id_list *deleted, + enum btree_id btree_id) + { + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; +- struct snapshot_id_list equiv_seen = { 0 }; ++ snapshot_id_list equiv_seen = { 0 }; + struct bpos last_pos = POS_MIN; + int ret = 0; + +@@ -621,7 +606,7 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, + } + bch2_trans_iter_exit(trans, &iter); + +- kfree(equiv_seen.d); ++ darray_exit(equiv_seen); + + return ret; + } +@@ -633,7 +618,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; +- struct snapshot_id_list deleted = { 0 }; ++ snapshot_id_list deleted = { 0 }; + u32 i, id, children[2]; + int ret = 0; + +@@ -713,15 +698,15 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) + + for (i = 0; i < deleted.nr; i++) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, 
+- bch2_snapshot_node_delete(&trans, deleted.d[i])); ++ bch2_snapshot_node_delete(&trans, deleted.data[i])); + if (ret) { + bch_err(c, "error deleting snapshot %u: %i", +- deleted.d[i], ret); ++ deleted.data[i], ret); + goto err; + } + } + err: +- kfree(deleted.d); ++ darray_exit(deleted); + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); + } +@@ -876,14 +861,14 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) + { + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); +- struct snapshot_id_list s; ++ snapshot_id_list s; + u32 *id; + int ret = 0; + + while (!ret) { + mutex_lock(&c->snapshots_unlinked_lock); + s = c->snapshots_unlinked; +- memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); ++ darray_init(c->snapshots_unlinked); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (!s.nr) +@@ -891,7 +876,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) + + bch2_evict_subvolume_inodes(c, &s); + +- for (id = s.d; id < s.d + s.nr; id++) { ++ for (id = s.data; id < s.data + s.nr; id++) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_delete(&trans, *id)); + if (ret) { +@@ -900,7 +885,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) + } + } + +- kfree(s.d); ++ darray_exit(s); + } + + percpu_ref_put(&c->writes); +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index 4abe53df2788..f609291acafa 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -2,6 +2,7 @@ + #ifndef _BCACHEFS_SUBVOLUME_H + #define _BCACHEFS_SUBVOLUME_H + ++#include "darray.h" + #include "subvolume_types.h" + + void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +@@ -58,15 +59,13 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances + + struct snapshots_seen { + struct bpos pos; +- size_t nr; +- size_t size; +- u32 *d; ++ DARRAY(u32) ids; + }; + + static inline void snapshots_seen_exit(struct snapshots_seen *s) + { +- kfree(s->d); +- s->d = NULL; ++ kfree(s->ids.data); ++ s->ids.data = NULL; + } + + static inline void snapshots_seen_init(struct snapshots_seen *s) +@@ -76,30 +75,19 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) + + static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) + { +- if (s->nr == s->size) { +- size_t new_size = max(s->size, (size_t) 128) * 2; +- u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); +- +- if (!d) { +- bch_err(c, "error reallocating snapshots_seen table (new size %zu)", +- new_size); +- return -ENOMEM; +- } +- +- s->size = new_size; +- s->d = d; +- } +- +- s->d[s->nr++] = id; +- return 0; ++ int ret = darray_push(s->ids, id); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; + } + +-static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) ++static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) + { +- unsigned i; ++ u32 *i; + +- for (i = 0; i < s->nr; i++) +- if (id == s->d[i]) ++ darray_for_each(*s, i) ++ if (*i == id) + return true; + return false; + } +diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h +index 9410b9587591..f7562b5d51df 100644 +--- a/fs/bcachefs/subvolume_types.h ++++ b/fs/bcachefs/subvolume_types.h +@@ -2,10 +2,8 @@ + #ifndef _BCACHEFS_SUBVOLUME_TYPES_H + #define _BCACHEFS_SUBVOLUME_TYPES_H + +-struct 
snapshot_id_list { +- u32 nr; +- u32 size; +- u32 *d; +-}; ++#include "darray.h" ++ ++typedef DARRAY(u32) snapshot_id_list; + + #endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ +-- +cgit v1.2.3 + + +From 3b449cbdeeb09d7b1cadbdbf7fca97958d19e5d2 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 29 Mar 2022 16:29:10 -0400 +Subject: bcachefs: Use darray for extra_journal_entries + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_iter.c | 5 +++-- + fs/bcachefs/btree_types.h | 4 ++-- + fs/bcachefs/btree_update_interior.c | 28 +++++++++++++++++++--------- + fs/bcachefs/btree_update_leaf.c | 17 ++++++++--------- + 4 files changed, 32 insertions(+), 22 deletions(-) + +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index 8c38b58050a5..25d254ee9eac 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -3059,8 +3059,7 @@ void bch2_trans_begin(struct btree_trans *trans) + trans->mem_top = 0; + + trans->hooks = NULL; +- trans->extra_journal_entries = NULL; +- trans->extra_journal_entry_u64s = 0; ++ trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; +@@ -3193,6 +3192,8 @@ void bch2_trans_exit(struct btree_trans *trans) + + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + ++ kfree(trans->extra_journal_entries.data); ++ + if (trans->fs_usage_deltas) { + if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == + REPLICAS_DELTA_LIST_MAX) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 993f04f52149..b86a721f90ac 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -8,6 +8,7 @@ + + #include "bkey_methods.h" + #include "buckets_types.h" ++#include "darray.h" + #include "journal_types.h" + + struct open_bucket; +@@ -417,8 +418,7 @@ struct btree_trans { + + /* update path: */ + struct btree_trans_commit_hook *hooks; +- struct jset_entry *extra_journal_entries; +- unsigned extra_journal_entry_u64s; ++ DARRAY(u64) extra_journal_entries; + struct journal_entry_pin *journal_pin; + + struct journal_res journal_res; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 74272bb3fc69..42ae3b0c5839 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -532,8 +532,15 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, + struct bkey_i *k; + int ret; + +- trans->extra_journal_entries = (void *) &as->journal_entries[0]; +- trans->extra_journal_entry_u64s = as->journal_u64s; ++ ret = darray_make_room(trans->extra_journal_entries, as->journal_u64s); ++ if (ret) ++ return ret; ++ ++ memcpy(&darray_top(trans->extra_journal_entries), ++ as->journal_entries, ++ as->journal_u64s * sizeof(u64)); ++ trans->extra_journal_entries.nr += as->journal_u64s; ++ + trans->journal_pin = &as->journal; + + for_each_keylist_key(&as->new_keys, k) { +@@ -1899,7 +1906,6 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_iter iter2 = { NULL }; + struct btree *parent; +- u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; + int ret; + + if (!skip_triggers) { +@@ -1944,12 +1950,16 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + } else { + BUG_ON(btree_node_root(c, b) != b); + +- trans->extra_journal_entries = (void *) &journal_entries[0]; +- trans->extra_journal_entry_u64s = +- journal_entry_set((void *) &journal_entries[0], +- BCH_JSET_ENTRY_btree_root, +- b->c.btree_id, b->c.level, +- 
new_key, new_key->k.u64s); ++ ret = darray_make_room(trans->extra_journal_entries, ++ jset_u64s(new_key->k.u64s)); ++ if (ret) ++ return ret; ++ ++ journal_entry_set((void *) &darray_top(trans->extra_journal_entries), ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ new_key, new_key->k.u64s); ++ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); + } + + ret = bch2_trans_commit(trans, NULL, NULL, +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 8d185c7c10ef..1148278c35d0 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -700,13 +700,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, + trans->journal_res.seq = c->journal.replay_journal_seq; + } + +- if (unlikely(trans->extra_journal_entry_u64s)) { ++ if (unlikely(trans->extra_journal_entries.nr)) { + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), +- trans->extra_journal_entries, +- trans->extra_journal_entry_u64s); ++ trans->extra_journal_entries.data, ++ trans->extra_journal_entries.nr); + +- trans->journal_res.offset += trans->extra_journal_entry_u64s; +- trans->journal_res.u64s -= trans->extra_journal_entry_u64s; ++ trans->journal_res.offset += trans->extra_journal_entries.nr; ++ trans->journal_res.u64s -= trans->extra_journal_entries.nr; + } + + /* +@@ -1088,7 +1088,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + int ret = 0; + + if (!trans->nr_updates && +- !trans->extra_journal_entry_u64s) ++ !trans->extra_journal_entries.nr) + goto out_reset; + + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) +@@ -1112,7 +1112,7 @@ int __bch2_trans_commit(struct btree_trans *trans) + + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + +- trans->journal_u64s = trans->extra_journal_entry_u64s; ++ trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_preres_u64s = 0; + + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); +@@ -1170,8 +1170,7 @@ out_reset: + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->hooks = NULL; +- trans->extra_journal_entries = NULL; +- trans->extra_journal_entry_u64s = 0; ++ trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; +-- +cgit v1.2.3 + + +From dd22e8bcf901d42eded188e0eb1c59aee7a5b08f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 30 Mar 2022 15:44:12 -0400 +Subject: bcachefs: bch2_trans_log_msg() + +Add a new helper for logging messages to the journal - a new debugging +tool, an alternative to trace_printk(). 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update.h | 2 ++ + fs/bcachefs/btree_update_leaf.c | 27 +++++++++++++++++++++++++++ + 2 files changed, 29 insertions(+) + +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index ca142f955193..ad13b0739a68 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -80,6 +80,8 @@ void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); + int __bch2_trans_commit(struct btree_trans *); + ++int bch2_trans_log_msg(struct btree_trans *, const char *); ++ + /** + * bch2_trans_commit - insert keys at given iterator positions + * +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 1148278c35d0..6f4ee55ebffe 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1738,3 +1738,30 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + bch2_btree_delete_range_trans(&trans, id, start, end, + update_flags, journal_seq)); + } ++ ++int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) ++{ ++ unsigned len = strlen(msg); ++ unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); ++ struct jset_entry_log *l; ++ int ret; ++ ++ ret = darray_make_room(trans->extra_journal_entries, jset_u64s(u64s)); ++ if (ret) ++ return ret; ++ ++ l = (void *) &darray_top(trans->extra_journal_entries); ++ l->entry.u64s = cpu_to_le16(u64s); ++ l->entry.btree_id = 0; ++ l->entry.level = 1; ++ l->entry.type = BCH_JSET_ENTRY_log; ++ l->entry.pad[0] = 0; ++ l->entry.pad[1] = 0; ++ l->entry.pad[2] = 0; ++ memcpy(l->d, msg, len); ++ while (len & 7) ++ l->d[len++] = '\0'; ++ ++ trans->extra_journal_entries.nr += jset_u64s(u64s); ++ return 0; ++} +-- +cgit v1.2.3 + + +From 5f53295f84becc93c5c2878c8e52e736eb8b7215 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 30 Mar 2022 23:40:19 -0400 +Subject: bcachefs: Improve bch2_bkey_ptrs_to_text() + +Print bucket:offset when the filesystem is online; this makes debugging +easier when correlating with alloc updates. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/extents.c | 22 ++++++++++++++-------- + fs/bcachefs/super.h | 6 ++++++ + 2 files changed, 20 insertions(+), 8 deletions(-) + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index cc50e4b28882..77a0d49a2372 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -954,15 +954,21 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); ++ ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ? bch_dev_bkey_exists(c, ptr->dev) ++ : NULL; ++ ++ if (!ca) { ++ pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev, ++ (u64) ptr->offset, ptr->gen, ++ ptr->cached ? " cached" : ""); ++ } else { ++ u32 offset; ++ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + +- pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev, +- (u64) ptr->offset, ptr->gen, +- ptr->cached ? " cached" : ""); +- +- if (c) { +- ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] +- ? bch_dev_bkey_exists(c, ptr->dev) +- : NULL; ++ pr_buf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, ++ b, offset, ptr->gen, ++ ptr->cached ? 
" cached" : ""); + + if (ca && ptr_stale(ca, ptr)) + pr_buf(out, " stale"); +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +index 3f24ca5a853d..6d3efda26e63 100644 +--- a/fs/bcachefs/super.h ++++ b/fs/bcachefs/super.h +@@ -26,6 +26,12 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) + return remainder; + } + ++static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, ++ u32 *offset) ++{ ++ return div_u64_rem(s, ca->mi.bucket_size, offset); ++} ++ + static inline bool bch2_dev_is_online(struct bch_dev *ca) + { + return !percpu_ref_is_zero(&ca->io_ref); +-- +cgit v1.2.3 + + +From 61df98ec6027f1e74b90979a1877a13beb818729 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 31 Mar 2022 00:03:37 -0400 +Subject: bcachefs: Move deletion of refcount=0 indirect extents to their + triggers + +For backpointers, we need to switch the order triggers are run in: we +need to run triggers for deletions/overwrites before triggers for +inserts. + +To avoid breaking the reflink triggers, this patch moves deleting of +indirect extents with refcount=0 to their triggers, instead of doing it +when we update those keys. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 5 ----- + fs/bcachefs/reflink.c | 36 ++++++++++++++++++++++++++++++++++++ + fs/bcachefs/reflink.h | 8 +++++++- + 3 files changed, 43 insertions(+), 6 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index fbce6cdf4cf8..b2b7bf9bb1c7 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1805,11 +1805,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + + le64_add_cpu(refcount, add); + +- if (!*refcount) { +- n->k.type = KEY_TYPE_deleted; +- set_bkey_val_u64s(&n->k, 0); +- } +- + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, n, 0); + if (ret) +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index c8d6d73681e0..6824730945d4 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -98,6 +98,24 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); + } + ++int bch2_trans_mark_reflink_v(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ return 0; ++ } ++ } ++ ++ return bch2_trans_mark_extent(trans, old, new, flags); ++} ++ + /* indirect inline data */ + + const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, +@@ -119,6 +137,24 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, + min(datalen, 32U), d.v->data); + } + ++int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_indirect_inline_data *r = ++ bkey_i_to_indirect_inline_data(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ } ++ } ++ ++ return 0; ++} ++ + static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *orig) +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index 4da4330014a8..8eb41c0292eb 100644 +--- 
a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -18,12 +18,14 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); + void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); ++int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, ++ struct bkey_i *, unsigned); + + #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ +- .trans_trigger = bch2_trans_mark_extent, \ ++ .trans_trigger = bch2_trans_mark_reflink_v, \ + .atomic_trigger = bch2_mark_extent, \ + } + +@@ -31,10 +33,14 @@ const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, + struct bkey_s_c); + void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); ++int bch2_trans_mark_indirect_inline_data(struct btree_trans *, ++ struct bkey_s_c, struct bkey_i *, ++ unsigned); + + #define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ + .key_invalid = bch2_indirect_inline_data_invalid, \ + .val_to_text = bch2_indirect_inline_data_to_text, \ ++ .trans_trigger = bch2_trans_mark_indirect_inline_data, \ + } + + static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) +-- +cgit v1.2.3 + + +From 33d477c5a2c9e99f4aabdf8894d2fa7076228f7e Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 30 Mar 2022 23:39:48 -0400 +Subject: bcachefs: Run overwrite triggers before insert + +For backpointers, we'll need to delete old backpointers before adding +new backpointers - otherwise we'll run into spurious duplicate +backpointer errors. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_update_leaf.c | 47 ++++++++++++++++------------------------- + include/trace/events/bcachefs.h | 8 ------- + 2 files changed, 18 insertions(+), 37 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index 6f4ee55ebffe..a0480c63dd81 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -457,7 +457,7 @@ static int run_one_mem_trigger(struct btree_trans *trans, + } + + static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, +- bool overwrite) ++ bool overwrite) + { + /* + * Transactional triggers create new btree_insert_entries, so we can't +@@ -466,42 +466,31 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ + */ + struct bkey old_k = i->old_k; + struct bkey_s_c old = { &old_k, i->old_v }; +- int ret = 0; + + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; + +- if (!overwrite) { +- if (i->insert_trigger_run) +- return 0; +- +- BUG_ON(i->overwrite_trigger_run); +- i->insert_trigger_run = true; +- } else { +- if (i->overwrite_trigger_run) +- return 0; +- +- BUG_ON(!i->insert_trigger_run); +- i->overwrite_trigger_run = true; +- } +- +- if (overwrite) { +- ret = bch2_trans_mark_old(trans, old, i->flags); +- } else if (bch2_bkey_ops[old.k->type].trans_trigger == +- bch2_bkey_ops[i->k->k.type].trans_trigger && ++ if (!i->insert_trigger_run && ++ !i->overwrite_trigger_run && ++ bch2_bkey_ops[old.k->type].trans_trigger == ++ bch2_bkey_ops[i->k->k.type].trans_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; +- ret = bch2_trans_mark_key(trans, old, i->k, +- 
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_key(trans, old, i->k, ++ BTREE_TRIGGER_INSERT| ++ BTREE_TRIGGER_OVERWRITE| ++ i->flags) ?: 1; ++ } else if (overwrite && !i->overwrite_trigger_run) { ++ i->overwrite_trigger_run = true; ++ return bch2_trans_mark_old(trans, old, i->flags) ?: 1; ++ } else if (!i->insert_trigger_run) { ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_new(trans, i->k, i->flags) ?: 1; + } else { +- ret = bch2_trans_mark_new(trans, i->k, i->flags); ++ return 0; + } +- +- if (ret == -EINTR) +- trace_trans_restart_mark(trans->fn, _RET_IP_, +- i->btree_id, &i->path->pos); +- return ret ?: 1; + } + + static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, +@@ -511,7 +500,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + bool trans_trigger_run; + int ret, overwrite; + +- for (overwrite = 0; overwrite < 2; overwrite++) { ++ for (overwrite = 1; overwrite >= 0; --overwrite) { + + /* + * Running triggers will append more updates to the list of updates as +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index ac2aecd47c5e..bccad83da05b 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -756,14 +756,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, + TP_ARGS(trans_fn, caller_ip, btree_id, pos) + ); + +-DEFINE_EVENT(transaction_restart_iter, trans_restart_mark, +- TP_PROTO(const char *trans_fn, +- unsigned long caller_ip, +- enum btree_id btree_id, +- struct bpos *pos), +- TP_ARGS(trans_fn, caller_ip, btree_id, pos) +-); +- + DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, +-- +cgit v1.2.3 + + +From f20252587576285799eb4777f6d5373055e496fb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 31 Dec 2021 20:03:29 -0500 +Subject: bcachefs: KEY_TYPE_alloc_v4 + +This introduces a new alloc key which doesn't use varints. Soon we'll be +adding backpointers and storing them in alloc keys, which means our +pack/unpack workflow for alloc keys won't really work - we'll need to be +mutating alloc keys in place. + +Instead of bch2_alloc_unpack(), we now have bch2_alloc_to_v4() that +converts older types of alloc keys to v4 if needed. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 340 ++++++++++++++++++++++------------------- + fs/bcachefs/alloc_background.h | 70 +++------ + fs/bcachefs/alloc_foreground.c | 27 ++-- + fs/bcachefs/bcachefs.h | 1 + + fs/bcachefs/bcachefs_format.h | 49 +++++- + fs/bcachefs/bkey_methods.c | 3 +- + fs/bcachefs/btree_gc.c | 69 +++++---- + fs/bcachefs/btree_types.h | 1 + + fs/bcachefs/buckets.c | 169 ++++++++++---------- + fs/bcachefs/lru.c | 6 +- + fs/bcachefs/movinggc.c | 24 +-- + fs/bcachefs/recovery.c | 4 +- + 12 files changed, 409 insertions(+), 354 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 07a76993318f..eb62b4fc2367 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -27,6 +27,8 @@ + #include + #include + ++/* Persistent alloc info: */ ++ + static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { + #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, + BCH_ALLOC_FIELDS_V1() +@@ -42,7 +44,19 @@ const char * const bch2_bucket_states[] = { + NULL + }; + +-/* Persistent alloc info: */ ++struct bkey_alloc_unpacked { ++ u64 journal_seq; ++ u64 bucket; ++ u8 dev; ++ u8 gen; ++ u8 oldest_gen; ++ u8 data_type; ++ bool need_discard:1; ++ bool need_inc_gen:1; ++#define x(_name, _bits) u##_bits _name; ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++}; + + static inline u64 alloc_field_v1_get(const struct bch_alloc *a, + const void **p, unsigned field) +@@ -164,8 +178,8 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; +- out->need_discard = BCH_ALLOC_NEED_DISCARD(a.v); +- out->need_inc_gen = BCH_ALLOC_NEED_INC_GEN(a.v); ++ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); ++ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); + out->journal_seq = le64_to_cpu(a.v->journal_seq); + + #define x(_name, _bits) \ +@@ -187,49 +201,7 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, + return 0; + } + +-static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, +- const struct bkey_alloc_unpacked src) +-{ +- struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k); +- unsigned nr_fields = 0, last_nonzero_fieldnr = 0; +- u8 *out = a->v.data; +- u8 *end = (void *) &dst[1]; +- u8 *last_nonzero_field = out; +- unsigned bytes; +- +- a->k.p = POS(src.dev, src.bucket); +- a->v.gen = src.gen; +- a->v.oldest_gen = src.oldest_gen; +- a->v.data_type = src.data_type; +- a->v.journal_seq = cpu_to_le64(src.journal_seq); +- SET_BCH_ALLOC_NEED_DISCARD(&a->v, src.need_discard); +- SET_BCH_ALLOC_NEED_INC_GEN(&a->v, src.need_inc_gen); +- +-#define x(_name, _bits) \ +- nr_fields++; \ +- \ +- if (src._name) { \ +- out += bch2_varint_encode_fast(out, src._name); \ +- \ +- last_nonzero_field = out; \ +- last_nonzero_fieldnr = nr_fields; \ +- } else { \ +- *out++ = 0; \ +- } +- +- BCH_ALLOC_FIELDS_V2() +-#undef x +- BUG_ON(out > end); +- +- out = last_nonzero_field; +- a->v.nr_fields = last_nonzero_fieldnr; +- +- bytes = (u8 *) out - (u8 *) &a->v; +- set_bkey_val_bytes(&a->k, bytes); +- memset_u64s_tail(&a->v, 0, bytes); +-} +- +-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) + { + struct bkey_alloc_unpacked ret = { + .dev = k.k->p.inode, +@@ -252,25 +224,44 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) + return ret; + } + +-struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans 
*trans, +- const struct bkey_alloc_unpacked src) ++void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) + { +- struct bkey_alloc_buf *dst; +- +- dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); +- if (!IS_ERR(dst)) +- bch2_alloc_pack_v3(dst, src); +- +- return dst; ++ if (k.k->type == KEY_TYPE_alloc_v4) { ++ *out = *bkey_s_c_to_alloc_v4(k).v; ++ } else { ++ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); ++ ++ *out = (struct bch_alloc_v4) { ++ .journal_seq = u.journal_seq, ++ .flags = u.need_discard, ++ .gen = u.gen, ++ .oldest_gen = u.oldest_gen, ++ .data_type = u.data_type, ++ .stripe_redundancy = u.stripe_redundancy, ++ .dirty_sectors = u.dirty_sectors, ++ .cached_sectors = u.cached_sectors, ++ .io_time[READ] = u.read_time, ++ .io_time[WRITE] = u.write_time, ++ .stripe = u.stripe, ++ }; ++ } + } + + int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, +- struct bkey_alloc_unpacked *u, unsigned trigger_flags) ++ struct bch_alloc_v4 *src, unsigned trigger_flags) + { +- struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u); ++ struct bkey_i_alloc_v4 *dst = ++ bch2_trans_kmalloc(trans, sizeof(*dst)); + +- return PTR_ERR_OR_ZERO(a) ?: +- bch2_trans_update(trans, iter, &a->k, trigger_flags); ++ if (IS_ERR(dst)) ++ return PTR_ERR(dst); ++ ++ bkey_alloc_v4_init(&dst->k_i); ++ set_bkey_val_bytes(&dst->k, sizeof(dst->v)); ++ dst->k.p = iter->pos; ++ dst->v = *src; ++ ++ return bch2_trans_update(trans, iter, &dst->k_i, trigger_flags); + } + + static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) +@@ -316,28 +307,70 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) + const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) + { + struct bkey_alloc_unpacked u; ++ struct bch_dev *ca; + + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ if (k.k->p.offset < ca->mi.first_bucket || ++ k.k->p.offset >= ca->mi.nbuckets) ++ return "invalid bucket"; ++ + if (bch2_alloc_unpack_v3(&u, k)) + return "unpack error"; + + return NULL; + } + +-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, +- struct bkey_s_c k) ++const char *bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k) + { +- struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); ++ struct bch_dev *ca; + +- pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %u", +- u.gen, u.oldest_gen, bch2_data_types[u.data_type], +- u.journal_seq, u.need_discard); +-#define x(_name, ...) 
pr_buf(out, " " #_name " %llu", (u64) u._name); +- BCH_ALLOC_FIELDS_V2() +-#undef x ++ if (k.k->p.inode >= c->sb.nr_devices || ++ !c->devs[k.k->p.inode]) ++ return "invalid device"; ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ if (k.k->p.offset < ca->mi.first_bucket || ++ k.k->p.offset >= ca->mi.nbuckets) ++ return "invalid bucket"; ++ ++ return NULL; ++} ++ ++void bch2_alloc_v4_swab(struct bkey_s k) ++{ ++ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; ++ ++ a->journal_seq = swab64(a->journal_seq); ++ a->flags = swab32(a->flags); ++ a->dirty_sectors = swab32(a->dirty_sectors); ++ a->cached_sectors = swab32(a->cached_sectors); ++ a->io_time[0] = swab64(a->io_time[0]); ++ a->io_time[1] = swab64(a->io_time[1]); ++ a->stripe = swab32(a->stripe); ++ a->nr_external_backpointers = swab32(a->nr_external_backpointers); ++} ++ ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bch_alloc_v4 a; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu", ++ a.gen, a.oldest_gen, bch2_data_types[a.data_type], ++ a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a)); ++ pr_buf(out, " dirty_sectors %u", a.dirty_sectors); ++ pr_buf(out, " cached_sectors %u", a.cached_sectors); ++ pr_buf(out, " stripe %u", a.stripe); ++ pr_buf(out, " stripe_redundancy %u", a.stripe_redundancy); ++ pr_buf(out, " read_time %llu", a.io_time[READ]); ++ pr_buf(out, " write_time %llu", a.io_time[WRITE]); + } + + int bch2_alloc_read(struct bch_fs *c) +@@ -345,6 +378,7 @@ int bch2_alloc_read(struct bch_fs *c) + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; ++ struct bch_alloc_v4 a; + struct bch_dev *ca; + int ret; + +@@ -353,8 +387,9 @@ int bch2_alloc_read(struct bch_fs *c) + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ bch2_alloc_to_v4(k, &a); + +- *bucket_gen(ca, k.k->p.offset) = bch2_alloc_unpack(k).gen; ++ *bucket_gen(ca, k.k->p.offset) = a.gen; + } + bch2_trans_iter_exit(&trans, &iter); + +@@ -370,11 +405,11 @@ int bch2_alloc_read(struct bch_fs *c) + + static int bch2_bucket_do_index(struct btree_trans *trans, + struct bkey_s_c alloc_k, +- struct bkey_alloc_unpacked a, ++ struct bch_alloc_v4 a, + bool set) + { + struct bch_fs *c = trans->c; +- struct bch_dev *ca = bch_dev_bkey_exists(c, a.dev); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); + struct btree_iter iter; + struct bkey_s_c old; + struct bkey_i *k; +@@ -399,12 +434,12 @@ static int bch2_bucket_do_index(struct btree_trans *trans, + switch (state) { + case BUCKET_free: + btree = BTREE_ID_freespace; +- k->k.p = alloc_freespace_pos(a); ++ k->k.p = alloc_freespace_pos(alloc_k.k->p, a); + bch2_key_resize(&k->k, 1); + break; + case BUCKET_need_discard: + btree = BTREE_ID_need_discard; +- k->k.p = POS(a.dev, a.bucket); ++ k->k.p = alloc_k.k->p; + break; + default: + return 0; +@@ -443,40 +478,45 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, + unsigned flags) + { + struct bch_fs *c = trans->c; +- struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); +- struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(bkey_i_to_s_c(new)); ++ struct bch_alloc_v4 old_a, *new_a; + u64 old_lru, new_lru; +- bool need_repack = false; + int ret = 0; + +- if (new_u.dirty_sectors > old_u.dirty_sectors || +- new_u.cached_sectors > old_u.cached_sectors) { +- new_u.read_time = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); +- 
new_u.write_time = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); +- new_u.need_inc_gen = true; +- new_u.need_discard = true; +- need_repack = true; ++ /* ++ * Deletion only happens in the device removal path, with ++ * BTREE_TRIGGER_NORUN: ++ */ ++ BUG_ON(new->k.type != KEY_TYPE_alloc_v4); ++ ++ bch2_alloc_to_v4(old, &old_a); ++ new_a = &bkey_i_to_alloc_v4(new)->v; ++ ++ if (new_a->dirty_sectors > old_a.dirty_sectors || ++ new_a->cached_sectors > old_a.cached_sectors) { ++ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); ++ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); ++ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); + } + +- if (old_u.data_type && !new_u.data_type && +- old_u.gen == new_u.gen && ++ if (old_a.data_type && !new_a->data_type && ++ old_a.gen == new_a->gen && + !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { +- new_u.gen++; +- new_u.need_inc_gen = false; +- need_repack = true; ++ new_a->gen++; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + } + +- if (bucket_state(old_u) != bucket_state(new_u) || +- (bucket_state(new_u) == BUCKET_free && +- alloc_freespace_genbits(old_u) != alloc_freespace_genbits(new_u))) { +- ret = bch2_bucket_do_index(trans, old, old_u, false) ?: +- bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_u, true); ++ if (bucket_state(old_a) != bucket_state(*new_a) || ++ (bucket_state(*new_a) == BUCKET_free && ++ alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { ++ ret = bch2_bucket_do_index(trans, old, old_a, false) ?: ++ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true); + if (ret) + return ret; + } + +- old_lru = alloc_lru_idx(old_u); +- new_lru = alloc_lru_idx(new_u); ++ old_lru = alloc_lru_idx(old_a); ++ new_lru = alloc_lru_idx(*new_a); + + if (old_lru != new_lru) { + ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, +@@ -484,15 +524,10 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, + if (ret) + return ret; + +- if (new_lru && new_u.read_time != new_lru) { +- new_u.read_time = new_lru; +- need_repack = true; +- } ++ if (new_lru && new_a->io_time[READ] != new_lru) ++ new_a->io_time[READ] = new_lru; + } + +- if (need_repack && !bkey_deleted(&new->k)) +- bch2_alloc_pack_v3((void *) new, new_u); +- + return 0; + } + +@@ -501,7 +536,7 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_iter discard_iter, freespace_iter, lru_iter; +- struct bkey_alloc_unpacked a; ++ struct bch_alloc_v4 a; + unsigned discard_key_type, freespace_key_type; + struct bkey_s_c alloc_k, k; + struct printbuf buf = PRINTBUF; +@@ -516,7 +551,7 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + if (ret) + return ret; + +- a = bch2_alloc_unpack(alloc_k); ++ bch2_alloc_to_v4(alloc_k, &a); + discard_key_type = bucket_state(a) == BUCKET_need_discard + ? 
KEY_TYPE_set : 0; + freespace_key_type = bucket_state(a) == BUCKET_free +@@ -525,9 +560,9 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, + alloc_k.k->p, 0); + bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, +- alloc_freespace_pos(a), 0); ++ alloc_freespace_pos(alloc_k.k->p, a), 0); + bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, +- POS(a.dev, a.read_time), 0); ++ POS(alloc_k.k->p.inode, a.io_time[READ]), 0); + + k = bch2_btree_iter_peek_slot(&discard_iter); + ret = bkey_err(k); +@@ -588,40 +623,34 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + } + + if (bucket_state(a) == BUCKET_cached) { +- if (fsck_err_on(!a.read_time, c, +- "cached bucket with read_time 0\n" +- " %s", +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { +- +- a.read_time = atomic64_read(&c->io_clock[READ].now); +- +- ret = bch2_lru_change(trans, a.dev, a.bucket, +- 0, &a.read_time) ?: +- bch2_alloc_write(trans, alloc_iter, &a, BTREE_TRIGGER_NORUN); +- bch2_trans_commit(trans, NULL, NULL, 0); +- if (ret) +- goto err; +- } +- + k = bch2_btree_iter_peek_slot(&lru_iter); + ret = bkey_err(k); + if (ret) + goto err; + +- if (fsck_err_on(k.k->type != KEY_TYPE_lru || +- le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != a.bucket, c, +- "incorrect/missing lru entry\n" +- " %s\n" +- " %s", ++ if (fsck_err_on(!a.io_time[READ], c, ++ "cached bucket with read_time 0\n" ++ " %s", + (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), +- (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { +- u64 read_time = a.read_time; +- +- ret = bch2_lru_change(trans, a.dev, a.bucket, +- 0, &a.read_time) ?: +- (a.read_time != read_time ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || ++ fsck_err_on(k.k->type != KEY_TYPE_lru || ++ le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, ++ "incorrect/missing lru entry\n" ++ " %s\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ u64 read_time = a.io_time[READ]; ++ ++ if (!a.io_time[READ]) ++ a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ ++ ret = bch2_lru_change(trans, ++ alloc_k.k->p.inode, ++ alloc_k.k->p.offset, ++ 0, &a.io_time[READ]) ?: ++ (a.io_time[READ] != read_time + ? 
bch2_alloc_write(trans, alloc_iter, &a, BTREE_TRIGGER_NORUN) + : 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +@@ -658,7 +687,7 @@ static int bch2_check_freespace_key(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter; + struct bkey_s_c k, freespace_k; +- struct bkey_alloc_unpacked a; ++ struct bch_alloc_v4 a; + u64 genbits; + struct bpos pos; + struct bkey_i *update; +@@ -689,7 +718,7 @@ static int bch2_check_freespace_key(struct btree_trans *trans, + if (ret) + goto err; + +- a = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + + if (fsck_err_on(bucket_state(a) != BUCKET_free || + genbits != alloc_freespace_genbits(a), c, +@@ -773,7 +802,7 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; +- struct bkey_alloc_unpacked a; ++ struct bch_alloc_v4 a; + struct printbuf buf = PRINTBUF; + int ret; + +@@ -784,17 +813,17 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + if (ret) + goto out; + +- a = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + +- if (a.need_inc_gen) { ++ if (BCH_ALLOC_V4_NEED_INC_GEN(&a)) { + a.gen++; +- a.need_inc_gen = false; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a, false); + goto write; + } + + BUG_ON(a.journal_seq > c->journal.flushed_seq_ondisk); + +- if (bch2_fs_inconsistent_on(!a.need_discard, c, ++ if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a), c, + "%s\n incorrectly set in need_discard btree", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; +@@ -818,7 +847,7 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + goto out; + } + +- a.need_discard = false; ++ SET_BCH_ALLOC_V4_NEED_DISCARD(&a, false); + write: + ret = bch2_alloc_write(trans, &iter, &a, 0); + out: +@@ -890,7 +919,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) + struct bch_fs *c = trans->c; + struct btree_iter lru_iter, alloc_iter = { NULL }; + struct bkey_s_c k; +- struct bkey_alloc_unpacked a; ++ struct bch_alloc_v4 a; + u64 bucket, idx; + int ret; + +@@ -920,20 +949,20 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) + if (ret) + goto out; + +- a = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + + if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a), c, + "invalidating bucket with wrong lru idx (got %llu should be %llu", + idx, alloc_lru_idx(a))) + goto out; + ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a, false); + a.gen++; +- a.need_inc_gen = false; + a.data_type = 0; + a.dirty_sectors = 0; + a.cached_sectors = 0; +- a.read_time = atomic64_read(&c->io_clock[READ].now); +- a.write_time = atomic64_read(&c->io_clock[WRITE].now); ++ a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ a.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + + ret = bch2_alloc_write(trans, &alloc_iter, &a, + BTREE_TRIGGER_BUCKET_INVALIDATE); +@@ -975,7 +1004,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; +- struct bkey_alloc_unpacked a; ++ struct bch_alloc_v4 a; + struct bch_member *m; + int ret; + +@@ -988,7 +1017,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) + if (iter.pos.offset >= ca->mi.nbuckets) + break; + +- a = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_bucket_do_index(&trans, k, 
a, true)); +@@ -1059,8 +1088,8 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; +- struct bkey_alloc_unpacked u; +- u64 *time, now; ++ struct bch_alloc_v4 a; ++ u64 now; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), +@@ -1071,16 +1100,15 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + if (ret) + goto out; + +- u = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + +- time = rw == READ ? &u.read_time : &u.write_time; + now = atomic64_read(&c->io_clock[rw].now); +- if (*time == now) ++ if (a.io_time[rw] == now) + goto out; + +- *time = now; ++ a.io_time[rw] = now; + +- ret = bch2_alloc_write(trans, &iter, &u, 0) ?: ++ ret = bch2_alloc_write(trans, &iter, &a, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + out: + bch2_trans_iter_exit(trans, &iter); +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 06539e036f13..11fe7273bd69 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -8,24 +8,10 @@ + #include "debug.h" + #include "super.h" + +-struct bkey_alloc_unpacked { +- u64 journal_seq; +- u64 bucket; +- u8 dev; +- u8 gen; +- u8 oldest_gen; +- u8 data_type; +- bool need_discard:1; +- bool need_inc_gen:1; +-#define x(_name, _bits) u##_bits _name; +- BCH_ALLOC_FIELDS_V2() +-#undef x +-}; +- + /* How out of date a pointer gen is allowed to be: */ + #define BUCKET_GC_GEN_MAX 96U + +-static inline u8 alloc_gc_gen(struct bkey_alloc_unpacked a) ++static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) + { + return a.gen - a.oldest_gen; + } +@@ -40,62 +26,40 @@ enum bucket_state { + + extern const char * const bch2_bucket_states[]; + +-static inline enum bucket_state bucket_state(struct bkey_alloc_unpacked a) ++static inline enum bucket_state bucket_state(struct bch_alloc_v4 a) + { + if (a.dirty_sectors || a.stripe) + return BUCKET_dirty; + if (a.cached_sectors) + return BUCKET_cached; + BUG_ON(a.data_type); +- if (a.need_discard) ++ if (BCH_ALLOC_V4_NEED_DISCARD(&a)) + return BUCKET_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BUCKET_need_gc_gens; + return BUCKET_free; + } + +-static inline u64 alloc_lru_idx(struct bkey_alloc_unpacked a) ++static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) + { +- return bucket_state(a) == BUCKET_cached ? a.read_time : 0; ++ return bucket_state(a) == BUCKET_cached ? a.io_time[READ] : 0; + } + +-static inline u64 alloc_freespace_genbits(struct bkey_alloc_unpacked a) ++static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) + { + return ((u64) alloc_gc_gen(a) >> 4) << 56; + } + +-static inline struct bpos alloc_freespace_pos(struct bkey_alloc_unpacked a) ++static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) + { +- return POS(a.dev, a.bucket | alloc_freespace_genbits(a)); ++ pos.offset |= alloc_freespace_genbits(a); ++ return pos; + } + +-/* returns true if not equal */ +-static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, +- struct bkey_alloc_unpacked r) +-{ +- return l.gen != r.gen || +- l.oldest_gen != r.oldest_gen || +- l.data_type != r.data_type +-#define x(_name, ...) 
|| l._name != r._name +- BCH_ALLOC_FIELDS_V2() +-#undef x +- ; +-} +- +-struct bkey_alloc_buf { +- struct bkey_i k; +- struct bch_alloc_v3 v; +- +-#define x(_name, _bits) + _bits / 8 +- u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; +-#undef x +-} __attribute__((packed, aligned(8))); +- +-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); +-struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *, +- const struct bkey_alloc_unpacked); + int bch2_alloc_write(struct btree_trans *, struct btree_iter *, +- struct bkey_alloc_unpacked *, unsigned); ++ struct bch_alloc_v4 *, unsigned); ++ ++void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); + + int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + +@@ -104,6 +68,8 @@ int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); + const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); + const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); ++const char *bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k); ++void bch2_alloc_v4_swab(struct bkey_s); + void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_alloc (struct bkey_ops) { \ +@@ -127,6 +93,14 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + .atomic_trigger = bch2_mark_alloc, \ + } + ++#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v4_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .swab = bch2_alloc_v4_swab, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ + static inline bool bkey_is_alloc(const struct bkey *k) + { + return k->type == KEY_TYPE_alloc || +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 538b597d845c..4dbab45be5ed 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -190,8 +190,9 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) + } + + static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ u64 bucket, + enum alloc_reserve reserve, +- struct bkey_alloc_unpacked a, ++ struct bch_alloc_v4 *a, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, +@@ -199,18 +200,18 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * + { + struct open_bucket *ob; + +- if (unlikely(ca->buckets_nouse && test_bit(a.bucket, ca->buckets_nouse))) { ++ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { + (*skipped_nouse)++; + return NULL; + } + +- if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) { ++ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + (*skipped_open)++; + return NULL; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, +- c->journal.flushed_seq_ondisk, ca->dev_idx, a.bucket)) { ++ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { + (*skipped_need_journal_commit)++; + return NULL; + } +@@ -231,7 +232,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * + } + + /* Recheck under lock: */ +- if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) { ++ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + spin_unlock(&c->freelist_lock); + (*skipped_open)++; + return NULL; +@@ -245,8 +246,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * + 
ob->sectors_free = ca->mi.bucket_size; + ob->alloc_reserve = reserve; + ob->dev = ca->dev_idx; +- ob->gen = a.gen; +- ob->bucket = a.bucket; ++ ob->gen = a->gen; ++ ob->bucket = bucket; + spin_unlock(&ob->lock); + + ca->nr_open_buckets++; +@@ -283,7 +284,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob; +- struct bkey_alloc_unpacked a; ++ struct bch_alloc_v4 a; + u64 b = free_entry & ~(~0ULL << 56); + unsigned genbits = free_entry >> 56; + struct printbuf buf = PRINTBUF; +@@ -297,7 +298,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc + goto err; + } + +- a = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + + if (bch2_fs_inconsistent_on(bucket_state(a) != BUCKET_free, c, + "non free bucket in freespace btree (state %s)\n" +@@ -326,7 +327,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc + goto err; + } + +- ob = __try_alloc_bucket(c, ca, reserve, a, ++ ob = __try_alloc_bucket(c, ca, b, reserve, &a, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, +@@ -390,7 +391,7 @@ bch2_bucket_alloc_trans_early(struct btree_trans *trans, + + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), + BTREE_ITER_SLOTS, k, ret) { +- struct bkey_alloc_unpacked a; ++ struct bch_alloc_v4 a; + + if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; +@@ -399,14 +400,14 @@ bch2_bucket_alloc_trans_early(struct btree_trans *trans, + is_superblock_bucket(ca, k.k->p.offset)) + continue; + +- a = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + + if (bucket_state(a) != BUCKET_free) + continue; + + (*buckets_seen)++; + +- ob = __try_alloc_bucket(trans->c, ca, reserve, a, ++ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 9877037fc195..a13845a23387 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -394,6 +394,7 @@ enum gc_phase { + GC_PHASE_BTREE_lru, + GC_PHASE_BTREE_freespace, + GC_PHASE_BTREE_need_discard, ++ GC_PHASE_BTREE_backpointers, + + GC_PHASE_PENDING_DELETE, + }; +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 33823551d63f..8312018e1ed5 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -78,6 +78,21 @@ + #include + #include "vstructs.h" + ++#define BITMASK(name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (k->field >> offset) & ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ k->field &= ~(~(~0ULL << (end - offset)) << offset); \ ++ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ ++} ++ + #define LE_BITMASK(_bits, name, type, field, offset, end) \ + static const unsigned name##_OFFSET = offset; \ + static const unsigned name##_BITS = (end - offset); \ +@@ -349,7 +364,8 @@ static inline void bkey_init(struct bkey *k) + x(inode_v2, 23) \ + x(alloc_v3, 24) \ + x(set, 25) \ +- x(lru, 26) ++ x(lru, 26) \ ++ x(alloc_v4, 27) + + enum bch_bkey_type { + #define x(name, nr) KEY_TYPE_##name = nr, +@@ -899,8 +915,29 @@ struct bch_alloc_v3 { + __u8 data[]; + } __attribute__((packed, aligned(8))); + 
+-LE32_BITMASK(BCH_ALLOC_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +-LE32_BITMASK(BCH_ALLOC_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) ++struct bch_alloc_v4 { ++ struct bch_val v; ++ __u64 journal_seq; ++ __u32 flags; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 stripe_redundancy; ++ __u32 dirty_sectors; ++ __u32 cached_sectors; ++ __u64 io_time[2]; ++ __u32 stripe; ++ __u32 nr_external_backpointers; ++ struct bpos backpointers[0]; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) ++LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) ++ ++BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) ++BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) ++BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) ++BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + + enum { + #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, +@@ -1322,7 +1359,8 @@ struct bch_sb_field_journal_seq_blacklist { + x(reflink_p_fix, 16) \ + x(subvol_dirent, 17) \ + x(inode_v2, 18) \ +- x(freespace, 19) ++ x(freespace, 19) \ ++ x(alloc_v4, 20) + + enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, +@@ -1849,7 +1887,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); + x(snapshots, 9) \ + x(lru, 10) \ + x(freespace, 11) \ +- x(need_discard, 12) ++ x(need_discard, 12) \ ++ x(backpointers, 13) + + enum btree_id { + #define x(kwd, val) BTREE_ID_##kwd = val, +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 3c1bf3310d99..0eac86e5e776 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -149,7 +149,8 @@ static unsigned bch2_key_types_allowed[] = { + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_alloc)| + (1U << KEY_TYPE_alloc_v2)| +- (1U << KEY_TYPE_alloc_v3), ++ (1U << KEY_TYPE_alloc_v3)| ++ (1U << KEY_TYPE_alloc_v4), + [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_quota), +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index ba81043fff51..720001782216 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1309,6 +1309,19 @@ static int bch2_gc_start(struct bch_fs *c, + return 0; + } + ++/* returns true if not equal */ ++static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, ++ struct bch_alloc_v4 r) ++{ ++ return l.gen != r.gen || ++ l.oldest_gen != r.oldest_gen || ++ l.data_type != r.data_type || ++ l.dirty_sectors != r.dirty_sectors || ++ l.cached_sectors != r.cached_sectors || ++ l.stripe_redundancy != r.stripe_redundancy || ++ l.stripe != r.stripe; ++} ++ + static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + bool metadata_only) +@@ -1317,8 +1330,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket gc; + struct bkey_s_c k; +- struct bkey_alloc_unpacked old_u, new_u; +- struct bkey_alloc_buf *a; ++ struct bch_alloc_v4 old, new; + int ret; + + k = bch2_btree_iter_peek_slot(iter); +@@ -1326,7 +1338,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + if (ret) + return ret; + +- old_u = new_u = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &old); ++ new = old; + + percpu_down_read(&c->mark_lock); + gc = *gc_bucket(ca, iter->pos.offset); +@@ -1338,36 +1351,31 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + gc.data_type != BCH_DATA_btree) + return 0; + +- if 
(gen_after(old_u.gen, gc.gen)) ++ if (gen_after(old.gen, gc.gen)) + return 0; + + #define copy_bucket_field(_f) \ +- if (fsck_err_on(new_u._f != gc._f, c, \ ++ if (fsck_err_on(new._f != gc._f, c, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ + gc.gen, \ + bch2_data_types[gc.data_type], \ +- new_u._f, gc._f)) \ +- new_u._f = gc._f; \ ++ new._f, gc._f)) \ ++ new._f = gc._f; \ + + copy_bucket_field(gen); + copy_bucket_field(data_type); +- copy_bucket_field(stripe); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + copy_bucket_field(stripe_redundancy); + copy_bucket_field(stripe); + #undef copy_bucket_field + +- if (!bkey_alloc_unpacked_cmp(old_u, new_u)) ++ if (!bch2_alloc_v4_cmp(old, new)) + return 0; + +- a = bch2_alloc_pack(trans, new_u); +- if (IS_ERR(a)) +- return PTR_ERR(a); +- +- ret = bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); ++ ret = bch2_alloc_write(trans, iter, &new, BTREE_TRIGGER_NORUN); + fsck_err: + return ret; + } +@@ -1418,7 +1426,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) + struct btree_iter iter; + struct bkey_s_c k; + struct bucket *g; +- struct bkey_alloc_unpacked u; ++ struct bch_alloc_v4 a; + unsigned i; + int ret; + +@@ -1443,20 +1451,21 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = gc_bucket(ca, k.k->p.offset); +- u = bch2_alloc_unpack(k); ++ ++ bch2_alloc_to_v4(k, &a); + + g->gen_valid = 1; +- g->gen = u.gen; ++ g->gen = a.gen; + + if (metadata_only && +- (u.data_type == BCH_DATA_user || +- u.data_type == BCH_DATA_cached || +- u.data_type == BCH_DATA_parity)) { +- g->data_type = u.data_type; +- g->dirty_sectors = u.dirty_sectors; +- g->cached_sectors = u.cached_sectors; +- g->stripe = u.stripe; +- g->stripe_redundancy = u.stripe_redundancy; ++ (a.data_type == BCH_DATA_user || ++ a.data_type == BCH_DATA_cached || ++ a.data_type == BCH_DATA_parity)) { ++ g->data_type = a.data_type; ++ g->dirty_sectors = a.dirty_sectors; ++ g->cached_sectors = a.cached_sectors; ++ g->stripe = a.stripe; ++ g->stripe_redundancy = a.stripe_redundancy; + } + } + bch2_trans_iter_exit(&trans, &iter); +@@ -1890,7 +1899,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i + { + struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); + struct bkey_s_c k; +- struct bkey_alloc_unpacked u; ++ struct bch_alloc_v4 a; + int ret; + + k = bch2_btree_iter_peek_slot(iter); +@@ -1898,14 +1907,14 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i + if (ret) + return ret; + +- u = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + +- if (u.oldest_gen == ca->oldest_gen[iter->pos.offset]) ++ if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) + return 0; + +- u.oldest_gen = ca->oldest_gen[iter->pos.offset]; ++ a.oldest_gen = ca->oldest_gen[iter->pos.offset]; + +- return bch2_alloc_write(trans, iter, &u, 0); ++ return bch2_alloc_write(trans, iter, &a, 0); + } + + int bch2_gc_gens(struct bch_fs *c) +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index b86a721f90ac..3438e089dba0 100644 +--- a/fs/bcachefs/btree_types.h ++++ b/fs/bcachefs/btree_types.h +@@ -669,6 +669,7 @@ enum btree_update_flags { + ((1U << KEY_TYPE_alloc)| \ + (1U << KEY_TYPE_alloc_v2)| \ + (1U << KEY_TYPE_alloc_v3)| \ ++ (1U << KEY_TYPE_alloc_v4)| \ + (1U << KEY_TYPE_stripe)| \ + (1U 
<< KEY_TYPE_inode)| \ + (1U << KEY_TYPE_inode_v2)| \ +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index b2b7bf9bb1c7..011f18ecbe5e 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -279,20 +279,20 @@ bch2_fs_usage_read_short(struct bch_fs *c) + return ret; + } + +-static inline int is_unavailable_bucket(struct bkey_alloc_unpacked a) ++static inline int is_unavailable_bucket(struct bch_alloc_v4 a) + { + return a.dirty_sectors || a.stripe; + } + + static inline int bucket_sectors_fragmented(struct bch_dev *ca, +- struct bkey_alloc_unpacked a) ++ struct bch_alloc_v4 a) + { + return a.dirty_sectors + ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) + : 0; + } + +-static inline enum bch_data_type bucket_type(struct bkey_alloc_unpacked a) ++static inline enum bch_data_type bucket_type(struct bch_alloc_v4 a) + { + return a.cached_sectors && !a.dirty_sectors + ? BCH_DATA_cached +@@ -311,8 +311,8 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, + } + + static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, +- struct bkey_alloc_unpacked old, +- struct bkey_alloc_unpacked new, ++ struct bch_alloc_v4 old, ++ struct bch_alloc_v4 new, + u64 journal_seq, bool gc) + { + struct bch_fs_usage *fs_usage; +@@ -349,14 +349,14 @@ static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, + struct bucket old, struct bucket new, + u64 journal_seq, bool gc) + { +- struct bkey_alloc_unpacked old_a = { ++ struct bch_alloc_v4 old_a = { + .gen = old.gen, + .data_type = old.data_type, + .dirty_sectors = old.dirty_sectors, + .cached_sectors = old.cached_sectors, + .stripe = old.stripe, + }; +- struct bkey_alloc_unpacked new_a = { ++ struct bch_alloc_v4 new_a = { + .gen = new.gen, + .data_type = new.data_type, + .dirty_sectors = new.dirty_sectors, +@@ -506,13 +506,12 @@ int bch2_mark_alloc(struct btree_trans *trans, + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; +- struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); +- struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); +- struct bch_dev *ca = bch_dev_bkey_exists(c, new_u.dev); ++ struct bch_alloc_v4 old_a, new_a; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); + int ret = 0; + +- if (bch2_trans_inconsistent_on(new_u.bucket < ca->mi.first_bucket || +- new_u.bucket >= ca->mi.nbuckets, trans, ++ if (bch2_trans_inconsistent_on(new.k->p.offset < ca->mi.first_bucket || ++ new.k->p.offset >= ca->mi.nbuckets, trans, + "alloc key outside range of device's buckets")) + return -EIO; + +@@ -523,11 +522,13 @@ int bch2_mark_alloc(struct btree_trans *trans, + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) + return 0; + ++ bch2_alloc_to_v4(old, &old_a); ++ bch2_alloc_to_v4(new, &new_a); ++ + if ((flags & BTREE_TRIGGER_INSERT) && +- !old_u.data_type != !new_u.data_type && +- new.k->type == KEY_TYPE_alloc_v3) { +- struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; +- u64 old_journal_seq = le64_to_cpu(v->journal_seq); ++ !old_a.data_type != !new_a.data_type && ++ new.k->type == KEY_TYPE_alloc_v4) { ++ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; + + BUG_ON(!journal_seq); + +@@ -536,18 +537,18 @@ int bch2_mark_alloc(struct btree_trans *trans, + * before the bucket became empty again, then the we don't have + * to wait on a journal flush before we can reuse the bucket: + */ +- new_u.journal_seq = !new_u.data_type && +- (journal_seq == old_journal_seq || +- 
bch2_journal_noflush_seq(&c->journal, old_journal_seq)) ++ new_a.journal_seq = !new_a.data_type && ++ (journal_seq == v->journal_seq || ++ bch2_journal_noflush_seq(&c->journal, v->journal_seq)) + ? 0 : journal_seq; +- v->journal_seq = cpu_to_le64(new_u.journal_seq); ++ v->journal_seq = new_a.journal_seq; + } + +- if (old_u.data_type && !new_u.data_type && new_u.journal_seq) { ++ if (old_a.data_type && !new_a.data_type && new_a.journal_seq) { + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, +- new_u.dev, new_u.bucket, +- new_u.journal_seq); ++ new.k->p.inode, new.k->p.offset, ++ new_a.journal_seq); + if (ret) { + bch2_fs_fatal_error(c, + "error setting bucket_needs_journal_commit: %i", ret); +@@ -555,43 +556,43 @@ int bch2_mark_alloc(struct btree_trans *trans, + } + } + +- if (!new_u.data_type && +- (!new_u.journal_seq || new_u.journal_seq < c->journal.flushed_seq_ondisk)) ++ if (!new_a.data_type && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + + if ((flags & BTREE_TRIGGER_INSERT) && +- new_u.need_discard && +- !new_u.journal_seq) ++ BCH_ALLOC_V4_NEED_DISCARD(&new_a) && ++ !new_a.journal_seq) + bch2_do_discards(c); + +- if (!old_u.data_type && +- new_u.data_type && ++ if (!old_a.data_type && ++ new_a.data_type && + should_invalidate_buckets(ca)) + bch2_do_invalidates(c); + +- if (bucket_state(new_u) == BUCKET_need_gc_gens) { ++ if (bucket_state(new_a) == BUCKET_need_gc_gens) { + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); + } + + percpu_down_read(&c->mark_lock); +- if (!gc && new_u.gen != old_u.gen) +- *bucket_gen(ca, new_u.bucket) = new_u.gen; ++ if (!gc && new_a.gen != old_a.gen) ++ *bucket_gen(ca, new.k->p.offset) = new_a.gen; + +- bch2_dev_usage_update(c, ca, old_u, new_u, journal_seq, gc); ++ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); + + if (gc) { +- struct bucket *g = gc_bucket(ca, new_u.bucket); ++ struct bucket *g = gc_bucket(ca, new.k->p.offset); + + bucket_lock(g); + + g->gen_valid = 1; +- g->gen = new_u.gen; +- g->data_type = new_u.data_type; +- g->stripe = new_u.stripe; +- g->stripe_redundancy = new_u.stripe_redundancy; +- g->dirty_sectors = new_u.dirty_sectors; +- g->cached_sectors = new_u.cached_sectors; ++ g->gen = new_a.gen; ++ g->data_type = new_a.data_type; ++ g->stripe = new_a.stripe; ++ g->stripe_redundancy = new_a.stripe_redundancy; ++ g->dirty_sectors = new_a.dirty_sectors; ++ g->cached_sectors = new_a.cached_sectors; + + bucket_unlock(g); + } +@@ -603,17 +604,17 @@ int bch2_mark_alloc(struct btree_trans *trans, + */ + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && +- old_u.cached_sectors) { ++ old_a.cached_sectors) { + ret = update_cached_sectors(c, new, ca->dev_idx, +- -old_u.cached_sectors, ++ -old_a.cached_sectors, + journal_seq, gc); + if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); + return ret; + } + +- trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket), +- old_u.cached_sectors); ++ trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), ++ old_a.cached_sectors); + } + + return 0; +@@ -1387,7 +1388,7 @@ need_mark: + + static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, + const struct bch_extent_ptr *ptr, +- struct bkey_alloc_unpacked *u) ++ struct bch_alloc_v4 *a) + { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +@@ -1406,7 +1407,7 @@ static int 
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree + return ret; + } + +- *u = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, a); + return 0; + } + +@@ -1415,20 +1416,20 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, + s64 sectors, enum bch_data_type data_type) + { + struct btree_iter iter; +- struct bkey_alloc_unpacked u; ++ struct bch_alloc_v4 a; + int ret; + +- ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); ++ ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &a); + if (ret) + return ret; + + ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, +- u.gen, &u.data_type, +- &u.dirty_sectors, &u.cached_sectors); ++ a.gen, &a.data_type, ++ &a.dirty_sectors, &a.cached_sectors); + if (ret) + goto out; + +- ret = bch2_alloc_write(trans, &iter, &u, 0); ++ ret = bch2_alloc_write(trans, &iter, &a, 0); + if (ret) + goto out; + out: +@@ -1561,7 +1562,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct btree_iter iter; +- struct bkey_alloc_unpacked u; ++ struct bch_alloc_v4 a; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity : 0; + s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; +@@ -1570,59 +1571,59 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + if (deleting) + sectors = -sectors; + +- ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); ++ ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &a); + if (ret) + return ret; + + ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, +- u.gen, u.data_type, +- u.dirty_sectors, u.cached_sectors); ++ a.gen, a.data_type, ++ a.dirty_sectors, a.cached_sectors); + if (ret) + goto err; + + if (!deleting) { +- if (bch2_trans_inconsistent_on(u.stripe || +- u.stripe_redundancy, trans, ++ if (bch2_trans_inconsistent_on(a.stripe || ++ a.stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", +- iter.pos.inode, iter.pos.offset, u.gen, +- bch2_data_types[u.data_type], +- u.dirty_sectors, +- u.stripe, s.k->p.offset)) { ++ iter.pos.inode, iter.pos.offset, a.gen, ++ bch2_data_types[a.data_type], ++ a.dirty_sectors, ++ a.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + +- if (bch2_trans_inconsistent_on(data_type && u.dirty_sectors, trans, ++ if (bch2_trans_inconsistent_on(data_type && a.dirty_sectors, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", +- iter.pos.inode, iter.pos.offset, u.gen, +- bch2_data_types[u.data_type], +- u.dirty_sectors, ++ iter.pos.inode, iter.pos.offset, a.gen, ++ bch2_data_types[a.data_type], ++ a.dirty_sectors, + s.k->p.offset)) { + ret = -EIO; + goto err; + } + +- u.stripe = s.k->p.offset; +- u.stripe_redundancy = s.v->nr_redundant; ++ a.stripe = s.k->p.offset; ++ a.stripe_redundancy = s.v->nr_redundant; + } else { +- if (bch2_trans_inconsistent_on(u.stripe != s.k->p.offset || +- u.stripe_redundancy != s.v->nr_redundant, trans, ++ if (bch2_trans_inconsistent_on(a.stripe != s.k->p.offset || ++ a.stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", +- iter.pos.inode, iter.pos.offset, u.gen, +- s.k->p.offset, u.stripe)) { ++ iter.pos.inode, iter.pos.offset, a.gen, ++ s.k->p.offset, a.stripe)) { + ret = -EIO; + goto err; + } + +- u.stripe = 0; +- u.stripe_redundancy 
= 0; ++ a.stripe = 0; ++ a.stripe_redundancy = 0; + } + +- u.dirty_sectors += sectors; ++ a.dirty_sectors += sectors; + if (data_type) +- u.data_type = !deleting ? data_type : 0; ++ a.data_type = !deleting ? data_type : 0; + +- ret = bch2_alloc_write(trans, &iter, &u, 0); ++ ret = bch2_alloc_write(trans, &iter, &a, 0); + if (ret) + goto err; + err: +@@ -1852,7 +1853,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_iter iter; +- struct bkey_alloc_unpacked u; ++ struct bch_alloc_v4 a; + struct bch_extent_ptr ptr = { + .dev = ca->dev_idx, + .offset = bucket_to_sector(ca, b), +@@ -1865,26 +1866,26 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + if (b >= ca->mi.nbuckets) + return 0; + +- ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); ++ ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &a); + if (ret) + return ret; + +- if (u.data_type && u.data_type != type) { ++ if (a.data_type && a.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", +- iter.pos.inode, iter.pos.offset, u.gen, +- bch2_data_types[u.data_type], ++ iter.pos.inode, iter.pos.offset, a.gen, ++ bch2_data_types[a.data_type], + bch2_data_types[type], + bch2_data_types[type]); + ret = -EIO; + goto out; + } + +- u.data_type = type; +- u.dirty_sectors = sectors; ++ a.data_type = type; ++ a.dirty_sectors = sectors; + +- ret = bch2_alloc_write(trans, &iter, &u, 0); ++ ret = bch2_alloc_write(trans, &iter, &a, 0); + if (ret) + goto out; + out: +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +index 1772ccb2b560..4f0e6960e597 100644 +--- a/fs/bcachefs/lru.c ++++ b/fs/bcachefs/lru.c +@@ -126,7 +126,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c lru_k, k; +- struct bkey_alloc_unpacked a; ++ struct bch_alloc_v4 a; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + u64 idx; +@@ -149,10 +149,10 @@ static int bch2_check_lru_key(struct btree_trans *trans, + if (ret) + goto err; + +- a = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + + if (fsck_err_on(bucket_state(a) != BUCKET_cached || +- a.read_time != lru_k.k->p.offset, c, ++ a.io_time[READ] != lru_k.k->p.offset, c, + "incorrect lru entry %s\n" + " for %s", + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index 4f32d38649c8..cb6b81678ecc 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -117,7 +117,7 @@ static int walk_buckets_to_copygc(struct bch_fs *c) + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; +- struct bkey_alloc_unpacked u; ++ struct bch_alloc_v4 a; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +@@ -127,20 +127,20 @@ static int walk_buckets_to_copygc(struct bch_fs *c) + struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); + struct copygc_heap_entry e; + +- u = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + +- if (u.data_type != BCH_DATA_user || +- u.dirty_sectors >= ca->mi.bucket_size || ++ if (a.data_type != BCH_DATA_user || ++ a.dirty_sectors >= ca->mi.bucket_size || + bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) + continue; + + e = (struct copygc_heap_entry) { + .dev = iter.pos.inode, +- .gen = u.gen, +- .replicas = 1 + u.stripe_redundancy, +- .fragmentation = (u64) u.dirty_sectors * 
(1ULL << 31) ++ .gen = a.gen, ++ .replicas = 1 + a.stripe_redundancy, ++ .fragmentation = (u64) a.dirty_sectors * (1ULL << 31) + / ca->mi.bucket_size, +- .sectors = u.dirty_sectors, ++ .sectors = a.dirty_sectors, + .offset = bucket_to_sector(ca, iter.pos.offset), + }; + heap_add_or_replace(h, e, -fragmentation_cmp, NULL); +@@ -168,7 +168,7 @@ static int check_copygc_was_done(struct bch_fs *c, + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; +- struct bkey_alloc_unpacked u; ++ struct bch_alloc_v4 a; + struct copygc_heap_entry *i; + int ret = 0; + +@@ -187,10 +187,10 @@ static int check_copygc_was_done(struct bch_fs *c, + if (ret) + break; + +- u = bch2_alloc_unpack(k); ++ bch2_alloc_to_v4(k, &a); + +- if (u.gen == i->gen && u.dirty_sectors) { +- *sectors_not_moved += u.dirty_sectors; ++ if (a.gen == i->gen && a.dirty_sectors) { ++ *sectors_not_moved += a.dirty_sectors; + *buckets_not_moved += 1; + } + } +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 68612d52aa83..ca92fe84c248 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1029,8 +1029,8 @@ int bch2_fs_recovery(struct bch_fs *c) + bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); + c->opts.version_upgrade = true; + c->opts.fsck = true; +- } else if (c->sb.version < bcachefs_metadata_version_freespace) { +- bch_info(c, "filesystem version is prior to freespace - upgrading"); ++ } else if (c->sb.version < bcachefs_metadata_version_alloc_v4) { ++ bch_info(c, "filesystem version is prior to alloc_v4 - upgrading"); + c->opts.version_upgrade = true; + } + } +-- +cgit v1.2.3 + + +From b9cd9496a25a2ce23d7f9227a1690539fb32f08d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 17 Mar 2022 18:21:15 -0400 +Subject: bcachefs: Kill bch2_alloc_write() + +This patch introduces bch2_alloc_to_v4_mut() which returns a +bkey_i_alloc_v4 *, which then can be passed to bch2_trans_update() +directly. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 149 +++++++++++++++++++++++++---------------- + fs/bcachefs/alloc_background.h | 5 +- + fs/bcachefs/btree_gc.c | 20 +++++- + fs/bcachefs/buckets.c | 117 ++++++++++++-------------------- + fs/bcachefs/buckets.h | 8 +++ + 5 files changed, 163 insertions(+), 136 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index eb62b4fc2367..1188239a1bcc 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -247,21 +247,48 @@ void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) + } + } + +-int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, +- struct bch_alloc_v4 *src, unsigned trigger_flags) ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) + { +- struct bkey_i_alloc_v4 *dst = +- bch2_trans_kmalloc(trans, sizeof(*dst)); ++ struct bkey_i_alloc_v4 *ret; + +- if (IS_ERR(dst)) +- return PTR_ERR(dst); ++ if (k.k->type == KEY_TYPE_alloc_v4) { ++ ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if (!IS_ERR(ret)) ++ bkey_reassemble(&ret->k_i, k); ++ } else { ++ ret = bch2_trans_kmalloc(trans, sizeof(*ret)); ++ if (!IS_ERR(ret)) { ++ bkey_alloc_v4_init(&ret->k_i); ++ ret->k.p = k.k->p; ++ bch2_alloc_to_v4(k, &ret->v); ++ } ++ } ++ return ret; ++} + +- bkey_alloc_v4_init(&dst->k_i); +- set_bkey_val_bytes(&dst->k, sizeof(dst->v)); +- dst->k.p = iter->pos; +- dst->v = *src; ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos pos) ++{ ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ int ret; + +- return bch2_trans_update(trans, iter, &dst->k_i, trigger_flags); ++ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ERR_PTR(ret); ++ } ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ if (IS_ERR(a)) ++ bch2_trans_iter_exit(trans, iter); ++ return a; + } + + static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) +@@ -649,11 +676,25 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + ret = bch2_lru_change(trans, + alloc_k.k->p.inode, + alloc_k.k->p.offset, +- 0, &a.io_time[READ]) ?: +- (a.io_time[READ] != read_time +- ? 
bch2_alloc_write(trans, alloc_iter, &a, BTREE_TRIGGER_NORUN) +- : 0) ?: +- bch2_trans_commit(trans, NULL, NULL, 0); ++ 0, &a.io_time[READ]); ++ if (ret) ++ goto err; ++ ++ if (a.io_time[READ] != read_time) { ++ struct bkey_i_alloc_v4 *a_mut = ++ bch2_alloc_to_v4_mut(trans, alloc_k); ++ ret = PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ goto err; ++ ++ a_mut->v.io_time[READ] = a.io_time[READ]; ++ ret = bch2_trans_update(trans, alloc_iter, ++ &a_mut->k_i, BTREE_TRIGGER_NORUN); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) + goto err; + } +@@ -802,7 +843,7 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; +- struct bch_alloc_v4 a; ++ struct bkey_i_alloc_v4 *a; + struct printbuf buf = PRINTBUF; + int ret; + +@@ -813,17 +854,20 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + if (ret) + goto out; + +- bch2_alloc_to_v4(k, &a); ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; + +- if (BCH_ALLOC_V4_NEED_INC_GEN(&a)) { +- a.gen++; +- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a, false); ++ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { ++ a->v.gen++; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + goto write; + } + +- BUG_ON(a.journal_seq > c->journal.flushed_seq_ondisk); ++ BUG_ON(a->v.journal_seq > c->journal.flushed_seq_ondisk); + +- if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a), c, ++ if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a->v), c, + "%s\n incorrectly set in need_discard btree", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; +@@ -847,9 +891,9 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + goto out; + } + +- SET_BCH_ALLOC_V4_NEED_DISCARD(&a, false); ++ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + write: +- ret = bch2_alloc_write(trans, &iter, &a, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + out: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); +@@ -919,7 +963,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) + struct bch_fs *c = trans->c; + struct btree_iter lru_iter, alloc_iter = { NULL }; + struct bkey_s_c k; +- struct bch_alloc_v4 a; ++ struct bkey_i_alloc_v4 *a; + u64 bucket, idx; + int ret; + +@@ -940,32 +984,27 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) + idx = k.k->p.offset; + bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); + +- bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, +- POS(ca->dev_idx, bucket), +- BTREE_ITER_CACHED| +- BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_slot(&alloc_iter); +- ret = bkey_err(k); ++ a = bch2_trans_start_alloc_update(trans, &alloc_iter, ++ POS(ca->dev_idx, bucket)); ++ ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + +- bch2_alloc_to_v4(k, &a); +- +- if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a), c, ++ if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a->v), c, + "invalidating bucket with wrong lru idx (got %llu should be %llu", +- idx, alloc_lru_idx(a))) ++ idx, alloc_lru_idx(a->v))) + goto out; + +- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a, false); +- a.gen++; +- a.data_type = 0; +- a.dirty_sectors = 0; +- a.cached_sectors = 0; +- a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); +- a.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); ++ a->v.gen++; ++ a->v.data_type = 0; ++ 
a->v.dirty_sectors = 0; ++ a->v.cached_sectors = 0; ++ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + +- ret = bch2_alloc_write(trans, &alloc_iter, &a, +- BTREE_TRIGGER_BUCKET_INVALIDATE); ++ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE); + out: + bch2_trans_iter_exit(trans, &alloc_iter); + bch2_trans_iter_exit(trans, &lru_iter); +@@ -1087,28 +1126,22 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + { + struct bch_fs *c = trans->c; + struct btree_iter iter; +- struct bkey_s_c k; +- struct bch_alloc_v4 a; ++ struct bkey_i_alloc_v4 *a; + u64 now; + int ret = 0; + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), +- BTREE_ITER_CACHED| +- BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_slot(&iter); +- ret = bkey_err(k); ++ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); ++ ret = PTR_ERR_OR_ZERO(a); + if (ret) +- goto out; +- +- bch2_alloc_to_v4(k, &a); ++ return ret; + + now = atomic64_read(&c->io_clock[rw].now); +- if (a.io_time[rw] == now) ++ if (a->v.io_time[rw] == now) + goto out; + +- a.io_time[rw] = now; ++ a->v.io_time[rw] = now; + +- ret = bch2_alloc_write(trans, &iter, &a, 0) ?: ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + out: + bch2_trans_iter_exit(trans, &iter); +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 11fe7273bd69..da1b650e8017 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -56,10 +56,11 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_ + return pos; + } + +-int bch2_alloc_write(struct btree_trans *, struct btree_iter *, +- struct bch_alloc_v4 *, unsigned); ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); + + void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); + + int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 720001782216..e19991796c82 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -1330,6 +1330,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket gc; + struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; + struct bch_alloc_v4 old, new; + int ret; + +@@ -1375,7 +1376,14 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + if (!bch2_alloc_v4_cmp(old, new)) + return 0; + +- ret = bch2_alloc_write(trans, iter, &new, BTREE_TRIGGER_NORUN); ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ return ret; ++ ++ a->v = new; ++ ++ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); + fsck_err: + return ret; + } +@@ -1900,6 +1908,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i + struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); + struct bkey_s_c k; + struct bch_alloc_v4 a; ++ struct bkey_i_alloc_v4 *a_mut; + int ret; + + k = bch2_btree_iter_peek_slot(iter); +@@ -1912,9 +1921,14 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i + if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) + return 0; + +- a.oldest_gen 
= ca->oldest_gen[iter->pos.offset]; ++ a_mut = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ return ret; ++ ++ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; + +- return bch2_alloc_write(trans, iter, &a, 0); ++ return bch2_trans_update(trans, iter, &a_mut->k_i, 0); + } + + int bch2_gc_gens(struct bch_fs *c) +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 011f18ecbe5e..7654ab24a909 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -1386,50 +1386,25 @@ need_mark: + + /* trans_mark: */ + +-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, +- const struct bch_extent_ptr *ptr, +- struct bch_alloc_v4 *a) +-{ +- struct bch_fs *c = trans->c; +- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); +- struct bkey_s_c k; +- int ret; +- +- bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, +- POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)), +- BTREE_ITER_WITH_UPDATES| +- BTREE_ITER_CACHED| +- BTREE_ITER_INTENT); +- k = bch2_btree_iter_peek_slot(iter); +- ret = bkey_err(k); +- if (ret) { +- bch2_trans_iter_exit(trans, iter); +- return ret; +- } +- +- bch2_alloc_to_v4(k, a); +- return 0; +-} +- + static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) + { + struct btree_iter iter; +- struct bch_alloc_v4 a; ++ struct bkey_i_alloc_v4 *a; + int ret; + +- ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &a); +- if (ret) +- return ret; ++ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr)); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); + + ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, +- a.gen, &a.data_type, +- &a.dirty_sectors, &a.cached_sectors); ++ a->v.gen, &a->v.data_type, ++ &a->v.dirty_sectors, &a->v.cached_sectors); + if (ret) + goto out; + +- ret = bch2_alloc_write(trans, &iter, &a, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto out; + out: +@@ -1562,7 +1537,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct btree_iter iter; +- struct bch_alloc_v4 a; ++ struct bkey_i_alloc_v4 *a; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity : 0; + s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; +@@ -1571,59 +1546,59 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + if (deleting) + sectors = -sectors; + +- ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &a); +- if (ret) +- return ret; ++ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); + + ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, +- a.gen, a.data_type, +- a.dirty_sectors, a.cached_sectors); ++ a->v.gen, a->v.data_type, ++ a->v.dirty_sectors, a->v.cached_sectors); + if (ret) + goto err; + + if (!deleting) { +- if (bch2_trans_inconsistent_on(a.stripe || +- a.stripe_redundancy, trans, ++ if (bch2_trans_inconsistent_on(a->v.stripe || ++ a->v.stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", +- iter.pos.inode, iter.pos.offset, a.gen, +- bch2_data_types[a.data_type], +- a.dirty_sectors, +- a.stripe, s.k->p.offset)) { ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ a->v.dirty_sectors, ++ a->v.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + +- if (bch2_trans_inconsistent_on(data_type && a.dirty_sectors, trans, ++ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", +- iter.pos.inode, iter.pos.offset, a.gen, +- bch2_data_types[a.data_type], +- a.dirty_sectors, ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ a->v.dirty_sectors, + s.k->p.offset)) { + ret = -EIO; + goto err; + } + +- a.stripe = s.k->p.offset; +- a.stripe_redundancy = s.v->nr_redundant; ++ a->v.stripe = s.k->p.offset; ++ a->v.stripe_redundancy = s.v->nr_redundant; + } else { +- if (bch2_trans_inconsistent_on(a.stripe != s.k->p.offset || +- a.stripe_redundancy != s.v->nr_redundant, trans, ++ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || ++ a->v.stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", +- iter.pos.inode, iter.pos.offset, a.gen, +- s.k->p.offset, a.stripe)) { ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ s.k->p.offset, a->v.stripe)) { + ret = -EIO; + goto err; + } + +- a.stripe = 0; +- a.stripe_redundancy = 0; ++ a->v.stripe = 0; ++ a->v.stripe_redundancy = 0; + } + +- a.dirty_sectors += sectors; ++ a->v.dirty_sectors += sectors; + if (data_type) +- a.data_type = !deleting ? data_type : 0; ++ a->v.data_type = !deleting ? 
data_type : 0; + +- ret = bch2_alloc_write(trans, &iter, &a, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto err; + err: +@@ -1853,11 +1828,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_iter iter; +- struct bch_alloc_v4 a; +- struct bch_extent_ptr ptr = { +- .dev = ca->dev_idx, +- .offset = bucket_to_sector(ca, b), +- }; ++ struct bkey_i_alloc_v4 *a; + int ret = 0; + + /* +@@ -1866,26 +1837,26 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + if (b >= ca->mi.nbuckets) + return 0; + +- ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &a); +- if (ret) +- return ret; ++ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); + +- if (a.data_type && a.data_type != type) { ++ if (a->v.data_type && a->v.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", +- iter.pos.inode, iter.pos.offset, a.gen, +- bch2_data_types[a.data_type], ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], + bch2_data_types[type], + bch2_data_types[type]); + ret = -EIO; + goto out; + } + +- a.data_type = type; +- a.dirty_sectors = sectors; ++ a->v.data_type = type; ++ a->v.dirty_sectors = sectors; + +- ret = bch2_alloc_write(trans, &iter, &a, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto out; + out: +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 25baca33e885..853bc9dd1294 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -66,6 +66,14 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, + return sector_to_bucket(ca, ptr->offset); + } + ++static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, ++ const struct bch_extent_ptr *ptr) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); ++} ++ + static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) + { +-- +cgit v1.2.3 + + +From aee1cfd531e73d06fc94a8166e08dffed9d35327 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Fri, 1 Apr 2022 16:11:42 -0400 +Subject: bcachefs: Discard path fixes/improvements + + - bch2_clear_need_discard() was using bch2_trans_relock() incorrectly, + and always bailing out before doing any work - ouch. + + - Add a tracepoint that fires every time bch2_do_discards() runs, and + tells us about the work it did + + - When too many buckets aren't able to be discarded because they need a + journal commit, bch2_do_discards now flushes the journal. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 23 ++++++++++++++++++++--- + include/trace/events/bcachefs.h | 34 ++++++++++++++++++++++++++++++++++ + 2 files changed, 54 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 1188239a1bcc..e8a34eccac25 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -886,7 +886,7 @@ static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + GFP_KERNEL, 0); + *discard_done = true; + +- ret = bch2_trans_relock(trans); ++ ret = bch2_trans_relock(trans) ? 
0 : -EINTR; + if (ret) + goto out; + } +@@ -907,6 +907,7 @@ static void bch2_do_discards_work(struct work_struct *work) + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; ++ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +@@ -929,11 +930,19 @@ static void bch2_do_discards_work(struct work_struct *work) + } + } + ++ seen++; ++ ++ if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) { ++ open++; ++ continue; ++ } ++ + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, +- k.k->p.inode, k.k->p.offset) || +- bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) ++ k.k->p.inode, k.k->p.offset)) { ++ need_journal_commit++; + continue; ++ } + + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_USE_RESERVE| +@@ -941,6 +950,8 @@ static void bch2_do_discards_work(struct work_struct *work) + bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); + if (ret) + break; ++ ++ discarded++; + } + bch2_trans_iter_exit(&trans, &iter); + +@@ -948,7 +959,13 @@ static void bch2_do_discards_work(struct work_struct *work) + percpu_ref_put(&ca->io_ref); + + bch2_trans_exit(&trans); ++ ++ if (need_journal_commit * 2 > seen) ++ bch2_journal_flush_async(&c->journal, NULL); ++ + percpu_ref_put(&c->writes); ++ ++ trace_do_discards(c, seen, open, need_journal_commit, discarded, ret); + } + + void bch2_do_discards(struct bch_fs *c) +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index bccad83da05b..f63a7c87265d 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -182,6 +182,40 @@ TRACE_EVENT(journal_reclaim_finish, + __entry->nr_flushed) + ); + ++/* allocator: */ ++ ++TRACE_EVENT(do_discards, ++ TP_PROTO(struct bch_fs *c, u64 seen, u64 open, ++ u64 need_journal_commit, u64 discarded, int ret), ++ TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, seen ) ++ __field(u64, open ) ++ __field(u64, need_journal_commit ) ++ __field(u64, discarded ) ++ __field(int, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->seen = seen; ++ __entry->open = open; ++ __entry->need_journal_commit = need_journal_commit; ++ __entry->discarded = discarded; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->seen, ++ __entry->open, ++ __entry->need_journal_commit, ++ __entry->discarded, ++ __entry->ret) ++); ++ + /* bset.c: */ + + DEFINE_EVENT(bpos, bkey_pack_pos_fail, +-- +cgit v1.2.3 + + +From ab3b6e7dd69c5cd5dfd96fd265ade6897720f671 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 2 Apr 2022 16:30:37 -0400 +Subject: bcachefs: Fix pr_buf() calls + +In a few places we were passing a variable to pr_buf() for the format +string - oops. 
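
(Illustration, not part of the upstream patch: the hazard being fixed here is the classic format-string bug. pr_buf() is printf-like, so any '%' character in a value passed as the format itself is parsed as a conversion specifier with no matching argument. A minimal userspace sketch of the same mistake, using plain printf() and a made-up option string:)

#include <stdio.h>

int main(void)
{
	const char *choice = "zstd:15%";	/* hypothetical option text containing '%' */

	/* printf(choice); */			/* BAD: the trailing '%' begins a conversion
						 * specifier with no argument; undefined behaviour */
	printf("%s\n", choice);			/* OK: the text is passed as data, via "%s" */
	return 0;
}

(The hunks below apply exactly this "%s" treatment to bch2_opt_to_text() and pr_uuid().)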
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/opts.c | 2 +- + fs/bcachefs/util.h | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index 77fbb7d2194e..385451ef865e 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -324,7 +324,7 @@ void bch2_opt_to_text(struct printbuf *out, + if (flags & OPT_SHOW_FULL_LIST) + bch2_string_opt_to_text(out, opt->choices, v); + else +- pr_buf(out, opt->choices[v]); ++ pr_buf(out, "%s", opt->choices[v]); + break; + case BCH_OPT_FN: + opt->to_text(out, c, sb, v); +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 888693703c75..086d941f23e5 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -386,7 +386,7 @@ static inline void pr_uuid(struct printbuf *out, u8 *uuid) + char uuid_str[40]; + + uuid_unparse_lower(uuid, uuid_str); +- pr_buf(out, uuid_str); ++ pr_buf(out, "%s", uuid_str); + } + + int bch2_strtoint_h(const char *, int *); +-- +cgit v1.2.3 + + +From 54eda97079bc73ef18249a4a00442f347b0cdfbb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Thu, 31 Mar 2022 22:05:33 -0400 +Subject: bcachefs: Use crc_is_compressed() + +Trivial cleanup. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 7654ab24a909..bc39d742fa00 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -671,10 +671,9 @@ static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) + { + EBUG_ON(sectors < 0); + +- return p.crc.compression_type && +- p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible ++ return crc_is_compressed(p.crc) + ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, +- p.crc.uncompressed_size) ++ p.crc.uncompressed_size) + : sectors; + } + +-- +cgit v1.2.3 + + +From 2cfe6da80fee0801bd013178b11107d285887626 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 2 Apr 2022 16:57:29 -0400 +Subject: bcachefs: Improve read_from_stale_dirty_pointer() message + +With printbufs, it's now easy to build up multi-line log messages and +emit them with one call, which is good because it prevents multiple +multi-line log messages from getting Interspersed in the log buffer; +this patch also improves the formatting and converts it to latest style. 
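
(Illustration, not part of the upstream patch: the same idea in plain userspace C, building the whole multi-line report in one buffer and emitting it with a single call so that other writers cannot interleave their lines into the middle of it. The key text and generation value are placeholders:)

#include <stdio.h>

int main(void)
{
	char buf[256];

	/* build the complete multi-line message first ... */
	snprintf(buf, sizeof(buf),
		 "Attempting to read from stale dirty pointer:\n"
		 "  %s\n"
		 "  memory gen: %u\n",
		 "u64s 5 type extent 4096:128 ...",	/* placeholder formatted key */
		 42u);					/* placeholder in-memory generation */

	/* ... then emit it in one call, so it lands in the log as one block */
	fputs(buf, stderr);
	return 0;
}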
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/io.c | 26 ++++++++++++++++---------- + 1 file changed, 16 insertions(+), 10 deletions(-) + +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +index 36929451af2c..223344e1ad74 100644 +--- a/fs/bcachefs/io.c ++++ b/fs/bcachefs/io.c +@@ -1981,22 +1981,28 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct printbuf buf = PRINTBUF; + int ret; + +- bch2_bkey_val_to_text(&buf, c, k); +- bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf.buf); +- + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, +- POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), ++ PTR_BUCKET_POS(c, &ptr), + BTREE_ITER_CACHED); + +- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); +- if (ret) +- goto out; ++ pr_buf(&buf, "Attempting to read from stale dirty pointer:"); ++ pr_indent_push(&buf, 2); ++ pr_newline(&buf); + + bch2_bkey_val_to_text(&buf, c, k); +- bch_err(c, "%s", buf.buf); +- bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); ++ pr_newline(&buf); ++ ++ pr_buf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); ++ ++ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ if (!ret) { ++ pr_newline(&buf); ++ bch2_bkey_val_to_text(&buf, c, k); ++ } ++ ++ bch2_fs_inconsistent(c, "%s", buf.buf); ++ + bch2_trans_iter_exit(trans, &iter); +-out: + printbuf_exit(&buf); + } + +-- +cgit v1.2.3 + + +From 30129d957b577fc300b4e206870641093a96e20d Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 2 Apr 2022 17:24:25 -0400 +Subject: bcachefs: Don't write partially-initialized superblocks + +This neatly avoids bugs where we fail partway through initializing a new +filesystem, if we just don't write out partly-initialized state. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 71abf87114df..fa2654f60384 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -831,6 +831,13 @@ int bch2_write_super(struct bch_fs *c) + if (c->opts.nochanges) + goto out; + ++ /* ++ * Defer writing the superblock until filesystem initialization is ++ * complete - don't write out a partly initialized superblock: ++ */ ++ if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) ++ goto out; ++ + for_each_online_member(ca, c, i) { + __set_bit(ca->dev_idx, sb_written.d); + ca->sb_write_error = 0; +-- +cgit v1.2.3 + + +From 45d0c2596758c0d050972783dcb139e979e0b434 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 2 Apr 2022 18:00:04 -0400 +Subject: bcachefs: gc mark fn fixes, cleanups + +mark_stripe_bucket() was busted; it was using @new unitialized. + +Also, clean up all the gc mark functions, and convert them to the same +style. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/buckets.c | 85 +++++++++++++++++++++++++-------------------------- + fs/bcachefs/buckets.h | 6 ++-- + 2 files changed, 44 insertions(+), 47 deletions(-) + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index bc39d742fa00..51ed9609aeb4 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -620,13 +620,13 @@ int bch2_mark_alloc(struct btree_trans *trans, + return 0; + } + +-void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, +- size_t b, enum bch_data_type data_type, +- unsigned sectors, struct gc_pos pos, +- unsigned flags) ++int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type data_type, ++ unsigned sectors, struct gc_pos pos, ++ unsigned flags) + { + struct bucket old, new, *g; +- bool overflow; ++ int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + BUG_ON(data_type != BCH_DATA_sb && +@@ -636,7 +636,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) +- return; ++ return 0; + + percpu_down_read(&c->mark_lock); + g = gc_bucket(ca, b); +@@ -644,27 +644,34 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + bucket_lock(g); + old = *g; + ++ if (bch2_fs_inconsistent_on(g->data_type && ++ g->data_type != data_type, c, ++ "different types of data in same bucket: %s, %s", ++ bch2_data_types[g->data_type], ++ bch2_data_types[data_type])) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", ++ ca->dev_idx, b, g->gen, ++ bch2_data_types[g->data_type ?: data_type], ++ g->dirty_sectors, sectors)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ + g->data_type = data_type; + g->dirty_sectors += sectors; +- overflow = g->dirty_sectors < sectors; +- + new = *g; ++err: + bucket_unlock(g); +- +- bch2_fs_inconsistent_on(old.data_type && +- old.data_type != data_type, c, +- "different types of data in same bucket: %s, %s", +- bch2_data_types[old.data_type], +- bch2_data_types[data_type]); +- +- bch2_fs_inconsistent_on(overflow, c, +- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", +- ca->dev_idx, b, new.gen, +- bch2_data_types[old.data_type ?: data_type], +- old.dirty_sectors, sectors); +- +- bch2_dev_usage_update_m(c, ca, old, new, 0, true); ++ if (!ret) ++ bch2_dev_usage_update_m(c, ca, old, new, 0, true); + percpu_up_read(&c->mark_lock); ++ return ret; + } + + static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) +@@ -807,25 +814,22 @@ static int mark_stripe_bucket(struct btree_trans *trans, + old = *g; + + ret = check_bucket_ref(c, k, ptr, sectors, data_type, +- new.gen, new.data_type, +- new.dirty_sectors, new.cached_sectors); +- if (ret) { +- bucket_unlock(g); ++ g->gen, g->data_type, ++ g->dirty_sectors, g->cached_sectors); ++ if (ret) + goto err; +- } + +- new.dirty_sectors += sectors; + if (data_type) +- new.data_type = data_type; ++ g->data_type = data_type; ++ g->dirty_sectors += sectors; + + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; +- + new = *g; +- bucket_unlock(g); +- +- bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + err: ++ bucket_unlock(g); ++ if (!ret) ++ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + percpu_up_read(&c->mark_lock); + 
printbuf_exit(&buf); + return ret; +@@ -871,29 +875,22 @@ static int bch2_mark_pointer(struct btree_trans *trans, + + percpu_down_read(&c->mark_lock); + g = PTR_GC_BUCKET(ca, &p.ptr); +- + bucket_lock(g); + old = *g; + + bucket_data_type = g->data_type; +- + ret = __mark_pointer(trans, k, &p.ptr, sectors, + data_type, g->gen, + &bucket_data_type, + &g->dirty_sectors, + &g->cached_sectors); +- if (ret) { +- bucket_unlock(g); +- goto err; +- } +- +- g->data_type = bucket_data_type; ++ if (!ret) ++ g->data_type = bucket_data_type; + + new = *g; + bucket_unlock(g); +- +- bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); +-err: ++ if (!ret) ++ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + percpu_up_read(&c->mark_lock); + + return ret; +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 853bc9dd1294..656a04b558bc 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -194,9 +194,9 @@ bch2_fs_usage_read_short(struct bch_fs *); + + void bch2_fs_usage_initialize(struct bch_fs *); + +-void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, +- size_t, enum bch_data_type, unsigned, +- struct gc_pos, unsigned); ++int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, enum bch_data_type, unsigned, ++ struct gc_pos, unsigned); + + int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); + int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +-- +cgit v1.2.3 + + +From ff21b38064e01ede7d337d94853bbea4af6522ac Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 3 Apr 2022 14:27:44 -0400 +Subject: fixup! bcachefs: bch_sb_field_journal_v2 + +--- + fs/bcachefs/journal.c | 11 ++++++++++- + fs/bcachefs/journal_sb.c | 3 ++- + 2 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 505e8367b5f2..d01b1cd4000d 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -964,6 +964,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + int bch2_dev_journal_alloc(struct bch_dev *ca) + { + unsigned nr; ++ int ret; + + if (dynamic_fault("bcachefs:add:journal_alloc")) + return -ENOMEM; +@@ -980,7 +981,15 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) + min(1 << 13, + (1 << 24) / ca->mi.bucket_size)); + +- return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); ++ if (ca->fs) ++ mutex_lock(&ca->fs->sb_lock); ++ ++ ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); ++ ++ if (ca->fs) ++ mutex_unlock(&ca->fs->sb_lock); ++ ++ return ret; + } + + /* startup/shutdown: */ +diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c +index 8efe7b7e3dcb..506044e358db 100644 +--- a/fs/bcachefs/journal_sb.c ++++ b/fs/bcachefs/journal_sb.c +@@ -186,7 +186,8 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) + struct bch_sb_field_journal_v2 *j; + unsigned i, dst = 0, nr = 1; + +- lockdep_assert_held(&c->sb_lock); ++ if (c) ++ lockdep_assert_held(&c->sb_lock); + + if (!ja->nr) { + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); +-- +cgit v1.2.3 + + +From 9219727020d2b85538be80394e991dba1a8681b4 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 3 Apr 2022 15:13:20 -0400 +Subject: bcachefs: Add a tracepoint for superblock writes + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/super-io.c | 4 ++++ + include/trace/events/bcachefs.h | 20 ++++++++++++++++++++ + 2 files changed, 24 insertions(+) + +diff --git 
a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index fa2654f60384..15241a56f203 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -21,6 +21,8 @@ + #include + #include + ++#include ++ + const char * const bch2_sb_fields[] = { + #define x(name, nr) #name, + BCH_SB_FIELDS() +@@ -797,6 +799,8 @@ int bch2_write_super(struct bch_fs *c) + unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; + int ret = 0; + ++ trace_write_super(c, _RET_IP_); ++ + if (c->opts.very_degraded) + degraded_flags |= BCH_FORCE_IF_LOST; + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index f63a7c87265d..f60ba1e4aca8 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -82,6 +82,26 @@ DECLARE_EVENT_CLASS(bio, + (unsigned long long)__entry->sector, __entry->nr_sector) + ); + ++/* super-io.c: */ ++TRACE_EVENT(write_super, ++ TP_PROTO(struct bch_fs *c, unsigned long ip), ++ TP_ARGS(c, ip), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%d,%d for %pS", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ (void *) __entry->ip) ++); ++ + /* io.c: */ + + DEFINE_EVENT(bio, read_split, +-- +cgit v1.2.3 + + +From f66d3242ddbadc2ed8e1918b4dae0134147e8422 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 3 Apr 2022 20:36:32 -0400 +Subject: bcachefs: Don't normalize to pages in btree cache shrinker + +This behavior dates from the early, early days of bcache, and upon +further delving appears to not make any sense. The shrinker only works +in terms of 'objects' of unknown size; normalizing to pages only had the +effect of changing the batch size, which we could do directly - if we +wanted; we probably don't. Normalizing to pages meant our batch size was +very small, which seems to have been keeping us from doing as much +shrinking as we should be under heavy memory pressure; this patch +appears to alleviate some OOMs we've been seeing. 
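
(Illustration, not part of the upstream patch: rough arithmetic, with assumed numbers, showing why dividing the scan request by pages-per-node starved the shrinker:)

#include <stdio.h>

int main(void)
{
	unsigned long nr_to_scan = 128;			/* objects the VM asked the shrinker to scan */
	unsigned long pages_per_node = (256 * 1024) / 4096; /* 64, assuming 256 KiB btree nodes */

	unsigned long before = nr_to_scan / pages_per_node; /* old: request normalized to pages -> 2 nodes */
	unsigned long after  = nr_to_scan;		/* new: request used as a node count -> 128 nodes */

	printf("nodes scanned per call: before=%lu after=%lu\n", before, after);
	return 0;
}

(With the division gone, the same request can free two orders of magnitude more nodes per call, and the explicit .batch setting is dropped in the hunks below.)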
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/btree_cache.c | 13 ++++--------- + include/trace/events/bcachefs.h | 28 ++++++++++------------------ + 2 files changed, 14 insertions(+), 27 deletions(-) + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 0dcdc30c6888..8e04129abeac 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -281,7 +281,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + struct btree_cache *bc = &c->btree_cache; + struct btree *b, *t; + unsigned long nr = sc->nr_to_scan; +- unsigned long can_free; ++ unsigned long can_free = 0; + unsigned long touched = 0; + unsigned long freed = 0; + unsigned i, flags; +@@ -305,7 +305,6 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ +- nr /= btree_pages(c); + can_free = btree_cache_can_free(bc); + nr = min_t(unsigned long, nr, can_free); + +@@ -375,13 +374,10 @@ touched: + + mutex_unlock(&bc->lock); + out: +- ret = (unsigned long) freed * btree_pages(c); ++ ret = freed; + memalloc_nofs_restore(flags); + out_norestore: +- trace_btree_cache_scan(sc->nr_to_scan, +- sc->nr_to_scan / btree_pages(c), +- btree_cache_can_free(bc), +- ret); ++ trace_btree_cache_scan(sc->nr_to_scan, can_free, ret); + return ret; + } + +@@ -395,7 +391,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, + if (bch2_btree_shrinker_disabled) + return 0; + +- return btree_cache_can_free(bc) * btree_pages(c); ++ return btree_cache_can_free(bc); + } + + void bch2_fs_btree_cache_exit(struct bch_fs *c) +@@ -482,7 +478,6 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) + bc->shrink.count_objects = bch2_btree_cache_count; + bc->shrink.scan_objects = bch2_btree_cache_scan; + bc->shrink.seeks = 4; +- bc->shrink.batch = btree_pages(c) * 2; + ret = register_shrinker(&bc->shrink); + out: + pr_verbose_init(c->opts, "ret %i", ret); +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +index f60ba1e4aca8..2155f1a03be9 100644 +--- a/include/trace/events/bcachefs.h ++++ b/include/trace/events/bcachefs.h +@@ -373,31 +373,23 @@ DEFINE_EVENT(btree_node, btree_set_root, + ); + + TRACE_EVENT(btree_cache_scan, +- TP_PROTO(unsigned long nr_to_scan_pages, +- unsigned long nr_to_scan_nodes, +- unsigned long can_free_nodes, +- long ret), +- TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret), ++ TP_PROTO(long nr_to_scan, long can_free, long ret), ++ TP_ARGS(nr_to_scan, can_free, ret), + + TP_STRUCT__entry( +- __field(unsigned long, nr_to_scan_pages ) +- __field(unsigned long, nr_to_scan_nodes ) +- __field(unsigned long, can_free_nodes ) +- __field(long, ret ) ++ __field(long, nr_to_scan ) ++ __field(long, can_free ) ++ __field(long, ret ) + ), + + TP_fast_assign( +- __entry->nr_to_scan_pages = nr_to_scan_pages; +- __entry->nr_to_scan_nodes = nr_to_scan_nodes; +- __entry->can_free_nodes = can_free_nodes; +- __entry->ret = ret; ++ __entry->nr_to_scan = nr_to_scan; ++ __entry->can_free = can_free; ++ __entry->ret = ret; + ), + +- TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li", +- __entry->nr_to_scan_pages, +- __entry->nr_to_scan_nodes, +- __entry->can_free_nodes, +- __entry->ret) ++ TP_printk("scanned for %li nodes, can free %li, ret %li", ++ __entry->nr_to_scan, __entry->can_free, __entry->ret) + ); + + TRACE_EVENT(btree_node_relock_fail, +-- +cgit v1.2.3 + + +From 
91e6c3e0d5ac0d29a9c97e71a1ba7abb346b4991 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 4 Apr 2022 01:09:26 -0400 +Subject: bcachefs: Gap buffer for journal keys + +Btree updates before we go RW work by inserting into the array of keys +that journal replay will insert - but inserting into a flat array is +O(n), meaning if btree_gc needs to update many alloc keys, we're O(n^2). + +Fortunately, the updates btree_gc does happens in sequential order, +which means a gap buffer works nicely here - this patch implements a gap +buffer for journal keys. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/bcachefs.h | 6 +++ + fs/bcachefs/recovery.c | 126 ++++++++++++++++++++++++++++++++++++------------- + fs/bcachefs/recovery.h | 3 -- + fs/bcachefs/util.h | 25 ++++++++++ + 4 files changed, 124 insertions(+), 36 deletions(-) + +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index a13845a23387..ab6df637ee26 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -548,6 +548,12 @@ struct journal_keys { + u32 journal_seq; + u32 journal_offset; + } *d; ++ /* ++ * Gap buffer: instead of all the empty space in the array being at the ++ * end of the buffer - from @nr to @size - the empty space is at @gap. ++ * This means that sequential insertions are O(n) instead of O(n^2). ++ */ ++ size_t gap; + size_t nr; + size_t size; + u64 journal_seq_base; +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index ca92fe84c248..6a92c1a05a0a 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -72,58 +72,97 @@ static int journal_key_cmp(const struct journal_key *l, const struct journal_key + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); + } + +-size_t bch2_journal_key_search(struct journal_keys *journal_keys, ++static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) ++{ ++ size_t gap_size = keys->size - keys->nr; ++ ++ if (idx >= keys->gap) ++ idx += gap_size; ++ return idx; ++} ++ ++static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) ++{ ++ return keys->d + idx_to_pos(keys, idx); ++} ++ ++size_t bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) + { +- size_t l = 0, r = journal_keys->nr, m; ++ size_t l = 0, r = keys->nr, m; + + while (l < r) { + m = l + ((r - l) >> 1); +- if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0) ++ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) + l = m + 1; + else + r = m; + } + +- BUG_ON(l < journal_keys->nr && +- __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0); ++ BUG_ON(l < keys->nr && ++ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); + + BUG_ON(l && +- __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0); ++ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); + +- return l; ++ return idx_to_pos(keys, l); + } + + struct bkey_i *bch2_journal_keys_peek(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) + { + struct journal_keys *keys = &c->journal_keys; +- struct journal_key *end = keys->d + keys->nr; +- struct journal_key *k = keys->d + +- bch2_journal_key_search(keys, btree_id, level, pos); ++ size_t idx = bch2_journal_key_search(keys, btree_id, level, pos); + +- while (k < end && k->overwritten) +- k++; ++ while (idx < keys->size && ++ keys->d[idx].overwritten) { ++ idx++; ++ if (idx == keys->gap) ++ idx += keys->size - keys->nr; ++ } + +- if (k < end && +- k->btree_id == btree_id && +- 
k->level == level) +- return k->k; ++ if (idx < keys->size && ++ keys->d[idx].btree_id == btree_id && ++ keys->d[idx].level == level) ++ return keys->d[idx].k; + return NULL; + } + +-static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) ++static void journal_iters_fix(struct bch_fs *c) + { +- struct bkey_i *n = iter->keys->d[idx].k; +- struct btree_and_journal_iter *biter = +- container_of(iter, struct btree_and_journal_iter, journal); +- +- if (iter->idx > idx || +- (iter->idx == idx && +- biter->last && +- bpos_cmp(n->k.p, biter->unpacked.p) <= 0)) +- iter->idx++; ++ struct journal_keys *keys = &c->journal_keys; ++ /* The key we just inserted is immediately before the gap: */ ++ struct journal_key *n = &keys->d[keys->gap - 1]; ++ size_t gap_end = keys->gap + (keys->size - keys->nr); ++ struct btree_and_journal_iter *iter; ++ ++ /* ++ * If an iterator points one after the key we just inserted, ++ * and the key we just inserted compares >= the iterator's position, ++ * decrement the iterator so it points at the key we just inserted: ++ */ ++ list_for_each_entry(iter, &c->journal_iters, journal.list) ++ if (iter->journal.idx == gap_end && ++ iter->last && ++ iter->b->c.btree_id == n->btree_id && ++ iter->b->c.level == n->level && ++ bpos_cmp(n->k->k.p, iter->unpacked.p) >= 0) ++ iter->journal.idx = keys->gap - 1; ++} ++ ++static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_iter *iter; ++ size_t gap_size = keys->size - keys->nr; ++ ++ list_for_each_entry(iter, &c->journal_iters, list) { ++ if (iter->idx > old_gap) ++ iter->idx -= gap_size; ++ if (iter->idx >= new_gap) ++ iter->idx += gap_size; ++ } + } + + int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, +@@ -141,12 +180,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + .journal_seq = U32_MAX, + }; + struct journal_keys *keys = &c->journal_keys; +- struct journal_iter *iter; + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + +- if (idx < keys->nr && ++ if (idx < keys->size && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); +@@ -154,6 +192,9 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + return 0; + } + ++ if (idx > keys->gap) ++ idx -= keys->size - keys->nr; ++ + if (keys->nr == keys->size) { + struct journal_keys new_keys = { + .nr = keys->nr, +@@ -168,15 +209,24 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + return -ENOMEM; + } + ++ /* Since @keys was full, there was no gap: */ + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); + *keys = new_keys; ++ ++ /* And now the gap is at the end: */ ++ keys->gap = keys->nr; + } + +- array_insert_item(keys->d, keys->nr, idx, n); ++ journal_iters_move_gap(c, keys->gap, idx); ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); ++ keys->gap = idx; ++ ++ keys->nr++; ++ keys->d[keys->gap++] = n; + +- list_for_each_entry(iter, &c->journal_iters, list) +- journal_iter_fix(c, iter, idx); ++ journal_iters_fix(c); + + return 0; + } +@@ -220,7 +270,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + +- if (idx < keys->nr && ++ if (idx < keys->size && + keys->d[idx].btree_id == btree && + 
keys->d[idx].level == level && + !bpos_cmp(keys->d[idx].k->k.p, pos)) +@@ -246,8 +296,11 @@ static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) + + static void bch2_journal_iter_advance(struct journal_iter *iter) + { +- if (iter->idx < iter->keys->nr) ++ if (iter->idx < iter->keys->size) { + iter->idx++; ++ if (iter->idx == iter->keys->gap) ++ iter->idx += iter->keys->size - iter->keys->nr; ++ } + } + + static void bch2_journal_iter_exit(struct journal_iter *iter) +@@ -409,6 +462,9 @@ void bch2_journal_keys_free(struct journal_keys *keys) + { + struct journal_key *i; + ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); ++ keys->gap = keys->nr; ++ + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->allocated) + kfree(i->k); +@@ -478,6 +534,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) + } + + keys.nr = dst - keys.d; ++ keys.gap = keys.nr; + err: + return keys; + } +@@ -538,6 +595,9 @@ static int bch2_journal_replay(struct bch_fs *c) + size_t i; + int ret; + ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); ++ keys->gap = keys->nr; ++ + keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); + if (!keys_sorted) + return -ENOMEM; +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index e6927a918df3..30580a8984a1 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -2,9 +2,6 @@ + #ifndef _BCACHEFS_RECOVERY_H + #define _BCACHEFS_RECOVERY_H + +-#define for_each_journal_key(keys, i) \ +- for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) +- + struct journal_iter { + struct list_head list; + enum btree_id btree_id; +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 086d941f23e5..98f70a5cc5d0 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -806,6 +806,31 @@ do { \ + #define array_remove_item(_array, _nr, _pos) \ + array_remove_items(_array, _nr, _pos, 1) + ++static inline void __move_gap(void *array, size_t element_size, ++ size_t nr, size_t size, ++ size_t old_gap, size_t new_gap) ++{ ++ size_t gap_end = old_gap + size - nr; ++ ++ if (new_gap < old_gap) { ++ size_t move = old_gap - new_gap; ++ ++ memmove(array + element_size * (gap_end - move), ++ array + element_size * (old_gap - move), ++ element_size * move); ++ } else if (new_gap > old_gap) { ++ size_t move = new_gap - old_gap; ++ ++ memmove(array + element_size * old_gap, ++ array + element_size * gap_end, ++ element_size * move); ++ } ++} ++ ++/* Move the gap in a gap buffer: */ ++#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ ++ __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) ++ + #define bubble_sort(_base, _nr, _cmp) \ + do { \ + ssize_t _i, _end; \ +-- +cgit v1.2.3 + + +From bd971e5f4d65a6973ffa4c6f9b21df2c56aa84cb Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 3 Apr 2022 17:50:01 -0400 +Subject: bcachefs: Convert .key_invalid methods to printbufs + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 75 +++++++++------ + fs/bcachefs/alloc_background.h | 8 +- + fs/bcachefs/bkey_methods.c | 147 +++++++++++++++++----------- + fs/bcachefs/bkey_methods.h | 16 +-- + fs/bcachefs/btree_io.c | 74 ++++++++------ + fs/bcachefs/btree_update_interior.c | 20 ++-- + fs/bcachefs/btree_update_leaf.c | 20 ++-- + fs/bcachefs/buckets.h | 4 +- + fs/bcachefs/dirent.c | 56 +++++++---- + fs/bcachefs/dirent.h | 2 +- + fs/bcachefs/ec.c | 32 +++--- + fs/bcachefs/ec.h | 3 +- + fs/bcachefs/extents.c | 187 
+++++++++++++++++++++--------------- + fs/bcachefs/extents.h | 17 ++-- + fs/bcachefs/inode.c | 130 ++++++++++++++----------- + fs/bcachefs/inode.h | 10 +- + fs/bcachefs/journal_io.c | 29 +++--- + fs/bcachefs/lru.c | 12 ++- + fs/bcachefs/lru.h | 2 +- + fs/bcachefs/quota.c | 19 ++-- + fs/bcachefs/quota.h | 2 +- + fs/bcachefs/reflink.c | 45 ++++++--- + fs/bcachefs/reflink.h | 8 +- + fs/bcachefs/subvolume.c | 66 ++++++++----- + fs/bcachefs/subvolume.h | 4 +- + fs/bcachefs/xattr.c | 43 ++++++--- + fs/bcachefs/xattr.h | 2 +- + 27 files changed, 629 insertions(+), 404 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index e8a34eccac25..193a21395921 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -302,71 +302,86 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) + return DIV_ROUND_UP(bytes, sizeof(u64)); + } + +-const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) + { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + +- if (k.k->p.inode >= c->sb.nr_devices || +- !c->devs[k.k->p.inode]) +- return "invalid device"; ++ if (!bch2_dev_exists2(c, k.k->p.inode)) { ++ pr_buf(err, "invalid device (%llu)", k.k->p.inode); ++ return -EINVAL; ++ } + + /* allow for unknown fields */ +- if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) +- return "incorrect value size"; ++ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { ++ pr_buf(err, "incorrect value size (%zu < %u)", ++ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + +-const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) + { + struct bkey_alloc_unpacked u; + +- if (k.k->p.inode >= c->sb.nr_devices || +- !c->devs[k.k->p.inode]) +- return "invalid device"; ++ if (!bch2_dev_exists2(c, k.k->p.inode)) { ++ pr_buf(err, "invalid device (%llu)", k.k->p.inode); ++ return -EINVAL; ++ } + +- if (bch2_alloc_unpack_v2(&u, k)) +- return "unpack error"; ++ if (bch2_alloc_unpack_v2(&u, k)) { ++ pr_buf(err, "unpack error"); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + +-const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) + { + struct bkey_alloc_unpacked u; + struct bch_dev *ca; + +- if (k.k->p.inode >= c->sb.nr_devices || +- !c->devs[k.k->p.inode]) +- return "invalid device"; ++ if (!bch2_dev_exists2(c, k.k->p.inode)) { ++ pr_buf(err, "invalid device (%llu)", k.k->p.inode); ++ return -EINVAL; ++ } + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + if (k.k->p.offset < ca->mi.first_bucket || +- k.k->p.offset >= ca->mi.nbuckets) +- return "invalid bucket"; ++ k.k->p.offset >= ca->mi.nbuckets) { ++ pr_buf(err, "invalid bucket"); ++ return -EINVAL; ++ } + +- if (bch2_alloc_unpack_v3(&u, k)) +- return "unpack error"; ++ if (bch2_alloc_unpack_v3(&u, k)) { ++ pr_buf(err, "unpack error"); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + +-const char *bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) + { + struct bch_dev *ca; + +- if (k.k->p.inode >= c->sb.nr_devices || +- !c->devs[k.k->p.inode]) +- return "invalid device"; ++ if 
(!bch2_dev_exists2(c, k.k->p.inode)) { ++ pr_buf(err, "invalid device (%llu)", k.k->p.inode); ++ return -EINVAL; ++ } + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + if (k.k->p.offset < ca->mi.first_bucket || +- k.k->p.offset >= ca->mi.nbuckets) +- return "invalid bucket"; ++ k.k->p.offset >= ca->mi.nbuckets) { ++ pr_buf(err, "invalid bucket"); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + void bch2_alloc_v4_swab(struct bkey_s k) +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index da1b650e8017..85a807146143 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -66,10 +66,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) + +-const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); +-const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); +-const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); +-const char *bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k); ++int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k, struct printbuf *); + void bch2_alloc_v4_swab(struct bkey_s); + void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 0eac86e5e776..c132bff22aff 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -22,10 +22,10 @@ const char * const bch2_bkey_types[] = { + NULL + }; + +-static const char *deleted_key_invalid(const struct bch_fs *c, +- struct bkey_s_c k) ++static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- return NULL; ++ return 0; + } + + #define bch2_bkey_ops_deleted (struct bkey_ops) { \ +@@ -36,25 +36,32 @@ static const char *deleted_key_invalid(const struct bch_fs *c, + .key_invalid = deleted_key_invalid, \ + } + +-static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) ++static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- if (bkey_val_bytes(k.k)) +- return "value size should be zero"; ++ if (bkey_val_bytes(k.k)) { ++ pr_buf(err, "incorrect value size (%zu != 0)", ++ bkey_val_bytes(k.k)); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + #define bch2_bkey_ops_error (struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ + } + +-static const char *key_type_cookie_invalid(const struct bch_fs *c, +- struct bkey_s_c k) ++static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) +- return "incorrect value size"; ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { ++ pr_buf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_cookie)); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + #define bch2_bkey_ops_cookie (struct bkey_ops) { \ +@@ -65,10 +72,10 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, + .key_invalid = empty_val_key_invalid, \ + } + +-static const char 
*key_type_inline_data_invalid(const struct bch_fs *c, +- struct bkey_s_c k) ++static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- return NULL; ++ return 0; + } + + static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, +@@ -86,11 +93,16 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + .val_to_text = key_type_inline_data_to_text, \ + } + +-static const char *key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k) ++static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- if (bkey_val_bytes(k.k)) +- return "nonempty value"; +- return NULL; ++ if (bkey_val_bytes(k.k)) { ++ pr_buf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_cookie)); ++ return -EINVAL; ++ } ++ ++ return 0; + } + + static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +@@ -110,12 +122,14 @@ const struct bkey_ops bch2_bkey_ops[] = { + #undef x + }; + +-const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) ++int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) + { +- if (k.k->type >= KEY_TYPE_MAX) +- return "invalid type"; ++ if (k.k->type >= KEY_TYPE_MAX) { ++ pr_buf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); ++ return -EINVAL; ++ } + +- return bch2_bkey_ops[k.k->type].key_invalid(c, k); ++ return bch2_bkey_ops[k.k->type].key_invalid(c, k, err); + } + + static unsigned bch2_key_types_allowed[] = { +@@ -182,63 +196,84 @@ static unsigned bch2_key_types_allowed[] = { + (1U << KEY_TYPE_btree_ptr_v2), + }; + +-const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, +- enum btree_node_type type) ++int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type, ++ struct printbuf *err) + { +- if (k.k->u64s < BKEY_U64s) +- return "u64s too small"; +- +- if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) +- return "invalid key type for this btree"; ++ if (k.k->u64s < BKEY_U64s) { ++ pr_buf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); ++ return -EINVAL; ++ } + +- if (type == BKEY_TYPE_btree && +- bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) +- return "value too big"; ++ if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { ++ pr_buf(err, "invalid key type for this btree (%s)", ++ bch2_bkey_types[type]); ++ return -EINVAL; ++ } + + if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { +- if (k.k->size == 0) +- return "bad size field"; ++ if (k.k->size == 0) { ++ pr_buf(err, "size == 0"); ++ return -EINVAL; ++ } + +- if (k.k->size > k.k->p.offset) +- return "size greater than offset"; ++ if (k.k->size > k.k->p.offset) { ++ pr_buf(err, "size greater than offset (%u > %llu)", ++ k.k->size, k.k->p.offset); ++ return -EINVAL; ++ } + } else { +- if (k.k->size) +- return "nonzero size field"; ++ if (k.k->size) { ++ pr_buf(err, "size != 0"); ++ return -EINVAL; ++ } + } + + if (type != BKEY_TYPE_btree && + !btree_type_has_snapshots(type) && +- k.k->p.snapshot) +- return "nonzero snapshot"; ++ k.k->p.snapshot) { ++ pr_buf(err, "nonzero snapshot"); ++ return -EINVAL; ++ } + + if (type != BKEY_TYPE_btree && + btree_type_has_snapshots(type) && +- !k.k->p.snapshot) +- return "invalid snapshot field"; ++ !k.k->p.snapshot) { ++ pr_buf(err, "snapshot == 0"); ++ return -EINVAL; ++ } + + if (type != BKEY_TYPE_btree && +- !bkey_cmp(k.k->p, POS_MAX)) +- return "POS_MAX 
key"; ++ !bkey_cmp(k.k->p, POS_MAX)) { ++ pr_buf(err, "key at POS_MAX"); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + +-const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, +- enum btree_node_type type) ++int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type, ++ struct printbuf *err) + { +- return __bch2_bkey_invalid(c, k, type) ?: +- bch2_bkey_val_invalid(c, k); ++ return __bch2_bkey_invalid(c, k, type, err) ?: ++ bch2_bkey_val_invalid(c, k, err); + } + +-const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) ++int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, ++ struct printbuf *err) + { +- if (bpos_cmp(k.k->p, b->data->min_key) < 0) +- return "key before start of btree node"; ++ if (bpos_cmp(k.k->p, b->data->min_key) < 0) { ++ pr_buf(err, "key before start of btree node"); ++ return -EINVAL; ++ } + +- if (bpos_cmp(k.k->p, b->data->max_key) > 0) +- return "key past end of btree node"; ++ if (bpos_cmp(k.k->p, b->data->max_key) > 0) { ++ pr_buf(err, "key past end of btree node"); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +index 2289a09d98fc..9dbac71da933 100644 +--- a/fs/bcachefs/bkey_methods.h ++++ b/fs/bcachefs/bkey_methods.h +@@ -14,8 +14,8 @@ extern const char * const bch2_bkey_types[]; + + struct bkey_ops { + /* Returns reason for being invalid if invalid, else NULL: */ +- const char * (*key_invalid)(const struct bch_fs *, +- struct bkey_s_c); ++ int (*key_invalid)(const struct bch_fs *, struct bkey_s_c, ++ struct printbuf *); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + void (*swab)(struct bkey_s); +@@ -32,12 +32,12 @@ struct bkey_ops { + + extern const struct bkey_ops bch2_bkey_ops[]; + +-const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); +-const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, +- enum btree_node_type); +-const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, +- enum btree_node_type); +-const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); ++int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type, struct printbuf *); ++int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type, struct printbuf *); ++int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); + + void bch2_bpos_to_text(struct printbuf *, struct bpos); + void bch2_bkey_to_text(struct printbuf *, const struct bkey *); +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 4b880ea59cad..cb753c1ba739 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -767,14 +767,23 @@ fsck_err: + return ret; + } + ++static int bset_key_invalid(struct bch_fs *c, struct btree *b, ++ struct bkey_s_c k, ++ bool updated_range, int write, ++ struct printbuf *err) ++{ ++ return __bch2_bkey_invalid(c, k, btree_node_type(b), err) ?: ++ (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: ++ (write ? 
bch2_bkey_val_invalid(c, k, err) : 0); ++} ++ + static int validate_bset_keys(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned *whiteout_u64s, + int write, bool have_retry) + { + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; +- struct printbuf buf1 = PRINTBUF; +- struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf = PRINTBUF; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + int ret = 0; +@@ -783,7 +792,6 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + k != vstruct_last(i);) { + struct bkey_s u; + struct bkey tmp; +- const char *invalid; + + if (btree_err_on(bkey_next(k) > vstruct_last(i), + BTREE_ERR_FIXABLE, c, NULL, b, i, +@@ -809,14 +817,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + + u = __bkey_disassemble(b, k, &tmp); + +- invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: +- (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: +- (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); +- if (invalid) { +- printbuf_reset(&buf1); +- bch2_bkey_val_to_text(&buf1, c, u.s_c); +- btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, +- "invalid bkey: %s\n%s", invalid, buf1.buf); ++ printbuf_reset(&buf); ++ if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { ++ printbuf_reset(&buf); ++ pr_buf(&buf, "invalid bkey:\n "); ++ bch2_bkey_val_to_text(&buf, c, u.s_c); ++ pr_buf(&buf, " \n"); ++ bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); ++ ++ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), +@@ -832,16 +841,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + if (prev && bkey_iter_cmp(b, prev, k) > 0) { + struct bkey up = bkey_unpack_key(b, prev); + +- printbuf_reset(&buf1); +- bch2_bkey_to_text(&buf1, &up); +- printbuf_reset(&buf2); +- bch2_bkey_to_text(&buf2, u.k); ++ printbuf_reset(&buf); ++ pr_buf(&buf, "keys out of order: "); ++ bch2_bkey_to_text(&buf, &up); ++ pr_buf(&buf, " > "); ++ bch2_bkey_to_text(&buf, u.k); + + bch2_dump_bset(c, b, i, 0); + +- if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, +- "keys out of order: %s > %s", +- buf1.buf, buf2.buf)) { ++ if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); +@@ -853,8 +861,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + k = bkey_next(k); + } + fsck_err: +- printbuf_exit(&buf2); +- printbuf_exit(&buf1); ++ printbuf_exit(&buf); + return ret; + } + +@@ -873,6 +880,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + unsigned u64s; + unsigned blacklisted_written, nonblacklisted_written = 0; + unsigned ptr_written = btree_ptr_sectors_written(&b->key); ++ struct printbuf buf = PRINTBUF; + int ret, retry_read = 0, write = READ; + + b->version_ondisk = U16_MAX; +@@ -1065,17 +1073,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + for (k = i->start; k != vstruct_last(i);) { + struct bkey tmp; + struct bkey_s u = __bkey_disassemble(b, k, &tmp); +- const char *invalid = bch2_bkey_val_invalid(c, u.s_c); + +- if (invalid || ++ printbuf_reset(&buf); ++ ++ if (bch2_bkey_val_invalid(c, u.s_c, &buf) || + (bch2_inject_invalid_keys && + !bversion_cmp(u.k->version, MAX_VERSION))) { +- struct printbuf buf = 
PRINTBUF; ++ printbuf_reset(&buf); + ++ pr_buf(&buf, "invalid bkey\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); +- btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, +- "invalid bkey %s: %s", buf.buf, invalid); +- printbuf_exit(&buf); ++ pr_buf(&buf, "\n "); ++ bch2_bkey_val_invalid(c, u.s_c, &buf); ++ ++ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + + btree_keys_account_key_drop(&b->nr, 0, k); + +@@ -1112,6 +1123,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + set_btree_node_need_rewrite(b); + out: + mempool_free(iter, &c->fill_iter); ++ printbuf_exit(&buf); + return retry_read; + fsck_err: + if (ret == BTREE_RETRY_READ) { +@@ -1719,10 +1731,16 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned sectors) + { + unsigned whiteout_u64s = 0; ++ struct printbuf buf = PRINTBUF; + int ret; + +- if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) +- return -1; ++ ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree, &buf); ++ ++ if (ret) ++ bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); ++ printbuf_exit(&buf); ++ if (ret) ++ return ret; + + ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 42ae3b0c5839..f3bf4281f2a0 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1167,7 +1167,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + { + struct bch_fs *c = as->c; + struct bkey_packed *k; +- const char *invalid; ++ struct printbuf buf = PRINTBUF; + + BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && + !btree_ptr_sectors_written(insert)); +@@ -1175,14 +1175,16 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); + +- invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: +- bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); +- if (invalid) { +- struct printbuf buf = PRINTBUF; +- ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), &buf) ?: ++ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { ++ printbuf_reset(&buf); ++ pr_buf(&buf, "inserting invalid bkey\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); +- bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf.buf, invalid); +- printbuf_exit(&buf); ++ pr_buf(&buf, "\n "); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), &buf); ++ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); ++ ++ bch2_fs_inconsistent(c, "%s", buf.buf); + dump_stack(); + } + +@@ -1202,6 +1204,8 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); + set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); ++ ++ printbuf_exit(&buf); + } + + static void +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index a0480c63dd81..a985f90db175 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -856,23 +856,31 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; ++ struct printbuf buf = PRINTBUF; + int ret, u64s_delta = 0; + + 
trans_for_each_update(trans, i) { +- const char *invalid = bch2_bkey_invalid(c, +- bkey_i_to_s_c(i->k), i->bkey_type); +- if (invalid) { +- struct printbuf buf = PRINTBUF; ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, &buf)) { ++ printbuf_reset(&buf); ++ pr_buf(&buf, "invalid bkey on insert from %s -> %ps", ++ trans->fn, (void *) i->ip_allocated); ++ pr_newline(&buf); ++ pr_indent_push(&buf, 2); + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); +- bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", +- buf.buf, trans->fn, (void *) i->ip_allocated, invalid); ++ pr_newline(&buf); ++ ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, &buf); ++ ++ bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + return -EINVAL; + } + btree_insert_entry_checks(trans, i); + } + ++ printbuf_exit(&buf); ++ + trans_for_each_update(trans, i) { + if (i->cached) + continue; +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index 656a04b558bc..85e86ded86af 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -9,6 +9,7 @@ + #define _BUCKETS_H + + #include "buckets_types.h" ++#include "extents.h" + #include "super.h" + + #define for_each_bucket(_b, _buckets) \ +@@ -83,8 +84,7 @@ static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, + static inline enum bch_data_type ptr_data_type(const struct bkey *k, + const struct bch_extent_ptr *ptr) + { +- if (k->type == KEY_TYPE_btree_ptr || +- k->type == KEY_TYPE_btree_ptr_v2) ++ if (bkey_is_btree_ptr(k)) + return BCH_DATA_btree; + + return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index 760e4f74715f..e8a284a69be4 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -83,38 +83,58 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { + .is_visible = dirent_is_visible, + }; + +-const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + unsigned len; + +- if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) +- return "value too small"; ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) { ++ pr_buf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*d.v)); ++ return -EINVAL; ++ } + + len = bch2_dirent_name_bytes(d); +- if (!len) +- return "empty name"; ++ if (!len) { ++ pr_buf(err, "empty name"); ++ return -EINVAL; ++ } + +- if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) +- return "value too big"; ++ if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { ++ pr_buf(err, "value too big (%zu > %u)", ++ bkey_val_u64s(k.k),dirent_val_u64s(len)); ++ return -EINVAL; ++ } + +- if (len > BCH_NAME_MAX) +- return "dirent name too big"; ++ if (len > BCH_NAME_MAX) { ++ pr_buf(err, "dirent name too big (%u > %lu)", ++ len, BCH_NAME_MAX); ++ return -EINVAL; ++ } + +- if (len == 1 && !memcmp(d.v->d_name, ".", 1)) +- return "invalid name"; ++ if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { ++ pr_buf(err, "invalid name"); ++ return -EINVAL; ++ } + +- if (len == 2 && !memcmp(d.v->d_name, "..", 2)) +- return "invalid name"; ++ if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { ++ pr_buf(err, "invalid name"); ++ return -EINVAL; ++ } + +- if (memchr(d.v->d_name, '/', len)) +- return "invalid name"; ++ if (memchr(d.v->d_name, '/', len)) { ++ pr_buf(err, "invalid name"); ++ return -EINVAL; ++ } + + if (d.v->d_type != DT_SUBVOL && +- 
le64_to_cpu(d.v->d_inum) == d.k->p.inode) +- return "dirent points to own directory"; ++ le64_to_cpu(d.v->d_inum) == d.k->p.inode) { ++ pr_buf(err, "dirent points to own directory"); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index 1bb4d802bc1d..046f297a4eff 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -6,7 +6,7 @@ + + extern const struct bch_hash_desc bch2_dirent_hash_desc; + +-const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_dirent (struct bkey_ops) { \ +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 616a551265e0..060a3c4e24e3 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -102,24 +102,34 @@ struct ec_bio { + + /* Stripes btree keys: */ + +-const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + +- if (!bkey_cmp(k.k->p, POS_MIN)) +- return "stripe at pos 0"; ++ if (!bkey_cmp(k.k->p, POS_MIN)) { ++ pr_buf(err, "stripe at POS_MIN"); ++ return -EINVAL; ++ } + +- if (k.k->p.inode) +- return "invalid stripe key"; ++ if (k.k->p.inode) { ++ pr_buf(err, "nonzero inode field"); ++ return -EINVAL; ++ } + +- if (bkey_val_bytes(k.k) < sizeof(*s)) +- return "incorrect value size"; ++ if (bkey_val_bytes(k.k) < sizeof(*s)) { ++ pr_buf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*s)); ++ return -EINVAL; ++ } + +- if (bkey_val_bytes(k.k) < sizeof(*s) || +- bkey_val_u64s(k.k) < stripe_val_u64s(s)) +- return "incorrect value size"; ++ if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { ++ pr_buf(err, "incorrect value size (%zu < %u)", ++ bkey_val_u64s(k.k), stripe_val_u64s(s)); ++ return -EINVAL; ++ } + +- return bch2_bkey_ptrs_invalid(c, k); ++ return bch2_bkey_ptrs_invalid(c, k, err); + } + + void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 9d508a2f3bbc..8e866460f8a0 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -6,7 +6,8 @@ + #include "buckets_types.h" + #include "keylist_types.h" + +-const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, ++ struct printbuf *); + void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 77a0d49a2372..0bb5d7770325 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -156,12 +156,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + + /* KEY_TYPE_btree_ptr: */ + +-const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) +- return "value too big"; ++ if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { ++ pr_buf(err, "value too big (%zu > %u)", ++ bkey_val_u64s(k.k), BCH_REPLICAS_MAX); ++ return -EINVAL; ++ } + +- return bch2_bkey_ptrs_invalid(c, k); ++ return bch2_bkey_ptrs_invalid(c, k, err); + } + + void 
bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, +@@ -170,21 +174,31 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + bch2_bkey_ptrs_to_text(out, c, k); + } + +-const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + +- if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) +- return "value too small"; ++ if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { ++ pr_buf(err, "value too small (%zu <= %zu)", ++ bkey_val_bytes(k.k), sizeof(*bp.v)); ++ return -EINVAL; ++ } + +- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) +- return "value too big"; ++ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { ++ pr_buf(err, "value too big (%zu > %zu)", ++ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); ++ return -EINVAL; ++ } + + if (c->sb.version < bcachefs_metadata_version_snapshot && +- bp.v->min_key.snapshot) +- return "invalid min_key.snapshot"; ++ bp.v->min_key.snapshot) { ++ pr_buf(err, "invalid min_key.snapshot (%u != 0)", ++ bp.v->min_key.snapshot); ++ return -EINVAL; ++ } + +- return bch2_bkey_ptrs_invalid(c, k); ++ return bch2_bkey_ptrs_invalid(c, k, err); + } + + void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, +@@ -220,17 +234,6 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, + + /* KEY_TYPE_extent: */ + +-const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) +-{ +- return bch2_bkey_ptrs_invalid(c, k); +-} +- +-void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, +- struct bkey_s_c k) +-{ +- bch2_bkey_ptrs_to_text(out, c, k); +-} +- + bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) + { + struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); +@@ -363,17 +366,24 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) + + /* KEY_TYPE_reservation: */ + +-const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + +- if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) +- return "incorrect value size"; ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { ++ pr_buf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(*r.v)); ++ return -EINVAL; ++ } + +- if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) +- return "invalid nr_replicas"; ++ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { ++ pr_buf(err, "invalid nr_replicas (%u)", ++ r.v->nr_replicas); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, +@@ -1001,69 +1011,86 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + } + } + +-static const char *extent_ptr_invalid(const struct bch_fs *c, +- struct bkey_s_c k, +- const struct bch_extent_ptr *ptr, +- unsigned size_ondisk, +- bool metadata) ++static int extent_ptr_invalid(const struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ unsigned size_ondisk, ++ bool metadata, ++ struct printbuf *err) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr2; ++ u64 bucket; ++ u32 bucket_offset; + struct bch_dev *ca; + +- if (!bch2_dev_exists2(c, 
ptr->dev)) +- return "pointer to invalid device"; ++ if (!bch2_dev_exists2(c, ptr->dev)) { ++ pr_buf(err, "pointer to invalid device (%u)", ptr->dev); ++ return -EINVAL; ++ } + + ca = bch_dev_bkey_exists(c, ptr->dev); +- if (!ca) +- return "pointer to invalid device"; +- + bkey_for_each_ptr(ptrs, ptr2) +- if (ptr != ptr2 && ptr->dev == ptr2->dev) +- return "multiple pointers to same device"; ++ if (ptr != ptr2 && ptr->dev == ptr2->dev) { ++ pr_buf(err, "multiple pointers to same device (%u)", ptr->dev); ++ return -EINVAL; ++ } + +- if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) +- return "offset past end of device"; ++ bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + +- if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) +- return "offset before first bucket"; ++ if (bucket >= ca->mi.nbuckets) { ++ pr_buf(err, "pointer past last bucket (%llu > %llu)", ++ bucket, ca->mi.nbuckets); ++ return -EINVAL; ++ } + +- if (bucket_remainder(ca, ptr->offset) + +- size_ondisk > ca->mi.bucket_size) +- return "spans multiple buckets"; ++ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { ++ pr_buf(err, "pointer before first bucket (%llu < %u)", ++ bucket, ca->mi.first_bucket); ++ return -EINVAL; ++ } + +- return NULL; ++ if (bucket_offset + size_ondisk > ca->mi.bucket_size) { ++ pr_buf(err, "pointer spans multiple buckets (%u + %u > %u)", ++ bucket_offset, size_ondisk, ca->mi.bucket_size); ++ return -EINVAL; ++ } ++ ++ return 0; + } + +-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- struct bch_devs_list devs; + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + unsigned size_ondisk = k.k->size; +- const char *reason; + unsigned nonce = UINT_MAX; +- unsigned i; ++ int ret; + +- if (k.k->type == KEY_TYPE_btree_ptr || +- k.k->type == KEY_TYPE_btree_ptr_v2) ++ if (bkey_is_btree_ptr(k.k)) + size_ondisk = btree_sectors(c); + + bkey_extent_entry_for_each(ptrs, entry) { +- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) +- return "invalid extent entry type"; ++ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { ++ pr_buf(err, "invalid extent entry type (got %u, max %u)", ++ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); ++ return -EINVAL; ++ } + +- if (k.k->type == KEY_TYPE_btree_ptr && +- !extent_entry_is_ptr(entry)) +- return "has non ptr field"; ++ if (bkey_is_btree_ptr(k.k) && ++ !extent_entry_is_ptr(entry)) { ++ pr_buf(err, "has non ptr field"); ++ return -EINVAL; ++ } + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: +- reason = extent_ptr_invalid(c, k, &entry->ptr, +- size_ondisk, false); +- if (reason) +- return reason; ++ ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, ++ false, err); ++ if (ret) ++ return ret; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: +@@ -1071,22 +1098,30 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + if (crc.offset + crc.live_size > +- crc.uncompressed_size) +- return "checksum offset + key size > uncompressed size"; ++ crc.uncompressed_size) { ++ pr_buf(err, "checksum offset + key size > uncompressed size"); ++ return -EINVAL; ++ } + + size_ondisk = crc.compressed_size; + +- if (!bch2_checksum_type_valid(c, crc.csum_type)) +- return "invalid 
checksum type"; ++ if (!bch2_checksum_type_valid(c, crc.csum_type)) { ++ pr_buf(err, "invalid checksum type"); ++ return -EINVAL; ++ } + +- if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) +- return "invalid compression type"; ++ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { ++ pr_buf(err, "invalid compression type"); ++ return -EINVAL; ++ } + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; +- else if (nonce != crc.offset + crc.nonce) +- return "incorrect nonce"; ++ else if (nonce != crc.offset + crc.nonce) { ++ pr_buf(err, "incorrect nonce"); ++ return -EINVAL; ++ } + } + break; + case BCH_EXTENT_ENTRY_stripe_ptr: +@@ -1094,13 +1129,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) + } + } + +- devs = bch2_bkey_devs(k); +- bubble_sort(devs.devs, devs.nr, u8_cmp); +- for (i = 0; i + 1 < devs.nr; i++) +- if (devs.devs[i] == devs.devs[i + 1]) +- return "multiple ptrs to same device"; +- +- return NULL; ++ return 0; + } + + void bch2_ptr_swab(struct bkey_s k) +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index ae650849d98a..21f79e663c74 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -367,13 +367,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + + /* KEY_TYPE_btree_ptr: */ + +-const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +-const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c); +-void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, +- struct bkey_s_c); ++int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); + +@@ -396,13 +395,11 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + + /* KEY_TYPE_extent: */ + +-const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); +-void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + + #define bch2_bkey_ops_extent (struct bkey_ops) { \ +- .key_invalid = bch2_extent_invalid, \ +- .val_to_text = bch2_extent_to_text, \ ++ .key_invalid = bch2_bkey_ptrs_invalid, \ ++ .val_to_text = bch2_bkey_ptrs_to_text, \ + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_extent_normalize, \ + .key_merge = bch2_extent_merge, \ +@@ -412,7 +409,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + + /* KEY_TYPE_reservation: */ + +-const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +@@ -618,7 +615,7 @@ bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, + bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); + void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_bkey_ptrs_invalid(const struct 
bch_fs *, struct bkey_s_c, struct printbuf *); + + void bch2_ptr_swab(struct bkey_s); + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 14b0b595202d..47dfde1910c1 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -293,76 +293,89 @@ int bch2_inode_write(struct btree_trans *trans, + return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + } + +-const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) ++static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) + { +- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bch_inode_unpacked unpacked; + +- if (k.k->p.inode) +- return "nonzero k.p.inode"; +- +- if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) +- return "incorrect value size"; +- +- if (k.k->p.offset < BLOCKDEV_INODE_MAX) +- return "fs inode in blockdev range"; ++ if (k.k->p.inode) { ++ pr_buf(err, "nonzero k.p.inode"); ++ return -EINVAL; ++ } + +- if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) +- return "invalid str hash type"; ++ if (k.k->p.offset < BLOCKDEV_INODE_MAX) { ++ pr_buf(err, "fs inode in blockdev range"); ++ return -EINVAL; ++ } + +- if (bch2_inode_unpack(k, &unpacked)) +- return "invalid variable length fields"; ++ if (bch2_inode_unpack(k, &unpacked)){ ++ pr_buf(err, "invalid variable length fields"); ++ return -EINVAL; ++ } + +- if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) +- return "invalid data checksum type"; ++ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { ++ pr_buf(err, "invalid data checksum type (%u >= %u", ++ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); ++ return -EINVAL; ++ } + +- if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) +- return "invalid data checksum type"; ++ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { ++ pr_buf(err, "invalid data checksum type (%u >= %u)", ++ unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); ++ return -EINVAL; ++ } + + if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && +- unpacked.bi_nlink != 0) +- return "flagged as unlinked but bi_nlink != 0"; ++ unpacked.bi_nlink != 0) { ++ pr_buf(err, "flagged as unlinked but bi_nlink != 0"); ++ return -EINVAL; ++ } + +- if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) +- return "subvolume root but not a directory"; ++ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { ++ pr_buf(err, "subvolume root but not a directory"); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + +-const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); +- struct bch_inode_unpacked unpacked; +- +- if (k.k->p.inode) +- return "nonzero k.p.inode"; +- +- if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) +- return "incorrect value size"; +- +- if (k.k->p.offset < BLOCKDEV_INODE_MAX) +- return "fs inode in blockdev range"; ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + +- if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) +- return "invalid str hash type"; ++ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { ++ pr_buf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*inode.v)); ++ return -EINVAL; ++ } + +- if (bch2_inode_unpack(k, &unpacked)) +- return "invalid variable length fields"; ++ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { ++ pr_buf(err, "invalid str hash type (%llu >= %u)", ++ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); ++ return -EINVAL; ++ } + +- if 
(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) +- return "invalid data checksum type"; ++ return __bch2_inode_invalid(k, err); ++} + +- if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) +- return "invalid data checksum type"; ++int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) ++{ ++ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + +- if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && +- unpacked.bi_nlink != 0) +- return "flagged as unlinked but bi_nlink != 0"; ++ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { ++ pr_buf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*inode.v)); ++ return -EINVAL; ++ } + +- if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) +- return "subvolume root but not a directory"; ++ if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { ++ pr_buf(err, "invalid str hash type (%llu >= %u)", ++ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); ++ return -EINVAL; ++ } + +- return NULL; ++ return __bch2_inode_invalid(k, err); + } + + static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +@@ -396,16 +409,21 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, + __bch2_inode_unpacked_to_text(out, &inode); + } + +-const char *bch2_inode_generation_invalid(const struct bch_fs *c, +- struct bkey_s_c k) ++int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- if (k.k->p.inode) +- return "nonzero k.p.inode"; ++ if (k.k->p.inode) { ++ pr_buf(err, "nonzero k.p.inode"); ++ return -EINVAL; ++ } + +- if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) +- return "incorrect value size"; ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) { ++ pr_buf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_inode_generation)); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 2337ecfc600e..e3418dc4a1e9 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -6,8 +6,8 @@ + + extern const char * const bch2_inode_opts[]; + +-const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); +-const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_inode (struct bkey_ops) { \ +@@ -30,10 +30,8 @@ static inline bool bkey_is_inode(const struct bkey *k) + k->type == KEY_TYPE_inode_v2; + } + +-const char *bch2_inode_generation_invalid(const struct bch_fs *, +- struct bkey_s_c); +-void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, +- struct bkey_s_c); ++int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ + .key_invalid = bch2_inode_generation_invalid, \ +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index e61b88930a7f..af7225bdaaab 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -210,7 +210,7 @@ static int journal_validate_key(struct 
bch_fs *c, const char *where, + unsigned version, int big_endian, int write) + { + void *next = vstruct_next(entry); +- const char *invalid; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (journal_entry_err_on(!k->k.u64s, c, +@@ -250,22 +250,28 @@ static int journal_validate_key(struct bch_fs *c, const char *where, + bch2_bkey_compat(level, btree_id, version, big_endian, + write, NULL, bkey_to_packed(k)); + +- invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), +- __btree_node_type(level, btree_id)); +- if (invalid) { +- struct printbuf buf = PRINTBUF; ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), ++ __btree_node_type(level, btree_id), &buf)) { ++ printbuf_reset(&buf); ++ pr_buf(&buf, "invalid %s in %s entry offset %zi/%u:", ++ type, where, ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s)); ++ pr_newline(&buf); ++ pr_indent_push(&buf, 2); + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); +- mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", +- type, where, +- (u64 *) k - entry->_data, +- le16_to_cpu(entry->u64s), +- invalid, buf.buf); +- printbuf_exit(&buf); ++ pr_newline(&buf); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(k), ++ __btree_node_type(level, btree_id), &buf); ++ ++ mustfix_fsck_err(c, "%s", buf.buf); + + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); ++ ++ printbuf_exit(&buf); + return FSCK_DELETED_KEY; + } + +@@ -273,6 +279,7 @@ static int journal_validate_key(struct bch_fs *c, const char *where, + bch2_bkey_compat(level, btree_id, version, big_endian, + write, NULL, bkey_to_packed(k)); + fsck_err: ++ printbuf_exit(&buf); + return ret; + } + +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +index 4f0e6960e597..c20a3bc2336b 100644 +--- a/fs/bcachefs/lru.c ++++ b/fs/bcachefs/lru.c +@@ -8,14 +8,18 @@ + #include "lru.h" + #include "recovery.h" + +-const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + +- if (bkey_val_bytes(k.k) < sizeof(*lru)) +- return "incorrect value size"; ++ if (bkey_val_bytes(k.k) < sizeof(*lru)) { ++ pr_buf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*lru)); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h +index 4db6a8399332..0af62ecf6638 100644 +--- a/fs/bcachefs/lru.h ++++ b/fs/bcachefs/lru.h +@@ -2,7 +2,7 @@ + #ifndef _BCACHEFS_LRU_H + #define _BCACHEFS_LRU_H + +-const char *bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_lru (struct bkey_ops) { \ +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index ca029a00e7b8..5f370da2f3d2 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -57,15 +57,22 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { + .to_text = bch2_sb_quota_to_text, + }; + +-const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- if (k.k->p.inode >= QTYP_NR) +- return "invalid quota type"; ++ if (k.k->p.inode >= QTYP_NR) { ++ pr_buf(err, 
"invalid quota type (%llu >= %u)", ++ k.k->p.inode, QTYP_NR); ++ return -EINVAL; ++ } + +- if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) +- return "incorrect value size"; ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) { ++ pr_buf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_quota)); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h +index 51e4f9713ef0..4ba40fce39a8 100644 +--- a/fs/bcachefs/quota.h ++++ b/fs/bcachefs/quota.h +@@ -7,7 +7,7 @@ + + extern const struct bch_sb_field_ops bch_sb_field_ops_quota; + +-const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_quota (struct bkey_ops) { \ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 6824730945d4..e07f0339d87e 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -25,18 +25,25 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) + + /* reflink pointers */ + +-const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + +- if (bkey_val_bytes(p.k) != sizeof(*p.v)) +- return "incorrect value size"; ++ if (bkey_val_bytes(p.k) != sizeof(*p.v)) { ++ pr_buf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(p.k), sizeof(*p.v)); ++ return -EINVAL; ++ } + + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && +- le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) +- return "idx < front_pad"; ++ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { ++ pr_buf(err, "idx < front_pad (%llu < %u)", ++ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, +@@ -70,14 +77,18 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r + + /* indirect extents */ + +-const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + +- if (bkey_val_bytes(r.k) < sizeof(*r.v)) +- return "incorrect value size"; ++ if (bkey_val_bytes(r.k) < sizeof(*r.v)) { ++ pr_buf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(r.k), sizeof(*r.v)); ++ return -EINVAL; ++ } + +- return bch2_bkey_ptrs_invalid(c, k); ++ return bch2_bkey_ptrs_invalid(c, k, err); + } + + void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, +@@ -118,12 +129,16 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, + + /* indirect inline data */ + +-const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, +- struct bkey_s_c k) ++int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) +- return "incorrect value size"; +- return NULL; ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { ++ pr_buf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), 
sizeof(struct bch_indirect_inline_data)); ++ return -EINVAL; ++ } ++ ++ return 0; + } + + void bch2_indirect_inline_data_to_text(struct printbuf *out, +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index 8eb41c0292eb..d292761f8a98 100644 +--- a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -2,7 +2,7 @@ + #ifndef _BCACHEFS_REFLINK_H + #define _BCACHEFS_REFLINK_H + +-const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); +@@ -15,7 +15,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + .atomic_trigger = bch2_mark_reflink_p, \ + } + +-const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, +@@ -29,8 +29,8 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, + .atomic_trigger = bch2_mark_extent, \ + } + +-const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, +- struct bkey_s_c); ++int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, ++ struct printbuf *); + void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); + int bch2_trans_mark_indirect_inline_data(struct btree_trans *, +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index cdb89ba216cc..f789e3d9ac1b 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -26,39 +26,55 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + le32_to_cpu(s.v->subvol)); + } + +-const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { + struct bkey_s_c_snapshot s; + u32 i, id; + + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || +- bkey_cmp(k.k->p, POS(0, 1)) < 0) +- return "bad pos"; ++ bkey_cmp(k.k->p, POS(0, 1)) < 0) { ++ pr_buf(err, "bad pos"); ++ return -EINVAL; ++ } + +- if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) +- return "bad val size"; ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) { ++ pr_buf(err, "bad val size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_snapshot)); ++ return -EINVAL; ++ } + + s = bkey_s_c_to_snapshot(k); + + id = le32_to_cpu(s.v->parent); +- if (id && id <= k.k->p.offset) +- return "bad parent node"; ++ if (id && id <= k.k->p.offset) { ++ pr_buf(err, "bad parent node (%u <= %llu)", ++ id, k.k->p.offset); ++ return -EINVAL; ++ } + +- if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) +- return "children not normalized"; ++ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { ++ pr_buf(err, "children not normalized"); ++ return -EINVAL; ++ } + + if (s.v->children[0] && +- s.v->children[0] == s.v->children[1]) +- return "duplicate child nodes"; ++ s.v->children[0] == s.v->children[1]) { ++ pr_buf(err, "duplicate child nodes"); ++ return -EINVAL; ++ } + + for (i = 0; i < 2; i++) { + id = le32_to_cpu(s.v->children[i]); + +- if (id >= k.k->p.offset) +- return "bad child node"; ++ if (id >= k.k->p.offset) { ++ pr_buf(err, "bad 
child node (%u >= %llu)", ++ id, k.k->p.offset); ++ return -EINVAL; ++ } + } + +- return NULL; ++ return 0; + } + + int bch2_mark_snapshot(struct btree_trans *trans, +@@ -729,18 +745,22 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + + /* Subvolumes: */ + +-const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { +- if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0) +- return "invalid pos"; +- +- if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) +- return "invalid pos"; ++ if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 || ++ bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) { ++ pr_buf(err, "invalid pos"); ++ return -EINVAL; ++ } + +- if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) +- return "bad val size"; ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) { ++ pr_buf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_subvolume)); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index f609291acafa..4aed8b5332f6 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -6,7 +6,7 @@ + #include "subvolume_types.h" + + void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +-const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + + #define bch2_bkey_ops_snapshot (struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ +@@ -96,7 +96,7 @@ int bch2_fs_snapshots_check(struct bch_fs *); + void bch2_fs_snapshots_exit(struct bch_fs *); + int bch2_fs_snapshots_start(struct bch_fs *); + +-const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_subvolume (struct bkey_ops) { \ +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 8d23b4c2449e..1cf1269ba44f 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -69,32 +69,51 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { + .cmp_bkey = xattr_cmp_bkey, + }; + +-const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) ++int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ struct printbuf *err) + { + const struct xattr_handler *handler; + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); + +- if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) +- return "value too small"; ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) { ++ pr_buf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*xattr.v)); ++ return -EINVAL; ++ } + + if (bkey_val_u64s(k.k) < + xattr_val_u64s(xattr.v->x_name_len, +- le16_to_cpu(xattr.v->x_val_len))) +- return "value too small"; ++ le16_to_cpu(xattr.v->x_val_len))) { ++ pr_buf(err, "value too small (%zu < %u)", ++ bkey_val_u64s(k.k), ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len))); ++ return -EINVAL; ++ } + ++ /* XXX why +4 ? 
*/ + if (bkey_val_u64s(k.k) > + xattr_val_u64s(xattr.v->x_name_len, +- le16_to_cpu(xattr.v->x_val_len) + 4)) +- return "value too big"; ++ le16_to_cpu(xattr.v->x_val_len) + 4)) { ++ pr_buf(err, "value too big (%zu > %u)", ++ bkey_val_u64s(k.k), ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len) + 4)); ++ return -EINVAL; ++ } + + handler = bch2_xattr_type_to_handler(xattr.v->x_type); +- if (!handler) +- return "invalid type"; ++ if (!handler) { ++ pr_buf(err, "invalid type (%u)", xattr.v->x_type); ++ return -EINVAL; ++ } + +- if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) +- return "xattr name has invalid characters"; ++ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { ++ pr_buf(err, "xattr name has invalid characters"); ++ return -EINVAL; ++ } + +- return NULL; ++ return 0; + } + + void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h +index f4f896545e1c..3fd03018fdd8 100644 +--- a/fs/bcachefs/xattr.h ++++ b/fs/bcachefs/xattr.h +@@ -6,7 +6,7 @@ + + extern const struct bch_hash_desc bch2_xattr_hash_desc; + +-const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); ++int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); + void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_xattr (struct bkey_ops) { \ +-- +cgit v1.2.3 + + +From 069f9fbf8c5bca5865b70f3192b70e2499118de3 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 4 Apr 2022 14:25:45 -0400 +Subject: fixup! bcachefs: Change flags param to bch2_btree_delete_range to + update_flags + +--- + fs/bcachefs/btree_update_leaf.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index a985f90db175..e264fcb8bb50 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -1703,10 +1703,9 @@ retry: + break; + } + +- ret = bch2_trans_update(trans, &iter, &delete, 0) ?: ++ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: + bch2_trans_commit(trans, &disk_res, journal_seq, +- BTREE_INSERT_NOFAIL| +- update_flags); ++ BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(trans->c, &disk_res); + if (ret) + break; +-- +cgit v1.2.3 + + +From 730b24fe8f5548a478042964e59bc626164c247a Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 4 Apr 2022 23:33:51 -0400 +Subject: fixup! 
bcachefs: Gap buffer for journal keys + +--- + fs/bcachefs/recovery.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 6a92c1a05a0a..df20f0707b2d 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -471,7 +471,7 @@ void bch2_journal_keys_free(struct journal_keys *keys) + + kvfree(keys->d); + keys->d = NULL; +- keys->nr = 0; ++ keys->nr = keys->gap = keys->size = 0; + } + + static struct journal_keys journal_keys_sort(struct list_head *journal_entries) +-- +cgit v1.2.3 + + +From 20d1fcb8f3227a96693b3ba35ea7012c30d48aa0 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 4 Apr 2022 23:36:56 -0400 +Subject: bcachefs: Silence spurious copygc err when shutting down + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/movinggc.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index cb6b81678ecc..dd1bf6651b93 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -290,10 +290,10 @@ static int bch2_copygc(struct bch_fs *c) + writepoint_ptr(&c->copygc_write_point), + copygc_pred, NULL, + &move_stats); +- if (ret) { ++ if (ret < 0) + bch_err(c, "error %i from bch2_move_data() in copygc", ret); ++ if (ret) + return ret; +- } + + ret = check_copygc_was_done(c, §ors_not_moved, &buckets_not_moved); + if (ret) { +-- +cgit v1.2.3 + + +From 62c45841258f75b4bdd61c55543f170f6c5f6aa1 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 4 Apr 2022 22:25:01 -0400 +Subject: bcachefs: Defer checking of alloc -> lru refs until after RW + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 163 ++++++++++++++++++++++++++--------------- + fs/bcachefs/alloc_background.h | 1 + + fs/bcachefs/recovery.c | 13 ++++ + 3 files changed, 118 insertions(+), 59 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 193a21395921..70d608a0e420 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -577,7 +577,7 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + struct btree_iter *alloc_iter) + { + struct bch_fs *c = trans->c; +- struct btree_iter discard_iter, freespace_iter, lru_iter; ++ struct btree_iter discard_iter, freespace_iter; + struct bch_alloc_v4 a; + unsigned discard_key_type, freespace_key_type; + struct bkey_s_c alloc_k, k; +@@ -603,8 +603,6 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + alloc_k.k->p, 0); + bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, + alloc_freespace_pos(alloc_k.k->p, a), 0); +- bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, +- POS(alloc_k.k->p.inode, a.io_time[READ]), 0); + + k = bch2_btree_iter_peek_slot(&discard_iter); + ret = bkey_err(k); +@@ -628,8 +626,7 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + update->k.type = discard_key_type; + update->k.p = discard_iter.pos; + +- ret = bch2_trans_update(trans, &discard_iter, update, 0) ?: +- bch2_trans_commit(trans, NULL, NULL, 0); ++ ret = bch2_trans_update(trans, &discard_iter, update, 0); + if (ret) + goto err; + } +@@ -658,65 +655,12 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + update->k.p = freespace_iter.pos; + bch2_key_resize(&update->k, 1); + +- ret = bch2_trans_update(trans, &freespace_iter, update, 0) ?: +- bch2_trans_commit(trans, NULL, NULL, 0); +- if (ret) +- goto err; +- } +- +- if (bucket_state(a) == BUCKET_cached) { +- k = 
bch2_btree_iter_peek_slot(&lru_iter); +- ret = bkey_err(k); ++ ret = bch2_trans_update(trans, &freespace_iter, update, 0); + if (ret) + goto err; +- +- if (fsck_err_on(!a.io_time[READ], c, +- "cached bucket with read_time 0\n" +- " %s", +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || +- fsck_err_on(k.k->type != KEY_TYPE_lru || +- le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, +- "incorrect/missing lru entry\n" +- " %s\n" +- " %s", +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), +- (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { +- u64 read_time = a.io_time[READ]; +- +- if (!a.io_time[READ]) +- a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); +- +- ret = bch2_lru_change(trans, +- alloc_k.k->p.inode, +- alloc_k.k->p.offset, +- 0, &a.io_time[READ]); +- if (ret) +- goto err; +- +- if (a.io_time[READ] != read_time) { +- struct bkey_i_alloc_v4 *a_mut = +- bch2_alloc_to_v4_mut(trans, alloc_k); +- ret = PTR_ERR_OR_ZERO(a_mut); +- if (ret) +- goto err; +- +- a_mut->v.io_time[READ] = a.io_time[READ]; +- ret = bch2_trans_update(trans, alloc_iter, +- &a_mut->k_i, BTREE_TRIGGER_NORUN); +- if (ret) +- goto err; +- } +- +- ret = bch2_trans_commit(trans, NULL, NULL, 0); +- if (ret) +- goto err; +- } + } + err: + fsck_err: +- bch2_trans_iter_exit(trans, &lru_iter); + bch2_trans_iter_exit(trans, &freespace_iter); + bch2_trans_iter_exit(trans, &discard_iter); + printbuf_exit(&buf2); +@@ -852,6 +796,107 @@ err: + return ret < 0 ? ret : 0; + } + ++static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, ++ struct btree_iter *alloc_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter lru_iter; ++ struct bch_alloc_v4 a; ++ struct bkey_s_c alloc_k, k; ++ struct printbuf buf = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ int ret; ++ ++ alloc_k = bch2_btree_iter_peek(alloc_iter); ++ if (!alloc_k.k) ++ return 0; ++ ++ ret = bkey_err(alloc_k); ++ if (ret) ++ return ret; ++ ++ bch2_alloc_to_v4(alloc_k, &a); ++ ++ if (bucket_state(a) != BUCKET_cached) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, ++ POS(alloc_k.k->p.inode, a.io_time[READ]), 0); ++ ++ k = bch2_btree_iter_peek_slot(&lru_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(!a.io_time[READ], c, ++ "cached bucket with read_time 0\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || ++ fsck_err_on(k.k->type != KEY_TYPE_lru || ++ le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, ++ "incorrect/missing lru entry\n" ++ " %s\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ u64 read_time = a.io_time[READ]; ++ ++ if (!a.io_time[READ]) ++ a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ ++ ret = bch2_lru_change(trans, ++ alloc_k.k->p.inode, ++ alloc_k.k->p.offset, ++ 0, &a.io_time[READ]); ++ if (ret) ++ goto err; ++ ++ if (a.io_time[READ] != read_time) { ++ struct bkey_i_alloc_v4 *a_mut = ++ bch2_alloc_to_v4_mut(trans, alloc_k); ++ ret = PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ goto err; ++ ++ a_mut->v.io_time[READ] = a.io_time[READ]; ++ ret = bch2_trans_update(trans, alloc_iter, ++ &a_mut->k_i, BTREE_TRIGGER_NORUN); ++ if (ret) ++ goto err; ++ } ++ } ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &lru_iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_check_alloc_to_lru_refs(struct bch_fs *c) 
++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_check_alloc_to_lru_ref(&trans, &iter)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? ret : 0; ++} ++ + static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + struct bch_dev *ca, bool *discard_done) + { +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 85a807146143..93bd8feb9ebc 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -114,6 +114,7 @@ int bch2_alloc_read(struct bch_fs *); + int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); + int bch2_check_alloc_info(struct bch_fs *, bool); ++int bch2_check_alloc_to_lru_refs(struct bch_fs *); + void bch2_do_discards(struct bch_fs *); + + static inline bool should_invalidate_buckets(struct bch_dev *ca) +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index df20f0707b2d..99b7b2b26573 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1276,6 +1276,19 @@ use_clean: + if (ret) + goto err; + ++ if (c->opts.fsck) { ++ bch_info(c, "checking alloc to lru refs"); ++ err = "error checking alloc to lru refs"; ++ ret = bch2_check_alloc_to_lru_refs(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_check_lrus(c, true); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking alloc to lru refs"); ++ } ++ + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + bch2_fs_lazy_rw(c); + +-- +cgit v1.2.3 + + +From e7c5e7f7c3b16c6404dc5ad22b41113f63120435 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Tue, 5 Apr 2022 13:44:18 -0400 +Subject: bcachefs: More improvements for alloc info checks + + - Move checks for whether the device & bucket are valid from the + .key_invalid method to bch2_check_alloc_key(). This is because + .key_invalid() is called on keys that may no longer exist (post + journal replay), which is a problem when removing/resizing devices. + + - We weren't checking the need_discard btree to ensure that every set + bucket has a corresponding alloc key. This refactors the code for + checking the freespace btree, so that it now checks both. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 142 +++++++++++++++++------------------------ + fs/bcachefs/alloc_background.h | 14 +++- + fs/bcachefs/buckets.c | 13 ++-- + fs/bcachefs/recovery.c | 2 +- + fs/bcachefs/super.c | 9 --- + 5 files changed, 81 insertions(+), 99 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 70d608a0e420..0cdc28a2385a 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -306,11 +306,6 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin + { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + +- if (!bch2_dev_exists2(c, k.k->p.inode)) { +- pr_buf(err, "invalid device (%llu)", k.k->p.inode); +- return -EINVAL; +- } +- + /* allow for unknown fields */ + if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { + pr_buf(err, "incorrect value size (%zu < %u)", +@@ -325,11 +320,6 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin + { + struct bkey_alloc_unpacked u; + +- if (!bch2_dev_exists2(c, k.k->p.inode)) { +- pr_buf(err, "invalid device (%llu)", k.k->p.inode); +- return -EINVAL; +- } +- + if (bch2_alloc_unpack_v2(&u, k)) { + pr_buf(err, "unpack error"); + return -EINVAL; +@@ -341,20 +331,6 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin + int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) + { + struct bkey_alloc_unpacked u; +- struct bch_dev *ca; +- +- if (!bch2_dev_exists2(c, k.k->p.inode)) { +- pr_buf(err, "invalid device (%llu)", k.k->p.inode); +- return -EINVAL; +- } +- +- ca = bch_dev_bkey_exists(c, k.k->p.inode); +- +- if (k.k->p.offset < ca->mi.first_bucket || +- k.k->p.offset >= ca->mi.nbuckets) { +- pr_buf(err, "invalid bucket"); +- return -EINVAL; +- } + + if (bch2_alloc_unpack_v3(&u, k)) { + pr_buf(err, "unpack error"); +@@ -366,18 +342,9 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin + + int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) + { +- struct bch_dev *ca; +- +- if (!bch2_dev_exists2(c, k.k->p.inode)) { +- pr_buf(err, "invalid device (%llu)", k.k->p.inode); +- return -EINVAL; +- } +- +- ca = bch_dev_bkey_exists(c, k.k->p.inode); +- +- if (k.k->p.offset < ca->mi.first_bucket || +- k.k->p.offset >= ca->mi.nbuckets) { +- pr_buf(err, "invalid bucket"); ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) { ++ pr_buf(err, "bad val size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4)); + return -EINVAL; + } + +@@ -577,6 +544,7 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + struct btree_iter *alloc_iter) + { + struct bch_fs *c = trans->c; ++ struct bch_dev *ca; + struct btree_iter discard_iter, freespace_iter; + struct bch_alloc_v4 a; + unsigned discard_key_type, freespace_key_type; +@@ -593,7 +561,16 @@ static int bch2_check_alloc_key(struct btree_trans *trans, + if (ret) + return ret; + ++ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, ++ "alloc key for invalid device or bucket")) ++ return bch2_btree_delete_at(trans, alloc_iter, 0); ++ ++ ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); ++ if (!ca->mi.freespace_initialized) ++ return 0; ++ + bch2_alloc_to_v4(alloc_k, &a); ++ + discard_key_type = bucket_state(a) == BUCKET_need_discard + ? 
KEY_TYPE_set : 0; + freespace_key_type = bucket_state(a) == BUCKET_free +@@ -668,21 +645,8 @@ fsck_err: + return ret; + } + +-static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) +-{ +- struct bch_dev *ca; +- +- if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) +- return false; +- +- ca = bch_dev_bkey_exists(c, pos.inode); +- return pos.offset >= ca->mi.first_bucket && +- pos.offset < ca->mi.nbuckets; +-} +- +-static int bch2_check_freespace_key(struct btree_trans *trans, +- struct btree_iter *freespace_iter, +- bool initial) ++static int bch2_check_discard_freespace_key(struct btree_trans *trans, ++ struct btree_iter *iter) + { + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter; +@@ -691,10 +655,13 @@ static int bch2_check_freespace_key(struct btree_trans *trans, + u64 genbits; + struct bpos pos; + struct bkey_i *update; ++ enum bucket_state state = iter->btree_id == BTREE_ID_need_discard ++ ? BUCKET_need_discard ++ : BUCKET_free; + struct printbuf buf = PRINTBUF; + int ret; + +- freespace_k = bch2_btree_iter_peek(freespace_iter); ++ freespace_k = bch2_btree_iter_peek(iter); + if (!freespace_k.k) + return 1; + +@@ -702,15 +669,16 @@ static int bch2_check_freespace_key(struct btree_trans *trans, + if (ret) + return ret; + +- pos = freespace_iter->pos; ++ pos = iter->pos; + pos.offset &= ~(~0ULL << 56); +- genbits = freespace_iter->pos.offset & (~0ULL << 56); ++ genbits = iter->pos.offset & (~0ULL << 56); + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + + if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, +- "%llu:%llu set in freespace btree but device or bucket does not exist", +- pos.inode, pos.offset)) ++ "%llu:%llu set in %s btree but device or bucket does not exist", ++ pos.inode, pos.offset, ++ bch2_btree_ids[iter->btree_id])) + goto delete; + + k = bch2_btree_iter_peek_slot(&alloc_iter); +@@ -720,11 +688,13 @@ static int bch2_check_freespace_key(struct btree_trans *trans, + + bch2_alloc_to_v4(k, &a); + +- if (fsck_err_on(bucket_state(a) != BUCKET_free || +- genbits != alloc_freespace_genbits(a), c, +- "%s\n incorrectly set in freespace index (free %u, genbits %llu should be %llu)", ++ if (fsck_err_on(bucket_state(a) != state || ++ (state == BUCKET_free && ++ genbits != alloc_freespace_genbits(a)), c, ++ "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), +- bucket_state(a) == BUCKET_free, ++ bch2_btree_ids[iter->btree_id], ++ bucket_state(a) == state, + genbits >> 56, alloc_freespace_genbits(a) >> 56)) + goto delete; + out: +@@ -734,46 +704,54 @@ fsck_err: + printbuf_exit(&buf); + return ret; + delete: +- update = bch2_trans_kmalloc(trans, sizeof(*update)); +- ret = PTR_ERR_OR_ZERO(update); +- if (ret) +- goto err; ++ if (iter->btree_id == BTREE_ID_freespace) { ++ /* should probably add a helper for deleting extents */ ++ update = bch2_trans_kmalloc(trans, sizeof(*update)); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; + +- bkey_init(&update->k); +- update->k.p = freespace_iter->pos; +- bch2_key_resize(&update->k, 1); ++ bkey_init(&update->k); ++ update->k.p = iter->pos; ++ bch2_key_resize(&update->k, 1); + +- ret = bch2_trans_update(trans, freespace_iter, update, 0) ?: +- bch2_trans_commit(trans, NULL, NULL, 0); ++ ret = bch2_trans_update(trans, iter, update, 0); ++ } else { ++ ret = bch2_btree_delete_at(trans, iter, 0); ++ } + goto out; + } + +-int bch2_check_alloc_info(struct bch_fs *c, bool initial) ++int 
bch2_check_alloc_info(struct bch_fs *c) + { + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; +- int ret = 0, last_dev = -1; ++ int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { +- if (k.k->p.inode != last_dev) { +- struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); +- +- if (!ca->mi.freespace_initialized) { +- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); +- continue; +- } ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_check_alloc_key(&trans, &iter)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); + +- last_dev = k.k->p.inode; +- } ++ if (ret) ++ goto err; + ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ while (1) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_check_alloc_key(&trans, &iter)); ++ bch2_check_discard_freespace_key(&trans, &iter)); + if (ret) + break; ++ ++ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + } + bch2_trans_iter_exit(&trans, &iter); + +@@ -784,7 +762,7 @@ int bch2_check_alloc_info(struct bch_fs *c, bool initial) + BTREE_ITER_PREFETCH); + while (1) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, +- bch2_check_freespace_key(&trans, &iter, initial)); ++ bch2_check_discard_freespace_key(&trans, &iter)); + if (ret) + break; + +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 93bd8feb9ebc..7ca5bfd37027 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -11,6 +11,18 @@ + /* How out of date a pointer gen is allowed to be: */ + #define BUCKET_GC_GEN_MAX 96U + ++static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) ++{ ++ struct bch_dev *ca; ++ ++ if (!bch2_dev_exists2(c, pos.inode)) ++ return false; ++ ++ ca = bch_dev_bkey_exists(c, pos.inode); ++ return pos.offset >= ca->mi.first_bucket && ++ pos.offset < ca->mi.nbuckets; ++} ++ + static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) + { + return a.gen - a.oldest_gen; +@@ -113,7 +125,7 @@ int bch2_alloc_read(struct bch_fs *); + + int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); +-int bch2_check_alloc_info(struct bch_fs *, bool); ++int bch2_check_alloc_info(struct bch_fs *); + int bch2_check_alloc_to_lru_refs(struct bch_fs *); + void bch2_do_discards(struct bch_fs *); + +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 51ed9609aeb4..9513ee347c01 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -507,14 +507,9 @@ int bch2_mark_alloc(struct btree_trans *trans, + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + struct bch_alloc_v4 old_a, new_a; +- struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); ++ struct bch_dev *ca; + int ret = 0; + +- if (bch2_trans_inconsistent_on(new.k->p.offset < ca->mi.first_bucket || +- new.k->p.offset >= ca->mi.nbuckets, trans, +- "alloc key outside range of device's buckets")) +- return -EIO; +- + /* + * alloc btree is read in by bch2_alloc_read, not gc: + */ +@@ -522,6 +517,12 @@ int bch2_mark_alloc(struct btree_trans *trans, + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) + return 0; + ++ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, ++ "alloc key for invalid device or bucket")) ++ return -EIO; ++ ++ ca = bch_dev_bkey_exists(c, new.k->p.inode); ++ + bch2_alloc_to_v4(old, &old_a); + bch2_alloc_to_v4(new, &new_a); + 
+diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 99b7b2b26573..88ed803021ba 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -1237,7 +1237,7 @@ use_clean: + if (c->opts.fsck) { + bch_info(c, "checking need_discard and freespace btrees"); + err = "error checking need_discard and freespace btrees"; +- ret = bch2_check_alloc_info(c, true); ++ ret = bch2_check_alloc_info(c); + if (ret) + goto err; + +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 4a071711d363..1af9bcc0e01f 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -1471,15 +1471,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + goto err; + } + +- /* +- * must flush all existing journal entries, they might have +- * (overwritten) keys that point to the device we're removing: +- */ +- bch2_journal_flush_all_pins(&c->journal); +- /* +- * hack to ensure bch2_replicas_gc2() clears out entries to this device +- */ +- bch2_journal_meta(&c->journal); + ret = bch2_journal_error(&c->journal); + if (ret) { + bch_err(ca, "Remove failed, journal error"); +-- +cgit v1.2.3 + + +From 41004db63b82307b5e4d102c9c8fd6b0caea7866 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 3 Apr 2022 21:50:25 -0400 +Subject: bcachefs: Add rw to .key_invalid() + +This adds a new parameter to .key_invalid() methods for whether the key +is being read or written; the idea being that methods can do more +aggressive checks when a key is newly created and being written, when we +wouldn't want to delete the key because of those checks. + +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/alloc_background.c | 12 ++++++++---- + fs/bcachefs/alloc_background.h | 8 ++++---- + fs/bcachefs/bkey_methods.c | 23 ++++++++++++----------- + fs/bcachefs/bkey_methods.h | 18 ++++++++++++------ + fs/bcachefs/btree_io.c | 13 +++++++------ + fs/bcachefs/btree_update_interior.c | 6 ++++-- + fs/bcachefs/btree_update_leaf.c | 6 ++++-- + fs/bcachefs/dirent.c | 2 +- + fs/bcachefs/dirent.h | 2 +- + fs/bcachefs/ec.c | 4 ++-- + fs/bcachefs/ec.h | 2 +- + fs/bcachefs/extents.c | 14 +++++++------- + fs/bcachefs/extents.h | 10 ++++++---- + fs/bcachefs/inode.c | 6 +++--- + fs/bcachefs/inode.h | 7 ++++--- + fs/bcachefs/journal_io.c | 4 ++-- + fs/bcachefs/lru.c | 2 +- + fs/bcachefs/lru.h | 2 +- + fs/bcachefs/quota.c | 2 +- + fs/bcachefs/quota.h | 2 +- + fs/bcachefs/reflink.c | 8 ++++---- + fs/bcachefs/reflink.h | 8 +++++--- + fs/bcachefs/subvolume.c | 4 ++-- + fs/bcachefs/subvolume.h | 6 ++++-- + fs/bcachefs/xattr.c | 2 +- + fs/bcachefs/xattr.h | 2 +- + 26 files changed, 99 insertions(+), 76 deletions(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 0cdc28a2385a..0ea2853e60b1 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -302,7 +302,8 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) + return DIV_ROUND_UP(bytes, sizeof(u64)); + } + +-int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) ++int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) + { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + +@@ -316,7 +317,8 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin + return 0; + } + +-int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) ++int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) + { + struct 
bkey_alloc_unpacked u; + +@@ -328,7 +330,8 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin + return 0; + } + +-int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) ++int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) + { + struct bkey_alloc_unpacked u; + +@@ -340,7 +343,8 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, struct prin + return 0; + } + +-int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) ++int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) + { + if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) { + pr_buf(err, "bad val size (%zu != %zu)", +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 7ca5bfd37027..9c6a590fa073 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -78,10 +78,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) + +-int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +-int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +-int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +-int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k, struct printbuf *); ++int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); + void bch2_alloc_v4_swab(struct bkey_s); + void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index c132bff22aff..574c668a9841 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -23,7 +23,7 @@ const char * const bch2_bkey_types[] = { + }; + + static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + return 0; + } +@@ -37,7 +37,7 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, + } + + static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + if (bkey_val_bytes(k.k)) { + pr_buf(err, "incorrect value size (%zu != 0)", +@@ -53,7 +53,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, + } + + static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { + pr_buf(err, "incorrect value size (%zu != %zu)", +@@ -73,7 +73,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, + } + + static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + return 0; + } +@@ -94,7 +94,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + } + + static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct 
printbuf *err) ++ int rw, struct printbuf *err) + { + if (bkey_val_bytes(k.k)) { + pr_buf(err, "incorrect value size (%zu != %zu)", +@@ -122,14 +122,15 @@ const struct bkey_ops bch2_bkey_ops[] = { + #undef x + }; + +-int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) ++int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) + { + if (k.k->type >= KEY_TYPE_MAX) { + pr_buf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); + return -EINVAL; + } + +- return bch2_bkey_ops[k.k->type].key_invalid(c, k, err); ++ return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err); + } + + static unsigned bch2_key_types_allowed[] = { +@@ -198,7 +199,7 @@ static unsigned bch2_key_types_allowed[] = { + + int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + if (k.k->u64s < BKEY_U64s) { + pr_buf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); +@@ -254,10 +255,10 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + + int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { +- return __bch2_bkey_invalid(c, k, type, err) ?: +- bch2_bkey_val_invalid(c, k, err); ++ return __bch2_bkey_invalid(c, k, type, rw, err) ?: ++ bch2_bkey_val_invalid(c, k, rw, err); + } + + int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +index 9dbac71da933..488917752e0b 100644 +--- a/fs/bcachefs/bkey_methods.h ++++ b/fs/bcachefs/bkey_methods.h +@@ -12,10 +12,16 @@ enum btree_node_type; + + extern const char * const bch2_bkey_types[]; + ++/* ++ * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If ++ * invalid, entire key will be deleted. ++ * ++ * When invalid, error string is returned via @err. @rw indicates whether key is ++ * being read or written; more aggressive checks can be enabled when rw == WRITE. 
++*/ + struct bkey_ops { +- /* Returns reason for being invalid if invalid, else NULL: */ +- int (*key_invalid)(const struct bch_fs *, struct bkey_s_c, +- struct printbuf *); ++ int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + void (*swab)(struct bkey_s); +@@ -32,11 +38,11 @@ struct bkey_ops { + + extern const struct bkey_ops bch2_bkey_ops[]; + +-int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *); + int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, +- enum btree_node_type, struct printbuf *); ++ enum btree_node_type, int, struct printbuf *); + int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, +- enum btree_node_type, struct printbuf *); ++ enum btree_node_type, int, struct printbuf *); + int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); + + void bch2_bpos_to_text(struct printbuf *, struct bpos); +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index cb753c1ba739..a38561c7cb0a 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -769,12 +769,12 @@ fsck_err: + + static int bset_key_invalid(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, +- bool updated_range, int write, ++ bool updated_range, int rw, + struct printbuf *err) + { +- return __bch2_bkey_invalid(c, k, btree_node_type(b), err) ?: ++ return __bch2_bkey_invalid(c, k, btree_node_type(b), rw, err) ?: + (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: +- (write ? bch2_bkey_val_invalid(c, k, err) : 0); ++ (rw == WRITE ? bch2_bkey_val_invalid(c, k, rw, err) : 0); + } + + static int validate_bset_keys(struct bch_fs *c, struct btree *b, +@@ -1076,7 +1076,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + + printbuf_reset(&buf); + +- if (bch2_bkey_val_invalid(c, u.s_c, &buf) || ++ if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || + (bch2_inject_invalid_keys && + !bversion_cmp(u.k->version, MAX_VERSION))) { + printbuf_reset(&buf); +@@ -1084,7 +1084,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + pr_buf(&buf, "invalid bkey\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + pr_buf(&buf, "\n "); +- bch2_bkey_val_invalid(c, u.s_c, &buf); ++ bch2_bkey_val_invalid(c, u.s_c, READ, &buf); + + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + +@@ -1734,7 +1734,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + struct printbuf buf = PRINTBUF; + int ret; + +- ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree, &buf); ++ ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), ++ BKEY_TYPE_btree, WRITE, &buf); + + if (ret) + bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index f3bf4281f2a0..b1aa77b8f8b9 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -1175,13 +1175,15 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); + +- if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), &buf) ?: ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), ++ btree_node_type(b), WRITE, &buf) ?: + 
bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { + printbuf_reset(&buf); + pr_buf(&buf, "inserting invalid bkey\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + pr_buf(&buf, "\n "); +- bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), &buf); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), ++ btree_node_type(b), WRITE, &buf); + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); + + bch2_fs_inconsistent(c, "%s", buf.buf); +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +index e264fcb8bb50..5427d0bdd1de 100644 +--- a/fs/bcachefs/btree_update_leaf.c ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -860,7 +860,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + int ret, u64s_delta = 0; + + trans_for_each_update(trans, i) { +- if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, &buf)) { ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, WRITE, &buf)) { + printbuf_reset(&buf); + pr_buf(&buf, "invalid bkey on insert from %s -> %ps", + trans->fn, (void *) i->ip_allocated); +@@ -870,7 +871,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + pr_newline(&buf); + +- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, &buf); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, WRITE, &buf); + + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index e8a284a69be4..281959885bb0 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -84,7 +84,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { + }; + + int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + unsigned len; +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index 046f297a4eff..b1466932c768 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -6,7 +6,7 @@ + + extern const struct bch_hash_desc bch2_dirent_hash_desc; + +-int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); + void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_dirent (struct bkey_ops) { \ +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 060a3c4e24e3..a86b9748e88f 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -103,7 +103,7 @@ struct ec_bio { + /* Stripes btree keys: */ + + int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + +@@ -129,7 +129,7 @@ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, + return -EINVAL; + } + +- return bch2_bkey_ptrs_invalid(c, k, err); ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); + } + + void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 8e866460f8a0..af7f8eee94b0 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -7,7 +7,7 @@ + #include "keylist_types.h" + + int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, +- struct printbuf *); ++ int rw, struct printbuf *); + void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +diff --git a/fs/bcachefs/extents.c 
b/fs/bcachefs/extents.c +index 0bb5d7770325..dffbcffa923d 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -157,7 +157,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + /* KEY_TYPE_btree_ptr: */ + + int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { + pr_buf(err, "value too big (%zu > %u)", +@@ -165,7 +165,7 @@ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, + return -EINVAL; + } + +- return bch2_bkey_ptrs_invalid(c, k, err); ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); + } + + void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, +@@ -175,7 +175,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + } + + int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + +@@ -198,11 +198,11 @@ int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + return -EINVAL; + } + +- return bch2_bkey_ptrs_invalid(c, k, err); ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); + } + + void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, +- struct bkey_s_c k) ++ struct bkey_s_c k) + { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + +@@ -367,7 +367,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) + /* KEY_TYPE_reservation: */ + + int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + +@@ -1060,7 +1060,7 @@ static int extent_ptr_invalid(const struct bch_fs *c, + } + + int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 21f79e663c74..4f41f0fd6cb1 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -367,11 +367,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + + /* KEY_TYPE_btree_ptr: */ + +-int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); + void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +-int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); + void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); +@@ -409,7 +409,8 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + + /* KEY_TYPE_reservation: */ + +-int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); + void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +@@ -615,7 +616,8 @@ bool bch2_bkey_matches_ptr(struct 
bch_fs *, struct bkey_s_c, + bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); + void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +-int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); + + void bch2_ptr_swab(struct bkey_s); + +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 47dfde1910c1..8a82489753e5 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -339,7 +339,7 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) + } + + int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + +@@ -359,7 +359,7 @@ int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, + } + + int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + +@@ -410,7 +410,7 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, + } + + int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + if (k.k->p.inode) { + pr_buf(err, "nonzero k.p.inode"); +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index e3418dc4a1e9..9442600a7440 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -6,8 +6,8 @@ + + extern const char * const bch2_inode_opts[]; + +-int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); +-int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); + void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_inode (struct bkey_ops) { \ +@@ -30,7 +30,8 @@ static inline bool bkey_is_inode(const struct bkey *k) + k->type == KEY_TYPE_inode_v2; + } + +-int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); + void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index af7225bdaaab..cbde21a4c547 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -251,7 +251,7 @@ static int journal_validate_key(struct bch_fs *c, const char *where, + write, NULL, bkey_to_packed(k)); + + if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), +- __btree_node_type(level, btree_id), &buf)) { ++ __btree_node_type(level, btree_id), write, &buf)) { + printbuf_reset(&buf); + pr_buf(&buf, "invalid %s in %s entry offset %zi/%u:", + type, where, +@@ -263,7 +263,7 @@ static int journal_validate_key(struct bch_fs *c, const char *where, + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + pr_newline(&buf); + bch2_bkey_invalid(c, bkey_i_to_s_c(k), +- __btree_node_type(level, btree_id), &buf); ++ __btree_node_type(level, btree_id), write, &buf); + + mustfix_fsck_err(c, "%s", buf.buf); + +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +index 
c20a3bc2336b..c6f433153286 100644 +--- a/fs/bcachefs/lru.c ++++ b/fs/bcachefs/lru.c +@@ -9,7 +9,7 @@ + #include "recovery.h" + + int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + +diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h +index 0af62ecf6638..e8f508174b0a 100644 +--- a/fs/bcachefs/lru.h ++++ b/fs/bcachefs/lru.h +@@ -2,7 +2,7 @@ + #ifndef _BCACHEFS_LRU_H + #define _BCACHEFS_LRU_H + +-int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); + void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_lru (struct bkey_ops) { \ +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 5f370da2f3d2..364ef6314651 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -58,7 +58,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { + }; + + int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + if (k.k->p.inode >= QTYP_NR) { + pr_buf(err, "invalid quota type (%llu >= %u)", +diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h +index 4ba40fce39a8..8c67ae1da7c7 100644 +--- a/fs/bcachefs/quota.h ++++ b/fs/bcachefs/quota.h +@@ -7,7 +7,7 @@ + + extern const struct bch_sb_field_ops bch_sb_field_ops_quota; + +-int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); + void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_quota (struct bkey_ops) { \ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index e07f0339d87e..6a81eb9b41a0 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -26,7 +26,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) + /* reflink pointers */ + + int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + +@@ -78,7 +78,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r + /* indirect extents */ + + int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + +@@ -88,7 +88,7 @@ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, + return -EINVAL; + } + +- return bch2_bkey_ptrs_invalid(c, k, err); ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); + } + + void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, +@@ -130,7 +130,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, + /* indirect inline data */ + + int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { + pr_buf(err, "incorrect value size (%zu < %zu)", +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index d292761f8a98..e0a9d8e4d1ca 100644 +--- a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -2,7 +2,8 @@ + #ifndef _BCACHEFS_REFLINK_H + #define _BCACHEFS_REFLINK_H + +-int bch2_reflink_p_invalid(const struct bch_fs *, 
struct bkey_s_c, struct printbuf *); ++int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); + void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); +@@ -15,7 +16,8 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + .atomic_trigger = bch2_mark_reflink_p, \ + } + +-int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); + void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, +@@ -30,7 +32,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, + } + + int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, +- struct printbuf *); ++ int, struct printbuf *); + void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); + int bch2_trans_mark_indirect_inline_data(struct btree_trans *, +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index f789e3d9ac1b..63a57399cb7c 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -27,7 +27,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + } + + int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + struct bkey_s_c_snapshot s; + u32 i, id; +@@ -746,7 +746,7 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + /* Subvolumes: */ + + int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 || + bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) { +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index 4aed8b5332f6..a44253893515 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -6,7 +6,8 @@ + #include "subvolume_types.h" + + void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +-int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, ++ int rw, struct printbuf *); + + #define bch2_bkey_ops_snapshot (struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ +@@ -96,7 +97,8 @@ int bch2_fs_snapshots_check(struct bch_fs *); + void bch2_fs_snapshots_exit(struct bch_fs *); + int bch2_fs_snapshots_start(struct bch_fs *); + +-int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, ++ int rw, struct printbuf *); + void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_subvolume (struct bkey_ops) { \ +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 1cf1269ba44f..f119847e9519 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -70,7 +70,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { + }; + + int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, +- struct printbuf *err) ++ int rw, struct printbuf *err) + { + const struct xattr_handler *handler; + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); +diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h +index 
3fd03018fdd8..66d7a1e30350 100644 +--- a/fs/bcachefs/xattr.h ++++ b/fs/bcachefs/xattr.h +@@ -6,7 +6,7 @@ + + extern const struct bch_hash_desc bch2_xattr_hash_desc; + +-int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, struct printbuf *); ++int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); + void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_xattr (struct bkey_ops) { \ +-- +cgit v1.2.3 + + +From e027cf9aa0e18b688d76cd6c2702491b8d06f48f Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 6 Apr 2022 14:12:45 -0400 +Subject: fixup! bcachefs: Defer checking of alloc -> lru refs until after RW + +--- + fs/bcachefs/alloc_background.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 0ea2853e60b1..6d6798ae9d3d 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -868,7 +868,9 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { +- ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, + bch2_check_alloc_to_lru_ref(&trans, &iter)); + if (ret) + break; +-- +cgit v1.2.3 + + +From d2e08891288b073941b0351dc37fb36b056e2449 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Wed, 6 Apr 2022 14:35:10 -0400 +Subject: bcachefs: fsck: Work around transaction restarts + +In check_extents() and check_dirents(), we're working towards only +handling transaction restarts in one place, at the top level - but we're +not there yet. check_i_sectors() and check_subdir_count() handle +transaction restarts locally, which means the iterator for the +dirent/extent is left unlocked (should_be_locked == 0), leading to +asserts popping when we go to do updates. + +This patch hacks around this for now, until we can delete the offending +code. 
+ +Signed-off-by: Kent Overstreet +--- + fs/bcachefs/fsck.c | 18 ++++++++++++++++-- + 1 file changed, 16 insertions(+), 2 deletions(-) + +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 2582ddf14803..d2b155f07fc1 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1146,7 +1146,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; + int ret = 0; +- ++peek: + k = bch2_btree_iter_peek(iter); + if (!k.k) + goto out; +@@ -1173,6 +1173,15 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + if (ret) + goto err; + } ++ ++ if (!iter->path->should_be_locked) { ++ /* ++ * hack: check_i_sectors may have handled a transaction restart, ++ * it shouldn't be but we need to fix the new i_sectors check ++ * code and delete the old bch2_count_inode_sectors() first ++ */ ++ goto peek; ++ } + #if 0 + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + char buf1[200]; +@@ -1464,7 +1473,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; + int ret = 0; +- ++peek: + k = bch2_btree_iter_peek(iter); + if (!k.k) + goto out; +@@ -1492,6 +1501,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + goto err; + } + ++ if (!iter->path->should_be_locked) { ++ /* hack: see check_extent() */ ++ goto peek; ++ } ++ + ret = __walk_inode(trans, dir, k.k->p); + if (ret < 0) + goto err; +-- +cgit v1.2.3 +
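[Editor's note] The .key_invalid() patch earlier in this run changes every validator to take an rw argument (READ or WRITE) so that stricter checks can be applied only to keys that are about to be written, without deleting pre-existing on-disk keys at read time. A minimal sketch of a hook following the new convention is shown below for reference; it is illustrative only and not part of the series. The key type struct bch_example and the helper example_val_ok() are made-up placeholders, while the signature shape, bkey_val_bytes(), pr_buf(), -EINVAL and the READ/WRITE constants all come from the hunks above.

/*
 * Illustrative sketch only -- not code from this patch series.
 * Basic structural checks always run; the stricter check is gated on
 * rw == WRITE so keys already on disk are not rejected (and deleted)
 * merely because newer, more aggressive validation was added.
 */
static int bch2_example_invalid(const struct bch_fs *c, struct bkey_s_c k,
                                int rw, struct printbuf *err)
{
        if (bkey_val_bytes(k.k) != sizeof(struct bch_example)) {
                pr_buf(err, "incorrect value size (%zu != %zu)",
                       bkey_val_bytes(k.k), sizeof(struct bch_example));
                return -EINVAL;
        }

        if (rw == WRITE && !example_val_ok(k))
                return -EINVAL; /* only reject at write time */

        return 0;
}

As the callers changed above illustrate, btree_io.c validates with READ when reading a btree node back in and with WRITE before writing one out, and btree_update_leaf.c validates with WRITE on insert -- those write-side paths are where the more aggressive checks are allowed to reject a key.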